1/****************************************************************************
2**
3** Copyright (C) 2020 The Qt Company Ltd.
4** Copyright (C) 2020 Intel Corporation.
5** Contact: https://www.qt.io/licensing/
6**
7** This file is part of the QtCore module of the Qt Toolkit.
8**
9** $QT_BEGIN_LICENSE:LGPL$
10** Commercial License Usage
11** Licensees holding valid commercial Qt licenses may use this file in
12** accordance with the commercial license agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and The Qt Company. For licensing terms
15** and conditions see https://www.qt.io/terms-conditions. For further
16** information use the contact form at https://www.qt.io/contact-us.
17**
18** GNU Lesser General Public License Usage
19** Alternatively, this file may be used under the terms of the GNU Lesser
20** General Public License version 3 as published by the Free Software
21** Foundation and appearing in the file LICENSE.LGPL3 included in the
22** packaging of this file. Please review the following information to
23** ensure the GNU Lesser General Public License version 3 requirements
24** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
25**
26** GNU General Public License Usage
27** Alternatively, this file may be used under the terms of the GNU
28** General Public License version 2.0 or (at your option) the GNU General
29** Public license version 3 or any later version approved by the KDE Free
30** Qt Foundation. The licenses are as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
32** included in the packaging of this file. Please review the following
33** information to ensure the GNU General Public License requirements will
34** be met: https://www.gnu.org/licenses/gpl-2.0.html and
35** https://www.gnu.org/licenses/gpl-3.0.html.
36**
37** $QT_END_LICENSE$
38**
39****************************************************************************/
40
41#ifndef QSTRINGCONVERTER_P_H
42#define QSTRINGCONVERTER_P_H
43
44//
45// W A R N I N G
46// -------------
47//
48// This file is not part of the Qt API. It exists purely as an
49// implementation detail. This header file may change from version to
50// version without notice, or even be removed.
51//
52// We mean it.
53//
54
55#include <QtCore/qstring.h>
56#include <QtCore/qendian.h>
57#include <QtCore/qstringconverter.h>
58
59QT_BEGIN_NAMESPACE
60
61#ifndef __cpp_char8_t
62enum char8_t : uchar {};
63#endif
64
65struct QUtf8BaseTraits
66{
67 static const bool isTrusted = false;
68 static const bool allowNonCharacters = true;
69 static const bool skipAsciiHandling = false;
70 static const int Error = -1;
71 static const int EndOfString = -2;
72
73 static bool isValidCharacter(uint u)
74 { return int(u) >= 0; }
75
76 static void appendByte(uchar *&ptr, uchar b)
77 { *ptr++ = b; }
78
79 static void appendByte(char8_t *&ptr, char8_t b)
80 { *ptr++ = b; }
81
82 static uchar peekByte(const uchar *ptr, qsizetype n = 0)
83 { return ptr[n]; }
84
85 static uchar peekByte(const char8_t *ptr, int n = 0)
86 { return ptr[n]; }
87
88 static qptrdiff availableBytes(const uchar *ptr, const uchar *end)
89 { return end - ptr; }
90
91 static qptrdiff availableBytes(const char8_t *ptr, const char8_t *end)
92 { return end - ptr; }
93
94 static void advanceByte(const uchar *&ptr, qsizetype n = 1)
95 { ptr += n; }
96
97 static void advanceByte(const char8_t *&ptr, int n = 1)
98 { ptr += n; }
99
100 static void appendUtf16(ushort *&ptr, ushort uc)
101 { *ptr++ = uc; }
102
103 static void appendUtf16(char16_t *&ptr, ushort uc)
104 { *ptr++ = char16_t(uc); }
105
106 static void appendUcs4(ushort *&ptr, uint uc)
107 {
108 appendUtf16(ptr, QChar::highSurrogate(uc));
109 appendUtf16(ptr, QChar::lowSurrogate(uc));
110 }
111
112 static void appendUcs4(char16_t *&ptr, char32_t uc)
113 {
114 appendUtf16(ptr, QChar::highSurrogate(uc));
115 appendUtf16(ptr, QChar::lowSurrogate(uc));
116 }
117
118 static ushort peekUtf16(const ushort *ptr, qsizetype n = 0)
119 { return ptr[n]; }
120
121 static ushort peekUtf16(const char16_t *ptr, int n = 0)
122 { return ptr[n]; }
123
124 static qptrdiff availableUtf16(const ushort *ptr, const ushort *end)
125 { return end - ptr; }
126
127 static qptrdiff availableUtf16(const char16_t *ptr, const char16_t *end)
128 { return end - ptr; }
129
130 static void advanceUtf16(const ushort *&ptr, qsizetype n = 1)
131 { ptr += n; }
132
133 static void advanceUtf16(const char16_t *&ptr, int n = 1)
134 { ptr += n; }
135
136 // it's possible to output to UCS-4 too
137 static void appendUtf16(uint *&ptr, ushort uc)
138 { *ptr++ = uc; }
139
140 static void appendUtf16(char32_t *&ptr, ushort uc)
141 { *ptr++ = char32_t(uc); }
142
143 static void appendUcs4(uint *&ptr, uint uc)
144 { *ptr++ = uc; }
145
146 static void appendUcs4(char32_t *&ptr, uint uc)
147 { *ptr++ = char32_t(uc); }
148};
149
150struct QUtf8BaseTraitsNoAscii : public QUtf8BaseTraits
151{
152 static const bool skipAsciiHandling = true;
153};
154
155namespace QUtf8Functions
156{
157 /// returns 0 on success; errors can only happen if \a u is a surrogate:
158 /// Error if \a u is a low surrogate;
159 /// if \a u is a high surrogate, Error if the next isn't a low one,
160 /// EndOfString if we run into the end of the string.
161 template <typename Traits, typename OutputPtr, typename InputPtr> inline
162 int toUtf8(ushort u, OutputPtr &dst, InputPtr &src, InputPtr end)
163 {
164 if (!Traits::skipAsciiHandling && u < 0x80) {
165 // U+0000 to U+007F (US-ASCII) - one byte
166 Traits::appendByte(dst, uchar(u));
167 return 0;
168 } else if (u < 0x0800) {
169 // U+0080 to U+07FF - two bytes
170 // first of two bytes
171 Traits::appendByte(dst, 0xc0 | uchar(u >> 6));
172 } else {
173 if (!QChar::isSurrogate(u)) {
174 // U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes
175 if (!Traits::allowNonCharacters && QChar::isNonCharacter(u))
176 return Traits::Error;
177
178 // first of three bytes
179 Traits::appendByte(dst, 0xe0 | uchar(u >> 12));
180 } else {
181 // U+10000 to U+10FFFF - four bytes
182 // need to get one extra codepoint
183 if (Traits::availableUtf16(src, end) == 0)
184 return Traits::EndOfString;
185
186 ushort low = Traits::peekUtf16(src);
187 if (!QChar::isHighSurrogate(u))
188 return Traits::Error;
189 if (!QChar::isLowSurrogate(low))
190 return Traits::Error;
191
192 Traits::advanceUtf16(src);
193 uint ucs4 = QChar::surrogateToUcs4(u, low);
194
195 if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4))
196 return Traits::Error;
197
198 // first byte
199 Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf));
200
201 // second of four bytes
202 Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f));
203
204 // for the rest of the bytes
205 u = ushort(ucs4);
206 }
207
208 // second to last byte
209 Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f));
210 }
211
212 // last byte
213 Traits::appendByte(dst, 0x80 | (u & 0x3f));
214 return 0;
215 }
216
217 inline bool isContinuationByte(uchar b)
218 {
219 return (b & 0xc0) == 0x80;
220 }
221
222 /// returns the number of characters consumed (including \a b) in case of success;
223 /// returns negative in case of error: Traits::Error or Traits::EndOfString
224 template <typename Traits, typename OutputPtr, typename InputPtr> inline
225 qsizetype fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end)
226 {
227 qsizetype charsNeeded;
228 uint min_uc;
229 uint uc;
230
231 if (!Traits::skipAsciiHandling && b < 0x80) {
232 // US-ASCII
233 Traits::appendUtf16(dst, b);
234 return 1;
235 }
236
237 if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1)) {
238 // an UTF-8 first character must be at least 0xC0
239 // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences
240 return Traits::Error;
241 } else if (b < 0xe0) {
242 charsNeeded = 2;
243 min_uc = 0x80;
244 uc = b & 0x1f;
245 } else if (b < 0xf0) {
246 charsNeeded = 3;
247 min_uc = 0x800;
248 uc = b & 0x0f;
249 } else if (b < 0xf5) {
250 charsNeeded = 4;
251 min_uc = 0x10000;
252 uc = b & 0x07;
253 } else {
254 // the last Unicode character is U+10FFFF
255 // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF"
256 // therefore, a byte higher than 0xF4 is not the UTF-8 first byte
257 return Traits::Error;
258 }
259
260 qptrdiff bytesAvailable = Traits::availableBytes(src, end);
261 if (Q_UNLIKELY(bytesAvailable < charsNeeded - 1)) {
262 // it's possible that we have an error instead of just unfinished bytes
263 if (bytesAvailable > 0 && !isContinuationByte(Traits::peekByte(src, 0)))
264 return Traits::Error;
265 if (bytesAvailable > 1 && !isContinuationByte(Traits::peekByte(src, 1)))
266 return Traits::Error;
267 return Traits::EndOfString;
268 }
269
270 // first continuation character
271 b = Traits::peekByte(src, 0);
272 if (!isContinuationByte(b))
273 return Traits::Error;
274 uc <<= 6;
275 uc |= b & 0x3f;
276
277 if (charsNeeded > 2) {
278 // second continuation character
279 b = Traits::peekByte(src, 1);
280 if (!isContinuationByte(b))
281 return Traits::Error;
282 uc <<= 6;
283 uc |= b & 0x3f;
284
285 if (charsNeeded > 3) {
286 // third continuation character
287 b = Traits::peekByte(src, 2);
288 if (!isContinuationByte(b))
289 return Traits::Error;
290 uc <<= 6;
291 uc |= b & 0x3f;
292 }
293 }
294
295 // we've decoded something; safety-check it
296 if (!Traits::isTrusted) {
297 if (uc < min_uc)
298 return Traits::Error;
299 if (QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint)
300 return Traits::Error;
301 if (!Traits::allowNonCharacters && QChar::isNonCharacter(uc))
302 return Traits::Error;
303 }
304
305 // write the UTF-16 sequence
306 if (!QChar::requiresSurrogates(uc)) {
307 // UTF-8 decoded and no surrogates are required
308 // detach if necessary
309 Traits::appendUtf16(dst, ushort(uc));
310 } else {
311 // UTF-8 decoded to something that requires a surrogate pair
312 Traits::appendUcs4(dst, uc);
313 }
314
315 Traits::advanceByte(src, charsNeeded - 1);
316 return charsNeeded;
317 }
318}
319
// Byte-order selector taken by the QUtf16 and QUtf32 conversion functions.
enum DataEndianness
{
    DetectEndianness = 0,   // default argument of the exported QUtf16/QUtf32 overloads
    BigEndianness = 1,
    LittleEndianness = 2
};
326
327struct QUtf8
328{
329 Q_CORE_EXPORT static QChar *convertToUnicode(QChar *buffer, QByteArrayView in) noexcept;
330 static QString convertToUnicode(QByteArrayView in);
331 Q_CORE_EXPORT static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state);
332 static QChar *convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state);
333 Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView in);
334 Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView in, QStringConverterBase::State *state);
335 static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state);
336 struct ValidUtf8Result {
337 bool isValidUtf8;
338 bool isValidAscii;
339 };
340 static ValidUtf8Result isValidUtf8(QByteArrayView in);
341 static int compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept;
342 static int compareUtf8(QByteArrayView utf8, QLatin1String s);
343};
344
345struct QUtf16
346{
347 Q_CORE_EXPORT static QString convertToUnicode(QByteArrayView, QStringConverter::State *, DataEndianness = DetectEndianness);
348 static QChar *convertToUnicode(QChar *out, QByteArrayView, QStringConverter::State *state, DataEndianness endian);
349 Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView, QStringConverter::State *, DataEndianness = DetectEndianness);
350 static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian);
351};
352
353struct QUtf32
354{
355 static QChar *convertToUnicode(QChar *out, QByteArrayView, QStringConverter::State *state, DataEndianness endian);
356 Q_CORE_EXPORT static QString convertToUnicode(QByteArrayView, QStringConverter::State *, DataEndianness = DetectEndianness);
357 Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView, QStringConverter::State *, DataEndianness = DetectEndianness);
358 static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian);
359};
360
361struct Q_CORE_EXPORT QLocal8Bit
362{
363#if !defined(Q_OS_WIN) || defined(QT_BOOTSTRAPPED)
364 static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state)
365 { return QUtf8::convertToUnicode(in, state); }
366 static QByteArray convertFromUnicode(QStringView in, QStringConverter::State *state)
367 { return QUtf8::convertFromUnicode(in, state); }
368#else
369 static QString convertToUnicode(QByteArrayView, QStringConverter::State *);
370 static QByteArray convertFromUnicode(QStringView, QStringConverter::State *);
371#endif
372};
373
374QT_END_NAMESPACE
375
376#endif // QSTRINGCONVERTER_P_H
377