1 | /**************************************************************************** |
2 | ** |
3 | ** Copyright (C) 2020 The Qt Company Ltd. |
4 | ** Copyright (C) 2020 Intel Corporation. |
5 | ** Contact: https://www.qt.io/licensing/ |
6 | ** |
7 | ** This file is part of the QtCore module of the Qt Toolkit. |
8 | ** |
9 | ** $QT_BEGIN_LICENSE:LGPL$ |
10 | ** Commercial License Usage |
11 | ** Licensees holding valid commercial Qt licenses may use this file in |
12 | ** accordance with the commercial license agreement provided with the |
13 | ** Software or, alternatively, in accordance with the terms contained in |
14 | ** a written agreement between you and The Qt Company. For licensing terms |
15 | ** and conditions see https://www.qt.io/terms-conditions. For further |
16 | ** information use the contact form at https://www.qt.io/contact-us. |
17 | ** |
18 | ** GNU Lesser General Public License Usage |
19 | ** Alternatively, this file may be used under the terms of the GNU Lesser |
20 | ** General Public License version 3 as published by the Free Software |
21 | ** Foundation and appearing in the file LICENSE.LGPL3 included in the |
22 | ** packaging of this file. Please review the following information to |
23 | ** ensure the GNU Lesser General Public License version 3 requirements |
24 | ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. |
25 | ** |
26 | ** GNU General Public License Usage |
27 | ** Alternatively, this file may be used under the terms of the GNU |
28 | ** General Public License version 2.0 or (at your option) the GNU General |
29 | ** Public license version 3 or any later version approved by the KDE Free |
30 | ** Qt Foundation. The licenses are as published by the Free Software |
31 | ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 |
32 | ** included in the packaging of this file. Please review the following |
33 | ** information to ensure the GNU General Public License requirements will |
34 | ** be met: https://www.gnu.org/licenses/gpl-2.0.html and |
35 | ** https://www.gnu.org/licenses/gpl-3.0.html. |
36 | ** |
37 | ** $QT_END_LICENSE$ |
38 | ** |
39 | ****************************************************************************/ |
40 | |
41 | #ifndef QSTRINGCONVERTER_P_H |
42 | #define QSTRINGCONVERTER_P_H |
43 | |
44 | // |
45 | // W A R N I N G |
46 | // ------------- |
47 | // |
48 | // This file is not part of the Qt API. It exists purely as an |
49 | // implementation detail. This header file may change from version to |
50 | // version without notice, or even be removed. |
51 | // |
52 | // We mean it. |
53 | // |
54 | |
55 | #include <QtCore/qstring.h> |
56 | #include <QtCore/qendian.h> |
57 | #include <QtCore/qstringconverter.h> |
58 | |
59 | QT_BEGIN_NAMESPACE |
60 | |
61 | #ifndef __cpp_char8_t |
62 | enum char8_t : uchar {}; |
63 | #endif |
64 | |
65 | struct QUtf8BaseTraits |
66 | { |
67 | static const bool isTrusted = false; |
68 | static const bool allowNonCharacters = true; |
69 | static const bool skipAsciiHandling = false; |
70 | static const int Error = -1; |
71 | static const int EndOfString = -2; |
72 | |
73 | static bool isValidCharacter(uint u) |
74 | { return int(u) >= 0; } |
75 | |
76 | static void appendByte(uchar *&ptr, uchar b) |
77 | { *ptr++ = b; } |
78 | |
79 | static void appendByte(char8_t *&ptr, char8_t b) |
80 | { *ptr++ = b; } |
81 | |
82 | static uchar peekByte(const uchar *ptr, qsizetype n = 0) |
83 | { return ptr[n]; } |
84 | |
85 | static uchar peekByte(const char8_t *ptr, int n = 0) |
86 | { return ptr[n]; } |
87 | |
88 | static qptrdiff availableBytes(const uchar *ptr, const uchar *end) |
89 | { return end - ptr; } |
90 | |
91 | static qptrdiff availableBytes(const char8_t *ptr, const char8_t *end) |
92 | { return end - ptr; } |
93 | |
94 | static void advanceByte(const uchar *&ptr, qsizetype n = 1) |
95 | { ptr += n; } |
96 | |
97 | static void advanceByte(const char8_t *&ptr, int n = 1) |
98 | { ptr += n; } |
99 | |
100 | static void appendUtf16(ushort *&ptr, ushort uc) |
101 | { *ptr++ = uc; } |
102 | |
103 | static void appendUtf16(char16_t *&ptr, ushort uc) |
104 | { *ptr++ = char16_t(uc); } |
105 | |
106 | static void appendUcs4(ushort *&ptr, uint uc) |
107 | { |
108 | appendUtf16(ptr, QChar::highSurrogate(uc)); |
109 | appendUtf16(ptr, QChar::lowSurrogate(uc)); |
110 | } |
111 | |
112 | static void appendUcs4(char16_t *&ptr, char32_t uc) |
113 | { |
114 | appendUtf16(ptr, QChar::highSurrogate(uc)); |
115 | appendUtf16(ptr, QChar::lowSurrogate(uc)); |
116 | } |
117 | |
118 | static ushort peekUtf16(const ushort *ptr, qsizetype n = 0) |
119 | { return ptr[n]; } |
120 | |
121 | static ushort peekUtf16(const char16_t *ptr, int n = 0) |
122 | { return ptr[n]; } |
123 | |
124 | static qptrdiff availableUtf16(const ushort *ptr, const ushort *end) |
125 | { return end - ptr; } |
126 | |
127 | static qptrdiff availableUtf16(const char16_t *ptr, const char16_t *end) |
128 | { return end - ptr; } |
129 | |
130 | static void advanceUtf16(const ushort *&ptr, qsizetype n = 1) |
131 | { ptr += n; } |
132 | |
133 | static void advanceUtf16(const char16_t *&ptr, int n = 1) |
134 | { ptr += n; } |
135 | |
136 | // it's possible to output to UCS-4 too |
137 | static void appendUtf16(uint *&ptr, ushort uc) |
138 | { *ptr++ = uc; } |
139 | |
140 | static void appendUtf16(char32_t *&ptr, ushort uc) |
141 | { *ptr++ = char32_t(uc); } |
142 | |
143 | static void appendUcs4(uint *&ptr, uint uc) |
144 | { *ptr++ = uc; } |
145 | |
146 | static void appendUcs4(char32_t *&ptr, uint uc) |
147 | { *ptr++ = char32_t(uc); } |
148 | }; |
149 | |
150 | struct QUtf8BaseTraitsNoAscii : public QUtf8BaseTraits |
151 | { |
152 | static const bool skipAsciiHandling = true; |
153 | }; |
154 | |
155 | namespace QUtf8Functions |
156 | { |
157 | /// returns 0 on success; errors can only happen if \a u is a surrogate: |
158 | /// Error if \a u is a low surrogate; |
159 | /// if \a u is a high surrogate, Error if the next isn't a low one, |
160 | /// EndOfString if we run into the end of the string. |
161 | template <typename Traits, typename OutputPtr, typename InputPtr> inline |
162 | int toUtf8(ushort u, OutputPtr &dst, InputPtr &src, InputPtr end) |
163 | { |
164 | if (!Traits::skipAsciiHandling && u < 0x80) { |
165 | // U+0000 to U+007F (US-ASCII) - one byte |
166 | Traits::appendByte(dst, uchar(u)); |
167 | return 0; |
168 | } else if (u < 0x0800) { |
169 | // U+0080 to U+07FF - two bytes |
170 | // first of two bytes |
171 | Traits::appendByte(dst, 0xc0 | uchar(u >> 6)); |
172 | } else { |
173 | if (!QChar::isSurrogate(u)) { |
174 | // U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes |
175 | if (!Traits::allowNonCharacters && QChar::isNonCharacter(u)) |
176 | return Traits::Error; |
177 | |
178 | // first of three bytes |
179 | Traits::appendByte(dst, 0xe0 | uchar(u >> 12)); |
180 | } else { |
181 | // U+10000 to U+10FFFF - four bytes |
182 | // need to get one extra codepoint |
183 | if (Traits::availableUtf16(src, end) == 0) |
184 | return Traits::EndOfString; |
185 | |
186 | ushort low = Traits::peekUtf16(src); |
187 | if (!QChar::isHighSurrogate(u)) |
188 | return Traits::Error; |
189 | if (!QChar::isLowSurrogate(low)) |
190 | return Traits::Error; |
191 | |
192 | Traits::advanceUtf16(src); |
193 | uint ucs4 = QChar::surrogateToUcs4(u, low); |
194 | |
195 | if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4)) |
196 | return Traits::Error; |
197 | |
198 | // first byte |
199 | Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf)); |
200 | |
201 | // second of four bytes |
202 | Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f)); |
203 | |
204 | // for the rest of the bytes |
205 | u = ushort(ucs4); |
206 | } |
207 | |
208 | // second to last byte |
209 | Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f)); |
210 | } |
211 | |
212 | // last byte |
213 | Traits::appendByte(dst, 0x80 | (u & 0x3f)); |
214 | return 0; |
215 | } |
216 | |
217 | inline bool isContinuationByte(uchar b) |
218 | { |
219 | return (b & 0xc0) == 0x80; |
220 | } |
221 | |
222 | /// returns the number of characters consumed (including \a b) in case of success; |
223 | /// returns negative in case of error: Traits::Error or Traits::EndOfString |
224 | template <typename Traits, typename OutputPtr, typename InputPtr> inline |
225 | qsizetype fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end) |
226 | { |
227 | qsizetype charsNeeded; |
228 | uint min_uc; |
229 | uint uc; |
230 | |
231 | if (!Traits::skipAsciiHandling && b < 0x80) { |
232 | // US-ASCII |
233 | Traits::appendUtf16(dst, b); |
234 | return 1; |
235 | } |
236 | |
237 | if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1)) { |
238 | // an UTF-8 first character must be at least 0xC0 |
239 | // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences |
240 | return Traits::Error; |
241 | } else if (b < 0xe0) { |
242 | charsNeeded = 2; |
243 | min_uc = 0x80; |
244 | uc = b & 0x1f; |
245 | } else if (b < 0xf0) { |
246 | charsNeeded = 3; |
247 | min_uc = 0x800; |
248 | uc = b & 0x0f; |
249 | } else if (b < 0xf5) { |
250 | charsNeeded = 4; |
251 | min_uc = 0x10000; |
252 | uc = b & 0x07; |
253 | } else { |
254 | // the last Unicode character is U+10FFFF |
255 | // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF" |
256 | // therefore, a byte higher than 0xF4 is not the UTF-8 first byte |
257 | return Traits::Error; |
258 | } |
259 | |
260 | qptrdiff bytesAvailable = Traits::availableBytes(src, end); |
261 | if (Q_UNLIKELY(bytesAvailable < charsNeeded - 1)) { |
262 | // it's possible that we have an error instead of just unfinished bytes |
263 | if (bytesAvailable > 0 && !isContinuationByte(Traits::peekByte(src, 0))) |
264 | return Traits::Error; |
265 | if (bytesAvailable > 1 && !isContinuationByte(Traits::peekByte(src, 1))) |
266 | return Traits::Error; |
267 | return Traits::EndOfString; |
268 | } |
269 | |
270 | // first continuation character |
271 | b = Traits::peekByte(src, 0); |
272 | if (!isContinuationByte(b)) |
273 | return Traits::Error; |
274 | uc <<= 6; |
275 | uc |= b & 0x3f; |
276 | |
277 | if (charsNeeded > 2) { |
278 | // second continuation character |
279 | b = Traits::peekByte(src, 1); |
280 | if (!isContinuationByte(b)) |
281 | return Traits::Error; |
282 | uc <<= 6; |
283 | uc |= b & 0x3f; |
284 | |
285 | if (charsNeeded > 3) { |
286 | // third continuation character |
287 | b = Traits::peekByte(src, 2); |
288 | if (!isContinuationByte(b)) |
289 | return Traits::Error; |
290 | uc <<= 6; |
291 | uc |= b & 0x3f; |
292 | } |
293 | } |
294 | |
295 | // we've decoded something; safety-check it |
296 | if (!Traits::isTrusted) { |
297 | if (uc < min_uc) |
298 | return Traits::Error; |
299 | if (QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) |
300 | return Traits::Error; |
301 | if (!Traits::allowNonCharacters && QChar::isNonCharacter(uc)) |
302 | return Traits::Error; |
303 | } |
304 | |
305 | // write the UTF-16 sequence |
306 | if (!QChar::requiresSurrogates(uc)) { |
307 | // UTF-8 decoded and no surrogates are required |
308 | // detach if necessary |
309 | Traits::appendUtf16(dst, ushort(uc)); |
310 | } else { |
311 | // UTF-8 decoded to something that requires a surrogate pair |
312 | Traits::appendUcs4(dst, uc); |
313 | } |
314 | |
315 | Traits::advanceByte(src, charsNeeded - 1); |
316 | return charsNeeded; |
317 | } |
318 | } |
319 | |
// Byte-order selector taken by the QUtf16 and QUtf32 conversion functions.
enum DataEndianness
{
    // Default argument of the QString/QByteArray-returning QUtf16 and QUtf32
    // overloads. NOTE(review): presumably "work out the byte order from the
    // data" - confirm in the implementation.
    DetectEndianness = 0,
    BigEndianness = 1,
    LittleEndianness = 2
};
326 | |
327 | struct QUtf8 |
328 | { |
329 | Q_CORE_EXPORT static QChar *convertToUnicode(QChar *buffer, QByteArrayView in) noexcept; |
330 | static QString convertToUnicode(QByteArrayView in); |
331 | Q_CORE_EXPORT static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state); |
332 | static QChar *convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state); |
333 | Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView in); |
334 | Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView in, QStringConverterBase::State *state); |
335 | static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state); |
336 | struct ValidUtf8Result { |
337 | bool isValidUtf8; |
338 | bool isValidAscii; |
339 | }; |
340 | static ValidUtf8Result isValidUtf8(QByteArrayView in); |
341 | static int compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept; |
342 | static int compareUtf8(QByteArrayView utf8, QLatin1String s); |
343 | }; |
344 | |
345 | struct QUtf16 |
346 | { |
347 | Q_CORE_EXPORT static QString convertToUnicode(QByteArrayView, QStringConverter::State *, DataEndianness = DetectEndianness); |
348 | static QChar *convertToUnicode(QChar *out, QByteArrayView, QStringConverter::State *state, DataEndianness endian); |
349 | Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView, QStringConverter::State *, DataEndianness = DetectEndianness); |
350 | static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian); |
351 | }; |
352 | |
353 | struct QUtf32 |
354 | { |
355 | static QChar *convertToUnicode(QChar *out, QByteArrayView, QStringConverter::State *state, DataEndianness endian); |
356 | Q_CORE_EXPORT static QString convertToUnicode(QByteArrayView, QStringConverter::State *, DataEndianness = DetectEndianness); |
357 | Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView, QStringConverter::State *, DataEndianness = DetectEndianness); |
358 | static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian); |
359 | }; |
360 | |
361 | struct Q_CORE_EXPORT QLocal8Bit |
362 | { |
363 | #if !defined(Q_OS_WIN) || defined(QT_BOOTSTRAPPED) |
364 | static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state) |
365 | { return QUtf8::convertToUnicode(in, state); } |
366 | static QByteArray convertFromUnicode(QStringView in, QStringConverter::State *state) |
367 | { return QUtf8::convertFromUnicode(in, state); } |
368 | #else |
369 | static QString convertToUnicode(QByteArrayView, QStringConverter::State *); |
370 | static QByteArray convertFromUnicode(QStringView, QStringConverter::State *); |
371 | #endif |
372 | }; |
373 | |
374 | QT_END_NAMESPACE |
375 | |
376 | #endif // QSTRINGCONVERTER_P_H |
377 | |