qstringconverter_p.h source code [Qt/src/corelib/text/qstringconverter_p.h]

1	/****************************************************************************
2	**
3	** Copyright (C) 2020 The Qt Company Ltd.
4	** Copyright (C) 2020 Intel Corporation.
5	** Contact: https://www.qt.io/licensing/
6	**
7	** This file is part of the QtCore module of the Qt Toolkit.
8	**
9	** $QT_BEGIN_LICENSE:LGPL$
10	** Commercial License Usage
11	** Licensees holding valid commercial Qt licenses may use this file in
12	** accordance with the commercial license agreement provided with the
13	** Software or, alternatively, in accordance with the terms contained in
14	** a written agreement between you and The Qt Company. For licensing terms
15	** and conditions see https://www.qt.io/terms-conditions. For further
16	** information use the contact form at https://www.qt.io/contact-us.
17	**
18	** GNU Lesser General Public License Usage
19	** Alternatively, this file may be used under the terms of the GNU Lesser
20	** General Public License version 3 as published by the Free Software
21	** Foundation and appearing in the file LICENSE.LGPL3 included in the
22	** packaging of this file. Please review the following information to
23	** ensure the GNU Lesser General Public License version 3 requirements
24	** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
25	**
26	** GNU General Public License Usage
27	** Alternatively, this file may be used under the terms of the GNU
28	** General Public License version 2.0 or (at your option) the GNU General
29	** Public license version 3 or any later version approved by the KDE Free
30	** Qt Foundation. The licenses are as published by the Free Software
31	** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
32	** included in the packaging of this file. Please review the following
33	** information to ensure the GNU General Public License requirements will
34	** be met: https://www.gnu.org/licenses/gpl-2.0.html and
35	** https://www.gnu.org/licenses/gpl-3.0.html.
36	**
37	** $QT_END_LICENSE$
38	**
39	****************************************************************************/
40
41	#ifndef QSTRINGCONVERTER_P_H
42	#define QSTRINGCONVERTER_P_H
43
44	//
45	// W A R N I N G
46	// -------------
47	//
48	// This file is not part of the Qt API. It exists purely as an
49	// implementation detail. This header file may change from version to
50	// version without notice, or even be removed.
51	//
52	// We mean it.
53	//
54
55	#include <QtCore/qstring.h>
56	#include <QtCore/qendian.h>
57	#include <QtCore/qstringconverter.h>
58
59	QT_BEGIN_NAMESPACE
60
61	#ifndef __cpp_char8_t
62	enum char8_t : uchar {};
63	#endif
64
65	struct QUtf8BaseTraits
66	{
67	static const bool isTrusted = false;
68	static const bool allowNonCharacters = true;
69	static const bool skipAsciiHandling = false;
70	static const int Error = -`1`;
71	static const int EndOfString = -`2`;
72
73	static bool isValidCharacter(uint u)
74	{ return int(u) >= `0`; }
75
76	static void appendByte(uchar *&ptr, uchar b)
77	{ *ptr++ = b; }
78
79	static void appendByte(char8_t *&ptr, char8_t b)
80	{ *ptr++ = b; }
81
82	static uchar peekByte(const uchar *ptr, qsizetype n = `0`)
83	{ return ptr[n]; }
84
85	static uchar peekByte(const char8_t ptr, int* n = `0`)
86	{ return ptr[n]; }
87
88	static qptrdiff availableBytes(const uchar ptr, const* uchar *end)
89	{ return end - ptr; }
90
91	static qptrdiff availableBytes(const char8_t ptr, const* char8_t *end)
92	{ return end - ptr; }
93
94	static void advanceByte(const uchar *&ptr, qsizetype n = `1`)
95	{ ptr += n; }
96
97	static void advanceByte(const char8_t &ptr, int* n = `1`)
98	{ ptr += n; }
99
100	static void appendUtf16(ushort *&ptr, ushort uc)
101	{ *ptr++ = uc; }
102
103	static void appendUtf16(char16_t *&ptr, ushort uc)
104	{ ptr++ = char16_t*(uc); }
105
106	static void appendUcs4(ushort *&ptr, uint uc)
107	{
108	appendUtf16(ptr, QChar::highSurrogate(uc));
109	appendUtf16(ptr, QChar::lowSurrogate(uc));
110	}
111
112	static void appendUcs4(char16_t &ptr, char32_t* uc)
113	{
114	appendUtf16(ptr, QChar::highSurrogate(uc));
115	appendUtf16(ptr, QChar::lowSurrogate(uc));
116	}
117
118	static ushort peekUtf16(const ushort *ptr, qsizetype n = `0`)
119	{ return ptr[n]; }
120
121	static ushort peekUtf16(const char16_t ptr, int* n = `0`)
122	{ return ptr[n]; }
123
124	static qptrdiff availableUtf16(const ushort ptr, const* ushort *end)
125	{ return end - ptr; }
126
127	static qptrdiff availableUtf16(const char16_t ptr, const* char16_t *end)
128	{ return end - ptr; }
129
130	static void advanceUtf16(const ushort *&ptr, qsizetype n = `1`)
131	{ ptr += n; }
132
133	static void advanceUtf16(const char16_t &ptr, int* n = `1`)
134	{ ptr += n; }
135
136	// it's possible to output to UCS-4 too
137	static void appendUtf16(uint *&ptr, ushort uc)
138	{ *ptr++ = uc; }
139
140	static void appendUtf16(char32_t *&ptr, ushort uc)
141	{ ptr++ = char32_t*(uc); }
142
143	static void appendUcs4(uint *&ptr, uint uc)
144	{ *ptr++ = uc; }
145
146	static void appendUcs4(char32_t *&ptr, uint uc)
147	{ ptr++ = char32_t*(uc); }
148	};
149
150	struct QUtf8BaseTraitsNoAscii : public QUtf8BaseTraits
151	{
152	static const bool skipAsciiHandling = true;
153	};
154
155	namespace QUtf8Functions
156	{
157	/// returns 0 on success; errors can only happen if \a u is a surrogate:
158	/// Error if \a u is a low surrogate;
159	/// if \a u is a high surrogate, Error if the next isn't a low one,
160	/// EndOfString if we run into the end of the string.
161	template <typename Traits, typename OutputPtr, typename InputPtr> inline
162	int toUtf8(ushort u, OutputPtr &dst, InputPtr &src, InputPtr end)
163	{
164	if (!Traits::skipAsciiHandling && u < `0x80`) {
165	// U+0000 to U+007F (US-ASCII) - one byte
166	Traits::appendByte(dst, uchar(u));
167	return `0`;
168	} else if (u < `0x0800`) {
169	// U+0080 to U+07FF - two bytes
170	// first of two bytes
171	Traits::appendByte(dst, `0xc0` \| uchar(u >> `6`));
172	} else {
173	if (!QChar::isSurrogate(u)) {
174	// U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes
175	if (!Traits::allowNonCharacters && QChar::isNonCharacter(u))
176	return Traits::Error;
177
178	// first of three bytes
179	Traits::appendByte(dst, `0xe0` \| uchar(u >> `12`));
180	} else {
181	// U+10000 to U+10FFFF - four bytes
182	// need to get one extra codepoint
183	if (Traits::availableUtf16(src, end) == `0`)
184	return Traits::EndOfString;
185
186	ushort low = Traits::peekUtf16(src);
187	if (!QChar::isHighSurrogate(u))
188	return Traits::Error;
189	if (!QChar::isLowSurrogate(low))
190	return Traits::Error;
191
192	Traits::advanceUtf16(src);
193	uint ucs4 = QChar::surrogateToUcs4(u, low);
194
195	if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4))
196	return Traits::Error;
197
198	// first byte
199	Traits::appendByte(dst, `0xf0` \| (uchar(ucs4 >> `18`) & `0xf`));
200
201	// second of four bytes
202	Traits::appendByte(dst, `0x80` \| (uchar(ucs4 >> `12`) & `0x3f`));
203
204	// for the rest of the bytes
205	u = ushort(ucs4);
206	}
207
208	// second to last byte
209	Traits::appendByte(dst, `0x80` \| (uchar(u >> `6`) & `0x3f`));
210	}
211
212	// last byte
213	Traits::appendByte(dst, `0x80` \| (u & `0x3f`));
214	return `0`;
215	}
216
217	inline bool isContinuationByte(uchar b)
218	{
219	return (b & `0xc0`) == `0x80`;
220	}
221
222	/// returns the number of characters consumed (including \a b) in case of success;
223	/// returns negative in case of error: Traits::Error or Traits::EndOfString
224	template <typename Traits, typename OutputPtr, typename InputPtr> inline
225	qsizetype fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end)
226	{
227	qsizetype charsNeeded;
228	uint min_uc;
229	uint uc;
230
231	if (!Traits::skipAsciiHandling && b < `0x80`) {
232	// US-ASCII
233	Traits::appendUtf16(dst, b);
234	return `1`;
235	}
236
237	if (!Traits::isTrusted && Q_UNLIKELY(b <= `0xC1`)) {
238	// an UTF-8 first character must be at least 0xC0
239	// however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences
240	return Traits::Error;
241	} else if (b < `0xe0`) {
242	charsNeeded = `2`;
243	min_uc = `0x80`;
244	uc = b & `0x1f`;
245	} else if (b < `0xf0`) {
246	charsNeeded = `3`;
247	min_uc = `0x800`;
248	uc = b & `0x0f`;
249	} else if (b < `0xf5`) {
250	charsNeeded = `4`;
251	min_uc = `0x10000`;
252	uc = b & `0x07`;
253	} else {
254	// the last Unicode character is U+10FFFF
255	// it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF"
256	// therefore, a byte higher than 0xF4 is not the UTF-8 first byte
257	return Traits::Error;
258	}
259
260	qptrdiff bytesAvailable = Traits::availableBytes(src, end);
261	if (Q_UNLIKELY(bytesAvailable < charsNeeded - `1`)) {
262	// it's possible that we have an error instead of just unfinished bytes
263	if (bytesAvailable > `0` && !isContinuationByte(Traits::peekByte(src, `0`)))
264	return Traits::Error;
265	if (bytesAvailable > `1` && !isContinuationByte(Traits::peekByte(src, `1`)))
266	return Traits::Error;
267	return Traits::EndOfString;
268	}
269
270	// first continuation character
271	b = Traits::peekByte(src, `0`);
272	if (!isContinuationByte(b))
273	return Traits::Error;
274	uc <<= `6`;
275	uc \|= b & `0x3f`;
276
277	if (charsNeeded > `2`) {
278	// second continuation character
279	b = Traits::peekByte(src, `1`);
280	if (!isContinuationByte(b))
281	return Traits::Error;
282	uc <<= `6`;
283	uc \|= b & `0x3f`;
284
285	if (charsNeeded > `3`) {
286	// third continuation character
287	b = Traits::peekByte(src, `2`);
288	if (!isContinuationByte(b))
289	return Traits::Error;
290	uc <<= `6`;
291	uc \|= b & `0x3f`;
292	}
293	}
294
295	// we've decoded something; safety-check it
296	if (!Traits::isTrusted) {
297	if (uc < min_uc)
298	return Traits::Error;
299	if (QChar::isSurrogate(uc) \|\| uc > QChar::LastValidCodePoint)
300	return Traits::Error;
301	if (!Traits::allowNonCharacters && QChar::isNonCharacter(uc))
302	return Traits::Error;
303	}
304
305	// write the UTF-16 sequence
306	if (!QChar::requiresSurrogates(uc)) {
307	// UTF-8 decoded and no surrogates are required
308	// detach if necessary
309	Traits::appendUtf16(dst, ushort(uc));
310	} else {
311	// UTF-8 decoded to something that requires a surrogate pair
312	Traits::appendUcs4(dst, uc);
313	}
314
315	Traits::advanceByte(src, charsNeeded - `1`);
316	return charsNeeded;
317	}
318	}
319
// Byte-order selector taken by the QUtf16 and QUtf32 converters.
// DetectEndianness is the default argument of their QString/QByteArray
// overloads.
// NOTE(review): how detection is performed (presumably via a byte-order
// mark) is implemented outside this header - confirm in qstringconverter.cpp.
enum DataEndianness
{
    DetectEndianness,
    BigEndianness,
    LittleEndianness
};
326
327	struct QUtf8
328	{
329	Q_CORE_EXPORT static QChar convertToUnicode(QChar buffer, QByteArrayView in) noexcept;
330	static QString convertToUnicode(QByteArrayView in);
331	Q_CORE_EXPORT static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state);
332	static QChar convertToUnicode(QChar out, QByteArrayView in, QStringConverter::State *state);
333	Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView in);
334	Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView in, QStringConverterBase::State *state);
335	static char convertFromUnicode(char* out, QStringView in, QStringConverter::State state);
336	struct ValidUtf8Result {
337	bool isValidUtf8;
338	bool isValidAscii;
339	};
340	static ValidUtf8Result isValidUtf8(QByteArrayView in);
341	static int compareUtf8(QByteArrayView utf8, QStringView utf16) noexcept;
342	static int compareUtf8(QByteArrayView utf8, QLatin1String s);
343	};
344
345	struct QUtf16
346	{
347	Q_CORE_EXPORT static QString convertToUnicode(QByteArrayView, QStringConverter::State *, DataEndianness = DetectEndianness);
348	static QChar convertToUnicode(QChar out, QByteArrayView, QStringConverter::State *state, DataEndianness endian);
349	Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView, QStringConverter::State *, DataEndianness = DetectEndianness);
350	static char convertFromUnicode(char* out, QStringView in, QStringConverter::State state, DataEndianness endian);
351	};
352
353	struct QUtf32
354	{
355	static QChar convertToUnicode(QChar out, QByteArrayView, QStringConverter::State *state, DataEndianness endian);
356	Q_CORE_EXPORT static QString convertToUnicode(QByteArrayView, QStringConverter::State *, DataEndianness = DetectEndianness);
357	Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView, QStringConverter::State *, DataEndianness = DetectEndianness);
358	static char convertFromUnicode(char* out, QStringView in, QStringConverter::State state, DataEndianness endian);
359	};
360
361	struct Q_CORE_EXPORT QLocal8Bit
362	{
363	#if !defined(Q_OS_WIN) \|\| defined(QT_BOOTSTRAPPED)
364	static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state)
365	{ return QUtf8::convertToUnicode(in, state); }
366	static QByteArray convertFromUnicode(QStringView in, QStringConverter::State *state)
367	{ return QUtf8::convertFromUnicode(in, state); }
368	#else
369	static QString convertToUnicode(QByteArrayView, QStringConverter::State *);
370	static QByteArray convertFromUnicode(QStringView, QStringConverter::State *);
371	#endif
372	};
373
374	QT_END_NAMESPACE
375
376	#endif // QSTRINGCONVERTER_P_H
377

Browse the source code of Qt/src/corelib/text/qstringconverter_p.h