qchar.cpp source code [Qt/src/corelib/text/qchar.cpp]

1	/****************************************************************************
2	**
3	** Copyright (C) 2020 The Qt Company Ltd.
4	** Contact: https://www.qt.io/licensing/
5	**
6	** This file is part of the QtCore module of the Qt Toolkit.
7	**
8	** $QT_BEGIN_LICENSE:LGPL$
9	** Commercial License Usage
10	** Licensees holding valid commercial Qt licenses may use this file in
11	** accordance with the commercial license agreement provided with the
12	** Software or, alternatively, in accordance with the terms contained in
13	** a written agreement between you and The Qt Company. For licensing terms
14	** and conditions see https://www.qt.io/terms-conditions. For further
15	** information use the contact form at https://www.qt.io/contact-us.
16	**
17	** GNU Lesser General Public License Usage
18	** Alternatively, this file may be used under the terms of the GNU Lesser
19	** General Public License version 3 as published by the Free Software
20	** Foundation and appearing in the file LICENSE.LGPL3 included in the
21	** packaging of this file. Please review the following information to
22	** ensure the GNU Lesser General Public License version 3 requirements
23	** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24	**
25	** GNU General Public License Usage
26	** Alternatively, this file may be used under the terms of the GNU
27	** General Public License version 2.0 or (at your option) the GNU General
28	** Public license version 3 or any later version approved by the KDE Free
29	** Qt Foundation. The licenses are as published by the Free Software
30	** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31	** included in the packaging of this file. Please review the following
32	** information to ensure the GNU General Public License requirements will
33	** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34	** https://www.gnu.org/licenses/gpl-3.0.html.
35	**
36	** $QT_END_LICENSE$
37	**
38	****************************************************************************/
39
40	// Don't define it while compiling this module, or USERS of Qt will
41	// not be able to link.
42	#ifdef QT_NO_CAST_FROM_ASCII
43	# undef QT_NO_CAST_FROM_ASCII
44	#endif
45	#ifdef QT_NO_CAST_TO_ASCII
46	# undef QT_NO_CAST_TO_ASCII
47	#endif
48	#include "qchar.h"
49
50	#include "qdatastream.h"
51
52	#include "qunicodetables_p.h"
53	#include "qunicodetables.cpp"
54
55	#include <algorithm>
56
57	QT_BEGIN_NAMESPACE
58
// FLAG(x) yields an int with only bit number x set. The classification
// functions in this file OR several FLAG(<category>) values together into a
// mask and then test the FLAG() of a character's category against that mask.
59	#define FLAG(x) (1 << (x))
60
61	/!*
62	\class QLatin1Char
63	\inmodule QtCore
64	\reentrant
65	\brief The QLatin1Char class provides an 8-bit ASCII/Latin-1 character.
66
67	\ingroup string-processing
68
69	This class is only useful to construct a QChar from an 8-bit character.
70
71	\sa QChar, QLatin1String, QString
72	*/
73
74	/!*
75	\fn const char QLatin1Char::toLatin1() const
76
77	Converts a Latin-1 character to an 8-bit ASCII representation of the character.
78	*/
79
80	/!*
81	\fn QLatin1Char::unicode() const
82
83	Converts a Latin-1 character to a 16-bit-encoded Unicode representation
84	of the character.
85	*/
86
87	/!*
88	\fn QLatin1Char::QLatin1Char(char c)
89
90	Constructs a Latin-1 character for \a c. This constructor should be
91	used when the encoding of the input character is known to be Latin-1.
92	*/
93
94	/!*
95	\class QChar
96	\inmodule QtCore
97	\brief The QChar class provides a 16-bit Unicode character.
98
99	\ingroup string-processing
100	\reentrant
101
102	In Qt, Unicode characters are 16-bit entities without any markup
103	or structure. This class represents such an entity. It is
104	lightweight, so it can be used everywhere. Most compilers treat
105	it like an \c{unsigned short}.
106
107	QChar provides a full complement of testing/classification
108	functions, converting to and from other formats, converting from
109	composed to decomposed Unicode, and trying to compare and
110	case-convert if you ask it to.
111
112	The classification functions include functions like those in the
113	standard C++ header \<cctype\> (formerly \<ctype.h\>), but
114	operating on the full range of Unicode characters, not just for the ASCII
115	range. They all return true if the character is a certain type of character;
116	otherwise they return false. These classification functions are
117	isNull() (returns \c true if the character is '\\0'), isPrint()
118	(true if the character is any sort of printable character,
119	including whitespace), isPunct() (any sort of punctuation),
120	isMark() (Unicode Mark), isLetter() (a letter), isNumber() (any
121	sort of numeric character, not just 0-9), isLetterOrNumber(), and
122	isDigit() (decimal digits). All of these are wrappers around
123	category(), which returns the Unicode-defined category of each
124	character. Some of these also calculate the derived properties
125	(for example isSpace() returns \c true if the character is of category
126	Separator_* or an exceptional code point from Other_Control category).
127
128	QChar also provides direction(), which indicates the "natural"
129	writing direction of this character. The joiningType() function
130	indicates how the character joins with its neighbors (needed
131	mostly for Arabic or Syriac) and finally hasMirrored(), which indicates
132	whether the character needs to be mirrored when it is printed in
133	its "unnatural" writing direction.
134
135	Composed Unicode characters (like \a ring) can be converted to
136	decomposed Unicode ("a" followed by "ring above") by using decomposition().
137
138	In Unicode, comparison is not necessarily possible and case
139	conversion is very difficult at best. Unicode, covering the
140	"entire" world, also includes most of the world's case and
141	sorting problems. operator==() and friends will do comparison
142	based purely on the numeric Unicode value (code point) of the
143	characters, and toUpper() and toLower() will do case changes when
144	the character has a well-defined uppercase/lowercase equivalent.
145	For locale-dependent comparisons, use QString::localeAwareCompare().
146
147	The conversion functions include unicode() (to a scalar),
148	toLatin1() (to scalar, but converts all non-Latin-1 characters to
149	0), row() (gives the Unicode row), cell() (gives the Unicode
150	cell), digitValue() (gives the integer value of any of the
151	numerous digit characters), and a host of constructors.
152
153	QChar provides constructors and cast operators that make it easy
154	to convert to and from traditional 8-bit \c{char}s. If you
155	defined \c QT_NO_CAST_FROM_ASCII and \c QT_NO_CAST_TO_ASCII, as
156	explained in the QString documentation, you will need to
157	explicitly call fromLatin1(), or use QLatin1Char,
158	to construct a QChar from an 8-bit \c char, and you will need to
159	call toLatin1() to get the 8-bit value back.
160
161	For more information see
162	\l{http://www.unicode.org/ucd/}{"About the Unicode Character Database"}.
163
164	\sa Unicode, QString, QLatin1Char
165	*/
166
167	/!*
168	\enum QChar::UnicodeVersion
169
170	Specifies which version of the \l{http://www.unicode.org/}{Unicode standard}
171	introduced a certain character.
172
173	\value Unicode_1_1 Version 1.1
174	\value Unicode_2_0 Version 2.0
175	\value Unicode_2_1_2 Version 2.1.2
176	\value Unicode_3_0 Version 3.0
177	\value Unicode_3_1 Version 3.1
178	\value Unicode_3_2 Version 3.2
179	\value Unicode_4_0 Version 4.0
180	\value Unicode_4_1 Version 4.1
181	\value Unicode_5_0 Version 5.0
182	\value Unicode_5_1 Version 5.1
183	\value Unicode_5_2 Version 5.2
184	\value Unicode_6_0 Version 6.0
185	\value Unicode_6_1 Version 6.1
186	\value Unicode_6_2 Version 6.2
187	\value Unicode_6_3 Version 6.3 Since Qt 5.3
188	\value Unicode_7_0 Version 7.0 Since Qt 5.5
189	\value Unicode_8_0 Version 8.0 Since Qt 5.6
190	\value Unicode_9_0 Version 9.0 Since Qt 5.11
191	\value Unicode_10_0 Version 10.0 Since Qt 5.11
192	\value Unicode_11_0 Version 11.0 Since Qt 5.15
193	\value Unicode_12_0 Version 12.0 Since Qt 5.15
194	\value Unicode_12_1 Version 12.1 Since Qt 5.15
195	\value Unicode_13_0 Version 13.0 Since Qt 5.15
196	\value Unicode_Unassigned The value is not assigned to any character
197	in version 8.0 of Unicode.
198
199	\sa unicodeVersion(), currentUnicodeVersion()
200	*/
201
202	/!*
203	\enum QChar::Category
204
205	This enum maps the Unicode character categories.
206
207	The following categories are normative in Unicode:
208
209	\value Mark_NonSpacing Unicode class name Mn
210
211	\value Mark_SpacingCombining Unicode class name Mc
212
213	\value Mark_Enclosing Unicode class name Me
214
215	\value Number_DecimalDigit Unicode class name Nd
216
217	\value Number_Letter Unicode class name Nl
218
219	\value Number_Other Unicode class name No
220
221	\value Separator_Space Unicode class name Zs
222
223	\value Separator_Line Unicode class name Zl
224
225	\value Separator_Paragraph Unicode class name Zp
226
227	\value Other_Control Unicode class name Cc
228
229	\value Other_Format Unicode class name Cf
230
231	\value Other_Surrogate Unicode class name Cs
232
233	\value Other_PrivateUse Unicode class name Co
234
235	\value Other_NotAssigned Unicode class name Cn
236
237
238	The following categories are informative in Unicode:
239
240	\value Letter_Uppercase Unicode class name Lu
241
242	\value Letter_Lowercase Unicode class name Ll
243
244	\value Letter_Titlecase Unicode class name Lt
245
246	\value Letter_Modifier Unicode class name Lm
247
248	\value Letter_Other Unicode class name Lo
249
250	\value Punctuation_Connector Unicode class name Pc
251
252	\value Punctuation_Dash Unicode class name Pd
253
254	\value Punctuation_Open Unicode class name Ps
255
256	\value Punctuation_Close Unicode class name Pe
257
258	\value Punctuation_InitialQuote Unicode class name Pi
259
260	\value Punctuation_FinalQuote Unicode class name Pf
261
262	\value Punctuation_Other Unicode class name Po
263
264	\value Symbol_Math Unicode class name Sm
265
266	\value Symbol_Currency Unicode class name Sc
267
268	\value Symbol_Modifier Unicode class name Sk
269
270	\value Symbol_Other Unicode class name So
271
272	\sa category()
273	*/
274
275	/!*
276	\enum QChar::Script
277	\since 5.1
278
279	This enum type defines the Unicode script property values.
280
281	For details about the Unicode script property values see
282	\l{http://www.unicode.org/reports/tr24/}{Unicode Standard Annex #24}.
283
284	In order to conform to C/C++ naming conventions "Script_" is prepended
285	to the codes used in the Unicode Standard.
286
287	\value Script_Unknown For unassigned, private-use, noncharacter, and surrogate code points.
288	\value Script_Inherited For characters that may be used with multiple scripts
289	and that inherit their script from the preceding characters.
290	These include nonspacing marks, enclosing marks,
291	and zero width joiner/non-joiner characters.
292	\value Script_Common For characters that may be used with multiple scripts
293	and that do not inherit their script from the preceding characters.
294
295	\value Script_Adlam Since Qt 5.11
296	\value Script_Ahom Since Qt 5.6
297	\value Script_AnatolianHieroglyphs Since Qt 5.6
298	\value Script_Arabic
299	\value Script_Armenian
300	\value Script_Avestan
301	\value Script_Balinese
302	\value Script_Bamum
303	\value Script_BassaVah Since Qt 5.5
304	\value Script_Batak
305	\value Script_Bengali
306	\value Script_Bhaiksuki Since Qt 5.11
307	\value Script_Bopomofo
308	\value Script_Brahmi
309	\value Script_Braille
310	\value Script_Buginese
311	\value Script_Buhid
312	\value Script_CanadianAboriginal
313	\value Script_Carian
314	\value Script_CaucasianAlbanian Since Qt 5.5
315	\value Script_Chakma
316	\value Script_Cham
317	\value Script_Cherokee
318	\value Script_Chorasmian Since Qt 5.15
319	\value Script_Coptic
320	\value Script_Cuneiform
321	\value Script_Cypriot
322	\value Script_Cyrillic
323	\value Script_Deseret
324	\value Script_Devanagari
325	\value Script_DivesAkuru Since Qt 5.15
326	\value Script_Dogra Since Qt 5.15
327	\value Script_Duployan Since Qt 5.5
328	\value Script_EgyptianHieroglyphs
329	\value Script_Elbasan Since Qt 5.5
330	\value Script_Elymaic Since Qt 5.15
331	\value Script_Ethiopic
332	\value Script_Georgian
333	\value Script_Glagolitic
334	\value Script_Gothic
335	\value Script_Grantha Since Qt 5.5
336	\value Script_Greek
337	\value Script_Gujarati
338	\value Script_GunjalaGondi Since Qt 5.15
339	\value Script_Gurmukhi
340	\value Script_Han
341	\value Script_Hangul
342	\value Script_HanifiRohingya Since Qt 5.15
343	\value Script_Hanunoo
344	\value Script_Hatran Since Qt 5.6
345	\value Script_Hebrew
346	\value Script_Hiragana
347	\value Script_ImperialAramaic
348	\value Script_InscriptionalPahlavi
349	\value Script_InscriptionalParthian
350	\value Script_Javanese
351	\value Script_Kaithi
352	\value Script_Kannada
353	\value Script_Katakana
354	\value Script_KayahLi
355	\value Script_Kharoshthi
356	\value Script_KhitanSmallScript Since Qt 5.15
357	\value Script_Khmer
358	\value Script_Khojki Since Qt 5.5
359	\value Script_Khudawadi Since Qt 5.5
360	\value Script_Lao
361	\value Script_Latin
362	\value Script_Lepcha
363	\value Script_Limbu
364	\value Script_LinearA Since Qt 5.5
365	\value Script_LinearB
366	\value Script_Lisu
367	\value Script_Lycian
368	\value Script_Lydian
369	\value Script_Mahajani Since Qt 5.5
370	\value Script_Makasar Since Qt 5.15
371	\value Script_Malayalam
372	\value Script_Mandaic
373	\value Script_Manichaean Since Qt 5.5
374	\value Script_Marchen Since Qt 5.11
375	\value Script_MasaramGondi Since Qt 5.11
376	\value Script_Medefaidrin Since Qt 5.15
377	\value Script_MeeteiMayek
378	\value Script_MendeKikakui Since Qt 5.5
379	\value Script_MeroiticCursive
380	\value Script_MeroiticHieroglyphs
381	\value Script_Miao
382	\value Script_Modi Since Qt 5.5
383	\value Script_Mongolian
384	\value Script_Mro Since Qt 5.5
385	\value Script_Multani Since Qt 5.6
386	\value Script_Myanmar
387	\value Script_Nabataean Since Qt 5.5
388	\value Script_Nandinagari Since Qt 5.15
389	\value Script_Newa Since Qt 5.11
390	\value Script_NewTaiLue
391	\value Script_Nko
392	\value Script_Nushu Since Qt 5.11
393	\value Script_NyiakengPuachueHmong Since Qt 5.15
394	\value Script_Ogham
395	\value Script_OlChiki
396	\value Script_OldHungarian Since Qt 5.6
397	\value Script_OldItalic
398	\value Script_OldNorthArabian Since Qt 5.5
399	\value Script_OldPermic Since Qt 5.5
400	\value Script_OldPersian
401	\value Script_OldSogdian Since Qt 5.15
402	\value Script_OldSouthArabian
403	\value Script_OldTurkic
404	\value Script_Oriya
405	\value Script_Osage Since Qt 5.11
406	\value Script_Osmanya
407	\value Script_PahawhHmong Since Qt 5.5
408	\value Script_Palmyrene Since Qt 5.5
409	\value Script_PauCinHau Since Qt 5.5
410	\value Script_PhagsPa
411	\value Script_Phoenician
412	\value Script_PsalterPahlavi Since Qt 5.5
413	\value Script_Rejang
414	\value Script_Runic
415	\value Script_Samaritan
416	\value Script_Saurashtra
417	\value Script_Sharada
418	\value Script_Shavian
419	\value Script_Siddham Since Qt 5.5
420	\value Script_SignWriting Since Qt 5.6
421	\value Script_Sinhala
422	\value Script_Sogdian Since Qt 5.15
423	\value Script_SoraSompeng
424	\value Script_Soyombo Since Qt 5.11
425	\value Script_Sundanese
426	\value Script_SylotiNagri
427	\value Script_Syriac
428	\value Script_Tagalog
429	\value Script_Tagbanwa
430	\value Script_TaiLe
431	\value Script_TaiTham
432	\value Script_TaiViet
433	\value Script_Takri
434	\value Script_Tamil
435	\value Script_Tangut Since Qt 5.11
436	\value Script_Telugu
437	\value Script_Thaana
438	\value Script_Thai
439	\value Script_Tibetan
440	\value Script_Tifinagh
441	\value Script_Tirhuta Since Qt 5.5
442	\value Script_Ugaritic
443	\value Script_Vai
444	\value Script_Wancho Since Qt 5.15
445	\value Script_WarangCiti Since Qt 5.5
446	\value Script_Yezidi Since Qt 5.15
447	\value Script_Yi
448	\value Script_ZanabazarSquare Since Qt 5.11
449
450	\omitvalue ScriptCount
451
452	\sa script()
453	*/
454
455	/!*
456	\enum QChar::Direction
457
458	This enum type defines the Unicode direction attributes. See the
459	\l{http://www.unicode.org/reports/tr9/tr9-35.html#Table_Bidirectional_Character_Types}{Unicode Standard} for a description
460	of the values.
461
462	In order to conform to C/C++ naming conventions "Dir" is prepended
463	to the codes used in the Unicode Standard.
464
465	\value DirAL
466	\value DirAN
467	\value DirB
468	\value DirBN
469	\value DirCS
470	\value DirEN
471	\value DirES
472	\value DirET
473	\value DirFSI Since Qt 5.3
474	\value DirL
475	\value DirLRE
476	\value DirLRI Since Qt 5.3
477	\value DirLRO
478	\value DirNSM
479	\value DirON
480	\value DirPDF
481	\value DirPDI Since Qt 5.3
482	\value DirR
483	\value DirRLE
484	\value DirRLI Since Qt 5.3
485	\value DirRLO
486	\value DirS
487	\value DirWS
488
489	\sa direction()
490	*/
491
492	/!*
493	\enum QChar::Decomposition
494
495	This enum type defines the Unicode decomposition attributes. See
496	the \l{http://www.unicode.org/}{Unicode Standard} for a
497	description of the values.
498
499	\value NoDecomposition
500	\value Canonical
501	\value Circle
502	\value Compat
503	\value Final
504	\value Font
505	\value Fraction
506	\value Initial
507	\value Isolated
508	\value Medial
509	\value Narrow
510	\value NoBreak
511	\value Small
512	\value Square
513	\value Sub
514	\value Super
515	\value Vertical
516	\value Wide
517
518	\sa decomposition()
519	*/
520
521	/!*
522	\enum QChar::JoiningType
523	\since 5.3
524
525	This enum type defines the Unicode joining type attributes. See the
526	\l{http://www.unicode.org/}{Unicode Standard} for a description of the values.
527
528	In order to conform to C/C++ naming conventions "Joining_" is prepended
529	to the codes used in the Unicode Standard.
530
531	\value Joining_None
532	\value Joining_Causing
533	\value Joining_Dual
534	\value Joining_Right
535	\value Joining_Left
536	\value Joining_Transparent
537
538	\sa joiningType()
539	*/
540
541	/!*
542	\enum QChar::CombiningClass
543
544	\internal
545
546	This enum type defines names for some of the Unicode combining
547	classes. See the \l{http://www.unicode.org/}{Unicode Standard}
548	for a description of the values.
549
550	\value Combining_Above
551	\value Combining_AboveAttached
552	\value Combining_AboveLeft
553	\value Combining_AboveLeftAttached
554	\value Combining_AboveRight
555	\value Combining_AboveRightAttached
556	\value Combining_Below
557	\value Combining_BelowAttached
558	\value Combining_BelowLeft
559	\value Combining_BelowLeftAttached
560	\value Combining_BelowRight
561	\value Combining_BelowRightAttached
562	\value Combining_DoubleAbove
563	\value Combining_DoubleBelow
564	\value Combining_IotaSubscript
565	\value Combining_Left
566	\value Combining_LeftAttached
567	\value Combining_Right
568	\value Combining_RightAttached
569	*/
570
571	/!*
572	\enum QChar::SpecialCharacter
573
574	\value Null A QChar with this value isNull().
575	\value Tabulation Character tabulation.
576	\value LineFeed
577	\value FormFeed
578	\value CarriageReturn
579	\value Space
580	\value Nbsp Non-breaking space.
581	\value SoftHyphen
582	\value ReplacementCharacter The character shown when a font has no glyph
583	for a certain codepoint. A special question mark character is often
584	used. Codecs use this codepoint when input data cannot be
585	represented in Unicode.
586	\value ObjectReplacementCharacter Used to represent an object such as an
587	image when such objects cannot be presented.
588	\value ByteOrderMark
589	\value ByteOrderSwapped
590	\value ParagraphSeparator
591	\value LineSeparator
592	\value LastValidCodePoint
593	*/
594
595	/!*
596	\fn void QChar::setCell(uchar cell)
597	\internal
598	*/
599
600	/!*
601	\fn void QChar::setRow(uchar row)
602	\internal
603	*/
604
605	/!*
606	\fn QChar::QChar()
607
608	Constructs a null QChar ('\\0').
609
610	\sa isNull()
611	*/
612
613	/!*
614	\fn QChar::QChar(QLatin1Char ch)
615
616	Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
617	*/
618
619	/!*
620	\fn QChar::QChar(SpecialCharacter ch)
621
622	Constructs a QChar for the predefined character value \a ch.
623	*/
624
625	/!*
626	\fn QChar::QChar(char16_t ch)
627	\since 5.10
628
629	Constructs a QChar corresponding to the UTF-16 character \a ch.
630	*/
631
632	/!*
633	\fn QChar::QChar(wchar_t ch)
634	\since 5.10
635
636	Constructs a QChar corresponding to the wide character \a ch.
637
638	\note This constructor is only available on Windows.
639	*/
640
641	/!*
642	\fn QChar::QChar(char ch)
643
644	Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
645
646	\note This constructor is not available when \c QT_NO_CAST_FROM_ASCII
647	is defined.
648
649	\sa QT_NO_CAST_FROM_ASCII
650	*/
651
652	/!*
653	\fn QChar::QChar(uchar ch)
654
655	Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
656
657	\note This constructor is not available when \c QT_NO_CAST_FROM_ASCII
658	or \c QT_RESTRICTED_CAST_FROM_ASCII is defined.
659
660	\sa QT_NO_CAST_FROM_ASCII, QT_RESTRICTED_CAST_FROM_ASCII
661	*/
662
663	/!*
664	\fn QChar::QChar(uchar cell, uchar row)
665
666	Constructs a QChar for Unicode cell \a cell in row \a row.
667
668	\sa cell(), row()
669	*/
670
671	/!*
672	\fn QChar::QChar(ushort code)
673
674	Constructs a QChar for the character with Unicode code point \a code.
675	*/
676
677	/!*
678	\fn QChar::QChar(short code)
679
680	Constructs a QChar for the character with Unicode code point \a code.
681	*/
682
683	/!*
684	\fn QChar::QChar(uint code)
685
686	Constructs a QChar for the character with Unicode code point \a code.
687	*/
688
689	/!*
690	\fn QChar::QChar(int code)
691
692	Constructs a QChar for the character with Unicode code point \a code.
693	*/
694
695	/!*
696	\fn static QChar QChar::fromUcs2(char16_t c)
697	\since 6.0
698
699	Constructs a QChar from UTF-16 character \a c.
700
701	\sa fromUcs4()
702	*/
703
704	/!*
705	\fn static auto QChar::fromUcs4(char32_t c)
706	\since 6.0
707
708	Returns an anonymous struct that
709	\list
710	\li contains a \c{char16_t chars[2]} array,
711	\li can be implicitly converted to a QStringView, and
712	\li iterated over with a C++11 ranged for loop.
713	\endlist
714
715	If \a c requires surrogates, \c{chars[0]} contains the high surrogate
716	and \c{chars[1]} the low surrogate, and the QStringView has size 2.
717	Otherwise, \c{chars[0]} contains \a c and \c{chars[1]} is
718	\l{QChar::isNull}{null}, and the QStringView has size 1.
719
720	This allows easy use of the result:
721
722	\code
723	QString s;
724	s += QChar::fromUcs4(ch);
725	\endcode
726
727	\code
728	for (char16_t c16 : QChar::fromUcs4(ch))
729	use(c16);
730	\endcode
731
732	\sa fromUcs2(), requiresSurrogates()
733	*/
734
735	/!*
736	\fn bool QChar::isNull() const
737
738	Returns \c true if the character is the Unicode character 0x0000
739	('\\0'); otherwise returns \c false.
740	*/
741
742	/!*
743	\fn uchar QChar::cell() const
744
745	Returns the cell (least significant byte) of the Unicode character.
746
747	\sa row()
748	*/
749
750	/!*
751	\fn uchar QChar::row() const
752
753	Returns the row (most significant byte) of the Unicode character.
754
755	\sa cell()
756	*/
757
758	/!*
759	\fn bool QChar::isPrint() const
760
761	Returns \c true if the character is a printable character; otherwise
762	returns \c false. This is any character not of category Other_*.
763
764	Note that this gives no indication of whether the character is
765	available in a particular font.
766	*/
767
768	/!*
769	\overload
770	\since 5.0
771
772	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
773	a printable character; otherwise returns \c false.
774	This is any character not of category Other_.*
775
776	Note that this gives no indication of whether the character is
777	available in a particular font.
778
779	\note Before Qt 6, this function took a \c uint argument.
780	*/
781	bool QChar::isPrint(char32_t ucs4) noexcept
782	{
783	if (ucs4 > LastValidCodePoint)
784	return false;
785	const int test = FLAG(Other_Control) \|
786	FLAG(Other_Format) \|
787	FLAG(Other_Surrogate) \|
788	FLAG(Other_PrivateUse) \|
789	FLAG(Other_NotAssigned);
790	return !(FLAG(qGetProp(ucs4)->category) & test);
791	}
792
793	/!*
794	\fn bool QChar::isSpace() const
795
796	Returns \c true if the character is a separator character
797	(Separator_* categories or certain code points from Other_Control category);
798	otherwise returns \c false.
799	*/
800
801	/!*
802	\fn bool QChar::isSpace(char32_t ucs4)
803	\overload
804	\since 5.0
805
806	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
807	a separator character (Separator_* categories or certain code points
808	from Other_Control category); otherwise returns \c false.
809
810	\note Before Qt 6, this function took a \c uint argument.
811	*/
812
813	/!*
814	\internal
815	*/
816	bool QT_FASTCALL QChar::isSpace_helper(char32_t ucs4) noexcept
817	{
818	if (ucs4 > LastValidCodePoint)
819	return false;
820	const int test = FLAG(Separator_Space) \|
821	FLAG(Separator_Line) \|
822	FLAG(Separator_Paragraph);
823	return FLAG(qGetProp(ucs4)->category) & test;
824	}
825
826	/!*
827	\fn bool QChar::isMark() const
828
829	Returns \c true if the character is a mark (Mark_* categories);
830	otherwise returns \c false.
831
832	See QChar::Category for more information regarding marks.
833	*/
834
835	/!*
836	\overload
837	\since 5.0
838
839	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
840	a mark (Mark_ categories); otherwise returns \c false.*
841
842	\note Before Qt 6, this function took a \c uint argument.
843	*/
844	bool QChar::isMark(char32_t ucs4) noexcept
845	{
846	if (ucs4 > LastValidCodePoint)
847	return false;
848	const int test = FLAG(Mark_NonSpacing) \|
849	FLAG(Mark_SpacingCombining) \|
850	FLAG(Mark_Enclosing);
851	return FLAG(qGetProp(ucs4)->category) & test;
852	}
853
854	/!*
855	\fn bool QChar::isPunct() const
856
857	Returns \c true if the character is a punctuation mark (Punctuation_*
858	categories); otherwise returns \c false.
859	*/
860
861	/!*
862	\overload
863	\since 5.0
864
865	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
866	a punctuation mark (Punctuation_ categories); otherwise returns \c false.*
867
868	\note Before Qt 6, this function took a \c uint argument.
869	*/
870	bool QChar::isPunct(char32_t ucs4) noexcept
871	{
872	if (ucs4 > LastValidCodePoint)
873	return false;
874	const int test = FLAG(Punctuation_Connector) \|
875	FLAG(Punctuation_Dash) \|
876	FLAG(Punctuation_Open) \|
877	FLAG(Punctuation_Close) \|
878	FLAG(Punctuation_InitialQuote) \|
879	FLAG(Punctuation_FinalQuote) \|
880	FLAG(Punctuation_Other);
881	return FLAG(qGetProp(ucs4)->category) & test;
882	}
883
884	/!*
885	\fn bool QChar::isSymbol() const
886
887	Returns \c true if the character is a symbol (Symbol_* categories);
888	otherwise returns \c false.
889	*/
890
891	/!*
892	\overload
893	\since 5.0
894
895	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
896	a symbol (Symbol_ categories); otherwise returns \c false.*
897
898	\note Before Qt 6, this function took a \c uint argument.
899	*/
900	bool QChar::isSymbol(char32_t ucs4) noexcept
901	{
902	if (ucs4 > LastValidCodePoint)
903	return false;
904	const int test = FLAG(Symbol_Math) \|
905	FLAG(Symbol_Currency) \|
906	FLAG(Symbol_Modifier) \|
907	FLAG(Symbol_Other);
908	return FLAG(qGetProp(ucs4)->category) & test;
909	}
910
911	/!*
912	\fn bool QChar::isLetter() const
913
914	Returns \c true if the character is a letter (Letter_* categories);
915	otherwise returns \c false.
916	*/
917
918	/!*
919	\fn bool QChar::isLetter(char32_t ucs4)
920	\overload
921	\since 5.0
922
923	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
924	a letter (Letter_* categories); otherwise returns \c false.
925
926	\note Before Qt 6, this function took a \c uint argument.
927	*/
928
929	/!*
930	\internal
931	*/
932	bool QT_FASTCALL QChar::isLetter_helper(char32_t ucs4) noexcept
933	{
934	if (ucs4 > LastValidCodePoint)
935	return false;
936	const int test = FLAG(Letter_Uppercase) \|
937	FLAG(Letter_Lowercase) \|
938	FLAG(Letter_Titlecase) \|
939	FLAG(Letter_Modifier) \|
940	FLAG(Letter_Other);
941	return FLAG(qGetProp(ucs4)->category) & test;
942	}
943
944	/!*
945	\fn bool QChar::isNumber() const
946
947	Returns \c true if the character is a number (Number_* categories,
948	not just 0-9); otherwise returns \c false.
949
950	\sa isDigit()
951	*/
952
953	/!*
954	\fn bool QChar::isNumber(char32_t ucs4)
955	\overload
956	\since 5.0
957
958	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
959	a number (Number_* categories, not just 0-9); otherwise returns \c false.
960
961	\note Before Qt 6, this function took a \c uint argument.
962
963	\sa isDigit()
964	*/
965
966	/!*
967	\internal
968	*/
969	bool QT_FASTCALL QChar::isNumber_helper(char32_t ucs4) noexcept
970	{
971	if (ucs4 > LastValidCodePoint)
972	return false;
973	const int test = FLAG(Number_DecimalDigit) \|
974	FLAG(Number_Letter) \|
975	FLAG(Number_Other);
976	return FLAG(qGetProp(ucs4)->category) & test;
977	}
978
979	/!*
980	\fn bool QChar::isLetterOrNumber() const
981
982	Returns \c true if the character is a letter or number (Letter_* or
983	Number_* categories); otherwise returns \c false.
984	*/
985
986	/!*
987	\fn bool QChar::isLetterOrNumber(char32_t ucs4)
988	\overload
989	\since 5.0
990
991	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
992	a letter or number (Letter_* or Number_* categories); otherwise returns \c false.
993
994	\note Before Qt 6, this function took a \c uint argument.
995	*/
996
997	/!*
998	\internal
999	*/
1000	bool QT_FASTCALL QChar::isLetterOrNumber_helper(char32_t ucs4) noexcept
1001	{
1002	if (ucs4 > LastValidCodePoint)
1003	return false;
1004	const int test = FLAG(Letter_Uppercase) \|
1005	FLAG(Letter_Lowercase) \|
1006	FLAG(Letter_Titlecase) \|
1007	FLAG(Letter_Modifier) \|
1008	FLAG(Letter_Other) \|
1009	FLAG(Number_DecimalDigit) \|
1010	FLAG(Number_Letter) \|
1011	FLAG(Number_Other);
1012	return FLAG(qGetProp(ucs4)->category) & test;
1013	}
1014
1015	/!*
1016	\fn bool QChar::isDigit() const
1017
1018	Returns \c true if the character is a decimal digit
1019	(Number_DecimalDigit); otherwise returns \c false.
1020
1021	\sa isNumber()
1022	*/
1023
1024	/!*
1025	\fn bool QChar::isDigit(char32_t ucs4)
1026	\overload
1027	\since 5.0
1028
1029	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
1030	a decimal digit (Number_DecimalDigit); otherwise returns \c false.
1031
1032	\note Before Qt 6, this function took a \c uint argument.
1033
1034	\sa isNumber()
1035	*/
1036
1037	/!*
1038	\fn bool QChar::isNonCharacter() const
1039	\since 5.0
1040
1041	Returns \c true if the QChar is a non-character; false otherwise.
1042
1043	Unicode has a certain number of code points that are classified
1044	as "non-characters:" that is, they can be used for internal purposes
1045	in applications but cannot be used for text interchange.
1046	Those are the last two entries of each Unicode Plane ([0xfffe..0xffff],
1047	[0x1fffe..0x1ffff], etc.) as well as the entries in range [0xfdd0..0xfdef].
1048	*/
1049
1050	/!*
1051	\fn bool QChar::isHighSurrogate() const
1052
1053	Returns \c true if the QChar is the high part of a UTF16 surrogate
1054	(for example if its code point is in range [0xd800..0xdbff]); false otherwise.
1055	*/
1056
1057	/!*
1058	\fn bool QChar::isLowSurrogate() const
1059
1060	Returns \c true if the QChar is the low part of a UTF16 surrogate
1061	(for example if its code point is in range [0xdc00..0xdfff]); false otherwise.
1062	*/
1063
1064	/!*
1065	\fn bool QChar::isSurrogate() const
1066	\since 5.0
1067
1068	Returns \c true if the QChar contains a code point that is in either
1069	the high or the low part of the UTF-16 surrogate range
1070	(for example if its code point is in range [0xd800..0xdfff]); false otherwise.
1071	*/
1072
1073	/!*
1074	\fn static bool QChar::isNonCharacter(char32_t ucs4)
1075	\overload
1076	\since 5.0
1077
1078	Returns \c true if the UCS-4-encoded character specified by \a ucs4
1079	is a non-character; false otherwise.
1080
1081	Unicode has a certain number of code points that are classified
1082	as "non-characters:" that is, they can be used for internal purposes
1083	in applications but cannot be used for text interchange.
    Those are the last two entries of each Unicode Plane ([0xfffe..0xffff],
    [0x1fffe..0x1ffff], etc.) as well as the entries in range [0xfdd0..0xfdef].
1086
1087	\note Before Qt 6, this function took a \c uint argument.
1088	*/
1089
1090	/!*
1091	\fn static bool QChar::isHighSurrogate(char32_t ucs4)
1092	\overload
1093
1094	Returns \c true if the UCS-4-encoded character specified by \a ucs4
1095	is the high part of a UTF16 surrogate
1096	(for example if its code point is in range [0xd800..0xdbff]); false otherwise.
1097
1098	\note Before Qt 6, this function took a \c uint argument.
1099	*/
1100
1101	/!*
1102	\fn static bool QChar::isLowSurrogate(char32_t ucs4)
1103	\overload
1104
1105	Returns \c true if the UCS-4-encoded character specified by \a ucs4
1106	is the low part of a UTF16 surrogate
1107	(for example if its code point is in range [0xdc00..0xdfff]); false otherwise.
1108
1109	\note Before Qt 6, this function took a \c uint argument.
1110	*/
1111
1112	/!*
1113	\fn static bool QChar::isSurrogate(char32_t ucs4)
1114	\overload
1115	\since 5.0
1116
1117	Returns \c true if the UCS-4-encoded character specified by \a ucs4
1118	contains a code point that is in either the high or the low part of the
1119	UTF-16 surrogate range (for example if its code point is in range [0xd800..0xdfff]);
1120	false otherwise.
1121
1122	\note Before Qt 6, this function took a \c uint argument.
1123	*/
1124
1125	/!*
1126	\fn static bool QChar::requiresSurrogates(char32_t ucs4)
1127
    Returns \c true if the UCS-4-encoded character specified by \a ucs4
    can be split into the high and low parts of a UTF16 surrogate
    (that is, if its code point is greater than or equal to 0x10000);
    false otherwise.
1132
1133	\note Before Qt 6, this function took a \c uint argument.
1134	*/
1135
1136	/!*
1137	\fn static char32_t QChar::surrogateToUcs4(char16_t high, char16_t low)
1138
    Converts a UTF16 surrogate pair with the given \a high and \a low values
    to its UCS-4-encoded code point.
1141
1142	\note Before Qt 6, this function took \c ushort arguments and returned \c uint.
1143	*/
1144
1145	/!*
1146	\fn static char32_t QChar::surrogateToUcs4(QChar high, QChar low)
1147	\overload
1148
    Converts a UTF16 surrogate pair (\a high, \a low) to its UCS-4-encoded code point.
1150
1151	\note Before Qt 6, this function returned \c uint.
1152	*/
1153
1154	/!*
1155	\fn static char16_t QChar::highSurrogate(char32_t ucs4)
1156
1157	Returns the high surrogate part of a UCS-4-encoded code point.
1158	The returned result is undefined if \a ucs4 is smaller than 0x10000.
1159
1160	\note Before Qt 6, this function took a \c uint argument and returned \c ushort.
1161	*/
1162
1163	/!*
1164	\fn static char16_t QChar::lowSurrogate(char32_t ucs4)
1165
1166	Returns the low surrogate part of a UCS-4-encoded code point.
1167	The returned result is undefined if \a ucs4 is smaller than 0x10000.
1168
1169	\note Before Qt 6, this function took a \c uint argument and returned \c ushort.
1170	*/
1171
1172	/!*
1173	\fn int QChar::digitValue() const
1174
1175	Returns the numeric value of the digit, or -1 if the character is not a digit.
1176	*/
1177
1178	/!*
1179	\overload
1180	Returns the numeric value of the digit specified by the UCS-4-encoded
1181	character, \a ucs4, or -1 if the character is not a digit.
1182
1183	\note Before Qt 6, this function took a \c uint argument.
1184	*/
1185	int QChar::digitValue(char32_t ucs4) noexcept
1186	{
1187	if (ucs4 > LastValidCodePoint)
1188	return -`1`;
1189	return qGetProp(ucs4)->digitValue;
1190	}
1191
1192	/!*
1193	\fn QChar::Category QChar::category() const
1194
1195	Returns the character's category.
1196	*/
1197
1198	/!*
1199	\overload
1200	Returns the category of the UCS-4-encoded character specified by \a ucs4.
1201
1202	\note Before Qt 6, this function took a \c uint argument.
1203	*/
1204	QChar::Category QChar::category(char32_t ucs4) noexcept
1205	{
1206	if (ucs4 > LastValidCodePoint)
1207	return QChar::Other_NotAssigned;
1208	return (QChar::Category) qGetProp(ucs4)->category;
1209	}
1210
1211	/!*
1212	\fn QChar::Direction QChar::direction() const
1213
1214	Returns the character's direction.
1215	*/
1216
1217	/!*
1218	\overload
1219	Returns the direction of the UCS-4-encoded character specified by \a ucs4.
1220
1221	\note Before Qt 6, this function took a \c uint argument.
1222	*/
1223	QChar::Direction QChar::direction(char32_t ucs4) noexcept
1224	{
1225	if (ucs4 > LastValidCodePoint)
1226	return QChar::DirL;
1227	return (QChar::Direction) qGetProp(ucs4)->direction;
1228	}
1229
1230	/!*
1231	\fn QChar::JoiningType QChar::joiningType() const
1232	\since 5.3
1233
1234	Returns information about the joining type attributes of the character
1235	(needed for certain languages such as Arabic or Syriac).
1236	*/
1237
1238	/!*
1239	\overload
1240	\since 5.3
1241
1242	Returns information about the joining type attributes of the UCS-4-encoded
1243	character specified by \a ucs4
1244	(needed for certain languages such as Arabic or Syriac).
1245
1246	\note Before Qt 6, this function took a \c uint argument.
1247	*/
1248	QChar::JoiningType QChar::joiningType(char32_t ucs4) noexcept
1249	{
1250	if (ucs4 > LastValidCodePoint)
1251	return QChar::Joining_None;
1252	return QChar::JoiningType(qGetProp(ucs4)->joining);
1253	}
1254
1255	/!*
1256	\fn bool QChar::hasMirrored() const
1257
1258	Returns \c true if the character should be reversed if the text
1259	direction is reversed; otherwise returns \c false.
1260
1261	A bit faster equivalent of (ch.mirroredChar() != ch).
1262
1263	\sa mirroredChar()
1264	*/
1265
1266	/!*
1267	\overload
1268	\since 5.0
1269
1270	Returns \c true if the UCS-4-encoded character specified by \a ucs4
1271	should be reversed if the text direction is reversed; otherwise returns \c false.
1272
1273	A bit faster equivalent of (QChar::mirroredChar(ucs4) != ucs4).
1274
1275	\note Before Qt 6, this function took a \c uint argument.
1276
1277	\sa mirroredChar()
1278	*/
1279	bool QChar::hasMirrored(char32_t ucs4) noexcept
1280	{
1281	if (ucs4 > LastValidCodePoint)
1282	return false;
1283	return qGetProp(ucs4)->mirrorDiff != `0`;
1284	}
1285
1286	/!*
1287	\fn bool QChar::isLower() const
1288
1289	Returns \c true if the character is a lowercase letter, for example
1290	category() is Letter_Lowercase.
1291
1292	\sa isUpper(), toLower(), toUpper()
1293	*/
1294
1295	/!*
1296	\fn static bool QChar::isLower(char32_t ucs4)
1297	\overload
1298	\since 5.0
1299
1300	Returns \c true if the UCS-4-encoded character specified by \a ucs4
1301	is a lowercase letter, for example category() is Letter_Lowercase.
1302
1303	\note Before Qt 6, this function took a \c uint argument.
1304
1305	\sa isUpper(), toLower(), toUpper()
1306	*/
1307
1308	/!*
1309	\fn bool QChar::isUpper() const
1310
1311	Returns \c true if the character is an uppercase letter, for example
1312	category() is Letter_Uppercase.
1313
1314	\sa isLower(), toUpper(), toLower()
1315	*/
1316
1317	/!*
1318	\fn static bool QChar::isUpper(char32_t ucs4)
1319	\overload
1320	\since 5.0
1321
1322	Returns \c true if the UCS-4-encoded character specified by \a ucs4
1323	is an uppercase letter, for example category() is Letter_Uppercase.
1324
1325	\note Before Qt 6, this function took a \c uint argument.
1326
1327	\sa isLower(), toUpper(), toLower()
1328	*/
1329
1330	/!*
1331	\fn bool QChar::isTitleCase() const
1332
1333	Returns \c true if the character is a titlecase letter, for example
1334	category() is Letter_Titlecase.
1335
1336	\sa isLower(), toUpper(), toLower(), toTitleCase()
1337	*/
1338
1339	/!*
1340	\fn static bool QChar::isTitleCase(char32_t ucs4)
1341	\overload
1342	\since 5.0
1343
1344	Returns \c true if the UCS-4-encoded character specified by \a ucs4
1345	is a titlecase letter, for example category() is Letter_Titlecase.
1346
1347	\note Before Qt 6, this function took a \c uint argument.
1348
1349	\sa isLower(), toUpper(), toLower(), toTitleCase()
1350	*/
1351	/!*
1352	\fn QChar QChar::mirroredChar() const
1353
1354	Returns the mirrored character if this character is a mirrored
1355	character; otherwise returns the character itself.
1356
1357	\sa hasMirrored()
1358	*/
1359
1360	/!*
1361	\overload
1362	Returns the mirrored character if the UCS-4-encoded character specified
1363	by \a ucs4 is a mirrored character; otherwise returns the character itself.
1364
1365	\note Before Qt 6, this function took a \c uint argument and returned \c uint.
1366
1367	\sa hasMirrored()
1368	*/
1369	char32_t QChar::mirroredChar(char32_t ucs4) noexcept
1370	{
1371	if (ucs4 > LastValidCodePoint)
1372	return ucs4;
1373	return ucs4 + qGetProp(ucs4)->mirrorDiff;
1374	}
1375
1376
// constants for Hangul (de)composition, see UAX #15
enum {
    Hangul_SBase = 0xac00,  // first code point of the algorithmically handled syllable range
    Hangul_LBase = 0x1100,  // base for the L (leading) component
    Hangul_VBase = 0x1161,  // base for the V (vowel) component
    Hangul_TBase = 0x11a7,  // base for the T (trailing) component; TBase itself means "no T"
    Hangul_LCount = 19,
    Hangul_VCount = 21,
    Hangul_TCount = 28,
    Hangul_NCount = Hangul_VCount * Hangul_TCount,  // syllables per L component
    Hangul_SCount = Hangul_LCount * Hangul_NCount   // size of the syllable range
};
1389
1390	// buffer has to have a length of 3. It's needed for Hangul decomposition
1391	static const unsigned short * QT_FASTCALL decompositionHelper
1392	(uint ucs4, qsizetype length, int* tag, unsigned* short *buffer)
1393	{
1394	if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount) {
1395	// compute Hangul syllable decomposition as per UAX #15
1396	const uint SIndex = ucs4 - Hangul_SBase;
1397	buffer[`0`] = Hangul_LBase + SIndex / Hangul_NCount; // L
1398	buffer[`1`] = Hangul_VBase + (SIndex % Hangul_NCount) / Hangul_TCount; // V
1399	buffer[`2`] = Hangul_TBase + SIndex % Hangul_TCount; // T
1400	*length = buffer[`2`] == Hangul_TBase ? `2` : `3`;
1401	*tag = QChar::Canonical;
1402	return buffer;
1403	}
1404
1405	const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
1406	if (index == `0xffff`) {
1407	*length = `0`;
1408	*tag = QChar::NoDecomposition;
1409	return nullptr;
1410	}
1411
1412	const unsigned short *decomposition = uc_decomposition_map+index;
1413	tag = (decomposition) & `0xff`;
1414	length = (decomposition) >> `8`;
1415	return decomposition+`1`;
1416	}
1417
1418	/!*
    Decomposes a character into its constituent parts. Returns an empty string
    if no decomposition exists.
1421	*/
1422	QString QChar::decomposition() const
1423	{
1424	return QChar::decomposition(ucs);
1425	}
1426
1427	/!*
1428	\overload
    Decomposes the UCS-4-encoded character specified by \a ucs4 into its
    constituent parts. Returns an empty string if no decomposition exists.
1431
1432	\note Before Qt 6, this function took a \c uint argument.
1433	*/
1434	QString QChar::decomposition(char32_t ucs4)
1435	{
1436	unsigned short buffer[`3`];
1437	qsizetype length;
1438	int tag;
1439	const unsigned short *d = decompositionHelper(ucs4, &length, &tag, buffer);
1440	return QString (reinterpret_cast<const QChar *>(d), length);
1441	}
1442
1443	/!*
1444	\fn QChar::Decomposition QChar::decompositionTag() const
1445
    Returns the tag defining the decomposition of the character. Returns
    QChar::NoDecomposition if no decomposition exists.
1448	*/
1449
1450	/!*
1451	\overload
    Returns the tag defining the decomposition of the UCS-4-encoded character
    specified by \a ucs4. Returns QChar::NoDecomposition if no decomposition exists.
1454
1455	\note Before Qt 6, this function took a \c uint argument.
1456	*/
1457	QChar::Decomposition QChar::decompositionTag(char32_t ucs4) noexcept
1458	{
1459	if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount)
1460	return QChar::Canonical;
1461	const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
1462	if (index == `0xffff`)
1463	return QChar::NoDecomposition;
1464	return (QChar::Decomposition)(uc_decomposition_map[index] & `0xff`);
1465	}
1466
1467	/!*
1468	\fn unsigned char QChar::combiningClass() const
1469
1470	Returns the combining class for the character as defined in the
1471	Unicode standard. This is mainly useful as a positioning hint for
1472	marks attached to a base character.
1473
1474	The Qt text rendering engine uses this information to correctly
1475	position non-spacing marks around a base character.
1476	*/
1477
1478	/!*
1479	\overload
1480	Returns the combining class for the UCS-4-encoded character specified by
1481	\a ucs4, as defined in the Unicode standard.
1482
1483	\note Before Qt 6, this function took a \c uint argument.
1484	*/
1485	unsigned char QChar::combiningClass(char32_t ucs4) noexcept
1486	{
1487	if (ucs4 > LastValidCodePoint)
1488	return `0`;
1489	return (unsigned char) qGetProp(ucs4)->combiningClass;
1490	}
1491
1492	/!*
1493	\fn QChar::Script QChar::script() const
1494	\since 5.1
1495
1496	Returns the Unicode script property value for this character.
1497	*/
1498
1499	/!*
1500	\overload
1501	\since 5.1
1502
1503	Returns the Unicode script property value for the character specified in
1504	its UCS-4-encoded form as \a ucs4.
1505
1506	\note Before Qt 6, this function took a \c uint argument.
1507	*/
1508	QChar::Script QChar::script(char32_t ucs4) noexcept
1509	{
1510	if (ucs4 > LastValidCodePoint)
1511	return QChar::Script_Unknown;
1512	return (QChar::Script) qGetProp(ucs4)->script;
1513	}
1514
1515	/!*
1516	\fn QChar::UnicodeVersion QChar::unicodeVersion() const
1517
1518	Returns the Unicode version that introduced this character.
1519	*/
1520
1521	/!*
1522	\overload
1523	Returns the Unicode version that introduced the character specified in
1524	its UCS-4-encoded form as \a ucs4.
1525
1526	\note Before Qt 6, this function took a \c uint argument.
1527	*/
1528	QChar::UnicodeVersion QChar::unicodeVersion(char32_t ucs4) noexcept
1529	{
1530	if (ucs4 > LastValidCodePoint)
1531	return QChar::Unicode_Unassigned;
1532	return (QChar::UnicodeVersion) qGetProp(ucs4)->unicodeVersion;
1533	}
1534
1535	/!*
1536	Returns the most recent supported Unicode version.
1537	*/
1538	QChar::UnicodeVersion QChar::currentUnicodeVersion() noexcept
1539	{
1540	return UNICODE_DATA_VERSION;
1541	}
1542
1543	static auto fullConvertCase(char32_t uc, QUnicodeTables::Case which) noexcept
1544	{
1545	struct R {
1546	char16_t chars[MaxSpecialCaseLength + `1`];
1547	qint8 sz;
1548
1549	// iterable
1550	auto begin() const { return chars; }
1551	auto end() const { return chars + sz; }
1552	// QStringView-compatible
1553	auto data() const { return chars; }
1554	auto size() const { return sz; }
1555	} result;
1556	Q_ASSERT(uc <= QChar::LastValidCodePoint);
1557
1558	auto pp = result.chars;
1559
1560	const auto fold = qGetProp(uc)->cases[which];
1561	const auto caseDiff = fold.diff;
1562
1563	if (Q_UNLIKELY(fold.special)) {
1564	const auto *specialCase = specialCaseMap + caseDiff;
1565	auto length = *specialCase++;
1566	while (length--)
1567	pp++ = specialCase++;
1568	} else {
1569	// so far, case convertion never changes planes (guaranteed by the qunicodetables generator)
1570	for (char16_t c : QChar::fromUcs4(uc + caseDiff))
1571	*pp++ = c;
1572	}
1573	result.sz = pp - result.chars;
1574	return result;
1575	}
1576
1577	template <typename T>
1578	Q_DECL_CONST_FUNCTION static inline T convertCase_helper(T uc, QUnicodeTables::Case which) noexcept
1579	{
1580	const auto fold = qGetProp(uc)->cases[which];
1581
1582	if (Q_UNLIKELY(fold.special)) {
1583	const ushort *specialCase = specialCaseMap + fold.diff;
1584	// so far, there are no special cases beyond BMP (guaranteed by the qunicodetables generator)
1585	return *specialCase == `1` ? specialCase[`1`] : uc;
1586	}
1587
1588	return uc + fold.diff;
1589	}
1590
1591	/!*
1592	\fn QChar QChar::toLower() const
1593
1594	Returns the lowercase equivalent if the character is uppercase or titlecase;
1595	otherwise returns the character itself.
1596	*/
1597
1598	/!*
1599	\overload
1600	Returns the lowercase equivalent of the UCS-4-encoded character specified
1601	by \a ucs4 if the character is uppercase or titlecase; otherwise returns
1602	the character itself.
1603
1604	\note Before Qt 6, this function took a \c uint argument and returned \c uint.
1605	*/
1606	char32_t QChar::toLower(char32_t ucs4) noexcept
1607	{
1608	if (ucs4 > LastValidCodePoint)
1609	return ucs4;
1610	return convertCase_helper(ucs4, QUnicodeTables::LowerCase);
1611	}
1612
1613	/!*
1614	\fn QChar QChar::toUpper() const
1615
1616	Returns the uppercase equivalent if the character is lowercase or titlecase;
1617	otherwise returns the character itself.
1618	*/
1619
1620	/!*
1621	\overload
1622	Returns the uppercase equivalent of the UCS-4-encoded character specified
1623	by \a ucs4 if the character is lowercase or titlecase; otherwise returns
1624	the character itself.
1625
1626	\note Before Qt 6, this function took a \c uint argument and returned \c uint.
1627	*/
1628	char32_t QChar::toUpper(char32_t ucs4) noexcept
1629	{
1630	if (ucs4 > LastValidCodePoint)
1631	return ucs4;
1632	return convertCase_helper(ucs4, QUnicodeTables::UpperCase);
1633	}
1634
1635	/!*
1636	\fn QChar QChar::toTitleCase() const
1637
1638	Returns the title case equivalent if the character is lowercase or uppercase;
1639	otherwise returns the character itself.
1640	*/
1641
1642	/!*
1643	\overload
1644	Returns the title case equivalent of the UCS-4-encoded character specified
1645	by \a ucs4 if the character is lowercase or uppercase; otherwise returns
1646	the character itself.
1647
1648	\note Before Qt 6, this function took a \c uint argument and returned \c uint.
1649	*/
1650	char32_t QChar::toTitleCase(char32_t ucs4) noexcept
1651	{
1652	if (ucs4 > LastValidCodePoint)
1653	return ucs4;
1654	return convertCase_helper(ucs4, QUnicodeTables::TitleCase);
1655	}
1656
1657	static inline char32_t foldCase(const char16_t ch, const* char16_t *start)
1658	{
1659	char32_t ucs4 = *ch;
1660	if (QChar::isLowSurrogate(ucs4) && ch > start && QChar::isHighSurrogate(*(ch - `1`)))
1661	ucs4 = QChar::surrogateToUcs4(*(ch - `1`), ucs4);
1662	return convertCase_helper(ucs4, QUnicodeTables::CaseFold);
1663	}
1664
1665	static inline char32_t foldCase(char32_t ch, char32_t &last) noexcept
1666	{
1667	char32_t ucs4 = ch;
1668	if (QChar::isLowSurrogate(ucs4) && QChar::isHighSurrogate(last))
1669	ucs4 = QChar::surrogateToUcs4(last, ucs4);
1670	last = ch;
1671	return convertCase_helper(ucs4, QUnicodeTables::CaseFold);
1672	}
1673
1674	static inline char16_t foldCase(char16_t ch) noexcept
1675	{
1676	return convertCase_helper(ch, QUnicodeTables::CaseFold);
1677	}
1678
1679	static inline QChar foldCase(QChar ch) noexcept
1680	{
1681	return QChar (foldCase(ch.unicode()));
1682	}
1683
1684	/!*
1685	\fn QChar QChar::toCaseFolded() const
1686
1687	Returns the case folded equivalent of the character.
1688	For most Unicode characters this is the same as toLower().
1689	*/
1690
1691	/!*
1692	\overload
1693	Returns the case folded equivalent of the UCS-4-encoded character specified
1694	by \a ucs4. For most Unicode characters this is the same as toLower().
1695
1696	\note Before Qt 6, this function took a \c uint argument and returned \c uint.
1697	*/
1698	char32_t QChar::toCaseFolded(char32_t ucs4) noexcept
1699	{
1700	if (ucs4 > LastValidCodePoint)
1701	return ucs4;
1702	return convertCase_helper(ucs4, QUnicodeTables::CaseFold);
1703	}
1704
1705	/!*
1706	\fn char QChar::toLatin1() const
1707
1708	Returns the Latin-1 character equivalent to the QChar, or 0. This
1709	is mainly useful for non-internationalized software.
1710
1711	\note It is not possible to distinguish a non-Latin-1 character from a Latin-1 0
1712	(NUL) character. Prefer to use unicode(), which does not have this ambiguity.
1713
1714	\sa unicode()
1715	*/
1716
1717	/!*
1718	\fn QChar QChar::fromLatin1(char)
1719
1720	Converts the Latin-1 character \a c to its equivalent QChar. This
1721	is mainly useful for non-internationalized software.
1722
1723	An alternative is to use QLatin1Char.
1724
1725	\sa toLatin1(), unicode()
1726	*/
1727
1728	#ifndef QT_NO_DATASTREAM
1729	/!*
1730	\relates QChar
1731
1732	Writes the char \a chr to the stream \a out.
1733
1734	\sa {Serializing Qt Data Types}
1735	*/
1736	QDataStream &operator<<(QDataStream &out, QChar chr)
1737	{
1738	out << quint16(chr.unicode());
1739	return out;
1740	}
1741
1742	/!*
1743	\relates QChar
1744
1745	Reads a char from the stream \a in into char \a chr.
1746
1747	\sa {Serializing Qt Data Types}
1748	*/
1749	QDataStream &operator>>(QDataStream &in, QChar &chr)
1750	{
1751	quint16 u;
1752	in >> u;
1753	chr.unicode() = char16_t(u);
1754	return in;
1755	}
1756	#endif // QT_NO_DATASTREAM
1757
1758	/!*
1759	\fn QChar::unicode()
1760
1761	Returns a reference to the numeric Unicode value of the QChar.
1762	*/
1763
1764	/!*
1765	\fn QChar::unicode() const
1766
1767	Returns the numeric Unicode value of the QChar.
1768	*/
1769
1770	/*****************************************************************************
1771	Documentation of QChar related functions
1772	*****************************************************************************/
1773
1774	/!*
1775	\fn bool QChar::operator==(QChar c1, QChar c2)
1776
1777	Returns \c true if \a c1 and \a c2 are the same Unicode character;
1778	otherwise returns \c false.
1779	*/
1780
1781	/!*
1782	\fn int QChar::operator!=(QChar c1, QChar c2)
1783
1784	Returns \c true if \a c1 and \a c2 are not the same Unicode
1785	character; otherwise returns \c false.
1786	*/
1787
1788	/!*
1789	\fn int QChar::operator<=(QChar c1, QChar c2)
1790
1791	Returns \c true if the numeric Unicode value of \a c1 is less than
1792	or equal to that of \a c2; otherwise returns \c false.
1793	*/
1794
1795	/!*
1796	\fn int QChar::operator>=(QChar c1, QChar c2)
1797
1798	Returns \c true if the numeric Unicode value of \a c1 is greater than
1799	or equal to that of \a c2; otherwise returns \c false.
1800	*/
1801
1802	/!*
1803	\fn int QChar::operator<(QChar c1, QChar c2)
1804
1805	Returns \c true if the numeric Unicode value of \a c1 is less than
1806	that of \a c2; otherwise returns \c false.
1807	*/
1808
1809	/!*
1810	\fn int QChar::operator>(QChar c1, QChar c2)
1811
1812	Returns \c true if the numeric Unicode value of \a c1 is greater than
1813	that of \a c2; otherwise returns \c false.
1814	*/
1815
1816
1817	// ---------------------------------------------------------------------------
1818
1819
1820	static void decomposeHelper(QString str, bool* canonical, QChar::UnicodeVersion version, qsizetype from)
1821	{
1822	qsizetype length;
1823	int tag;
1824	unsigned short buffer[`3`];
1825
1826	QString &s = *str;
1827
1828	const unsigned short utf16 = reinterpret_cast<unsigned* short *>(s.data());
1829	const unsigned short *uc = utf16 + s.length();
1830	while (uc != utf16 + from) {
1831	uint ucs4 = *(--uc);
1832	if (QChar (ucs4).isLowSurrogate() && uc != utf16) {
1833	ushort high = *(uc - `1`);
1834	if (QChar (high).isHighSurrogate()) {
1835	--uc;
1836	ucs4 = QChar::surrogateToUcs4(high, ucs4);
1837	}
1838	}
1839
1840	if (QChar::unicodeVersion(ucs4) > version)
1841	continue;
1842
1843	const unsigned short *d = decompositionHelper(ucs4, &length, &tag, buffer);
1844	if (!d \|\| (canonical && tag != QChar::Canonical))
1845	continue;
1846
1847	qsizetype pos = uc - utf16;
1848	s.replace(pos, QChar::requiresSurrogates(ucs4) ? `2` : `1`, reinterpret_cast<const QChar *>(d), length);
1849	// since the replace invalidates the pointers and we do decomposition recursive
1850	utf16 = reinterpret_cast<unsigned short *>(s.data());
1851	uc = utf16 + pos + length;
1852	}
1853	}
1854
1855
1856	struct UCS2Pair {
1857	ushort u1;
1858	ushort u2;
1859	};
1860
1861	inline bool operator<(const UCS2Pair &ligature1, const UCS2Pair &ligature2)
1862	{ return ligature1.u1 < ligature2.u1; }
1863	inline bool operator<(ushort u1, const UCS2Pair &ligature)
1864	{ return u1 < ligature.u1; }
1865	inline bool operator<(const UCS2Pair &ligature, ushort u1)
1866	{ return ligature.u1 < u1; }
1867
1868	struct UCS2SurrogatePair {
1869	UCS2Pair p1;
1870	UCS2Pair p2;
1871	};
1872
1873	inline bool operator<(const UCS2SurrogatePair &ligature1, const UCS2SurrogatePair &ligature2)
1874	{ return QChar::surrogateToUcs4(ligature1.p1.u1, ligature1.p1.u2) < QChar::surrogateToUcs4(ligature2.p1.u1, ligature2.p1.u2); }
1875	inline bool operator<(uint u1, const UCS2SurrogatePair &ligature)
1876	{ return u1 < QChar::surrogateToUcs4(ligature.p1.u1, ligature.p1.u2); }
1877	inline bool operator<(const UCS2SurrogatePair &ligature, uint u1)
1878	{ return QChar::surrogateToUcs4(ligature.p1.u1, ligature.p1.u2) < u1; }
1879
1880	static uint inline ligatureHelper(uint u1, uint u2)
1881	{
1882	if (u1 >= Hangul_LBase && u1 <= Hangul_SBase + Hangul_SCount) {
1883	// compute Hangul syllable composition as per UAX #15
1884	// hangul L-V pair
1885	const uint LIndex = u1 - Hangul_LBase;
1886	if (LIndex < Hangul_LCount) {
1887	const uint VIndex = u2 - Hangul_VBase;
1888	if (VIndex < Hangul_VCount)
1889	return Hangul_SBase + (LIndex * Hangul_VCount + VIndex) * Hangul_TCount;
1890	}
1891	// hangul LV-T pair
1892	const uint SIndex = u1 - Hangul_SBase;
1893	if (SIndex < Hangul_SCount && (SIndex % Hangul_TCount) == `0`) {
1894	const uint TIndex = u2 - Hangul_TBase;
1895	if (TIndex <= Hangul_TCount)
1896	return u1 + TIndex;
1897	}
1898	}
1899
1900	const unsigned short index = GET_LIGATURE_INDEX(u2);
1901	if (index == `0xffff`)
1902	return `0`;
1903	const unsigned short *ligatures = uc_ligature_map+index;
1904	ushort length = *ligatures++;
1905	if (QChar::requiresSurrogates(u1)) {
1906	const UCS2SurrogatePair data = reinterpret_cast<const* UCS2SurrogatePair *>(ligatures);
1907	const UCS2SurrogatePair *r = std::lower_bound(data, data + length, u1);
1908	if (r != data + length && QChar::surrogateToUcs4(r->p1.u1, r->p1.u2) == u1)
1909	return QChar::surrogateToUcs4(r->p2.u1, r->p2.u2);
1910	} else {
1911	const UCS2Pair data = reinterpret_cast<const* UCS2Pair *>(ligatures);
1912	const UCS2Pair *r = std::lower_bound(data, data + length, ushort(u1));
1913	if (r != data + length && r->u1 == ushort(u1))
1914	return r->u2;
1915	}
1916
1917	return `0`;
1918	}
1919
1920	static void composeHelper(QString *str, QChar::UnicodeVersion version, qsizetype from)
1921	{
1922	QString &s = *str;
1923
1924	if (from < `0` \|\| s.length() - from < `2`)
1925	return;
1926
1927	uint stcode = `0`; // starter code point
1928	qsizetype starter = -`1`; // starter position
1929	qsizetype next = -`1`; // to prevent i == next
1930	int lastCombining = `255`; // to prevent combining > lastCombining
1931
1932	qsizetype pos = from;
1933	while (pos < s.length()) {
1934	qsizetype i = pos;
1935	char32_t uc = s.at(pos).unicode();
1936	if (QChar (uc).isHighSurrogate() && pos < s.length()-`1`) {
1937	ushort low = s.at(pos+`1`).unicode();
1938	if (QChar (low).isLowSurrogate()) {
1939	uc = QChar::surrogateToUcs4(uc, low);
1940	++pos;
1941	}
1942	}
1943
1944	const QUnicodeTables::Properties *p = qGetProp(uc);
1945	if (p->unicodeVersion > version) {
1946	starter = -`1`;
1947	next = -`1`; // to prevent i == next
1948	lastCombining = `255`; // to prevent combining > lastCombining
1949	++pos;
1950	continue;
1951	}
1952
1953	int combining = p->combiningClass;
1954	if ((i == next \|\| combining > lastCombining) && starter >= from) {
1955	// allowed to form ligature with S
1956	uint ligature = ligatureHelper(stcode, uc);
1957	if (ligature) {
1958	stcode = ligature;
1959	QChar *d = s.data();
1960	// ligatureHelper() never changes planes
1961	qsizetype j = `0`;
1962	for (QChar ch : QChar::fromUcs4(ligature))
1963	d[starter + j++] = ch;
1964	s.remove(i, j);
1965	continue;
1966	}
1967	}
1968	if (combining == `0`) {
1969	starter = i;
1970	stcode = uc;
1971	next = pos + `1`;
1972	}
1973	lastCombining = combining;
1974
1975	++pos;
1976	}
1977	}
1978
1979
1980	static void canonicalOrderHelper(QString *str, QChar::UnicodeVersion version, qsizetype from)
1981	{
1982	QString &s = *str;
1983	const qsizetype l = s.length()-`1`;
1984
1985	char32_t u1, u2;
1986	char16_t c1, c2;
1987
1988	qsizetype pos = from;
1989	while (pos < l) {
1990	qsizetype p2 = pos+`1`;
1991	u1 = s.at(pos).unicode();
1992	if (QChar::isHighSurrogate(u1)) {
1993	const char16_t low = s.at(p2).unicode();
1994	if (QChar::isLowSurrogate(low)) {
1995	u1 = QChar::surrogateToUcs4(u1, low);
1996	if (p2 >= l)
1997	break;
1998	++p2;
1999	}
2000	}
2001	c1 = `0`;
2002
2003	advance:
2004	u2 = s.at(p2).unicode();
2005	if (QChar::isHighSurrogate(u2) && p2 < l) {
2006	const char16_t low = s.at(p2+`1`).unicode();
2007	if (QChar::isLowSurrogate(low)) {
2008	u2 = QChar::surrogateToUcs4(u2, low);
2009	++p2;
2010	}
2011	}
2012
2013	c2 = `0`;
2014	{
2015	const QUnicodeTables::Properties *p = qGetProp(u2);
2016	if (p->unicodeVersion <= version)
2017	c2 = p->combiningClass;
2018	}
2019	if (c2 == `0`) {
2020	pos = p2+`1`;
2021	continue;
2022	}
2023
2024	if (c1 == `0`) {
2025	const QUnicodeTables::Properties *p = qGetProp(u1);
2026	if (p->unicodeVersion <= version)
2027	c1 = p->combiningClass;
2028	}
2029
2030	if (c1 > c2) {
2031	QChar *uc = s.data();
2032	qsizetype p = pos;
2033	// exchange characters
2034	for (QChar ch : QChar::fromUcs4(u2))
2035	uc[p++] = ch;
2036	for (QChar ch : QChar::fromUcs4(u1))
2037	uc[p++] = ch;
2038	if (pos > `0`)
2039	--pos;
2040	if (pos > `0` && s.at(pos).isLowSurrogate())
2041	--pos;
2042	} else {
2043	++pos;
2044	if (QChar::requiresSurrogates(u1))
2045	++pos;
2046
2047	u1 = u2;
2048	c1 = c2; // != 0
2049	p2 = pos + `1`;
2050	if (QChar::requiresSurrogates(u1))
2051	++p2;
2052	if (p2 > l)
2053	break;
2054
2055	goto advance;
2056	}
2057	}
2058	}
2059
2060	// returns true if the text is in a desired Normalization Form already; false otherwise.
2061	// sets lastStable to the position of the last stable code point
2062	static bool normalizationQuickCheckHelper(QString str, QString::NormalizationForm mode, qsizetype from, qsizetype lastStable)
2063	{
2064	static_assert(QString::NormalizationForm_D == `0`);
2065	static_assert(QString::NormalizationForm_C == `1`);
2066	static_assert(QString::NormalizationForm_KD == `2`);
2067	static_assert(QString::NormalizationForm_KC == `3`);
2068
2069	enum { NFQC_YES = `0`, NFQC_NO = `1`, NFQC_MAYBE = `3` };
2070
2071	const ushort string = reinterpret_cast<const* ushort *>(str->constData());
2072	qsizetype length = str->length();
2073
2074	// this avoids one out of bounds check in the loop
2075	while (length > from && QChar::isHighSurrogate(string[length - `1`]))
2076	--length;
2077
2078	uchar lastCombining = `0`;
2079	for (qsizetype i = from; i < length; ++i) {
2080	qsizetype pos = i;
2081	char32_t uc = string[i];
2082	if (uc < `0x80`) {
2083	// ASCII characters are stable code points
2084	lastCombining = `0`;
2085	*lastStable = pos;
2086	continue;
2087	}
2088
2089	if (QChar::isHighSurrogate(uc)) {
2090	ushort low = string[i + `1`];
2091	if (!QChar::isLowSurrogate(low)) {
2092	// treat surrogate like stable code point
2093	lastCombining = `0`;
2094	*lastStable = pos;
2095	continue;
2096	}
2097	++i;
2098	uc = QChar::surrogateToUcs4(uc, low);
2099	}
2100
2101	const QUnicodeTables::Properties *p = qGetProp(uc);
2102
2103	if (p->combiningClass < lastCombining && p->combiningClass > `0`)
2104	return false;
2105
2106	const uchar check = (p->nfQuickCheck >> (mode << `1`)) & `0x03`;
2107	if (check != NFQC_YES)
2108	return false; // ### can we quick check NFQC_MAYBE ?
2109
2110	lastCombining = p->combiningClass;
2111	if (lastCombining == `0`)
2112	*lastStable = pos;
2113	}
2114
2115	if (length != str->length()) // low surrogate parts at the end of text
2116	*lastStable = str->length() - `1`;
2117
2118	return true;
2119	}
2120
2121	QT_END_NAMESPACE
2122

Browse the source code of Qt/src/corelib/text/qchar.cpp