qunicodetools.cpp source code [Qt/src/corelib/text/qunicodetools.cpp]

1	/****************************************************************************
2	**
3	** Copyright (C) 2020 The Qt Company Ltd.
4	** Contact: https://www.qt.io/licensing/
5	**
6	** This file is part of the QtCore module of the Qt Toolkit.
7	**
8	** $QT_BEGIN_LICENSE:LGPL$
9	** Commercial License Usage
10	** Licensees holding valid commercial Qt licenses may use this file in
11	** accordance with the commercial license agreement provided with the
12	** Software or, alternatively, in accordance with the terms contained in
13	** a written agreement between you and The Qt Company. For licensing terms
14	** and conditions see https://www.qt.io/terms-conditions. For further
15	** information use the contact form at https://www.qt.io/contact-us.
16	**
17	** GNU Lesser General Public License Usage
18	** Alternatively, this file may be used under the terms of the GNU Lesser
19	** General Public License version 3 as published by the Free Software
20	** Foundation and appearing in the file LICENSE.LGPL3 included in the
21	** packaging of this file. Please review the following information to
22	** ensure the GNU Lesser General Public License version 3 requirements
23	** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24	**
25	** GNU General Public License Usage
26	** Alternatively, this file may be used under the terms of the GNU
27	** General Public License version 2.0 or (at your option) the GNU General
28	** Public license version 3 or any later version approved by the KDE Free
29	** Qt Foundation. The licenses are as published by the Free Software
30	** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31	** included in the packaging of this file. Please review the following
32	** information to ensure the GNU General Public License requirements will
33	** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34	** https://www.gnu.org/licenses/gpl-3.0.html.
35	**
36	** $QT_END_LICENSE$
37	**
38	****************************************************************************/
39
40	#include "qunicodetools_p.h"
41
42	#include "qunicodetables_p.h"
43	#include "qvarlengtharray.h"
44	#include "qlibrary.h"
45
// Builds an int bit mask with only bit number x set, so that a small enum
// value (e.g. a QChar::Category) can be tested against a set of such values
// with a single bitwise AND.
#define FLAG(x) (1 << (x))
47
48	QT_BEGIN_NAMESPACE
49
50	Q_AUTOTEST_EXPORT int qt_initcharattributes_default_algorithm_only = `0`;
51
52	namespace QUnicodeTools {
53
54	// -----------------------------------------------------------------------------------------------------
55	//
56	// The text boundaries determination algorithm.
57	// See http://www.unicode.org/reports/tr29/tr29-31.html
58	//
59	// -----------------------------------------------------------------------------------------------------
60
61	namespace GB {
62
63	/*
64	* Most grapheme break rules can be implemented table driven, but rules GB10, GB12 and GB13 need a bit
65	* of special treatment.
66	*/
67	enum State : uchar {
68	Break,
69	Inside,
70	GB10,
71	GB10_2,
72	GB10_3,
73	GB13, // also covers GB12
74	};
75
76	static const State breakTable[QUnicodeTables::NumGraphemeBreakClasses][QUnicodeTables::NumGraphemeBreakClasses] = {
77	// Any CR LF Control Extend ZWJ RI Prepend S-Mark L V T LV LVT E_B E_M GAZ EBG
78	{ Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any
79	{ Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
80	{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
81	{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Control
82	{ Break , Break , Break , Break , GB10_2, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , GB10_3, Break , Break }, // Extend
83	{ Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Inside, Inside }, // ZWJ
84	{ Break , Break , Break , Break , Inside, Inside, GB13 , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator
85	{ Inside, Break , Break , Break , Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside }, // Prepend
86	{ Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SpacingMark
87	{ Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Inside, Inside, Break , Inside, Inside, Break , Break , Break , Break }, // L
88	{ Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Inside, Inside, Break , Break , Break , Break , Break , Break }, // V
89	{ Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break }, // T
90	{ Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Inside, Inside, Break , Break , Break , Break , Break , Break }, // LV
91	{ Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break }, // LVT
92	{ Break , Break , Break , Break , GB10 , Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Inside, Break , Break }, // E_B
93	{ Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // E_M
94	{ Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // GAZ
95	{ Break , Break , Break , Break , GB10 , Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Inside, Break , Break }, // EBG
96	};
97
98	} // namespace GB
99
100	static void getGraphemeBreaks(const char16_t string, qsizetype len, QCharAttributes attributes)
101	{
102	QUnicodeTables::GraphemeBreakClass lcls = QUnicodeTables::GraphemeBreak_LF; // to meet GB1
103	GB::State state = GB::Break; // only required to track some of the rules
104	for (qsizetype i = `0`; i != len; ++i) {
105	qsizetype pos = i;
106	char32_t ucs4 = string[i];
107	if (QChar::isHighSurrogate(ucs4) && i + `1` != len) {
108	ushort low = string[i + `1`];
109	if (QChar::isLowSurrogate(low)) {
110	ucs4 = QChar::surrogateToUcs4(ucs4, low);
111	++i;
112	}
113	}
114
115	const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
116	QUnicodeTables::GraphemeBreakClass cls = (QUnicodeTables::GraphemeBreakClass) prop->graphemeBreakClass;
117
118	switch (GB::breakTable[lcls][cls]) {
119	case GB::Break:
120	attributes[pos].graphemeBoundary = true;
121	state = GB::Break;
122	break;
123	case GB::Inside:
124	state = GB::Break;
125	break;
126	case GB::GB10:
127	state = GB::GB10;
128	break;
129	case GB::GB10_2:
130	if (state == GB::GB10 \|\| state == GB::GB10_2)
131	state = GB::GB10_2;
132	else
133	state = GB::Break;
134	break;
135	case GB::GB10_3:
136	if (state != GB::GB10 && state != GB::GB10_2)
137	attributes[pos].graphemeBoundary = true;
138	state = GB::Break;
139	break;
140	case GB::GB13:
141	if (state != GB::GB13) {
142	state = GB::GB13;
143	} else {
144	attributes[pos].graphemeBoundary = true;
145	state = GB::Break;
146	}
147	}
148
149	lcls = cls;
150	}
151
152	attributes[len].graphemeBoundary = true; // GB2
153	}
154
155
156	namespace WB {
157
158	enum Action {
159	NoBreak,
160	Break,
161	Lookup,
162	LookupW
163	};
164
165	static const uchar breakTable[QUnicodeTables::NumWordBreakClasses][QUnicodeTables::NumWordBreakClasses] = {
166	// Any CR LF Newline Extend ZWJ Format RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtNumLet E_Base E_Mod GAZ EBG WSeg
167	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any
168	{ Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
169	{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
170	{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline
171	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend
172	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, NoBreak, Break }, // ZWJ
173	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Format
174	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator
175	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break }, // Katakana
176	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Lookup , LookupW, LookupW, Break , NoBreak, NoBreak, Break , Break , Break , Break , Break }, // HebrewLetter
177	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Break , LookupW, LookupW, Break , NoBreak, NoBreak, Break , Break , Break , Break , Break }, // ALetter
178	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote
179	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote
180	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet
181	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter
182	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum
183	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, Lookup , Break , Lookup , Break , Lookup , NoBreak, NoBreak, Break , Break , Break , Break , Break }, // Numeric
184	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , NoBreak, NoBreak, Break , Break , Break , Break , Break }, // ExtendNumLet
185	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break , Break , Break }, // E_Base
186	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // E_Mod
187	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // GAZ
188	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break , Break , Break }, // EBG
189	{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // WSeg
190	};
191
192	} // namespace WB
193
194	static void getWordBreaks(const char16_t string, qsizetype len, QCharAttributes attributes)
195	{
196	enum WordType {
197	WordTypeNone, WordTypeAlphaNumeric, WordTypeHiraganaKatakana
198	} currentWordType = WordTypeNone;
199
200	QUnicodeTables::WordBreakClass cls = QUnicodeTables::WordBreak_LF; // to meet WB1
201	for (qsizetype i = `0`; i != len; ++i) {
202	qsizetype pos = i;
203	char32_t ucs4 = string[i];
204	if (QChar::isHighSurrogate(ucs4) && i + `1` != len) {
205	ushort low = string[i + `1`];
206	if (QChar::isLowSurrogate(low)) {
207	ucs4 = QChar::surrogateToUcs4(ucs4, low);
208	++i;
209	}
210	}
211
212	const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
213	QUnicodeTables::WordBreakClass ncls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
214	#ifdef QT_BUILD_INTERNAL
215	if (qt_initcharattributes_default_algorithm_only) {
216	// as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
217	// which caused "hi.there" to be treated like if it were just a single word;
218	// we keep the pre-5.1 behavior by remapping these characters in the Unicode tables generator
219	// and this code is needed to pass the coverage tests; remove once the issue is fixed.
220	if (ucs4 == `0x002E`) // FULL STOP
221	ncls = QUnicodeTables::WordBreak_MidNumLet;
222	else if (ucs4 == `0x003A`) // COLON
223	ncls = QUnicodeTables::WordBreak_MidLetter;
224	}
225	#endif
226
227	uchar action = WB::breakTable[cls][ncls];
228	switch (action) {
229	case WB::Break:
230	break;
231	case WB::NoBreak:
232	if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend \|\| ncls == QUnicodeTables::WordBreak_ZWJ \|\| ncls == QUnicodeTables::WordBreak_Format)) {
233	// WB4: X(Extend\|Format) -> X*
234	if (cls != QUnicodeTables::WordBreak_ZWJ) // WB3c
235	continue;
236	}
237	if (Q_UNLIKELY(cls == QUnicodeTables::WordBreak_RegionalIndicator)) {
238	// WB15/WB16: break between pairs of Regional indicator
239	ncls = QUnicodeTables::WordBreak_Any;
240	}
241	break;
242	case WB::Lookup:
243	case WB::LookupW:
244	for (qsizetype lookahead = i + `1`; lookahead < len; ++lookahead) {
245	ucs4 = string[lookahead];
246	if (QChar::isHighSurrogate(ucs4) && lookahead + `1` != len) {
247	ushort low = string[lookahead + `1`];
248	if (QChar::isLowSurrogate(low)) {
249	ucs4 = QChar::surrogateToUcs4(ucs4, low);
250	++lookahead;
251	}
252	}
253
254	prop = QUnicodeTables::properties(ucs4);
255	QUnicodeTables::WordBreakClass tcls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
256
257	if (Q_UNLIKELY(tcls == QUnicodeTables::WordBreak_Extend \|\| tcls == QUnicodeTables::WordBreak_ZWJ \|\| tcls == QUnicodeTables::WordBreak_Format)) {
258	// WB4: X(Extend\|Format) -> X*
259	continue;
260	}
261
262	if (Q_LIKELY(tcls == cls \|\| (action == WB::LookupW && (tcls == QUnicodeTables::WordBreak_HebrewLetter
263	\|\| tcls == QUnicodeTables::WordBreak_ALetter)))) {
264	i = lookahead;
265	ncls = tcls;
266	action = WB::NoBreak;
267	}
268	break;
269	}
270	if (action != WB::NoBreak) {
271	action = WB::Break;
272	if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_SingleQuote && cls == QUnicodeTables::WordBreak_HebrewLetter))
273	action = WB::NoBreak; // WB7a
274	}
275	break;
276	}
277
278	cls = ncls;
279	if (action == WB::Break) {
280	attributes[pos].wordBreak = true;
281	if (currentWordType != WordTypeNone)
282	attributes[pos].wordEnd = true;
283	switch (cls) {
284	case QUnicodeTables::WordBreak_Katakana:
285	currentWordType = WordTypeHiraganaKatakana;
286	attributes[pos].wordStart = true;
287	break;
288	case QUnicodeTables::WordBreak_HebrewLetter:
289	case QUnicodeTables::WordBreak_ALetter:
290	case QUnicodeTables::WordBreak_Numeric:
291	currentWordType = WordTypeAlphaNumeric;
292	attributes[pos].wordStart = true;
293	break;
294	default:
295	currentWordType = WordTypeNone;
296	break;
297	}
298	}
299	}
300
301	if (currentWordType != WordTypeNone)
302	attributes[len].wordEnd = true;
303	attributes[len].wordBreak = true; // WB2
304	}
305
306
307	namespace SB {
308
309	enum State {
310	Initial,
311	Lower,
312	Upper,
313	LUATerm,
314	ATerm,
315	ATermC,
316	ACS,
317	STerm,
318	STermC,
319	SCS,
320	BAfterC,
321	BAfter,
322	Break,
323	Lookup
324	};
325
326	static const uchar breakTable[BAfter + `1`][QUnicodeTables::NumSentenceBreakClasses] = {
327	// Any CR LF Sep Extend Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close
328	{ Initial, BAfterC, BAfter , BAfter , Initial, Initial, Lower , Upper , Initial, Initial, ATerm , Initial, STerm , Initial }, // Initial
329	{ Initial, BAfterC, BAfter , BAfter , Lower , Initial, Initial, Initial, Initial, Initial, LUATerm, Initial, STerm , Initial }, // Lower
330	{ Initial, BAfterC, BAfter , BAfter , Upper , Initial, Initial, Upper , Initial, Initial, LUATerm, Initial, STerm , Initial }, // Upper
331
332	{ Lookup , BAfterC, BAfter , BAfter , LUATerm, ACS , Initial, Upper , Break , Initial, ATerm , STerm , STerm , ATermC }, // LUATerm
333	{ Lookup , BAfterC, BAfter , BAfter , ATerm , ACS , Initial, Break , Break , Initial, ATerm , STerm , STerm , ATermC }, // ATerm
334	{ Lookup , BAfterC, BAfter , BAfter , ATermC , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , ATermC }, // ATermC
335	{ Lookup , BAfterC, BAfter , BAfter , ACS , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , Lookup }, // ACS
336
337	{ Break , BAfterC, BAfter , BAfter , STerm , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STerm,
338	{ Break , BAfterC, BAfter , BAfter , STermC , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STermC
339	{ Break , BAfterC, BAfter , BAfter , SCS , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , Break }, // SCS
340	{ Break , Break , BAfter , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfterC
341	{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfter
342	};
343
344	} // namespace SB
345
346	static void getSentenceBreaks(const char16_t string, qsizetype len, QCharAttributes attributes)
347	{
348	uchar state = SB::BAfter; // to meet SB1
349	for (qsizetype i = `0`; i != len; ++i) {
350	qsizetype pos = i;
351	char32_t ucs4 = string[i];
352	if (QChar::isHighSurrogate(ucs4) && i + `1` != len) {
353	ushort low = string[i + `1`];
354	if (QChar::isLowSurrogate(low)) {
355	ucs4 = QChar::surrogateToUcs4(ucs4, low);
356	++i;
357	}
358	}
359
360	const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
361	QUnicodeTables::SentenceBreakClass ncls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
362
363	Q_ASSERT(state <= SB::BAfter);
364	state = SB::breakTable[state][ncls];
365	if (Q_UNLIKELY(state == SB::Lookup)) { // SB8
366	state = SB::Break;
367	for (qsizetype lookahead = i + `1`; lookahead < len; ++lookahead) {
368	ucs4 = string[lookahead];
369	if (QChar::isHighSurrogate(ucs4) && lookahead + `1` != len) {
370	ushort low = string[lookahead + `1`];
371	if (QChar::isLowSurrogate(low)) {
372	ucs4 = QChar::surrogateToUcs4(ucs4, low);
373	++lookahead;
374	}
375	}
376
377	prop = QUnicodeTables::properties(ucs4);
378	QUnicodeTables::SentenceBreakClass tcls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
379	switch (tcls) {
380	case QUnicodeTables::SentenceBreak_Any:
381	case QUnicodeTables::SentenceBreak_Extend:
382	case QUnicodeTables::SentenceBreak_Sp:
383	case QUnicodeTables::SentenceBreak_Numeric:
384	case QUnicodeTables::SentenceBreak_SContinue:
385	case QUnicodeTables::SentenceBreak_Close:
386	continue;
387	case QUnicodeTables::SentenceBreak_Lower:
388	i = lookahead;
389	state = SB::Initial;
390	break;
391	default:
392	break;
393	}
394	break;
395	}
396	}
397	if (Q_UNLIKELY(state == SB::Break)) {
398	attributes[pos].sentenceBoundary = true;
399	state = SB::breakTable[SB::Initial][ncls];
400	}
401	}
402
403	attributes[len].sentenceBoundary = true; // SB2
404	}
405
406
407	// -----------------------------------------------------------------------------------------------------
408	//
409	// The line breaking algorithm.
410	// See http://www.unicode.org/reports/tr14/tr14-39.html
411	//
412	// -----------------------------------------------------------------------------------------------------
413
414	namespace LB {
415
416	namespace NS { // Number Sequence
417
418	// LB25 recommends to not break lines inside numbers of the form
419	// described by the following regular expression:
420	// (PR\|PO)?(OP\|HY)?NU(NU\|SY\|IS)(CL\|CP)?(PR\|PO)?*
421
422	enum Action {
423	None,
424	Start,
425	Continue,
426	Break
427	};
428
429	enum Class {
430	XX,
431	PRPO,
432	OPHY,
433	NU,
434	SYIS,
435	CLCP
436	};
437
438	static const uchar actionTable[CLCP + `1`][CLCP + `1`] = {
439	// XX PRPO OPHY NU SYIS CLCP
440	{ None , Start , Start , Start , None , None }, // XX
441	{ None , Start , Continue, Continue, None , None }, // PRPO
442	{ None , Start , Start , Continue, None , None }, // OPHY
443	{ Break , Break , Break , Continue, Continue, Continue }, // NU
444	{ Break , Break , Break , Continue, Continue, Continue }, // SYIS
445	{ Break , Continue, Break , Break , Break , Break }, // CLCP
446	};
447
448	inline Class toClass(QUnicodeTables::LineBreakClass lbc, QChar::Category category)
449	{
450	switch (lbc) {
451	case QUnicodeTables::LineBreak_AL:// case QUnicodeTables::LineBreak_AI:
452	// resolve AI math symbols in numerical context to IS
453	if (category == QChar::Symbol_Math)
454	return SYIS;
455	break;
456	case QUnicodeTables::LineBreak_PR: case QUnicodeTables::LineBreak_PO:
457	return PRPO;
458	case QUnicodeTables::LineBreak_OP: case QUnicodeTables::LineBreak_HY:
459	return OPHY;
460	case QUnicodeTables::LineBreak_NU:
461	return NU;
462	case QUnicodeTables::LineBreak_SY: case QUnicodeTables::LineBreak_IS:
463	return SYIS;
464	case QUnicodeTables::LineBreak_CL: case QUnicodeTables::LineBreak_CP:
465	return CLCP;
466	default:
467	break;
468	}
469	return XX;
470	}
471
472	} // namespace NS
473
474	/ In order to support the tailored implementation of LB25 properly*
475	the following changes were made in the pair table to allow breaks
476	where the numeric expression doesn't match the template (i.e. [^NU](IS\|SY)NU):
477	(CL)(PO) from IB to DB
478	(CP)(PO) from IB to DB
479	(CL)(PR) from IB to DB
480	(CP)(PR) from IB to DB
481	(PO)(OP) from IB to DB
482	(PR)(OP) from IB to DB
483	(IS)(NU) from IB to DB
484	(SY)(NU) from IB to DB
485	*/
486
487	/ In order to implementat LB21a properly a special rule HH has been introduced and*
488	the following changes were made in the pair table to disallow breaks after Hebrew + Hyphen:
489	(HL)(HY\|BA) from IB to CI
490	(HY\|BA)(!CB) from DB to HH
491	*/
492
493	enum Action {
494	ProhibitedBreak, PB = ProhibitedBreak,
495	DirectBreak, DB = DirectBreak,
496	IndirectBreak, IB = IndirectBreak,
497	CombiningIndirectBreak, CI = CombiningIndirectBreak,
498	CombiningProhibitedBreak, CP = CombiningProhibitedBreak,
499	ProhibitedBreakAfterHebrewPlusHyphen, HH = ProhibitedBreakAfterHebrewPlusHyphen
500	};
501
502	static const uchar breakTable[QUnicodeTables::LineBreak_SA][QUnicodeTables::LineBreak_SA] = {
503	/ OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM ZWJ/
504	/ OP / { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
505	/ CL / { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
506	/ CP / { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
507	/ QU / { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
508	/ GL / { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
509	/ NS / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
510	/ EX / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
511	/ SY / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
512	/ IS / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
513	/ PR / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB, IB },
514	/ PO / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
515	/ NU / { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
516	/ AL / { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
517	/ HL / { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
518	/ ID / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
519	/ IN / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
520	/ HY / { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, HH, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, IB },
521	/ BA / { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, HH, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, IB },
522	/ BB / { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB, IB },
523	/ B2 / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
524	/ ZW / { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
525	/ CM / { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
526	/ WJ / { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
527	/ H2 / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, IB },
528	/ H3 / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, IB },
529	/ JL / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB, IB },
530	/ JV / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, IB },
531	/ JT / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, IB },
532	/ RI / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB, IB },
533	/ CB / { DB, PB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
534	/ EB / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB, IB },
535	/ EM / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
536	/ ZWJ/ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, IB }
537	};
538
539	// The following line break classes are not treated by the pair table
540	// and must be resolved outside:
541	// AI, BK, CB, CJ, CR, LF, NL, SA, SG, SP, XX
542
543	} // namespace LB
544
545	static void getLineBreaks(const char16_t string, qsizetype len, QCharAttributes attributes, QUnicodeTools::CharAttributeOptions options)
546	{
547	qsizetype nestart = `0`;
548	LB::NS::Class nelast = LB::NS::XX;
549
550	QUnicodeTables::LineBreakClass lcls = QUnicodeTables::LineBreak_LF; // to meet LB10
551	QUnicodeTables::LineBreakClass cls = lcls;
552	for (qsizetype i = `0`; i != len; ++i) {
553	qsizetype pos = i;
554	char32_t ucs4 = string[i];
555	if (QChar::isHighSurrogate(ucs4) && i + `1` != len) {
556	ushort low = string[i + `1`];
557	if (QChar::isLowSurrogate(low)) {
558	ucs4 = QChar::surrogateToUcs4(ucs4, low);
559	++i;
560	}
561	}
562
563	const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
564	QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->lineBreakClass;
565	QUnicodeTables::LineBreakClass tcls;
566
567	if (options & QUnicodeTools::HangulLineBreakTailoring) {
568	if (Q_UNLIKELY((ncls >= QUnicodeTables::LineBreak_H2
569	&& ncls <= QUnicodeTables::LineBreak_JT)
570	\|\| (ucs4 >= `0x3130` && ucs4 <= `0x318F` && ncls == QUnicodeTables::LineBreak_ID))
571	) {
572	// LB27: use SPACE for line breaking
573	// "When Korean uses SPACE for line breaking, the classes in rule LB26,
574	// as well as characters of class ID, are often tailored to AL; see Section 8, Customization."
575	// In case of Korean syllables: "3130..318F HANGUL COMPATIBILITY JAMO"
576	ncls = QUnicodeTables::LineBreak_AL;
577	} else {
578	if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
579	// LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
580	static const int test = FLAG(QChar::Mark_NonSpacing) \| FLAG(QChar::Mark_SpacingCombining);
581	if (FLAG(prop->category) & test)
582	ncls = QUnicodeTables::LineBreak_CM;
583	}
584	if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM)) {
585	// LB10: treat CM that follows SP, BK, CR, LF, NL, or ZW as AL
586	if (lcls == QUnicodeTables::LineBreak_ZW \|\| lcls >= QUnicodeTables::LineBreak_SP)
587	ncls = QUnicodeTables::LineBreak_AL;
588	}
589	}
590	}
591
592	if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
593	// LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
594	static const int test = FLAG(QChar::Mark_NonSpacing) \| FLAG(QChar::Mark_SpacingCombining);
595	if (FLAG(prop->category) & test)
596	ncls = QUnicodeTables::LineBreak_CM;
597	}
598
599	if (Q_UNLIKELY(lcls >= QUnicodeTables::LineBreak_CR)) {
600	// LB4: BK!, LB5: (CRxLF\|CR\|LF\|NL)!
601	if (lcls > QUnicodeTables::LineBreak_CR \|\| ncls != QUnicodeTables::LineBreak_LF)
602	attributes[pos].lineBreak = attributes[pos].mandatoryBreak = true;
603	if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM \|\| ncls == QUnicodeTables::LineBreak_ZWJ)) {
604	cls = QUnicodeTables::LineBreak_AL;
605	goto next_no_cls_update;
606	}
607	goto next;
608	}
609
610	if (Q_UNLIKELY(ncls >= QUnicodeTables::LineBreak_SP)) {
611	if (ncls > QUnicodeTables::LineBreak_SP)
612	goto next; // LB6: x(BK\|CR\|LF\|NL)
613	goto next_no_cls_update; // LB7: xSP
614	}
615
616	if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM \|\| ncls == QUnicodeTables::LineBreak_ZWJ)) {
617	// LB9: treat CM that don't follows SP, BK, CR, LF, NL, or ZW as X
618	if (lcls != QUnicodeTables::LineBreak_ZW && lcls < QUnicodeTables::LineBreak_SP)
619	// don't update anything
620	goto next_no_cls_update;
621	}
622
623	if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_ZWJ)) {
624	// LB8a: ZWJ x (ID \| EB \| EM)
625	if (ncls == QUnicodeTables::LineBreak_ID \|\| ncls == QUnicodeTables::LineBreak_EB \|\| ncls == QUnicodeTables::LineBreak_EM)
626	goto next;
627	}
628
629	// LB25: do not break lines inside numbers
630	{
631	LB::NS::Class necur = LB::NS::toClass(ncls, (QChar::Category)prop->category);
632	switch (LB::NS::actionTable[nelast][necur]) {
633	case LB::NS::Break:
634	// do not change breaks before and after the expression
635	for (qsizetype j = nestart + `1`; j < pos; ++j)
636	attributes[j].lineBreak = false;
637	Q_FALLTHROUGH();
638	case LB::NS::None:
639	nelast = LB::NS::XX; // reset state
640	break;
641	case LB::NS::Start:
642	nestart = i;
643	Q_FALLTHROUGH();
644	default:
645	nelast = necur;
646	break;
647	}
648	}
649
650	if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_RI && lcls == QUnicodeTables::LineBreak_RI)) {
651	// LB30a
652	ncls = QUnicodeTables::LineBreak_SP;
653	goto next;
654	}
655
656	// for South East Asian chars that require a complex analysis, the Unicode
657	// standard recommends to treat them as AL. tailoring that do dictionary analysis can override
658	if (Q_UNLIKELY(cls >= QUnicodeTables::LineBreak_SA))
659	cls = QUnicodeTables::LineBreak_AL;
660
661	tcls = cls;
662	if (tcls == QUnicodeTables::LineBreak_CM)
663	// LB10
664	tcls = QUnicodeTables::LineBreak_AL;
665	switch (LB::breakTable[tcls][ncls < QUnicodeTables::LineBreak_SA ? ncls : QUnicodeTables::LineBreak_AL]) {
666	case LB::DirectBreak:
667	attributes[pos].lineBreak = true;
668	break;
669	case LB::IndirectBreak:
670	if (lcls == QUnicodeTables::LineBreak_SP)
671	attributes[pos].lineBreak = true;
672	break;
673	case LB::CombiningIndirectBreak:
674	if (lcls != QUnicodeTables::LineBreak_SP)
675	goto next_no_cls_update;
676	attributes[pos].lineBreak = true;
677	break;
678	case LB::CombiningProhibitedBreak:
679	if (lcls != QUnicodeTables::LineBreak_SP)
680	goto next_no_cls_update;
681	break;
682	case LB::ProhibitedBreakAfterHebrewPlusHyphen:
683	if (lcls != QUnicodeTables::LineBreak_HL)
684	attributes[pos].lineBreak = true;
685	break;
686	case LB::ProhibitedBreak:
687	// nothing to do
688	default:
689	break;
690	}
691
692	next:
693	cls = ncls;
694	next_no_cls_update:
695	lcls = ncls;
696	}
697
698	if (Q_UNLIKELY(LB::NS::actionTable[nelast][LB::NS::XX] == LB::NS::Break)) {
699	// LB25: do not break lines inside numbers
700	for (qsizetype j = nestart + `1`; j < len; ++j)
701	attributes[j].lineBreak = false;
702	}
703
704	attributes[`0`].lineBreak = attributes[`0`].mandatoryBreak = false; // LB2
705	attributes[len].lineBreak = attributes[len].mandatoryBreak = true; // LB3
706	}
707
708
709	static void getWhiteSpaces(const char16_t string, qsizetype len, QCharAttributes attributes)
710	{
711	for (qsizetype i = `0`; i != len; ++i) {
712	uint ucs4 = string[i];
713	if (QChar::isHighSurrogate(ucs4) && i + `1` != len) {
714	ushort low = string[i + `1`];
715	if (QChar::isLowSurrogate(low)) {
716	ucs4 = QChar::surrogateToUcs4(ucs4, low);
717	++i;
718	}
719	}
720
721	if (Q_UNLIKELY(QChar::isSpace(ucs4)))
722	attributes[i].whiteSpace = true;
723	}
724	}
725
726	namespace Tailored {
727
728	using CharAttributeFunction = void ()(QChar::Script script, const* char16_t text, qsizetype from, qsizetype len, QCharAttributes attributes);
729
730
// Classification of a code unit for the Indic syllable scanner. The values
// are stored as unsigned char in indicForms and cast back to Form by form(),
// so the enumerators must keep fitting in one byte.
enum Form {
    Invalid = 0x0,
    UnknownForm = Invalid, // alias: shares the value of Invalid
    Consonant,
    Nukta,
    Halant,
    Matra,
    VowelMark,
    StressMark,
    IndependentVowel,
    LengthMark,
    Control,
    Other
};
745
746	static const unsigned char indicForms[`0xe00`-`0x900`] = {
747	// Devangari
748	Invalid, VowelMark, VowelMark, VowelMark,
749	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
750	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
751	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
752
753	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
754	IndependentVowel, Consonant, Consonant, Consonant,
755	Consonant, Consonant, Consonant, Consonant,
756	Consonant, Consonant, Consonant, Consonant,
757
758	Consonant, Consonant, Consonant, Consonant,
759	Consonant, Consonant, Consonant, Consonant,
760	Consonant, Consonant, Consonant, Consonant,
761	Consonant, Consonant, Consonant, Consonant,
762
763	Consonant, Consonant, Consonant, Consonant,
764	Consonant, Consonant, Consonant, Consonant,
765	Consonant, Consonant, UnknownForm, UnknownForm,
766	Nukta, Other, Matra, Matra,
767
768	Matra, Matra, Matra, Matra,
769	Matra, Matra, Matra, Matra,
770	Matra, Matra, Matra, Matra,
771	Matra, Halant, UnknownForm, UnknownForm,
772
773	Other, StressMark, StressMark, StressMark,
774	StressMark, UnknownForm, UnknownForm, UnknownForm,
775	Consonant, Consonant, Consonant, Consonant,
776	Consonant, Consonant, Consonant, Consonant,
777
778	IndependentVowel, IndependentVowel, VowelMark, VowelMark,
779	Other, Other, Other, Other,
780	Other, Other, Other, Other,
781	Other, Other, Other, Other,
782
783	Other, Other, Other, Other,
784	Other, Other, Other, Other,
785	Other, Other, Other, Consonant,
786	Consonant, Consonant / ??? /, Consonant, Consonant,
787
788	// Bengali
789	Invalid, VowelMark, VowelMark, VowelMark,
790	Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
791	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
792	IndependentVowel, Invalid, Invalid, IndependentVowel,
793
794	IndependentVowel, Invalid, Invalid, IndependentVowel,
795	IndependentVowel, Consonant, Consonant, Consonant,
796	Consonant, Consonant, Consonant, Consonant,
797	Consonant, Consonant, Consonant, Consonant,
798
799	Consonant, Consonant, Consonant, Consonant,
800	Consonant, Consonant, Consonant, Consonant,
801	Consonant, Invalid, Consonant, Consonant,
802	Consonant, Consonant, Consonant, Consonant,
803
804	Consonant, Invalid, Consonant, Invalid,
805	Invalid, Invalid, Consonant, Consonant,
806	Consonant, Consonant, UnknownForm, UnknownForm,
807	Nukta, Other, Matra, Matra,
808
809	Matra, Matra, Matra, Matra,
810	Matra, Invalid, Invalid, Matra,
811	Matra, Invalid, Invalid, Matra,
812	Matra, Halant, Consonant, UnknownForm,
813
814	Invalid, Invalid, Invalid, Invalid,
815	Invalid, Invalid, Invalid, VowelMark,
816	Invalid, Invalid, Invalid, Invalid,
817	Consonant, Consonant, Invalid, Consonant,
818
819	IndependentVowel, IndependentVowel, VowelMark, VowelMark,
820	Other, Other, Other, Other,
821	Other, Other, Other, Other,
822	Other, Other, Other, Other,
823
824	Consonant, Consonant, Other, Other,
825	Other, Other, Other, Other,
826	Other, Other, Other, Other,
827	Other, Other, Other, Other,
828
829	// Gurmukhi
830	Invalid, VowelMark, VowelMark, VowelMark,
831	Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
832	IndependentVowel, IndependentVowel, IndependentVowel, Invalid,
833	Invalid, Invalid, Invalid, IndependentVowel,
834
835	IndependentVowel, Invalid, Invalid, IndependentVowel,
836	IndependentVowel, Consonant, Consonant, Consonant,
837	Consonant, Consonant, Consonant, Consonant,
838	Consonant, Consonant, Consonant, Consonant,
839
840	Consonant, Consonant, Consonant, Consonant,
841	Consonant, Consonant, Consonant, Consonant,
842	Consonant, Invalid, Consonant, Consonant,
843	Consonant, Consonant, Consonant, Consonant,
844
845	Consonant, Invalid, Consonant, Consonant,
846	Invalid, Consonant, Consonant, Invalid,
847	Consonant, Consonant, UnknownForm, UnknownForm,
848	Nukta, Other, Matra, Matra,
849
850	Matra, Matra, Matra, Invalid,
851	Invalid, Invalid, Invalid, Matra,
852	Matra, Invalid, Invalid, Matra,
853	Matra, Halant, UnknownForm, UnknownForm,
854
855	Invalid, Invalid, Invalid, Invalid,
856	Invalid, UnknownForm, UnknownForm, UnknownForm,
857	Invalid, Consonant, Consonant, Consonant,
858	Consonant, Invalid, Consonant, Invalid,
859
860	Other, Other, Invalid, Invalid,
861	Other, Other, Other, Other,
862	Other, Other, Other, Other,
863	Other, Other, Other, Other,
864
865	StressMark, StressMark, Consonant, Consonant,
866	Other, Other, Other, Other,
867	Other, Other, Other, Other,
868	Other, Other, Other, Other,
869
870	// Gujarati
871	Invalid, VowelMark, VowelMark, VowelMark,
872	Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
873	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
874	IndependentVowel, IndependentVowel, Invalid, IndependentVowel,
875
876	IndependentVowel, IndependentVowel, Invalid, IndependentVowel,
877	IndependentVowel, Consonant, Consonant, Consonant,
878	Consonant, Consonant, Consonant, Consonant,
879	Consonant, Consonant, Consonant, Consonant,
880
881	Consonant, Consonant, Consonant, Consonant,
882	Consonant, Consonant, Consonant, Consonant,
883	Consonant, Invalid, Consonant, Consonant,
884	Consonant, Consonant, Consonant, Consonant,
885
886	Consonant, Invalid, Consonant, Consonant,
887	Invalid, Consonant, Consonant, Consonant,
888	Consonant, Consonant, UnknownForm, UnknownForm,
889	Nukta, Other, Matra, Matra,
890
891	Matra, Matra, Matra, Matra,
892	Matra, Matra, Invalid, Matra,
893	Matra, Matra, Invalid, Matra,
894	Matra, Halant, UnknownForm, UnknownForm,
895
896	Other, UnknownForm, UnknownForm, UnknownForm,
897	UnknownForm, UnknownForm, UnknownForm, UnknownForm,
898	UnknownForm, UnknownForm, UnknownForm, UnknownForm,
899	UnknownForm, UnknownForm, UnknownForm, UnknownForm,
900
901	IndependentVowel, IndependentVowel, VowelMark, VowelMark,
902	Other, Other, Other, Other,
903	Other, Other, Other, Other,
904	Other, Other, Other, Other,
905
906	Other, Other, Other, Other,
907	Other, Other, Other, Other,
908	Other, Other, Other, Other,
909	Other, Other, Other, Other,
910
911	// Oriya
912	Invalid, VowelMark, VowelMark, VowelMark,
913	Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
914	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
915	IndependentVowel, Invalid, Invalid, IndependentVowel,
916
917	IndependentVowel, Invalid, Invalid, IndependentVowel,
918	IndependentVowel, Consonant, Consonant, Consonant,
919	Consonant, Consonant, Consonant, Consonant,
920	Consonant, Consonant, Consonant, Consonant,
921
922	Consonant, Consonant, Consonant, Consonant,
923	Consonant, Consonant, Consonant, Consonant,
924	Consonant, Invalid, Consonant, Consonant,
925	Consonant, Consonant, Consonant, Consonant,
926
927	Consonant, Invalid, Consonant, Consonant,
928	Invalid, Consonant, Consonant, Consonant,
929	Consonant, Consonant, UnknownForm, UnknownForm,
930	Nukta, Other, Matra, Matra,
931
932	Matra, Matra, Matra, Matra,
933	Invalid, Invalid, Invalid, Matra,
934	Matra, Invalid, Invalid, Matra,
935	Matra, Halant, UnknownForm, UnknownForm,
936
937	Other, Invalid, Invalid, Invalid,
938	Invalid, UnknownForm, LengthMark, LengthMark,
939	Invalid, Invalid, Invalid, Invalid,
940	Consonant, Consonant, Invalid, Consonant,
941
942	IndependentVowel, IndependentVowel, Invalid, Invalid,
943	Invalid, Invalid, Other, Other,
944	Other, Other, Other, Other,
945	Other, Other, Other, Other,
946
947	Other, Consonant, Other, Other,
948	Other, Other, Other, Other,
949	Other, Other, Other, Other,
950	Other, Other, Other, Other,
951
952	//Tamil
953	Invalid, Invalid, VowelMark, Other,
954	Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
955	IndependentVowel, IndependentVowel, IndependentVowel, Invalid,
956	Invalid, Invalid, IndependentVowel, IndependentVowel,
957
958	IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
959	IndependentVowel, Consonant, Invalid, Invalid,
960	Invalid, Consonant, Consonant, Invalid,
961	Consonant, Invalid, Consonant, Consonant,
962
963	Invalid, Invalid, Invalid, Consonant,
964	Consonant, Invalid, Invalid, Invalid,
965	Consonant, Consonant, Consonant, Invalid,
966	Invalid, Invalid, Consonant, Consonant,
967
968	Consonant, Consonant, Consonant, Consonant,
969	Consonant, Consonant, Consonant, Consonant,
970	Consonant, Consonant, UnknownForm, UnknownForm,
971	Invalid, Invalid, Matra, Matra,
972
973	Matra, Matra, Matra, Invalid,
974	Invalid, Invalid, Matra, Matra,
975	Matra, Invalid, Matra, Matra,
976	Matra, Halant, Invalid, Invalid,
977
978	Invalid, Invalid, Invalid, Invalid,
979	Invalid, Invalid, Invalid, LengthMark,
980	Invalid, Invalid, Invalid, Invalid,
981	Invalid, Invalid, Invalid, Invalid,
982
983	Invalid, Invalid, Invalid, Invalid,
984	Invalid, Invalid, Other, Other,
985	Other, Other, Other, Other,
986	Other, Other, Other, Other,
987
988	Other, Other, Other, Other,
989	Other, Other, Other, Other,
990	Other, Other, Other, Other,
991	Other, Other, Other, Other,
992
993	// Telugu
994	Invalid, VowelMark, VowelMark, VowelMark,
995	Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
996	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
997	IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
998
999	IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1000	IndependentVowel, Consonant, Consonant, Consonant,
1001	Consonant, Consonant, Consonant, Consonant,
1002	Consonant, Consonant, Consonant, Consonant,
1003
1004	Consonant, Consonant, Consonant, Consonant,
1005	Consonant, Consonant, Consonant, Consonant,
1006	Consonant, Invalid, Consonant, Consonant,
1007	Consonant, Consonant, Consonant, Consonant,
1008
1009	Consonant, Consonant, Consonant, Consonant,
1010	Invalid, Consonant, Consonant, Consonant,
1011	Consonant, Consonant, UnknownForm, UnknownForm,
1012	Invalid, Invalid, Matra, Matra,
1013
1014	Matra, Matra, Matra, Matra,
1015	Matra, Invalid, Matra, Matra,
1016	Matra, Invalid, Matra, Matra,
1017	Matra, Halant, Invalid, Invalid,
1018
1019	Invalid, Invalid, Invalid, Invalid,
1020	Invalid, LengthMark, Matra, Invalid,
1021	Invalid, Invalid, Invalid, Invalid,
1022	Invalid, Invalid, Invalid, Invalid,
1023
1024	IndependentVowel, IndependentVowel, Invalid, Invalid,
1025	Invalid, Invalid, Other, Other,
1026	Other, Other, Other, Other,
1027	Other, Other, Other, Other,
1028
1029	Other, Other, Other, Other,
1030	Other, Other, Other, Other,
1031	Other, Other, Other, Other,
1032	Other, Other, Other, Other,
1033
1034	// Kannada
1035	Invalid, Invalid, VowelMark, VowelMark,
1036	Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1037	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1038	IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1039
1040	IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1041	IndependentVowel, Consonant, Consonant, Consonant,
1042	Consonant, Consonant, Consonant, Consonant,
1043	Consonant, Consonant, Consonant, Consonant,
1044
1045	Consonant, Consonant, Consonant, Consonant,
1046	Consonant, Consonant, Consonant, Consonant,
1047	Consonant, Invalid, Consonant, Consonant,
1048	Consonant, Consonant, Consonant, Consonant,
1049
1050	Consonant, Consonant, Consonant, Consonant,
1051	Invalid, Consonant, Consonant, Consonant,
1052	Consonant, Consonant, UnknownForm, UnknownForm,
1053	Nukta, Other, Matra, Matra,
1054
1055	Matra, Matra, Matra, Matra,
1056	Matra, Invalid, Matra, Matra,
1057	Matra, Invalid, Matra, Matra,
1058	Matra, Halant, Invalid, Invalid,
1059
1060	Invalid, Invalid, Invalid, Invalid,
1061	Invalid, LengthMark, LengthMark, Invalid,
1062	Invalid, Invalid, Invalid, Invalid,
1063	Invalid, Invalid, Consonant, Invalid,
1064
1065	IndependentVowel, IndependentVowel, VowelMark, VowelMark,
1066	Invalid, Invalid, Other, Other,
1067	Other, Other, Other, Other,
1068	Other, Other, Other, Other,
1069
1070	Other, Other, Other, Other,
1071	Other, Other, Other, Other,
1072	Other, Other, Other, Other,
1073	Other, Other, Other, Other,
1074
1075	// Malayalam
1076	Invalid, Invalid, VowelMark, VowelMark,
1077	Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1078	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1079	IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1080
1081	IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1082	IndependentVowel, Consonant, Consonant, Consonant,
1083	Consonant, Consonant, Consonant, Consonant,
1084	Consonant, Consonant, Consonant, Consonant,
1085
1086	Consonant, Consonant, Consonant, Consonant,
1087	Consonant, Consonant, Consonant, Consonant,
1088	Consonant, Invalid, Consonant, Consonant,
1089	Consonant, Consonant, Consonant, Consonant,
1090
1091	Consonant, Consonant, Consonant, Consonant,
1092	Consonant, Consonant, Consonant, Consonant,
1093	Consonant, Consonant, UnknownForm, UnknownForm,
1094	Invalid, Invalid, Matra, Matra,
1095
1096	Matra, Matra, Matra, Matra,
1097	Invalid, Invalid, Matra, Matra,
1098	Matra, Invalid, Matra, Matra,
1099	Matra, Halant, Invalid, Invalid,
1100
1101	Invalid, Invalid, Invalid, Invalid,
1102	Invalid, Invalid, Invalid, Matra,
1103	Invalid, Invalid, Invalid, Invalid,
1104	Invalid, Invalid, Invalid, Invalid,
1105
1106	IndependentVowel, IndependentVowel, Invalid, Invalid,
1107	Invalid, Invalid, Other, Other,
1108	Other, Other, Other, Other,
1109	Other, Other, Other, Other,
1110
1111	Other, Other, Other, Other,
1112	Other, Other, Other, Other,
1113	Other, Other, Other, Other,
1114	Other, Other, Other, Other,
1115
1116	// Sinhala
1117	Invalid, Invalid, VowelMark, VowelMark,
1118	Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1119	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1120	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1121
1122	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1123	IndependentVowel, IndependentVowel, IndependentVowel, Invalid,
1124	Invalid, Invalid, Consonant, Consonant,
1125	Consonant, Consonant, Consonant, Consonant,
1126
1127	Consonant, Consonant, Consonant, Consonant,
1128	Consonant, Consonant, Consonant, Consonant,
1129	Consonant, Consonant, Consonant, Consonant,
1130	Consonant, Consonant, Consonant, Consonant,
1131
1132	Consonant, Consonant, Invalid, Consonant,
1133	Consonant, Consonant, Consonant, Consonant,
1134	Consonant, Consonant, Consonant, Consonant,
1135	Invalid, Consonant, Invalid, Invalid,
1136
1137	Consonant, Consonant, Consonant, Consonant,
1138	Consonant, Consonant, Consonant, Invalid,
1139	Invalid, Invalid, Halant, Invalid,
1140	Invalid, Invalid, Invalid, Matra,
1141
1142	Matra, Matra, Matra, Matra,
1143	Matra, Invalid, Matra, Invalid,
1144	Matra, Matra, Matra, Matra,
1145	Matra, Matra, Matra, Matra,
1146
1147	Invalid, Invalid, Invalid, Invalid,
1148	Invalid, Invalid, Invalid, Invalid,
1149	Invalid, Invalid, Invalid, Invalid,
1150	Invalid, Invalid, Invalid, Invalid,
1151
1152	Invalid, Invalid, Matra, Matra,
1153	Other, Other, Other, Other,
1154	Other, Other, Other, Other,
1155	Other, Other, Other, Other,
1156	};
1157
1158	static inline Form form(unsigned short uc) {
1159	if (uc < `0x900` \|\| uc > `0xdff`) {
1160	if (uc == `0x25cc`)
1161	return Consonant;
1162	if (uc == `0x200c` \|\| uc == `0x200d`)
1163	return Control;
1164	return Other;
1165	}
1166	return (Form)indicForms[uc-`0x900`];
1167	}
1168
// Define INDIC_DEBUG (e.g. by uncommenting the next line) to route IDEBUG()
// calls to qDebug().
// #define INDIC_DEBUG
#if defined(INDIC_DEBUG)
#  define IDEBUG qDebug
#else
// Tracing disabled: the qDebug call sits in the else branch of an
// always-true "if constexpr", so it is never executed.
#  define IDEBUG if constexpr (true) {} else qDebug
#endif
1175
1176	/ syllables are of the form:*
1177
1178	(Consonant Nukta? Halant) Consonant Matra? VowelMark? StressMark?*
1179	(Consonant Nukta? Halant) Consonant Halant*
1180	IndependentVowel VowelMark? StressMark?
1181
1182	We return syllable boundaries on invalid combinations aswell
1183	*/
1184	static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t s, qsizetype start, qsizetype end, bool* *invalid)
1185	{
1186	invalid = false*;
1187	IDEBUG("indic_nextSyllableBoundary: start=%d, end=%d", int(start), int(end));
1188	const char16_t *uc = s+start;
1189
1190	qsizetype pos = `0`;
1191	Form state = form(uc[pos]);
1192	IDEBUG("state[%d]=%d (uc=%4x)", int(pos), state, uc[pos]);
1193	pos++;
1194
1195	if (state != Consonant && state != IndependentVowel) {
1196	if (state != Other)
1197	invalid = true*;
1198	goto finish;
1199	}
1200
1201	while (pos < end - start) {
1202	Form newState = form(uc[pos]);
1203	IDEBUG("state[%d]=%d (uc=%4x)", int(pos), newState, uc[pos]);
1204	switch (newState) {
1205	case Control:
1206	newState = state;
1207	if (state == Halant && uc[pos] == `0x200d` / ZWJ /)
1208	break;
1209	// the control character should be the last char in the item
1210	if (state == Consonant && script == QChar::Script_Bengali && uc[pos-`1`] == `0x09B0` && uc[pos] == `0x200d` / ZWJ /)
1211	break;
1212	if (state == Consonant && script == QChar::Script_Kannada && uc[pos-`1`] == `0x0CB0` && uc[pos] == `0x200d` / ZWJ /)
1213	break;
1214	// Bengali and Kannada has a special exception for rendering yaphala with ra (to avoid reph) see http://www.unicode.org/faq/indic.html#15
1215	++pos;
1216	goto finish;
1217	case Consonant:
1218	if (state == Halant && (script != QChar::Script_Sinhala \|\| uc[pos-`1`] == `0x200d` / ZWJ /))
1219	break;
1220	goto finish;
1221	case Halant:
1222	if (state == Nukta \|\| state == Consonant)
1223	break;
1224	// Bengali has a special exception allowing the combination Vowel_A/E + Halant + Ya
1225	if (script == QChar::Script_Bengali && pos == `1` &&
1226	(uc[`0`] == `0x0985` \|\| uc[`0`] == `0x098f`))
1227	break;
1228	// Sinhala uses the Halant as a component of certain matras. Allow these, but keep the state on Matra.
1229	if (script == QChar::Script_Sinhala && state == Matra) {
1230	++pos;
1231	continue;
1232	}
1233	if (script == QChar::Script_Malayalam && state == Matra && uc[pos-`1`] == `0x0d41`) {
1234	++pos;
1235	continue;
1236	}
1237	goto finish;
1238	case Nukta:
1239	if (state == Consonant)
1240	break;
1241	goto finish;
1242	case StressMark:
1243	if (state == VowelMark)
1244	break;
1245	Q_FALLTHROUGH();
1246	case VowelMark:
1247	if (state == Matra \|\| state == LengthMark \|\| state == IndependentVowel)
1248	break;
1249	Q_FALLTHROUGH();
1250	case Matra:
1251	if (state == Consonant \|\| state == Nukta)
1252	break;
1253	if (state == Matra) {
1254	// ### needs proper testing for correct two/three part matras
1255	break;
1256	}
1257	// ### not sure if this is correct. If it is, does it apply only to Bengali or should
1258	// it work for all Indic languages?
1259	// the combination Independent_A + Vowel Sign AA is allowed.
1260	if (script == QChar::Script_Bengali && uc[pos] == `0x9be` && uc[pos-`1`] == `0x985`)
1261	break;
1262	if (script == QChar::Script_Tamil && state == Matra) {
1263	if (uc[pos-`1`] == `0x0bc6` &&
1264	(uc[pos] == `0xbbe` \|\| uc[pos] == `0xbd7`))
1265	break;
1266	if (uc[pos-`1`] == `0x0bc7` && uc[pos] == `0xbbe`)
1267	break;
1268	}
1269	goto finish;
1270
1271	case LengthMark:
1272	if (state == Matra) {
1273	// ### needs proper testing for correct two/three part matras
1274	break;
1275	}
1276	case IndependentVowel:
1277	case Invalid:
1278	case Other:
1279	goto finish;
1280	}
1281	state = newState;
1282	pos++;
1283	}
1284	finish:
1285	return pos+start;
1286	}
1287
1288	static void indicAttributes(QChar::Script script, const char16_t text, qsizetype from, qsizetype len, QCharAttributes attributes)
1289	{
1290	qsizetype end = from + len;
1291	const char16_t *uc = text + from;
1292	attributes += from;
1293	qsizetype i = `0`;
1294	while (i < len) {
1295	bool invalid;
1296	qsizetype boundary = indic_nextSyllableBoundary(script, text, from+i, end, &invalid) - from;
1297	attributes[i].graphemeBoundary = true;
1298
1299	if (boundary > len-`1`) boundary = len;
1300	i++;
1301	while (i < boundary) {
1302	attributes[i].graphemeBoundary = false;
1303	++uc;
1304	++i;
1305	}
1306	assert(i == boundary);
1307	}
1308
1309
1310	}
1311
// Version number passed to QLibrary::resolve() when looking up the "thai"
// library.
#define LIBTHAI_MAJOR 0

/*
 *  if libthai changed please update these codes too.
 */
struct thcell_t {
    unsigned char base;      /**< base character */
    unsigned char hilo;      /**< upper/lower vowel/diacritic */
    unsigned char top;       /**< top-level mark */
};

// Signatures of the two libthai entry points that are resolved at run time.
using th_brk_def = int (*)(const unsigned char *, int *, size_t);
using th_next_cell_def = size_t (*)(const unsigned char *, size_t, struct thcell_t *, int);

/* libthai related function handles */
static th_brk_def th_brk = nullptr;
static th_next_cell_def th_next_cell = nullptr;
1328
1329	static int init_libthai() {
1330	static bool initialized = false;
1331	if (!initialized && (!th_brk \|\| !th_next_cell)) {
1332	th_brk = (th_brk_def) QLibrary::resolve(QLatin1String ("thai"), (int)LIBTHAI_MAJOR, "th_brk");
1333	th_next_cell = (th_next_cell_def)QLibrary::resolve(QLatin1String ("thai"), LIBTHAI_MAJOR, "th_next_cell");
1334	initialized = true;
1335	}
1336	if (th_brk && th_next_cell)
1337	return `1`;
1338	else
1339	return `0`;
1340	}
1341
1342	static void to_tis620(const char16_t string, qsizetype len, char* *cstr)
1343	{
1344	qsizetype i;
1345	unsigned char result = (unsigned* char *)cstr;
1346
1347	for (i = `0`; i < len; ++i) {
1348	if (string[i] <= `0xa0`)
1349	result[i] = (unsigned char)string[i];
1350	else if (string[i] >= `0xe01` && string[i] <= `0xe5b`)
1351	result[i] = (unsigned char)(string[i] - `0xe00` + `0xa0`);
1352	else
1353	result[i] = (unsigned char)~`0`; // Same encoding as libthai uses for invalid chars
1354	}
1355
1356	result[len] = `0`;
1357	}
1358
1359	/*
1360	* Thai Attributes: computes Word Break, Word Boundary and Char stop for THAI.
1361	*/
1362	static void thaiAssignAttributes(const char16_t string, qsizetype len, QCharAttributes attributes)
1363	{
1364	char s[`128`];
1365	char *cstr = s;
1366	int break_positions = nullptr*;
1367	int brp[`128`];
1368	int brp_size = `0`;
1369	qsizetype numbreaks, i, j, cell_length;
1370	struct thcell_t tis_cell;
1371
1372	if (!init_libthai())
1373	return ;
1374
1375	if (len >= `128`)
1376	cstr = (char )malloc(lensizeof(char) + `1`);
1377
1378	to_tis620(string, len, cstr);
1379
1380	for (i = `0`; i < len; ++i) {
1381	attributes[i].wordBreak = false;
1382	attributes[i].wordStart = false;
1383	attributes[i].wordEnd = false;
1384	attributes[i].lineBreak = false;
1385	}
1386
1387	if (len > `128`) {
1388	break_positions = (int) malloc (sizeof(int) len);
1389	memset (break_positions, `0`, sizeof(int) * len);
1390	brp_size = len;
1391	}
1392	else {
1393	break_positions = brp;
1394	brp_size = `128`;
1395	}
1396
1397	if (break_positions) {
1398	attributes[`0`].wordBreak = true;
1399	attributes[`0`].wordStart = true;
1400	attributes[`0`].wordEnd = false;
1401	numbreaks = th_brk((const unsigned char *)cstr, break_positions, brp_size);
1402	for (i = `0`; i < numbreaks; ++i) {
1403	attributes[break_positions[i]].wordBreak = true;
1404	attributes[break_positions[i]].wordStart = true;
1405	attributes[break_positions[i]].wordEnd = true;
1406	attributes[break_positions[i]].lineBreak = true;
1407	}
1408	if (numbreaks > `0`)
1409	attributes[break_positions[numbreaks - `1`]].wordStart = false;
1410
1411	if (break_positions != brp)
1412	free(break_positions);
1413	}
1414
1415	/ manage grapheme boundaries /
1416	i = `0`;
1417	while (i < len) {
1418	cell_length = (uint)(th_next_cell((const unsigned char )cstr + i, len - i, &tis_cell, true*));
1419
1420	attributes[i].graphemeBoundary = true;
1421	for (j = `1`; j < cell_length; j++)
1422	attributes[i + j].graphemeBoundary = false;
1423
1424	/ Set graphemeBoundary for SARA AM /
1425	if (cstr[i + cell_length - `1`] == (char)`0xd3`)
1426	attributes[i + cell_length - `1`].graphemeBoundary = true;
1427
1428	i += cell_length;
1429	}
1430
1431	if (len >= `128`)
1432	free(cstr);
1433	}
1434
1435	static void thaiAttributes(QChar::Script script, const char16_t text, qsizetype from, qsizetype len, QCharAttributes attributes)
1436	{
1437	assert(script == QChar::Script_Thai);
1438	const char16_t *uc = text + from;
1439	attributes += from;
1440	Q_UNUSED(script);
1441	thaiAssignAttributes(uc, len, attributes);
1442	}
1443
/*
  Tibetan syllables are of the form:
    head position consonant
    first sub-joined consonant
    ....intermediate sub-joined consonants (if any)
    last sub-joined consonant
    sub-joined vowel (a-chung U+0F71)
    standard or compound vowel sign (or 'virama' for Devanagari transliteration)
*/
1453
// Classification of a code unit for the Tibetan syllable scanner. The values
// are stored as unsigned char in the tibetanForm table.
enum TibetanForm {
    TibetanOther,
    TibetanHeadConsonant,
    TibetanSubjoinedConsonant,
    TibetanSubjoinedVowel,
    TibetanVowel
};
1461
1462	/ this table starts at U+0f40 /
1463	static const unsigned char tibetanForm[`0x80`] = {
1464	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1465	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1466	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1467	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1468
1469	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1470	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1471	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1472	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1473
1474	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1475	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1476	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1477	TibetanOther, TibetanOther, TibetanOther, TibetanOther,
1478
1479	TibetanOther, TibetanVowel, TibetanVowel, TibetanVowel,
1480	TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
1481	TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
1482	TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
1483
1484	TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
1485	TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
1486	TibetanOther, TibetanOther, TibetanOther, TibetanOther,
1487	TibetanOther, TibetanOther, TibetanOther, TibetanOther,
1488
1489	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1490	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1491	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1492	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1493
1494	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1495	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1496	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1497	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1498
1499	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1500	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1501	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1502	TibetanSubjoinedConsonant, TibetanOther, TibetanOther, TibetanOther
1503	};
1504
1505	#define tibetan_form(c) \
1506	((c) >= 0x0f40 && (c) < 0x0fc0 ? (TibetanForm)tibetanForm[(c) - 0x0f40] : TibetanOther)
1507
1508	static qsizetype tibetan_nextSyllableBoundary(const char16_t s, qsizetype start, qsizetype end, bool* *invalid)
1509	{
1510	const char16_t *uc = s + start;
1511
1512	qsizetype pos = `0`;
1513	TibetanForm state = tibetan_form(*uc);
1514
1515	/ qDebug("state[%d]=%d (uc=%4x)", pos, state, uc[pos]);/
1516	pos++;
1517
1518	if (state != TibetanHeadConsonant) {
1519	if (state != TibetanOther)
1520	invalid = true*;
1521	goto finish;
1522	}
1523
1524	while (pos < end - start) {
1525	TibetanForm newState = tibetan_form(uc[pos]);
1526	switch (newState) {
1527	case TibetanSubjoinedConsonant:
1528	case TibetanSubjoinedVowel:
1529	if (state != TibetanHeadConsonant &&
1530	state != TibetanSubjoinedConsonant)
1531	goto finish;
1532	state = newState;
1533	break;
1534	case TibetanVowel:
1535	if (state != TibetanHeadConsonant &&
1536	state != TibetanSubjoinedConsonant &&
1537	state != TibetanSubjoinedVowel)
1538	goto finish;
1539	break;
1540	case TibetanOther:
1541	case TibetanHeadConsonant:
1542	goto finish;
1543	}
1544	pos++;
1545	}
1546
1547	finish:
1548	invalid = false*;
1549	return start+pos;
1550	}
1551
1552	static void tibetanAttributes(QChar::Script script, const char16_t text, qsizetype from, qsizetype len, QCharAttributes attributes)
1553	{
1554	qsizetype end = from + len;
1555	const char16_t *uc = text + from;
1556	qsizetype i = `0`;
1557	Q_UNUSED(script);
1558	attributes += from;
1559	while (i < len) {
1560	bool invalid;
1561	qsizetype boundary = tibetan_nextSyllableBoundary(text, from+i, end, &invalid) - from;
1562
1563	attributes[i].graphemeBoundary = true;
1564
1565	if (boundary > len-`1`) boundary = len;
1566	i++;
1567	while (i < boundary) {
1568	attributes[i].graphemeBoundary = false;
1569	++uc;
1570	++i;
1571	}
1572	assert(i == boundary);
1573	}
1574	}
1575
/*
// Myanmar character classes. The numeric values are the column indices of
// mymrStateTable, so the order must not be changed independently of it.
*/
enum MymrCharClassValues {
    Mymr_CC_RESERVED             =  0,
    Mymr_CC_CONSONANT            =  1, /* Consonant of type 1, that has subscript form */
    Mymr_CC_CONSONANT2           =  2, /* Consonant of type 2, that has no subscript form */
    Mymr_CC_NGA                  =  3, /* Consonant NGA */
    Mymr_CC_YA                   =  4, /* Consonant YA */
    Mymr_CC_RA                   =  5, /* Consonant RA */
    Mymr_CC_WA                   =  6, /* Consonant WA */
    Mymr_CC_HA                   =  7, /* Consonant HA */
    Mymr_CC_IND_VOWEL            =  8, /* Independent vowel */
    Mymr_CC_ZERO_WIDTH_NJ_MARK   =  9, /* Zero Width non joiner character (0x200C) */
    Mymr_CC_VIRAMA               = 10, /* Subscript consonant combining character */
    Mymr_CC_PRE_VOWEL            = 11, /* Dependent vowel, before the base (Vowel e) */
    Mymr_CC_BELOW_VOWEL          = 12, /* Dependent vowel, below the base (Vowel u, uu) */
    Mymr_CC_ABOVE_VOWEL          = 13, /* Dependent vowel, above the base (Vowel i, ii, ai) */
    Mymr_CC_POST_VOWEL           = 14, /* Dependent vowel, after the base (Vowel aa) */
    Mymr_CC_SIGN_ABOVE           = 15,
    Mymr_CC_SIGN_BELOW           = 16,
    Mymr_CC_SIGN_AFTER           = 17,
    Mymr_CC_ZERO_WIDTH_J_MARK    = 18, /* Zero width joiner character */
    Mymr_CC_COUNT                = 19  /* This is the number of character classes */
};
1598
/*
// Flag bits that are ORed onto a MymrCharClassValues value. The low 16 bits
// hold the class itself (see Mymr_CF_CLASS_MASK).
*/
enum MymrCharClassFlags {
    Mymr_CF_CLASS_MASK    = 0x0000FFFF,

    Mymr_CF_CONSONANT     = 0x01000000,  /* flag to speed up comparing */
    Mymr_CF_MEDIAL        = 0x02000000,  /* flag to speed up comparing */
    Mymr_CF_IND_VOWEL     = 0x04000000,  /* flag to speed up comparing */
    Mymr_CF_DEP_VOWEL     = 0x08000000,  /* flag to speed up comparing */
    Mymr_CF_DOTTED_CIRCLE = 0x10000000,  /* add a dotted circle if a character with this flag is the
                                            first in a syllable */
    Mymr_CF_VIRAMA        = 0x20000000,  /* flag to speed up comparing */

    /* position flags */
    Mymr_CF_POS_BEFORE    = 0x00080000,
    Mymr_CF_POS_BELOW     = 0x00040000,
    Mymr_CF_POS_ABOVE     = 0x00020000,
    Mymr_CF_POS_AFTER     = 0x00010000,
    Mymr_CF_POS_MASK      = 0x000f0000,

    Mymr_CF_AFTER_KINZI   = 0x00100000
};
1619
/* Characters that get referred to by name */
enum MymrChar
{
    Mymr_C_SIGN_ZWNJ     = 0x200C,
    Mymr_C_SIGN_ZWJ      = 0x200D,
    Mymr_C_DOTTED_CIRCLE = 0x25CC,
    Mymr_C_RA            = 0x101B,
    Mymr_C_YA            = 0x101A,
    Mymr_C_NGA           = 0x1004,
    Mymr_C_VOWEL_E       = 0x1031,
    Mymr_C_VIRAMA        = 0x1039
};
1632
1633	enum
1634	{
1635	Mymr_xx = Mymr_CC_RESERVED,
1636	Mymr_c1 = Mymr_CC_CONSONANT \| Mymr_CF_CONSONANT \| Mymr_CF_POS_BELOW,
1637	Mymr_c2 = Mymr_CC_CONSONANT2 \| Mymr_CF_CONSONANT,
1638	Mymr_ng = Mymr_CC_NGA \| Mymr_CF_CONSONANT \| Mymr_CF_POS_ABOVE,
1639	Mymr_ya = Mymr_CC_YA \| Mymr_CF_CONSONANT \| Mymr_CF_MEDIAL \| Mymr_CF_POS_AFTER \| Mymr_CF_AFTER_KINZI,
1640	Mymr_ra = Mymr_CC_RA \| Mymr_CF_CONSONANT \| Mymr_CF_MEDIAL \| Mymr_CF_POS_BEFORE,
1641	Mymr_wa = Mymr_CC_WA \| Mymr_CF_CONSONANT \| Mymr_CF_MEDIAL \| Mymr_CF_POS_BELOW,
1642	Mymr_ha = Mymr_CC_HA \| Mymr_CF_CONSONANT \| Mymr_CF_MEDIAL \| Mymr_CF_POS_BELOW,
1643	Mymr_id = Mymr_CC_IND_VOWEL \| Mymr_CF_IND_VOWEL,
1644	Mymr_vi = Mymr_CC_VIRAMA \| Mymr_CF_VIRAMA \| Mymr_CF_POS_ABOVE \| Mymr_CF_DOTTED_CIRCLE,
1645	Mymr_dl = Mymr_CC_PRE_VOWEL \| Mymr_CF_DEP_VOWEL \| Mymr_CF_POS_BEFORE \| Mymr_CF_DOTTED_CIRCLE \| Mymr_CF_AFTER_KINZI,
1646	Mymr_db = Mymr_CC_BELOW_VOWEL \| Mymr_CF_DEP_VOWEL \| Mymr_CF_POS_BELOW \| Mymr_CF_DOTTED_CIRCLE \| Mymr_CF_AFTER_KINZI,
1647	Mymr_da = Mymr_CC_ABOVE_VOWEL \| Mymr_CF_DEP_VOWEL \| Mymr_CF_POS_ABOVE \| Mymr_CF_DOTTED_CIRCLE \| Mymr_CF_AFTER_KINZI,
1648	Mymr_dr = Mymr_CC_POST_VOWEL \| Mymr_CF_DEP_VOWEL \| Mymr_CF_POS_AFTER \| Mymr_CF_DOTTED_CIRCLE \| Mymr_CF_AFTER_KINZI,
1649	Mymr_sa = Mymr_CC_SIGN_ABOVE \| Mymr_CF_DOTTED_CIRCLE \| Mymr_CF_POS_ABOVE \| Mymr_CF_AFTER_KINZI,
1650	Mymr_sb = Mymr_CC_SIGN_BELOW \| Mymr_CF_DOTTED_CIRCLE \| Mymr_CF_POS_BELOW \| Mymr_CF_AFTER_KINZI,
1651	Mymr_sp = Mymr_CC_SIGN_AFTER \| Mymr_CF_DOTTED_CIRCLE \| Mymr_CF_AFTER_KINZI
1652	};
1653
1654
1655	typedef int MymrCharClass;
1656
1657
1658	static const MymrCharClass mymrCharClasses[] =
1659	{
1660	Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_ng, Mymr_c1, Mymr_c1, Mymr_c1,
1661	Mymr_c1, Mymr_c1, Mymr_c2, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, / 1000 - 100F /
1662	Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1,
1663	Mymr_c1, Mymr_c1, Mymr_ya, Mymr_ra, Mymr_c1, Mymr_wa, Mymr_c1, Mymr_ha, / 1010 - 101F /
1664	Mymr_c2, Mymr_c2, Mymr_xx, Mymr_id, Mymr_id, Mymr_id, Mymr_id, Mymr_id,
1665	Mymr_xx, Mymr_id, Mymr_id, Mymr_xx, Mymr_dr, Mymr_da, Mymr_da, Mymr_db, / 1020 - 102F /
1666	Mymr_db, Mymr_dl, Mymr_da, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_sa, Mymr_sb,
1667	Mymr_sp, Mymr_vi, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, / 1030 - 103F /
1668	Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx,
1669	Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, / 1040 - 104F /
1670	Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx,
1671	Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, / 1050 - 105F /
1672	};
1673
1674	static MymrCharClass
1675	getMyanmarCharClass (ushort ch)
1676	{
1677	if (ch == Mymr_C_SIGN_ZWJ)
1678	return Mymr_CC_ZERO_WIDTH_J_MARK;
1679
1680	if (ch == Mymr_C_SIGN_ZWNJ)
1681	return Mymr_CC_ZERO_WIDTH_NJ_MARK;
1682
1683	if (ch < `0x1000` \|\| ch > `0x105f`)
1684	return Mymr_CC_RESERVED;
1685
1686	return mymrCharClasses[ch - `0x1000`];
1687	}
1688
1689	static const signed char mymrStateTable[][Mymr_CC_COUNT] =
1690	{
1691	/ xx c1, c2 ng ya ra wa ha id zwnj vi dl db da dr sa sb sp zwj /
1692	{ `1`, `4`, `4`, `2`, `4`, `4`, `4`, `4`, `24`, `1`, `27`, `17`, `18`, `19`, `20`, `21`, `1`, `1`, `4`}, / 0 - ground state /
1693	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 1 - exit state (or sp to the right of the syllable) /
1694	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `3`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, `4`}, / 2 - NGA /
1695	{-`1`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, -`1`, `23`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 3 - Virama after NGA /
1696	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `5`, `17`, `18`, `19`, `20`, `21`, `1`, `1`, -`1`}, / 4 - Base consonant /
1697	{-`2`, `6`, -`2`, -`2`, `7`, `8`, `9`, `10`, -`2`, `23`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`}, / 5 - First virama /
1698	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `25`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, -`1`}, / 6 - c1 after virama /
1699	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `12`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, -`1`}, / 7 - ya after virama /
1700	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `12`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, -`1`}, / 8 - ra after virama /
1701	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `12`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, -`1`}, / 9 - wa after virama /
1702	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, -`1`}, / 10 - ha after virama /
1703	{-`1`, -`1`, -`1`, -`1`, `7`, `8`, `9`, `10`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 11 - Virama after NGA+zwj /
1704	{-`2`, -`2`, -`2`, -`2`, -`2`, -`2`, `13`, `14`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`}, / 12 - Second virama /
1705	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `15`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, -`1`}, / 13 - wa after virama /
1706	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, -`1`}, / 14 - ha after virama /
1707	{-`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, `16`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`}, / 15 - Third virama /
1708	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, -`1`}, / 16 - ha after virama /
1709	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `20`, `21`, `1`, `1`, -`1`}, / 17 - dl, Dependent vowel e /
1710	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `19`, -`1`, `21`, `1`, `1`, -`1`}, / 18 - db, Dependent vowel u,uu /
1711	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `1`, `1`, `1`, -`1`}, / 19 - da, Dependent vowel i,ii,ai /
1712	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `22`, -`1`, -`1`, -`1`, -`1`, -`1`, `1`, `1`, -`1`}, / 20 - dr, Dependent vowel aa /
1713	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `1`, `1`, -`1`}, / 21 - sa, Sign anusvara /
1714	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `23`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 22 - atha /
1715	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `1`, `1`, -`1`}, / 23 - zwnj for atha /
1716	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `1`, -`1`}, / 24 - Independent vowel /
1717	{-`2`, -`2`, -`2`, -`2`, `26`, `26`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`}, / 25 - Virama after subscript consonant /
1718	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `12`, `17`, `18`, `19`, `20`, `21`, -`1`, `1`, -`1`}, / 26 - ra/ya after subscript consonant + virama /
1719	{-`1`, `6`, -`1`, -`1`, `7`, `8`, `9`, `10`, -`1`, `23`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 27 - Virama after ground state /
1720	/ exit state -2 is for invalid order of medials and combination of invalids*
1721	with virama where virama should treat as start of next syllable
1722	*/
1723	};
1724
/*
// Debug output for the Myanmar syllable scanner. Define MYANMAR_DEBUG to route
// MMDEBUG(...) to qDebug; otherwise the call is compiled but never executed
// (its arguments are not evaluated).
// The disabled form uses "while (false)" rather than "if (0)" so that an
// "else" following an MMDEBUG statement cannot bind to the macro's own test.
*/
/* #define MYANMAR_DEBUG */
#ifdef MYANMAR_DEBUG
#  define MMDEBUG qDebug
#else
#  define MMDEBUG \
    while (false) \
        printf
#endif
1733
1734	/*
1735	// Given an input string of characters and a location in which to start looking
1736	// calculate, using the state table, which one is the last character of the syllable
1737	// that starts in the starting position.
1738	*/
1739	static qsizetype myanmar_nextSyllableBoundary(const char16_t s, qsizetype start, qsizetype end, bool* *invalid)
1740	{
1741	const char16_t *uc = s + start;
1742	int state = `0`;
1743	qsizetype pos = start;
1744	invalid = false*;
1745
1746	while (pos < end) {
1747	MymrCharClass charClass = getMyanmarCharClass(*uc);
1748	state = mymrStateTable[state][charClass & Mymr_CF_CLASS_MASK];
1749	if (pos == start)
1750	invalid = (bool*)(charClass & Mymr_CF_DOTTED_CIRCLE);
1751
1752	MMDEBUG("state[%d]=%d class=%8x (uc=%4x)", int(pos - start), state, charClass, *uc);
1753
1754	if (state < `0`) {
1755	if (state < -`1`)
1756	--pos;
1757	break;
1758	}
1759	++uc;
1760	++pos;
1761	}
1762	return pos;
1763	}
1764
1765	static void myanmarAttributes(QChar::Script script, const char16_t text, qsizetype from, qsizetype len, QCharAttributes attributes)
1766	{
1767	qsizetype end = from + len;
1768	const char16_t *uc = text + from;
1769	qsizetype i = `0`;
1770	Q_UNUSED(script);
1771	attributes += from;
1772	while (i < len) {
1773	bool invalid;
1774	qsizetype boundary = myanmar_nextSyllableBoundary(text, from+i, end, &invalid) - from;
1775
1776	attributes[i].graphemeBoundary = true;
1777	attributes[i].lineBreak = true;
1778
1779	if (boundary > len-`1`)
1780	boundary = len;
1781	i++;
1782	while (i < boundary) {
1783	attributes[i].graphemeBoundary = false;
1784	++uc;
1785	++i;
1786	}
1787	assert(i == boundary);
1788	}
1789	}
1790
/*
//  Vocabulary
//     Base ->         A consonant or an independent vowel in its full (not subscript) form. It is the
//                     center of the syllable, it can be surrounded by coeng (subscript) consonants, vowels,
//                     split vowels, signs... but there is only one base in a syllable, it has to be coded as
//                     the first character of the syllable.
//     split vowel --> vowel that has two parts placed separately (e.g. Before and after the consonant).
//                     Khmer language has five of them. Khmer split vowels either have one part before the
//                     base and one after the base or they have a part before the base and a part above the base.
//                     The first part of all Khmer split vowels is the same character, identical to
//                     the glyph of Khmer dependent vowel SRA EI
//     coeng -->  modifier used in Khmer to construct coeng (subscript) consonants
//                Differently than Indian languages, the coeng modifies the consonant that follows it,
//                not the one preceding it. Each consonant has two forms, the base form and the subscript form
//                the base form is the normal one (using the consonants code-point), the subscript form is
//                displayed when the combination coeng + consonant is encountered.
//     Consonant of type 1 -> A consonant which has a subscript form that only occupies space under a base consonant
//     Consonant of type 2 -> Its subscript form occupies space under and before the base (only one, RO)
//     Consonant of Type 3 -> Its subscript form occupies space under and after the base (KHO, CHHO, THHO, BA, YO, SA)
//     Consonant shifter -> Khmer has two series of consonants. The same dependent vowel has different sounds
//                          if it is attached to a consonant of the first series or a consonant of the second series
//                          Most consonants have an equivalent in the other series, but some of them exist only in
//                          one series (for example SA). If we want to use the consonant SA with a vowel sound that
//                          can only be done with a vowel sound that corresponds to a vowel accompanying a consonant
//                          of the other series, then we need to use a consonant shifter: TRIISAP or MUSIKATOAN
//                          (x17C9 and x17CA). TRIISAP changes a first series consonant to second series sound and
//                          MUSIKATOAN a second series consonant to have a first series vowel sound.
//                          Consonant shifters are both normally superscript marks, but, when they are followed by a
//                          superscript, they change shape and take the form of subscript dependent vowel SRA U.
//                          If they are in the same syllable as a coeng consonant, Unicode 3.0 says that they
//                          should be typed before the coeng. Unicode 4.0 breaks the standard and says that it should
//                          be placed after the coeng consonant.
//     Dependent vowel ->   In Khmer dependent vowels can be placed above, below, before or after the base
//                          Each vowel has its own position. Only one vowel per syllable is allowed.
//     Signs            ->  Khmer has above signs and post signs. Only one above sign and/or one post sign are
//                          allowed in a syllable.
//
//
//   order is important here! This order must be the same that is found in each horizontal
//   line in the statetable for Khmer (see khmerStateTable) .
*/
enum KhmerCharClassValues {
    CC_RESERVED             =  0,
    CC_CONSONANT            =  1, /* Consonant of type 1 or independent vowel */
    CC_CONSONANT2           =  2, /* Consonant of type 2 */
    CC_CONSONANT3           =  3, /* Consonant of type 3 */
    CC_ZERO_WIDTH_NJ_MARK   =  4, /* Zero Width non joiner character (0x200C) */
    CC_CONSONANT_SHIFTER    =  5,
    CC_ROBAT                =  6, /* Khmer special diacritic accent -treated differently in state table */
    CC_COENG                =  7, /* Subscript consonant combining character */
    CC_DEPENDENT_VOWEL      =  8,
    CC_SIGN_ABOVE           =  9,
    CC_SIGN_AFTER           = 10,
    CC_ZERO_WIDTH_J_MARK    = 11, /* Zero width joiner character */
    CC_COUNT                = 12  /* This is the number of character classes */
};
1847
1848
/*
// Flag bits that are ORed onto a KhmerCharClassValues value. The low 16 bits
// hold the class itself (see CF_CLASS_MASK).
*/
enum KhmerCharClassFlags {
    CF_CLASS_MASK    = 0x0000FFFF,

    CF_CONSONANT     = 0x01000000,  /* flag to speed up comparing */
    CF_SPLIT_VOWEL   = 0x02000000,  /* flag for a split vowel -> the first part is added in front of the syllable */
    CF_DOTTED_CIRCLE = 0x04000000,  /* add a dotted circle if a character with this flag is the first in a syllable */
    CF_COENG         = 0x08000000,  /* flag to speed up comparing */
    CF_SHIFTER       = 0x10000000,  /* flag to speed up comparing */
    CF_ABOVE_VOWEL   = 0x20000000,  /* flag to speed up comparing */

    /* position flags */
    CF_POS_BEFORE    = 0x00080000,
    CF_POS_BELOW     = 0x00040000,
    CF_POS_ABOVE     = 0x00020000,
    CF_POS_AFTER     = 0x00010000,
    CF_POS_MASK      = 0x000f0000
};
1866
1867
/* Characters that get referred to by name */
enum KhmerChar {
    C_SIGN_ZWNJ     = 0x200C,
    C_SIGN_ZWJ      = 0x200D,
    C_RO            = 0x179A,
    C_VOWEL_AA      = 0x17B6,
    C_SIGN_NIKAHIT  = 0x17C6,
    C_VOWEL_E       = 0x17C1,
    C_COENG         = 0x17D2
};
1878
1879
1880	/*
1881	// simple classes, they are used in the statetable (in this file) to control the length of a syllable
1882	// they are also used to know where a character should be placed (location in reference to the base character)
1883	// and also to know if a character, when independently displayed, should be displayed with a dotted-circle to
1884	// indicate error in syllable construction
1885	*/
1886	enum {
1887	_xx = CC_RESERVED,
1888	_sa = CC_SIGN_ABOVE \| CF_DOTTED_CIRCLE \| CF_POS_ABOVE,
1889	_sp = CC_SIGN_AFTER \| CF_DOTTED_CIRCLE\| CF_POS_AFTER,
1890	_c1 = CC_CONSONANT \| CF_CONSONANT,
1891	_c2 = CC_CONSONANT2 \| CF_CONSONANT,
1892	_c3 = CC_CONSONANT3 \| CF_CONSONANT,
1893	_rb = CC_ROBAT \| CF_POS_ABOVE \| CF_DOTTED_CIRCLE,
1894	_cs = CC_CONSONANT_SHIFTER \| CF_DOTTED_CIRCLE \| CF_SHIFTER,
1895	_dl = CC_DEPENDENT_VOWEL \| CF_POS_BEFORE \| CF_DOTTED_CIRCLE,
1896	_db = CC_DEPENDENT_VOWEL \| CF_POS_BELOW \| CF_DOTTED_CIRCLE,
1897	_da = CC_DEPENDENT_VOWEL \| CF_POS_ABOVE \| CF_DOTTED_CIRCLE \| CF_ABOVE_VOWEL,
1898	_dr = CC_DEPENDENT_VOWEL \| CF_POS_AFTER \| CF_DOTTED_CIRCLE,
1899	_co = CC_COENG \| CF_COENG \| CF_DOTTED_CIRCLE,
1900
1901	/ split vowel /
1902	_va = _da \| CF_SPLIT_VOWEL,
1903	_vr = _dr \| CF_SPLIT_VOWEL
1904	};
1905
1906
1907	/*
1908	// Character class: a character class value
1909	// ORed with character class flags.
1910	*/
1911	typedef unsigned long KhmerCharClass;
1912
1913
1914	/*
1915	// Character class tables
1916	// _xx character does not combine into syllable, such as numbers, puntuation marks, non-Khmer signs...
1917	// _sa Sign placed above the base
1918	// _sp Sign placed after the base
1919	// _c1 Consonant of type 1 or independent vowel (independent vowels behave as type 1 consonants)
1920	// _c2 Consonant of type 2 (only RO)
1921	// _c3 Consonant of type 3
1922	// _rb Khmer sign robat u17CC. combining mark for subscript consonants
1923	// _cd Consonant-shifter
1924	// _dl Dependent vowel placed before the base (left of the base)
1925	// _db Dependent vowel placed below the base
1926	// _da Dependent vowel placed above the base
1927	// _dr Dependent vowel placed behind the base (right of the base)
1928	// _co Khmer combining mark COENG u17D2, combines with the consonant or independent vowel following
1929	// it to create a subscript consonant or independent vowel
1930	// _va Khmer split vowel in which the first part is before the base and the second one above the base
1931	// _vr Khmer split vowel in which the first part is before the base and the second one behind (right of) the base
1932	*/
1933	static const KhmerCharClass khmerCharClasses[] = {
1934	_c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, / 1780 - 178F /
1935	_c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c2, _c1, _c1, _c1, _c3, _c3, / 1790 - 179F /
1936	_c1, _c3, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, / 17A0 - 17AF /
1937	_c1, _c1, _c1, _c1, _dr, _dr, _dr, _da, _da, _da, _da, _db, _db, _db, _va, _vr, / 17B0 - 17BF /
1938	_vr, _dl, _dl, _dl, _vr, _vr, _sa, _sp, _sp, _cs, _cs, _sa, _rb, _sa, _sa, _sa, / 17C0 - 17CF /
1939	_sa, _sa, _co, _sa, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _sa, _xx, _xx / 17D0 - 17DF /
1940	};
1941
/* this enum must reflect the range of khmerCharClasses */
enum KhmerCharClassesRange {
    KhmerFirstChar = 0x1780,
    KhmerLastChar  = 0x17df
};
1947
1948	/*
1949	// Below we define how a character in the input string is either in the khmerCharClasses table
1950	// (in which case we get its type back), a ZWJ or ZWNJ (two characters that may appear
1951	// within the syllable, but are not in the table) we also get their type back, or an unknown object
1952	// in which case we get _xx (CC_RESERVED) back
1953	*/
1954	static KhmerCharClass getKhmerCharClass(ushort uc)
1955	{
1956	if (uc == C_SIGN_ZWJ) {
1957	return CC_ZERO_WIDTH_J_MARK;
1958	}
1959
1960	if (uc == C_SIGN_ZWNJ) {
1961	return CC_ZERO_WIDTH_NJ_MARK;
1962	}
1963
1964	if (uc < KhmerFirstChar \|\| uc > KhmerLastChar) {
1965	return CC_RESERVED;
1966	}
1967
1968	return khmerCharClasses[uc - KhmerFirstChar];
1969	}
1970
1971
1972	/*
1973	// The stateTable is used to calculate the end (the length) of a well
1974	// formed Khmer Syllable.
1975	//
1976	// Each horizontal line is ordered exactly the same way as the values in KhmerClassTable
1977	// CharClassValues. This coincidence of values allows the follow up of the table.
1978	//
1979	// Each line corresponds to a state, which does not necessarily need to be a type
1980	// of component... for example, state 2 is a base, with is always a first character
1981	// in the syllable, but the state could be produced a consonant of any type when
1982	// it is the first character that is analysed (in ground state).
1983	//
1984	// Differentiating 3 types of consonants is necessary in order to
1985	// forbid the use of certain combinations, such as having a second
1986	// coeng after a coeng RO,
1987	// The inexistent possibility of having a type 3 after another type 3 is permitted,
1988	// eliminating it would very much complicate the table, and it does not create typing
1989	// problems, as the case above.
1990	//
1991	// The table is quite complex, in order to limit the number of coeng consonants
1992	// to 2 (by means of the table).
1993	//
1994	// There a peculiarity, as far as Unicode is concerned:
1995	// - The consonant-shifter is considered in two possible different
1996	// locations, the one considered in Unicode 3.0 and the one considered in
1997	// Unicode 4.0. (there is a backwards compatibility problem in this standard).
1998	//
1999	//
2000	// xx independent character, such as a number, punctuation sign or non-khmer char
2001	//
2002	// c1 Khmer consonant of type 1 or an independent vowel
2003	// that is, a letter in which the subscript for is only under the
2004	// base, not taking any space to the right or to the left
2005	//
2006	// c2 Khmer consonant of type 2, the coeng form takes space under
2007	// and to the left of the base (only RO is of this type)
2008	//
2009	// c3 Khmer consonant of type 3. Its subscript form takes space under
2010	// and to the right of the base.
2011	//
2012	// cs Khmer consonant shifter
2013	//
2014	// rb Khmer robat
2015	//
2016	// co coeng character (u17D2)
2017	//
2018	// dv dependent vowel (including split vowels, they are treated in the same way).
2019	// even if dv is not defined above, the component that is really tested for is
2020	// KhmerClassTable::CC_DEPENDENT_VOWEL, which is common to all dependent vowels
2021	//
2022	// zwj Zero Width joiner
2023	//
2024	// zwnj Zero width non joiner
2025	//
2026	// sa above sign
2027	//
2028	// sp post sign
2029	//
2030	// there are lines with equal content but for an easier understanding
2031	// (and maybe change in the future) we did not join them
2032	*/
2033	static const signed char khmerStateTable[][CC_COUNT] =
2034	{
2035	/ xx c1 c2 c3 zwnj cs rb co dv sa sp zwj /
2036	{ `1`, `2`, `2`, `2`, `1`, `1`, `1`, `6`, `1`, `1`, `1`, `2`}, / 0 - ground state /
2037	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 1 - exit state (or sign to the right of the syllable) /
2038	{-`1`, -`1`, -`1`, -`1`, `3`, `4`, `5`, `6`, `16`, `17`, `1`, -`1`}, / 2 - Base consonant /
2039	{-`1`, -`1`, -`1`, -`1`, -`1`, `4`, -`1`, -`1`, `16`, -`1`, -`1`, -`1`}, / 3 - First ZWNJ before a register shifter It can only be followed by a shifter or a vowel /
2040	{-`1`, -`1`, -`1`, -`1`, `15`, -`1`, -`1`, `6`, `16`, `17`, `1`, `14`}, / 4 - First register shifter /
2041	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `20`, -`1`, `1`, -`1`}, / 5 - Robat /
2042	{-`1`, `7`, `8`, `9`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 6 - First Coeng /
2043	{-`1`, -`1`, -`1`, -`1`, `12`, `13`, -`1`, `10`, `16`, `17`, `1`, `14`}, / 7 - First consonant of type 1 after coeng /
2044	{-`1`, -`1`, -`1`, -`1`, `12`, `13`, -`1`, -`1`, `16`, `17`, `1`, `14`}, / 8 - First consonant of type 2 after coeng /
2045	{-`1`, -`1`, -`1`, -`1`, `12`, `13`, -`1`, `10`, `16`, `17`, `1`, `14`}, / 9 - First consonant or type 3 after ceong /
2046	{-`1`, `11`, `11`, `11`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 10 - Second Coeng (no register shifter before) /
2047	{-`1`, -`1`, -`1`, -`1`, `15`, -`1`, -`1`, -`1`, `16`, `17`, `1`, `14`}, / 11 - Second coeng consonant (or ind. vowel) no register shifter before /
2048	{-`1`, -`1`, -`1`, -`1`, -`1`, `13`, -`1`, -`1`, `16`, -`1`, -`1`, -`1`}, / 12 - Second ZWNJ before a register shifter /
2049	{-`1`, -`1`, -`1`, -`1`, `15`, -`1`, -`1`, -`1`, `16`, `17`, `1`, `14`}, / 13 - Second register shifter /
2050	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `16`, -`1`, -`1`, -`1`}, / 14 - ZWJ before vowel /
2051	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `16`, -`1`, -`1`, -`1`}, / 15 - ZWNJ before vowel /
2052	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `17`, `1`, `18`}, / 16 - dependent vowel /
2053	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `1`, `18`}, / 17 - sign above /
2054	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `19`, -`1`, -`1`, -`1`, -`1`}, / 18 - ZWJ after vowel /
2055	{-`1`, `1`, -`1`, `1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 19 - Third coeng /
2056	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `1`, -`1`}, / 20 - dependent vowel after a Robat /
2057	};
2058
2059
/*
// Debug output for the Khmer syllable scanner. Define KHMER_DEBUG to route
// KHDEBUG(...) to qDebug; otherwise the call is compiled but never executed
// (its arguments are not evaluated).
// The disabled form uses "while (false)" rather than "if (0)" so that an
// "else" following a KHDEBUG statement cannot bind to the macro's own test.
*/
/* #define KHMER_DEBUG */
#ifdef KHMER_DEBUG
#  define KHDEBUG qDebug
#else
#  define KHDEBUG \
    while (false) \
        printf
#endif
2068
2069	/*
2070	// Given an input string of characters and a location in which to start looking
2071	// calculate, using the state table, which one is the last character of the syllable
2072	// that starts in the starting position.
2073	*/
2074	static qsizetype khmer_nextSyllableBoundary(const char16_t s, qsizetype start, qsizetype end, bool* *invalid)
2075	{
2076	const char16_t *uc = s + start;
2077	int state = `0`;
2078	qsizetype pos = start;
2079	invalid = false*;
2080
2081	while (pos < end) {
2082	KhmerCharClass charClass = getKhmerCharClass(*uc);
2083	if (pos == start) {
2084	*invalid = (charClass > `0`) && ! (charClass & CF_CONSONANT);
2085	}
2086	state = khmerStateTable[state][charClass & CF_CLASS_MASK];
2087
2088	KHDEBUG("state[%d]=%d class=%8lx (uc=%4x)", int(pos - start), state,
2089	charClass, *uc );
2090
2091	if (state < `0`) {
2092	break;
2093	}
2094	++uc;
2095	++pos;
2096	}
2097	return pos;
2098	}
2099
2100	static void khmerAttributes(QChar::Script script, const char16_t text, qsizetype from, qsizetype len, QCharAttributes attributes)
2101	{
2102	qsizetype end = from + len;
2103	const char16_t *uc = text + from;
2104	qsizetype i = `0`;
2105	Q_UNUSED(script);
2106	attributes += from;
2107	while ( i < len ) {
2108	bool invalid;
2109	qsizetype boundary = khmer_nextSyllableBoundary( text, from+i, end, &invalid ) - from;
2110
2111	attributes[i].graphemeBoundary = true;
2112
2113	if ( boundary > len-`1` ) boundary = len;
2114	i++;
2115	while ( i < boundary ) {
2116	attributes[i].graphemeBoundary = false;
2117	++uc;
2118	++i;
2119	}
2120	assert( i == boundary );
2121	}
2122	}
2123
2124
2125	const CharAttributeFunction charAttributeFunction[] = {
2126	// Script_Unknown,
2127	nullptr,
2128	// Script_Inherited,
2129	nullptr,
2130	// Script_Common,
2131	nullptr,
2132	// Script_Latin,
2133	nullptr,
2134	// Script_Greek,
2135	nullptr,
2136	// Script_Cyrillic,
2137	nullptr,
2138	// Script_Armenian,
2139	nullptr,
2140	// Script_Hebrew,
2141	nullptr,
2142	// Script_Arabic,
2143	nullptr,
2144	// Script_Syriac,
2145	nullptr,
2146	// Script_Thaana,
2147	nullptr,
2148	// Script_Devanagari,
2149	indicAttributes,
2150	// Script_Bengali,
2151	indicAttributes,
2152	// Script_Gurmukhi,
2153	indicAttributes,
2154	// Script_Gujarati,
2155	indicAttributes,
2156	// Script_Oriya,
2157	indicAttributes,
2158	// Script_Tamil,
2159	indicAttributes,
2160	// Script_Telugu,
2161	indicAttributes,
2162	// Script_Kannada,
2163	indicAttributes,
2164	// Script_Malayalam,
2165	indicAttributes,
2166	// Script_Sinhala,
2167	indicAttributes,
2168	// Script_Thai,
2169	thaiAttributes,
2170	// Script_Lao,
2171	nullptr,
2172	// Script_Tibetan,
2173	tibetanAttributes,
2174	// Script_Myanmar,
2175	myanmarAttributes,
2176	// Script_Georgian,
2177	nullptr,
2178	// Script_Hangul,
2179	nullptr,
2180	// Script_Ethiopic,
2181	nullptr,
2182	// Script_Cherokee,
2183	nullptr,
2184	// Script_CanadianAboriginal,
2185	nullptr,
2186	// Script_Ogham,
2187	nullptr,
2188	// Script_Runic,
2189	nullptr,
2190	// Script_Khmer,
2191	khmerAttributes
2192	};
2193
2194	static void getCharAttributes(const char16_t *string, qsizetype stringLength,
2195	const QUnicodeTools::ScriptItem *items, qsizetype numItems,
2196	QCharAttributes *attributes)
2197	{
2198	if (stringLength == `0`)
2199	return;
2200	for (qsizetype i = `0`; i < numItems; ++i) {
2201	QChar::Script script = items[i].script;
2202	if (script > QChar::Script_Khmer)
2203	script = QChar::Script_Common;
2204	CharAttributeFunction attributeFunction = charAttributeFunction[script];
2205	if (!attributeFunction)
2206	continue;
2207	qsizetype end = i < numItems - `1` ? items[i + `1`].position : stringLength;
2208	attributeFunction(script, string, items[i].position, end - items[i].position, attributes);
2209	}
2210	}
2211
2212	}
2213
2214	Q_CORE_EXPORT void initCharAttributes(QStringView string,
2215	const ScriptItem *items, qsizetype numItems,
2216	QCharAttributes *attributes, CharAttributeOptions options)
2217	{
2218	if (string.size() <= `0`)
2219	return;
2220
2221	if (!(options & DontClearAttributes))
2222	::memset(attributes, `0`, (string.size() + `1`) * sizeof(QCharAttributes));
2223
2224	if (options & GraphemeBreaks)
2225	getGraphemeBreaks(string.utf16(), string.size(), attributes);
2226	if (options & WordBreaks)
2227	getWordBreaks(string.utf16(), string.size(), attributes);
2228	if (options & SentenceBreaks)
2229	getSentenceBreaks(string.utf16(), string.size(), attributes);
2230	if (options & LineBreaks)
2231	getLineBreaks(string.utf16(), string.size(), attributes, options);
2232	if (options & WhiteSpaces)
2233	getWhiteSpaces(string.utf16(), string.size(), attributes);
2234
2235	if (!qt_initcharattributes_default_algorithm_only) {
2236	if (!items \|\| numItems <= `0`)
2237	return;
2238
2239	Tailored::getCharAttributes(string.utf16(), string.size(), items, numItems, attributes);
2240	}
2241	}
2242
2243
2244	// ----------------------------------------------------------------------------
2245	//
2246	// The Unicode script property. See http://www.unicode.org/reports/tr24/tr24-24.html
2247	//
2248	// ----------------------------------------------------------------------------
2249
2250	Q_CORE_EXPORT void initScripts(QStringView string, ScriptItemArray *scripts)
2251	{
2252	qsizetype sor = `0`;
2253	qsizetype eor = `0`;
2254	QChar::Script script = QChar::Script_Common;
2255
2256	for (qsizetype i = `0`; i < string.size(); ++i, eor = i) {
2257	char32_t ucs4 = string [i].unicode();
2258	if (QChar::isHighSurrogate(ucs4) && i + `1` < string.size()) {
2259	ushort low = string [i + `1`].unicode();
2260	if (QChar::isLowSurrogate(low)) {
2261	ucs4 = QChar::surrogateToUcs4(ucs4, low);
2262	++i;
2263	}
2264	}
2265
2266	const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
2267
2268	QChar::Script nscript = QChar::Script(prop->script);
2269
2270	if (Q_LIKELY(nscript == script \|\| nscript <= QChar::Script_Common))
2271	continue;
2272
2273	// inherit preceding Common-s
2274	if (Q_UNLIKELY(script <= QChar::Script_Common)) {
2275	// also covers a case where the base character of Common script followed
2276	// by one or more combining marks of non-Inherited, non-Common script
2277	script = nscript;
2278	continue;
2279	}
2280
2281	// Never break between a combining mark (gc= Mc, Mn or Me) and its base character.
2282	// Thus, a combining mark - whatever its script property value is - should inherit
2283	// the script property value of its base character.
2284	static const int test = (FLAG(QChar::Mark_NonSpacing) \| FLAG(QChar::Mark_SpacingCombining) \| FLAG(QChar::Mark_Enclosing));
2285	if (Q_UNLIKELY(FLAG(prop->category) & test))
2286	continue;
2287
2288	Q_ASSERT(script > QChar::Script_Common);
2289	Q_ASSERT(sor < eor);
2290	scripts->append(ScriptItem{sor, script});
2291	sor = eor;
2292
2293	script = nscript;
2294	}
2295
2296	Q_ASSERT(script >= QChar::Script_Common);
2297	Q_ASSERT(eor == string.size());
2298	scripts->append(ScriptItem{sor, script});
2299	}
2300
2301	} // namespace QUnicodeTools
2302
2303	QT_END_NAMESPACE
2304

Browse the source code of Qt/src/corelib/text/qunicodetools.cpp