1/****************************************************************************
2**
3** Copyright (C) 2020 The Qt Company Ltd.
4** Contact: https://www.qt.io/licensing/
5**
6** This file is part of the QtCore module of the Qt Toolkit.
7**
8** $QT_BEGIN_LICENSE:LGPL$
9** Commercial License Usage
10** Licensees holding valid commercial Qt licenses may use this file in
11** accordance with the commercial license agreement provided with the
12** Software or, alternatively, in accordance with the terms contained in
13** a written agreement between you and The Qt Company. For licensing terms
14** and conditions see https://www.qt.io/terms-conditions. For further
15** information use the contact form at https://www.qt.io/contact-us.
16**
17** GNU Lesser General Public License Usage
18** Alternatively, this file may be used under the terms of the GNU Lesser
19** General Public License version 3 as published by the Free Software
20** Foundation and appearing in the file LICENSE.LGPL3 included in the
21** packaging of this file. Please review the following information to
22** ensure the GNU Lesser General Public License version 3 requirements
23** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24**
25** GNU General Public License Usage
26** Alternatively, this file may be used under the terms of the GNU
27** General Public License version 2.0 or (at your option) the GNU General
28** Public license version 3 or any later version approved by the KDE Free
29** Qt Foundation. The licenses are as published by the Free Software
30** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31** included in the packaging of this file. Please review the following
32** information to ensure the GNU General Public License requirements will
33** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34** https://www.gnu.org/licenses/gpl-3.0.html.
35**
36** $QT_END_LICENSE$
37**
38****************************************************************************/
39
40#include "qunicodetools_p.h"
41
42#include "qunicodetables_p.h"
43#include "qvarlengtharray.h"
44#include "qlibrary.h"
45
// Expands to an int bit mask with only bit number x set. Used further down to
// test whether a QChar category value belongs to a small set of categories.
#define FLAG(x) (1 << (x))
47
48QT_BEGIN_NAMESPACE
49
// Flag read by getWordBreaks() in QT_BUILD_INTERNAL builds: when non-zero,
// FULL STOP is classified as MidNumLet and COLON as MidLetter instead of using
// the word break class stored in the Unicode tables.
// NOTE(review): exported through Q_AUTOTEST_EXPORT, presumably so that the
// autotests can toggle it -- confirm against the test suite.
Q_AUTOTEST_EXPORT int qt_initcharattributes_default_algorithm_only = 0;
51
52namespace QUnicodeTools {
53
54// -----------------------------------------------------------------------------------------------------
55//
56// The text boundaries determination algorithm.
57// See http://www.unicode.org/reports/tr29/tr29-31.html
58//
59// -----------------------------------------------------------------------------------------------------
60
61namespace GB {
62
63/*
64 * Most grapheme break rules can be implemented table driven, but rules GB10, GB12 and GB13 need a bit
65 * of special treatment.
66 */
67enum State : uchar {
68 Break,
69 Inside,
70 GB10,
71 GB10_2,
72 GB10_3,
73 GB13, // also covers GB12
74};
75
76static const State breakTable[QUnicodeTables::NumGraphemeBreakClasses][QUnicodeTables::NumGraphemeBreakClasses] = {
77// Any CR LF Control Extend ZWJ RI Prepend S-Mark L V T LV LVT E_B E_M GAZ EBG
78 { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any
79 { Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
80 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
81 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Control
82 { Break , Break , Break , Break , GB10_2, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , GB10_3, Break , Break }, // Extend
83 { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Inside, Inside }, // ZWJ
84 { Break , Break , Break , Break , Inside, Inside, GB13 , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator
85 { Inside, Break , Break , Break , Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside }, // Prepend
86 { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SpacingMark
87 { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Inside, Inside, Break , Inside, Inside, Break , Break , Break , Break }, // L
88 { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Inside, Inside, Break , Break , Break , Break , Break , Break }, // V
89 { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break }, // T
90 { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Inside, Inside, Break , Break , Break , Break , Break , Break }, // LV
91 { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break }, // LVT
92 { Break , Break , Break , Break , GB10 , Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Inside, Break , Break }, // E_B
93 { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // E_M
94 { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // GAZ
95 { Break , Break , Break , Break , GB10 , Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Inside, Break , Break }, // EBG
96};
97
98} // namespace GB
99
100static void getGraphemeBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
101{
102 QUnicodeTables::GraphemeBreakClass lcls = QUnicodeTables::GraphemeBreak_LF; // to meet GB1
103 GB::State state = GB::Break; // only required to track some of the rules
104 for (qsizetype i = 0; i != len; ++i) {
105 qsizetype pos = i;
106 char32_t ucs4 = string[i];
107 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
108 ushort low = string[i + 1];
109 if (QChar::isLowSurrogate(low)) {
110 ucs4 = QChar::surrogateToUcs4(ucs4, low);
111 ++i;
112 }
113 }
114
115 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
116 QUnicodeTables::GraphemeBreakClass cls = (QUnicodeTables::GraphemeBreakClass) prop->graphemeBreakClass;
117
118 switch (GB::breakTable[lcls][cls]) {
119 case GB::Break:
120 attributes[pos].graphemeBoundary = true;
121 state = GB::Break;
122 break;
123 case GB::Inside:
124 state = GB::Break;
125 break;
126 case GB::GB10:
127 state = GB::GB10;
128 break;
129 case GB::GB10_2:
130 if (state == GB::GB10 || state == GB::GB10_2)
131 state = GB::GB10_2;
132 else
133 state = GB::Break;
134 break;
135 case GB::GB10_3:
136 if (state != GB::GB10 && state != GB::GB10_2)
137 attributes[pos].graphemeBoundary = true;
138 state = GB::Break;
139 break;
140 case GB::GB13:
141 if (state != GB::GB13) {
142 state = GB::GB13;
143 } else {
144 attributes[pos].graphemeBoundary = true;
145 state = GB::Break;
146 }
147 }
148
149 lcls = cls;
150 }
151
152 attributes[len].graphemeBoundary = true; // GB2
153}
154
155
156namespace WB {
157
158enum Action {
159 NoBreak,
160 Break,
161 Lookup,
162 LookupW
163};
164
165static const uchar breakTable[QUnicodeTables::NumWordBreakClasses][QUnicodeTables::NumWordBreakClasses] = {
166// Any CR LF Newline Extend ZWJ Format RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtNumLet E_Base E_Mod GAZ EBG WSeg
167 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any
168 { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
169 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
170 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline
171 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend
172 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, NoBreak, Break }, // ZWJ
173 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Format
174 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator
175 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break }, // Katakana
176 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Lookup , LookupW, LookupW, Break , NoBreak, NoBreak, Break , Break , Break , Break , Break }, // HebrewLetter
177 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Break , LookupW, LookupW, Break , NoBreak, NoBreak, Break , Break , Break , Break , Break }, // ALetter
178 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote
179 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote
180 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet
181 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter
182 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum
183 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, Lookup , Break , Lookup , Break , Lookup , NoBreak, NoBreak, Break , Break , Break , Break , Break }, // Numeric
184 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , NoBreak, NoBreak, Break , Break , Break , Break , Break }, // ExtendNumLet
185 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break , Break , Break }, // E_Base
186 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // E_Mod
187 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // GAZ
188 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break , Break , Break }, // EBG
189 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // WSeg
190};
191
192} // namespace WB
193
194static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
195{
196 enum WordType {
197 WordTypeNone, WordTypeAlphaNumeric, WordTypeHiraganaKatakana
198 } currentWordType = WordTypeNone;
199
200 QUnicodeTables::WordBreakClass cls = QUnicodeTables::WordBreak_LF; // to meet WB1
201 for (qsizetype i = 0; i != len; ++i) {
202 qsizetype pos = i;
203 char32_t ucs4 = string[i];
204 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
205 ushort low = string[i + 1];
206 if (QChar::isLowSurrogate(low)) {
207 ucs4 = QChar::surrogateToUcs4(ucs4, low);
208 ++i;
209 }
210 }
211
212 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
213 QUnicodeTables::WordBreakClass ncls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
214#ifdef QT_BUILD_INTERNAL
215 if (qt_initcharattributes_default_algorithm_only) {
216 // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
217 // which caused "hi.there" to be treated like if it were just a single word;
218 // we keep the pre-5.1 behavior by remapping these characters in the Unicode tables generator
219 // and this code is needed to pass the coverage tests; remove once the issue is fixed.
220 if (ucs4 == 0x002E) // FULL STOP
221 ncls = QUnicodeTables::WordBreak_MidNumLet;
222 else if (ucs4 == 0x003A) // COLON
223 ncls = QUnicodeTables::WordBreak_MidLetter;
224 }
225#endif
226
227 uchar action = WB::breakTable[cls][ncls];
228 switch (action) {
229 case WB::Break:
230 break;
231 case WB::NoBreak:
232 if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend || ncls == QUnicodeTables::WordBreak_ZWJ || ncls == QUnicodeTables::WordBreak_Format)) {
233 // WB4: X(Extend|Format)* -> X
234 if (cls != QUnicodeTables::WordBreak_ZWJ) // WB3c
235 continue;
236 }
237 if (Q_UNLIKELY(cls == QUnicodeTables::WordBreak_RegionalIndicator)) {
238 // WB15/WB16: break between pairs of Regional indicator
239 ncls = QUnicodeTables::WordBreak_Any;
240 }
241 break;
242 case WB::Lookup:
243 case WB::LookupW:
244 for (qsizetype lookahead = i + 1; lookahead < len; ++lookahead) {
245 ucs4 = string[lookahead];
246 if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
247 ushort low = string[lookahead + 1];
248 if (QChar::isLowSurrogate(low)) {
249 ucs4 = QChar::surrogateToUcs4(ucs4, low);
250 ++lookahead;
251 }
252 }
253
254 prop = QUnicodeTables::properties(ucs4);
255 QUnicodeTables::WordBreakClass tcls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
256
257 if (Q_UNLIKELY(tcls == QUnicodeTables::WordBreak_Extend || tcls == QUnicodeTables::WordBreak_ZWJ || tcls == QUnicodeTables::WordBreak_Format)) {
258 // WB4: X(Extend|Format)* -> X
259 continue;
260 }
261
262 if (Q_LIKELY(tcls == cls || (action == WB::LookupW && (tcls == QUnicodeTables::WordBreak_HebrewLetter
263 || tcls == QUnicodeTables::WordBreak_ALetter)))) {
264 i = lookahead;
265 ncls = tcls;
266 action = WB::NoBreak;
267 }
268 break;
269 }
270 if (action != WB::NoBreak) {
271 action = WB::Break;
272 if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_SingleQuote && cls == QUnicodeTables::WordBreak_HebrewLetter))
273 action = WB::NoBreak; // WB7a
274 }
275 break;
276 }
277
278 cls = ncls;
279 if (action == WB::Break) {
280 attributes[pos].wordBreak = true;
281 if (currentWordType != WordTypeNone)
282 attributes[pos].wordEnd = true;
283 switch (cls) {
284 case QUnicodeTables::WordBreak_Katakana:
285 currentWordType = WordTypeHiraganaKatakana;
286 attributes[pos].wordStart = true;
287 break;
288 case QUnicodeTables::WordBreak_HebrewLetter:
289 case QUnicodeTables::WordBreak_ALetter:
290 case QUnicodeTables::WordBreak_Numeric:
291 currentWordType = WordTypeAlphaNumeric;
292 attributes[pos].wordStart = true;
293 break;
294 default:
295 currentWordType = WordTypeNone;
296 break;
297 }
298 }
299 }
300
301 if (currentWordType != WordTypeNone)
302 attributes[len].wordEnd = true;
303 attributes[len].wordBreak = true; // WB2
304}
305
306
307namespace SB {
308
309enum State {
310 Initial,
311 Lower,
312 Upper,
313 LUATerm,
314 ATerm,
315 ATermC,
316 ACS,
317 STerm,
318 STermC,
319 SCS,
320 BAfterC,
321 BAfter,
322 Break,
323 Lookup
324};
325
326static const uchar breakTable[BAfter + 1][QUnicodeTables::NumSentenceBreakClasses] = {
327// Any CR LF Sep Extend Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close
328 { Initial, BAfterC, BAfter , BAfter , Initial, Initial, Lower , Upper , Initial, Initial, ATerm , Initial, STerm , Initial }, // Initial
329 { Initial, BAfterC, BAfter , BAfter , Lower , Initial, Initial, Initial, Initial, Initial, LUATerm, Initial, STerm , Initial }, // Lower
330 { Initial, BAfterC, BAfter , BAfter , Upper , Initial, Initial, Upper , Initial, Initial, LUATerm, Initial, STerm , Initial }, // Upper
331
332 { Lookup , BAfterC, BAfter , BAfter , LUATerm, ACS , Initial, Upper , Break , Initial, ATerm , STerm , STerm , ATermC }, // LUATerm
333 { Lookup , BAfterC, BAfter , BAfter , ATerm , ACS , Initial, Break , Break , Initial, ATerm , STerm , STerm , ATermC }, // ATerm
334 { Lookup , BAfterC, BAfter , BAfter , ATermC , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , ATermC }, // ATermC
335 { Lookup , BAfterC, BAfter , BAfter , ACS , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , Lookup }, // ACS
336
337 { Break , BAfterC, BAfter , BAfter , STerm , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STerm,
338 { Break , BAfterC, BAfter , BAfter , STermC , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STermC
339 { Break , BAfterC, BAfter , BAfter , SCS , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , Break }, // SCS
340 { Break , Break , BAfter , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfterC
341 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfter
342};
343
344} // namespace SB
345
346static void getSentenceBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
347{
348 uchar state = SB::BAfter; // to meet SB1
349 for (qsizetype i = 0; i != len; ++i) {
350 qsizetype pos = i;
351 char32_t ucs4 = string[i];
352 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
353 ushort low = string[i + 1];
354 if (QChar::isLowSurrogate(low)) {
355 ucs4 = QChar::surrogateToUcs4(ucs4, low);
356 ++i;
357 }
358 }
359
360 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
361 QUnicodeTables::SentenceBreakClass ncls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
362
363 Q_ASSERT(state <= SB::BAfter);
364 state = SB::breakTable[state][ncls];
365 if (Q_UNLIKELY(state == SB::Lookup)) { // SB8
366 state = SB::Break;
367 for (qsizetype lookahead = i + 1; lookahead < len; ++lookahead) {
368 ucs4 = string[lookahead];
369 if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
370 ushort low = string[lookahead + 1];
371 if (QChar::isLowSurrogate(low)) {
372 ucs4 = QChar::surrogateToUcs4(ucs4, low);
373 ++lookahead;
374 }
375 }
376
377 prop = QUnicodeTables::properties(ucs4);
378 QUnicodeTables::SentenceBreakClass tcls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
379 switch (tcls) {
380 case QUnicodeTables::SentenceBreak_Any:
381 case QUnicodeTables::SentenceBreak_Extend:
382 case QUnicodeTables::SentenceBreak_Sp:
383 case QUnicodeTables::SentenceBreak_Numeric:
384 case QUnicodeTables::SentenceBreak_SContinue:
385 case QUnicodeTables::SentenceBreak_Close:
386 continue;
387 case QUnicodeTables::SentenceBreak_Lower:
388 i = lookahead;
389 state = SB::Initial;
390 break;
391 default:
392 break;
393 }
394 break;
395 }
396 }
397 if (Q_UNLIKELY(state == SB::Break)) {
398 attributes[pos].sentenceBoundary = true;
399 state = SB::breakTable[SB::Initial][ncls];
400 }
401 }
402
403 attributes[len].sentenceBoundary = true; // SB2
404}
405
406
407// -----------------------------------------------------------------------------------------------------
408//
409// The line breaking algorithm.
410// See http://www.unicode.org/reports/tr14/tr14-39.html
411//
412// -----------------------------------------------------------------------------------------------------
413
414namespace LB {
415
416namespace NS { // Number Sequence
417
418// LB25 recommends to not break lines inside numbers of the form
419// described by the following regular expression:
420// (PR|PO)?(OP|HY)?NU(NU|SY|IS)*(CL|CP)?(PR|PO)?
421
422enum Action {
423 None,
424 Start,
425 Continue,
426 Break
427};
428
429enum Class {
430 XX,
431 PRPO,
432 OPHY,
433 NU,
434 SYIS,
435 CLCP
436};
437
438static const uchar actionTable[CLCP + 1][CLCP + 1] = {
439// XX PRPO OPHY NU SYIS CLCP
440 { None , Start , Start , Start , None , None }, // XX
441 { None , Start , Continue, Continue, None , None }, // PRPO
442 { None , Start , Start , Continue, None , None }, // OPHY
443 { Break , Break , Break , Continue, Continue, Continue }, // NU
444 { Break , Break , Break , Continue, Continue, Continue }, // SYIS
445 { Break , Continue, Break , Break , Break , Break }, // CLCP
446};
447
448inline Class toClass(QUnicodeTables::LineBreakClass lbc, QChar::Category category)
449{
450 switch (lbc) {
451 case QUnicodeTables::LineBreak_AL:// case QUnicodeTables::LineBreak_AI:
452 // resolve AI math symbols in numerical context to IS
453 if (category == QChar::Symbol_Math)
454 return SYIS;
455 break;
456 case QUnicodeTables::LineBreak_PR: case QUnicodeTables::LineBreak_PO:
457 return PRPO;
458 case QUnicodeTables::LineBreak_OP: case QUnicodeTables::LineBreak_HY:
459 return OPHY;
460 case QUnicodeTables::LineBreak_NU:
461 return NU;
462 case QUnicodeTables::LineBreak_SY: case QUnicodeTables::LineBreak_IS:
463 return SYIS;
464 case QUnicodeTables::LineBreak_CL: case QUnicodeTables::LineBreak_CP:
465 return CLCP;
466 default:
467 break;
468 }
469 return XX;
470}
471
472} // namespace NS
473
474/* In order to support the tailored implementation of LB25 properly
475 the following changes were made in the pair table to allow breaks
476 where the numeric expression doesn't match the template (i.e. [^NU](IS|SY)NU):
477 (CL)(PO) from IB to DB
478 (CP)(PO) from IB to DB
479 (CL)(PR) from IB to DB
480 (CP)(PR) from IB to DB
481 (PO)(OP) from IB to DB
482 (PR)(OP) from IB to DB
483 (IS)(NU) from IB to DB
484 (SY)(NU) from IB to DB
485*/
486
487/* In order to implementat LB21a properly a special rule HH has been introduced and
488 the following changes were made in the pair table to disallow breaks after Hebrew + Hyphen:
489 (HL)(HY|BA) from IB to CI
490 (HY|BA)(!CB) from DB to HH
491*/
492
493enum Action {
494 ProhibitedBreak, PB = ProhibitedBreak,
495 DirectBreak, DB = DirectBreak,
496 IndirectBreak, IB = IndirectBreak,
497 CombiningIndirectBreak, CI = CombiningIndirectBreak,
498 CombiningProhibitedBreak, CP = CombiningProhibitedBreak,
499 ProhibitedBreakAfterHebrewPlusHyphen, HH = ProhibitedBreakAfterHebrewPlusHyphen
500};
501
502static const uchar breakTable[QUnicodeTables::LineBreak_SA][QUnicodeTables::LineBreak_SA] = {
503/* OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM ZWJ*/
504/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
505/* CL */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
506/* CP */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
507/* QU */ { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
508/* GL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
509/* NS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
510/* EX */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
511/* SY */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
512/* IS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
513/* PR */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB, IB },
514/* PO */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
515/* NU */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
516/* AL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
517/* HL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
518/* ID */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
519/* IN */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
520/* HY */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, HH, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, IB },
521/* BA */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, HH, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, IB },
522/* BB */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB, IB },
523/* B2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
524/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
525/* CM */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
526/* WJ */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
527/* H2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, IB },
528/* H3 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, IB },
529/* JL */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB, IB },
530/* JV */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, IB },
531/* JT */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, IB },
532/* RI */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB, IB },
533/* CB */ { DB, PB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
534/* EB */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB, IB },
535/* EM */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
536/* ZWJ*/ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, IB }
537};
538
539// The following line break classes are not treated by the pair table
540// and must be resolved outside:
541// AI, BK, CB, CJ, CR, LF, NL, SA, SG, SP, XX
542
543} // namespace LB
544
545static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes, QUnicodeTools::CharAttributeOptions options)
546{
547 qsizetype nestart = 0;
548 LB::NS::Class nelast = LB::NS::XX;
549
550 QUnicodeTables::LineBreakClass lcls = QUnicodeTables::LineBreak_LF; // to meet LB10
551 QUnicodeTables::LineBreakClass cls = lcls;
552 for (qsizetype i = 0; i != len; ++i) {
553 qsizetype pos = i;
554 char32_t ucs4 = string[i];
555 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
556 ushort low = string[i + 1];
557 if (QChar::isLowSurrogate(low)) {
558 ucs4 = QChar::surrogateToUcs4(ucs4, low);
559 ++i;
560 }
561 }
562
563 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
564 QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->lineBreakClass;
565 QUnicodeTables::LineBreakClass tcls;
566
567 if (options & QUnicodeTools::HangulLineBreakTailoring) {
568 if (Q_UNLIKELY((ncls >= QUnicodeTables::LineBreak_H2
569 && ncls <= QUnicodeTables::LineBreak_JT)
570 || (ucs4 >= 0x3130 && ucs4 <= 0x318F && ncls == QUnicodeTables::LineBreak_ID))
571 ) {
572 // LB27: use SPACE for line breaking
573 // "When Korean uses SPACE for line breaking, the classes in rule LB26,
574 // as well as characters of class ID, are often tailored to AL; see Section 8, Customization."
575 // In case of Korean syllables: "3130..318F HANGUL COMPATIBILITY JAMO"
576 ncls = QUnicodeTables::LineBreak_AL;
577 } else {
578 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
579 // LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
580 static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
581 if (FLAG(prop->category) & test)
582 ncls = QUnicodeTables::LineBreak_CM;
583 }
584 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM)) {
585 // LB10: treat CM that follows SP, BK, CR, LF, NL, or ZW as AL
586 if (lcls == QUnicodeTables::LineBreak_ZW || lcls >= QUnicodeTables::LineBreak_SP)
587 ncls = QUnicodeTables::LineBreak_AL;
588 }
589 }
590 }
591
592 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
593 // LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
594 static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
595 if (FLAG(prop->category) & test)
596 ncls = QUnicodeTables::LineBreak_CM;
597 }
598
599 if (Q_UNLIKELY(lcls >= QUnicodeTables::LineBreak_CR)) {
600 // LB4: BK!, LB5: (CRxLF|CR|LF|NL)!
601 if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF)
602 attributes[pos].lineBreak = attributes[pos].mandatoryBreak = true;
603 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM || ncls == QUnicodeTables::LineBreak_ZWJ)) {
604 cls = QUnicodeTables::LineBreak_AL;
605 goto next_no_cls_update;
606 }
607 goto next;
608 }
609
610 if (Q_UNLIKELY(ncls >= QUnicodeTables::LineBreak_SP)) {
611 if (ncls > QUnicodeTables::LineBreak_SP)
612 goto next; // LB6: x(BK|CR|LF|NL)
613 goto next_no_cls_update; // LB7: xSP
614 }
615
616 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM || ncls == QUnicodeTables::LineBreak_ZWJ)) {
617 // LB9: treat CM that don't follows SP, BK, CR, LF, NL, or ZW as X
618 if (lcls != QUnicodeTables::LineBreak_ZW && lcls < QUnicodeTables::LineBreak_SP)
619 // don't update anything
620 goto next_no_cls_update;
621 }
622
623 if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_ZWJ)) {
624 // LB8a: ZWJ x (ID | EB | EM)
625 if (ncls == QUnicodeTables::LineBreak_ID || ncls == QUnicodeTables::LineBreak_EB || ncls == QUnicodeTables::LineBreak_EM)
626 goto next;
627 }
628
629 // LB25: do not break lines inside numbers
630 {
631 LB::NS::Class necur = LB::NS::toClass(ncls, (QChar::Category)prop->category);
632 switch (LB::NS::actionTable[nelast][necur]) {
633 case LB::NS::Break:
634 // do not change breaks before and after the expression
635 for (qsizetype j = nestart + 1; j < pos; ++j)
636 attributes[j].lineBreak = false;
637 Q_FALLTHROUGH();
638 case LB::NS::None:
639 nelast = LB::NS::XX; // reset state
640 break;
641 case LB::NS::Start:
642 nestart = i;
643 Q_FALLTHROUGH();
644 default:
645 nelast = necur;
646 break;
647 }
648 }
649
650 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_RI && lcls == QUnicodeTables::LineBreak_RI)) {
651 // LB30a
652 ncls = QUnicodeTables::LineBreak_SP;
653 goto next;
654 }
655
656 // for South East Asian chars that require a complex analysis, the Unicode
657 // standard recommends to treat them as AL. tailoring that do dictionary analysis can override
658 if (Q_UNLIKELY(cls >= QUnicodeTables::LineBreak_SA))
659 cls = QUnicodeTables::LineBreak_AL;
660
661 tcls = cls;
662 if (tcls == QUnicodeTables::LineBreak_CM)
663 // LB10
664 tcls = QUnicodeTables::LineBreak_AL;
665 switch (LB::breakTable[tcls][ncls < QUnicodeTables::LineBreak_SA ? ncls : QUnicodeTables::LineBreak_AL]) {
666 case LB::DirectBreak:
667 attributes[pos].lineBreak = true;
668 break;
669 case LB::IndirectBreak:
670 if (lcls == QUnicodeTables::LineBreak_SP)
671 attributes[pos].lineBreak = true;
672 break;
673 case LB::CombiningIndirectBreak:
674 if (lcls != QUnicodeTables::LineBreak_SP)
675 goto next_no_cls_update;
676 attributes[pos].lineBreak = true;
677 break;
678 case LB::CombiningProhibitedBreak:
679 if (lcls != QUnicodeTables::LineBreak_SP)
680 goto next_no_cls_update;
681 break;
682 case LB::ProhibitedBreakAfterHebrewPlusHyphen:
683 if (lcls != QUnicodeTables::LineBreak_HL)
684 attributes[pos].lineBreak = true;
685 break;
686 case LB::ProhibitedBreak:
687 // nothing to do
688 default:
689 break;
690 }
691
692 next:
693 cls = ncls;
694 next_no_cls_update:
695 lcls = ncls;
696 }
697
698 if (Q_UNLIKELY(LB::NS::actionTable[nelast][LB::NS::XX] == LB::NS::Break)) {
699 // LB25: do not break lines inside numbers
700 for (qsizetype j = nestart + 1; j < len; ++j)
701 attributes[j].lineBreak = false;
702 }
703
704 attributes[0].lineBreak = attributes[0].mandatoryBreak = false; // LB2
705 attributes[len].lineBreak = attributes[len].mandatoryBreak = true; // LB3
706}
707
708
709static void getWhiteSpaces(const char16_t *string, qsizetype len, QCharAttributes *attributes)
710{
711 for (qsizetype i = 0; i != len; ++i) {
712 uint ucs4 = string[i];
713 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
714 ushort low = string[i + 1];
715 if (QChar::isLowSurrogate(low)) {
716 ucs4 = QChar::surrogateToUcs4(ucs4, low);
717 ++i;
718 }
719 }
720
721 if (Q_UNLIKELY(QChar::isSpace(ucs4)))
722 attributes[i].whiteSpace = true;
723 }
724}
725
726namespace Tailored {
727
728using CharAttributeFunction = void (*)(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes);
729
730
// Character classes used by the Indic syllable scanner. The values are stored
// as unsigned char in indicForms[] and returned by form().
enum Form {
    Invalid          = 0,
    UnknownForm      = Invalid, // alias: shares the value 0 with Invalid
    Consonant        = 1,       // also returned by form() for U+25CC
    Nukta            = 2,
    Halant           = 3,
    Matra            = 4,
    VowelMark        = 5,
    StressMark       = 6,
    IndependentVowel = 7,
    LengthMark       = 8,
    Control          = 9,       // returned by form() for U+200C and U+200D
    Other            = 10
};
745
746static const unsigned char indicForms[0xe00-0x900] = {
747 // Devangari
748 Invalid, VowelMark, VowelMark, VowelMark,
749 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
750 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
751 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
752
753 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
754 IndependentVowel, Consonant, Consonant, Consonant,
755 Consonant, Consonant, Consonant, Consonant,
756 Consonant, Consonant, Consonant, Consonant,
757
758 Consonant, Consonant, Consonant, Consonant,
759 Consonant, Consonant, Consonant, Consonant,
760 Consonant, Consonant, Consonant, Consonant,
761 Consonant, Consonant, Consonant, Consonant,
762
763 Consonant, Consonant, Consonant, Consonant,
764 Consonant, Consonant, Consonant, Consonant,
765 Consonant, Consonant, UnknownForm, UnknownForm,
766 Nukta, Other, Matra, Matra,
767
768 Matra, Matra, Matra, Matra,
769 Matra, Matra, Matra, Matra,
770 Matra, Matra, Matra, Matra,
771 Matra, Halant, UnknownForm, UnknownForm,
772
773 Other, StressMark, StressMark, StressMark,
774 StressMark, UnknownForm, UnknownForm, UnknownForm,
775 Consonant, Consonant, Consonant, Consonant,
776 Consonant, Consonant, Consonant, Consonant,
777
778 IndependentVowel, IndependentVowel, VowelMark, VowelMark,
779 Other, Other, Other, Other,
780 Other, Other, Other, Other,
781 Other, Other, Other, Other,
782
783 Other, Other, Other, Other,
784 Other, Other, Other, Other,
785 Other, Other, Other, Consonant,
786 Consonant, Consonant /* ??? */, Consonant, Consonant,
787
788 // Bengali
789 Invalid, VowelMark, VowelMark, VowelMark,
790 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
791 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
792 IndependentVowel, Invalid, Invalid, IndependentVowel,
793
794 IndependentVowel, Invalid, Invalid, IndependentVowel,
795 IndependentVowel, Consonant, Consonant, Consonant,
796 Consonant, Consonant, Consonant, Consonant,
797 Consonant, Consonant, Consonant, Consonant,
798
799 Consonant, Consonant, Consonant, Consonant,
800 Consonant, Consonant, Consonant, Consonant,
801 Consonant, Invalid, Consonant, Consonant,
802 Consonant, Consonant, Consonant, Consonant,
803
804 Consonant, Invalid, Consonant, Invalid,
805 Invalid, Invalid, Consonant, Consonant,
806 Consonant, Consonant, UnknownForm, UnknownForm,
807 Nukta, Other, Matra, Matra,
808
809 Matra, Matra, Matra, Matra,
810 Matra, Invalid, Invalid, Matra,
811 Matra, Invalid, Invalid, Matra,
812 Matra, Halant, Consonant, UnknownForm,
813
814 Invalid, Invalid, Invalid, Invalid,
815 Invalid, Invalid, Invalid, VowelMark,
816 Invalid, Invalid, Invalid, Invalid,
817 Consonant, Consonant, Invalid, Consonant,
818
819 IndependentVowel, IndependentVowel, VowelMark, VowelMark,
820 Other, Other, Other, Other,
821 Other, Other, Other, Other,
822 Other, Other, Other, Other,
823
824 Consonant, Consonant, Other, Other,
825 Other, Other, Other, Other,
826 Other, Other, Other, Other,
827 Other, Other, Other, Other,
828
829 // Gurmukhi
830 Invalid, VowelMark, VowelMark, VowelMark,
831 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
832 IndependentVowel, IndependentVowel, IndependentVowel, Invalid,
833 Invalid, Invalid, Invalid, IndependentVowel,
834
835 IndependentVowel, Invalid, Invalid, IndependentVowel,
836 IndependentVowel, Consonant, Consonant, Consonant,
837 Consonant, Consonant, Consonant, Consonant,
838 Consonant, Consonant, Consonant, Consonant,
839
840 Consonant, Consonant, Consonant, Consonant,
841 Consonant, Consonant, Consonant, Consonant,
842 Consonant, Invalid, Consonant, Consonant,
843 Consonant, Consonant, Consonant, Consonant,
844
845 Consonant, Invalid, Consonant, Consonant,
846 Invalid, Consonant, Consonant, Invalid,
847 Consonant, Consonant, UnknownForm, UnknownForm,
848 Nukta, Other, Matra, Matra,
849
850 Matra, Matra, Matra, Invalid,
851 Invalid, Invalid, Invalid, Matra,
852 Matra, Invalid, Invalid, Matra,
853 Matra, Halant, UnknownForm, UnknownForm,
854
855 Invalid, Invalid, Invalid, Invalid,
856 Invalid, UnknownForm, UnknownForm, UnknownForm,
857 Invalid, Consonant, Consonant, Consonant,
858 Consonant, Invalid, Consonant, Invalid,
859
860 Other, Other, Invalid, Invalid,
861 Other, Other, Other, Other,
862 Other, Other, Other, Other,
863 Other, Other, Other, Other,
864
865 StressMark, StressMark, Consonant, Consonant,
866 Other, Other, Other, Other,
867 Other, Other, Other, Other,
868 Other, Other, Other, Other,
869
870 // Gujarati
871 Invalid, VowelMark, VowelMark, VowelMark,
872 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
873 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
874 IndependentVowel, IndependentVowel, Invalid, IndependentVowel,
875
876 IndependentVowel, IndependentVowel, Invalid, IndependentVowel,
877 IndependentVowel, Consonant, Consonant, Consonant,
878 Consonant, Consonant, Consonant, Consonant,
879 Consonant, Consonant, Consonant, Consonant,
880
881 Consonant, Consonant, Consonant, Consonant,
882 Consonant, Consonant, Consonant, Consonant,
883 Consonant, Invalid, Consonant, Consonant,
884 Consonant, Consonant, Consonant, Consonant,
885
886 Consonant, Invalid, Consonant, Consonant,
887 Invalid, Consonant, Consonant, Consonant,
888 Consonant, Consonant, UnknownForm, UnknownForm,
889 Nukta, Other, Matra, Matra,
890
891 Matra, Matra, Matra, Matra,
892 Matra, Matra, Invalid, Matra,
893 Matra, Matra, Invalid, Matra,
894 Matra, Halant, UnknownForm, UnknownForm,
895
896 Other, UnknownForm, UnknownForm, UnknownForm,
897 UnknownForm, UnknownForm, UnknownForm, UnknownForm,
898 UnknownForm, UnknownForm, UnknownForm, UnknownForm,
899 UnknownForm, UnknownForm, UnknownForm, UnknownForm,
900
901 IndependentVowel, IndependentVowel, VowelMark, VowelMark,
902 Other, Other, Other, Other,
903 Other, Other, Other, Other,
904 Other, Other, Other, Other,
905
906 Other, Other, Other, Other,
907 Other, Other, Other, Other,
908 Other, Other, Other, Other,
909 Other, Other, Other, Other,
910
911 // Oriya
912 Invalid, VowelMark, VowelMark, VowelMark,
913 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
914 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
915 IndependentVowel, Invalid, Invalid, IndependentVowel,
916
917 IndependentVowel, Invalid, Invalid, IndependentVowel,
918 IndependentVowel, Consonant, Consonant, Consonant,
919 Consonant, Consonant, Consonant, Consonant,
920 Consonant, Consonant, Consonant, Consonant,
921
922 Consonant, Consonant, Consonant, Consonant,
923 Consonant, Consonant, Consonant, Consonant,
924 Consonant, Invalid, Consonant, Consonant,
925 Consonant, Consonant, Consonant, Consonant,
926
927 Consonant, Invalid, Consonant, Consonant,
928 Invalid, Consonant, Consonant, Consonant,
929 Consonant, Consonant, UnknownForm, UnknownForm,
930 Nukta, Other, Matra, Matra,
931
932 Matra, Matra, Matra, Matra,
933 Invalid, Invalid, Invalid, Matra,
934 Matra, Invalid, Invalid, Matra,
935 Matra, Halant, UnknownForm, UnknownForm,
936
937 Other, Invalid, Invalid, Invalid,
938 Invalid, UnknownForm, LengthMark, LengthMark,
939 Invalid, Invalid, Invalid, Invalid,
940 Consonant, Consonant, Invalid, Consonant,
941
942 IndependentVowel, IndependentVowel, Invalid, Invalid,
943 Invalid, Invalid, Other, Other,
944 Other, Other, Other, Other,
945 Other, Other, Other, Other,
946
947 Other, Consonant, Other, Other,
948 Other, Other, Other, Other,
949 Other, Other, Other, Other,
950 Other, Other, Other, Other,
951
952 //Tamil
953 Invalid, Invalid, VowelMark, Other,
954 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
955 IndependentVowel, IndependentVowel, IndependentVowel, Invalid,
956 Invalid, Invalid, IndependentVowel, IndependentVowel,
957
958 IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
959 IndependentVowel, Consonant, Invalid, Invalid,
960 Invalid, Consonant, Consonant, Invalid,
961 Consonant, Invalid, Consonant, Consonant,
962
963 Invalid, Invalid, Invalid, Consonant,
964 Consonant, Invalid, Invalid, Invalid,
965 Consonant, Consonant, Consonant, Invalid,
966 Invalid, Invalid, Consonant, Consonant,
967
968 Consonant, Consonant, Consonant, Consonant,
969 Consonant, Consonant, Consonant, Consonant,
970 Consonant, Consonant, UnknownForm, UnknownForm,
971 Invalid, Invalid, Matra, Matra,
972
973 Matra, Matra, Matra, Invalid,
974 Invalid, Invalid, Matra, Matra,
975 Matra, Invalid, Matra, Matra,
976 Matra, Halant, Invalid, Invalid,
977
978 Invalid, Invalid, Invalid, Invalid,
979 Invalid, Invalid, Invalid, LengthMark,
980 Invalid, Invalid, Invalid, Invalid,
981 Invalid, Invalid, Invalid, Invalid,
982
983 Invalid, Invalid, Invalid, Invalid,
984 Invalid, Invalid, Other, Other,
985 Other, Other, Other, Other,
986 Other, Other, Other, Other,
987
988 Other, Other, Other, Other,
989 Other, Other, Other, Other,
990 Other, Other, Other, Other,
991 Other, Other, Other, Other,
992
993 // Telugu
994 Invalid, VowelMark, VowelMark, VowelMark,
995 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
996 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
997 IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
998
999 IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1000 IndependentVowel, Consonant, Consonant, Consonant,
1001 Consonant, Consonant, Consonant, Consonant,
1002 Consonant, Consonant, Consonant, Consonant,
1003
1004 Consonant, Consonant, Consonant, Consonant,
1005 Consonant, Consonant, Consonant, Consonant,
1006 Consonant, Invalid, Consonant, Consonant,
1007 Consonant, Consonant, Consonant, Consonant,
1008
1009 Consonant, Consonant, Consonant, Consonant,
1010 Invalid, Consonant, Consonant, Consonant,
1011 Consonant, Consonant, UnknownForm, UnknownForm,
1012 Invalid, Invalid, Matra, Matra,
1013
1014 Matra, Matra, Matra, Matra,
1015 Matra, Invalid, Matra, Matra,
1016 Matra, Invalid, Matra, Matra,
1017 Matra, Halant, Invalid, Invalid,
1018
1019 Invalid, Invalid, Invalid, Invalid,
1020 Invalid, LengthMark, Matra, Invalid,
1021 Invalid, Invalid, Invalid, Invalid,
1022 Invalid, Invalid, Invalid, Invalid,
1023
1024 IndependentVowel, IndependentVowel, Invalid, Invalid,
1025 Invalid, Invalid, Other, Other,
1026 Other, Other, Other, Other,
1027 Other, Other, Other, Other,
1028
1029 Other, Other, Other, Other,
1030 Other, Other, Other, Other,
1031 Other, Other, Other, Other,
1032 Other, Other, Other, Other,
1033
1034 // Kannada
1035 Invalid, Invalid, VowelMark, VowelMark,
1036 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1037 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1038 IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1039
1040 IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1041 IndependentVowel, Consonant, Consonant, Consonant,
1042 Consonant, Consonant, Consonant, Consonant,
1043 Consonant, Consonant, Consonant, Consonant,
1044
1045 Consonant, Consonant, Consonant, Consonant,
1046 Consonant, Consonant, Consonant, Consonant,
1047 Consonant, Invalid, Consonant, Consonant,
1048 Consonant, Consonant, Consonant, Consonant,
1049
1050 Consonant, Consonant, Consonant, Consonant,
1051 Invalid, Consonant, Consonant, Consonant,
1052 Consonant, Consonant, UnknownForm, UnknownForm,
1053 Nukta, Other, Matra, Matra,
1054
1055 Matra, Matra, Matra, Matra,
1056 Matra, Invalid, Matra, Matra,
1057 Matra, Invalid, Matra, Matra,
1058 Matra, Halant, Invalid, Invalid,
1059
1060 Invalid, Invalid, Invalid, Invalid,
1061 Invalid, LengthMark, LengthMark, Invalid,
1062 Invalid, Invalid, Invalid, Invalid,
1063 Invalid, Invalid, Consonant, Invalid,
1064
1065 IndependentVowel, IndependentVowel, VowelMark, VowelMark,
1066 Invalid, Invalid, Other, Other,
1067 Other, Other, Other, Other,
1068 Other, Other, Other, Other,
1069
1070 Other, Other, Other, Other,
1071 Other, Other, Other, Other,
1072 Other, Other, Other, Other,
1073 Other, Other, Other, Other,
1074
1075 // Malayalam
1076 Invalid, Invalid, VowelMark, VowelMark,
1077 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1078 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1079 IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1080
1081 IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1082 IndependentVowel, Consonant, Consonant, Consonant,
1083 Consonant, Consonant, Consonant, Consonant,
1084 Consonant, Consonant, Consonant, Consonant,
1085
1086 Consonant, Consonant, Consonant, Consonant,
1087 Consonant, Consonant, Consonant, Consonant,
1088 Consonant, Invalid, Consonant, Consonant,
1089 Consonant, Consonant, Consonant, Consonant,
1090
1091 Consonant, Consonant, Consonant, Consonant,
1092 Consonant, Consonant, Consonant, Consonant,
1093 Consonant, Consonant, UnknownForm, UnknownForm,
1094 Invalid, Invalid, Matra, Matra,
1095
1096 Matra, Matra, Matra, Matra,
1097 Invalid, Invalid, Matra, Matra,
1098 Matra, Invalid, Matra, Matra,
1099 Matra, Halant, Invalid, Invalid,
1100
1101 Invalid, Invalid, Invalid, Invalid,
1102 Invalid, Invalid, Invalid, Matra,
1103 Invalid, Invalid, Invalid, Invalid,
1104 Invalid, Invalid, Invalid, Invalid,
1105
1106 IndependentVowel, IndependentVowel, Invalid, Invalid,
1107 Invalid, Invalid, Other, Other,
1108 Other, Other, Other, Other,
1109 Other, Other, Other, Other,
1110
1111 Other, Other, Other, Other,
1112 Other, Other, Other, Other,
1113 Other, Other, Other, Other,
1114 Other, Other, Other, Other,
1115
1116 // Sinhala
1117 Invalid, Invalid, VowelMark, VowelMark,
1118 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1119 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1120 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1121
1122 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1123 IndependentVowel, IndependentVowel, IndependentVowel, Invalid,
1124 Invalid, Invalid, Consonant, Consonant,
1125 Consonant, Consonant, Consonant, Consonant,
1126
1127 Consonant, Consonant, Consonant, Consonant,
1128 Consonant, Consonant, Consonant, Consonant,
1129 Consonant, Consonant, Consonant, Consonant,
1130 Consonant, Consonant, Consonant, Consonant,
1131
1132 Consonant, Consonant, Invalid, Consonant,
1133 Consonant, Consonant, Consonant, Consonant,
1134 Consonant, Consonant, Consonant, Consonant,
1135 Invalid, Consonant, Invalid, Invalid,
1136
1137 Consonant, Consonant, Consonant, Consonant,
1138 Consonant, Consonant, Consonant, Invalid,
1139 Invalid, Invalid, Halant, Invalid,
1140 Invalid, Invalid, Invalid, Matra,
1141
1142 Matra, Matra, Matra, Matra,
1143 Matra, Invalid, Matra, Invalid,
1144 Matra, Matra, Matra, Matra,
1145 Matra, Matra, Matra, Matra,
1146
1147 Invalid, Invalid, Invalid, Invalid,
1148 Invalid, Invalid, Invalid, Invalid,
1149 Invalid, Invalid, Invalid, Invalid,
1150 Invalid, Invalid, Invalid, Invalid,
1151
1152 Invalid, Invalid, Matra, Matra,
1153 Other, Other, Other, Other,
1154 Other, Other, Other, Other,
1155 Other, Other, Other, Other,
1156};
1157
1158static inline Form form(unsigned short uc) {
1159 if (uc < 0x900 || uc > 0xdff) {
1160 if (uc == 0x25cc)
1161 return Consonant;
1162 if (uc == 0x200c || uc == 0x200d)
1163 return Control;
1164 return Other;
1165 }
1166 return (Form)indicForms[uc-0x900];
1167}
1168
// Uncomment the following line to route IDEBUG(...) to qDebug.
// #define INDIC_DEBUG
#ifndef INDIC_DEBUG
// Tracing disabled: the call is kept in an else-branch that is never taken,
// so its arguments still have to compile but are never evaluated.
#  define IDEBUG if constexpr (1) ; else qDebug
#else
#  define IDEBUG qDebug
#endif
1175
1176/* syllables are of the form:
1177
1178 (Consonant Nukta? Halant)* Consonant Matra? VowelMark? StressMark?
1179 (Consonant Nukta? Halant)* Consonant Halant
1180 IndependentVowel VowelMark? StressMark?
1181
1182 We return syllable boundaries on invalid combinations aswell
1183*/
1184static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1185{
1186 *invalid = false;
1187 IDEBUG("indic_nextSyllableBoundary: start=%d, end=%d", int(start), int(end));
1188 const char16_t *uc = s+start;
1189
1190 qsizetype pos = 0;
1191 Form state = form(uc[pos]);
1192 IDEBUG("state[%d]=%d (uc=%4x)", int(pos), state, uc[pos]);
1193 pos++;
1194
1195 if (state != Consonant && state != IndependentVowel) {
1196 if (state != Other)
1197 *invalid = true;
1198 goto finish;
1199 }
1200
1201 while (pos < end - start) {
1202 Form newState = form(uc[pos]);
1203 IDEBUG("state[%d]=%d (uc=%4x)", int(pos), newState, uc[pos]);
1204 switch (newState) {
1205 case Control:
1206 newState = state;
1207 if (state == Halant && uc[pos] == 0x200d /* ZWJ */)
1208 break;
1209 // the control character should be the last char in the item
1210 if (state == Consonant && script == QChar::Script_Bengali && uc[pos-1] == 0x09B0 && uc[pos] == 0x200d /* ZWJ */)
1211 break;
1212 if (state == Consonant && script == QChar::Script_Kannada && uc[pos-1] == 0x0CB0 && uc[pos] == 0x200d /* ZWJ */)
1213 break;
1214 // Bengali and Kannada has a special exception for rendering yaphala with ra (to avoid reph) see http://www.unicode.org/faq/indic.html#15
1215 ++pos;
1216 goto finish;
1217 case Consonant:
1218 if (state == Halant && (script != QChar::Script_Sinhala || uc[pos-1] == 0x200d /* ZWJ */))
1219 break;
1220 goto finish;
1221 case Halant:
1222 if (state == Nukta || state == Consonant)
1223 break;
1224 // Bengali has a special exception allowing the combination Vowel_A/E + Halant + Ya
1225 if (script == QChar::Script_Bengali && pos == 1 &&
1226 (uc[0] == 0x0985 || uc[0] == 0x098f))
1227 break;
1228 // Sinhala uses the Halant as a component of certain matras. Allow these, but keep the state on Matra.
1229 if (script == QChar::Script_Sinhala && state == Matra) {
1230 ++pos;
1231 continue;
1232 }
1233 if (script == QChar::Script_Malayalam && state == Matra && uc[pos-1] == 0x0d41) {
1234 ++pos;
1235 continue;
1236 }
1237 goto finish;
1238 case Nukta:
1239 if (state == Consonant)
1240 break;
1241 goto finish;
1242 case StressMark:
1243 if (state == VowelMark)
1244 break;
1245 Q_FALLTHROUGH();
1246 case VowelMark:
1247 if (state == Matra || state == LengthMark || state == IndependentVowel)
1248 break;
1249 Q_FALLTHROUGH();
1250 case Matra:
1251 if (state == Consonant || state == Nukta)
1252 break;
1253 if (state == Matra) {
1254 // ### needs proper testing for correct two/three part matras
1255 break;
1256 }
1257 // ### not sure if this is correct. If it is, does it apply only to Bengali or should
1258 // it work for all Indic languages?
1259 // the combination Independent_A + Vowel Sign AA is allowed.
1260 if (script == QChar::Script_Bengali && uc[pos] == 0x9be && uc[pos-1] == 0x985)
1261 break;
1262 if (script == QChar::Script_Tamil && state == Matra) {
1263 if (uc[pos-1] == 0x0bc6 &&
1264 (uc[pos] == 0xbbe || uc[pos] == 0xbd7))
1265 break;
1266 if (uc[pos-1] == 0x0bc7 && uc[pos] == 0xbbe)
1267 break;
1268 }
1269 goto finish;
1270
1271 case LengthMark:
1272 if (state == Matra) {
1273 // ### needs proper testing for correct two/three part matras
1274 break;
1275 }
1276 case IndependentVowel:
1277 case Invalid:
1278 case Other:
1279 goto finish;
1280 }
1281 state = newState;
1282 pos++;
1283 }
1284 finish:
1285 return pos+start;
1286}
1287
1288static void indicAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1289{
1290 qsizetype end = from + len;
1291 const char16_t *uc = text + from;
1292 attributes += from;
1293 qsizetype i = 0;
1294 while (i < len) {
1295 bool invalid;
1296 qsizetype boundary = indic_nextSyllableBoundary(script, text, from+i, end, &invalid) - from;
1297 attributes[i].graphemeBoundary = true;
1298
1299 if (boundary > len-1) boundary = len;
1300 i++;
1301 while (i < boundary) {
1302 attributes[i].graphemeBoundary = false;
1303 ++uc;
1304 ++i;
1305 }
1306 assert(i == boundary);
1307 }
1308
1309
1310}
1311
// Version number handed to QLibrary::resolve() when the libthai symbols are
// looked up in init_libthai().
#define LIBTHAI_MAJOR 0

/*
 * The declarations below mirror libthai; if libthai changes, please update
 * them too.
 */
struct thcell_t {
    unsigned char base; // base character
    unsigned char hilo; // upper/lower vowel/diacritic
    unsigned char top;  // top-level mark
};

// Signatures of the two libthai entry points that are resolved at run time.
using th_brk_def = int (*)(const unsigned char *, int *, size_t);
using th_next_cell_def = size_t (*)(const unsigned char *, size_t, thcell_t *, int);

/* libthai related function handles; they stay null until init_libthai() resolves them */
static th_brk_def th_brk = nullptr;
static th_next_cell_def th_next_cell = nullptr;
1328
1329static int init_libthai() {
1330 static bool initialized = false;
1331 if (!initialized && (!th_brk || !th_next_cell)) {
1332 th_brk = (th_brk_def) QLibrary::resolve(QLatin1String("thai"), (int)LIBTHAI_MAJOR, "th_brk");
1333 th_next_cell = (th_next_cell_def)QLibrary::resolve(QLatin1String("thai"), LIBTHAI_MAJOR, "th_next_cell");
1334 initialized = true;
1335 }
1336 if (th_brk && th_next_cell)
1337 return 1;
1338 else
1339 return 0;
1340}
1341
1342static void to_tis620(const char16_t *string, qsizetype len, char *cstr)
1343{
1344 qsizetype i;
1345 unsigned char *result = (unsigned char *)cstr;
1346
1347 for (i = 0; i < len; ++i) {
1348 if (string[i] <= 0xa0)
1349 result[i] = (unsigned char)string[i];
1350 else if (string[i] >= 0xe01 && string[i] <= 0xe5b)
1351 result[i] = (unsigned char)(string[i] - 0xe00 + 0xa0);
1352 else
1353 result[i] = (unsigned char)~0; // Same encoding as libthai uses for invalid chars
1354 }
1355
1356 result[len] = 0;
1357}
1358
1359/*
1360 * Thai Attributes: computes Word Break, Word Boundary and Char stop for THAI.
1361 */
1362static void thaiAssignAttributes(const char16_t *string, qsizetype len, QCharAttributes *attributes)
1363{
1364 char s[128];
1365 char *cstr = s;
1366 int *break_positions = nullptr;
1367 int brp[128];
1368 int brp_size = 0;
1369 qsizetype numbreaks, i, j, cell_length;
1370 struct thcell_t tis_cell;
1371
1372 if (!init_libthai())
1373 return ;
1374
1375 if (len >= 128)
1376 cstr = (char *)malloc(len*sizeof(char) + 1);
1377
1378 to_tis620(string, len, cstr);
1379
1380 for (i = 0; i < len; ++i) {
1381 attributes[i].wordBreak = false;
1382 attributes[i].wordStart = false;
1383 attributes[i].wordEnd = false;
1384 attributes[i].lineBreak = false;
1385 }
1386
1387 if (len > 128) {
1388 break_positions = (int*) malloc (sizeof(int) * len);
1389 memset (break_positions, 0, sizeof(int) * len);
1390 brp_size = len;
1391 }
1392 else {
1393 break_positions = brp;
1394 brp_size = 128;
1395 }
1396
1397 if (break_positions) {
1398 attributes[0].wordBreak = true;
1399 attributes[0].wordStart = true;
1400 attributes[0].wordEnd = false;
1401 numbreaks = th_brk((const unsigned char *)cstr, break_positions, brp_size);
1402 for (i = 0; i < numbreaks; ++i) {
1403 attributes[break_positions[i]].wordBreak = true;
1404 attributes[break_positions[i]].wordStart = true;
1405 attributes[break_positions[i]].wordEnd = true;
1406 attributes[break_positions[i]].lineBreak = true;
1407 }
1408 if (numbreaks > 0)
1409 attributes[break_positions[numbreaks - 1]].wordStart = false;
1410
1411 if (break_positions != brp)
1412 free(break_positions);
1413 }
1414
1415 /* manage grapheme boundaries */
1416 i = 0;
1417 while (i < len) {
1418 cell_length = (uint)(th_next_cell((const unsigned char *)cstr + i, len - i, &tis_cell, true));
1419
1420 attributes[i].graphemeBoundary = true;
1421 for (j = 1; j < cell_length; j++)
1422 attributes[i + j].graphemeBoundary = false;
1423
1424 /* Set graphemeBoundary for SARA AM */
1425 if (cstr[i + cell_length - 1] == (char)0xd3)
1426 attributes[i + cell_length - 1].graphemeBoundary = true;
1427
1428 i += cell_length;
1429 }
1430
1431 if (len >= 128)
1432 free(cstr);
1433}
1434
1435static void thaiAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1436{
1437 assert(script == QChar::Script_Thai);
1438 const char16_t *uc = text + from;
1439 attributes += from;
1440 Q_UNUSED(script);
1441 thaiAssignAttributes(uc, len, attributes);
1442}
1443
/*
    Tibetan syllables are of the form:
        head position consonant
        first sub-joined consonant
        ....intermediate sub-joined consonants (if any)
        last sub-joined consonant
        sub-joined vowel (a-chung U+0F71)
        standard or compound vowel sign (or 'virama' for devanagari transliteration)

    TibetanForm names the classes the syllable scanner distinguishes; the
    values are stored as unsigned char in tibetanForm[].
*/
enum TibetanForm {
    TibetanOther              = 0,
    TibetanHeadConsonant      = 1,
    TibetanSubjoinedConsonant = 2,
    TibetanSubjoinedVowel     = 3,
    TibetanVowel              = 4
};
1461
1462/* this table starts at U+0f40 */
1463static const unsigned char tibetanForm[0x80] = {
1464 TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1465 TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1466 TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1467 TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1468
1469 TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1470 TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1471 TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1472 TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1473
1474 TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1475 TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1476 TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1477 TibetanOther, TibetanOther, TibetanOther, TibetanOther,
1478
1479 TibetanOther, TibetanVowel, TibetanVowel, TibetanVowel,
1480 TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
1481 TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
1482 TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
1483
1484 TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
1485 TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
1486 TibetanOther, TibetanOther, TibetanOther, TibetanOther,
1487 TibetanOther, TibetanOther, TibetanOther, TibetanOther,
1488
1489 TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1490 TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1491 TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1492 TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1493
1494 TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1495 TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1496 TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1497 TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1498
1499 TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1500 TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1501 TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1502 TibetanSubjoinedConsonant, TibetanOther, TibetanOther, TibetanOther
1503};
1504
// Maps a code unit to its TibetanForm: everything outside U+0F40..U+0FBF is
// TibetanOther, the rest is looked up in tibetanForm[]. The argument is
// evaluated more than once, so it must not have side effects.
#define tibetan_form(c) \
    (((c) < 0x0f40 || (c) >= 0x0fc0) ? TibetanOther : (TibetanForm)tibetanForm[(c) - 0x0f40])
1507
1508static qsizetype tibetan_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1509{
1510 const char16_t *uc = s + start;
1511
1512 qsizetype pos = 0;
1513 TibetanForm state = tibetan_form(*uc);
1514
1515/* qDebug("state[%d]=%d (uc=%4x)", pos, state, uc[pos]);*/
1516 pos++;
1517
1518 if (state != TibetanHeadConsonant) {
1519 if (state != TibetanOther)
1520 *invalid = true;
1521 goto finish;
1522 }
1523
1524 while (pos < end - start) {
1525 TibetanForm newState = tibetan_form(uc[pos]);
1526 switch (newState) {
1527 case TibetanSubjoinedConsonant:
1528 case TibetanSubjoinedVowel:
1529 if (state != TibetanHeadConsonant &&
1530 state != TibetanSubjoinedConsonant)
1531 goto finish;
1532 state = newState;
1533 break;
1534 case TibetanVowel:
1535 if (state != TibetanHeadConsonant &&
1536 state != TibetanSubjoinedConsonant &&
1537 state != TibetanSubjoinedVowel)
1538 goto finish;
1539 break;
1540 case TibetanOther:
1541 case TibetanHeadConsonant:
1542 goto finish;
1543 }
1544 pos++;
1545 }
1546
1547finish:
1548 *invalid = false;
1549 return start+pos;
1550}
1551
1552static void tibetanAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1553{
1554 qsizetype end = from + len;
1555 const char16_t *uc = text + from;
1556 qsizetype i = 0;
1557 Q_UNUSED(script);
1558 attributes += from;
1559 while (i < len) {
1560 bool invalid;
1561 qsizetype boundary = tibetan_nextSyllableBoundary(text, from+i, end, &invalid) - from;
1562
1563 attributes[i].graphemeBoundary = true;
1564
1565 if (boundary > len-1) boundary = len;
1566 i++;
1567 while (i < boundary) {
1568 attributes[i].graphemeBoundary = false;
1569 ++uc;
1570 ++i;
1571 }
1572 assert(i == boundary);
1573 }
1574}
1575
/*
// Myanmar character classes. The numeric value of a class is used as the
// column index into mymrStateTable, so the order here must match the column
// order of that table.
*/
enum MymrCharClassValues {
    Mymr_CC_RESERVED = 0,
    Mymr_CC_CONSONANT = 1, /* Consonant of type 1, that has subscript form */
    Mymr_CC_CONSONANT2 = 2, /* Consonant of type 2, that has no subscript form */
    Mymr_CC_NGA = 3, /* Consonant NGA */
    Mymr_CC_YA = 4, /* Consonant YA */
    Mymr_CC_RA = 5, /* Consonant RA */
    Mymr_CC_WA = 6, /* Consonant WA */
    Mymr_CC_HA = 7, /* Consonant HA */
    Mymr_CC_IND_VOWEL = 8, /* Independent vowel */
    Mymr_CC_ZERO_WIDTH_NJ_MARK = 9, /* Zero width non joiner character (0x200C) */
    Mymr_CC_VIRAMA = 10, /* Subscript consonant combining character */
    Mymr_CC_PRE_VOWEL = 11, /* Dependent vowel, before the base (Vowel e) */
    Mymr_CC_BELOW_VOWEL = 12, /* Dependent vowel, below the base (Vowel u, uu) */
    Mymr_CC_ABOVE_VOWEL = 13, /* Dependent vowel, above the base (Vowel i, ii, ai) */
    Mymr_CC_POST_VOWEL = 14, /* Dependent vowel, after the base (Vowel aa) */
    Mymr_CC_SIGN_ABOVE = 15,
    Mymr_CC_SIGN_BELOW = 16,
    Mymr_CC_SIGN_AFTER = 17,
    Mymr_CC_ZERO_WIDTH_J_MARK = 18, /* Zero width joiner character (0x200D) */
    Mymr_CC_COUNT = 19 /* This is the number of character classes */
};
1598
/*
// Flag bits that are ORed onto a MymrCharClassValues value to form a
// MymrCharClass. The class value itself occupies the bits selected by
// Mymr_CF_CLASS_MASK.
*/
enum MymrCharClassFlags {
    Mymr_CF_CLASS_MASK = 0x0000FFFF, /* extracts the MymrCharClassValues part */

    Mymr_CF_CONSONANT = 0x01000000, /* flag to speed up comparing */
    Mymr_CF_MEDIAL = 0x02000000, /* flag to speed up comparing */
    Mymr_CF_IND_VOWEL = 0x04000000, /* flag to speed up comparing */
    Mymr_CF_DEP_VOWEL = 0x08000000, /* flag to speed up comparing */
    Mymr_CF_DOTTED_CIRCLE = 0x10000000, /* add a dotted circle if a character with this flag is the
                                           first in a syllable */
    Mymr_CF_VIRAMA = 0x20000000, /* flag to speed up comparing */

    /* position flags */
    Mymr_CF_POS_BEFORE = 0x00080000,
    Mymr_CF_POS_BELOW = 0x00040000,
    Mymr_CF_POS_ABOVE = 0x00020000,
    Mymr_CF_POS_AFTER = 0x00010000,
    Mymr_CF_POS_MASK = 0x000f0000, /* union of the four position flags */

    Mymr_CF_AFTER_KINZI = 0x00100000
};
1619
/* Characters that get referred to by name */
enum MymrChar
{
    Mymr_C_SIGN_ZWNJ = 0x200C, /* zero width non-joiner */
    Mymr_C_SIGN_ZWJ = 0x200D, /* zero width joiner */
    Mymr_C_DOTTED_CIRCLE = 0x25CC,
    Mymr_C_RA = 0x101B,
    Mymr_C_YA = 0x101A,
    Mymr_C_NGA = 0x1004,
    Mymr_C_VOWEL_E = 0x1031,
    Mymr_C_VIRAMA = 0x1039
};
1632
/*
// Short names for the complete character classes (class value ORed with its
// flags). These are the entries of the mymrCharClasses table.
*/
enum
{
    Mymr_xx = Mymr_CC_RESERVED,
    Mymr_c1 = Mymr_CC_CONSONANT | Mymr_CF_CONSONANT | Mymr_CF_POS_BELOW,
    Mymr_c2 = Mymr_CC_CONSONANT2 | Mymr_CF_CONSONANT,
    Mymr_ng = Mymr_CC_NGA | Mymr_CF_CONSONANT | Mymr_CF_POS_ABOVE,
    Mymr_ya = Mymr_CC_YA | Mymr_CF_CONSONANT | Mymr_CF_MEDIAL | Mymr_CF_POS_AFTER | Mymr_CF_AFTER_KINZI,
    Mymr_ra = Mymr_CC_RA | Mymr_CF_CONSONANT | Mymr_CF_MEDIAL | Mymr_CF_POS_BEFORE,
    Mymr_wa = Mymr_CC_WA | Mymr_CF_CONSONANT | Mymr_CF_MEDIAL | Mymr_CF_POS_BELOW,
    Mymr_ha = Mymr_CC_HA | Mymr_CF_CONSONANT | Mymr_CF_MEDIAL | Mymr_CF_POS_BELOW,
    Mymr_id = Mymr_CC_IND_VOWEL | Mymr_CF_IND_VOWEL,
    Mymr_vi = Mymr_CC_VIRAMA | Mymr_CF_VIRAMA | Mymr_CF_POS_ABOVE | Mymr_CF_DOTTED_CIRCLE,
    Mymr_dl = Mymr_CC_PRE_VOWEL | Mymr_CF_DEP_VOWEL | Mymr_CF_POS_BEFORE | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI,
    Mymr_db = Mymr_CC_BELOW_VOWEL | Mymr_CF_DEP_VOWEL | Mymr_CF_POS_BELOW | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI,
    Mymr_da = Mymr_CC_ABOVE_VOWEL | Mymr_CF_DEP_VOWEL | Mymr_CF_POS_ABOVE | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI,
    Mymr_dr = Mymr_CC_POST_VOWEL | Mymr_CF_DEP_VOWEL | Mymr_CF_POS_AFTER | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI,
    Mymr_sa = Mymr_CC_SIGN_ABOVE | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_POS_ABOVE | Mymr_CF_AFTER_KINZI,
    Mymr_sb = Mymr_CC_SIGN_BELOW | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_POS_BELOW | Mymr_CF_AFTER_KINZI,
    Mymr_sp = Mymr_CC_SIGN_AFTER | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI
};
1653
1654
/* A Myanmar character class: a MymrCharClassValues value, optionally ORed
   with MymrCharClassFlags bits. */
using MymrCharClass = int;
1656
1657
/*
// Character classes for the code points 0x1000 - 0x105F, indexed by
// (code point - 0x1000); see getMyanmarCharClass().
*/
static const MymrCharClass mymrCharClasses[] =
{
    Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_ng, Mymr_c1, Mymr_c1, Mymr_c1,
    Mymr_c1, Mymr_c1, Mymr_c2, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, /* 1000 - 100F */
    Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1,
    Mymr_c1, Mymr_c1, Mymr_ya, Mymr_ra, Mymr_c1, Mymr_wa, Mymr_c1, Mymr_ha, /* 1010 - 101F */
    Mymr_c2, Mymr_c2, Mymr_xx, Mymr_id, Mymr_id, Mymr_id, Mymr_id, Mymr_id,
    Mymr_xx, Mymr_id, Mymr_id, Mymr_xx, Mymr_dr, Mymr_da, Mymr_da, Mymr_db, /* 1020 - 102F */
    Mymr_db, Mymr_dl, Mymr_da, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_sa, Mymr_sb,
    Mymr_sp, Mymr_vi, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, /* 1030 - 103F */
    Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx,
    Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, /* 1040 - 104F */
    Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx,
    Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, /* 1050 - 105F */
};
1673
1674static MymrCharClass
1675getMyanmarCharClass (ushort ch)
1676{
1677 if (ch == Mymr_C_SIGN_ZWJ)
1678 return Mymr_CC_ZERO_WIDTH_J_MARK;
1679
1680 if (ch == Mymr_C_SIGN_ZWNJ)
1681 return Mymr_CC_ZERO_WIDTH_NJ_MARK;
1682
1683 if (ch < 0x1000 || ch > 0x105f)
1684 return Mymr_CC_RESERVED;
1685
1686 return mymrCharClasses[ch - 0x1000];
1687}
1688
/*
// State machine used by myanmar_nextSyllableBoundary(). Each row is a state,
// each column a MymrCharClassValues value; the entry is the next state.
// A negative entry ends the syllable: -1 stops at the current character,
// -2 additionally steps back one character (see the note at the end of the
// table).
*/
static const signed char mymrStateTable[][Mymr_CC_COUNT] =
{
/*   xx  c1, c2  ng  ya  ra  wa  ha  id zwnj vi  dl  db  da  dr  sa  sb  sp zwj */
    { 1, 4, 4, 2, 4, 4, 4, 4, 24, 1, 27, 17, 18, 19, 20, 21, 1, 1, 4}, /* 0 - ground state */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sp to the right of the syllable) */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 17, 18, 19, 20, 21, -1, -1, 4}, /* 2 - NGA */
    {-1, 4, 4, 4, 4, 4, 4, 4, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 3 - Virama after NGA */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 17, 18, 19, 20, 21, 1, 1, -1}, /* 4 - Base consonant */
    {-2, 6, -2, -2, 7, 8, 9, 10, -2, 23, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 5 - First virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 25, 17, 18, 19, 20, 21, -1, -1, -1}, /* 6 - c1 after virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 7 - ya after virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 8 - ra after virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 9 - wa after virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 10 - ha after virama */
    {-1, -1, -1, -1, 7, 8, 9, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 11 - Virama after NGA+zwj */
    {-2, -2, -2, -2, -2, -2, 13, 14, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 12 - Second virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 17, 18, 19, 20, 21, -1, -1, -1}, /* 13 - wa after virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 14 - ha after virama */
    {-2, -2, -2, -2, -2, -2, -2, 16, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 15 - Third virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 16 - ha after virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 20, 21, 1, 1, -1}, /* 17 - dl, Dependent vowel e */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 19, -1, 21, 1, 1, -1}, /* 18 - db, Dependent vowel u,uu */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1}, /* 19 - da, Dependent vowel i,ii,ai */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 22, -1, -1, -1, -1, -1, 1, 1, -1}, /* 20 - dr, Dependent vowel aa */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 21 - sa, Sign anusvara */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 22 - atha */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 23 - zwnj for atha */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 24 - Independent vowel */
    {-2, -2, -2, -2, 26, 26, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 25 - Virama after subscript consonant */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, 1, -1}, /* 26 - ra/ya after subscript consonant + virama */
    {-1, 6, -1, -1, 7, 8, 9, 10, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 27 - Virama after ground state */
/* exit state -2 is for invalid order of medials and combination of invalids
   with virama, where the virama should be treated as the start of the next syllable
 */
};
1724
/* Define MYANMAR_DEBUG to trace the Myanmar state machine through qDebug();
   otherwise MMDEBUG(...) expands to a printf call that is never executed. */
/* #define MYANMAR_DEBUG */
#if !defined(MYANMAR_DEBUG)
#  define MMDEBUG if (0) printf
#else
#  define MMDEBUG qDebug
#endif
1733
1734/*
1735// Given an input string of characters and a location in which to start looking
1736// calculate, using the state table, which one is the last character of the syllable
1737// that starts in the starting position.
1738*/
1739static qsizetype myanmar_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1740{
1741 const char16_t *uc = s + start;
1742 int state = 0;
1743 qsizetype pos = start;
1744 *invalid = false;
1745
1746 while (pos < end) {
1747 MymrCharClass charClass = getMyanmarCharClass(*uc);
1748 state = mymrStateTable[state][charClass & Mymr_CF_CLASS_MASK];
1749 if (pos == start)
1750 *invalid = (bool)(charClass & Mymr_CF_DOTTED_CIRCLE);
1751
1752 MMDEBUG("state[%d]=%d class=%8x (uc=%4x)", int(pos - start), state, charClass, *uc);
1753
1754 if (state < 0) {
1755 if (state < -1)
1756 --pos;
1757 break;
1758 }
1759 ++uc;
1760 ++pos;
1761 }
1762 return pos;
1763}
1764
1765static void myanmarAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1766{
1767 qsizetype end = from + len;
1768 const char16_t *uc = text + from;
1769 qsizetype i = 0;
1770 Q_UNUSED(script);
1771 attributes += from;
1772 while (i < len) {
1773 bool invalid;
1774 qsizetype boundary = myanmar_nextSyllableBoundary(text, from+i, end, &invalid) - from;
1775
1776 attributes[i].graphemeBoundary = true;
1777 attributes[i].lineBreak = true;
1778
1779 if (boundary > len-1)
1780 boundary = len;
1781 i++;
1782 while (i < boundary) {
1783 attributes[i].graphemeBoundary = false;
1784 ++uc;
1785 ++i;
1786 }
1787 assert(i == boundary);
1788 }
1789}
1790
1791/*
1792// Vocabulary
1793// Base -> A consonant or an independent vowel in its full (not subscript) form. It is the
1794// center of the syllable, it can be surrounded by coeng (subscript) consonants, vowels,
1795// split vowels, signs... but there is only one base in a syllable, it has to be coded as
1796// the first character of the syllable.
1797// split vowel --> vowel that has two parts placed separately (e.g. Before and after the consonant).
1798// Khmer language has five of them. Khmer split vowels either have one part before the
1799// base and one after the base or they have a part before the base and a part above the base.
1800// The first part of all Khmer split vowels is the same character, identical to
1801// the glyph of Khmer dependent vowel SRA EI
1802// coeng --> modifier used in Khmer to construct coeng (subscript) consonants
1803// Differently than indian languages, the coeng modifies the consonant that follows it,
1804// not the one preceding it Each consonant has two forms, the base form and the subscript form
1805// the base form is the normal one (using the consonants code-point), the subscript form is
1806// displayed when the combination coeng + consonant is encountered.
//     Consonant of type 1 -> A consonant which has a subscript form that only occupies space under a base consonant
1808// Consonant of type 2.-> Its subscript form occupies space under and before the base (only one, RO)
1809// Consonant of Type 3 -> Its subscript form occupies space under and after the base (KHO, CHHO, THHO, BA, YO, SA)
//     Consonant shifter -> Khmer has two series of consonants. The same dependent vowel has different sounds
//         if it is attached to a consonant of the first series or a consonant of the second series
//         Most consonants have an equivalent in the other series, but some of them exist only in
//         one series (for example SA). If we want to use the consonant SA with a vowel sound that
//         can only be done with a vowel sound that corresponds to a vowel accompanying a consonant
//         of the other series, then we need to use a consonant shifter: TRIISAP or MUSIKATOAN
//         x17C9 and x17CA. TRIISAP changes a first series consonant to second series sound and
//         MUSIKATOAN a second series consonant to have a first series vowel sound.
//         Consonant shifters are both normally superscript marks, but, when they are followed by a
//         superscript, they change shape and take the form of subscript dependent vowel SRA U.
1820// If they are in the same syllable as a coeng consonant, Unicode 3.0 says that they
1821// should be typed before the coeng. Unicode 4.0 breaks the standard and says that it should
1822// be placed after the coeng consonant.
1823// Dependent vowel -> In khmer dependent vowels can be placed above, below, before or after the base
1824// Each vowel has its own position. Only one vowel per syllable is allowed.
1825// Signs -> Khmer has above signs and post signs. Only one above sign and/or one post sign are
1826// Allowed in a syllable.
1827//
1828//
1829// order is important here! This order must be the same that is found in each horizontal
1830// line in the statetable for Khmer (see khmerStateTable) .
1831*/
/* Khmer character classes; the numeric value is the column index into khmerStateTable. */
enum KhmerCharClassValues {
    CC_RESERVED = 0,
    CC_CONSONANT = 1, /* Consonant of type 1 or independent vowel */
    CC_CONSONANT2 = 2, /* Consonant of type 2 */
    CC_CONSONANT3 = 3, /* Consonant of type 3 */
    CC_ZERO_WIDTH_NJ_MARK = 4, /* Zero Width non joiner character (0x200C) */
    CC_CONSONANT_SHIFTER = 5,
    CC_ROBAT = 6, /* Khmer special diacritic accent -treated differently in state table */
    CC_COENG = 7, /* Subscript consonant combining character */
    CC_DEPENDENT_VOWEL = 8,
    CC_SIGN_ABOVE = 9,
    CC_SIGN_AFTER = 10,
    CC_ZERO_WIDTH_J_MARK = 11, /* Zero width joiner character (0x200D) */
    CC_COUNT = 12 /* This is the number of character classes */
};
1847
1848
/*
// Flag bits that are ORed onto a KhmerCharClassValues value to form a
// KhmerCharClass. The class value itself occupies the bits selected by
// CF_CLASS_MASK.
*/
enum KhmerCharClassFlags {
    CF_CLASS_MASK = 0x0000FFFF, /* extracts the KhmerCharClassValues part */

    CF_CONSONANT = 0x01000000, /* flag to speed up comparing */
    CF_SPLIT_VOWEL = 0x02000000, /* flag for a split vowel -> the first part is added in front of the syllable */
    CF_DOTTED_CIRCLE = 0x04000000, /* add a dotted circle if a character with this flag is the first in a syllable */
    CF_COENG = 0x08000000, /* flag to speed up comparing */
    CF_SHIFTER = 0x10000000, /* flag to speed up comparing */
    CF_ABOVE_VOWEL = 0x20000000, /* flag to speed up comparing */

    /* position flags */
    CF_POS_BEFORE = 0x00080000,
    CF_POS_BELOW = 0x00040000,
    CF_POS_ABOVE = 0x00020000,
    CF_POS_AFTER = 0x00010000,
    CF_POS_MASK = 0x000f0000 /* union of the four position flags */
};
1866
1867
/* Characters that get referred to by name */
enum KhmerChar {
    C_SIGN_ZWNJ = 0x200C, /* zero width non-joiner */
    C_SIGN_ZWJ = 0x200D, /* zero width joiner */
    C_RO = 0x179A, /* the only type 2 consonant (_c2) in khmerCharClasses */
    C_VOWEL_AA = 0x17B6,
    C_SIGN_NIKAHIT = 0x17C6,
    C_VOWEL_E = 0x17C1,
    C_COENG = 0x17D2 /* classified as _co in khmerCharClasses */
};
1878
1879
/*
// Simple classes: a KhmerCharClassValues value ORed with its flags. They are
// used in the state table (in this file) to control the length of a syllable,
// they are also used to know where a character should be placed (location in
// reference to the base character), and also to know if a character, when
// independently displayed, should be displayed with a dotted circle to
// indicate an error in syllable construction.
*/
enum {
    _xx = CC_RESERVED,
    _sa = CC_SIGN_ABOVE | CF_DOTTED_CIRCLE | CF_POS_ABOVE,
    _sp = CC_SIGN_AFTER | CF_DOTTED_CIRCLE| CF_POS_AFTER,
    _c1 = CC_CONSONANT | CF_CONSONANT,
    _c2 = CC_CONSONANT2 | CF_CONSONANT,
    _c3 = CC_CONSONANT3 | CF_CONSONANT,
    _rb = CC_ROBAT | CF_POS_ABOVE | CF_DOTTED_CIRCLE,
    _cs = CC_CONSONANT_SHIFTER | CF_DOTTED_CIRCLE | CF_SHIFTER,
    _dl = CC_DEPENDENT_VOWEL | CF_POS_BEFORE | CF_DOTTED_CIRCLE,
    _db = CC_DEPENDENT_VOWEL | CF_POS_BELOW | CF_DOTTED_CIRCLE,
    _da = CC_DEPENDENT_VOWEL | CF_POS_ABOVE | CF_DOTTED_CIRCLE | CF_ABOVE_VOWEL,
    _dr = CC_DEPENDENT_VOWEL | CF_POS_AFTER | CF_DOTTED_CIRCLE,
    _co = CC_COENG | CF_COENG | CF_DOTTED_CIRCLE,

    /* split vowels: the plain dependent vowel class plus CF_SPLIT_VOWEL */
    _va = _da | CF_SPLIT_VOWEL,
    _vr = _dr | CF_SPLIT_VOWEL
};
1905
1906
/*
// A Khmer character class: a KhmerCharClassValues value, optionally ORed
// with KhmerCharClassFlags bits.
*/
using KhmerCharClass = unsigned long;
1912
1913
/*
// Character class table, indexed by (code point - KhmerFirstChar); see
// getKhmerCharClass().
// _xx character does not combine into syllable, such as numbers, punctuation marks, non-Khmer signs...
// _sa Sign placed above the base
// _sp Sign placed after the base
// _c1 Consonant of type 1 or independent vowel (independent vowels behave as type 1 consonants)
// _c2 Consonant of type 2 (only RO)
// _c3 Consonant of type 3
// _rb Khmer sign robat u17CC. combining mark for subscript consonants
// _cs Consonant-shifter
// _dl Dependent vowel placed before the base (left of the base)
// _db Dependent vowel placed below the base
// _da Dependent vowel placed above the base
// _dr Dependent vowel placed behind the base (right of the base)
// _co Khmer combining mark COENG u17D2, combines with the consonant or independent vowel following
//     it to create a subscript consonant or independent vowel
// _va Khmer split vowel in which the first part is before the base and the second one above the base
// _vr Khmer split vowel in which the first part is before the base and the second one behind (right of) the base
*/
static const KhmerCharClass khmerCharClasses[] = {
    _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, /* 1780 - 178F */
    _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c2, _c1, _c1, _c1, _c3, _c3, /* 1790 - 179F */
    _c1, _c3, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, /* 17A0 - 17AF */
    _c1, _c1, _c1, _c1, _dr, _dr, _dr, _da, _da, _da, _da, _db, _db, _db, _va, _vr, /* 17B0 - 17BF */
    _vr, _dl, _dl, _dl, _vr, _vr, _sa, _sp, _sp, _cs, _cs, _sa, _rb, _sa, _sa, _sa, /* 17C0 - 17CF */
    _sa, _sa, _co, _sa, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _sa, _xx, _xx /* 17D0 - 17DF */
};
1941
/* First and last code point covered by khmerCharClasses; this enum must
   reflect the range of that table. */
enum KhmerCharClassesRange {
    KhmerFirstChar = 0x1780,
    KhmerLastChar = 0x17df
};
1947
1948/*
1949// Below we define how a character in the input string is either in the khmerCharClasses table
1950// (in which case we get its type back), a ZWJ or ZWNJ (two characters that may appear
1951// within the syllable, but are not in the table) we also get their type back, or an unknown object
1952// in which case we get _xx (CC_RESERVED) back
1953*/
1954static KhmerCharClass getKhmerCharClass(ushort uc)
1955{
1956 if (uc == C_SIGN_ZWJ) {
1957 return CC_ZERO_WIDTH_J_MARK;
1958 }
1959
1960 if (uc == C_SIGN_ZWNJ) {
1961 return CC_ZERO_WIDTH_NJ_MARK;
1962 }
1963
1964 if (uc < KhmerFirstChar || uc > KhmerLastChar) {
1965 return CC_RESERVED;
1966 }
1967
1968 return khmerCharClasses[uc - KhmerFirstChar];
1969}
1970
1971
1972/*
1973// The stateTable is used to calculate the end (the length) of a well
1974// formed Khmer Syllable.
1975//
1976// Each horizontal line is ordered exactly the same way as the values in KhmerClassTable
1977// CharClassValues. This coincidence of values allows the follow up of the table.
1978//
1979// Each line corresponds to a state, which does not necessarily need to be a type
1980// of component... for example, state 2 is a base, with is always a first character
1981// in the syllable, but the state could be produced a consonant of any type when
1982// it is the first character that is analysed (in ground state).
1983//
1984// Differentiating 3 types of consonants is necessary in order to
1985// forbid the use of certain combinations, such as having a second
1986// coeng after a coeng RO,
1987// The inexistent possibility of having a type 3 after another type 3 is permitted,
1988// eliminating it would very much complicate the table, and it does not create typing
1989// problems, as the case above.
1990//
1991// The table is quite complex, in order to limit the number of coeng consonants
1992// to 2 (by means of the table).
1993//
// There is a peculiarity, as far as Unicode is concerned:
1995// - The consonant-shifter is considered in two possible different
1996// locations, the one considered in Unicode 3.0 and the one considered in
1997// Unicode 4.0. (there is a backwards compatibility problem in this standard).
1998//
1999//
2000// xx independent character, such as a number, punctuation sign or non-khmer char
2001//
// c1 Khmer consonant of type 1 or an independent vowel
//    that is, a letter in which the subscript form is only under the
//    base, not taking any space to the right or to the left
2005//
2006// c2 Khmer consonant of type 2, the coeng form takes space under
2007// and to the left of the base (only RO is of this type)
2008//
2009// c3 Khmer consonant of type 3. Its subscript form takes space under
2010// and to the right of the base.
2011//
2012// cs Khmer consonant shifter
2013//
2014// rb Khmer robat
2015//
2016// co coeng character (u17D2)
2017//
2018// dv dependent vowel (including split vowels, they are treated in the same way).
2019// even if dv is not defined above, the component that is really tested for is
2020// KhmerClassTable::CC_DEPENDENT_VOWEL, which is common to all dependent vowels
2021//
2022// zwj Zero Width joiner
2023//
2024// zwnj Zero width non joiner
2025//
2026// sa above sign
2027//
2028// sp post sign
2029//
2030// there are lines with equal content but for an easier understanding
2031// (and maybe change in the future) we did not join them
2032*/
/* Rows are states, columns are KhmerCharClassValues; an entry is the next
   state, and a negative entry ends the syllable (see khmer_nextSyllableBoundary). */
static const signed char khmerStateTable[][CC_COUNT] =
{
    /* xx  c1  c2  c3 zwnj cs  rb  co  dv  sa  sp zwj */
    { 1, 2, 2, 2, 1, 1, 1, 6, 1, 1, 1, 2}, /* 0 - ground state */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sign to the right of the syllable) */
    {-1, -1, -1, -1, 3, 4, 5, 6, 16, 17, 1, -1}, /* 2 - Base consonant */
    {-1, -1, -1, -1, -1, 4, -1, -1, 16, -1, -1, -1}, /* 3 - First ZWNJ before a register shifter. It can only be followed by a shifter or a vowel */
    {-1, -1, -1, -1, 15, -1, -1, 6, 16, 17, 1, 14}, /* 4 - First register shifter */
    {-1, -1, -1, -1, -1, -1, -1, -1, 20, -1, 1, -1}, /* 5 - Robat */
    {-1, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1}, /* 6 - First Coeng */
    {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 7 - First consonant of type 1 after coeng */
    {-1, -1, -1, -1, 12, 13, -1, -1, 16, 17, 1, 14}, /* 8 - First consonant of type 2 after coeng */
    {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 9 - First consonant of type 3 after coeng */
    {-1, 11, 11, 11, -1, -1, -1, -1, -1, -1, -1, -1}, /* 10 - Second Coeng (no register shifter before) */
    {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 11 - Second coeng consonant (or ind. vowel) no register shifter before */
    {-1, -1, -1, -1, -1, 13, -1, -1, 16, -1, -1, -1}, /* 12 - Second ZWNJ before a register shifter */
    {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 13 - Second register shifter */
    {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 14 - ZWJ before vowel */
    {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 15 - ZWNJ before vowel */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 1, 18}, /* 16 - dependent vowel */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 18}, /* 17 - sign above */
    {-1, -1, -1, -1, -1, -1, -1, 19, -1, -1, -1, -1}, /* 18 - ZWJ after vowel */
    {-1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 19 - Third coeng */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 20 - dependent vowel after a Robat */
};
2058
2059
/* Define KHMER_DEBUG to trace the Khmer state machine through qDebug();
   otherwise KHDEBUG(...) expands to a printf call that is never executed. */
/* #define KHMER_DEBUG */
#if !defined(KHMER_DEBUG)
#  define KHDEBUG if (0) printf
#else
#  define KHDEBUG qDebug
#endif
2068
2069/*
2070// Given an input string of characters and a location in which to start looking
2071// calculate, using the state table, which one is the last character of the syllable
2072// that starts in the starting position.
2073*/
2074static qsizetype khmer_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
2075{
2076 const char16_t *uc = s + start;
2077 int state = 0;
2078 qsizetype pos = start;
2079 *invalid = false;
2080
2081 while (pos < end) {
2082 KhmerCharClass charClass = getKhmerCharClass(*uc);
2083 if (pos == start) {
2084 *invalid = (charClass > 0) && ! (charClass & CF_CONSONANT);
2085 }
2086 state = khmerStateTable[state][charClass & CF_CLASS_MASK];
2087
2088 KHDEBUG("state[%d]=%d class=%8lx (uc=%4x)", int(pos - start), state,
2089 charClass, *uc );
2090
2091 if (state < 0) {
2092 break;
2093 }
2094 ++uc;
2095 ++pos;
2096 }
2097 return pos;
2098}
2099
2100static void khmerAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
2101{
2102 qsizetype end = from + len;
2103 const char16_t *uc = text + from;
2104 qsizetype i = 0;
2105 Q_UNUSED(script);
2106 attributes += from;
2107 while ( i < len ) {
2108 bool invalid;
2109 qsizetype boundary = khmer_nextSyllableBoundary( text, from+i, end, &invalid ) - from;
2110
2111 attributes[i].graphemeBoundary = true;
2112
2113 if ( boundary > len-1 ) boundary = len;
2114 i++;
2115 while ( i < boundary ) {
2116 attributes[i].graphemeBoundary = false;
2117 ++uc;
2118 ++i;
2119 }
2120 assert( i == boundary );
2121 }
2122}
2123
2124
// Tailoring function per script, indexed by QChar::Script value; a null entry
// means the script has no tailoring. The table ends at Script_Khmer:
// getCharAttributes() maps every later script to Script_Common before
// indexing, so entries must stay in QChar::Script order.
const CharAttributeFunction charAttributeFunction[] = {
//    Script_Unknown,
    nullptr,
//    Script_Inherited,
    nullptr,
//    Script_Common,
    nullptr,
//    Script_Latin,
    nullptr,
//    Script_Greek,
    nullptr,
//    Script_Cyrillic,
    nullptr,
//    Script_Armenian,
    nullptr,
//    Script_Hebrew,
    nullptr,
//    Script_Arabic,
    nullptr,
//    Script_Syriac,
    nullptr,
//    Script_Thaana,
    nullptr,
//    Script_Devanagari,
    indicAttributes,
//    Script_Bengali,
    indicAttributes,
//    Script_Gurmukhi,
    indicAttributes,
//    Script_Gujarati,
    indicAttributes,
//    Script_Oriya,
    indicAttributes,
//    Script_Tamil,
    indicAttributes,
//    Script_Telugu,
    indicAttributes,
//    Script_Kannada,
    indicAttributes,
//    Script_Malayalam,
    indicAttributes,
//    Script_Sinhala,
    indicAttributes,
//    Script_Thai,
    thaiAttributes,
//    Script_Lao,
    nullptr,
//    Script_Tibetan,
    tibetanAttributes,
//    Script_Myanmar,
    myanmarAttributes,
//    Script_Georgian,
    nullptr,
//    Script_Hangul,
    nullptr,
//    Script_Ethiopic,
    nullptr,
//    Script_Cherokee,
    nullptr,
//    Script_CanadianAboriginal,
    nullptr,
//    Script_Ogham,
    nullptr,
//    Script_Runic,
    nullptr,
//    Script_Khmer,
    khmerAttributes
};
2193
2194static void getCharAttributes(const char16_t *string, qsizetype stringLength,
2195 const QUnicodeTools::ScriptItem *items, qsizetype numItems,
2196 QCharAttributes *attributes)
2197{
2198 if (stringLength == 0)
2199 return;
2200 for (qsizetype i = 0; i < numItems; ++i) {
2201 QChar::Script script = items[i].script;
2202 if (script > QChar::Script_Khmer)
2203 script = QChar::Script_Common;
2204 CharAttributeFunction attributeFunction = charAttributeFunction[script];
2205 if (!attributeFunction)
2206 continue;
2207 qsizetype end = i < numItems - 1 ? items[i + 1].position : stringLength;
2208 attributeFunction(script, string, items[i].position, end - items[i].position, attributes);
2209 }
2210}
2211
2212}
2213
2214Q_CORE_EXPORT void initCharAttributes(QStringView string,
2215 const ScriptItem *items, qsizetype numItems,
2216 QCharAttributes *attributes, CharAttributeOptions options)
2217{
2218 if (string.size() <= 0)
2219 return;
2220
2221 if (!(options & DontClearAttributes))
2222 ::memset(attributes, 0, (string.size() + 1) * sizeof(QCharAttributes));
2223
2224 if (options & GraphemeBreaks)
2225 getGraphemeBreaks(string.utf16(), string.size(), attributes);
2226 if (options & WordBreaks)
2227 getWordBreaks(string.utf16(), string.size(), attributes);
2228 if (options & SentenceBreaks)
2229 getSentenceBreaks(string.utf16(), string.size(), attributes);
2230 if (options & LineBreaks)
2231 getLineBreaks(string.utf16(), string.size(), attributes, options);
2232 if (options & WhiteSpaces)
2233 getWhiteSpaces(string.utf16(), string.size(), attributes);
2234
2235 if (!qt_initcharattributes_default_algorithm_only) {
2236 if (!items || numItems <= 0)
2237 return;
2238
2239 Tailored::getCharAttributes(string.utf16(), string.size(), items, numItems, attributes);
2240 }
2241}
2242
2243
2244// ----------------------------------------------------------------------------
2245//
2246// The Unicode script property. See http://www.unicode.org/reports/tr24/tr24-24.html
2247//
2248// ----------------------------------------------------------------------------
2249
// Splits string into runs of a single script and appends one ScriptItem
// (start position, script) per run to *scripts. At least one item is always
// appended, even for an empty string (position 0, Script_Common).
Q_CORE_EXPORT void initScripts(QStringView string, ScriptItemArray *scripts)
{
    qsizetype sor = 0; // start of the current run
    qsizetype eor = 0; // while the loop body runs: index of the current code point's first code unit
    QChar::Script script = QChar::Script_Common; // script of the current run

    // Note: eor is updated in the loop's step expression, so it is also
    // advanced on every "continue" below.
    for (qsizetype i = 0; i < string.size(); ++i, eor = i) {
        char32_t ucs4 = string[i].unicode();
        // combine a valid surrogate pair into one code point and skip its low half
        if (QChar::isHighSurrogate(ucs4) && i + 1 < string.size()) {
            ushort low = string[i + 1].unicode();
            if (QChar::isLowSurrogate(low)) {
                ucs4 = QChar::surrogateToUcs4(ucs4, low);
                ++i;
            }
        }

        const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);

        QChar::Script nscript = QChar::Script(prop->script);

        // same script as the run, or a script value up to Script_Common:
        // the character simply joins the current run
        if (Q_LIKELY(nscript == script || nscript <= QChar::Script_Common))
            continue;

        // inherit preceding Common-s
        if (Q_UNLIKELY(script <= QChar::Script_Common)) {
            // also covers a case where the base character of Common script followed
            // by one or more combining marks of non-Inherited, non-Common script
            script = nscript;
            continue;
        }

        // Never break between a combining mark (gc= Mc, Mn or Me) and its base character.
        // Thus, a combining mark - whatever its script property value is - should inherit
        // the script property value of its base character.
        static const int test = (FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining) | FLAG(QChar::Mark_Enclosing));
        if (Q_UNLIKELY(FLAG(prop->category) & test))
            continue;

        // a real script change: close the current run and start a new one here
        Q_ASSERT(script > QChar::Script_Common);
        Q_ASSERT(sor < eor);
        scripts->append(ScriptItem{sor, script});
        sor = eor;

        script = nscript;
    }

    // close the final run
    Q_ASSERT(script >= QChar::Script_Common);
    Q_ASSERT(eor == string.size());
    scripts->append(ScriptItem{sor, script});
}
2300
2301} // namespace QUnicodeTools
2302
2303QT_END_NAMESPACE
2304