1/****************************************************************************
2**
3** Copyright (C) 2016 The Qt Company Ltd.
4** Contact: https://www.qt.io/licensing/
5**
6** This file is part of the QtCore module of the Qt Toolkit.
7**
8** $QT_BEGIN_LICENSE:LGPL$
9** Commercial License Usage
10** Licensees holding valid commercial Qt licenses may use this file in
11** accordance with the commercial license agreement provided with the
12** Software or, alternatively, in accordance with the terms contained in
13** a written agreement between you and The Qt Company. For licensing terms
14** and conditions see https://www.qt.io/terms-conditions. For further
15** information use the contact form at https://www.qt.io/contact-us.
16**
17** GNU Lesser General Public License Usage
18** Alternatively, this file may be used under the terms of the GNU Lesser
19** General Public License version 3 as published by the Free Software
20** Foundation and appearing in the file LICENSE.LGPL3 included in the
21** packaging of this file. Please review the following information to
22** ensure the GNU Lesser General Public License version 3 requirements
23** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24**
25** GNU General Public License Usage
26** Alternatively, this file may be used under the terms of the GNU
27** General Public License version 2.0 or (at your option) the GNU General
28** Public license version 3 or any later version approved by the KDE Free
29** Qt Foundation. The licenses are as published by the Free Software
30** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31** included in the packaging of this file. Please review the following
32** information to ensure the GNU General Public License requirements will
33** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34** https://www.gnu.org/licenses/gpl-3.0.html.
35**
36** $QT_END_LICENSE$
37**
38****************************************************************************/
39#include <QtCore/qtextboundaryfinder.h>
40#include <QtCore/qvarlengtharray.h>
41
42#include <private/qunicodetools_p.h>
43
44QT_BEGIN_NAMESPACE
45
46static void init(QTextBoundaryFinder::BoundaryType type, QStringView str, QCharAttributes *attributes)
47{
48 QUnicodeTools::ScriptItemArray scriptItems;
49 QUnicodeTools::initScripts(str, &scriptItems);
50
51 QUnicodeTools::CharAttributeOptions options;
52 switch (type) {
53 case QTextBoundaryFinder::Grapheme: options |= QUnicodeTools::GraphemeBreaks; break;
54 case QTextBoundaryFinder::Word: options |= QUnicodeTools::WordBreaks; break;
55 case QTextBoundaryFinder::Sentence: options |= QUnicodeTools::SentenceBreaks; break;
56 case QTextBoundaryFinder::Line: options |= QUnicodeTools::LineBreaks; break;
57 default: break;
58 }
59 QUnicodeTools::initCharAttributes(str, scriptItems.data(), scriptItems.count(), attributes, options);
60}
61
62/*!
63 \class QTextBoundaryFinder
64 \inmodule QtCore
65
66 \brief The QTextBoundaryFinder class provides a way of finding Unicode text boundaries in a string.
67
68 \since 4.4
69 \ingroup tools
70 \ingroup shared
71 \ingroup string-processing
72 \reentrant
73
    QTextBoundaryFinder allows you to find Unicode text boundaries in a
    string, according to the Unicode text boundary specification (see
    \l{http://www.unicode.org/reports/tr14/}{Unicode Standard Annex #14} and
    \l{http://www.unicode.org/reports/tr29/}{Unicode Standard Annex #29}).
78
79 QTextBoundaryFinder can operate on a QString in four possible
80 modes depending on the value of \a BoundaryType.
81
    Units of Unicode characters that make up what the user thinks of
    as a character or basic unit of the language are here called
    grapheme clusters. For example, the two Unicode characters 'A' +
    diaeresis form one grapheme cluster: the user thinks of them as
    one character, yet it is represented by two Unicode code points
    (see \l{http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries}).
89
90 Word boundaries are there to locate the start and end of what a
91 language considers to be a word
92 (see \l{http://www.unicode.org/reports/tr29/#Word_Boundaries}).
93
94 Line break boundaries give possible places where a line break
95 might happen and sentence boundaries will show the beginning and
96 end of whole sentences
97 (see \l{http://www.unicode.org/reports/tr29/#Sentence_Boundaries} and
98 \l{http://www.unicode.org/reports/tr14/}).
99
100 The first position in a string is always a valid boundary and
101 refers to the position before the first character. The last
102 position at the length of the string is also valid and refers
103 to the position after the last character.
104*/
105
106/*!
107 \enum QTextBoundaryFinder::BoundaryType
108
    \value Grapheme Finds a grapheme, which is the smallest boundary. Graphemes
    include letters, punctuation marks, numerals and more.
111 \value Word Finds a word.
112 \value Line Finds possible positions for breaking the text into multiple
113 lines.
114 \value Sentence Finds sentence boundaries. These include periods, question
115 marks etc.
116*/
117
118/*!
119 \enum QTextBoundaryFinder::BoundaryReason
120
121 \value NotAtBoundary The boundary finder is not at a boundary position.
122 \value BreakOpportunity The boundary finder is at a break opportunity position.
123 Such a break opportunity might also be an item boundary
124 (either StartOfItem, EndOfItem, or combination of both),
125 a mandatory line break, or a soft hyphen.
126 \value StartOfItem Since 5.0. The boundary finder is at the start of
127 a grapheme, a word, a sentence, or a line.
128 \value EndOfItem Since 5.0. The boundary finder is at the end of
129 a grapheme, a word, a sentence, or a line.
130 \value MandatoryBreak Since 5.0. The boundary finder is at the end of line
131 (can occur for a Line boundary type only).
132 \value SoftHyphen The boundary finder is at the soft hyphen
133 (can occur for a Line boundary type only).
134*/
135
136/*!
137 Constructs an invalid QTextBoundaryFinder object.
138*/
139QTextBoundaryFinder::QTextBoundaryFinder()
140 : freeBuffer(true)
141{
142}
143
144/*!
145 Copies the QTextBoundaryFinder object, \a other.
146*/
147QTextBoundaryFinder::QTextBoundaryFinder(const QTextBoundaryFinder &other)
148 : t(other.t)
149 , s(other.s)
150 , sv(other.sv)
151 , pos(other.pos)
152 , freeBuffer(true)
153{
154 if (other.attributes) {
155 Q_ASSERT(sv.size() > 0);
156 attributes = (QCharAttributes *) malloc((sv.size() + 1) * sizeof(QCharAttributes));
157 Q_CHECK_PTR(attributes);
158 memcpy(attributes, other.attributes, (sv.size() + 1) * sizeof(QCharAttributes));
159 }
160}
161
162/*!
163 Assigns the object, \a other, to another QTextBoundaryFinder object.
164*/
165QTextBoundaryFinder &QTextBoundaryFinder::operator=(const QTextBoundaryFinder &other)
166{
167 if (&other == this)
168 return *this;
169
170 if (other.attributes) {
171 Q_ASSERT(other.sv.size() > 0);
172 size_t newCapacity = (size_t(other.sv.size()) + 1) * sizeof(QCharAttributes);
173 QCharAttributes *newD = (QCharAttributes *) realloc(freeBuffer ? attributes : nullptr, newCapacity);
174 Q_CHECK_PTR(newD);
175 freeBuffer = true;
176 attributes = newD;
177 }
178
179 t = other.t;
180 s = other.s;
181 sv = other.sv;
182 pos = other.pos;
183
184 if (other.attributes) {
185 memcpy(attributes, other.attributes, (sv.size() + 1) * sizeof(QCharAttributes));
186 } else {
187 if (freeBuffer)
188 free(attributes);
189 attributes = nullptr;
190 }
191
192 return *this;
193}
194
195/*!
196 Destructs the QTextBoundaryFinder object.
197*/
198QTextBoundaryFinder::~QTextBoundaryFinder()
199{
200 Q_UNUSED(unused);
201 if (freeBuffer)
202 free(attributes);
203}
204
205/*!
206 Creates a QTextBoundaryFinder object of \a type operating on \a string.
207*/
208QTextBoundaryFinder::QTextBoundaryFinder(BoundaryType type, const QString &string)
209 : t(type)
210 , s(string)
211 , sv(s)
212 , pos(0)
213 , freeBuffer(true)
214 , attributes(nullptr)
215{
216 if (sv.size() > 0) {
217 attributes = (QCharAttributes *) malloc((sv.size() + 1) * sizeof(QCharAttributes));
218 Q_CHECK_PTR(attributes);
219 init(t, sv, attributes);
220 }
221}
222
223/*!
224 \fn QTextBoundaryFinder::QTextBoundaryFinder(BoundaryType type, const QChar *chars, qsizetype length, unsigned char *buffer, qsizetype bufferSize)
225 \overload
226
227 The same as QTextBoundaryFinder(type, QStringView(chars, length), buffer, bufferSize).
228*/
229
230/*!
231 Creates a QTextBoundaryFinder object of \a type operating on \a string.
232 \since 6.0
233
234 \a buffer is an optional working buffer of size \a bufferSize you can pass to
235 the QTextBoundaryFinder. If the buffer is large enough to hold the working
236 data required (bufferSize >= length + 1), it will use this
237 instead of allocating its own buffer.
238
239 \warning QTextBoundaryFinder does not create a copy of \a string. It is the
240 application programmer's responsibility to ensure the array is allocated for
241 as long as the QTextBoundaryFinder object stays alive. The same applies to
242 \a buffer.
243*/
244QTextBoundaryFinder::QTextBoundaryFinder(BoundaryType type, QStringView string, unsigned char *buffer, qsizetype bufferSize)
245 : t(type)
246 , sv(string)
247 , pos(0)
248 , freeBuffer(true)
249 , attributes(nullptr)
250{
251 if (!sv.isEmpty()) {
252 if (buffer && (uint)bufferSize >= (sv.size() + 1) * sizeof(QCharAttributes)) {
253 attributes = reinterpret_cast<QCharAttributes *>(buffer);
254 freeBuffer = false;
255 } else {
256 attributes = (QCharAttributes *) malloc((sv.size() + 1) * sizeof(QCharAttributes));
257 Q_CHECK_PTR(attributes);
258 }
259 init(t, sv, attributes);
260 }
261}
262
263/*!
264 Moves the finder to the start of the string. This is equivalent to setPosition(0).
265
266 \sa setPosition(), position()
267*/
268void QTextBoundaryFinder::toStart()
269{
270 pos = 0;
271}
272
273/*!
274 Moves the finder to the end of the string. This is equivalent to setPosition(string.length()).
275
276 \sa setPosition(), position()
277*/
278void QTextBoundaryFinder::toEnd()
279{
280 pos = sv.size();
281}
282
283/*!
284 Returns the current position of the QTextBoundaryFinder.
285
286 The range is from 0 (the beginning of the string) to the length of
287 the string inclusive.
288
289 \sa setPosition()
290*/
291qsizetype QTextBoundaryFinder::position() const
292{
293 return pos;
294}
295
296/*!
297 Sets the current position of the QTextBoundaryFinder to \a position.
298
299 If \a position is out of bounds, it will be bound to only valid
300 positions. In this case, valid positions are from 0 to the length of
301 the string inclusive.
302
303 \sa position()
304*/
305void QTextBoundaryFinder::setPosition(qsizetype position)
306{
307 pos = qBound(0, position, sv.size());
308}
309
310/*! \fn QTextBoundaryFinder::BoundaryType QTextBoundaryFinder::type() const
311
312 Returns the type of the QTextBoundaryFinder.
313*/
314
315/*! \fn bool QTextBoundaryFinder::isValid() const
316
317 Returns \c true if the text boundary finder is valid; otherwise returns \c false.
318 A default QTextBoundaryFinder is invalid.
319*/
320
321/*!
322 Returns the string the QTextBoundaryFinder object operates on.
323*/
324QString QTextBoundaryFinder::string() const
325{
326 if (sv.data() == s.unicode() && sv.size() == s.size())
327 return s;
328 return sv.toString();
329}
330
331
332/*!
333 Moves the QTextBoundaryFinder to the next boundary position and returns that position.
334
335 Returns -1 if there is no next boundary.
336*/
337qsizetype QTextBoundaryFinder::toNextBoundary()
338{
339 if (!attributes || pos < 0 || pos >= sv.size()) {
340 pos = -1;
341 return pos;
342 }
343
344 ++pos;
345 switch(t) {
346 case Grapheme:
347 while (pos < sv.size() && !attributes[pos].graphemeBoundary)
348 ++pos;
349 break;
350 case Word:
351 while (pos < sv.size() && !attributes[pos].wordBreak)
352 ++pos;
353 break;
354 case Sentence:
355 while (pos < sv.size() && !attributes[pos].sentenceBoundary)
356 ++pos;
357 break;
358 case Line:
359 while (pos < sv.size() && !attributes[pos].lineBreak)
360 ++pos;
361 break;
362 }
363
364 return pos;
365}
366
367/*!
368 Moves the QTextBoundaryFinder to the previous boundary position and returns that position.
369
370 Returns -1 if there is no previous boundary.
371*/
372qsizetype QTextBoundaryFinder::toPreviousBoundary()
373{
374 if (!attributes || pos <= 0 || pos > sv.size()) {
375 pos = -1;
376 return pos;
377 }
378
379 --pos;
380 switch(t) {
381 case Grapheme:
382 while (pos > 0 && !attributes[pos].graphemeBoundary)
383 --pos;
384 break;
385 case Word:
386 while (pos > 0 && !attributes[pos].wordBreak)
387 --pos;
388 break;
389 case Sentence:
390 while (pos > 0 && !attributes[pos].sentenceBoundary)
391 --pos;
392 break;
393 case Line:
394 while (pos > 0 && !attributes[pos].lineBreak)
395 --pos;
396 break;
397 }
398
399 return pos;
400}
401
402/*!
403 Returns \c true if the object's position() is currently at a valid text boundary.
404*/
405bool QTextBoundaryFinder::isAtBoundary() const
406{
407 if (!attributes || pos < 0 || pos > sv.size())
408 return false;
409
410 switch(t) {
411 case Grapheme:
412 return attributes[pos].graphemeBoundary;
413 case Word:
414 return attributes[pos].wordBreak;
415 case Sentence:
416 return attributes[pos].sentenceBoundary;
417 case Line:
418 // ### TR#14 LB2 prohibits break at sot
419 return attributes[pos].lineBreak || pos == 0;
420 }
421 return false;
422}
423
424/*!
425 Returns the reasons for the boundary finder to have chosen the current position as a boundary.
426*/
427QTextBoundaryFinder::BoundaryReasons QTextBoundaryFinder::boundaryReasons() const
428{
429 BoundaryReasons reasons = NotAtBoundary;
430 if (!attributes || pos < 0 || pos > sv.size())
431 return reasons;
432
433 const QCharAttributes attr = attributes[pos];
434 switch (t) {
435 case Grapheme:
436 if (attr.graphemeBoundary) {
437 reasons |= BreakOpportunity | StartOfItem | EndOfItem;
438 if (pos == 0)
439 reasons &= (~EndOfItem);
440 else if (pos == sv.size())
441 reasons &= (~StartOfItem);
442 }
443 break;
444 case Word:
445 if (attr.wordBreak) {
446 reasons |= BreakOpportunity;
447 if (attr.wordStart)
448 reasons |= StartOfItem;
449 if (attr.wordEnd)
450 reasons |= EndOfItem;
451 }
452 break;
453 case Sentence:
454 if (attr.sentenceBoundary) {
455 reasons |= BreakOpportunity | StartOfItem | EndOfItem;
456 if (pos == 0)
457 reasons &= (~EndOfItem);
458 else if (pos == sv.size())
459 reasons &= (~StartOfItem);
460 }
461 break;
462 case Line:
463 // ### TR#14 LB2 prohibits break at sot
464 if (attr.lineBreak || pos == 0) {
465 reasons |= BreakOpportunity;
466 if (attr.mandatoryBreak || pos == 0) {
467 reasons |= MandatoryBreak | StartOfItem | EndOfItem;
468 if (pos == 0)
469 reasons &= (~EndOfItem);
470 else if (pos == sv.size())
471 reasons &= (~StartOfItem);
472 } else if (pos > 0 && sv[pos - 1].unicode() == QChar::SoftHyphen) {
473 reasons |= SoftHyphen;
474 }
475 }
476 break;
477 default:
478 break;
479 }
480
481 return reasons;
482}
483
484QT_END_NAMESPACE
485