WordBreaker.cpp source code [engine/flutter/third_party/txt/src/minikin/WordBreaker.cpp]

1	/*
2	* Copyright (C) 2015 The Android Open Source Project
3	*
4	* Licensed under the Apache License, Version 2.0 (the "License");
5	* you may not use this file except in compliance with the License.
6	* You may obtain a copy of the License at
7	*
8	* http://www.apache.org/licenses/LICENSE-2.0
9	*
10	* Unless required by applicable law or agreed to in writing, software
11	* distributed under the License is distributed on an "AS IS" BASIS,
12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13	* See the License for the specific language governing permissions and
14	* limitations under the License.
15	*/
16
17	#define LOG_TAG "Minikin"
18
19	#include <log/log.h>
20
21	#include <minikin/Emoji.h>
22	#include <minikin/Hyphenator.h>
23	#include <minikin/WordBreaker.h>
24	#include "MinikinInternal.h"
25
26	#include <unicode/uchar.h>
27	#include <unicode/utf16.h>
28
29	namespace minikin {
30
// Code points that isBreakValid() compares against explicitly.
constexpr uint32_t CHAR_SOFT_HYPHEN = 0x00AD;  // U+00AD SOFT HYPHEN
constexpr uint32_t CHAR_ZWJ = 0x200D;          // U+200D ZERO WIDTH JOINER
33
34	void WordBreaker::setLocale(const icu::Locale& locale) {
35	UErrorCode status = U_ZERO_ERROR;
36	mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status));
37	// TODO: handle failure status
38	if (mText != nullptr) {
39	mBreakIterator ->setText(&mUText, status);
40	}
41	mIteratorWasReset = true;
42	}
43
44	void WordBreaker::setText(const uint16_t* data, size_t size) {
45	mText = data;
46	mTextSize = size;
47	mIteratorWasReset = false;
48	mLast = `0`;
49	mCurrent = `0`;
50	mScanOffset = `0`;
51	mInEmailOrUrl = false;
52	UErrorCode status = U_ZERO_ERROR;
53	utext_openUChars(&mUText, reinterpret_cast<const UChar*>(data), size,
54	&status);
55	mBreakIterator ->setText(&mUText, status);
56	mBreakIterator ->first();
57	}
58
59	ssize_t WordBreaker::current() const {
60	return mCurrent;
61	}
62
63	/**
64	* Determine whether a line break at position i within the buffer buf is valid.
65	*This represents customization beyond the ICU behavior, because plain ICU
66	*provides some line break opportunities that we don't want.
67	**/
68	static bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) {
69	uint32_t codePoint;
70	size_t prev_offset = i;
71	U16_PREV(buf, `0`, prev_offset, codePoint);
72	// Do not break on hard or soft hyphens. These are handled by automatic
73	// hyphenation.
74	if (Hyphenator::isLineBreakingHyphen(codePoint) \|\|
75	codePoint == CHAR_SOFT_HYPHEN) {
76	// txt addition: Temporarily always break on hyphen. Changed from false to
77	// true.
78	return true;
79	}
80	// For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA,
81	// consonant>. This is to go around a bug in ICU line breaking:
82	// http://bugs.icu-project.org/trac/ticket/12561. To avoid too much looking
83	// around in the strings, we simply avoid breaking after any Myanmar virama,
84	// where no line break could be imagined, since the Myanmar virama is a pure
85	// stacker.
86	if (codePoint == `0x1039`) { // MYANMAR SIGN VIRAMA
87	return false;
88	}
89
90	uint32_t next_codepoint;
91	size_t next_offset = i;
92	U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
93
94	// Rule LB8 for Emoji ZWJ sequences. We need to do this ourselves since we may
95	// have fresher emoji data than ICU does.
96	if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) {
97	return false;
98	}
99
100	// Rule LB30b. We need to this ourselves since we may have fresher emoji data
101	// than ICU does.
102	if (isEmojiModifier(next_codepoint)) {
103	if (codePoint == `0xFE0F` && prev_offset > `0`) {
104	// skip over emoji variation selector
105	U16_PREV(buf, `0`, prev_offset, codePoint);
106	}
107	if (isEmojiBase(codePoint)) {
108	return false;
109	}
110	}
111	return true;
112	}
113
114	// Customized iteratorNext that takes care of both resets and our modifications
115	// to ICU's behavior.
116	int32_t WordBreaker::iteratorNext() {
117	int32_t result;
118	do {
119	if (mIteratorWasReset) {
120	result = mBreakIterator ->following(mCurrent);
121	mIteratorWasReset = false;
122	} else {
123	result = mBreakIterator ->next();
124	}
125	} while (!(result == icu::BreakIterator::DONE \|\|
126	(size_t)result == mTextSize \|\|
127	isBreakValid(mText, mTextSize, result)));
128	return result;
129	}
130
// Returns true for the characters after which the Chicago Manual of Style
// recommends breaking inside URLs and email addresses: ':', '=' and '&'.
static bool breakAfter(uint16_t c) {
  switch (c) {
    case ':':
    case '=':
    case '&':
      return true;
    default:
      return false;
  }
}
136
// Returns true for the characters before which the Chicago Manual of Style
// recommends breaking inside URLs and email addresses:
// '~' '.' ',' '-' '_' '?' '#' '%' '=' '&'.
static bool breakBefore(uint16_t c) {
  switch (c) {
    case '~':
    case '.':
    case ',':
    case '-':
    case '_':
    case '?':
    case '#':
    case '%':
    case '=':
    case '&':
      return true;
    default:
      return false;
  }
}
143
// States of the email/URL scanner in WordBreaker::detectEmailOrUrl().
// The values of SAW_COLON, SAW_COLON_SLASH and SAW_COLON_SLASH_SLASH must
// stay consecutive: the scanner moves to the next of them by adding 1.
enum ScanState {
  START = 0,                  // nothing recognized yet
  SAW_AT = 1,                 // an '@' was seen (treated as an email address)
  SAW_COLON = 2,              // a ':' was seen
  SAW_COLON_SLASH = 3,        // ":/" was seen
  SAW_COLON_SLASH_SLASH = 4,  // "://" was seen (treated as a URL)
};
151
152	void WordBreaker::detectEmailOrUrl() {
153	// scan forward from current ICU position for email address or URL
154	if (mLast >= mScanOffset) {
155	ScanState state = START;
156	size_t i;
157	for (i = mLast; i < mTextSize; i++) {
158	uint16_t c = mText[i];
159	// scan only ASCII characters, stop at space
160	if (!(`' '` < c && c <= `0x007E`)) {
161	break;
162	}
163	if (state == START && c == `'@'`) {
164	state = SAW_AT;
165	} else if (state == START && c == `':'`) {
166	state = SAW_COLON;
167	} else if (state == SAW_COLON \|\| state == SAW_COLON_SLASH) {
168	if (c == `'/'`) {
169	state = static_cast<ScanState>((int)state +
170	`1`); // next state adds a slash
171	} else {
172	state = START;
173	}
174	}
175	}
176	if (state == SAW_AT \|\| state == SAW_COLON_SLASH_SLASH) {
177	if (!mBreakIterator ->isBoundary(i)) {
178	// If there are combining marks or such at the end of the URL or the
179	// email address, consider them a part of the URL or the email, and skip
180	// to the next actual boundary.
181	i = mBreakIterator ->following(i);
182	}
183	mInEmailOrUrl = true;
184	mIteratorWasReset = true;
185	} else {
186	mInEmailOrUrl = false;
187	}
188	mScanOffset = i;
189	}
190	}
191
192	ssize_t WordBreaker::findNextBreakInEmailOrUrl() {
193	// special rules for email addresses and URL's as per Chicago Manual of Style
194	// (16th ed.)
195	uint16_t lastChar = mText[mLast];
196	ssize_t i;
197	for (i = mLast + `1`; i < mScanOffset; i++) {
198	if (breakAfter(lastChar)) {
199	break;
200	}
201	// break after double slash
202	if (lastChar == `'/'` && i >= mLast + `2` && mText[i - `2`] == `'/'`) {
203	break;
204	}
205	const uint16_t thisChar = mText[i];
206	// never break after hyphen
207	if (lastChar != `'-'`) {
208	if (breakBefore(thisChar)) {
209	break;
210	}
211	// break before single slash
212	if (thisChar == `'/'` && lastChar != `'/'` &&
213	!(i + `1` < mScanOffset && mText[i + `1`] == `'/'`)) {
214	break;
215	}
216	}
217	lastChar = thisChar;
218	}
219	return i;
220	}
221
222	ssize_t WordBreaker::next() {
223	mLast = mCurrent;
224
225	detectEmailOrUrl();
226	if (mInEmailOrUrl) {
227	mCurrent = findNextBreakInEmailOrUrl();
228	} else { // Business as usual
229	mCurrent = (ssize_t)iteratorNext();
230	}
231	return mCurrent;
232	}
233
234	ssize_t WordBreaker::wordStart() const {
235	if (mInEmailOrUrl) {
236	return mLast;
237	}
238	ssize_t result = mLast;
239	while (result < mCurrent) {
240	UChar32 c;
241	ssize_t ix = result;
242	U16_NEXT(mText, ix, mCurrent, c);
243	const int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
244	// strip leading punctuation, defined as OP and QU line breaking classes,
245	// see UAX #14
246	if (!(lb == U_LB_OPEN_PUNCTUATION \|\| lb == U_LB_QUOTATION)) {
247	break;
248	}
249	result = ix;
250	}
251	return result;
252	}
253
254	ssize_t WordBreaker::wordEnd() const {
255	if (mInEmailOrUrl) {
256	return mLast;
257	}
258	ssize_t result = mCurrent;
259	while (result > mLast) {
260	UChar32 c;
261	ssize_t ix = result;
262	U16_PREV(mText, mLast, ix, c);
263	const int32_t gc_mask = U_GET_GC_MASK(c);
264	// strip trailing space and punctuation
265	if ((gc_mask & (U_GC_ZS_MASK \| U_GC_P_MASK)) == `0`) {
266	break;
267	}
268	result = ix;
269	}
270	return result;
271	}
272
273	int WordBreaker::breakBadness() const {
274	return (mInEmailOrUrl && mCurrent < mScanOffset) ? `1` : `0`;
275	}
276
277	void WordBreaker::finish() {
278	mText = nullptr;
279	// Note: calling utext_close multiply is safe
280	utext_close(&mUText);
281	}
282
283	} // namespace minikin
284

Browse the source code of engine/flutter/third_party/txt/src/minikin/WordBreaker.cpp