1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#define LOG_TAG "Minikin"
18
19#include <log/log.h>
20
21#include <minikin/Emoji.h>
22#include <minikin/Hyphenator.h>
23#include <minikin/WordBreaker.h>
24#include "MinikinInternal.h"
25
26#include <unicode/uchar.h>
27#include <unicode/utf16.h>
28
29namespace minikin {
30
// U+00AD SOFT HYPHEN: treated like a line-breaking hyphen by isBreakValid().
constexpr uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
// U+200D ZERO WIDTH JOINER: glues emoji ZWJ sequences together (rule LB8).
constexpr uint32_t CHAR_ZWJ = 0x200D;
33
34void WordBreaker::setLocale(const icu::Locale& locale) {
35 UErrorCode status = U_ZERO_ERROR;
36 mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status));
37 // TODO: handle failure status
38 if (mText != nullptr) {
39 mBreakIterator->setText(&mUText, status);
40 }
41 mIteratorWasReset = true;
42}
43
44void WordBreaker::setText(const uint16_t* data, size_t size) {
45 mText = data;
46 mTextSize = size;
47 mIteratorWasReset = false;
48 mLast = 0;
49 mCurrent = 0;
50 mScanOffset = 0;
51 mInEmailOrUrl = false;
52 UErrorCode status = U_ZERO_ERROR;
53 utext_openUChars(&mUText, reinterpret_cast<const UChar*>(data), size,
54 &status);
55 mBreakIterator->setText(&mUText, status);
56 mBreakIterator->first();
57}
58
59ssize_t WordBreaker::current() const {
60 return mCurrent;
61}
62
63/**
64 * Determine whether a line break at position i within the buffer buf is valid.
65 *This represents customization beyond the ICU behavior, because plain ICU
66 *provides some line break opportunities that we don't want.
67 **/
68static bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) {
69 uint32_t codePoint;
70 size_t prev_offset = i;
71 U16_PREV(buf, 0, prev_offset, codePoint);
72 // Do not break on hard or soft hyphens. These are handled by automatic
73 // hyphenation.
74 if (Hyphenator::isLineBreakingHyphen(codePoint) ||
75 codePoint == CHAR_SOFT_HYPHEN) {
76 // txt addition: Temporarily always break on hyphen. Changed from false to
77 // true.
78 return true;
79 }
80 // For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA,
81 // consonant>. This is to go around a bug in ICU line breaking:
82 // http://bugs.icu-project.org/trac/ticket/12561. To avoid too much looking
83 // around in the strings, we simply avoid breaking after any Myanmar virama,
84 // where no line break could be imagined, since the Myanmar virama is a pure
85 // stacker.
86 if (codePoint == 0x1039) { // MYANMAR SIGN VIRAMA
87 return false;
88 }
89
90 uint32_t next_codepoint;
91 size_t next_offset = i;
92 U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
93
94 // Rule LB8 for Emoji ZWJ sequences. We need to do this ourselves since we may
95 // have fresher emoji data than ICU does.
96 if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) {
97 return false;
98 }
99
100 // Rule LB30b. We need to this ourselves since we may have fresher emoji data
101 // than ICU does.
102 if (isEmojiModifier(next_codepoint)) {
103 if (codePoint == 0xFE0F && prev_offset > 0) {
104 // skip over emoji variation selector
105 U16_PREV(buf, 0, prev_offset, codePoint);
106 }
107 if (isEmojiBase(codePoint)) {
108 return false;
109 }
110 }
111 return true;
112}
113
114// Customized iteratorNext that takes care of both resets and our modifications
115// to ICU's behavior.
116int32_t WordBreaker::iteratorNext() {
117 int32_t result;
118 do {
119 if (mIteratorWasReset) {
120 result = mBreakIterator->following(mCurrent);
121 mIteratorWasReset = false;
122 } else {
123 result = mBreakIterator->next();
124 }
125 } while (!(result == icu::BreakIterator::DONE ||
126 (size_t)result == mTextSize ||
127 isBreakValid(mText, mTextSize, result)));
128 return result;
129}
130
// Returns true for the characters after which the Chicago Manual of Style
// recommends breaking inside URLs and email addresses: ':', '=' and '&'.
static bool breakAfter(uint16_t c) {
  switch (c) {
    case ':':
    case '=':
    case '&':
      return true;
    default:
      return false;
  }
}
136
// Returns true for the characters before which the Chicago Manual of Style
// recommends breaking inside URLs and email addresses.
static bool breakBefore(uint16_t c) {
  switch (c) {
    case '~':
    case '.':
    case ',':
    case '-':
    case '_':
    case '?':
    case '#':
    case '%':
    case '=':
    case '&':
      return true;
    default:
      return false;
  }
}
143
// States of the scanner in WordBreaker::detectEmailOrUrl().
//
// The three SAW_COLON* values must stay consecutive and in this order: the
// scanner moves from one to the next by adding 1 for each '/' it sees.
enum ScanState {
  // Neither an '@' nor a pending ':' has been seen.
  START = 0,
  // An '@' was seen; the scanned run is treated as an email address.
  SAW_AT = 1,
  // A ':' was just seen.
  SAW_COLON = 2,
  // ":/" was just seen.
  SAW_COLON_SLASH = 3,
  // "://" was seen; the scanned run is treated as a URL.
  SAW_COLON_SLASH_SLASH = 4,
};
151
152void WordBreaker::detectEmailOrUrl() {
153 // scan forward from current ICU position for email address or URL
154 if (mLast >= mScanOffset) {
155 ScanState state = START;
156 size_t i;
157 for (i = mLast; i < mTextSize; i++) {
158 uint16_t c = mText[i];
159 // scan only ASCII characters, stop at space
160 if (!(' ' < c && c <= 0x007E)) {
161 break;
162 }
163 if (state == START && c == '@') {
164 state = SAW_AT;
165 } else if (state == START && c == ':') {
166 state = SAW_COLON;
167 } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
168 if (c == '/') {
169 state = static_cast<ScanState>((int)state +
170 1); // next state adds a slash
171 } else {
172 state = START;
173 }
174 }
175 }
176 if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
177 if (!mBreakIterator->isBoundary(i)) {
178 // If there are combining marks or such at the end of the URL or the
179 // email address, consider them a part of the URL or the email, and skip
180 // to the next actual boundary.
181 i = mBreakIterator->following(i);
182 }
183 mInEmailOrUrl = true;
184 mIteratorWasReset = true;
185 } else {
186 mInEmailOrUrl = false;
187 }
188 mScanOffset = i;
189 }
190}
191
192ssize_t WordBreaker::findNextBreakInEmailOrUrl() {
193 // special rules for email addresses and URL's as per Chicago Manual of Style
194 // (16th ed.)
195 uint16_t lastChar = mText[mLast];
196 ssize_t i;
197 for (i = mLast + 1; i < mScanOffset; i++) {
198 if (breakAfter(lastChar)) {
199 break;
200 }
201 // break after double slash
202 if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
203 break;
204 }
205 const uint16_t thisChar = mText[i];
206 // never break after hyphen
207 if (lastChar != '-') {
208 if (breakBefore(thisChar)) {
209 break;
210 }
211 // break before single slash
212 if (thisChar == '/' && lastChar != '/' &&
213 !(i + 1 < mScanOffset && mText[i + 1] == '/')) {
214 break;
215 }
216 }
217 lastChar = thisChar;
218 }
219 return i;
220}
221
222ssize_t WordBreaker::next() {
223 mLast = mCurrent;
224
225 detectEmailOrUrl();
226 if (mInEmailOrUrl) {
227 mCurrent = findNextBreakInEmailOrUrl();
228 } else { // Business as usual
229 mCurrent = (ssize_t)iteratorNext();
230 }
231 return mCurrent;
232}
233
234ssize_t WordBreaker::wordStart() const {
235 if (mInEmailOrUrl) {
236 return mLast;
237 }
238 ssize_t result = mLast;
239 while (result < mCurrent) {
240 UChar32 c;
241 ssize_t ix = result;
242 U16_NEXT(mText, ix, mCurrent, c);
243 const int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
244 // strip leading punctuation, defined as OP and QU line breaking classes,
245 // see UAX #14
246 if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
247 break;
248 }
249 result = ix;
250 }
251 return result;
252}
253
254ssize_t WordBreaker::wordEnd() const {
255 if (mInEmailOrUrl) {
256 return mLast;
257 }
258 ssize_t result = mCurrent;
259 while (result > mLast) {
260 UChar32 c;
261 ssize_t ix = result;
262 U16_PREV(mText, mLast, ix, c);
263 const int32_t gc_mask = U_GET_GC_MASK(c);
264 // strip trailing space and punctuation
265 if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) {
266 break;
267 }
268 result = ix;
269 }
270 return result;
271}
272
273int WordBreaker::breakBadness() const {
274 return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0;
275}
276
277void WordBreaker::finish() {
278 mText = nullptr;
279 // Note: calling utext_close multiply is safe
280 utext_close(&mUText);
281}
282
283} // namespace minikin
284