1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <unicode/uchar.h>
18#include <unicode/uscript.h>
19#include <algorithm>
20#include <memory>
21#include <string>
22#include <vector>
23
24// HACK: for reading pattern file
25#include <fcntl.h>
26
27#define LOG_TAG "Minikin"
28
29#include "minikin/Hyphenator.h"
30#include "utils/WindowsUtils.h"
31
32using std::vector;
33
34namespace minikin {
35
36static const uint16_t CHAR_HYPHEN_MINUS = 0x002D;
37static const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
38static const uint16_t CHAR_MIDDLE_DOT = 0x00B7;
39static const uint16_t CHAR_HYPHEN = 0x2010;
40
41// The following are structs that correspond to tables inside the hyb file
42// format
43
44struct AlphabetTable0 {
45 uint32_t version;
46 uint32_t min_codepoint;
47 uint32_t max_codepoint;
48 uint8_t data[1]; // actually flexible array, size is known at runtime
49};
50
51struct AlphabetTable1 {
52 uint32_t version;
53 uint32_t n_entries;
54 uint32_t data[1]; // actually flexible array, size is known at runtime
55
56 static uint32_t codepoint(uint32_t entry) { return entry >> 11; }
57 static uint32_t value(uint32_t entry) { return entry & 0x7ff; }
58};
59
60struct Trie {
61 uint32_t version;
62 uint32_t char_mask;
63 uint32_t link_shift;
64 uint32_t link_mask;
65 uint32_t pattern_shift;
66 uint32_t n_entries;
67 uint32_t data[1]; // actually flexible array, size is known at runtime
68};
69
70struct Pattern {
71 uint32_t version;
72 uint32_t n_entries;
73 uint32_t pattern_offset;
74 uint32_t pattern_size;
75 uint32_t data[1]; // actually flexible array, size is known at runtime
76
77 // accessors
78 static uint32_t len(uint32_t entry) { return entry >> 26; }
79 static uint32_t shift(uint32_t entry) { return (entry >> 20) & 0x3f; }
80 const uint8_t* buf(uint32_t entry) const {
81 return reinterpret_cast<const uint8_t*>(this) + pattern_offset +
82 (entry & 0xfffff);
83 }
84};
85
86struct Header {
87 uint32_t magic;
88 uint32_t version;
89 uint32_t alphabet_offset;
90 uint32_t trie_offset;
91 uint32_t pattern_offset;
92 uint32_t file_size;
93
94 // accessors
95 const uint8_t* bytes() const {
96 return reinterpret_cast<const uint8_t*>(this);
97 }
98 uint32_t alphabetVersion() const {
99 return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset);
100 }
101 const AlphabetTable0* alphabetTable0() const {
102 return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset);
103 }
104 const AlphabetTable1* alphabetTable1() const {
105 return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset);
106 }
107 const Trie* trieTable() const {
108 return reinterpret_cast<const Trie*>(bytes() + trie_offset);
109 }
110 const Pattern* patternTable() const {
111 return reinterpret_cast<const Pattern*>(bytes() + pattern_offset);
112 }
113};
114
115Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData,
116 size_t minPrefix,
117 size_t minSuffix) {
118 Hyphenator* result = new Hyphenator;
119 result->patternData = patternData;
120 result->minPrefix = minPrefix;
121 result->minSuffix = minSuffix;
122 return result;
123}
124
125void Hyphenator::hyphenate(vector<HyphenationType>* result,
126 const uint16_t* word,
127 size_t len,
128 const icu::Locale& locale) {
129 result->clear();
130 result->resize(len);
131 const size_t paddedLen = len + 2; // start and stop code each count for 1
132 if (patternData != nullptr && len >= minPrefix + minSuffix &&
133 paddedLen <= MAX_HYPHENATED_SIZE) {
134 uint16_t alpha_codes[MAX_HYPHENATED_SIZE];
135 const HyphenationType hyphenValue = alphabetLookup(alpha_codes, word, len);
136 if (hyphenValue != HyphenationType::DONT_BREAK) {
137 hyphenateFromCodes(result->data(), alpha_codes, paddedLen, hyphenValue);
138 return;
139 }
140 // TODO: try NFC normalization
141 // TODO: handle non-BMP Unicode (requires remapping of offsets)
142 }
143 // Note that we will always get here if the word contains a hyphen or a soft
144 // hyphen, because the alphabet is not expected to contain a hyphen or a soft
145 // hyphen character, so alphabetLookup would return DONT_BREAK.
146 hyphenateWithNoPatterns(result->data(), word, len, locale);
147}
148
149// This function determines whether a character is like U+2010 HYPHEN in
150// line breaking and usage: a character immediately after which line breaks
151// are allowed, but words containing it should not be automatically
152// hyphenated using patterns. This is a curated set, created by manually
153// inspecting all the characters that have the Unicode line breaking
154// property of BA or HY and seeing which ones are hyphens.
155bool Hyphenator::isLineBreakingHyphen(uint32_t c) {
156 return (c == 0x002D || // HYPHEN-MINUS
157 c == 0x058A || // ARMENIAN HYPHEN
158 c == 0x05BE || // HEBREW PUNCTUATION MAQAF
159 c == 0x1400 || // CANADIAN SYLLABICS HYPHEN
160 c == 0x2010 || // HYPHEN
161 c == 0x2013 || // EN DASH
162 c == 0x2027 || // HYPHENATION POINT
163 c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN
164 c == 0x2E40); // DOUBLE HYPHEN
165}
166
167const static uint32_t HYPHEN_STR[] = {0x2010, 0};
168const static uint32_t ARMENIAN_HYPHEN_STR[] = {0x058A, 0};
169const static uint32_t MAQAF_STR[] = {0x05BE, 0};
170const static uint32_t UCAS_HYPHEN_STR[] = {0x1400, 0};
171const static uint32_t ZWJ_STR[] = {0x200D, 0};
172const static uint32_t ZWJ_AND_HYPHEN_STR[] = {0x200D, 0x2010, 0};
173
174const uint32_t* HyphenEdit::getHyphenString(uint32_t hyph) {
175 switch (hyph) {
176 case INSERT_HYPHEN_AT_END:
177 case REPLACE_WITH_HYPHEN_AT_END:
178 case INSERT_HYPHEN_AT_START:
179 return HYPHEN_STR;
180 case INSERT_ARMENIAN_HYPHEN_AT_END:
181 return ARMENIAN_HYPHEN_STR;
182 case INSERT_MAQAF_AT_END:
183 return MAQAF_STR;
184 case INSERT_UCAS_HYPHEN_AT_END:
185 return UCAS_HYPHEN_STR;
186 case INSERT_ZWJ_AND_HYPHEN_AT_END:
187 return ZWJ_AND_HYPHEN_STR;
188 case INSERT_ZWJ_AT_START:
189 return ZWJ_STR;
190 default:
191 return nullptr;
192 }
193}
194
195uint32_t HyphenEdit::editForThisLine(HyphenationType type) {
196 switch (type) {
197 case HyphenationType::DONT_BREAK:
198 return NO_EDIT;
199 case HyphenationType::BREAK_AND_INSERT_HYPHEN:
200 return INSERT_HYPHEN_AT_END;
201 case HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN:
202 return INSERT_ARMENIAN_HYPHEN_AT_END;
203 case HyphenationType::BREAK_AND_INSERT_MAQAF:
204 return INSERT_MAQAF_AT_END;
205 case HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN:
206 return INSERT_UCAS_HYPHEN_AT_END;
207 case HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN:
208 return REPLACE_WITH_HYPHEN_AT_END;
209 case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
210 return INSERT_ZWJ_AND_HYPHEN_AT_END;
211 default:
212 return BREAK_AT_END;
213 }
214}
215
216uint32_t HyphenEdit::editForNextLine(HyphenationType type) {
217 switch (type) {
218 case HyphenationType::DONT_BREAK:
219 return NO_EDIT;
220 case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE:
221 return INSERT_HYPHEN_AT_START;
222 case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
223 return INSERT_ZWJ_AT_START;
224 default:
225 return BREAK_AT_START;
226 }
227}
228
229static UScriptCode getScript(uint32_t codePoint) {
230 UErrorCode errorCode = U_ZERO_ERROR;
231 const UScriptCode script =
232 uscript_getScript(static_cast<UChar32>(codePoint), &errorCode);
233 if (U_SUCCESS(errorCode)) {
234 return script;
235 } else {
236 return USCRIPT_INVALID_CODE;
237 }
238}
239
240static HyphenationType hyphenationTypeBasedOnScript(uint32_t codePoint) {
241 // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the
242 // "correct" hyphen for Hebrew, modern practice may have shifted towards
243 // Western hyphens. We use normal hyphens for now to be safe.
244 // BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch to
245 // maqaf for Hebrew, we can simply add a condition here.
246 const UScriptCode script = getScript(codePoint);
247 if (script == USCRIPT_KANNADA || script == USCRIPT_MALAYALAM ||
248 script == USCRIPT_TAMIL || script == USCRIPT_TELUGU) {
249 // Grantha is not included, since we don't support non-BMP hyphenation yet.
250 return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
251 } else if (script == USCRIPT_ARMENIAN) {
252 return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN;
253 } else if (script == USCRIPT_CANADIAN_ABORIGINAL) {
254 return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN;
255 } else {
256 return HyphenationType::BREAK_AND_INSERT_HYPHEN;
257 }
258}
259
260static inline int32_t getJoiningType(UChar32 codepoint) {
261 return u_getIntPropertyValue(codepoint, UCHAR_JOINING_TYPE);
262}
263
264// Assumption for caller: location must be >= 2 and word[location] ==
265// CHAR_SOFT_HYPHEN. This function decides if the letters before and after the
266// hyphen should appear as joining.
267static inline HyphenationType getHyphTypeForArabic(const uint16_t* word,
268 size_t len,
269 size_t location) {
270 ssize_t i = location;
271 int32_t type = U_JT_NON_JOINING;
272 while (static_cast<size_t>(i) < len &&
273 (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) {
274 i++;
275 }
276 if (type == U_JT_DUAL_JOINING || type == U_JT_RIGHT_JOINING ||
277 type == U_JT_JOIN_CAUSING) {
278 // The next character is of the type that may join the last character. See
279 // if the last character is also of the right type.
280 i = location - 2; // Skip the soft hyphen
281 type = U_JT_NON_JOINING;
282 while (i >= 0 && (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) {
283 i--;
284 }
285 if (type == U_JT_DUAL_JOINING || type == U_JT_LEFT_JOINING ||
286 type == U_JT_JOIN_CAUSING) {
287 return HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ;
288 }
289 }
290 return HyphenationType::BREAK_AND_INSERT_HYPHEN;
291}
292
293// Use various recommendations of UAX #14 Unicode Line Breaking Algorithm for
294// hyphenating words that didn't match patterns, especially words that contain
295// hyphens or soft hyphens (See sections 5.3, Use of Hyphen, and 5.4, Use of
296// Soft Hyphen).
297void Hyphenator::hyphenateWithNoPatterns(HyphenationType* result,
298 const uint16_t* word,
299 size_t len,
300 const icu::Locale& locale) {
301 result[0] = HyphenationType::DONT_BREAK;
302 for (size_t i = 1; i < len; i++) {
303 const uint16_t prevChar = word[i - 1];
304 if (i > 1 && isLineBreakingHyphen(prevChar)) {
305 // Break after hyphens, but only if they don't start the word.
306
307 if ((prevChar == CHAR_HYPHEN_MINUS || prevChar == CHAR_HYPHEN) &&
308 strcmp(locale.getLanguage(), "pl") == 0 &&
309 getScript(word[i]) == USCRIPT_LATIN) {
310 // In Polish, hyphens get repeated at the next line. To be safe,
311 // we will do this only if the next character is Latin.
312 result[i] = HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE;
313 } else {
314 result[i] = HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
315 }
316 } else if (i > 1 && prevChar == CHAR_SOFT_HYPHEN) {
317 // Break after soft hyphens, but only if they don't start the word (a soft
318 // hyphen starting the word doesn't give any useful break opportunities).
319 // The type of the break is based on the script of the character we break
320 // on.
321 if (getScript(word[i]) == USCRIPT_ARABIC) {
322 // For Arabic, we need to look and see if the characters around the soft
323 // hyphen actually join. If they don't, we'll just insert a normal
324 // hyphen.
325 result[i] = getHyphTypeForArabic(word, len, i);
326 } else {
327 result[i] = hyphenationTypeBasedOnScript(word[i]);
328 }
329 } else if (prevChar == CHAR_MIDDLE_DOT && minPrefix < i &&
330 i <= len - minSuffix &&
331 ((word[i - 2] == 'l' && word[i] == 'l') ||
332 (word[i - 2] == 'L' && word[i] == 'L')) &&
333 strcmp(locale.getLanguage(), "ca") == 0) {
334 // In Catalan, "l·l" should break as "l-" on the first line
335 // and "l" on the next line.
336 result[i] = HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN;
337 } else {
338 result[i] = HyphenationType::DONT_BREAK;
339 }
340 }
341}
342
343HyphenationType Hyphenator::alphabetLookup(uint16_t* alpha_codes,
344 const uint16_t* word,
345 size_t len) {
346 const Header* header = getHeader();
347 HyphenationType result = HyphenationType::BREAK_AND_INSERT_HYPHEN;
348 // TODO: check header magic
349 uint32_t alphabetVersion = header->alphabetVersion();
350 if (alphabetVersion == 0) {
351 const AlphabetTable0* alphabet = header->alphabetTable0();
352 uint32_t min_codepoint = alphabet->min_codepoint;
353 uint32_t max_codepoint = alphabet->max_codepoint;
354 alpha_codes[0] = 0; // word start
355 for (size_t i = 0; i < len; i++) {
356 uint16_t c = word[i];
357 if (c < min_codepoint || c >= max_codepoint) {
358 return HyphenationType::DONT_BREAK;
359 }
360 uint8_t code = alphabet->data[c - min_codepoint];
361 if (code == 0) {
362 return HyphenationType::DONT_BREAK;
363 }
364 if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
365 result = hyphenationTypeBasedOnScript(c);
366 }
367 alpha_codes[i + 1] = code;
368 }
369 alpha_codes[len + 1] = 0; // word termination
370 return result;
371 } else if (alphabetVersion == 1) {
372 const AlphabetTable1* alphabet = header->alphabetTable1();
373 size_t n_entries = alphabet->n_entries;
374 const uint32_t* begin = alphabet->data;
375 const uint32_t* end = begin + n_entries;
376 alpha_codes[0] = 0;
377 for (size_t i = 0; i < len; i++) {
378 uint16_t c = word[i];
379 auto p = std::lower_bound<const uint32_t*, uint32_t>(begin, end, c << 11);
380 if (p == end) {
381 return HyphenationType::DONT_BREAK;
382 }
383 uint32_t entry = *p;
384 if (AlphabetTable1::codepoint(entry) != c) {
385 return HyphenationType::DONT_BREAK;
386 }
387 if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
388 result = hyphenationTypeBasedOnScript(c);
389 }
390 alpha_codes[i + 1] = AlphabetTable1::value(entry);
391 }
392 alpha_codes[len + 1] = 0;
393 return result;
394 }
395 return HyphenationType::DONT_BREAK;
396}
397
398/**
399 * Internal implementation, after conversion to codes. All case folding and
400 *normalization has been done by now, and all characters have been found in the
401 *alphabet. Note: len here is the padded length including 0 codes at start and
402 *end.
403 **/
404void Hyphenator::hyphenateFromCodes(HyphenationType* result,
405 const uint16_t* codes,
406 size_t len,
407 HyphenationType hyphenValue) {
408 static_assert(sizeof(HyphenationType) == sizeof(uint8_t),
409 "HyphnationType must be uint8_t.");
410 // Reuse the result array as a buffer for calculating intermediate hyphenation
411 // numbers.
412 uint8_t* buffer = reinterpret_cast<uint8_t*>(result);
413
414 const Header* header = getHeader();
415 const Trie* trie = header->trieTable();
416 const Pattern* pattern = header->patternTable();
417 uint32_t char_mask = trie->char_mask;
418 uint32_t link_shift = trie->link_shift;
419 uint32_t link_mask = trie->link_mask;
420 uint32_t pattern_shift = trie->pattern_shift;
421 size_t maxOffset = len - minSuffix - 1;
422 for (size_t i = 0; i < len - 1; i++) {
423 uint32_t node = 0; // index into Trie table
424 for (size_t j = i; j < len; j++) {
425 uint16_t c = codes[j];
426 uint32_t entry = trie->data[node + c];
427 if ((entry & char_mask) == c) {
428 node = (entry & link_mask) >> link_shift;
429 } else {
430 break;
431 }
432 uint32_t pat_ix = trie->data[node] >> pattern_shift;
433 // pat_ix contains a 3-tuple of length, shift (number of trailing zeros),
434 // and an offset into the buf pool. This is the pattern for the substring
435 // (i..j) we just matched, which we combine (via point-wise max) into the
436 // buffer vector.
437 if (pat_ix != 0) {
438 uint32_t pat_entry = pattern->data[pat_ix];
439 int pat_len = Pattern::len(pat_entry);
440 int pat_shift = Pattern::shift(pat_entry);
441 const uint8_t* pat_buf = pattern->buf(pat_entry);
442 int offset = j + 1 - (pat_len + pat_shift);
443 // offset is the index within buffer that lines up with the start of
444 // pat_buf
445 int start = std::max((int)minPrefix - offset, 0);
446 int end = std::min(pat_len, (int)maxOffset - offset);
447 for (int k = start; k < end; k++) {
448 buffer[offset + k] = std::max(buffer[offset + k], pat_buf[k]);
449 }
450 }
451 }
452 }
453 // Since the above calculation does not modify values outside
454 // [minPrefix, len - minSuffix], they are left as 0 = DONT_BREAK.
455 for (size_t i = minPrefix; i < maxOffset; i++) {
456 // Hyphenation opportunities happen when the hyphenation numbers are odd.
457 result[i] = (buffer[i] & 1u) ? hyphenValue : HyphenationType::DONT_BREAK;
458 }
459}
460
461} // namespace minikin
462