1 | /* |
2 | * Copyright (C) 2015 The Android Open Source Project |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | #include <unicode/uchar.h> |
18 | #include <unicode/uscript.h> |
19 | #include <algorithm> |
20 | #include <memory> |
21 | #include <string> |
22 | #include <vector> |
23 | |
24 | // HACK: for reading pattern file |
25 | #include <fcntl.h> |
26 | |
27 | #define LOG_TAG "Minikin" |
28 | |
29 | #include "minikin/Hyphenator.h" |
30 | #include "utils/WindowsUtils.h" |
31 | |
32 | using std::vector; |
33 | |
34 | namespace minikin { |
35 | |
36 | static const uint16_t CHAR_HYPHEN_MINUS = 0x002D; |
37 | static const uint16_t CHAR_SOFT_HYPHEN = 0x00AD; |
38 | static const uint16_t CHAR_MIDDLE_DOT = 0x00B7; |
39 | static const uint16_t CHAR_HYPHEN = 0x2010; |
40 | |
41 | // The following are structs that correspond to tables inside the hyb file |
42 | // format |
43 | |
44 | struct AlphabetTable0 { |
45 | uint32_t version; |
46 | uint32_t min_codepoint; |
47 | uint32_t max_codepoint; |
48 | uint8_t data[1]; // actually flexible array, size is known at runtime |
49 | }; |
50 | |
51 | struct AlphabetTable1 { |
52 | uint32_t version; |
53 | uint32_t n_entries; |
54 | uint32_t data[1]; // actually flexible array, size is known at runtime |
55 | |
56 | static uint32_t codepoint(uint32_t entry) { return entry >> 11; } |
57 | static uint32_t value(uint32_t entry) { return entry & 0x7ff; } |
58 | }; |
59 | |
60 | struct Trie { |
61 | uint32_t version; |
62 | uint32_t char_mask; |
63 | uint32_t link_shift; |
64 | uint32_t link_mask; |
65 | uint32_t pattern_shift; |
66 | uint32_t n_entries; |
67 | uint32_t data[1]; // actually flexible array, size is known at runtime |
68 | }; |
69 | |
70 | struct Pattern { |
71 | uint32_t version; |
72 | uint32_t n_entries; |
73 | uint32_t pattern_offset; |
74 | uint32_t pattern_size; |
75 | uint32_t data[1]; // actually flexible array, size is known at runtime |
76 | |
77 | // accessors |
78 | static uint32_t len(uint32_t entry) { return entry >> 26; } |
79 | static uint32_t shift(uint32_t entry) { return (entry >> 20) & 0x3f; } |
80 | const uint8_t* buf(uint32_t entry) const { |
81 | return reinterpret_cast<const uint8_t*>(this) + pattern_offset + |
82 | (entry & 0xfffff); |
83 | } |
84 | }; |
85 | |
86 | struct { |
87 | uint32_t ; |
88 | uint32_t ; |
89 | uint32_t ; |
90 | uint32_t ; |
91 | uint32_t ; |
92 | uint32_t ; |
93 | |
94 | // accessors |
95 | const uint8_t* () const { |
96 | return reinterpret_cast<const uint8_t*>(this); |
97 | } |
98 | uint32_t () const { |
99 | return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset); |
100 | } |
101 | const AlphabetTable0* () const { |
102 | return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset); |
103 | } |
104 | const AlphabetTable1* () const { |
105 | return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset); |
106 | } |
107 | const Trie* () const { |
108 | return reinterpret_cast<const Trie*>(bytes() + trie_offset); |
109 | } |
110 | const Pattern* () const { |
111 | return reinterpret_cast<const Pattern*>(bytes() + pattern_offset); |
112 | } |
113 | }; |
114 | |
115 | Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData, |
116 | size_t minPrefix, |
117 | size_t minSuffix) { |
118 | Hyphenator* result = new Hyphenator; |
119 | result->patternData = patternData; |
120 | result->minPrefix = minPrefix; |
121 | result->minSuffix = minSuffix; |
122 | return result; |
123 | } |
124 | |
125 | void Hyphenator::hyphenate(vector<HyphenationType>* result, |
126 | const uint16_t* word, |
127 | size_t len, |
128 | const icu::Locale& locale) { |
129 | result->clear(); |
130 | result->resize(len); |
131 | const size_t paddedLen = len + 2; // start and stop code each count for 1 |
132 | if (patternData != nullptr && len >= minPrefix + minSuffix && |
133 | paddedLen <= MAX_HYPHENATED_SIZE) { |
134 | uint16_t alpha_codes[MAX_HYPHENATED_SIZE]; |
135 | const HyphenationType hyphenValue = alphabetLookup(alpha_codes, word, len); |
136 | if (hyphenValue != HyphenationType::DONT_BREAK) { |
137 | hyphenateFromCodes(result->data(), alpha_codes, paddedLen, hyphenValue); |
138 | return; |
139 | } |
140 | // TODO: try NFC normalization |
141 | // TODO: handle non-BMP Unicode (requires remapping of offsets) |
142 | } |
143 | // Note that we will always get here if the word contains a hyphen or a soft |
144 | // hyphen, because the alphabet is not expected to contain a hyphen or a soft |
145 | // hyphen character, so alphabetLookup would return DONT_BREAK. |
146 | hyphenateWithNoPatterns(result->data(), word, len, locale); |
147 | } |
148 | |
149 | // This function determines whether a character is like U+2010 HYPHEN in |
150 | // line breaking and usage: a character immediately after which line breaks |
151 | // are allowed, but words containing it should not be automatically |
152 | // hyphenated using patterns. This is a curated set, created by manually |
153 | // inspecting all the characters that have the Unicode line breaking |
154 | // property of BA or HY and seeing which ones are hyphens. |
155 | bool Hyphenator::isLineBreakingHyphen(uint32_t c) { |
156 | return (c == 0x002D || // HYPHEN-MINUS |
157 | c == 0x058A || // ARMENIAN HYPHEN |
158 | c == 0x05BE || // HEBREW PUNCTUATION MAQAF |
159 | c == 0x1400 || // CANADIAN SYLLABICS HYPHEN |
160 | c == 0x2010 || // HYPHEN |
161 | c == 0x2013 || // EN DASH |
162 | c == 0x2027 || // HYPHENATION POINT |
163 | c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN |
164 | c == 0x2E40); // DOUBLE HYPHEN |
165 | } |
166 | |
167 | const static uint32_t HYPHEN_STR[] = {0x2010, 0}; |
168 | const static uint32_t ARMENIAN_HYPHEN_STR[] = {0x058A, 0}; |
169 | const static uint32_t MAQAF_STR[] = {0x05BE, 0}; |
170 | const static uint32_t UCAS_HYPHEN_STR[] = {0x1400, 0}; |
171 | const static uint32_t ZWJ_STR[] = {0x200D, 0}; |
172 | const static uint32_t ZWJ_AND_HYPHEN_STR[] = {0x200D, 0x2010, 0}; |
173 | |
174 | const uint32_t* HyphenEdit::getHyphenString(uint32_t hyph) { |
175 | switch (hyph) { |
176 | case INSERT_HYPHEN_AT_END: |
177 | case REPLACE_WITH_HYPHEN_AT_END: |
178 | case INSERT_HYPHEN_AT_START: |
179 | return HYPHEN_STR; |
180 | case INSERT_ARMENIAN_HYPHEN_AT_END: |
181 | return ARMENIAN_HYPHEN_STR; |
182 | case INSERT_MAQAF_AT_END: |
183 | return MAQAF_STR; |
184 | case INSERT_UCAS_HYPHEN_AT_END: |
185 | return UCAS_HYPHEN_STR; |
186 | case INSERT_ZWJ_AND_HYPHEN_AT_END: |
187 | return ZWJ_AND_HYPHEN_STR; |
188 | case INSERT_ZWJ_AT_START: |
189 | return ZWJ_STR; |
190 | default: |
191 | return nullptr; |
192 | } |
193 | } |
194 | |
195 | uint32_t HyphenEdit::editForThisLine(HyphenationType type) { |
196 | switch (type) { |
197 | case HyphenationType::DONT_BREAK: |
198 | return NO_EDIT; |
199 | case HyphenationType::BREAK_AND_INSERT_HYPHEN: |
200 | return INSERT_HYPHEN_AT_END; |
201 | case HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN: |
202 | return INSERT_ARMENIAN_HYPHEN_AT_END; |
203 | case HyphenationType::BREAK_AND_INSERT_MAQAF: |
204 | return INSERT_MAQAF_AT_END; |
205 | case HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN: |
206 | return INSERT_UCAS_HYPHEN_AT_END; |
207 | case HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN: |
208 | return REPLACE_WITH_HYPHEN_AT_END; |
209 | case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ: |
210 | return INSERT_ZWJ_AND_HYPHEN_AT_END; |
211 | default: |
212 | return BREAK_AT_END; |
213 | } |
214 | } |
215 | |
216 | uint32_t HyphenEdit::editForNextLine(HyphenationType type) { |
217 | switch (type) { |
218 | case HyphenationType::DONT_BREAK: |
219 | return NO_EDIT; |
220 | case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE: |
221 | return INSERT_HYPHEN_AT_START; |
222 | case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ: |
223 | return INSERT_ZWJ_AT_START; |
224 | default: |
225 | return BREAK_AT_START; |
226 | } |
227 | } |
228 | |
229 | static UScriptCode getScript(uint32_t codePoint) { |
230 | UErrorCode errorCode = U_ZERO_ERROR; |
231 | const UScriptCode script = |
232 | uscript_getScript(static_cast<UChar32>(codePoint), &errorCode); |
233 | if (U_SUCCESS(errorCode)) { |
234 | return script; |
235 | } else { |
236 | return USCRIPT_INVALID_CODE; |
237 | } |
238 | } |
239 | |
240 | static HyphenationType hyphenationTypeBasedOnScript(uint32_t codePoint) { |
241 | // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the |
242 | // "correct" hyphen for Hebrew, modern practice may have shifted towards |
243 | // Western hyphens. We use normal hyphens for now to be safe. |
244 | // BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch to |
245 | // maqaf for Hebrew, we can simply add a condition here. |
246 | const UScriptCode script = getScript(codePoint); |
247 | if (script == USCRIPT_KANNADA || script == USCRIPT_MALAYALAM || |
248 | script == USCRIPT_TAMIL || script == USCRIPT_TELUGU) { |
249 | // Grantha is not included, since we don't support non-BMP hyphenation yet. |
250 | return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN; |
251 | } else if (script == USCRIPT_ARMENIAN) { |
252 | return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN; |
253 | } else if (script == USCRIPT_CANADIAN_ABORIGINAL) { |
254 | return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN; |
255 | } else { |
256 | return HyphenationType::BREAK_AND_INSERT_HYPHEN; |
257 | } |
258 | } |
259 | |
260 | static inline int32_t getJoiningType(UChar32 codepoint) { |
261 | return u_getIntPropertyValue(codepoint, UCHAR_JOINING_TYPE); |
262 | } |
263 | |
264 | // Assumption for caller: location must be >= 2 and word[location] == |
265 | // CHAR_SOFT_HYPHEN. This function decides if the letters before and after the |
266 | // hyphen should appear as joining. |
267 | static inline HyphenationType getHyphTypeForArabic(const uint16_t* word, |
268 | size_t len, |
269 | size_t location) { |
270 | ssize_t i = location; |
271 | int32_t type = U_JT_NON_JOINING; |
272 | while (static_cast<size_t>(i) < len && |
273 | (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) { |
274 | i++; |
275 | } |
276 | if (type == U_JT_DUAL_JOINING || type == U_JT_RIGHT_JOINING || |
277 | type == U_JT_JOIN_CAUSING) { |
278 | // The next character is of the type that may join the last character. See |
279 | // if the last character is also of the right type. |
280 | i = location - 2; // Skip the soft hyphen |
281 | type = U_JT_NON_JOINING; |
282 | while (i >= 0 && (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) { |
283 | i--; |
284 | } |
285 | if (type == U_JT_DUAL_JOINING || type == U_JT_LEFT_JOINING || |
286 | type == U_JT_JOIN_CAUSING) { |
287 | return HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ; |
288 | } |
289 | } |
290 | return HyphenationType::BREAK_AND_INSERT_HYPHEN; |
291 | } |
292 | |
293 | // Use various recommendations of UAX #14 Unicode Line Breaking Algorithm for |
294 | // hyphenating words that didn't match patterns, especially words that contain |
295 | // hyphens or soft hyphens (See sections 5.3, Use of Hyphen, and 5.4, Use of |
296 | // Soft Hyphen). |
297 | void Hyphenator::hyphenateWithNoPatterns(HyphenationType* result, |
298 | const uint16_t* word, |
299 | size_t len, |
300 | const icu::Locale& locale) { |
301 | result[0] = HyphenationType::DONT_BREAK; |
302 | for (size_t i = 1; i < len; i++) { |
303 | const uint16_t prevChar = word[i - 1]; |
304 | if (i > 1 && isLineBreakingHyphen(prevChar)) { |
305 | // Break after hyphens, but only if they don't start the word. |
306 | |
307 | if ((prevChar == CHAR_HYPHEN_MINUS || prevChar == CHAR_HYPHEN) && |
308 | strcmp(locale.getLanguage(), "pl" ) == 0 && |
309 | getScript(word[i]) == USCRIPT_LATIN) { |
310 | // In Polish, hyphens get repeated at the next line. To be safe, |
311 | // we will do this only if the next character is Latin. |
312 | result[i] = HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE; |
313 | } else { |
314 | result[i] = HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN; |
315 | } |
316 | } else if (i > 1 && prevChar == CHAR_SOFT_HYPHEN) { |
317 | // Break after soft hyphens, but only if they don't start the word (a soft |
318 | // hyphen starting the word doesn't give any useful break opportunities). |
319 | // The type of the break is based on the script of the character we break |
320 | // on. |
321 | if (getScript(word[i]) == USCRIPT_ARABIC) { |
322 | // For Arabic, we need to look and see if the characters around the soft |
323 | // hyphen actually join. If they don't, we'll just insert a normal |
324 | // hyphen. |
325 | result[i] = getHyphTypeForArabic(word, len, i); |
326 | } else { |
327 | result[i] = hyphenationTypeBasedOnScript(word[i]); |
328 | } |
329 | } else if (prevChar == CHAR_MIDDLE_DOT && minPrefix < i && |
330 | i <= len - minSuffix && |
331 | ((word[i - 2] == 'l' && word[i] == 'l') || |
332 | (word[i - 2] == 'L' && word[i] == 'L')) && |
333 | strcmp(locale.getLanguage(), "ca" ) == 0) { |
334 | // In Catalan, "l·l" should break as "l-" on the first line |
335 | // and "l" on the next line. |
336 | result[i] = HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN; |
337 | } else { |
338 | result[i] = HyphenationType::DONT_BREAK; |
339 | } |
340 | } |
341 | } |
342 | |
343 | HyphenationType Hyphenator::alphabetLookup(uint16_t* alpha_codes, |
344 | const uint16_t* word, |
345 | size_t len) { |
346 | const Header* = getHeader(); |
347 | HyphenationType result = HyphenationType::BREAK_AND_INSERT_HYPHEN; |
348 | // TODO: check header magic |
349 | uint32_t alphabetVersion = header->alphabetVersion(); |
350 | if (alphabetVersion == 0) { |
351 | const AlphabetTable0* alphabet = header->alphabetTable0(); |
352 | uint32_t min_codepoint = alphabet->min_codepoint; |
353 | uint32_t max_codepoint = alphabet->max_codepoint; |
354 | alpha_codes[0] = 0; // word start |
355 | for (size_t i = 0; i < len; i++) { |
356 | uint16_t c = word[i]; |
357 | if (c < min_codepoint || c >= max_codepoint) { |
358 | return HyphenationType::DONT_BREAK; |
359 | } |
360 | uint8_t code = alphabet->data[c - min_codepoint]; |
361 | if (code == 0) { |
362 | return HyphenationType::DONT_BREAK; |
363 | } |
364 | if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) { |
365 | result = hyphenationTypeBasedOnScript(c); |
366 | } |
367 | alpha_codes[i + 1] = code; |
368 | } |
369 | alpha_codes[len + 1] = 0; // word termination |
370 | return result; |
371 | } else if (alphabetVersion == 1) { |
372 | const AlphabetTable1* alphabet = header->alphabetTable1(); |
373 | size_t n_entries = alphabet->n_entries; |
374 | const uint32_t* begin = alphabet->data; |
375 | const uint32_t* end = begin + n_entries; |
376 | alpha_codes[0] = 0; |
377 | for (size_t i = 0; i < len; i++) { |
378 | uint16_t c = word[i]; |
379 | auto p = std::lower_bound<const uint32_t*, uint32_t>(begin, end, c << 11); |
380 | if (p == end) { |
381 | return HyphenationType::DONT_BREAK; |
382 | } |
383 | uint32_t entry = *p; |
384 | if (AlphabetTable1::codepoint(entry) != c) { |
385 | return HyphenationType::DONT_BREAK; |
386 | } |
387 | if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) { |
388 | result = hyphenationTypeBasedOnScript(c); |
389 | } |
390 | alpha_codes[i + 1] = AlphabetTable1::value(entry); |
391 | } |
392 | alpha_codes[len + 1] = 0; |
393 | return result; |
394 | } |
395 | return HyphenationType::DONT_BREAK; |
396 | } |
397 | |
398 | /** |
399 | * Internal implementation, after conversion to codes. All case folding and |
400 | *normalization has been done by now, and all characters have been found in the |
401 | *alphabet. Note: len here is the padded length including 0 codes at start and |
402 | *end. |
403 | **/ |
404 | void Hyphenator::hyphenateFromCodes(HyphenationType* result, |
405 | const uint16_t* codes, |
406 | size_t len, |
407 | HyphenationType hyphenValue) { |
408 | static_assert(sizeof(HyphenationType) == sizeof(uint8_t), |
409 | "HyphnationType must be uint8_t." ); |
410 | // Reuse the result array as a buffer for calculating intermediate hyphenation |
411 | // numbers. |
412 | uint8_t* buffer = reinterpret_cast<uint8_t*>(result); |
413 | |
414 | const Header* = getHeader(); |
415 | const Trie* trie = header->trieTable(); |
416 | const Pattern* pattern = header->patternTable(); |
417 | uint32_t char_mask = trie->char_mask; |
418 | uint32_t link_shift = trie->link_shift; |
419 | uint32_t link_mask = trie->link_mask; |
420 | uint32_t pattern_shift = trie->pattern_shift; |
421 | size_t maxOffset = len - minSuffix - 1; |
422 | for (size_t i = 0; i < len - 1; i++) { |
423 | uint32_t node = 0; // index into Trie table |
424 | for (size_t j = i; j < len; j++) { |
425 | uint16_t c = codes[j]; |
426 | uint32_t entry = trie->data[node + c]; |
427 | if ((entry & char_mask) == c) { |
428 | node = (entry & link_mask) >> link_shift; |
429 | } else { |
430 | break; |
431 | } |
432 | uint32_t pat_ix = trie->data[node] >> pattern_shift; |
433 | // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), |
434 | // and an offset into the buf pool. This is the pattern for the substring |
435 | // (i..j) we just matched, which we combine (via point-wise max) into the |
436 | // buffer vector. |
437 | if (pat_ix != 0) { |
438 | uint32_t pat_entry = pattern->data[pat_ix]; |
439 | int pat_len = Pattern::len(pat_entry); |
440 | int pat_shift = Pattern::shift(pat_entry); |
441 | const uint8_t* pat_buf = pattern->buf(pat_entry); |
442 | int offset = j + 1 - (pat_len + pat_shift); |
443 | // offset is the index within buffer that lines up with the start of |
444 | // pat_buf |
445 | int start = std::max((int)minPrefix - offset, 0); |
446 | int end = std::min(pat_len, (int)maxOffset - offset); |
447 | for (int k = start; k < end; k++) { |
448 | buffer[offset + k] = std::max(buffer[offset + k], pat_buf[k]); |
449 | } |
450 | } |
451 | } |
452 | } |
453 | // Since the above calculation does not modify values outside |
454 | // [minPrefix, len - minSuffix], they are left as 0 = DONT_BREAK. |
455 | for (size_t i = minPrefix; i < maxOffset; i++) { |
456 | // Hyphenation opportunities happen when the hyphenation numbers are odd. |
457 | result[i] = (buffer[i] & 1u) ? hyphenValue : HyphenationType::DONT_BREAK; |
458 | } |
459 | } |
460 | |
461 | } // namespace minikin |
462 | |