Hyphenator.cpp source code [engine/flutter/third_party/txt/src/minikin/Hyphenator.cpp]

1	/*
2	* Copyright (C) 2015 The Android Open Source Project
3	*
4	* Licensed under the Apache License, Version 2.0 (the "License");
5	* you may not use this file except in compliance with the License.
6	* You may obtain a copy of the License at
7	*
8	* http://www.apache.org/licenses/LICENSE-2.0
9	*
10	* Unless required by applicable law or agreed to in writing, software
11	* distributed under the License is distributed on an "AS IS" BASIS,
12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13	* See the License for the specific language governing permissions and
14	* limitations under the License.
15	*/
16
17	#include <unicode/uchar.h>
18	#include <unicode/uscript.h>
19	#include <algorithm>
20	#include <memory>
21	#include <string>
22	#include <vector>
23
24	// HACK: for reading pattern file
25	#include <fcntl.h>
26
27	#define LOG_TAG "Minikin"
28
29	#include "minikin/Hyphenator.h"
30	#include "utils/WindowsUtils.h"
31
32	using std::vector;
33
34	namespace minikin {
35
36	static const uint16_t CHAR_HYPHEN_MINUS = `0x002D`;
37	static const uint16_t CHAR_SOFT_HYPHEN = `0x00AD`;
38	static const uint16_t CHAR_MIDDLE_DOT = `0x00B7`;
39	static const uint16_t CHAR_HYPHEN = `0x2010`;
40
41	// The following are structs that correspond to tables inside the hyb file
42	// format
43
44	struct AlphabetTable0 {
45	uint32_t version;
46	uint32_t min_codepoint;
47	uint32_t max_codepoint;
48	uint8_t data[`1`]; // actually flexible array, size is known at runtime
49	};
50
51	struct AlphabetTable1 {
52	uint32_t version;
53	uint32_t n_entries;
54	uint32_t data[`1`]; // actually flexible array, size is known at runtime
55
56	static uint32_t codepoint(uint32_t entry) { return entry >> `11`; }
57	static uint32_t value(uint32_t entry) { return entry & `0x7ff`; }
58	};
59
60	struct Trie {
61	uint32_t version;
62	uint32_t char_mask;
63	uint32_t link_shift;
64	uint32_t link_mask;
65	uint32_t pattern_shift;
66	uint32_t n_entries;
67	uint32_t data[`1`]; // actually flexible array, size is known at runtime
68	};
69
70	struct Pattern {
71	uint32_t version;
72	uint32_t n_entries;
73	uint32_t pattern_offset;
74	uint32_t pattern_size;
75	uint32_t data[`1`]; // actually flexible array, size is known at runtime
76
77	// accessors
78	static uint32_t len(uint32_t entry) { return entry >> `26`; }
79	static uint32_t shift(uint32_t entry) { return (entry >> `20`) & `0x3f`; }
80	const uint8_t* buf(uint32_t entry) const {
81	return reinterpret_cast<const uint8_t>(this*) + pattern_offset +
82	(entry & `0xfffff`);
83	}
84	};
85
86	struct Header {
87	uint32_t magic;
88	uint32_t version;
89	uint32_t alphabet_offset;
90	uint32_t trie_offset;
91	uint32_t pattern_offset;
92	uint32_t file_size;
93
94	// accessors
95	const uint8_t* bytes() const {
96	return reinterpret_cast<const uint8_t>(this*);
97	}
98	uint32_t alphabetVersion() const {
99	return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset);
100	}
101	const AlphabetTable0* alphabetTable0() const {
102	return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset);
103	}
104	const AlphabetTable1* alphabetTable1() const {
105	return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset);
106	}
107	const Trie* trieTable() const {
108	return reinterpret_cast<const Trie*>(bytes() + trie_offset);
109	}
110	const Pattern* patternTable() const {
111	return reinterpret_cast<const Pattern*>(bytes() + pattern_offset);
112	}
113	};
114
115	Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData,
116	size_t minPrefix,
117	size_t minSuffix) {
118	Hyphenator* result = new Hyphenator;
119	result->patternData = patternData;
120	result->minPrefix = minPrefix;
121	result->minSuffix = minSuffix;
122	return result;
123	}
124
125	void Hyphenator::hyphenate(vector<HyphenationType>* result,
126	const uint16_t* word,
127	size_t len,
128	const icu::Locale& locale) {
129	result->clear();
130	result->resize(len);
131	const size_t paddedLen = len + `2`; // start and stop code each count for 1
132	if (patternData != nullptr && len >= minPrefix + minSuffix &&
133	paddedLen <= MAX_HYPHENATED_SIZE) {
134	uint16_t alpha_codes[MAX_HYPHENATED_SIZE];
135	const HyphenationType hyphenValue = alphabetLookup(alpha_codes, word, len);
136	if (hyphenValue != HyphenationType::DONT_BREAK) {
137	hyphenateFromCodes(result->data(), alpha_codes, paddedLen, hyphenValue);
138	return;
139	}
140	// TODO: try NFC normalization
141	// TODO: handle non-BMP Unicode (requires remapping of offsets)
142	}
143	// Note that we will always get here if the word contains a hyphen or a soft
144	// hyphen, because the alphabet is not expected to contain a hyphen or a soft
145	// hyphen character, so alphabetLookup would return DONT_BREAK.
146	hyphenateWithNoPatterns(result->data(), word, len, locale);
147	}
148
149	// This function determines whether a character is like U+2010 HYPHEN in
150	// line breaking and usage: a character immediately after which line breaks
151	// are allowed, but words containing it should not be automatically
152	// hyphenated using patterns. This is a curated set, created by manually
153	// inspecting all the characters that have the Unicode line breaking
154	// property of BA or HY and seeing which ones are hyphens.
155	bool Hyphenator::isLineBreakingHyphen(uint32_t c) {
156	return (c == `0x002D` \|\| // HYPHEN-MINUS
157	c == `0x058A` \|\| // ARMENIAN HYPHEN
158	c == `0x05BE` \|\| // HEBREW PUNCTUATION MAQAF
159	c == `0x1400` \|\| // CANADIAN SYLLABICS HYPHEN
160	c == `0x2010` \|\| // HYPHEN
161	c == `0x2013` \|\| // EN DASH
162	c == `0x2027` \|\| // HYPHENATION POINT
163	c == `0x2E17` \|\| // DOUBLE OBLIQUE HYPHEN
164	c == `0x2E40`); // DOUBLE HYPHEN
165	}
166
167	const static uint32_t HYPHEN_STR[] = {`0x2010`, `0`};
168	const static uint32_t ARMENIAN_HYPHEN_STR[] = {`0x058A`, `0`};
169	const static uint32_t MAQAF_STR[] = {`0x05BE`, `0`};
170	const static uint32_t UCAS_HYPHEN_STR[] = {`0x1400`, `0`};
171	const static uint32_t ZWJ_STR[] = {`0x200D`, `0`};
172	const static uint32_t ZWJ_AND_HYPHEN_STR[] = {`0x200D`, `0x2010`, `0`};
173
174	const uint32_t* HyphenEdit::getHyphenString(uint32_t hyph) {
175	switch (hyph) {
176	case INSERT_HYPHEN_AT_END:
177	case REPLACE_WITH_HYPHEN_AT_END:
178	case INSERT_HYPHEN_AT_START:
179	return HYPHEN_STR;
180	case INSERT_ARMENIAN_HYPHEN_AT_END:
181	return ARMENIAN_HYPHEN_STR;
182	case INSERT_MAQAF_AT_END:
183	return MAQAF_STR;
184	case INSERT_UCAS_HYPHEN_AT_END:
185	return UCAS_HYPHEN_STR;
186	case INSERT_ZWJ_AND_HYPHEN_AT_END:
187	return ZWJ_AND_HYPHEN_STR;
188	case INSERT_ZWJ_AT_START:
189	return ZWJ_STR;
190	default:
191	return nullptr;
192	}
193	}
194
195	uint32_t HyphenEdit::editForThisLine(HyphenationType type) {
196	switch (type) {
197	case HyphenationType::DONT_BREAK:
198	return NO_EDIT;
199	case HyphenationType::BREAK_AND_INSERT_HYPHEN:
200	return INSERT_HYPHEN_AT_END;
201	case HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN:
202	return INSERT_ARMENIAN_HYPHEN_AT_END;
203	case HyphenationType::BREAK_AND_INSERT_MAQAF:
204	return INSERT_MAQAF_AT_END;
205	case HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN:
206	return INSERT_UCAS_HYPHEN_AT_END;
207	case HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN:
208	return REPLACE_WITH_HYPHEN_AT_END;
209	case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
210	return INSERT_ZWJ_AND_HYPHEN_AT_END;
211	default:
212	return BREAK_AT_END;
213	}
214	}
215
216	uint32_t HyphenEdit::editForNextLine(HyphenationType type) {
217	switch (type) {
218	case HyphenationType::DONT_BREAK:
219	return NO_EDIT;
220	case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE:
221	return INSERT_HYPHEN_AT_START;
222	case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
223	return INSERT_ZWJ_AT_START;
224	default:
225	return BREAK_AT_START;
226	}
227	}
228
229	static UScriptCode getScript(uint32_t codePoint) {
230	UErrorCode errorCode = U_ZERO_ERROR;
231	const UScriptCode script =
232	uscript_getScript(static_cast<UChar32>(codePoint), &errorCode);
233	if (U_SUCCESS(errorCode)) {
234	return script;
235	} else {
236	return USCRIPT_INVALID_CODE;
237	}
238	}
239
240	static HyphenationType hyphenationTypeBasedOnScript(uint32_t codePoint) {
241	// Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the
242	// "correct" hyphen for Hebrew, modern practice may have shifted towards
243	// Western hyphens. We use normal hyphens for now to be safe.
244	// BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch to
245	// maqaf for Hebrew, we can simply add a condition here.
246	const UScriptCode script = getScript(codePoint);
247	if (script == USCRIPT_KANNADA \|\| script == USCRIPT_MALAYALAM \|\|
248	script == USCRIPT_TAMIL \|\| script == USCRIPT_TELUGU) {
249	// Grantha is not included, since we don't support non-BMP hyphenation yet.
250	return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
251	} else if (script == USCRIPT_ARMENIAN) {
252	return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN;
253	} else if (script == USCRIPT_CANADIAN_ABORIGINAL) {
254	return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN;
255	} else {
256	return HyphenationType::BREAK_AND_INSERT_HYPHEN;
257	}
258	}
259
260	static inline int32_t getJoiningType(UChar32 codepoint) {
261	return u_getIntPropertyValue(codepoint, UCHAR_JOINING_TYPE);
262	}
263
264	// Assumption for caller: location must be >= 2 and word[location] ==
265	// CHAR_SOFT_HYPHEN. This function decides if the letters before and after the
266	// hyphen should appear as joining.
267	static inline HyphenationType getHyphTypeForArabic(const uint16_t* word,
268	size_t len,
269	size_t location) {
270	ssize_t i = location;
271	int32_t type = U_JT_NON_JOINING;
272	while (static_cast<size_t>(i) < len &&
273	(type = getJoiningType(word[i])) == U_JT_TRANSPARENT) {
274	i++;
275	}
276	if (type == U_JT_DUAL_JOINING \|\| type == U_JT_RIGHT_JOINING \|\|
277	type == U_JT_JOIN_CAUSING) {
278	// The next character is of the type that may join the last character. See
279	// if the last character is also of the right type.
280	i = location - `2`; // Skip the soft hyphen
281	type = U_JT_NON_JOINING;
282	while (i >= `0` && (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) {
283	i--;
284	}
285	if (type == U_JT_DUAL_JOINING \|\| type == U_JT_LEFT_JOINING \|\|
286	type == U_JT_JOIN_CAUSING) {
287	return HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ;
288	}
289	}
290	return HyphenationType::BREAK_AND_INSERT_HYPHEN;
291	}
292
293	// Use various recommendations of UAX #14 Unicode Line Breaking Algorithm for
294	// hyphenating words that didn't match patterns, especially words that contain
295	// hyphens or soft hyphens (See sections 5.3, Use of Hyphen, and 5.4, Use of
296	// Soft Hyphen).
297	void Hyphenator::hyphenateWithNoPatterns(HyphenationType* result,
298	const uint16_t* word,
299	size_t len,
300	const icu::Locale& locale) {
301	result[`0`] = HyphenationType::DONT_BREAK;
302	for (size_t i = `1`; i < len; i++) {
303	const uint16_t prevChar = word[i - `1`];
304	if (i > `1` && isLineBreakingHyphen(prevChar)) {
305	// Break after hyphens, but only if they don't start the word.
306
307	if ((prevChar == CHAR_HYPHEN_MINUS \|\| prevChar == CHAR_HYPHEN) &&
308	strcmp(locale.getLanguage(), "pl") == `0` &&
309	getScript(word[i]) == USCRIPT_LATIN) {
310	// In Polish, hyphens get repeated at the next line. To be safe,
311	// we will do this only if the next character is Latin.
312	result[i] = HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE;
313	} else {
314	result[i] = HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
315	}
316	} else if (i > `1` && prevChar == CHAR_SOFT_HYPHEN) {
317	// Break after soft hyphens, but only if they don't start the word (a soft
318	// hyphen starting the word doesn't give any useful break opportunities).
319	// The type of the break is based on the script of the character we break
320	// on.
321	if (getScript(word[i]) == USCRIPT_ARABIC) {
322	// For Arabic, we need to look and see if the characters around the soft
323	// hyphen actually join. If they don't, we'll just insert a normal
324	// hyphen.
325	result[i] = getHyphTypeForArabic(word, len, i);
326	} else {
327	result[i] = hyphenationTypeBasedOnScript(word[i]);
328	}
329	} else if (prevChar == CHAR_MIDDLE_DOT && minPrefix < i &&
330	i <= len - minSuffix &&
331	((word[i - `2`] == `'l'` && word[i] == `'l'`) \|\|
332	(word[i - `2`] == `'L'` && word[i] == `'L'`)) &&
333	strcmp(locale.getLanguage(), "ca") == `0`) {
334	// In Catalan, "l·l" should break as "l-" on the first line
335	// and "l" on the next line.
336	result[i] = HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN;
337	} else {
338	result[i] = HyphenationType::DONT_BREAK;
339	}
340	}
341	}
342
343	HyphenationType Hyphenator::alphabetLookup(uint16_t* alpha_codes,
344	const uint16_t* word,
345	size_t len) {
346	const Header* header = getHeader();
347	HyphenationType result = HyphenationType::BREAK_AND_INSERT_HYPHEN;
348	// TODO: check header magic
349	uint32_t alphabetVersion = header->alphabetVersion();
350	if (alphabetVersion == `0`) {
351	const AlphabetTable0* alphabet = header->alphabetTable0();
352	uint32_t min_codepoint = alphabet->min_codepoint;
353	uint32_t max_codepoint = alphabet->max_codepoint;
354	alpha_codes[`0`] = `0`; // word start
355	for (size_t i = `0`; i < len; i++) {
356	uint16_t c = word[i];
357	if (c < min_codepoint \|\| c >= max_codepoint) {
358	return HyphenationType::DONT_BREAK;
359	}
360	uint8_t code = alphabet->data[c - min_codepoint];
361	if (code == `0`) {
362	return HyphenationType::DONT_BREAK;
363	}
364	if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
365	result = hyphenationTypeBasedOnScript(c);
366	}
367	alpha_codes[i + `1`] = code;
368	}
369	alpha_codes[len + `1`] = `0`; // word termination
370	return result;
371	} else if (alphabetVersion == `1`) {
372	const AlphabetTable1* alphabet = header->alphabetTable1();
373	size_t n_entries = alphabet->n_entries;
374	const uint32_t* begin = alphabet->data;
375	const uint32_t* end = begin + n_entries;
376	alpha_codes[`0`] = `0`;
377	for (size_t i = `0`; i < len; i++) {
378	uint16_t c = word[i];
379	auto p = std::lower_bound<const uint32_t*, uint32_t>(begin, end, c << `11`);
380	if (p == end) {
381	return HyphenationType::DONT_BREAK;
382	}
383	uint32_t entry = *p;
384	if (AlphabetTable1::codepoint(entry) != c) {
385	return HyphenationType::DONT_BREAK;
386	}
387	if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
388	result = hyphenationTypeBasedOnScript(c);
389	}
390	alpha_codes[i + `1`] = AlphabetTable1::value(entry);
391	}
392	alpha_codes[len + `1`] = `0`;
393	return result;
394	}
395	return HyphenationType::DONT_BREAK;
396	}
397
398	/**
399	* Internal implementation, after conversion to codes. All case folding and
400	*normalization has been done by now, and all characters have been found in the
401	*alphabet. Note: len here is the padded length including 0 codes at start and
402	*end.
403	**/
404	void Hyphenator::hyphenateFromCodes(HyphenationType* result,
405	const uint16_t* codes,
406	size_t len,
407	HyphenationType hyphenValue) {
408	static_assert(sizeof(HyphenationType) == sizeof(uint8_t),
409	"HyphnationType must be uint8_t.");
410	// Reuse the result array as a buffer for calculating intermediate hyphenation
411	// numbers.
412	uint8_t* buffer = reinterpret_cast<uint8_t*>(result);
413
414	const Header* header = getHeader();
415	const Trie* trie = header->trieTable();
416	const Pattern* pattern = header->patternTable();
417	uint32_t char_mask = trie->char_mask;
418	uint32_t link_shift = trie->link_shift;
419	uint32_t link_mask = trie->link_mask;
420	uint32_t pattern_shift = trie->pattern_shift;
421	size_t maxOffset = len - minSuffix - `1`;
422	for (size_t i = `0`; i < len - `1`; i++) {
423	uint32_t node = `0`; // index into Trie table
424	for (size_t j = i; j < len; j++) {
425	uint16_t c = codes[j];
426	uint32_t entry = trie->data[node + c];
427	if ((entry & char_mask) == c) {
428	node = (entry & link_mask) >> link_shift;
429	} else {
430	break;
431	}
432	uint32_t pat_ix = trie->data[node] >> pattern_shift;
433	// pat_ix contains a 3-tuple of length, shift (number of trailing zeros),
434	// and an offset into the buf pool. This is the pattern for the substring
435	// (i..j) we just matched, which we combine (via point-wise max) into the
436	// buffer vector.
437	if (pat_ix != `0`) {
438	uint32_t pat_entry = pattern->data[pat_ix];
439	int pat_len = Pattern::len(pat_entry);
440	int pat_shift = Pattern::shift(pat_entry);
441	const uint8_t* pat_buf = pattern->buf(pat_entry);
442	int offset = j + `1` - (pat_len + pat_shift);
443	// offset is the index within buffer that lines up with the start of
444	// pat_buf
445	int start = std::max((int)minPrefix - offset, `0`);
446	int end = std::min(pat_len, (int)maxOffset - offset);
447	for (int k = start; k < end; k++) {
448	buffer[offset + k] = std::max(buffer[offset + k], pat_buf[k]);
449	}
450	}
451	}
452	}
453	// Since the above calculation does not modify values outside
454	// [minPrefix, len - minSuffix], they are left as 0 = DONT_BREAK.
455	for (size_t i = minPrefix; i < maxOffset; i++) {
456	// Hyphenation opportunities happen when the hyphenation numbers are odd.
457	result[i] = (buffer[i] & `1u`) ? hyphenValue : HyphenationType::DONT_BREAK;
458	}
459	}
460
461	} // namespace minikin
462

Browse the source code of engine/flutter/third_party/txt/src/minikin/Hyphenator.cpp