hb-unicode.hh source code [OpenJDK/src/java.desktop/share/native/libfontmanager/harfbuzz/hb-unicode.hh]

1	/*
2	* Copyright © 2009 Red Hat, Inc.
3	* Copyright © 2011 Codethink Limited
4	* Copyright © 2010,2011,2012 Google, Inc.
5	*
6	* This is part of HarfBuzz, a text shaping library.
7	*
8	* Permission is hereby granted, without written agreement and without
9	* license or royalty fees, to use, copy, modify, and distribute this
10	* software and its documentation for any purpose, provided that the
11	* above copyright notice and the following two paragraphs appear in
12	* all copies of this software.
13	*
14	* IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
15	* DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
16	* ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
17	* IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
18	* DAMAGE.
19	*
20	* THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
21	* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
22	* FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
23	* ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
24	* PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
25	*
26	* Red Hat Author(s): Behdad Esfahbod
27	* Codethink Author(s): Ryan Lortie
28	* Google Author(s): Behdad Esfahbod
29	*/
30
31	#ifndef HB_UNICODE_HH
32	#define HB_UNICODE_HH
33
34	#include "hb.hh"
35
36
37	extern HB_INTERNAL const uint8_t _hb_modified_combining_class[`256`];
38
39	/*
40	* hb_unicode_funcs_t
41	*/
42
/* X-macro listing every Unicode callback by name.  The user defines
 * HB_UNICODE_FUNC_IMPLEMENT(name) before expanding this list and
 * undefines it afterwards; the list itself expands to one invocation
 * per callback, in the order given below. */
#define HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS \
  HB_UNICODE_FUNC_IMPLEMENT (combining_class) \
  HB_UNICODE_FUNC_IMPLEMENT (eastasian_width) \
  HB_UNICODE_FUNC_IMPLEMENT (general_category) \
  HB_UNICODE_FUNC_IMPLEMENT (mirroring) \
  HB_UNICODE_FUNC_IMPLEMENT (script) \
  HB_UNICODE_FUNC_IMPLEMENT (compose) \
  HB_UNICODE_FUNC_IMPLEMENT (decompose) \
  HB_UNICODE_FUNC_IMPLEMENT (decompose_compatibility) \
  /* ^--- Add new callbacks here */
53
/* Simple callbacks are those taking a hb_codepoint_t and returning a
 * single value.  The user defines
 * HB_UNICODE_FUNC_IMPLEMENT(return_type, name) before expanding this
 * list; each entry supplies the callback's return type and its name. */
#define HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS_SIMPLE \
  HB_UNICODE_FUNC_IMPLEMENT (hb_unicode_combining_class_t, combining_class) \
  HB_UNICODE_FUNC_IMPLEMENT (unsigned int, eastasian_width) \
  HB_UNICODE_FUNC_IMPLEMENT (hb_unicode_general_category_t, general_category) \
  HB_UNICODE_FUNC_IMPLEMENT (hb_codepoint_t, mirroring) \
  HB_UNICODE_FUNC_IMPLEMENT (hb_script_t, script) \
  /* ^--- Add new simple callbacks here */
62
63	struct hb_unicode_funcs_t
64	{
65	hb_object_header_t header;
66
67	hb_unicode_funcs_t *parent;
68
69	#define HB_UNICODE_FUNC_IMPLEMENT(return_type, name) \
70	return_type name (hb_codepoint_t unicode) { return func.name (this, unicode, user_data.name); }
71	HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS_SIMPLE
72	#undef HB_UNICODE_FUNC_IMPLEMENT
73
74	hb_bool_t compose (hb_codepoint_t a, hb_codepoint_t b,
75	hb_codepoint_t *ab)
76	{
77	*ab = `0`;
78	if (unlikely (!a \|\| !b)) return false;
79	return func.compose (this, a, b, ab, user_data.compose);
80	}
81
82	hb_bool_t decompose (hb_codepoint_t ab,
83	hb_codepoint_t a, hb_codepoint_t b)
84	{
85	a = ab; b = `0`;
86	return func.decompose (this, ab, a, b, user_data.decompose);
87	}
88
89	unsigned int decompose_compatibility (hb_codepoint_t u,
90	hb_codepoint_t *decomposed)
91	{
92	unsigned int ret = func.decompose_compatibility (this, u, decomposed, user_data.decompose_compatibility);
93	if (ret == `1` && u == decomposed[`0`]) {
94	decomposed[`0`] = `0`;
95	return `0`;
96	}
97	decomposed[ret] = `0`;
98	return ret;
99	}
100
101	unsigned int
102	modified_combining_class (hb_codepoint_t u)
103	{
104	/ XXX This hack belongs to the Myanmar shaper. /
105	if (unlikely (u == `0x1037u`)) u = `0x103Au`;
106
107	/ XXX This hack belongs to the USE shaper (for Tai Tham):*
108	* Reorder SAKOT to ensure it comes after any tone marks. */
109	if (unlikely (u == `0x1A60u`)) return `254`;
110
111	/ XXX This hack belongs to the Tibetan shaper:*
112	* Reorder PADMA to ensure it comes after any vowel marks. */
113	if (unlikely (u == `0x0FC6u`)) return `254`;
114	/ Reorder TSA -PHRU to reorder before U+0F74 /
115	if (unlikely (u == `0x0F39u`)) return `127`;
116
117	return _hb_modified_combining_class[combining_class (u)];
118	}
119
120	static hb_bool_t
121	is_variation_selector (hb_codepoint_t unicode)
122	{
123	/ U+180B..180D MONGOLIAN FREE VARIATION SELECTORs are handled in the*
124	* Arabic shaper. No need to match them here. */
125	return unlikely (hb_in_ranges<hb_codepoint_t> (unicode,
126	`0xFE00u`, `0xFE0Fu`, / VARIATION SELECTOR-1..16 /
127	`0xE0100u`, `0xE01EFu`)); / VARIATION SELECTOR-17..256 /
128	}
129
130	/ Default_Ignorable codepoints:*
131	*
132	* Note: While U+115F, U+1160, U+3164 and U+FFA0 are Default_Ignorable,
133	* we do NOT want to hide them, as the way Uniscribe has implemented them
134	* is with regular spacing glyphs, and that's the way fonts are made to work.
135	* As such, we make exceptions for those four.
136	* Also ignoring U+1BCA0..1BCA3. https://github.com/harfbuzz/harfbuzz/issues/503
137	*
138	* Unicode 7.0:
139	* $ grep '; Default_Ignorable_Code_Point ' DerivedCoreProperties.txt \| sed 's/;.*#/#/'
140	* 00AD # Cf SOFT HYPHEN
141	* 034F # Mn COMBINING GRAPHEME JOINER
142	* 061C # Cf ARABIC LETTER MARK
143	* 115F..1160 # Lo [2] HANGUL CHOSEONG FILLER..HANGUL JUNGSEONG FILLER
144	* 17B4..17B5 # Mn [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA
145	* 180B..180D # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE
146	* 180E # Cf MONGOLIAN VOWEL SEPARATOR
147	* 200B..200F # Cf [5] ZERO WIDTH SPACE..RIGHT-TO-LEFT MARK
148	* 202A..202E # Cf [5] LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE
149	* 2060..2064 # Cf [5] WORD JOINER..INVISIBLE PLUS
150	* 2065 # Cn <reserved-2065>
151	* 2066..206F # Cf [10] LEFT-TO-RIGHT ISOLATE..NOMINAL DIGIT SHAPES
152	* 3164 # Lo HANGUL FILLER
153	* FE00..FE0F # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16
154	* FEFF # Cf ZERO WIDTH NO-BREAK SPACE
155	* FFA0 # Lo HALFWIDTH HANGUL FILLER
156	* FFF0..FFF8 # Cn [9] <reserved-FFF0>..<reserved-FFF8>
157	* 1BCA0..1BCA3 # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP
158	* 1D173..1D17A # Cf [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE
159	* E0000 # Cn <reserved-E0000>
160	* E0001 # Cf LANGUAGE TAG
161	* E0002..E001F # Cn [30] <reserved-E0002>..<reserved-E001F>
162	* E0020..E007F # Cf [96] TAG SPACE..CANCEL TAG
163	* E0080..E00FF # Cn [128] <reserved-E0080>..<reserved-E00FF>
164	* E0100..E01EF # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
165	* E01F0..E0FFF # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
166	*/
167	static hb_bool_t
168	is_default_ignorable (hb_codepoint_t ch)
169	{
170	hb_codepoint_t plane = ch >> `16`;
171	if (likely (plane == `0`))
172	{
173	/ BMP /
174	hb_codepoint_t page = ch >> `8`;
175	switch (page) {
176	case `0x00`: return unlikely (ch == `0x00ADu`);
177	case `0x03`: return unlikely (ch == `0x034Fu`);
178	case `0x06`: return unlikely (ch == `0x061Cu`);
179	case `0x17`: return hb_in_range<hb_codepoint_t> (ch, `0x17B4u`, `0x17B5u`);
180	case `0x18`: return hb_in_range<hb_codepoint_t> (ch, `0x180Bu`, `0x180Eu`);
181	case `0x20`: return hb_in_ranges<hb_codepoint_t> (ch, `0x200Bu`, `0x200Fu`,
182	`0x202Au`, `0x202Eu`,
183	`0x2060u`, `0x206Fu`);
184	case `0xFE`: return hb_in_range<hb_codepoint_t> (ch, `0xFE00u`, `0xFE0Fu`) \|\| ch == `0xFEFFu`;
185	case `0xFF`: return hb_in_range<hb_codepoint_t> (ch, `0xFFF0u`, `0xFFF8u`);
186	default: return false;
187	}
188	}
189	else
190	{
191	/ Other planes /
192	switch (plane) {
193	case `0x01`: return hb_in_range<hb_codepoint_t> (ch, `0x1D173u`, `0x1D17Au`);
194	case `0x0E`: return hb_in_range<hb_codepoint_t> (ch, `0xE0000u`, `0xE0FFFu`);
195	default: return false;
196	}
197	}
198	}
199
200	/ Space estimates based on:*
201	* https://unicode.org/charts/PDF/U2000.pdf
202	* https://docs.microsoft.com/en-us/typography/develop/character-design-standards/whitespace
203	*/
204	enum space_t {
205	NOT_SPACE = `0`,
206	SPACE_EM = `1`,
207	SPACE_EM_2 = `2`,
208	SPACE_EM_3 = `3`,
209	SPACE_EM_4 = `4`,
210	SPACE_EM_5 = `5`,
211	SPACE_EM_6 = `6`,
212	SPACE_EM_16 = `16`,
213	SPACE_4_EM_18, / 4/18th of an EM! /
214	SPACE,
215	SPACE_FIGURE,
216	SPACE_PUNCTUATION,
217	SPACE_NARROW,
218	};
219	static space_t
220	space_fallback_type (hb_codepoint_t u)
221	{
222	switch (u)
223	{
224	/ All GC=Zs chars that can use a fallback. /
225	default: return NOT_SPACE; / U+1680 OGHAM SPACE MARK /
226	case `0x0020u`: return SPACE; / U+0020 SPACE /
227	case `0x00A0u`: return SPACE; / U+00A0 NO-BREAK SPACE /
228	case `0x2000u`: return SPACE_EM_2; / U+2000 EN QUAD /
229	case `0x2001u`: return SPACE_EM; / U+2001 EM QUAD /
230	case `0x2002u`: return SPACE_EM_2; / U+2002 EN SPACE /
231	case `0x2003u`: return SPACE_EM; / U+2003 EM SPACE /
232	case `0x2004u`: return SPACE_EM_3; / U+2004 THREE-PER-EM SPACE /
233	case `0x2005u`: return SPACE_EM_4; / U+2005 FOUR-PER-EM SPACE /
234	case `0x2006u`: return SPACE_EM_6; / U+2006 SIX-PER-EM SPACE /
235	case `0x2007u`: return SPACE_FIGURE; / U+2007 FIGURE SPACE /
236	case `0x2008u`: return SPACE_PUNCTUATION; / U+2008 PUNCTUATION SPACE /
237	case `0x2009u`: return SPACE_EM_5; / U+2009 THIN SPACE /
238	case `0x200Au`: return SPACE_EM_16; / U+200A HAIR SPACE /
239	case `0x202Fu`: return SPACE_NARROW; / U+202F NARROW NO-BREAK SPACE /
240	case `0x205Fu`: return SPACE_4_EM_18; / U+205F MEDIUM MATHEMATICAL SPACE /
241	case `0x3000u`: return SPACE_EM; / U+3000 IDEOGRAPHIC SPACE /
242	}
243	}
244
245	struct {
246	#define HB_UNICODE_FUNC_IMPLEMENT(name) hb_unicode_##name##_func_t name;
247	HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS
248	#undef HB_UNICODE_FUNC_IMPLEMENT
249	} func;
250
251	struct {
252	#define HB_UNICODE_FUNC_IMPLEMENT(name) void *name;
253	HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS
254	#undef HB_UNICODE_FUNC_IMPLEMENT
255	} user_data;
256
257	struct {
258	#define HB_UNICODE_FUNC_IMPLEMENT(name) hb_destroy_func_t name;
259	HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS
260	#undef HB_UNICODE_FUNC_IMPLEMENT
261	} destroy;
262	};
263	DECLARE_NULL_INSTANCE (hb_unicode_funcs_t);
264
265
266	/*
267	* Modified combining marks
268	*/
269
/* Hebrew
 *
 * We permute the "fixed-position" classes 10-26 into the order
 * described in the SBL Hebrew manual:
 *
 * https://www.sbl-site.org/Fonts/SBLHebrewUserManual1.5x.pdf
 *
 * (as recommended by:
 *  https://forum.fontlab.com/archive-old-microsoft-volt-group/vista-and-diacritic-ordering/msg22823/)
 *
 * More details here:
 * https://bugzilla.mozilla.org/show_bug.cgi?id=662055
 */
#define HB_MODIFIED_COMBINING_CLASS_CCC10 22 /* sheva */
#define HB_MODIFIED_COMBINING_CLASS_CCC11 15 /* hataf segol */
#define HB_MODIFIED_COMBINING_CLASS_CCC12 16 /* hataf patah */
#define HB_MODIFIED_COMBINING_CLASS_CCC13 17 /* hataf qamats */
#define HB_MODIFIED_COMBINING_CLASS_CCC14 23 /* hiriq */
#define HB_MODIFIED_COMBINING_CLASS_CCC15 18 /* tsere */
#define HB_MODIFIED_COMBINING_CLASS_CCC16 19 /* segol */
#define HB_MODIFIED_COMBINING_CLASS_CCC17 20 /* patah */
#define HB_MODIFIED_COMBINING_CLASS_CCC18 21 /* qamats */
#define HB_MODIFIED_COMBINING_CLASS_CCC19 14 /* holam */
#define HB_MODIFIED_COMBINING_CLASS_CCC20 24 /* qubuts */
#define HB_MODIFIED_COMBINING_CLASS_CCC21 12 /* dagesh */
#define HB_MODIFIED_COMBINING_CLASS_CCC22 25 /* meteg */
#define HB_MODIFIED_COMBINING_CLASS_CCC23 13 /* rafe */
#define HB_MODIFIED_COMBINING_CLASS_CCC24 10 /* shin dot */
#define HB_MODIFIED_COMBINING_CLASS_CCC25 11 /* sin dot */
#define HB_MODIFIED_COMBINING_CLASS_CCC26 26 /* point varika */
300
/*
 * Arabic
 *
 * Modify to move Shadda (ccc=33) before other marks.  See:
 * https://unicode.org/faq/normalization.html#8
 * https://unicode.org/faq/normalization.html#9
 */
#define HB_MODIFIED_COMBINING_CLASS_CCC27 28 /* fathatan */
#define HB_MODIFIED_COMBINING_CLASS_CCC28 29 /* dammatan */
#define HB_MODIFIED_COMBINING_CLASS_CCC29 30 /* kasratan */
#define HB_MODIFIED_COMBINING_CLASS_CCC30 31 /* fatha */
#define HB_MODIFIED_COMBINING_CLASS_CCC31 32 /* damma */
#define HB_MODIFIED_COMBINING_CLASS_CCC32 33 /* kasra */
#define HB_MODIFIED_COMBINING_CLASS_CCC33 27 /* shadda */
#define HB_MODIFIED_COMBINING_CLASS_CCC34 34 /* sukun */
#define HB_MODIFIED_COMBINING_CLASS_CCC35 35 /* superscript alef */
317
/* Syriac */
#define HB_MODIFIED_COMBINING_CLASS_CCC36 36 /* superscript alaph */
320
/* Telugu
 *
 * Modify Telugu length marks (ccc=84, ccc=91).
 * These are the only matras in the main Indic scripts range that have
 * a non-zero ccc.  That makes them reorder with the Halant that is
 * ccc=9.  Just zero them, we don't need them in our Indic shaper.
 */
#define HB_MODIFIED_COMBINING_CLASS_CCC84 0 /* length mark */
#define HB_MODIFIED_COMBINING_CLASS_CCC91 0 /* ai length mark */
330
/* Thai
 *
 * Modify U+0E38 and U+0E39 (ccc=103) to be reordered before U+0E3A (ccc=9).
 * Assign 3, which is unassigned otherwise.
 * Uniscribe does this reordering too.
 */
#define HB_MODIFIED_COMBINING_CLASS_CCC103 3 /* sara u / sara uu */
#define HB_MODIFIED_COMBINING_CLASS_CCC107 107 /* mai * */
339
/* Lao */
#define HB_MODIFIED_COMBINING_CLASS_CCC118 118 /* sign u / sign uu */
#define HB_MODIFIED_COMBINING_CLASS_CCC122 122 /* mai * */
342	#define HB_MODIFIED_COMBINING_CLASS_CCC122 122 /* mai * */
343
/* Tibetan
 *
 * In case of multiple vowel-signs, use u first (but after achung)
 * this allows Dzongkha multi-vowel shortcuts to render correctly
 */
#define HB_MODIFIED_COMBINING_CLASS_CCC129 129 /* sign aa */
#define HB_MODIFIED_COMBINING_CLASS_CCC130 132 /* sign i */
#define HB_MODIFIED_COMBINING_CLASS_CCC132 131 /* sign u */
352
/* Misc */

/* Non-zero when the flag for gen_cat intersects the combined flags of the
 * spacing-mark, enclosing-mark and non-spacing-mark general categories.
 * FLAG and FLAG_UNSAFE are defined elsewhere. */
#define HB_UNICODE_GENERAL_CATEGORY_IS_MARK(gen_cat) \
        (FLAG_UNSAFE (gen_cat) & \
         (FLAG (HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK) | \
          FLAG (HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK) | \
          FLAG (HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK)))
360
361
362	/*
363	* Ranges, used for bsearch tables.
364	*/
365
366	struct hb_unicode_range_t
367	{
368	static int
369	cmp (const void _key, const* void *_item)
370	{
371	hb_codepoint_t cp = ((hb_codepoint_t ) _key);
372	const hb_unicode_range_t range = (hb_unicode_range_t ) _item;
373
374	if (cp < range->start)
375	return -`1`;
376	else if (cp <= range->end)
377	return `0`;
378	else
379	return +`1`;
380	}
381
382	hb_codepoint_t start;
383	hb_codepoint_t end;
384	};
385
386	/*
387	* Emoji.
388	*/
389
390	HB_INTERNAL bool
391	_hb_unicode_is_emoji_Extended_Pictographic (hb_codepoint_t cp);
392
393
394	#endif /* HB_UNICODE_HH */
395

Browse the source code of OpenJDK/src/java.desktop/share/native/libfontmanager/harfbuzz/hb-unicode.hh