hb-ot-shape-complex-thai.cc source code [MuPDF/thirdparty/harfbuzz/src/hb-ot-shape-complex-thai.cc]

1	/*
2	* Copyright © 2010,2012 Google, Inc.
3	*
4	* This is part of HarfBuzz, a text shaping library.
5	*
6	* Permission is hereby granted, without written agreement and without
7	* license or royalty fees, to use, copy, modify, and distribute this
8	* software and its documentation for any purpose, provided that the
9	* above copyright notice and the following two paragraphs appear in
10	* all copies of this software.
11	*
12	* IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
13	* DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
14	* ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
15	* IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
16	* DAMAGE.
17	*
18	* THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
19	* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
20	* FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
21	* ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
22	* PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
23	*
24	* Google Author(s): Behdad Esfahbod
25	*/
26
27	#include "hb-ot-shape-complex.hh"
28
29
/* Thai / Lao shaper */
31
32
/* PUA shaping */
34
35
/* Consonant classes used to pick the starting state of the above/below
 * state machines.  The numeric values double as indices into the
 * *_start_state tables, so the order must not change. */
enum thai_consonant_type_t
{
  NC = 0,        /* Any other codepoint in U+0E01..U+0E2E. */
  AC,            /* U+0E1B, U+0E1D, U+0E1F (presumably "ascender consonant"). */
  RC,            /* U+0E0D, U+0E10: base with a removable descender. */
  DC,            /* U+0E0E, U+0E0F: base with a strict descender. */
  NOT_CONSONANT, /* Everything else. */

  /* Number of real consonant classes; NOT_CONSONANT is deliberately
   * excluded, so tables that need a slot for it use this value + 1. */
  NUM_CONSONANT_TYPES = NOT_CONSONANT
};
45
46	static thai_consonant_type_t
47	get_consonant_type (hb_codepoint_t u)
48	{
49	if (u == `0x0E1Bu` \|\| u == `0x0E1Du` \|\| u == `0x0E1Fu`/ \|\| u == 0x0E2Cu/)
50	return AC;
51	if (u == `0x0E0Du` \|\| u == `0x0E10u`)
52	return RC;
53	if (u == `0x0E0Eu` \|\| u == `0x0E0Fu`)
54	return DC;
55	if (hb_in_range<hb_codepoint_t> (u, `0x0E01u`, `0x0E2Eu`))
56	return NC;
57	return NOT_CONSONANT;
58	}
59
60
/* Combining-mark classes.  The numeric values are the column index into
 * the above/below state-machine tables, so the order must not change. */
enum thai_mark_type_t
{
  AV = 0,   /* U+0E31, U+0E34..U+0E37, U+0E47, U+0E4D..U+0E4E (presumably "above vowel"). */
  BV,       /* U+0E38..U+0E3A (presumably "below vowel"). */
  T,        /* U+0E48..U+0E4C (presumably "tone mark"). */
  NOT_MARK, /* Any other codepoint. */

  /* Number of real mark classes; NOT_MARK is excluded. */
  NUM_MARK_TYPES = NOT_MARK
};
69
70	static thai_mark_type_t
71	get_mark_type (hb_codepoint_t u)
72	{
73	if (u == `0x0E31u` \|\| hb_in_range<hb_codepoint_t> (u, `0x0E34u`, `0x0E37u`) \|\|
74	u == `0x0E47u` \|\| hb_in_range<hb_codepoint_t> (u, `0x0E4Du`, `0x0E4Eu`))
75	return AV;
76	if (hb_in_range<hb_codepoint_t> (u, `0x0E38u`, `0x0E3Au`))
77	return BV;
78	if (hb_in_range<hb_codepoint_t> (u, `0x0E48u`, `0x0E4Cu`))
79	return T;
80	return NOT_MARK;
81	}
82
83
/* Substitution to apply to a glyph when falling back to PUA shaping. */
enum thai_action_t
{
  NOP, /* Leave the codepoint unchanged */
  SD,  /* Shift combining-mark down */
  SL,  /* Shift combining-mark left */
  SDL, /* Shift combining-mark down-left */
  RD   /* Remove descender from base */
};
92
93	static hb_codepoint_t
94	thai_pua_shape (hb_codepoint_t u, thai_action_t action, hb_font_t *font)
95	{
96	struct thai_pua_mapping_t {
97	hb_codepoint_t u;
98	hb_codepoint_t win_pua;
99	hb_codepoint_t mac_pua;
100	} const pua_mappings = nullptr*;
101	static const thai_pua_mapping_t SD_mappings[] = {
102	{`0x0E48u`, `0xF70Au`, `0xF88Bu`}, / MAI EK /
103	{`0x0E49u`, `0xF70Bu`, `0xF88Eu`}, / MAI THO /
104	{`0x0E4Au`, `0xF70Cu`, `0xF891u`}, / MAI TRI /
105	{`0x0E4Bu`, `0xF70Du`, `0xF894u`}, / MAI CHATTAWA /
106	{`0x0E4Cu`, `0xF70Eu`, `0xF897u`}, / THANTHAKHAT /
107	{`0x0E38u`, `0xF718u`, `0xF89Bu`}, / SARA U /
108	{`0x0E39u`, `0xF719u`, `0xF89Cu`}, / SARA UU /
109	{`0x0E3Au`, `0xF71Au`, `0xF89Du`}, / PHINTHU /
110	{`0x0000u`, `0x0000u`, `0x0000u`}
111	};
112	static const thai_pua_mapping_t SDL_mappings[] = {
113	{`0x0E48u`, `0xF705u`, `0xF88Cu`}, / MAI EK /
114	{`0x0E49u`, `0xF706u`, `0xF88Fu`}, / MAI THO /
115	{`0x0E4Au`, `0xF707u`, `0xF892u`}, / MAI TRI /
116	{`0x0E4Bu`, `0xF708u`, `0xF895u`}, / MAI CHATTAWA /
117	{`0x0E4Cu`, `0xF709u`, `0xF898u`}, / THANTHAKHAT /
118	{`0x0000u`, `0x0000u`, `0x0000u`}
119	};
120	static const thai_pua_mapping_t SL_mappings[] = {
121	{`0x0E48u`, `0xF713u`, `0xF88Au`}, / MAI EK /
122	{`0x0E49u`, `0xF714u`, `0xF88Du`}, / MAI THO /
123	{`0x0E4Au`, `0xF715u`, `0xF890u`}, / MAI TRI /
124	{`0x0E4Bu`, `0xF716u`, `0xF893u`}, / MAI CHATTAWA /
125	{`0x0E4Cu`, `0xF717u`, `0xF896u`}, / THANTHAKHAT /
126	{`0x0E31u`, `0xF710u`, `0xF884u`}, / MAI HAN-AKAT /
127	{`0x0E34u`, `0xF701u`, `0xF885u`}, / SARA I /
128	{`0x0E35u`, `0xF702u`, `0xF886u`}, / SARA II /
129	{`0x0E36u`, `0xF703u`, `0xF887u`}, / SARA UE /
130	{`0x0E37u`, `0xF704u`, `0xF888u`}, / SARA UEE /
131	{`0x0E47u`, `0xF712u`, `0xF889u`}, / MAITAIKHU /
132	{`0x0E4Du`, `0xF711u`, `0xF899u`}, / NIKHAHIT /
133	{`0x0000u`, `0x0000u`, `0x0000u`}
134	};
135	static const thai_pua_mapping_t RD_mappings[] = {
136	{`0x0E0Du`, `0xF70Fu`, `0xF89Au`}, / YO YING /
137	{`0x0E10u`, `0xF700u`, `0xF89Eu`}, / THO THAN /
138	{`0x0000u`, `0x0000u`, `0x0000u`}
139	};
140
141	switch (action) {
142	case NOP: return u;
143	case SD: pua_mappings = SD_mappings; break;
144	case SDL: pua_mappings = SDL_mappings; break;
145	case SL: pua_mappings = SL_mappings; break;
146	case RD: pua_mappings = RD_mappings; break;
147	}
148	for (; pua_mappings->u; pua_mappings++)
149	if (pua_mappings->u == u)
150	{
151	hb_codepoint_t glyph;
152	if (hb_font_get_glyph (font, pua_mappings->win_pua, `0`, &glyph))
153	return pua_mappings->win_pua;
154	if (hb_font_get_glyph (font, pua_mappings->mac_pua, `0`, &glyph))
155	return pua_mappings->mac_pua;
156	break;
157	}
158	return u;
159	}
160
161
162	static enum thai_above_state_t
163	{ / Cluster above looks like: /
164	T0, / ⣤ /
165	T1, / ⣼ /
166	T2, / ⣾ /
167	T3, / ⣿ /
168	NUM_ABOVE_STATES
169	} thai_above_start_state[NUM_CONSONANT_TYPES + `1`/ For NOT_CONSONANT /] =
170	{
171	T0, / NC /
172	T1, / AC /
173	T0, / RC /
174	T0, / DC /
175	T3, / NOT_CONSONANT /
176	};
177
178	static const struct thai_above_state_machine_edge_t {
179	thai_action_t action;
180	thai_above_state_t next_state;
181	} thai_above_state_machine[NUM_ABOVE_STATES][NUM_MARK_TYPES] =
182	{ /AV/ /BV/ /T/
183	/T0/ {{NOP,T3}, {NOP,T0}, {SD, T3}},
184	/T1/ {{SL, T2}, {NOP,T1}, {SDL,T2}},
185	/T2/ {{NOP,T3}, {NOP,T2}, {SL, T3}},
186	/T3/ {{NOP,T3}, {NOP,T3}, {NOP,T3}},
187	};
188
189
190	static enum thai_below_state_t
191	{
192	B0, / No descender /
193	B1, / Removable descender /
194	B2, / Strict descender /
195	NUM_BELOW_STATES
196	} thai_below_start_state[NUM_CONSONANT_TYPES + `1`/ For NOT_CONSONANT /] =
197	{
198	B0, / NC /
199	B0, / AC /
200	B1, / RC /
201	B2, / DC /
202	B2, / NOT_CONSONANT /
203	};
204
205	static const struct thai_below_state_machine_edge_t {
206	thai_action_t action;
207	thai_below_state_t next_state;
208	} thai_below_state_machine[NUM_BELOW_STATES][NUM_MARK_TYPES] =
209	{ /AV/ /BV/ /T/
210	/B0/ {{NOP,B0}, {NOP,B2}, {NOP, B0}},
211	/B1/ {{NOP,B1}, {RD, B2}, {NOP, B1}},
212	/B2/ {{NOP,B2}, {SD, B2}, {NOP, B2}},
213	};
214
215
216	static void
217	do_thai_pua_shaping (const hb_ot_shape_plan_t *plan HB_UNUSED,
218	hb_buffer_t *buffer,
219	hb_font_t *font)
220	{
221	thai_above_state_t above_state = thai_above_start_state[NOT_CONSONANT];
222	thai_below_state_t below_state = thai_below_start_state[NOT_CONSONANT];
223	unsigned int base = `0`;
224
225	hb_glyph_info_t *info = buffer->info;
226	unsigned int count = buffer->len;
227	for (unsigned int i = `0`; i < count; i++)
228	{
229	thai_mark_type_t mt = get_mark_type (info[i].codepoint);
230
231	if (mt == NOT_MARK) {
232	thai_consonant_type_t ct = get_consonant_type (info[i].codepoint);
233	above_state = thai_above_start_state[ct];
234	below_state = thai_below_start_state[ct];
235	base = i;
236	continue;
237	}
238
239	const thai_above_state_machine_edge_t &above_edge = thai_above_state_machine[above_state][mt];
240	const thai_below_state_machine_edge_t &below_edge = thai_below_state_machine[below_state][mt];
241	above_state = above_edge.next_state;
242	below_state = below_edge.next_state;
243
244	/ At least one of the above/below actions is NOP. /
245	thai_action_t action = above_edge.action != NOP ? above_edge.action : below_edge.action;
246
247	buffer->unsafe_to_break (base, i);
248	if (action == RD)
249	info[base].codepoint = thai_pua_shape (info[base].codepoint, action, font);
250	else
251	info[i].codepoint = thai_pua_shape (info[i].codepoint, action, font);
252	}
253	}
254
255
256	static void
257	preprocess_text_thai (const hb_ot_shape_plan_t *plan,
258	hb_buffer_t *buffer,
259	hb_font_t *font)
260	{
261	/ This function implements the shaping logic documented here:*
262	*
263	* https://linux.thai.net/~thep/th-otf/shaping.html
264	*
265	* The first shaping rule listed there is needed even if the font has Thai
266	* OpenType tables. The rest do fallback positioning based on PUA codepoints.
267	* We implement that only if there exist no Thai GSUB in the font.
268	*/
269
270	/ The following is NOT specified in the MS OT Thai spec, however, it seems*
271	* to be what Uniscribe and other engines implement. According to Eric Muller:
272	*
273	* When you have a SARA AM, decompose it in NIKHAHIT + SARA AA, and move the
274	* NIKHAHIT backwards over any tone mark (0E48-0E4B).
275	*
276	* <0E14, 0E4B, 0E33> -> <0E14, 0E4D, 0E4B, 0E32>
277	*
278	* This reordering is legit only when the NIKHAHIT comes from a SARA AM, not
279	* when it's there to start with. The string <0E14, 0E4B, 0E4D> is probably
280	* not what a user wanted, but the rendering is nevertheless nikhahit above
281	* chattawa.
282	*
283	* Same for Lao.
284	*
285	* Note:
286	*
287	* Uniscribe also does some below-marks reordering. Namely, it positions U+0E3A
288	* after U+0E38 and U+0E39. We do that by modifying the ccc for U+0E3A.
289	* See unicode->modified_combining_class (). Lao does NOT have a U+0E3A
290	* equivalent.
291	*/
292
293
294	/*
295	* Here are the characters of significance:
296	*
297	* Thai Lao
298	* SARA AM: U+0E33 U+0EB3
299	* SARA AA: U+0E32 U+0EB2
300	* Nikhahit: U+0E4D U+0ECD
301	*
302	* Testing shows that Uniscribe reorder the following marks:
303	* Thai: <0E31,0E34..0E37,0E47..0E4E>
304	* Lao: <0EB1,0EB4..0EB7,0EC7..0ECE>
305	*
306	* Note how the Lao versions are the same as Thai + 0x80.
307	*/
308
309	/ We only get one script at a time, so a script-agnostic implementation*
310	* is adequate here. */
311	#define IS_SARA_AM(x) (((x) & ~0x0080u) == 0x0E33u)
312	#define NIKHAHIT_FROM_SARA_AM(x) ((x) - 0x0E33u + 0x0E4Du)
313	#define SARA_AA_FROM_SARA_AM(x) ((x) - 1)
314	#define IS_TONE_MARK(x) (hb_in_ranges<hb_codepoint_t> ((x) & ~0x0080u, 0x0E34u, 0x0E37u, 0x0E47u, 0x0E4Eu, 0x0E31u, 0x0E31u))
315
316	buffer->clear_output ();
317	unsigned int count = buffer->len;
318	for (buffer->idx = `0`; buffer->idx < count && buffer->successful;)
319	{
320	hb_codepoint_t u = buffer->cur().codepoint;
321	if (likely (!IS_SARA_AM (u))) {
322	buffer->next_glyph ();
323	continue;
324	}
325
326	/ Is SARA AM. Decompose and reorder. /
327	hb_codepoint_t decomposed[`2`] = {hb_codepoint_t (NIKHAHIT_FROM_SARA_AM (u)),
328	hb_codepoint_t (SARA_AA_FROM_SARA_AM (u))};
329	buffer->replace_glyphs (`1`, `2`, decomposed);
330	if (unlikely (!buffer->successful))
331	return;
332
333	/ Make Nikhahit be recognized as a ccc=0 mark when zeroing widths. /
334	unsigned int end = buffer->out_len;
335	_hb_glyph_info_set_general_category (&buffer->out_info[end - `2`], HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK);
336
337	/ Ok, let's see... /
338	unsigned int start = end - `2`;
339	while (start > `0` && IS_TONE_MARK (buffer->out_info[start - `1`].codepoint))
340	start--;
341
342	if (start + `2` < end)
343	{
344	/ Move Nikhahit (end-2) to the beginning /
345	buffer->merge_out_clusters (start, end);
346	hb_glyph_info_t t = buffer->out_info[end - `2`];
347	memmove (buffer->out_info + start + `1`,
348	buffer->out_info + start,
349	sizeof (buffer->out_info[`0`]) * (end - start - `2`));
350	buffer->out_info[start] = t;
351	}
352	else
353	{
354	/ Since we decomposed, and NIKHAHIT is combining, merge clusters with the*
355	* previous cluster. */
356	if (start && buffer->cluster_level == HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES)
357	buffer->merge_out_clusters (start - `1`, end);
358	}
359	}
360	buffer->swap_buffers ();
361
362	/ If font has Thai GSUB, we are done. /
363	if (plan->props.script == HB_SCRIPT_THAI && !plan->map.found_script[`0`])
364	do_thai_pua_shaping (plan, buffer, font);
365	}
366
367	const hb_ot_complex_shaper_t _hb_ot_complex_shaper_thai =
368	{
369	nullptr, / collect_features /
370	nullptr, / override_features /
371	nullptr, / data_create /
372	nullptr, / data_destroy /
373	preprocess_text_thai,
374	nullptr, / postprocess_glyphs /
375	HB_OT_SHAPE_NORMALIZATION_MODE_DEFAULT,
376	nullptr, / decompose /
377	nullptr, / compose /
378	nullptr, / setup_masks /
379	nullptr, / disable_otl /
380	nullptr, / reorder_marks /
381	HB_OT_SHAPE_ZERO_WIDTH_MARKS_BY_GDEF_LATE,
382	false,/ fallback_position /
383	};
384

Browse the source code of MuPDF/thirdparty/harfbuzz/src/hb-ot-shape-complex-thai.cc