hb-utf.hh source code [Godot/thirdparty/harfbuzz/src/hb-utf.hh]

1	/*
2	* Copyright © 2011,2012,2014 Google, Inc.
3	*
4	* This is part of HarfBuzz, a text shaping library.
5	*
6	* Permission is hereby granted, without written agreement and without
7	* license or royalty fees, to use, copy, modify, and distribute this
8	* software and its documentation for any purpose, provided that the
9	* above copyright notice and the following two paragraphs appear in
10	* all copies of this software.
11	*
12	* IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
13	* DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
14	* ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
15	* IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
16	* DAMAGE.
17	*
18	* THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
19	* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
20	* FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
21	* ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
22	* PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
23	*
24	* Google Author(s): Behdad Esfahbod
25	*/
26
27	#ifndef HB_UTF_HH
28	#define HB_UTF_HH
29
30	#include "hb.hh"
31
32	#include "hb-open-type.hh"
33
34
35	struct hb_utf8_t
36	{
37	typedef uint8_t codepoint_t;
38	static constexpr unsigned max_len = `4`;
39
40	static const codepoint_t *
41	next (const codepoint_t *text,
42	const codepoint_t *end,
43	hb_codepoint_t *unicode,
44	hb_codepoint_t replacement)
45	{
46	/ Written to only accept well-formed sequences.*
47	* Based on ideas from ICU's U8_NEXT.
48	* Generates one "replacement" for each ill-formed byte. */
49
50	hb_codepoint_t c = *text++;
51
52	if (c > `0x7Fu`)
53	{
54	if (hb_in_range<hb_codepoint_t> (c, `0xC2u`, `0xDFu`)) / Two-byte /
55	{
56	unsigned int t1;
57	if (likely (text < end &&
58	(t1 = text[`0`] - `0x80u`) <= `0x3Fu`))
59	{
60	c = ((c&`0x1Fu`)<<`6`) \| t1;
61	text++;
62	}
63	else
64	goto error;
65	}
66	else if (hb_in_range<hb_codepoint_t> (c, `0xE0u`, `0xEFu`)) / Three-byte /
67	{
68	unsigned int t1, t2;
69	if (likely (`1` < end - text &&
70	(t1 = text[`0`] - `0x80u`) <= `0x3Fu` &&
71	(t2 = text[`1`] - `0x80u`) <= `0x3Fu`))
72	{
73	c = ((c&`0xFu`)<<`12`) \| (t1<<`6`) \| t2;
74	if (unlikely (c < `0x0800u` \|\| hb_in_range<hb_codepoint_t> (c, `0xD800u`, `0xDFFFu`)))
75	goto error;
76	text += `2`;
77	}
78	else
79	goto error;
80	}
81	else if (hb_in_range<hb_codepoint_t> (c, `0xF0u`, `0xF4u`)) / Four-byte /
82	{
83	unsigned int t1, t2, t3;
84	if (likely (`2` < end - text &&
85	(t1 = text[`0`] - `0x80u`) <= `0x3Fu` &&
86	(t2 = text[`1`] - `0x80u`) <= `0x3Fu` &&
87	(t3 = text[`2`] - `0x80u`) <= `0x3Fu`))
88	{
89	c = ((c&`0x7u`)<<`18`) \| (t1<<`12`) \| (t2<<`6`) \| t3;
90	if (unlikely (!hb_in_range<hb_codepoint_t> (c, `0x10000u`, `0x10FFFFu`)))
91	goto error;
92	text += `3`;
93	}
94	else
95	goto error;
96	}
97	else
98	goto error;
99	}
100
101	*unicode = c;
102	return text;
103
104	error:
105	*unicode = replacement;
106	return text;
107	}
108
109	static const codepoint_t *
110	prev (const codepoint_t *text,
111	const codepoint_t *start,
112	hb_codepoint_t *unicode,
113	hb_codepoint_t replacement)
114	{
115	const codepoint_t *end = text--;
116	while (start < text && (*text & `0xc0`) == `0x80` && end - text < `4`)
117	text--;
118
119	if (likely (next (text, end, unicode, replacement) == end))
120	return text;
121
122	*unicode = replacement;
123	return end - `1`;
124	}
125
126	static unsigned int
127	strlen (const codepoint_t *text)
128	{ return ::strlen ((const char *) text); }
129
130	static unsigned int
131	encode_len (hb_codepoint_t unicode)
132	{
133	if (unicode < `0x0080u`) return `1`;
134	if (unicode < `0x0800u`) return `2`;
135	if (unicode < `0x10000u`) return `3`;
136	if (unicode < `0x110000u`) return `4`;
137	return `3`;
138	}
139
140	static codepoint_t *
141	encode (codepoint_t *text,
142	const codepoint_t *end,
143	hb_codepoint_t unicode)
144	{
145	if (unlikely (unicode >= `0xD800u` && (unicode <= `0xDFFFu` \|\| unicode > `0x10FFFFu`)))
146	unicode = `0xFFFDu`;
147	if (unicode < `0x0080u`)
148	*text++ = unicode;
149	else if (unicode < `0x0800u`)
150	{
151	if (end - text >= `2`)
152	{
153	*text++ = `0xC0u` + (`0x1Fu` & (unicode >> `6`));
154	*text++ = `0x80u` + (`0x3Fu` & (unicode ));
155	}
156	}
157	else if (unicode < `0x10000u`)
158	{
159	if (end - text >= `3`)
160	{
161	*text++ = `0xE0u` + (`0x0Fu` & (unicode >> `12`));
162	*text++ = `0x80u` + (`0x3Fu` & (unicode >> `6`));
163	*text++ = `0x80u` + (`0x3Fu` & (unicode ));
164	}
165	}
166	else
167	{
168	if (end - text >= `4`)
169	{
170	*text++ = `0xF0u` + (`0x07u` & (unicode >> `18`));
171	*text++ = `0x80u` + (`0x3Fu` & (unicode >> `12`));
172	*text++ = `0x80u` + (`0x3Fu` & (unicode >> `6`));
173	*text++ = `0x80u` + (`0x3Fu` & (unicode ));
174	}
175	}
176	return text;
177	}
178	};
179
180
181	template <typename TCodepoint>
182	struct hb_utf16_xe_t
183	{
184	static_assert (sizeof (TCodepoint) == `2`, "");
185	typedef TCodepoint codepoint_t;
186	static constexpr unsigned max_len = `2`;
187
188	static const codepoint_t *
189	next (const codepoint_t *text,
190	const codepoint_t *end,
191	hb_codepoint_t *unicode,
192	hb_codepoint_t replacement)
193	{
194	hb_codepoint_t c = *text++;
195
196	if (likely (!hb_in_range<hb_codepoint_t> (c, `0xD800u`, `0xDFFFu`)))
197	{
198	*unicode = c;
199	return text;
200	}
201
202	if (likely (c <= `0xDBFFu` && text < end))
203	{
204	/ High-surrogate in c /
205	hb_codepoint_t l = *text;
206	if (likely (hb_in_range<hb_codepoint_t> (l, `0xDC00u`, `0xDFFFu`)))
207	{
208	/ Low-surrogate in l /
209	*unicode = (c << `10`) + l - ((`0xD800u` << `10`) - `0x10000u` + `0xDC00u`);
210	text++;
211	return text;
212	}
213	}
214
215	/ Lonely / out-of-order surrogate. /
216	*unicode = replacement;
217	return text;
218	}
219
220	static const codepoint_t *
221	prev (const codepoint_t *text,
222	const codepoint_t *start,
223	hb_codepoint_t *unicode,
224	hb_codepoint_t replacement)
225	{
226	hb_codepoint_t c = *--text;
227
228	if (likely (!hb_in_range<hb_codepoint_t> (c, `0xD800u`, `0xDFFFu`)))
229	{
230	*unicode = c;
231	return text;
232	}
233
234	if (likely (c >= `0xDC00u` && start < text))
235	{
236	/ Low-surrogate in c /
237	hb_codepoint_t h = text[-`1`];
238	if (likely (hb_in_range<hb_codepoint_t> (h, `0xD800u`, `0xDBFFu`)))
239	{
240	/ High-surrogate in h /
241	*unicode = (h << `10`) + c - ((`0xD800u` << `10`) - `0x10000u` + `0xDC00u`);
242	text--;
243	return text;
244	}
245	}
246
247	/ Lonely / out-of-order surrogate. /
248	*unicode = replacement;
249	return text;
250	}
251
252
253	static unsigned int
254	strlen (const codepoint_t *text)
255	{
256	unsigned int l = `0`;
257	while (*text++) l++;
258	return l;
259	}
260
261	static unsigned int
262	encode_len (hb_codepoint_t unicode)
263	{
264	return unicode < `0x10000` ? `1` : `2`;
265	}
266
267	static codepoint_t *
268	encode (codepoint_t *text,
269	const codepoint_t *end,
270	hb_codepoint_t unicode)
271	{
272	if (unlikely (unicode >= `0xD800u` && (unicode <= `0xDFFFu` \|\| unicode > `0x10FFFFu`)))
273	unicode = `0xFFFDu`;
274	if (unicode < `0x10000u`)
275	*text++ = unicode;
276	else if (end - text >= `2`)
277	{
278	unicode -= `0x10000u`;
279	*text++ = `0xD800u` + (unicode >> `10`);
280	*text++ = `0xDC00u` + (unicode & `0x03FFu`);
281	}
282	return text;
283	}
284	};
285
286	typedef hb_utf16_xe_t<uint16_t> hb_utf16_t;
287	typedef hb_utf16_xe_t<OT::HBUINT16> hb_utf16_be_t;
288
289
290	template <typename TCodepoint, bool validate=true>
291	struct hb_utf32_xe_t
292	{
293	static_assert (sizeof (TCodepoint) == `4`, "");
294	typedef TCodepoint codepoint_t;
295	static constexpr unsigned max_len = `1`;
296
297	static const TCodepoint *
298	next (const TCodepoint *text,
299	const TCodepoint *end HB_UNUSED,
300	hb_codepoint_t *unicode,
301	hb_codepoint_t replacement)
302	{
303	hb_codepoint_t c = unicode = text++;
304	if (validate && unlikely (c >= `0xD800u` && (c <= `0xDFFFu` \|\| c > `0x10FFFFu`)))
305	*unicode = replacement;
306	return text;
307	}
308
309	static const TCodepoint *
310	prev (const TCodepoint *text,
311	const TCodepoint *start HB_UNUSED,
312	hb_codepoint_t *unicode,
313	hb_codepoint_t replacement)
314	{
315	hb_codepoint_t c = unicode = --text;
316	if (validate && unlikely (c >= `0xD800u` && (c <= `0xDFFFu` \|\| c > `0x10FFFFu`)))
317	*unicode = replacement;
318	return text;
319	}
320
321	static unsigned int
322	strlen (const TCodepoint *text)
323	{
324	unsigned int l = `0`;
325	while (*text++) l++;
326	return l;
327	}
328
329	static unsigned int
330	encode_len (hb_codepoint_t unicode HB_UNUSED)
331	{
332	return `1`;
333	}
334
335	static codepoint_t *
336	encode (codepoint_t *text,
337	const codepoint_t *end HB_UNUSED,
338	hb_codepoint_t unicode)
339	{
340	if (validate && unlikely (unicode >= `0xD800u` && (unicode <= `0xDFFFu` \|\| unicode > `0x10FFFFu`)))
341	unicode = `0xFFFDu`;
342	*text++ = unicode;
343	return text;
344	}
345	};
346
347	typedef hb_utf32_xe_t<uint32_t> hb_utf32_t;
348	typedef hb_utf32_xe_t<uint32_t, false> hb_utf32_novalidate_t;
349
350
351	struct hb_latin1_t
352	{
353	typedef uint8_t codepoint_t;
354	static constexpr unsigned max_len = `1`;
355
356	static const codepoint_t *
357	next (const codepoint_t *text,
358	const codepoint_t *end HB_UNUSED,
359	hb_codepoint_t *unicode,
360	hb_codepoint_t replacement HB_UNUSED)
361	{
362	unicode = text++;
363	return text;
364	}
365
366	static const codepoint_t *
367	prev (const codepoint_t *text,
368	const codepoint_t *start HB_UNUSED,
369	hb_codepoint_t *unicode,
370	hb_codepoint_t replacement HB_UNUSED)
371	{
372	unicode = --text;
373	return text;
374	}
375
376	static unsigned int
377	strlen (const codepoint_t *text)
378	{
379	unsigned int l = `0`;
380	while (*text++) l++;
381	return l;
382	}
383
384	static unsigned int
385	encode_len (hb_codepoint_t unicode HB_UNUSED)
386	{
387	return `1`;
388	}
389
390	static codepoint_t *
391	encode (codepoint_t *text,
392	const codepoint_t *end HB_UNUSED,
393	hb_codepoint_t unicode)
394	{
395	if (unlikely (unicode >= `0x0100u`))
396	unicode = `'?'`;
397	*text++ = unicode;
398	return text;
399	}
400	};
401
402
403	struct hb_ascii_t
404	{
405	typedef uint8_t codepoint_t;
406	static constexpr unsigned max_len = `1`;
407
408	static const codepoint_t *
409	next (const codepoint_t *text,
410	const codepoint_t *end HB_UNUSED,
411	hb_codepoint_t *unicode,
412	hb_codepoint_t replacement)
413	{
414	unicode = text++;
415	if (*unicode >= `0x0080u`)
416	*unicode = replacement;
417	return text;
418	}
419
420	static const codepoint_t *
421	prev (const codepoint_t *text,
422	const codepoint_t *start HB_UNUSED,
423	hb_codepoint_t *unicode,
424	hb_codepoint_t replacement)
425	{
426	unicode = --text;
427	if (*unicode >= `0x0080u`)
428	*unicode = replacement;
429	return text;
430	}
431
432	static unsigned int
433	strlen (const codepoint_t *text)
434	{
435	unsigned int l = `0`;
436	while (*text++) l++;
437	return l;
438	}
439
440	static unsigned int
441	encode_len (hb_codepoint_t unicode HB_UNUSED)
442	{
443	return `1`;
444	}
445
446	static codepoint_t *
447	encode (codepoint_t *text,
448	const codepoint_t *end HB_UNUSED,
449	hb_codepoint_t unicode)
450	{
451	if (unlikely (unicode >= `0x0080u`))
452	unicode = `'?'`;
453	*text++ = unicode;
454	return text;
455	}
456	};
457
458	template <typename utf_t>
459	static inline const typename utf_t::codepoint_t *
460	hb_utf_offset_to_pointer (const typename utf_t::codepoint_t *start,
461	signed offset)
462	{
463	hb_codepoint_t unicode;
464
465	while (offset-- > `0`)
466	start = utf_t::next (start,
467	start + utf_t::max_len,
468	&unicode,
469	HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT);
470
471	while (offset++ < `0`)
472	start = utf_t::prev (start,
473	start - utf_t::max_len,
474	&unicode,
475	HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT);
476
477	return start;
478	}
479
480
481	#endif /* HB_UTF_HH */
482

Browse the source code of Godot/thirdparty/harfbuzz/src/hb-utf.hh