hb-utf.hh source code [Skia/third_party/externals/harfbuzz/src/hb-utf.hh]

1	/*
2	* Copyright © 2011,2012,2014 Google, Inc.
3	*
4	* This is part of HarfBuzz, a text shaping library.
5	*
6	* Permission is hereby granted, without written agreement and without
7	* license or royalty fees, to use, copy, modify, and distribute this
8	* software and its documentation for any purpose, provided that the
9	* above copyright notice and the following two paragraphs appear in
10	* all copies of this software.
11	*
12	* IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
13	* DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
14	* ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
15	* IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
16	* DAMAGE.
17	*
18	* THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
19	* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
20	* FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
21	* ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
22	* PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
23	*
24	* Google Author(s): Behdad Esfahbod
25	*/
26
27	#ifndef HB_UTF_HH
28	#define HB_UTF_HH
29
30	#include "hb.hh"
31
32	#include "hb-open-type.hh"
33
34
35	struct hb_utf8_t
36	{
37	typedef uint8_t codepoint_t;
38
39	static const codepoint_t *
40	next (const codepoint_t *text,
41	const codepoint_t *end,
42	hb_codepoint_t *unicode,
43	hb_codepoint_t replacement)
44	{
45	/ Written to only accept well-formed sequences.*
46	* Based on ideas from ICU's U8_NEXT.
47	* Generates one "replacement" for each ill-formed byte. */
48
49	hb_codepoint_t c = *text++;
50
51	if (c > `0x7Fu`)
52	{
53	if (hb_in_range<hb_codepoint_t> (c, `0xC2u`, `0xDFu`)) / Two-byte /
54	{
55	unsigned int t1;
56	if (likely (text < end &&
57	(t1 = text[`0`] - `0x80u`) <= `0x3Fu`))
58	{
59	c = ((c&`0x1Fu`)<<`6`) \| t1;
60	text++;
61	}
62	else
63	goto error;
64	}
65	else if (hb_in_range<hb_codepoint_t> (c, `0xE0u`, `0xEFu`)) / Three-byte /
66	{
67	unsigned int t1, t2;
68	if (likely (`1` < end - text &&
69	(t1 = text[`0`] - `0x80u`) <= `0x3Fu` &&
70	(t2 = text[`1`] - `0x80u`) <= `0x3Fu`))
71	{
72	c = ((c&`0xFu`)<<`12`) \| (t1<<`6`) \| t2;
73	if (unlikely (c < `0x0800u` \|\| hb_in_range<hb_codepoint_t> (c, `0xD800u`, `0xDFFFu`)))
74	goto error;
75	text += `2`;
76	}
77	else
78	goto error;
79	}
80	else if (hb_in_range<hb_codepoint_t> (c, `0xF0u`, `0xF4u`)) / Four-byte /
81	{
82	unsigned int t1, t2, t3;
83	if (likely (`2` < end - text &&
84	(t1 = text[`0`] - `0x80u`) <= `0x3Fu` &&
85	(t2 = text[`1`] - `0x80u`) <= `0x3Fu` &&
86	(t3 = text[`2`] - `0x80u`) <= `0x3Fu`))
87	{
88	c = ((c&`0x7u`)<<`18`) \| (t1<<`12`) \| (t2<<`6`) \| t3;
89	if (unlikely (!hb_in_range<hb_codepoint_t> (c, `0x10000u`, `0x10FFFFu`)))
90	goto error;
91	text += `3`;
92	}
93	else
94	goto error;
95	}
96	else
97	goto error;
98	}
99
100	*unicode = c;
101	return text;
102
103	error:
104	*unicode = replacement;
105	return text;
106	}
107
108	static const codepoint_t *
109	prev (const codepoint_t *text,
110	const codepoint_t *start,
111	hb_codepoint_t *unicode,
112	hb_codepoint_t replacement)
113	{
114	const codepoint_t *end = text--;
115	while (start < text && (*text & `0xc0`) == `0x80` && end - text < `4`)
116	text--;
117
118	if (likely (next (text, end, unicode, replacement) == end))
119	return text;
120
121	*unicode = replacement;
122	return end - `1`;
123	}
124
125	static unsigned int
126	strlen (const codepoint_t *text)
127	{ return ::strlen ((const char *) text); }
128
129	static unsigned int
130	encode_len (hb_codepoint_t unicode)
131	{
132	if (unicode < `0x0080u`) return `1`;
133	if (unicode < `0x0800u`) return `2`;
134	if (unicode < `0x10000u`) return `3`;
135	if (unicode < `0x110000u`) return `4`;
136	return `3`;
137	}
138
139	static codepoint_t *
140	encode (codepoint_t *text,
141	const codepoint_t *end,
142	hb_codepoint_t unicode)
143	{
144	if (unlikely (unicode >= `0xD800u` && (unicode <= `0xDFFFu` \|\| unicode > `0x10FFFFu`)))
145	unicode = `0xFFFDu`;
146	if (unicode < `0x0080u`)
147	*text++ = unicode;
148	else if (unicode < `0x0800u`)
149	{
150	if (end - text >= `2`)
151	{
152	*text++ = `0xC0u` + (`0x1Fu` & (unicode >> `6`));
153	*text++ = `0x80u` + (`0x3Fu` & (unicode ));
154	}
155	}
156	else if (unicode < `0x10000u`)
157	{
158	if (end - text >= `3`)
159	{
160	*text++ = `0xE0u` + (`0x0Fu` & (unicode >> `12`));
161	*text++ = `0x80u` + (`0x3Fu` & (unicode >> `6`));
162	*text++ = `0x80u` + (`0x3Fu` & (unicode ));
163	}
164	}
165	else
166	{
167	if (end - text >= `4`)
168	{
169	*text++ = `0xF0u` + (`0x07u` & (unicode >> `18`));
170	*text++ = `0x80u` + (`0x3Fu` & (unicode >> `12`));
171	*text++ = `0x80u` + (`0x3Fu` & (unicode >> `6`));
172	*text++ = `0x80u` + (`0x3Fu` & (unicode ));
173	}
174	}
175	return text;
176	}
177	};
178
179
180	template <typename TCodepoint>
181	struct hb_utf16_xe_t
182	{
183	static_assert (sizeof (TCodepoint) == `2`, "");
184	typedef TCodepoint codepoint_t;
185
186	static const codepoint_t *
187	next (const codepoint_t *text,
188	const codepoint_t *end,
189	hb_codepoint_t *unicode,
190	hb_codepoint_t replacement)
191	{
192	hb_codepoint_t c = *text++;
193
194	if (likely (!hb_in_range<hb_codepoint_t> (c, `0xD800u`, `0xDFFFu`)))
195	{
196	*unicode = c;
197	return text;
198	}
199
200	if (likely (c <= `0xDBFFu` && text < end))
201	{
202	/ High-surrogate in c /
203	hb_codepoint_t l = *text;
204	if (likely (hb_in_range<hb_codepoint_t> (l, `0xDC00u`, `0xDFFFu`)))
205	{
206	/ Low-surrogate in l /
207	*unicode = (c << `10`) + l - ((`0xD800u` << `10`) - `0x10000u` + `0xDC00u`);
208	text++;
209	return text;
210	}
211	}
212
213	/ Lonely / out-of-order surrogate. /
214	*unicode = replacement;
215	return text;
216	}
217
218	static const codepoint_t *
219	prev (const codepoint_t *text,
220	const codepoint_t *start,
221	hb_codepoint_t *unicode,
222	hb_codepoint_t replacement)
223	{
224	hb_codepoint_t c = *--text;
225
226	if (likely (!hb_in_range<hb_codepoint_t> (c, `0xD800u`, `0xDFFFu`)))
227	{
228	*unicode = c;
229	return text;
230	}
231
232	if (likely (c >= `0xDC00u` && start < text))
233	{
234	/ Low-surrogate in c /
235	hb_codepoint_t h = text[-`1`];
236	if (likely (hb_in_range<hb_codepoint_t> (h, `0xD800u`, `0xDBFFu`)))
237	{
238	/ High-surrogate in h /
239	*unicode = (h << `10`) + c - ((`0xD800u` << `10`) - `0x10000u` + `0xDC00u`);
240	text--;
241	return text;
242	}
243	}
244
245	/ Lonely / out-of-order surrogate. /
246	*unicode = replacement;
247	return text;
248	}
249
250
251	static unsigned int
252	strlen (const codepoint_t *text)
253	{
254	unsigned int l = `0`;
255	while (*text++) l++;
256	return l;
257	}
258
259	static unsigned int
260	encode_len (hb_codepoint_t unicode)
261	{
262	return unicode < `0x10000` ? `1` : `2`;
263	}
264
265	static codepoint_t *
266	encode (codepoint_t *text,
267	const codepoint_t *end,
268	hb_codepoint_t unicode)
269	{
270	if (unlikely (unicode >= `0xD800u` && (unicode <= `0xDFFFu` \|\| unicode > `0x10FFFFu`)))
271	unicode = `0xFFFDu`;
272	if (unicode < `0x10000u`)
273	*text++ = unicode;
274	else if (end - text >= `2`)
275	{
276	unicode -= `0x10000u`;
277	*text++ = `0xD800u` + (unicode >> `10`);
278	*text++ = `0xDC00u` + (unicode & `0x03FFu`);
279	}
280	return text;
281	}
282	};
283
284	typedef hb_utf16_xe_t<uint16_t> hb_utf16_t;
285	typedef hb_utf16_xe_t<OT::HBUINT16> hb_utf16_be_t;
286
287
288	template <typename TCodepoint, bool validate=true>
289	struct hb_utf32_xe_t
290	{
291	static_assert (sizeof (TCodepoint) == `4`, "");
292	typedef TCodepoint codepoint_t;
293
294	static const TCodepoint *
295	next (const TCodepoint *text,
296	const TCodepoint *end HB_UNUSED,
297	hb_codepoint_t *unicode,
298	hb_codepoint_t replacement)
299	{
300	hb_codepoint_t c = unicode = text++;
301	if (validate && unlikely (c >= `0xD800u` && (c <= `0xDFFFu` \|\| c > `0x10FFFFu`)))
302	*unicode = replacement;
303	return text;
304	}
305
306	static const TCodepoint *
307	prev (const TCodepoint *text,
308	const TCodepoint *start HB_UNUSED,
309	hb_codepoint_t *unicode,
310	hb_codepoint_t replacement)
311	{
312	hb_codepoint_t c = unicode = --text;
313	if (validate && unlikely (c >= `0xD800u` && (c <= `0xDFFFu` \|\| c > `0x10FFFFu`)))
314	*unicode = replacement;
315	return text;
316	}
317
318	static unsigned int
319	strlen (const TCodepoint *text)
320	{
321	unsigned int l = `0`;
322	while (*text++) l++;
323	return l;
324	}
325
326	static unsigned int
327	encode_len (hb_codepoint_t unicode HB_UNUSED)
328	{
329	return `1`;
330	}
331
332	static codepoint_t *
333	encode (codepoint_t *text,
334	const codepoint_t *end HB_UNUSED,
335	hb_codepoint_t unicode)
336	{
337	if (validate && unlikely (unicode >= `0xD800u` && (unicode <= `0xDFFFu` \|\| unicode > `0x10FFFFu`)))
338	unicode = `0xFFFDu`;
339	*text++ = unicode;
340	return text;
341	}
342	};
343
344	typedef hb_utf32_xe_t<uint32_t> hb_utf32_t;
345	typedef hb_utf32_xe_t<uint32_t, false> hb_utf32_novalidate_t;
346
347
348	struct hb_latin1_t
349	{
350	typedef uint8_t codepoint_t;
351
352	static const codepoint_t *
353	next (const codepoint_t *text,
354	const codepoint_t *end HB_UNUSED,
355	hb_codepoint_t *unicode,
356	hb_codepoint_t replacement HB_UNUSED)
357	{
358	unicode = text++;
359	return text;
360	}
361
362	static const codepoint_t *
363	prev (const codepoint_t *text,
364	const codepoint_t *start HB_UNUSED,
365	hb_codepoint_t *unicode,
366	hb_codepoint_t replacement HB_UNUSED)
367	{
368	unicode = --text;
369	return text;
370	}
371
372	static unsigned int
373	strlen (const codepoint_t *text)
374	{
375	unsigned int l = `0`;
376	while (*text++) l++;
377	return l;
378	}
379
380	static unsigned int
381	encode_len (hb_codepoint_t unicode HB_UNUSED)
382	{
383	return `1`;
384	}
385
386	static codepoint_t *
387	encode (codepoint_t *text,
388	const codepoint_t *end HB_UNUSED,
389	hb_codepoint_t unicode)
390	{
391	if (unlikely (unicode >= `0x0100u`))
392	unicode = `'?'`;
393	*text++ = unicode;
394	return text;
395	}
396	};
397
398
399	struct hb_ascii_t
400	{
401	typedef uint8_t codepoint_t;
402
403	static const codepoint_t *
404	next (const codepoint_t *text,
405	const codepoint_t *end HB_UNUSED,
406	hb_codepoint_t *unicode,
407	hb_codepoint_t replacement HB_UNUSED)
408	{
409	unicode = text++;
410	if (*unicode >= `0x0080u`)
411	*unicode = replacement;
412	return text;
413	}
414
415	static const codepoint_t *
416	prev (const codepoint_t *text,
417	const codepoint_t *start HB_UNUSED,
418	hb_codepoint_t *unicode,
419	hb_codepoint_t replacement)
420	{
421	unicode = --text;
422	if (*unicode >= `0x0080u`)
423	*unicode = replacement;
424	return text;
425	}
426
427	static unsigned int
428	strlen (const codepoint_t *text)
429	{
430	unsigned int l = `0`;
431	while (*text++) l++;
432	return l;
433	}
434
435	static unsigned int
436	encode_len (hb_codepoint_t unicode HB_UNUSED)
437	{
438	return `1`;
439	}
440
441	static codepoint_t *
442	encode (codepoint_t *text,
443	const codepoint_t *end HB_UNUSED,
444	hb_codepoint_t unicode)
445	{
446	if (unlikely (unicode >= `0x0080u`))
447	unicode = `'?'`;
448	*text++ = unicode;
449	return text;
450	}
451	};
452
453	#endif /* HB_UTF_HH */
454

Browse the source code of Skia/third_party/externals/harfbuzz/src/hb-utf.hh