1 | /* |
2 | * Copyright © 2011,2012,2014 Google, Inc. |
3 | * |
4 | * This is part of HarfBuzz, a text shaping library. |
5 | * |
6 | * Permission is hereby granted, without written agreement and without |
7 | * license or royalty fees, to use, copy, modify, and distribute this |
8 | * software and its documentation for any purpose, provided that the |
9 | * above copyright notice and the following two paragraphs appear in |
10 | * all copies of this software. |
11 | * |
12 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR |
13 | * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES |
14 | * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN |
15 | * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH |
16 | * DAMAGE. |
17 | * |
18 | * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, |
19 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND |
20 | * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS |
21 | * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO |
22 | * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. |
23 | * |
24 | * Google Author(s): Behdad Esfahbod |
25 | */ |
26 | |
27 | #ifndef HB_UTF_HH |
28 | #define HB_UTF_HH |
29 | |
30 | #include "hb.hh" |
31 | |
32 | #include "hb-open-type.hh" |
33 | |
34 | |
35 | struct hb_utf8_t |
36 | { |
37 | typedef uint8_t codepoint_t; |
38 | |
39 | static const codepoint_t * |
40 | next (const codepoint_t *text, |
41 | const codepoint_t *end, |
42 | hb_codepoint_t *unicode, |
43 | hb_codepoint_t replacement) |
44 | { |
45 | /* Written to only accept well-formed sequences. |
46 | * Based on ideas from ICU's U8_NEXT. |
47 | * Generates one "replacement" for each ill-formed byte. */ |
48 | |
49 | hb_codepoint_t c = *text++; |
50 | |
51 | if (c > 0x7Fu) |
52 | { |
53 | if (hb_in_range<hb_codepoint_t> (c, 0xC2u, 0xDFu)) /* Two-byte */ |
54 | { |
55 | unsigned int t1; |
56 | if (likely (text < end && |
57 | (t1 = text[0] - 0x80u) <= 0x3Fu)) |
58 | { |
59 | c = ((c&0x1Fu)<<6) | t1; |
60 | text++; |
61 | } |
62 | else |
63 | goto error; |
64 | } |
65 | else if (hb_in_range<hb_codepoint_t> (c, 0xE0u, 0xEFu)) /* Three-byte */ |
66 | { |
67 | unsigned int t1, t2; |
68 | if (likely (1 < end - text && |
69 | (t1 = text[0] - 0x80u) <= 0x3Fu && |
70 | (t2 = text[1] - 0x80u) <= 0x3Fu)) |
71 | { |
72 | c = ((c&0xFu)<<12) | (t1<<6) | t2; |
73 | if (unlikely (c < 0x0800u || hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) |
74 | goto error; |
75 | text += 2; |
76 | } |
77 | else |
78 | goto error; |
79 | } |
80 | else if (hb_in_range<hb_codepoint_t> (c, 0xF0u, 0xF4u)) /* Four-byte */ |
81 | { |
82 | unsigned int t1, t2, t3; |
83 | if (likely (2 < end - text && |
84 | (t1 = text[0] - 0x80u) <= 0x3Fu && |
85 | (t2 = text[1] - 0x80u) <= 0x3Fu && |
86 | (t3 = text[2] - 0x80u) <= 0x3Fu)) |
87 | { |
88 | c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3; |
89 | if (unlikely (!hb_in_range<hb_codepoint_t> (c, 0x10000u, 0x10FFFFu))) |
90 | goto error; |
91 | text += 3; |
92 | } |
93 | else |
94 | goto error; |
95 | } |
96 | else |
97 | goto error; |
98 | } |
99 | |
100 | *unicode = c; |
101 | return text; |
102 | |
103 | error: |
104 | *unicode = replacement; |
105 | return text; |
106 | } |
107 | |
108 | static const codepoint_t * |
109 | prev (const codepoint_t *text, |
110 | const codepoint_t *start, |
111 | hb_codepoint_t *unicode, |
112 | hb_codepoint_t replacement) |
113 | { |
114 | const codepoint_t *end = text--; |
115 | while (start < text && (*text & 0xc0) == 0x80 && end - text < 4) |
116 | text--; |
117 | |
118 | if (likely (next (text, end, unicode, replacement) == end)) |
119 | return text; |
120 | |
121 | *unicode = replacement; |
122 | return end - 1; |
123 | } |
124 | |
125 | static unsigned int |
126 | strlen (const codepoint_t *text) |
127 | { return ::strlen ((const char *) text); } |
128 | |
129 | static unsigned int |
130 | encode_len (hb_codepoint_t unicode) |
131 | { |
132 | if (unicode < 0x0080u) return 1; |
133 | if (unicode < 0x0800u) return 2; |
134 | if (unicode < 0x10000u) return 3; |
135 | if (unicode < 0x110000u) return 4; |
136 | return 3; |
137 | } |
138 | |
139 | static codepoint_t * |
140 | encode (codepoint_t *text, |
141 | const codepoint_t *end, |
142 | hb_codepoint_t unicode) |
143 | { |
144 | if (unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu))) |
145 | unicode = 0xFFFDu; |
146 | if (unicode < 0x0080u) |
147 | *text++ = unicode; |
148 | else if (unicode < 0x0800u) |
149 | { |
150 | if (end - text >= 2) |
151 | { |
152 | *text++ = 0xC0u + (0x1Fu & (unicode >> 6)); |
153 | *text++ = 0x80u + (0x3Fu & (unicode )); |
154 | } |
155 | } |
156 | else if (unicode < 0x10000u) |
157 | { |
158 | if (end - text >= 3) |
159 | { |
160 | *text++ = 0xE0u + (0x0Fu & (unicode >> 12)); |
161 | *text++ = 0x80u + (0x3Fu & (unicode >> 6)); |
162 | *text++ = 0x80u + (0x3Fu & (unicode )); |
163 | } |
164 | } |
165 | else |
166 | { |
167 | if (end - text >= 4) |
168 | { |
169 | *text++ = 0xF0u + (0x07u & (unicode >> 18)); |
170 | *text++ = 0x80u + (0x3Fu & (unicode >> 12)); |
171 | *text++ = 0x80u + (0x3Fu & (unicode >> 6)); |
172 | *text++ = 0x80u + (0x3Fu & (unicode )); |
173 | } |
174 | } |
175 | return text; |
176 | } |
177 | }; |
178 | |
179 | |
180 | template <typename TCodepoint> |
181 | struct hb_utf16_xe_t |
182 | { |
183 | static_assert (sizeof (TCodepoint) == 2, "" ); |
184 | typedef TCodepoint codepoint_t; |
185 | |
186 | static const codepoint_t * |
187 | next (const codepoint_t *text, |
188 | const codepoint_t *end, |
189 | hb_codepoint_t *unicode, |
190 | hb_codepoint_t replacement) |
191 | { |
192 | hb_codepoint_t c = *text++; |
193 | |
194 | if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) |
195 | { |
196 | *unicode = c; |
197 | return text; |
198 | } |
199 | |
200 | if (likely (c <= 0xDBFFu && text < end)) |
201 | { |
202 | /* High-surrogate in c */ |
203 | hb_codepoint_t l = *text; |
204 | if (likely (hb_in_range<hb_codepoint_t> (l, 0xDC00u, 0xDFFFu))) |
205 | { |
206 | /* Low-surrogate in l */ |
207 | *unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u); |
208 | text++; |
209 | return text; |
210 | } |
211 | } |
212 | |
213 | /* Lonely / out-of-order surrogate. */ |
214 | *unicode = replacement; |
215 | return text; |
216 | } |
217 | |
218 | static const codepoint_t * |
219 | prev (const codepoint_t *text, |
220 | const codepoint_t *start, |
221 | hb_codepoint_t *unicode, |
222 | hb_codepoint_t replacement) |
223 | { |
224 | hb_codepoint_t c = *--text; |
225 | |
226 | if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) |
227 | { |
228 | *unicode = c; |
229 | return text; |
230 | } |
231 | |
232 | if (likely (c >= 0xDC00u && start < text)) |
233 | { |
234 | /* Low-surrogate in c */ |
235 | hb_codepoint_t h = text[-1]; |
236 | if (likely (hb_in_range<hb_codepoint_t> (h, 0xD800u, 0xDBFFu))) |
237 | { |
238 | /* High-surrogate in h */ |
239 | *unicode = (h << 10) + c - ((0xD800u << 10) - 0x10000u + 0xDC00u); |
240 | text--; |
241 | return text; |
242 | } |
243 | } |
244 | |
245 | /* Lonely / out-of-order surrogate. */ |
246 | *unicode = replacement; |
247 | return text; |
248 | } |
249 | |
250 | |
251 | static unsigned int |
252 | strlen (const codepoint_t *text) |
253 | { |
254 | unsigned int l = 0; |
255 | while (*text++) l++; |
256 | return l; |
257 | } |
258 | |
259 | static unsigned int |
260 | encode_len (hb_codepoint_t unicode) |
261 | { |
262 | return unicode < 0x10000 ? 1 : 2; |
263 | } |
264 | |
265 | static codepoint_t * |
266 | encode (codepoint_t *text, |
267 | const codepoint_t *end, |
268 | hb_codepoint_t unicode) |
269 | { |
270 | if (unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu))) |
271 | unicode = 0xFFFDu; |
272 | if (unicode < 0x10000u) |
273 | *text++ = unicode; |
274 | else if (end - text >= 2) |
275 | { |
276 | unicode -= 0x10000u; |
277 | *text++ = 0xD800u + (unicode >> 10); |
278 | *text++ = 0xDC00u + (unicode & 0x03FFu); |
279 | } |
280 | return text; |
281 | } |
282 | }; |
283 | |
284 | typedef hb_utf16_xe_t<uint16_t> hb_utf16_t; |
285 | typedef hb_utf16_xe_t<OT::HBUINT16> hb_utf16_be_t; |
286 | |
287 | |
288 | template <typename TCodepoint, bool validate=true> |
289 | struct hb_utf32_xe_t |
290 | { |
291 | static_assert (sizeof (TCodepoint) == 4, "" ); |
292 | typedef TCodepoint codepoint_t; |
293 | |
294 | static const TCodepoint * |
295 | next (const TCodepoint *text, |
296 | const TCodepoint *end HB_UNUSED, |
297 | hb_codepoint_t *unicode, |
298 | hb_codepoint_t replacement) |
299 | { |
300 | hb_codepoint_t c = *unicode = *text++; |
301 | if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu))) |
302 | *unicode = replacement; |
303 | return text; |
304 | } |
305 | |
306 | static const TCodepoint * |
307 | prev (const TCodepoint *text, |
308 | const TCodepoint *start HB_UNUSED, |
309 | hb_codepoint_t *unicode, |
310 | hb_codepoint_t replacement) |
311 | { |
312 | hb_codepoint_t c = *unicode = *--text; |
313 | if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu))) |
314 | *unicode = replacement; |
315 | return text; |
316 | } |
317 | |
318 | static unsigned int |
319 | strlen (const TCodepoint *text) |
320 | { |
321 | unsigned int l = 0; |
322 | while (*text++) l++; |
323 | return l; |
324 | } |
325 | |
326 | static unsigned int |
327 | encode_len (hb_codepoint_t unicode HB_UNUSED) |
328 | { |
329 | return 1; |
330 | } |
331 | |
332 | static codepoint_t * |
333 | encode (codepoint_t *text, |
334 | const codepoint_t *end HB_UNUSED, |
335 | hb_codepoint_t unicode) |
336 | { |
337 | if (validate && unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu))) |
338 | unicode = 0xFFFDu; |
339 | *text++ = unicode; |
340 | return text; |
341 | } |
342 | }; |
343 | |
344 | typedef hb_utf32_xe_t<uint32_t> hb_utf32_t; |
345 | typedef hb_utf32_xe_t<uint32_t, false> hb_utf32_novalidate_t; |
346 | |
347 | |
348 | struct hb_latin1_t |
349 | { |
350 | typedef uint8_t codepoint_t; |
351 | |
352 | static const codepoint_t * |
353 | next (const codepoint_t *text, |
354 | const codepoint_t *end HB_UNUSED, |
355 | hb_codepoint_t *unicode, |
356 | hb_codepoint_t replacement HB_UNUSED) |
357 | { |
358 | *unicode = *text++; |
359 | return text; |
360 | } |
361 | |
362 | static const codepoint_t * |
363 | prev (const codepoint_t *text, |
364 | const codepoint_t *start HB_UNUSED, |
365 | hb_codepoint_t *unicode, |
366 | hb_codepoint_t replacement HB_UNUSED) |
367 | { |
368 | *unicode = *--text; |
369 | return text; |
370 | } |
371 | |
372 | static unsigned int |
373 | strlen (const codepoint_t *text) |
374 | { |
375 | unsigned int l = 0; |
376 | while (*text++) l++; |
377 | return l; |
378 | } |
379 | |
380 | static unsigned int |
381 | encode_len (hb_codepoint_t unicode HB_UNUSED) |
382 | { |
383 | return 1; |
384 | } |
385 | |
386 | static codepoint_t * |
387 | encode (codepoint_t *text, |
388 | const codepoint_t *end HB_UNUSED, |
389 | hb_codepoint_t unicode) |
390 | { |
391 | if (unlikely (unicode >= 0x0100u)) |
392 | unicode = '?'; |
393 | *text++ = unicode; |
394 | return text; |
395 | } |
396 | }; |
397 | |
398 | |
399 | struct hb_ascii_t |
400 | { |
401 | typedef uint8_t codepoint_t; |
402 | |
403 | static const codepoint_t * |
404 | next (const codepoint_t *text, |
405 | const codepoint_t *end HB_UNUSED, |
406 | hb_codepoint_t *unicode, |
407 | hb_codepoint_t replacement HB_UNUSED) |
408 | { |
409 | *unicode = *text++; |
410 | if (*unicode >= 0x0080u) |
411 | *unicode = replacement; |
412 | return text; |
413 | } |
414 | |
415 | static const codepoint_t * |
416 | prev (const codepoint_t *text, |
417 | const codepoint_t *start HB_UNUSED, |
418 | hb_codepoint_t *unicode, |
419 | hb_codepoint_t replacement) |
420 | { |
421 | *unicode = *--text; |
422 | if (*unicode >= 0x0080u) |
423 | *unicode = replacement; |
424 | return text; |
425 | } |
426 | |
427 | static unsigned int |
428 | strlen (const codepoint_t *text) |
429 | { |
430 | unsigned int l = 0; |
431 | while (*text++) l++; |
432 | return l; |
433 | } |
434 | |
435 | static unsigned int |
436 | encode_len (hb_codepoint_t unicode HB_UNUSED) |
437 | { |
438 | return 1; |
439 | } |
440 | |
441 | static codepoint_t * |
442 | encode (codepoint_t *text, |
443 | const codepoint_t *end HB_UNUSED, |
444 | hb_codepoint_t unicode) |
445 | { |
446 | if (unlikely (unicode >= 0x0080u)) |
447 | unicode = '?'; |
448 | *text++ = unicode; |
449 | return text; |
450 | } |
451 | }; |
452 | |
453 | #endif /* HB_UTF_HH */ |
454 | |