1 | /* |
2 | * Copyright © 2011,2012,2014 Google, Inc. |
3 | * |
4 | * This is part of HarfBuzz, a text shaping library. |
5 | * |
6 | * Permission is hereby granted, without written agreement and without |
7 | * license or royalty fees, to use, copy, modify, and distribute this |
8 | * software and its documentation for any purpose, provided that the |
9 | * above copyright notice and the following two paragraphs appear in |
10 | * all copies of this software. |
11 | * |
12 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR |
13 | * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES |
14 | * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN |
15 | * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH |
16 | * DAMAGE. |
17 | * |
18 | * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, |
19 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND |
20 | * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS |
21 | * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO |
22 | * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. |
23 | * |
24 | * Google Author(s): Behdad Esfahbod |
25 | */ |
26 | |
27 | #ifndef HB_UTF_HH |
28 | #define HB_UTF_HH |
29 | |
30 | #include "hb.hh" |
31 | |
32 | #include "hb-open-type.hh" |
33 | |
34 | |
35 | struct hb_utf8_t |
36 | { |
37 | typedef uint8_t codepoint_t; |
38 | static constexpr unsigned max_len = 4; |
39 | |
40 | static const codepoint_t * |
41 | next (const codepoint_t *text, |
42 | const codepoint_t *end, |
43 | hb_codepoint_t *unicode, |
44 | hb_codepoint_t replacement) |
45 | { |
46 | /* Written to only accept well-formed sequences. |
47 | * Based on ideas from ICU's U8_NEXT. |
48 | * Generates one "replacement" for each ill-formed byte. */ |
49 | |
50 | hb_codepoint_t c = *text++; |
51 | |
52 | if (c > 0x7Fu) |
53 | { |
54 | if (hb_in_range<hb_codepoint_t> (c, 0xC2u, 0xDFu)) /* Two-byte */ |
55 | { |
56 | unsigned int t1; |
57 | if (likely (text < end && |
58 | (t1 = text[0] - 0x80u) <= 0x3Fu)) |
59 | { |
60 | c = ((c&0x1Fu)<<6) | t1; |
61 | text++; |
62 | } |
63 | else |
64 | goto error; |
65 | } |
66 | else if (hb_in_range<hb_codepoint_t> (c, 0xE0u, 0xEFu)) /* Three-byte */ |
67 | { |
68 | unsigned int t1, t2; |
69 | if (likely (1 < end - text && |
70 | (t1 = text[0] - 0x80u) <= 0x3Fu && |
71 | (t2 = text[1] - 0x80u) <= 0x3Fu)) |
72 | { |
73 | c = ((c&0xFu)<<12) | (t1<<6) | t2; |
74 | if (unlikely (c < 0x0800u || hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) |
75 | goto error; |
76 | text += 2; |
77 | } |
78 | else |
79 | goto error; |
80 | } |
81 | else if (hb_in_range<hb_codepoint_t> (c, 0xF0u, 0xF4u)) /* Four-byte */ |
82 | { |
83 | unsigned int t1, t2, t3; |
84 | if (likely (2 < end - text && |
85 | (t1 = text[0] - 0x80u) <= 0x3Fu && |
86 | (t2 = text[1] - 0x80u) <= 0x3Fu && |
87 | (t3 = text[2] - 0x80u) <= 0x3Fu)) |
88 | { |
89 | c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3; |
90 | if (unlikely (!hb_in_range<hb_codepoint_t> (c, 0x10000u, 0x10FFFFu))) |
91 | goto error; |
92 | text += 3; |
93 | } |
94 | else |
95 | goto error; |
96 | } |
97 | else |
98 | goto error; |
99 | } |
100 | |
101 | *unicode = c; |
102 | return text; |
103 | |
104 | error: |
105 | *unicode = replacement; |
106 | return text; |
107 | } |
108 | |
109 | static const codepoint_t * |
110 | prev (const codepoint_t *text, |
111 | const codepoint_t *start, |
112 | hb_codepoint_t *unicode, |
113 | hb_codepoint_t replacement) |
114 | { |
115 | const codepoint_t *end = text--; |
116 | while (start < text && (*text & 0xc0) == 0x80 && end - text < 4) |
117 | text--; |
118 | |
119 | if (likely (next (text, end, unicode, replacement) == end)) |
120 | return text; |
121 | |
122 | *unicode = replacement; |
123 | return end - 1; |
124 | } |
125 | |
126 | static unsigned int |
127 | strlen (const codepoint_t *text) |
128 | { return ::strlen ((const char *) text); } |
129 | |
130 | static unsigned int |
131 | encode_len (hb_codepoint_t unicode) |
132 | { |
133 | if (unicode < 0x0080u) return 1; |
134 | if (unicode < 0x0800u) return 2; |
135 | if (unicode < 0x10000u) return 3; |
136 | if (unicode < 0x110000u) return 4; |
137 | return 3; |
138 | } |
139 | |
140 | static codepoint_t * |
141 | encode (codepoint_t *text, |
142 | const codepoint_t *end, |
143 | hb_codepoint_t unicode) |
144 | { |
145 | if (unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu))) |
146 | unicode = 0xFFFDu; |
147 | if (unicode < 0x0080u) |
148 | *text++ = unicode; |
149 | else if (unicode < 0x0800u) |
150 | { |
151 | if (end - text >= 2) |
152 | { |
153 | *text++ = 0xC0u + (0x1Fu & (unicode >> 6)); |
154 | *text++ = 0x80u + (0x3Fu & (unicode )); |
155 | } |
156 | } |
157 | else if (unicode < 0x10000u) |
158 | { |
159 | if (end - text >= 3) |
160 | { |
161 | *text++ = 0xE0u + (0x0Fu & (unicode >> 12)); |
162 | *text++ = 0x80u + (0x3Fu & (unicode >> 6)); |
163 | *text++ = 0x80u + (0x3Fu & (unicode )); |
164 | } |
165 | } |
166 | else |
167 | { |
168 | if (end - text >= 4) |
169 | { |
170 | *text++ = 0xF0u + (0x07u & (unicode >> 18)); |
171 | *text++ = 0x80u + (0x3Fu & (unicode >> 12)); |
172 | *text++ = 0x80u + (0x3Fu & (unicode >> 6)); |
173 | *text++ = 0x80u + (0x3Fu & (unicode )); |
174 | } |
175 | } |
176 | return text; |
177 | } |
178 | }; |
179 | |
180 | |
181 | template <typename TCodepoint> |
182 | struct hb_utf16_xe_t |
183 | { |
184 | static_assert (sizeof (TCodepoint) == 2, "" ); |
185 | typedef TCodepoint codepoint_t; |
186 | static constexpr unsigned max_len = 2; |
187 | |
188 | static const codepoint_t * |
189 | next (const codepoint_t *text, |
190 | const codepoint_t *end, |
191 | hb_codepoint_t *unicode, |
192 | hb_codepoint_t replacement) |
193 | { |
194 | hb_codepoint_t c = *text++; |
195 | |
196 | if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) |
197 | { |
198 | *unicode = c; |
199 | return text; |
200 | } |
201 | |
202 | if (likely (c <= 0xDBFFu && text < end)) |
203 | { |
204 | /* High-surrogate in c */ |
205 | hb_codepoint_t l = *text; |
206 | if (likely (hb_in_range<hb_codepoint_t> (l, 0xDC00u, 0xDFFFu))) |
207 | { |
208 | /* Low-surrogate in l */ |
209 | *unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u); |
210 | text++; |
211 | return text; |
212 | } |
213 | } |
214 | |
215 | /* Lonely / out-of-order surrogate. */ |
216 | *unicode = replacement; |
217 | return text; |
218 | } |
219 | |
220 | static const codepoint_t * |
221 | prev (const codepoint_t *text, |
222 | const codepoint_t *start, |
223 | hb_codepoint_t *unicode, |
224 | hb_codepoint_t replacement) |
225 | { |
226 | hb_codepoint_t c = *--text; |
227 | |
228 | if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) |
229 | { |
230 | *unicode = c; |
231 | return text; |
232 | } |
233 | |
234 | if (likely (c >= 0xDC00u && start < text)) |
235 | { |
236 | /* Low-surrogate in c */ |
237 | hb_codepoint_t h = text[-1]; |
238 | if (likely (hb_in_range<hb_codepoint_t> (h, 0xD800u, 0xDBFFu))) |
239 | { |
240 | /* High-surrogate in h */ |
241 | *unicode = (h << 10) + c - ((0xD800u << 10) - 0x10000u + 0xDC00u); |
242 | text--; |
243 | return text; |
244 | } |
245 | } |
246 | |
247 | /* Lonely / out-of-order surrogate. */ |
248 | *unicode = replacement; |
249 | return text; |
250 | } |
251 | |
252 | |
253 | static unsigned int |
254 | strlen (const codepoint_t *text) |
255 | { |
256 | unsigned int l = 0; |
257 | while (*text++) l++; |
258 | return l; |
259 | } |
260 | |
261 | static unsigned int |
262 | encode_len (hb_codepoint_t unicode) |
263 | { |
264 | return unicode < 0x10000 ? 1 : 2; |
265 | } |
266 | |
267 | static codepoint_t * |
268 | encode (codepoint_t *text, |
269 | const codepoint_t *end, |
270 | hb_codepoint_t unicode) |
271 | { |
272 | if (unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu))) |
273 | unicode = 0xFFFDu; |
274 | if (unicode < 0x10000u) |
275 | *text++ = unicode; |
276 | else if (end - text >= 2) |
277 | { |
278 | unicode -= 0x10000u; |
279 | *text++ = 0xD800u + (unicode >> 10); |
280 | *text++ = 0xDC00u + (unicode & 0x03FFu); |
281 | } |
282 | return text; |
283 | } |
284 | }; |
285 | |
286 | typedef hb_utf16_xe_t<uint16_t> hb_utf16_t; |
287 | typedef hb_utf16_xe_t<OT::HBUINT16> hb_utf16_be_t; |
288 | |
289 | |
290 | template <typename TCodepoint, bool validate=true> |
291 | struct hb_utf32_xe_t |
292 | { |
293 | static_assert (sizeof (TCodepoint) == 4, "" ); |
294 | typedef TCodepoint codepoint_t; |
295 | static constexpr unsigned max_len = 1; |
296 | |
297 | static const TCodepoint * |
298 | next (const TCodepoint *text, |
299 | const TCodepoint *end HB_UNUSED, |
300 | hb_codepoint_t *unicode, |
301 | hb_codepoint_t replacement) |
302 | { |
303 | hb_codepoint_t c = *unicode = *text++; |
304 | if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu))) |
305 | *unicode = replacement; |
306 | return text; |
307 | } |
308 | |
309 | static const TCodepoint * |
310 | prev (const TCodepoint *text, |
311 | const TCodepoint *start HB_UNUSED, |
312 | hb_codepoint_t *unicode, |
313 | hb_codepoint_t replacement) |
314 | { |
315 | hb_codepoint_t c = *unicode = *--text; |
316 | if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu))) |
317 | *unicode = replacement; |
318 | return text; |
319 | } |
320 | |
321 | static unsigned int |
322 | strlen (const TCodepoint *text) |
323 | { |
324 | unsigned int l = 0; |
325 | while (*text++) l++; |
326 | return l; |
327 | } |
328 | |
329 | static unsigned int |
330 | encode_len (hb_codepoint_t unicode HB_UNUSED) |
331 | { |
332 | return 1; |
333 | } |
334 | |
335 | static codepoint_t * |
336 | encode (codepoint_t *text, |
337 | const codepoint_t *end HB_UNUSED, |
338 | hb_codepoint_t unicode) |
339 | { |
340 | if (validate && unlikely (unicode >= 0xD800u && (unicode <= 0xDFFFu || unicode > 0x10FFFFu))) |
341 | unicode = 0xFFFDu; |
342 | *text++ = unicode; |
343 | return text; |
344 | } |
345 | }; |
346 | |
347 | typedef hb_utf32_xe_t<uint32_t> hb_utf32_t; |
348 | typedef hb_utf32_xe_t<uint32_t, false> hb_utf32_novalidate_t; |
349 | |
350 | |
351 | struct hb_latin1_t |
352 | { |
353 | typedef uint8_t codepoint_t; |
354 | static constexpr unsigned max_len = 1; |
355 | |
356 | static const codepoint_t * |
357 | next (const codepoint_t *text, |
358 | const codepoint_t *end HB_UNUSED, |
359 | hb_codepoint_t *unicode, |
360 | hb_codepoint_t replacement HB_UNUSED) |
361 | { |
362 | *unicode = *text++; |
363 | return text; |
364 | } |
365 | |
366 | static const codepoint_t * |
367 | prev (const codepoint_t *text, |
368 | const codepoint_t *start HB_UNUSED, |
369 | hb_codepoint_t *unicode, |
370 | hb_codepoint_t replacement HB_UNUSED) |
371 | { |
372 | *unicode = *--text; |
373 | return text; |
374 | } |
375 | |
376 | static unsigned int |
377 | strlen (const codepoint_t *text) |
378 | { |
379 | unsigned int l = 0; |
380 | while (*text++) l++; |
381 | return l; |
382 | } |
383 | |
384 | static unsigned int |
385 | encode_len (hb_codepoint_t unicode HB_UNUSED) |
386 | { |
387 | return 1; |
388 | } |
389 | |
390 | static codepoint_t * |
391 | encode (codepoint_t *text, |
392 | const codepoint_t *end HB_UNUSED, |
393 | hb_codepoint_t unicode) |
394 | { |
395 | if (unlikely (unicode >= 0x0100u)) |
396 | unicode = '?'; |
397 | *text++ = unicode; |
398 | return text; |
399 | } |
400 | }; |
401 | |
402 | |
403 | struct hb_ascii_t |
404 | { |
405 | typedef uint8_t codepoint_t; |
406 | static constexpr unsigned max_len = 1; |
407 | |
408 | static const codepoint_t * |
409 | next (const codepoint_t *text, |
410 | const codepoint_t *end HB_UNUSED, |
411 | hb_codepoint_t *unicode, |
412 | hb_codepoint_t replacement) |
413 | { |
414 | *unicode = *text++; |
415 | if (*unicode >= 0x0080u) |
416 | *unicode = replacement; |
417 | return text; |
418 | } |
419 | |
420 | static const codepoint_t * |
421 | prev (const codepoint_t *text, |
422 | const codepoint_t *start HB_UNUSED, |
423 | hb_codepoint_t *unicode, |
424 | hb_codepoint_t replacement) |
425 | { |
426 | *unicode = *--text; |
427 | if (*unicode >= 0x0080u) |
428 | *unicode = replacement; |
429 | return text; |
430 | } |
431 | |
432 | static unsigned int |
433 | strlen (const codepoint_t *text) |
434 | { |
435 | unsigned int l = 0; |
436 | while (*text++) l++; |
437 | return l; |
438 | } |
439 | |
440 | static unsigned int |
441 | encode_len (hb_codepoint_t unicode HB_UNUSED) |
442 | { |
443 | return 1; |
444 | } |
445 | |
446 | static codepoint_t * |
447 | encode (codepoint_t *text, |
448 | const codepoint_t *end HB_UNUSED, |
449 | hb_codepoint_t unicode) |
450 | { |
451 | if (unlikely (unicode >= 0x0080u)) |
452 | unicode = '?'; |
453 | *text++ = unicode; |
454 | return text; |
455 | } |
456 | }; |
457 | |
458 | template <typename utf_t> |
459 | static inline const typename utf_t::codepoint_t * |
460 | hb_utf_offset_to_pointer (const typename utf_t::codepoint_t *start, |
461 | signed offset) |
462 | { |
463 | hb_codepoint_t unicode; |
464 | |
465 | while (offset-- > 0) |
466 | start = utf_t::next (start, |
467 | start + utf_t::max_len, |
468 | &unicode, |
469 | HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT); |
470 | |
471 | while (offset++ < 0) |
472 | start = utf_t::prev (start, |
473 | start - utf_t::max_len, |
474 | &unicode, |
475 | HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT); |
476 | |
477 | return start; |
478 | } |
479 | |
480 | |
481 | #endif /* HB_UTF_HH */ |
482 | |