1 | /* |
2 | * Copyright © 2011,2012,2014 Google, Inc. |
3 | * |
4 | * This is part of HarfBuzz, a text shaping library. |
5 | * |
6 | * Permission is hereby granted, without written agreement and without |
7 | * license or royalty fees, to use, copy, modify, and distribute this |
8 | * software and its documentation for any purpose, provided that the |
9 | * above copyright notice and the following two paragraphs appear in |
10 | * all copies of this software. |
11 | * |
12 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR |
13 | * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES |
14 | * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN |
15 | * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH |
16 | * DAMAGE. |
17 | * |
18 | * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, |
19 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND |
20 | * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS |
21 | * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO |
22 | * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. |
23 | * |
24 | * Google Author(s): Behdad Esfahbod |
25 | */ |
26 | |
27 | #ifndef HB_UTF_HH |
28 | #define HB_UTF_HH |
29 | |
30 | #include "hb.hh" |
31 | |
32 | |
33 | struct hb_utf8_t |
34 | { |
35 | typedef uint8_t codepoint_t; |
36 | |
37 | static inline const uint8_t * |
38 | next (const uint8_t *text, |
39 | const uint8_t *end, |
40 | hb_codepoint_t *unicode, |
41 | hb_codepoint_t replacement) |
42 | { |
43 | /* Written to only accept well-formed sequences. |
44 | * Based on ideas from ICU's U8_NEXT. |
45 | * Generates one "replacement" for each ill-formed byte. */ |
46 | |
47 | hb_codepoint_t c = *text++; |
48 | |
49 | if (c > 0x7Fu) |
50 | { |
51 | if (hb_in_range<hb_codepoint_t> (c, 0xC2u, 0xDFu)) /* Two-byte */ |
52 | { |
53 | unsigned int t1; |
54 | if (likely (text < end && |
55 | (t1 = text[0] - 0x80u) <= 0x3Fu)) |
56 | { |
57 | c = ((c&0x1Fu)<<6) | t1; |
58 | text++; |
59 | } |
60 | else |
61 | goto error; |
62 | } |
63 | else if (hb_in_range<hb_codepoint_t> (c, 0xE0u, 0xEFu)) /* Three-byte */ |
64 | { |
65 | unsigned int t1, t2; |
66 | if (likely (1 < end - text && |
67 | (t1 = text[0] - 0x80u) <= 0x3Fu && |
68 | (t2 = text[1] - 0x80u) <= 0x3Fu)) |
69 | { |
70 | c = ((c&0xFu)<<12) | (t1<<6) | t2; |
71 | if (unlikely (c < 0x0800u || hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) |
72 | goto error; |
73 | text += 2; |
74 | } |
75 | else |
76 | goto error; |
77 | } |
78 | else if (hb_in_range<hb_codepoint_t> (c, 0xF0u, 0xF4u)) /* Four-byte */ |
79 | { |
80 | unsigned int t1, t2, t3; |
81 | if (likely (2 < end - text && |
82 | (t1 = text[0] - 0x80u) <= 0x3Fu && |
83 | (t2 = text[1] - 0x80u) <= 0x3Fu && |
84 | (t3 = text[2] - 0x80u) <= 0x3Fu)) |
85 | { |
86 | c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3; |
87 | if (unlikely (!hb_in_range<hb_codepoint_t> (c, 0x10000u, 0x10FFFFu))) |
88 | goto error; |
89 | text += 3; |
90 | } |
91 | else |
92 | goto error; |
93 | } |
94 | else |
95 | goto error; |
96 | } |
97 | |
98 | *unicode = c; |
99 | return text; |
100 | |
101 | error: |
102 | *unicode = replacement; |
103 | return text; |
104 | } |
105 | |
106 | static inline const uint8_t * |
107 | prev (const uint8_t *text, |
108 | const uint8_t *start, |
109 | hb_codepoint_t *unicode, |
110 | hb_codepoint_t replacement) |
111 | { |
112 | const uint8_t *end = text--; |
113 | while (start < text && (*text & 0xc0) == 0x80 && end - text < 4) |
114 | text--; |
115 | |
116 | if (likely (next (text, end, unicode, replacement) == end)) |
117 | return text; |
118 | |
119 | *unicode = replacement; |
120 | return end - 1; |
121 | } |
122 | |
123 | static inline unsigned int |
124 | strlen (const uint8_t *text) |
125 | { |
126 | return ::strlen ((const char *) text); |
127 | } |
128 | }; |
129 | |
130 | |
131 | struct hb_utf16_t |
132 | { |
133 | typedef uint16_t codepoint_t; |
134 | |
135 | static inline const uint16_t * |
136 | next (const uint16_t *text, |
137 | const uint16_t *end, |
138 | hb_codepoint_t *unicode, |
139 | hb_codepoint_t replacement) |
140 | { |
141 | hb_codepoint_t c = *text++; |
142 | |
143 | if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) |
144 | { |
145 | *unicode = c; |
146 | return text; |
147 | } |
148 | |
149 | if (likely (c <= 0xDBFFu && text < end)) |
150 | { |
151 | /* High-surrogate in c */ |
152 | hb_codepoint_t l = *text; |
153 | if (likely (hb_in_range<hb_codepoint_t> (l, 0xDC00u, 0xDFFFu))) |
154 | { |
155 | /* Low-surrogate in l */ |
156 | *unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u); |
157 | text++; |
158 | return text; |
159 | } |
160 | } |
161 | |
162 | /* Lonely / out-of-order surrogate. */ |
163 | *unicode = replacement; |
164 | return text; |
165 | } |
166 | |
167 | static inline const uint16_t * |
168 | prev (const uint16_t *text, |
169 | const uint16_t *start, |
170 | hb_codepoint_t *unicode, |
171 | hb_codepoint_t replacement) |
172 | { |
173 | hb_codepoint_t c = *--text; |
174 | |
175 | if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu))) |
176 | { |
177 | *unicode = c; |
178 | return text; |
179 | } |
180 | |
181 | if (likely (c >= 0xDC00u && start < text)) |
182 | { |
183 | /* Low-surrogate in c */ |
184 | hb_codepoint_t h = text[-1]; |
185 | if (likely (hb_in_range<hb_codepoint_t> (h, 0xD800u, 0xDBFFu))) |
186 | { |
187 | /* High-surrogate in h */ |
188 | *unicode = (h << 10) + c - ((0xD800u << 10) - 0x10000u + 0xDC00u); |
189 | text--; |
190 | return text; |
191 | } |
192 | } |
193 | |
194 | /* Lonely / out-of-order surrogate. */ |
195 | *unicode = replacement; |
196 | return text; |
197 | } |
198 | |
199 | |
200 | static inline unsigned int |
201 | strlen (const uint16_t *text) |
202 | { |
203 | unsigned int l = 0; |
204 | while (*text++) l++; |
205 | return l; |
206 | } |
207 | }; |
208 | |
209 | |
210 | template <bool validate=true> |
211 | struct hb_utf32_t |
212 | { |
213 | typedef uint32_t codepoint_t; |
214 | |
215 | static inline const uint32_t * |
216 | next (const uint32_t *text, |
217 | const uint32_t *end HB_UNUSED, |
218 | hb_codepoint_t *unicode, |
219 | hb_codepoint_t replacement) |
220 | { |
221 | hb_codepoint_t c = *unicode = *text++; |
222 | if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu))) |
223 | *unicode = replacement; |
224 | return text; |
225 | } |
226 | |
227 | static inline const uint32_t * |
228 | prev (const uint32_t *text, |
229 | const uint32_t *start HB_UNUSED, |
230 | hb_codepoint_t *unicode, |
231 | hb_codepoint_t replacement) |
232 | { |
233 | hb_codepoint_t c = *unicode = *--text; |
234 | if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu))) |
235 | *unicode = replacement; |
236 | return text; |
237 | } |
238 | |
239 | static inline unsigned int |
240 | strlen (const uint32_t *text) |
241 | { |
242 | unsigned int l = 0; |
243 | while (*text++) l++; |
244 | return l; |
245 | } |
246 | }; |
247 | |
248 | |
249 | struct hb_latin1_t |
250 | { |
251 | typedef uint8_t codepoint_t; |
252 | |
253 | static inline const uint8_t * |
254 | next (const uint8_t *text, |
255 | const uint8_t *end HB_UNUSED, |
256 | hb_codepoint_t *unicode, |
257 | hb_codepoint_t replacement HB_UNUSED) |
258 | { |
259 | *unicode = *text++; |
260 | return text; |
261 | } |
262 | |
263 | static inline const uint8_t * |
264 | prev (const uint8_t *text, |
265 | const uint8_t *start HB_UNUSED, |
266 | hb_codepoint_t *unicode, |
267 | hb_codepoint_t replacement) |
268 | { |
269 | *unicode = *--text; |
270 | return text; |
271 | } |
272 | |
273 | static inline unsigned int |
274 | strlen (const uint8_t *text) |
275 | { |
276 | unsigned int l = 0; |
277 | while (*text++) l++; |
278 | return l; |
279 | } |
280 | }; |
281 | |
282 | #endif /* HB_UTF_HH */ |
283 | |