1 | // Copyright 2018 Google LLC. |
2 | // Use of this source code is governed by a BSD-style license that can be found in the LICENSE file. |
3 | |
4 | #include "src/utils/SkUTF.h" |
5 | |
6 | #include <climits> |
7 | |
/** Shifts value left by shift bits. The shift is carried out on the unsigned
    representation of value and the result is converted back to int32_t, so bits
    may move into (or past) the sign bit without a signed left shift.
*/
static constexpr inline int32_t left_shift(int32_t value, int32_t shift) {
    return static_cast<int32_t>(static_cast<uint32_t>(value) << shift);
}
11 | |
/** @returns true when the lowest bit of x is clear, i.e. x is a multiple of 2. */
template <typename T> static constexpr bool is_align2(T x) { return (x & 1) == 0; }
13 | |
/** @returns true when the two lowest bits of x are clear, i.e. x is a multiple of 4. */
template <typename T> static constexpr bool is_align4(T x) { return (x & 3) == 0; }
15 | |
/** @returns true when c lies in 0xD800..0xDBFF, the first unit of a surrogate pair. */
static constexpr inline bool utf16_is_high_surrogate(uint16_t c) {
    return c >= 0xD800 && c <= 0xDBFF;
}
17 | |
/** @returns true when c lies in 0xDC00..0xDFFF, the second unit of a surrogate pair. */
static constexpr inline bool utf16_is_low_surrogate(uint16_t c) {
    return c >= 0xDC00 && c <= 0xDFFF;
}
19 | |
/** Classifies a single UTF-8 byte.
    @returns -1 for a byte that never appears in UTF-8 (0xC0, 0xC1, 0xF5..0xFF),
              0 for a continuation byte (0x80..0xBF),
              1 for an ASCII byte (0x00..0x7F),
              2 for the leading byte of a 2-byte sequence (0xC2..0xDF),
              3 for the leading byte of a 3-byte sequence (0xE0..0xEF), and
              4 for the leading byte of a 4-byte sequence (0xF0..0xF4).
    A positive return value is therefore the length of the sequence in bytes.
*/
static int utf8_byte_type(uint8_t c) {
    if (c < 0x80) {
        return 1;
    }
    if (c < 0xC0) {
        return 0;
    }
    if (c < 0xC2 || c > 0xF4) {  // "octet values c0, c1, f5 to ff never appear"
        return -1;
    }
    if (c < 0xE0) {
        return 2;
    }
    if (c < 0xF0) {
        return 3;
    }
    return 4;
}
/** @returns true when type (a utf8_byte_type() result) is a sequence length,
    i.e. the byte was ASCII or the leading byte of a multi-byte sequence.
*/
static bool utf8_type_is_valid_leading_byte(int type) {
    return type >= 1;
}
42 | |
/** @returns true when c is a UTF-8 continuation byte, i.e. has the bit pattern
    10xxxxxx (0x80..0xBF).
*/
static bool utf8_byte_is_continuation(uint8_t c) {
    return (c & 0xC0) == 0x80;
}
44 | |
45 | //////////////////////////////////////////////////////////////////////////////// |
46 | |
47 | int SkUTF::CountUTF8(const char* utf8, size_t byteLength) { |
48 | if (!utf8) { |
49 | return -1; |
50 | } |
51 | int count = 0; |
52 | const char* stop = utf8 + byteLength; |
53 | while (utf8 < stop) { |
54 | int type = utf8_byte_type(*(const uint8_t*)utf8); |
55 | if (!utf8_type_is_valid_leading_byte(type) || utf8 + type > stop) { |
56 | return -1; // Sequence extends beyond end. |
57 | } |
58 | while(type-- > 1) { |
59 | ++utf8; |
60 | if (!utf8_byte_is_continuation(*(const uint8_t*)utf8)) { |
61 | return -1; |
62 | } |
63 | } |
64 | ++utf8; |
65 | ++count; |
66 | } |
67 | return count; |
68 | } |
69 | |
70 | int SkUTF::CountUTF16(const uint16_t* utf16, size_t byteLength) { |
71 | if (!utf16 || !is_align2(intptr_t(utf16)) || !is_align2(byteLength)) { |
72 | return -1; |
73 | } |
74 | const uint16_t* src = (const uint16_t*)utf16; |
75 | const uint16_t* stop = src + (byteLength >> 1); |
76 | int count = 0; |
77 | while (src < stop) { |
78 | unsigned c = *src++; |
79 | if (utf16_is_low_surrogate(c)) { |
80 | return -1; |
81 | } |
82 | if (utf16_is_high_surrogate(c)) { |
83 | if (src >= stop) { |
84 | return -1; |
85 | } |
86 | c = *src++; |
87 | if (!utf16_is_low_surrogate(c)) { |
88 | return -1; |
89 | } |
90 | } |
91 | count += 1; |
92 | } |
93 | return count; |
94 | } |
95 | |
96 | int SkUTF::CountUTF32(const int32_t* utf32, size_t byteLength) { |
97 | if (!is_align4(intptr_t(utf32)) || !is_align4(byteLength) || byteLength >> 2 > INT_MAX) { |
98 | return -1; |
99 | } |
100 | const uint32_t kInvalidUnicharMask = 0xFF000000; // unichar fits in 24 bits |
101 | const uint32_t* ptr = (const uint32_t*)utf32; |
102 | const uint32_t* stop = ptr + (byteLength >> 2); |
103 | while (ptr < stop) { |
104 | if (*ptr & kInvalidUnicharMask) { |
105 | return -1; |
106 | } |
107 | ptr += 1; |
108 | } |
109 | return (int)(byteLength >> 2); |
110 | } |
111 | |
112 | template <typename T> |
113 | static SkUnichar next_fail(const T** ptr, const T* end) { |
114 | *ptr = end; |
115 | return -1; |
116 | } |
117 | |
118 | SkUnichar SkUTF::NextUTF8(const char** ptr, const char* end) { |
119 | if (!ptr || !end ) { |
120 | return -1; |
121 | } |
122 | const uint8_t* p = (const uint8_t*)*ptr; |
123 | if (!p || p >= (const uint8_t*)end) { |
124 | return next_fail(ptr, end); |
125 | } |
126 | int c = *p; |
127 | int hic = c << 24; |
128 | |
129 | if (!utf8_type_is_valid_leading_byte(utf8_byte_type(c))) { |
130 | return next_fail(ptr, end); |
131 | } |
132 | if (hic < 0) { |
133 | uint32_t mask = (uint32_t)~0x3F; |
134 | hic = left_shift(hic, 1); |
135 | do { |
136 | ++p; |
137 | if (p >= (const uint8_t*)end) { |
138 | return next_fail(ptr, end); |
139 | } |
140 | // check before reading off end of array. |
141 | uint8_t nextByte = *p; |
142 | if (!utf8_byte_is_continuation(nextByte)) { |
143 | return next_fail(ptr, end); |
144 | } |
145 | c = (c << 6) | (nextByte & 0x3F); |
146 | mask <<= 5; |
147 | } while ((hic = left_shift(hic, 1)) < 0); |
148 | c &= ~mask; |
149 | } |
150 | *ptr = (char*)p + 1; |
151 | return c; |
152 | } |
153 | |
154 | SkUnichar SkUTF::NextUTF16(const uint16_t** ptr, const uint16_t* end) { |
155 | if (!ptr || !end ) { |
156 | return -1; |
157 | } |
158 | const uint16_t* src = *ptr; |
159 | if (!src || src + 1 > end || !is_align2(intptr_t(src))) { |
160 | return next_fail(ptr, end); |
161 | } |
162 | uint16_t c = *src++; |
163 | SkUnichar result = c; |
164 | if (utf16_is_low_surrogate(c)) { |
165 | return next_fail(ptr, end); // srcPtr should never point at low surrogate. |
166 | } |
167 | if (utf16_is_high_surrogate(c)) { |
168 | if (src + 1 > end) { |
169 | return next_fail(ptr, end); // Truncated string. |
170 | } |
171 | uint16_t low = *src++; |
172 | if (!utf16_is_low_surrogate(low)) { |
173 | return next_fail(ptr, end); |
174 | } |
175 | /* |
176 | [paraphrased from wikipedia] |
177 | Take the high surrogate and subtract 0xD800, then multiply by 0x400. |
178 | Take the low surrogate and subtract 0xDC00. Add these two results |
179 | together, and finally add 0x10000 to get the final decoded codepoint. |
180 | |
181 | unicode = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000 |
182 | unicode = (high * 0x400) - (0xD800 * 0x400) + low - 0xDC00 + 0x10000 |
183 | unicode = (high << 10) - (0xD800 << 10) + low - 0xDC00 + 0x10000 |
184 | unicode = (high << 10) + low - ((0xD800 << 10) + 0xDC00 - 0x10000) |
185 | */ |
186 | result = (result << 10) + (SkUnichar)low - ((0xD800 << 10) + 0xDC00 - 0x10000); |
187 | } |
188 | *ptr = src; |
189 | return result; |
190 | } |
191 | |
192 | SkUnichar SkUTF::NextUTF32(const int32_t** ptr, const int32_t* end) { |
193 | if (!ptr || !end ) { |
194 | return -1; |
195 | } |
196 | const int32_t* s = *ptr; |
197 | if (!s || s + 1 > end || !is_align4(intptr_t(s))) { |
198 | return next_fail(ptr, end); |
199 | } |
200 | int32_t value = *s; |
201 | const uint32_t kInvalidUnicharMask = 0xFF000000; // unichar fits in 24 bits |
202 | if (value & kInvalidUnicharMask) { |
203 | return next_fail(ptr, end); |
204 | } |
205 | *ptr = s + 1; |
206 | return value; |
207 | } |
208 | |
209 | size_t SkUTF::ToUTF8(SkUnichar uni, char utf8[SkUTF::kMaxBytesInUTF8Sequence]) { |
210 | if ((uint32_t)uni > 0x10FFFF) { |
211 | return 0; |
212 | } |
213 | if (uni <= 127) { |
214 | if (utf8) { |
215 | *utf8 = (char)uni; |
216 | } |
217 | return 1; |
218 | } |
219 | char tmp[4]; |
220 | char* p = tmp; |
221 | size_t count = 1; |
222 | while (uni > 0x7F >> count) { |
223 | *p++ = (char)(0x80 | (uni & 0x3F)); |
224 | uni >>= 6; |
225 | count += 1; |
226 | } |
227 | if (utf8) { |
228 | p = tmp; |
229 | utf8 += count; |
230 | while (p < tmp + count - 1) { |
231 | *--utf8 = *p++; |
232 | } |
233 | *--utf8 = (char)(~(0xFF >> count) | uni); |
234 | } |
235 | return count; |
236 | } |
237 | |
238 | size_t SkUTF::ToUTF16(SkUnichar uni, uint16_t utf16[2]) { |
239 | if ((uint32_t)uni > 0x10FFFF) { |
240 | return 0; |
241 | } |
242 | int = (uni > 0xFFFF); |
243 | if (utf16) { |
244 | if (extra) { |
245 | utf16[0] = (uint16_t)((0xD800 - 64) + (uni >> 10)); |
246 | utf16[1] = (uint16_t)(0xDC00 | (uni & 0x3FF)); |
247 | } else { |
248 | utf16[0] = (uint16_t)uni; |
249 | } |
250 | } |
251 | return 1 + extra; |
252 | } |
253 | |
254 | int SkUTF::UTF8ToUTF16(uint16_t dst[], int dstCapacity, const char src[], size_t srcByteLength) { |
255 | if (!dst) { |
256 | dstCapacity = 0; |
257 | } |
258 | |
259 | int dstLength = 0; |
260 | uint16_t* endDst = dst + dstCapacity; |
261 | const char* endSrc = src + srcByteLength; |
262 | while (src < endSrc) { |
263 | SkUnichar uni = NextUTF8(&src, endSrc); |
264 | if (uni < 0) { |
265 | return -1; |
266 | } |
267 | |
268 | uint16_t utf16[2]; |
269 | size_t count = ToUTF16(uni, utf16); |
270 | if (count == 0) { |
271 | return -1; |
272 | } |
273 | dstLength += count; |
274 | |
275 | if (dst) { |
276 | uint16_t* elems = utf16; |
277 | while (dst < endDst && count > 0) { |
278 | *dst++ = *elems++; |
279 | count -= 1; |
280 | } |
281 | } |
282 | } |
283 | return dstLength; |
284 | } |
285 | |
286 | |
287 | |