1// Copyright 2018 Google LLC.
2// Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.
3
4#include "src/utils/SkUTF.h"
5
6#include <climits>
7
/**
 *  Shifts `value` left by `shift` bits. The shift is carried out on the unsigned
 *  reinterpretation of `value`, so bits may move into or past the sign bit without
 *  performing a signed left shift; the result is converted back to int32_t.
 */
static constexpr inline int32_t left_shift(int32_t value, int32_t shift) {
    return static_cast<int32_t>(static_cast<uint32_t>(value) << shift);
}
11
/** Returns true when the lowest bit of x is clear, i.e. x is a multiple of 2. */
template <typename T>
static constexpr bool is_align2(T x) {
    return (x & 1) == 0;
}
13
/** Returns true when the two lowest bits of x are clear, i.e. x is a multiple of 4. */
template <typename T>
static constexpr bool is_align4(T x) {
    return (x & 3) == 0;
}
15
/** Returns true for UTF-16 code units in [0xD800, 0xDBFF] (the first unit of a surrogate pair). */
static constexpr inline bool utf16_is_high_surrogate(uint16_t c) {
    return c >= 0xD800 && c <= 0xDBFF;
}
17
/** Returns true for UTF-16 code units in [0xDC00, 0xDFFF] (the second unit of a surrogate pair). */
static constexpr inline bool utf16_is_low_surrogate(uint16_t c) {
    return c >= 0xDC00 && c <= 0xDFFF;
}
19
/**
 *  Classifies one byte of a UTF-8 stream.
 *
 *  @returns -1 if the byte can never appear in UTF-8 (0xC0, 0xC1, 0xF5..0xFF),
 *            0 if it is a continuation byte (0x80..0xBF),
 *            1 if it is an ASCII byte (0x00..0x7F),
 *            2 if it leads a 2-byte sequence (0xC2..0xDF),
 *            3 if it leads a 3-byte sequence (0xE0..0xEF), and
 *            4 if it leads a 4-byte sequence (0xF0..0xF4).
 *  A positive return value is therefore the length of the sequence the byte starts.
 */
static int utf8_byte_type(uint8_t c) {
    if (c <= 0x7F) {
        return 1;
    }
    if (c <= 0xBF) {
        return 0;
    }
    if (c <= 0xC1 || c >= 0xF5) {  // "octet values c0, c1, f5 to ff never appear"
        return -1;
    }
    if (c <= 0xDF) {
        return 2;
    }
    if (c <= 0xEF) {
        return 3;
    }
    return 4;
}
/**
 *  Interprets a value returned by utf8_byte_type(): positive values (1..4) mean the byte can
 *  start a sequence; 0 (continuation) and -1 (invalid) do not.
 */
static bool utf8_type_is_valid_leading_byte(int type) {
    return type >= 1;
}
42
43static bool utf8_byte_is_continuation(uint8_t c) { return utf8_byte_type(c) == 0; }
44
45////////////////////////////////////////////////////////////////////////////////
46
47int SkUTF::CountUTF8(const char* utf8, size_t byteLength) {
48 if (!utf8) {
49 return -1;
50 }
51 int count = 0;
52 const char* stop = utf8 + byteLength;
53 while (utf8 < stop) {
54 int type = utf8_byte_type(*(const uint8_t*)utf8);
55 if (!utf8_type_is_valid_leading_byte(type) || utf8 + type > stop) {
56 return -1; // Sequence extends beyond end.
57 }
58 while(type-- > 1) {
59 ++utf8;
60 if (!utf8_byte_is_continuation(*(const uint8_t*)utf8)) {
61 return -1;
62 }
63 }
64 ++utf8;
65 ++count;
66 }
67 return count;
68}
69
70int SkUTF::CountUTF16(const uint16_t* utf16, size_t byteLength) {
71 if (!utf16 || !is_align2(intptr_t(utf16)) || !is_align2(byteLength)) {
72 return -1;
73 }
74 const uint16_t* src = (const uint16_t*)utf16;
75 const uint16_t* stop = src + (byteLength >> 1);
76 int count = 0;
77 while (src < stop) {
78 unsigned c = *src++;
79 if (utf16_is_low_surrogate(c)) {
80 return -1;
81 }
82 if (utf16_is_high_surrogate(c)) {
83 if (src >= stop) {
84 return -1;
85 }
86 c = *src++;
87 if (!utf16_is_low_surrogate(c)) {
88 return -1;
89 }
90 }
91 count += 1;
92 }
93 return count;
94}
95
96int SkUTF::CountUTF32(const int32_t* utf32, size_t byteLength) {
97 if (!is_align4(intptr_t(utf32)) || !is_align4(byteLength) || byteLength >> 2 > INT_MAX) {
98 return -1;
99 }
100 const uint32_t kInvalidUnicharMask = 0xFF000000; // unichar fits in 24 bits
101 const uint32_t* ptr = (const uint32_t*)utf32;
102 const uint32_t* stop = ptr + (byteLength >> 2);
103 while (ptr < stop) {
104 if (*ptr & kInvalidUnicharMask) {
105 return -1;
106 }
107 ptr += 1;
108 }
109 return (int)(byteLength >> 2);
110}
111
112template <typename T>
113static SkUnichar next_fail(const T** ptr, const T* end) {
114 *ptr = end;
115 return -1;
116}
117
118SkUnichar SkUTF::NextUTF8(const char** ptr, const char* end) {
119 if (!ptr || !end ) {
120 return -1;
121 }
122 const uint8_t* p = (const uint8_t*)*ptr;
123 if (!p || p >= (const uint8_t*)end) {
124 return next_fail(ptr, end);
125 }
126 int c = *p;
127 int hic = c << 24;
128
129 if (!utf8_type_is_valid_leading_byte(utf8_byte_type(c))) {
130 return next_fail(ptr, end);
131 }
132 if (hic < 0) {
133 uint32_t mask = (uint32_t)~0x3F;
134 hic = left_shift(hic, 1);
135 do {
136 ++p;
137 if (p >= (const uint8_t*)end) {
138 return next_fail(ptr, end);
139 }
140 // check before reading off end of array.
141 uint8_t nextByte = *p;
142 if (!utf8_byte_is_continuation(nextByte)) {
143 return next_fail(ptr, end);
144 }
145 c = (c << 6) | (nextByte & 0x3F);
146 mask <<= 5;
147 } while ((hic = left_shift(hic, 1)) < 0);
148 c &= ~mask;
149 }
150 *ptr = (char*)p + 1;
151 return c;
152}
153
154SkUnichar SkUTF::NextUTF16(const uint16_t** ptr, const uint16_t* end) {
155 if (!ptr || !end ) {
156 return -1;
157 }
158 const uint16_t* src = *ptr;
159 if (!src || src + 1 > end || !is_align2(intptr_t(src))) {
160 return next_fail(ptr, end);
161 }
162 uint16_t c = *src++;
163 SkUnichar result = c;
164 if (utf16_is_low_surrogate(c)) {
165 return next_fail(ptr, end); // srcPtr should never point at low surrogate.
166 }
167 if (utf16_is_high_surrogate(c)) {
168 if (src + 1 > end) {
169 return next_fail(ptr, end); // Truncated string.
170 }
171 uint16_t low = *src++;
172 if (!utf16_is_low_surrogate(low)) {
173 return next_fail(ptr, end);
174 }
175 /*
176 [paraphrased from wikipedia]
177 Take the high surrogate and subtract 0xD800, then multiply by 0x400.
178 Take the low surrogate and subtract 0xDC00. Add these two results
179 together, and finally add 0x10000 to get the final decoded codepoint.
180
181 unicode = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000
182 unicode = (high * 0x400) - (0xD800 * 0x400) + low - 0xDC00 + 0x10000
183 unicode = (high << 10) - (0xD800 << 10) + low - 0xDC00 + 0x10000
184 unicode = (high << 10) + low - ((0xD800 << 10) + 0xDC00 - 0x10000)
185 */
186 result = (result << 10) + (SkUnichar)low - ((0xD800 << 10) + 0xDC00 - 0x10000);
187 }
188 *ptr = src;
189 return result;
190}
191
192SkUnichar SkUTF::NextUTF32(const int32_t** ptr, const int32_t* end) {
193 if (!ptr || !end ) {
194 return -1;
195 }
196 const int32_t* s = *ptr;
197 if (!s || s + 1 > end || !is_align4(intptr_t(s))) {
198 return next_fail(ptr, end);
199 }
200 int32_t value = *s;
201 const uint32_t kInvalidUnicharMask = 0xFF000000; // unichar fits in 24 bits
202 if (value & kInvalidUnicharMask) {
203 return next_fail(ptr, end);
204 }
205 *ptr = s + 1;
206 return value;
207}
208
209size_t SkUTF::ToUTF8(SkUnichar uni, char utf8[SkUTF::kMaxBytesInUTF8Sequence]) {
210 if ((uint32_t)uni > 0x10FFFF) {
211 return 0;
212 }
213 if (uni <= 127) {
214 if (utf8) {
215 *utf8 = (char)uni;
216 }
217 return 1;
218 }
219 char tmp[4];
220 char* p = tmp;
221 size_t count = 1;
222 while (uni > 0x7F >> count) {
223 *p++ = (char)(0x80 | (uni & 0x3F));
224 uni >>= 6;
225 count += 1;
226 }
227 if (utf8) {
228 p = tmp;
229 utf8 += count;
230 while (p < tmp + count - 1) {
231 *--utf8 = *p++;
232 }
233 *--utf8 = (char)(~(0xFF >> count) | uni);
234 }
235 return count;
236}
237
238size_t SkUTF::ToUTF16(SkUnichar uni, uint16_t utf16[2]) {
239 if ((uint32_t)uni > 0x10FFFF) {
240 return 0;
241 }
242 int extra = (uni > 0xFFFF);
243 if (utf16) {
244 if (extra) {
245 utf16[0] = (uint16_t)((0xD800 - 64) + (uni >> 10));
246 utf16[1] = (uint16_t)(0xDC00 | (uni & 0x3FF));
247 } else {
248 utf16[0] = (uint16_t)uni;
249 }
250 }
251 return 1 + extra;
252}
253
254