SkUTF.cpp source code [Skia/src/utils/SkUTF.cpp]

1	// Copyright 2018 Google LLC.
2	// Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.
3
4	#include "src/utils/SkUTF.h"
5
6	#include <climits>
7
// Shifts |value| left by |shift| bits. The shift is carried out on the
// unsigned representation of |value| and the result is converted back to
// int32_t, so bits may move into (or past) the sign bit without performing a
// left shift on a signed operand.
static constexpr inline int32_t left_shift(int32_t value, int32_t shift) {
    return (int32_t) ((uint32_t) value << shift);
}
11
// Returns true when the lowest bit of |x| is clear, i.e. |x| is a multiple of 2.
template <typename T> static constexpr bool is_align2(T x) { return 0 == (x & 1); }
13
// Returns true when the two lowest bits of |x| are clear, i.e. |x| is a multiple of 4.
template <typename T> static constexpr bool is_align4(T x) { return 0 == (x & 3); }
15
// Returns true when the top six bits of |c| are 110110, i.e. |c| lies in
// [0xD800, 0xDBFF] (a UTF-16 high/leading surrogate).
static constexpr inline bool utf16_is_high_surrogate(uint16_t c) { return (c & 0xFC00) == 0xD800; }
17
// Returns true when the top six bits of |c| are 110111, i.e. |c| lies in
// [0xDC00, 0xDFFF] (a UTF-16 low/trailing surrogate).
static constexpr inline bool utf16_is_low_surrogate(uint16_t c) { return (c & 0xFC00) == 0xDC00; }
19
/** @returns   -1  iff invalid UTF8 byte,
                0  iff UTF8 continuation byte,
                1  iff ASCII byte,
                2  iff leading byte of 2-byte sequence,
                3  iff leading byte of 3-byte sequence, and
                4  iff leading byte of 4-byte sequence.
      I.e.: if return value > 0, then gives length of sequence.
*/
static int utf8_byte_type(uint8_t c) {
    if (c < 0x80) {
        return 1;
    } else if (c < 0xC0) {
        return 0;
    } else if (c >= 0xF5 || (c & 0xFE) == 0xC0) { // "octet values c0, c1, f5 to ff never appear"
        return -1;
    } else {
        // Here c is in [0xC2, 0xF4], so its high nibble is 0xC, 0xD, 0xE or 0xF.
        // 0xE5 == 0b11100101 acts as a packed table of four 2-bit entries
        // (1, 1, 2, 3 from the low end). (c >> 4) << 1 yields a shift of
        // 24, 26, 28 or 30, which moves the matching entry into the low two
        // bits; adding 1 gives the sequence length 2, 2, 3 or 4.
        int value = (((0xe5 << 24) >> ((unsigned)c >> 4 << 1)) & 3) + 1;
        // assert(value >= 2 && value <=4);
        return value;
    }
}
// |type| is a value returned by utf8_byte_type(). Positive values (1..4) are
// the ones that denote the first byte of a sequence; 0 (continuation) and
// -1 (invalid) are rejected.
static bool utf8_type_is_valid_leading_byte(int type) { return type > 0; }
42
43	static bool utf8_byte_is_continuation(uint8_t c) { return utf8_byte_type(c) == `0`; }
44
45	////////////////////////////////////////////////////////////////////////////////
46
47	int SkUTF::CountUTF8(const char* utf8, size_t byteLength) {
48	if (!utf8) {
49	return -`1`;
50	}
51	int count = `0`;
52	const char* stop = utf8 + byteLength;
53	while (utf8 < stop) {
54	int type = utf8_byte_type((const* uint8_t*)utf8);
55	if (!utf8_type_is_valid_leading_byte(type) \|\| utf8 + type > stop) {
56	return -`1`; // Sequence extends beyond end.
57	}
58	while(type-- > `1`) {
59	++utf8;
60	if (!utf8_byte_is_continuation((const* uint8_t*)utf8)) {
61	return -`1`;
62	}
63	}
64	++utf8;
65	++count;
66	}
67	return count;
68	}
69
70	int SkUTF::CountUTF16(const uint16_t* utf16, size_t byteLength) {
71	if (!utf16 \|\| !is_align2(intptr_t(utf16)) \|\| !is_align2(byteLength)) {
72	return -`1`;
73	}
74	const uint16_t* src = (const uint16_t*)utf16;
75	const uint16_t* stop = src + (byteLength >> `1`);
76	int count = `0`;
77	while (src < stop) {
78	unsigned c = *src++;
79	if (utf16_is_low_surrogate(c)) {
80	return -`1`;
81	}
82	if (utf16_is_high_surrogate(c)) {
83	if (src >= stop) {
84	return -`1`;
85	}
86	c = *src++;
87	if (!utf16_is_low_surrogate(c)) {
88	return -`1`;
89	}
90	}
91	count += `1`;
92	}
93	return count;
94	}
95
96	int SkUTF::CountUTF32(const int32_t* utf32, size_t byteLength) {
97	if (!is_align4(intptr_t(utf32)) \|\| !is_align4(byteLength) \|\| byteLength >> `2` > INT_MAX) {
98	return -`1`;
99	}
100	const uint32_t kInvalidUnicharMask = `0xFF000000`; // unichar fits in 24 bits
101	const uint32_t* ptr = (const uint32_t*)utf32;
102	const uint32_t* stop = ptr + (byteLength >> `2`);
103	while (ptr < stop) {
104	if (*ptr & kInvalidUnicharMask) {
105	return -`1`;
106	}
107	ptr += `1`;
108	}
109	return (int)(byteLength >> `2`);
110	}
111
112	template <typename T>
113	static SkUnichar next_fail(const T** ptr, const T* end) {
114	*ptr = end;
115	return -`1`;
116	}
117
118	SkUnichar SkUTF::NextUTF8(const char** ptr, const char* end) {
119	if (!ptr \|\| !end ) {
120	return -`1`;
121	}
122	const uint8_t* p = (const uint8_t)ptr;
123	if (!p \|\| p >= (const uint8_t*)end) {
124	return next_fail(ptr, end);
125	}
126	int c = *p;
127	int hic = c << `24`;
128
129	if (!utf8_type_is_valid_leading_byte(utf8_byte_type(c))) {
130	return next_fail(ptr, end);
131	}
132	if (hic < `0`) {
133	uint32_t mask = (uint32_t)~`0x3F`;
134	hic = left_shift(hic, `1`);
135	do {
136	++p;
137	if (p >= (const uint8_t*)end) {
138	return next_fail(ptr, end);
139	}
140	// check before reading off end of array.
141	uint8_t nextByte = *p;
142	if (!utf8_byte_is_continuation(nextByte)) {
143	return next_fail(ptr, end);
144	}
145	c = (c << `6`) \| (nextByte & `0x3F`);
146	mask <<= `5`;
147	} while ((hic = left_shift(hic, `1`)) < `0`);
148	c &= ~mask;
149	}
150	ptr = (char**)p + `1`;
151	return c;
152	}
153
154	SkUnichar SkUTF::NextUTF16(const uint16_t** ptr, const uint16_t* end) {
155	if (!ptr \|\| !end ) {
156	return -`1`;
157	}
158	const uint16_t* src = *ptr;
159	if (!src \|\| src + `1` > end \|\| !is_align2(intptr_t(src))) {
160	return next_fail(ptr, end);
161	}
162	uint16_t c = *src++;
163	SkUnichar result = c;
164	if (utf16_is_low_surrogate(c)) {
165	return next_fail(ptr, end); // srcPtr should never point at low surrogate.
166	}
167	if (utf16_is_high_surrogate(c)) {
168	if (src + `1` > end) {
169	return next_fail(ptr, end); // Truncated string.
170	}
171	uint16_t low = *src++;
172	if (!utf16_is_low_surrogate(low)) {
173	return next_fail(ptr, end);
174	}
175	/*
176	[paraphrased from wikipedia]
177	Take the high surrogate and subtract 0xD800, then multiply by 0x400.
178	Take the low surrogate and subtract 0xDC00. Add these two results
179	together, and finally add 0x10000 to get the final decoded codepoint.
180
181	unicode = (high - 0xD800) 0x400 + low - 0xDC00 + 0x10000*
182	unicode = (high 0x400) - (0xD800 * 0x400) + low - 0xDC00 + 0x10000*
183	unicode = (high << 10) - (0xD800 << 10) + low - 0xDC00 + 0x10000
184	unicode = (high << 10) + low - ((0xD800 << 10) + 0xDC00 - 0x10000)
185	*/
186	result = (result << `10`) + (SkUnichar)low - ((`0xD800` << `10`) + `0xDC00` - `0x10000`);
187	}
188	*ptr = src;
189	return result;
190	}
191
192	SkUnichar SkUTF::NextUTF32(const int32_t** ptr, const int32_t* end) {
193	if (!ptr \|\| !end ) {
194	return -`1`;
195	}
196	const int32_t* s = *ptr;
197	if (!s \|\| s + `1` > end \|\| !is_align4(intptr_t(s))) {
198	return next_fail(ptr, end);
199	}
200	int32_t value = *s;
201	const uint32_t kInvalidUnicharMask = `0xFF000000`; // unichar fits in 24 bits
202	if (value & kInvalidUnicharMask) {
203	return next_fail(ptr, end);
204	}
205	*ptr = s + `1`;
206	return value;
207	}
208
209	size_t SkUTF::ToUTF8(SkUnichar uni, char utf8[SkUTF::kMaxBytesInUTF8Sequence]) {
210	if ((uint32_t)uni > `0x10FFFF`) {
211	return `0`;
212	}
213	if (uni <= `127`) {
214	if (utf8) {
215	utf8 = (char*)uni;
216	}
217	return `1`;
218	}
219	char tmp[`4`];
220	char* p = tmp;
221	size_t count = `1`;
222	while (uni > `0x7F` >> count) {
223	p++ = (char*)(`0x80` \| (uni & `0x3F`));
224	uni >>= `6`;
225	count += `1`;
226	}
227	if (utf8) {
228	p = tmp;
229	utf8 += count;
230	while (p < tmp + count - `1`) {
231	--utf8 = p++;
232	}
233	--utf8 = (char*)(~(`0xFF` >> count) \| uni);
234	}
235	return count;
236	}
237
238	size_t SkUTF::ToUTF16(SkUnichar uni, uint16_t utf16[`2`]) {
239	if ((uint32_t)uni > `0x10FFFF`) {
240	return `0`;
241	}
242	int extra = (uni > `0xFFFF`);
243	if (utf16) {
244	if (extra) {
245	utf16[`0`] = (uint16_t)((`0xD800` - `64`) + (uni >> `10`));
246	utf16[`1`] = (uint16_t)(`0xDC00` \| (uni & `0x3FF`));
247	} else {
248	utf16[`0`] = (uint16_t)uni;
249	}
250	}
251	return `1` + extra;
252	}
253
254

Browse the source code of Skia/src/utils/SkUTF.cpp