unicode.cc source code [engine/third_party/dart/runtime/platform/unicode.cc]

1	// Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
2	// for details. All rights reserved. Use of this source code is governed by a
3	// BSD-style license that can be found in the LICENSE file.
4
5	#include "platform/unicode.h"
6
7	#include "platform/allocation.h"
8	#include "platform/globals.h"
9	#include "platform/syslog.h"
10
11	namespace dart {
12
13	// clang-format off
14	const int8_t Utf8::kTrailBytes[`256`] = {
15	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
16	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
17	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
18	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
19	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
20	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
21	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
22	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
23	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
24	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
25	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
26	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
27	`2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`,
28	`2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`, `2`,
29	`3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`, `3`,
30	`4`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, `5`, `5`, `5`, `5`, `6`, `6`, `0`, `0`
31	};
32	// clang-format on
33
34	const uint32_t Utf8::kMagicBits[`7`] = {`0`, // Padding.
35	`0x00000000`, `0x00003080`, `0x000E2080`,
36	`0x03C82080`, `0xFA082080`, `0x82082080`};
37
38	// Minimum values of code points used to check shortest form.
39	const uint32_t Utf8::kOverlongMinimum[`7`] = {`0`, // Padding.
40	`0x0`, `0x80`, `0x800`,
41	`0x10000`, `0xFFFFFFFF`, `0xFFFFFFFF`};
42
43	// Returns the most restricted coding form in which the sequence of utf8
44	// characters in 'utf8_array' can be represented in, and the number of
45	// code units needed in that form.
46	intptr_t Utf8::CodeUnitCount(const uint8_t* utf8_array,
47	intptr_t array_len,
48	Type* type) {
49	intptr_t len = `0`;
50	Type char_type = kLatin1;
51	for (intptr_t i = `0`; i < array_len; i++) {
52	uint8_t code_unit = utf8_array[i];
53	if (!IsTrailByte(code_unit)) {
54	++len;
55	if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF
56	if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000
57	char_type = kSupplementary;
58	++len;
59	} else if (char_type == kLatin1) {
60	char_type = kBMP;
61	}
62	}
63	}
64	}
65	*type = char_type;
66	return len;
67	}
68
69	// Returns true if str is a valid NUL-terminated UTF-8 string.
70	bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) {
71	intptr_t i = `0`;
72	while (i < array_len) {
73	uint32_t ch = utf8_array[i] & `0xFF`;
74	intptr_t j = `1`;
75	if (ch >= `0x80`) {
76	int8_t num_trail_bytes = kTrailBytes[ch];
77	bool is_malformed = false;
78	for (; j < num_trail_bytes; ++j) {
79	if ((i + j) < array_len) {
80	uint8_t code_unit = utf8_array[i + j];
81	is_malformed \|= !IsTrailByte(code_unit);
82	ch = (ch << `6`) + code_unit;
83	} else {
84	return false;
85	}
86	}
87	ch -= kMagicBits[num_trail_bytes];
88	if (!((is_malformed == false) && (j == num_trail_bytes) &&
89	!Utf::IsOutOfRange(ch) && !IsNonShortestForm(ch, j))) {
90	return false;
91	}
92	}
93	i += j;
94	}
95	return true;
96	}
97
98	intptr_t Utf8::Length(int32_t ch) {
99	if (ch <= kMaxOneByteChar) {
100	return `1`;
101	} else if (ch <= kMaxTwoByteChar) {
102	return `2`;
103	} else if (ch <= kMaxThreeByteChar) {
104	return `3`;
105	}
106	ASSERT(ch <= kMaxFourByteChar);
107	return `4`;
108	}
109
110	intptr_t Utf8::Encode(int32_t ch, char* dst) {
111	static const int kMask = ~(`1` << `6`);
112	if (ch <= kMaxOneByteChar) {
113	dst[`0`] = ch;
114	return `1`;
115	}
116	if (ch <= kMaxTwoByteChar) {
117	dst[`0`] = `0xC0` \| (ch >> `6`);
118	dst[`1`] = `0x80` \| (ch & kMask);
119	return `2`;
120	}
121	if (ch <= kMaxThreeByteChar) {
122	dst[`0`] = `0xE0` \| (ch >> `12`);
123	dst[`1`] = `0x80` \| ((ch >> `6`) & kMask);
124	dst[`2`] = `0x80` \| (ch & kMask);
125	return `3`;
126	}
127	ASSERT(ch <= kMaxFourByteChar);
128	dst[`0`] = `0xF0` \| (ch >> `18`);
129	dst[`1`] = `0x80` \| ((ch >> `12`) & kMask);
130	dst[`2`] = `0x80` \| ((ch >> `6`) & kMask);
131	dst[`3`] = `0x80` \| (ch & kMask);
132	return `4`;
133	}
134
135	intptr_t Utf8::Decode(const uint8_t* utf8_array,
136	intptr_t array_len,
137	int32_t* dst) {
138	uint32_t ch = utf8_array[`0`] & `0xFF`;
139	intptr_t i = `1`;
140	if (ch >= `0x80`) {
141	intptr_t num_trail_bytes = kTrailBytes[ch];
142	bool is_malformed = false;
143	for (; i < num_trail_bytes; ++i) {
144	if (i < array_len) {
145	uint8_t code_unit = utf8_array[i];
146	is_malformed \|= !IsTrailByte(code_unit);
147	ch = (ch << `6`) + code_unit;
148	} else {
149	*dst = -`1`;
150	return `0`;
151	}
152	}
153	ch -= kMagicBits[num_trail_bytes];
154	if (!((is_malformed == false) && (i == num_trail_bytes) &&
155	!Utf::IsOutOfRange(ch) && !IsNonShortestForm(ch, i))) {
156	*dst = -`1`;
157	return `0`;
158	}
159	}
160	*dst = ch;
161	return i;
162	}
163	intptr_t Utf8::ReportInvalidByte(const uint8_t* utf8_array,
164	intptr_t array_len,
165	intptr_t len) {
166	intptr_t i = `0`;
167	intptr_t j = `0`;
168	intptr_t num_bytes;
169	for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
170	int32_t ch;
171	bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]);
172	num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
173	if (ch == -`1`) {
174	break; // Invalid input.
175	}
176	if (is_supplementary) {
177	j = j + `1`;
178	}
179	}
180	#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
181	// Remain silent while libFuzzer is active, since
182	// the output only slows down the in-process fuzzing.
183	#else
184	Syslog::PrintErr("Invalid UTF8 sequence encountered, ");
185	for (intptr_t idx = `0`; idx < `10` && (i + idx) < array_len; idx++) {
186	Syslog::PrintErr("(Error Code: %X + idx: %" Pd " )", utf8_array[idx + i],
187	(idx + i));
188	}
189	Syslog::PrintErr("\n");
190	#endif
191	return i;
192	}
193
194	bool Utf8::DecodeToLatin1(const uint8_t* utf8_array,
195	intptr_t array_len,
196	uint8_t* dst,
197	intptr_t len) {
198	intptr_t i = `0`;
199	intptr_t j = `0`;
200	intptr_t num_bytes;
201	for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
202	int32_t ch;
203	ASSERT(IsLatin1SequenceStart(utf8_array[i]));
204	num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
205	if (ch == -`1`) {
206	return false; // Invalid input.
207	}
208	ASSERT(Utf::IsLatin1(ch));
209	dst[j] = ch;
210	}
211	if ((i < array_len) && (j == len)) {
212	return false; // Output overflow.
213	}
214	return true; // Success.
215	}
216
217	bool Utf8::DecodeToUTF16(const uint8_t* utf8_array,
218	intptr_t array_len,
219	uint16_t* dst,
220	intptr_t len) {
221	intptr_t i = `0`;
222	intptr_t j = `0`;
223	intptr_t num_bytes;
224	for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
225	int32_t ch;
226	bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]);
227	num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
228	if (ch == -`1`) {
229	return false; // Invalid input.
230	}
231	if (is_supplementary) {
232	if (j == (len - `1`)) return false; // Output overflow.
233	Utf16::Encode(ch, &dst[j]);
234	j = j + `1`;
235	} else {
236	dst[j] = ch;
237	}
238	}
239	if ((i < array_len) && (j == len)) {
240	return false; // Output overflow.
241	}
242	return true; // Success.
243	}
244
245	bool Utf8::DecodeToUTF32(const uint8_t* utf8_array,
246	intptr_t array_len,
247	int32_t* dst,
248	intptr_t len) {
249	intptr_t i = `0`;
250	intptr_t j = `0`;
251	intptr_t num_bytes;
252	for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
253	int32_t ch;
254	num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
255	if (ch == -`1`) {
256	return false; // Invalid input.
257	}
258	dst[j] = ch;
259	}
260	if ((i < array_len) && (j == len)) {
261	return false; // Output overflow.
262	}
263	return true; // Success.
264	}
265
266	bool Utf8::DecodeCStringToUTF32(const char* str, int32_t* dst, intptr_t len) {
267	ASSERT(str != NULL);
268	intptr_t array_len = strlen(str);
269	const uint8_t* utf8_array = reinterpret_cast<const uint8_t*>(str);
270	return Utf8::DecodeToUTF32(utf8_array, array_len, dst, len);
271	}
272
273	void Utf16::Encode(int32_t codepoint, uint16_t* dst) {
274	ASSERT(codepoint > Utf16::kMaxCodeUnit);
275	ASSERT(dst != NULL);
276	dst[`0`] = (Utf16::kLeadSurrogateOffset + (codepoint >> `10`));
277	dst[`1`] = (`0xDC00` + (codepoint & `0x3FF`));
278	}
279
280	} // namespace dart
281

Browse the source code of engine/third_party/dart/runtime/platform/unicode.cc