UniConversion.cxx source code [DeepinIDE/3rdparty/unioncode-scintilla515/scintilla/src/UniConversion.cxx]

1	// Scintilla source code edit control
/** @file UniConversion.cxx
 ** Functions to handle UTF-8 and UTF-16 strings.
 **/
5	// Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
6	// The License.txt file describes the conditions under which this software may be distributed.
7
8	#include <cstdlib>
9
10	#include <stdexcept>
11	#include <string>
12	#include <string_view>
13
14	#include "UniConversion.h"
15
16	namespace Scintilla::Internal {
17
18	size_t UTF8Length(std::wstring_view wsv) noexcept {
19	size_t len = `0`;
20	for (size_t i = `0`; i < wsv.length() && wsv [i];) {
21	const unsigned int uch = wsv [i];
22	if (uch < `0x80`) {
23	len++;
24	} else if (uch < `0x800`) {
25	len += `2`;
26	} else if ((uch >= SURROGATE_LEAD_FIRST) &&
27	(uch <= SURROGATE_TRAIL_LAST)) {
28	len += `4`;
29	i++;
30	} else {
31	len += `3`;
32	}
33	i++;
34	}
35	return len;
36	}
37
38	size_t UTF8PositionFromUTF16Position(std::string_view u8Text, size_t positionUTF16) noexcept {
39	size_t positionUTF8 = `0`;
40	for (size_t lengthUTF16 = `0`; (positionUTF8 < u8Text.length()) && (lengthUTF16 < positionUTF16);) {
41	const unsigned char uch = u8Text [positionUTF8];
42	const unsigned int byteCount = UTF8BytesOfLead[uch];
43	lengthUTF16 += UTF16LengthFromUTF8ByteCount(byteCount);
44	positionUTF8 += byteCount;
45	}
46
47	return positionUTF8;
48	}
49
50	void UTF8FromUTF16(std::wstring_view wsv, char putf, size_t len) noexcept* {
51	size_t k = `0`;
52	for (size_t i = `0`; i < wsv.length() && wsv [i];) {
53	const unsigned int uch = wsv [i];
54	if (uch < `0x80`) {
55	putf[k++] = static_cast<char>(uch);
56	} else if (uch < `0x800`) {
57	putf[k++] = static_cast<char>(`0xC0` \| (uch >> `6`));
58	putf[k++] = static_cast<char>(`0x80` \| (uch & `0x3f`));
59	} else if ((uch >= SURROGATE_LEAD_FIRST) &&
60	(uch <= SURROGATE_TRAIL_LAST)) {
61	// Half a surrogate pair
62	i++;
63	const unsigned int xch = `0x10000` + ((uch & `0x3ff`) << `10`) + (wsv [i] & `0x3ff`);
64	putf[k++] = static_cast<char>(`0xF0` \| (xch >> `18`));
65	putf[k++] = static_cast<char>(`0x80` \| ((xch >> `12`) & `0x3f`));
66	putf[k++] = static_cast<char>(`0x80` \| ((xch >> `6`) & `0x3f`));
67	putf[k++] = static_cast<char>(`0x80` \| (xch & `0x3f`));
68	} else {
69	putf[k++] = static_cast<char>(`0xE0` \| (uch >> `12`));
70	putf[k++] = static_cast<char>(`0x80` \| ((uch >> `6`) & `0x3f`));
71	putf[k++] = static_cast<char>(`0x80` \| (uch & `0x3f`));
72	}
73	i++;
74	}
75	if (k < len)
76	putf[k] = `'\0'`;
77	}
78
// Encode a single code point as UTF-8 followed by a terminating NUL.
// Between 1 and 4 encoded bytes are written, then the NUL, so putf must have room
// for 5 chars. The value of uch is not validated.
void UTF8FromUTF32Character(int uch, char *putf) noexcept {
	size_t k = 0;
	if (uch < 0x80) {
		putf[k++] = static_cast<char>(uch);
	} else if (uch < 0x800) {
		putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
		putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
	} else if (uch < 0x10000) {
		putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
		putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
		putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
	} else {
		putf[k++] = static_cast<char>(0xF0 | (uch >> 18));
		putf[k++] = static_cast<char>(0x80 | ((uch >> 12) & 0x3f));
		putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
		putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
	}
	putf[k] = '\0';
}
98
99	size_t UTF16Length(std::string_view svu8) noexcept {
100	size_t ulen = `0`;
101	for (size_t i = `0`; i< svu8.length();) {
102	const unsigned char ch = svu8 [i];
103	const unsigned int byteCount = UTF8BytesOfLead[ch];
104	const unsigned int utf16Len = UTF16LengthFromUTF8ByteCount(byteCount);
105	i += byteCount;
106	ulen += (i > svu8.length()) ? `1` : utf16Len;
107	}
108	return ulen;
109	}
110
// Extract the payload carried by a UTF-8 trail byte.
constexpr unsigned char TrailByteValue(unsigned char c) {
	// The top 2 bits are 0b10 to indicate a trail byte.
	// The lower 6 bits contain the value.
	return c & 0b0011'1111;
}
116
117	size_t UTF16FromUTF8(std::string_view svu8, wchar_t *tbuf, size_t tlen) {
118	size_t ui = `0`;
119	for (size_t i = `0`; i < svu8.length();) {
120	unsigned char ch = svu8 [i];
121	const unsigned int byteCount = UTF8BytesOfLead[ch];
122	unsigned int value;
123
124	if (i + byteCount > svu8.length()) {
125	// Trying to read past end but still have space to write
126	if (ui < tlen) {
127	tbuf[ui] = ch;
128	ui++;
129	}
130	break;
131	}
132
133	const size_t outLen = UTF16LengthFromUTF8ByteCount(byteCount);
134	if (ui + outLen > tlen) {
135	throw std::runtime_error ("UTF16FromUTF8: attempted write beyond end");
136	}
137
138	i++;
139	switch (byteCount) {
140	case `1`:
141	tbuf[ui] = ch;
142	break;
143	case `2`:
144	value = (ch & `0x1F`) << `6`;
145	ch = svu8 [i++];
146	value += TrailByteValue(ch);
147	tbuf[ui] = static_cast<wchar_t>(value);
148	break;
149	case `3`:
150	value = (ch & `0xF`) << `12`;
151	ch = svu8 [i++];
152	value += (TrailByteValue(ch) << `6`);
153	ch = svu8 [i++];
154	value += TrailByteValue(ch);
155	tbuf[ui] = static_cast<wchar_t>(value);
156	break;
157	default:
158	// Outside the BMP so need two surrogates
159	value = (ch & `0x7`) << `18`;
160	ch = svu8 [i++];
161	value += TrailByteValue(ch) << `12`;
162	ch = svu8 [i++];
163	value += TrailByteValue(ch) << `6`;
164	ch = svu8 [i++];
165	value += TrailByteValue(ch);
166	tbuf[ui] = static_cast<wchar_t>(((value - `0x10000`) >> `10`) + SURROGATE_LEAD_FIRST);
167	ui++;
168	tbuf[ui] = static_cast<wchar_t>((value & `0x3ff`) + SURROGATE_TRAIL_FIRST);
169	break;
170	}
171	ui++;
172	}
173	return ui;
174	}
175
176	size_t UTF32Length(std::string_view svu8) noexcept {
177	size_t ulen = `0`;
178	for (size_t i = `0`; i < svu8.length();) {
179	const unsigned char ch = svu8 [i];
180	const unsigned int byteCount = UTF8BytesOfLead[ch];
181	i += byteCount;
182	ulen++;
183	}
184	return ulen;
185	}
186
187	size_t UTF32FromUTF8(std::string_view svu8, unsigned int *tbuf, size_t tlen) {
188	size_t ui = `0`;
189	for (size_t i = `0`; i < svu8.length();) {
190	unsigned char ch = svu8 [i];
191	const unsigned int byteCount = UTF8BytesOfLead[ch];
192	unsigned int value;
193
194	if (i + byteCount > svu8.length()) {
195	// Trying to read past end but still have space to write
196	if (ui < tlen) {
197	tbuf[ui] = ch;
198	ui++;
199	}
200	break;
201	}
202
203	if (ui == tlen) {
204	throw std::runtime_error ("UTF32FromUTF8: attempted write beyond end");
205	}
206
207	i++;
208	switch (byteCount) {
209	case `1`:
210	value = ch;
211	break;
212	case `2`:
213	value = (ch & `0x1F`) << `6`;
214	ch = svu8 [i++];
215	value += TrailByteValue(ch);
216	break;
217	case `3`:
218	value = (ch & `0xF`) << `12`;
219	ch = svu8 [i++];
220	value += TrailByteValue(ch) << `6`;
221	ch = svu8 [i++];
222	value += TrailByteValue(ch);
223	break;
224	default:
225	value = (ch & `0x7`) << `18`;
226	ch = svu8 [i++];
227	value += TrailByteValue(ch) << `12`;
228	ch = svu8 [i++];
229	value += TrailByteValue(ch) << `6`;
230	ch = svu8 [i++];
231	value += TrailByteValue(ch);
232	break;
233	}
234	tbuf[ui] = value;
235	ui++;
236	}
237	return ui;
238	}
239
240	std::wstring WStringFromUTF8(std::string_view svu8) {
241	if constexpr (sizeof(wchar_t) == `2`) {
242	const size_t len16 = UTF16Length(svu8);
243	std::wstring ws(len16, `0`);
244	UTF16FromUTF8(svu8, &ws [`0`], len16);
245	return ws;
246	} else {
247	const size_t len32 = UTF32Length(svu8);
248	std::wstring ws(len32, `0`);
249	UTF32FromUTF8(svu8, reinterpret_cast<unsigned int *>(&ws [`0`]), len32);
250	return ws;
251	}
252	}
253
254	unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t tbuf) noexcept* {
255	if (val < SUPPLEMENTAL_PLANE_FIRST) {
256	tbuf[`0`] = static_cast<wchar_t>(val);
257	return `1`;
258	} else {
259	tbuf[`0`] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> `10`) + SURROGATE_LEAD_FIRST);
260	tbuf[`1`] = static_cast<wchar_t>((val & `0x3ff`) + SURROGATE_TRAIL_FIRST);
261	return `2`;
262	}
263	}
264
// Length in bytes of the UTF-8 sequence introduced by each possible byte value.
// Bytes that cannot start a multi-byte sequence map to 1 so that scanning always advances:
// this covers ASCII, trail bytes (80 - BF), C0 and C1, and F5 - FF.
const unsigned char UTF8BytesOfLead[256] = {
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20 - 2F
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30 - 3F
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40 - 4F
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50 - 5F
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60 - 6F
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70 - 7F
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80 - 8F
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 90 - 9F
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A0 - AF
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B0 - BF
	1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0 - CF
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0 - DF
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0 - EF
	4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // F0 - FF
};
283
284	// Return both the width of the first character in the string and a status
285	// saying whether it is valid or invalid.
286	// Most invalid sequences return a width of 1 so are treated as isolated bytes but
287	// the non-characters FFFE, FFFF and FDD0 .. FDEF return 3 or 4 as they can be
288	// reasonably treated as code points in some circumstances. They will, however,
289	// not have associated glyphs.
290	int UTF8Classify(const unsigned char us, size_t len) noexcept* {
291	// For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
292	if (us[`0`] < `0x80`) {
293	// ASCII
294	return `1`;
295	}
296
297	const size_t byteCount = UTF8BytesOfLead[us[`0`]];
298	if (byteCount == `1` \|\| byteCount > len) {
299	// Invalid lead byte
300	return UTF8MaskInvalid \| `1`;
301	}
302
303	if (!UTF8IsTrailByte(us[`1`])) {
304	// Invalid trail byte
305	return UTF8MaskInvalid \| `1`;
306	}
307
308	switch (byteCount) {
309	case `2`:
310	return `2`;
311
312	case `3`:
313	if (UTF8IsTrailByte(us[`2`])) {
314	if ((*us == `0xe0`) && ((us[`1`] & `0xe0`) == `0x80`)) {
315	// Overlong
316	return UTF8MaskInvalid \| `1`;
317	}
318	if ((*us == `0xed`) && ((us[`1`] & `0xe0`) == `0xa0`)) {
319	// Surrogate
320	return UTF8MaskInvalid \| `1`;
321	}
322	if ((*us == `0xef`) && (us[`1`] == `0xbf`) && (us[`2`] == `0xbe`)) {
323	// U+FFFE non-character - 3 bytes long
324	return UTF8MaskInvalid \| `3`;
325	}
326	if ((*us == `0xef`) && (us[`1`] == `0xbf`) && (us[`2`] == `0xbf`)) {
327	// U+FFFF non-character - 3 bytes long
328	return UTF8MaskInvalid \| `3`;
329	}
330	if ((*us == `0xef`) && (us[`1`] == `0xb7`) && (((us[`2`] & `0xf0`) == `0x90`) \|\| ((us[`2`] & `0xf0`) == `0xa0`))) {
331	// U+FDD0 .. U+FDEF
332	return UTF8MaskInvalid \| `3`;
333	}
334	return `3`;
335	}
336	break;
337
338	default:
339	if (UTF8IsTrailByte(us[`2`]) && UTF8IsTrailByte(us[`3`])) {
340	if (((us[`1`] & `0xf`) == `0xf`) && (us[`2`] == `0xbf`) && ((us[`3`] == `0xbe`) \|\| (us[`3`] == `0xbf`))) {
341	// FFFE or FFFF non-character
342	return UTF8MaskInvalid \| `4`;
343	}
344	if (*us == `0xf4`) {
345	// Check if encoding a value beyond the last Unicode character 10FFFF
346	if (us[`1`] > `0x8f`) {
347	return UTF8MaskInvalid \| `1`;
348	}
349	} else if ((*us == `0xf0`) && ((us[`1`] & `0xf0`) == `0x80`)) {
350	// Overlong
351	return UTF8MaskInvalid \| `1`;
352	}
353	return `4`;
354	}
355	break;
356	}
357
358	return UTF8MaskInvalid \| `1`;
359	}
360
361	int UTF8DrawBytes(const unsigned char us, int* len) noexcept {
362	const int utf8StatusNext = UTF8Classify(us, len);
363	return (utf8StatusNext & UTF8MaskInvalid) ? `1` : (utf8StatusNext & UTF8MaskWidth);
364	}
365
366	bool UTF8IsValid(std::string_view svu8) noexcept {
367	const unsigned char us = reinterpret_cast<const* unsigned char *>(svu8.data());
368	size_t remaining = svu8.length();
369	while (remaining > `0`) {
370	const int utf8Status = UTF8Classify(us, remaining);
371	if (utf8Status & UTF8MaskInvalid) {
372	return false;
373	} else {
374	const int lenChar = utf8Status & UTF8MaskWidth;
375	us += lenChar;
376	remaining -= lenChar;
377	}
378	}
379	return remaining == `0`;
380	}
381
382	// Replace invalid bytes in UTF-8 with the replacement character
383	std::string FixInvalidUTF8(const std::string &text) {
384	std::string result;
385	const char *s = text.c_str();
386	size_t remaining = text.size();
387	while (remaining > `0`) {
388	const int utf8Status = UTF8Classify(reinterpret_cast<const unsigned char *>(s), remaining);
389	if (utf8Status & UTF8MaskInvalid) {
390	// Replacement character 0xFFFD = UTF8:"efbfbd".
391	result.append("\xef\xbf\xbd");
392	s++;
393	remaining--;
394	} else {
395	const size_t len = utf8Status & UTF8MaskWidth;
396	result.append(s, len);
397	s += len;
398	remaining -= len;
399	}
400	}
401	return result;
402	}
403
404	}
405

Browse the source code of DeepinIDE/3rdparty/unioncode-scintilla515/scintilla/src/UniConversion.cxx