UniConversion.h source code [DeepinIDE/3rdparty/unioncode-scintilla515/scintilla/src/UniConversion.h]

// Scintilla source code edit control
/** @file UniConversion.h
 ** Functions to handle UTF-8 and UTF-16 strings.
 **/
// Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
// The License.txt file describes the conditions under which this software may be distributed.

#ifndef UNICONVERSION_H
#define UNICONVERSION_H

// This header has no #include lines of its own: size_t, std::string,
// std::wstring, std::string_view and std::wstring_view must already be
// declared by the including file.

namespace Scintilla::Internal {

// Largest number of bytes UnicodeFromUTF8 reads for a single character.
constexpr int UTF8MaxBytes = 4;

// U+FFFD REPLACEMENT CHARACTER.
constexpr int unicodeReplacementChar = 0xFFFD;

// Conversion and measurement routines. Only the declarations live here;
// the definitions are not part of this header.
size_t UTF8Length(std::wstring_view wsv) noexcept;
size_t UTF8PositionFromUTF16Position(std::string_view u8Text, size_t positionUTF16) noexcept;
void UTF8FromUTF16(std::wstring_view wsv, char *putf, size_t len) noexcept;
void UTF8FromUTF32Character(int uch, char *putf) noexcept;
size_t UTF16Length(std::string_view svu8) noexcept;
size_t UTF16FromUTF8(std::string_view svu8, wchar_t *tbuf, size_t tlen);
size_t UTF32Length(std::string_view svu8) noexcept;
size_t UTF32FromUTF8(std::string_view svu8, unsigned int *tbuf, size_t tlen);
// WStringFromUTF8 does the right thing when wchar_t is 2 or 4 bytes so
// works on both Windows and Unix.
std::wstring WStringFromUTF8(std::string_view svu8);
unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept;
bool UTF8IsValid(std::string_view svu8) noexcept;
std::string FixInvalidUTF8(const std::string &text);

// Table indexed by the first byte of a sequence. UnicodeFromUTF8 switches on
// the looked-up value to decide how many bytes to decode.
extern const unsigned char UTF8BytesOfLead[256];

// Decode the character starting at us[0] and return its code point.
// No validation is performed: the trail bytes are masked and combined as-is,
// and any table value other than 1, 2 or 3 is decoded as a 4 byte sequence.
// The caller must make sure that as many bytes as the lead byte implies
// are readable at us.
inline int UnicodeFromUTF8(const unsigned char *us) noexcept {
	switch (UTF8BytesOfLead[us[0]]) {
	case 1:
		return us[0];
	case 2:
		return ((us[0] & 0x1F) << 6) + (us[1] & 0x3F);
	case 3:
		return ((us[0] & 0xF) << 12) + ((us[1] & 0x3F) << 6) + (us[2] & 0x3F);
	default:
		return ((us[0] & 0x7) << 18) + ((us[1] & 0x3F) << 12) + ((us[2] & 0x3F) << 6) + (us[3] & 0x3F);
	}
}

// True for bytes in the range 0x80..0xBF, the continuation bytes of UTF-8.
inline constexpr bool UTF8IsTrailByte(unsigned char ch) noexcept {
	return (ch >= 0x80) && (ch < 0xc0);
}

// True for bytes below 0x80.
inline constexpr bool UTF8IsAscii(unsigned char ch) noexcept {
	return ch < 0x80;
}

// Overload for plain char: the value is converted to unsigned char first so
// the result does not depend on whether char is signed.
inline constexpr bool UTF8IsAscii(char ch) noexcept {
	const unsigned char uch = ch;
	return uch < 0x80;
}

// Bit masks applied to the value returned by UTF8Classify.
enum { UTF8MaskWidth=0x7, UTF8MaskInvalid=0x8 };
int UTF8Classify(const unsigned char *us, size_t len) noexcept;
// Convenience overload forwarding the bytes and length of a string_view.
inline int UTF8Classify(std::string_view sv) noexcept {
	return UTF8Classify(reinterpret_cast<const unsigned char *>(sv.data()), sv.length());
}

// Similar to UTF8Classify but returns a length of 1 for invalid bytes
// instead of setting the invalid flag
int UTF8DrawBytes(const unsigned char *us, int len) noexcept;

// Line separator is U+2028 \xe2\x80\xa8
// Paragraph separator is U+2029 \xe2\x80\xa9
constexpr int UTF8SeparatorLength = 3;
// True when the 3 bytes at us encode U+2028 or U+2029.
// Reads us[0] to us[2] unless an earlier byte already fails to match.
inline bool UTF8IsSeparator(const unsigned char *us) noexcept {
	return (us[0] == 0xe2) && (us[1] == 0x80) && ((us[2] == 0xa8) || (us[2] == 0xa9));
}

// NEL is U+0085 \xc2\x85
constexpr int UTF8NELLength = 2;
// True when the 2 bytes at us encode U+0085.
inline bool UTF8IsNEL(const unsigned char *us) noexcept {
	return (us[0] == 0xc2) && (us[1] == 0x85);
}

// Is the sequence of 3 char a UTF-8 line end? Only the last two char are tested for a NEL.
constexpr bool UTF8IsMultibyteLineEnd(unsigned char ch0, unsigned char ch1, unsigned char ch2) noexcept {
	return
		((ch0 == 0xe2) && (ch1 == 0x80) && ((ch2 == 0xa8) || (ch2 == 0xa9))) ||
		((ch1 == 0xc2) && (ch2 == 0x85));
}

// Boundaries of the UTF-16 surrogate ranges and the first code point that
// needs a surrogate pair.
enum { SURROGATE_LEAD_FIRST = 0xD800 };
enum { SURROGATE_LEAD_LAST = 0xDBFF };
enum { SURROGATE_TRAIL_FIRST = 0xDC00 };
enum { SURROGATE_TRAIL_LAST = 0xDFFF };
enum { SUPPLEMENTAL_PLANE_FIRST = 0x10000 };

// Number of UTF-16 code units in the character that starts with uch:
// 2 when uch is a lead surrogate, otherwise 1.
inline constexpr unsigned int UTF16CharLength(wchar_t uch) noexcept {
	return ((uch >= SURROGATE_LEAD_FIRST) && (uch <= SURROGATE_LEAD_LAST)) ? 2 : 1;
}

// Number of UTF-16 code units for a character whose UTF-8 form occupies
// byteCount bytes: 1 for fewer than 4 bytes, otherwise 2.
inline constexpr unsigned int UTF16LengthFromUTF8ByteCount(unsigned int byteCount) noexcept {
	return (byteCount < 4) ? 1 : 2;
}

}

#endif

Browse the source code of DeepinIDE/3rdparty/unioncode-scintilla515/scintilla/src/UniConversion.h