| 1 | // Scintilla source code edit control |
| 2 | /** @file UniConversion.h |
| 3 | ** Functions to handle UTF-8 and UTF-16 strings. |
| 4 | **/ |
| 5 | // Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org> |
| 6 | // The License.txt file describes the conditions under which this software may be distributed. |
| 7 | |
| 8 | #ifndef UNICONVERSION_H |
| 9 | #define UNICONVERSION_H |
| 10 | |
| 11 | namespace Scintilla::Internal { |
| 12 | |
// Upper bound on the number of bytes in one UTF-8 encoded character.
constexpr int UTF8MaxBytes = 4;

// U+FFFD REPLACEMENT CHARACTER.
// NOTE(review): presumably substituted for undecodable input - confirm at the use sites.
constexpr int unicodeReplacementChar = 0xFFFD;
| 16 | |
// Conversions between UTF-8, UTF-16 and UTF-32 text.
// Only declarations are visible in this header: the group headings below are
// inferred from the names and signatures and should be confirmed against the
// definitions.

// Measuring: presumably the size of the text once expressed in the named encoding.
size_t UTF8Length(std::wstring_view wsv) noexcept;
size_t UTF16Length(std::string_view svu8) noexcept;
size_t UTF32Length(std::string_view svu8) noexcept;
size_t UTF8PositionFromUTF16Position(std::string_view u8Text, size_t positionUTF16) noexcept;

// Converting into caller-supplied buffers.
// UTF16FromUTF8 and UTF32FromUTF8 are the only two here not declared noexcept.
void UTF8FromUTF16(std::wstring_view wsv, char *putf, size_t len) noexcept;
void UTF8FromUTF32Character(int uch, char *putf) noexcept;
size_t UTF16FromUTF8(std::string_view svu8, wchar_t *tbuf, size_t tlen);
size_t UTF32FromUTF8(std::string_view svu8, unsigned int *tbuf, size_t tlen);
unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept;

// Converting into owning strings.
// WStringFromUTF8 copes with wchar_t being either 2 or 4 bytes wide, so it
// can be used on both Windows and Unix.
std::wstring WStringFromUTF8(std::string_view svu8);
std::string FixInvalidUTF8(const std::string &text);

// Validation.
bool UTF8IsValid(std::string_view svu8) noexcept;
| 31 | |
// Table indexed by a byte value; UnicodeFromUTF8 uses the entry for a lead byte
// as the number of bytes in the sequence. Defined outside this header.
extern const unsigned char UTF8BytesOfLead[256];

// Decode the UTF-8 sequence starting at us into a code point.
// The sequence width comes from UTF8BytesOfLead[us[0]]; any width other than
// 1, 2 or 3 is decoded as a 4 byte sequence.
// Nothing is validated here: trail bytes are only masked with 0x3F and the
// caller must make sure that every byte of the sequence is readable.
inline int UnicodeFromUTF8(const unsigned char *us) noexcept {
	const unsigned char lead = us[0];
	const int width = UTF8BytesOfLead[lead];
	if (width == 1) {
		// Single byte: the value is the byte itself.
		return lead;
	}
	if (width == 2) {
		// 5 payload bits from the lead byte, 6 from the trail byte.
		return ((lead & 0x1F) << 6) + (us[1] & 0x3F);
	}
	if (width == 3) {
		// 4 payload bits from the lead byte, 6 from each trail byte.
		return ((lead & 0xF) << 12) + ((us[1] & 0x3F) << 6) + (us[2] & 0x3F);
	}
	// 3 payload bits from the lead byte, 6 from each of the three trail bytes.
	return ((lead & 0x7) << 18) + ((us[1] & 0x3F) << 12) + ((us[2] & 0x3F) << 6) + (us[3] & 0x3F);
}
| 46 | |
// True when ch has the bit pattern 10xxxxxx, that is 0x80 <= ch < 0xC0.
inline constexpr bool UTF8IsTrailByte(unsigned char ch) noexcept {
	// Keep only the top two bits and compare against the 10 prefix.
	return (ch & 0xC0) == 0x80;
}
| 50 | |
// True when the top bit of ch is clear, that is ch < 0x80.
inline constexpr bool UTF8IsAscii(unsigned char ch) noexcept {
	return (ch & 0x80) == 0;
}
| 54 | |
// Overload for plain char: the value is converted to unsigned char before the
// comparison so the result does not depend on whether char is signed.
inline constexpr bool UTF8IsAscii(char ch) noexcept {
	return static_cast<unsigned char>(ch) < 0x80;
}
| 59 | |
// Masks for the value returned by UTF8Classify.
// NOTE(review): presumably the low three bits hold the byte width and 0x8 is the
// invalid flag - confirm against the UTF8Classify definition.
enum { UTF8MaskWidth=0x7, UTF8MaskInvalid=0x8 };

// Classify the bytes at us; defined outside this header.
int UTF8Classify(const unsigned char *us, size_t len) noexcept;

// Convenience overload: forwards the bytes viewed by sv to the pointer and
// length version above.
inline int UTF8Classify(std::string_view sv) noexcept {
	const unsigned char *bytes = reinterpret_cast<const unsigned char *>(sv.data());
	return UTF8Classify(bytes, sv.length());
}

// Like UTF8Classify, except that invalid bytes are reported with a length of 1
// rather than by setting the invalid flag.
// NOTE(review): len is int here but size_t for UTF8Classify.
int UTF8DrawBytes(const unsigned char *us, int len) noexcept;
| 69 | |
// U+2028 LINE SEPARATOR is encoded as \xe2\x80\xa8
// U+2029 PARAGRAPH SEPARATOR is encoded as \xe2\x80\xa9
constexpr int UTF8SeparatorLength = 3;

// True when the three bytes at us encode U+2028 or U+2029.
// Bytes are examined in order and a later byte is only read once the earlier
// ones have matched.
inline bool UTF8IsSeparator(const unsigned char *us) noexcept {
	if ((us[0] != 0xe2) || (us[1] != 0x80)) {
		return false;
	}
	const unsigned char last = us[2];
	return (last == 0xa8) || (last == 0xa9);
}
| 76 | |
// U+0085 NEL (next line) is encoded as \xc2\x85
constexpr int UTF8NELLength = 2;

// True when the two bytes at us encode U+0085.
// The second byte is only read when the first one matches.
inline bool UTF8IsNEL(const unsigned char *us) noexcept {
	if (us[0] != 0xc2) {
		return false;
	}
	return us[1] == 0x85;
}
| 82 | |
// Do the three bytes form a multi-byte UTF-8 line end?
// All three are compared against U+2028 / U+2029 (\xe2\x80\xa8, \xe2\x80\xa9);
// for NEL (\xc2\x85) only ch1 and ch2 are examined and ch0 is ignored.
constexpr bool UTF8IsMultibyteLineEnd(unsigned char ch0, unsigned char ch1, unsigned char ch2) noexcept {
	const bool isSeparator = (ch0 == 0xe2) && (ch1 == 0x80) && ((ch2 == 0xa8) || (ch2 == 0xa9));
	const bool isNEL = (ch1 == 0xc2) && (ch2 == 0x85);
	return isSeparator || isNEL;
}
| 89 | |
// UTF-16 surrogate code unit ranges and the first code point beyond the
// Basic Multilingual Plane.
enum { SURROGATE_LEAD_FIRST = 0xD800 };
enum { SURROGATE_LEAD_LAST = 0xDBFF };
enum { SURROGATE_TRAIL_FIRST = 0xDC00 };
enum { SURROGATE_TRAIL_LAST = 0xDFFF };
enum { SUPPLEMENTAL_PLANE_FIRST = 0x10000 };

// Number of UTF-16 code units in the character that starts with uch:
// 2 when uch is a lead surrogate, otherwise 1.
inline constexpr unsigned int UTF16CharLength(wchar_t uch) noexcept {
	if ((uch >= SURROGATE_LEAD_FIRST) && (uch <= SURROGATE_LEAD_LAST)) {
		return 2;
	}
	return 1;
}
| 99 | |
// Number of UTF-16 code units for a character that takes byteCount bytes in
// UTF-8: 2 when byteCount is 4 or more, otherwise 1.
inline constexpr unsigned int UTF16LengthFromUTF8ByteCount(unsigned int byteCount) noexcept {
	if (byteCount >= 4) {
		return 2;
	}
	return 1;
}
| 103 | |
| 104 | } |
| 105 | |
| 106 | #endif |
| 107 | |