1// Scintilla source code edit control
2/** @file UniConversion.cxx
3 ** Functions to handle UTF-8 and UTF-16 strings.
4 **/
5// Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
6// The License.txt file describes the conditions under which this software may be distributed.
7
8#include <cstdlib>
9
10#include <stdexcept>
11#include <string>
12#include <string_view>
13
14#include "UniConversion.h"
15
16namespace Scintilla::Internal {
17
18size_t UTF8Length(std::wstring_view wsv) noexcept {
19 size_t len = 0;
20 for (size_t i = 0; i < wsv.length() && wsv[i];) {
21 const unsigned int uch = wsv[i];
22 if (uch < 0x80) {
23 len++;
24 } else if (uch < 0x800) {
25 len += 2;
26 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
27 (uch <= SURROGATE_TRAIL_LAST)) {
28 len += 4;
29 i++;
30 } else {
31 len += 3;
32 }
33 i++;
34 }
35 return len;
36}
37
38size_t UTF8PositionFromUTF16Position(std::string_view u8Text, size_t positionUTF16) noexcept {
39 size_t positionUTF8 = 0;
40 for (size_t lengthUTF16 = 0; (positionUTF8 < u8Text.length()) && (lengthUTF16 < positionUTF16);) {
41 const unsigned char uch = u8Text[positionUTF8];
42 const unsigned int byteCount = UTF8BytesOfLead[uch];
43 lengthUTF16 += UTF16LengthFromUTF8ByteCount(byteCount);
44 positionUTF8 += byteCount;
45 }
46
47 return positionUTF8;
48}
49
50void UTF8FromUTF16(std::wstring_view wsv, char *putf, size_t len) noexcept {
51 size_t k = 0;
52 for (size_t i = 0; i < wsv.length() && wsv[i];) {
53 const unsigned int uch = wsv[i];
54 if (uch < 0x80) {
55 putf[k++] = static_cast<char>(uch);
56 } else if (uch < 0x800) {
57 putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
58 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
59 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
60 (uch <= SURROGATE_TRAIL_LAST)) {
61 // Half a surrogate pair
62 i++;
63 const unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (wsv[i] & 0x3ff);
64 putf[k++] = static_cast<char>(0xF0 | (xch >> 18));
65 putf[k++] = static_cast<char>(0x80 | ((xch >> 12) & 0x3f));
66 putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f));
67 putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));
68 } else {
69 putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
70 putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
71 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
72 }
73 i++;
74 }
75 if (k < len)
76 putf[k] = '\0';
77}
78
// Encode a single UTF-32 value as a NUL terminated UTF-8 sequence in putf.
//  uch  - value to encode; anything below 0x80 (including negative values) is
//         stored as a single byte.
//  putf - destination; receives 1 to 4 encoded bytes followed by '\0', so it must
//         have room for 5 chars.
void UTF8FromUTF32Character(int uch, char *putf) noexcept {
	char *out = putf;
	if (uch >= 0x10000) {
		// Four byte form
		*out++ = static_cast<char>(0xF0 | (uch >> 18));
		*out++ = static_cast<char>(0x80 | ((uch >> 12) & 0x3f));
		*out++ = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
		*out++ = static_cast<char>(0x80 | (uch & 0x3f));
	} else if (uch >= 0x800) {
		// Three byte form
		*out++ = static_cast<char>(0xE0 | (uch >> 12));
		*out++ = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
		*out++ = static_cast<char>(0x80 | (uch & 0x3f));
	} else if (uch >= 0x80) {
		// Two byte form
		*out++ = static_cast<char>(0xC0 | (uch >> 6));
		*out++ = static_cast<char>(0x80 | (uch & 0x3f));
	} else {
		// Single byte
		*out++ = static_cast<char>(uch);
	}
	*out = '\0';
}
98
99size_t UTF16Length(std::string_view svu8) noexcept {
100 size_t ulen = 0;
101 for (size_t i = 0; i< svu8.length();) {
102 const unsigned char ch = svu8[i];
103 const unsigned int byteCount = UTF8BytesOfLead[ch];
104 const unsigned int utf16Len = UTF16LengthFromUTF8ByteCount(byteCount);
105 i += byteCount;
106 ulen += (i > svu8.length()) ? 1 : utf16Len;
107 }
108 return ulen;
109}
110
// Extract the payload of a UTF-8 trail byte.
// A trail byte has the form 0b10xxxxxx; only the low 6 bits carry value, so the
// top 2 bits are masked off. The marker bits themselves are not verified.
constexpr unsigned char TrailByteValue(unsigned char c) {
	constexpr unsigned char payloadMask = 0x3F;
	return c & payloadMask;
}
116
117size_t UTF16FromUTF8(std::string_view svu8, wchar_t *tbuf, size_t tlen) {
118 size_t ui = 0;
119 for (size_t i = 0; i < svu8.length();) {
120 unsigned char ch = svu8[i];
121 const unsigned int byteCount = UTF8BytesOfLead[ch];
122 unsigned int value;
123
124 if (i + byteCount > svu8.length()) {
125 // Trying to read past end but still have space to write
126 if (ui < tlen) {
127 tbuf[ui] = ch;
128 ui++;
129 }
130 break;
131 }
132
133 const size_t outLen = UTF16LengthFromUTF8ByteCount(byteCount);
134 if (ui + outLen > tlen) {
135 throw std::runtime_error("UTF16FromUTF8: attempted write beyond end");
136 }
137
138 i++;
139 switch (byteCount) {
140 case 1:
141 tbuf[ui] = ch;
142 break;
143 case 2:
144 value = (ch & 0x1F) << 6;
145 ch = svu8[i++];
146 value += TrailByteValue(ch);
147 tbuf[ui] = static_cast<wchar_t>(value);
148 break;
149 case 3:
150 value = (ch & 0xF) << 12;
151 ch = svu8[i++];
152 value += (TrailByteValue(ch) << 6);
153 ch = svu8[i++];
154 value += TrailByteValue(ch);
155 tbuf[ui] = static_cast<wchar_t>(value);
156 break;
157 default:
158 // Outside the BMP so need two surrogates
159 value = (ch & 0x7) << 18;
160 ch = svu8[i++];
161 value += TrailByteValue(ch) << 12;
162 ch = svu8[i++];
163 value += TrailByteValue(ch) << 6;
164 ch = svu8[i++];
165 value += TrailByteValue(ch);
166 tbuf[ui] = static_cast<wchar_t>(((value - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
167 ui++;
168 tbuf[ui] = static_cast<wchar_t>((value & 0x3ff) + SURROGATE_TRAIL_FIRST);
169 break;
170 }
171 ui++;
172 }
173 return ui;
174}
175
176size_t UTF32Length(std::string_view svu8) noexcept {
177 size_t ulen = 0;
178 for (size_t i = 0; i < svu8.length();) {
179 const unsigned char ch = svu8[i];
180 const unsigned int byteCount = UTF8BytesOfLead[ch];
181 i += byteCount;
182 ulen++;
183 }
184 return ulen;
185}
186
187size_t UTF32FromUTF8(std::string_view svu8, unsigned int *tbuf, size_t tlen) {
188 size_t ui = 0;
189 for (size_t i = 0; i < svu8.length();) {
190 unsigned char ch = svu8[i];
191 const unsigned int byteCount = UTF8BytesOfLead[ch];
192 unsigned int value;
193
194 if (i + byteCount > svu8.length()) {
195 // Trying to read past end but still have space to write
196 if (ui < tlen) {
197 tbuf[ui] = ch;
198 ui++;
199 }
200 break;
201 }
202
203 if (ui == tlen) {
204 throw std::runtime_error("UTF32FromUTF8: attempted write beyond end");
205 }
206
207 i++;
208 switch (byteCount) {
209 case 1:
210 value = ch;
211 break;
212 case 2:
213 value = (ch & 0x1F) << 6;
214 ch = svu8[i++];
215 value += TrailByteValue(ch);
216 break;
217 case 3:
218 value = (ch & 0xF) << 12;
219 ch = svu8[i++];
220 value += TrailByteValue(ch) << 6;
221 ch = svu8[i++];
222 value += TrailByteValue(ch);
223 break;
224 default:
225 value = (ch & 0x7) << 18;
226 ch = svu8[i++];
227 value += TrailByteValue(ch) << 12;
228 ch = svu8[i++];
229 value += TrailByteValue(ch) << 6;
230 ch = svu8[i++];
231 value += TrailByteValue(ch);
232 break;
233 }
234 tbuf[ui] = value;
235 ui++;
236 }
237 return ui;
238}
239
240std::wstring WStringFromUTF8(std::string_view svu8) {
241 if constexpr (sizeof(wchar_t) == 2) {
242 const size_t len16 = UTF16Length(svu8);
243 std::wstring ws(len16, 0);
244 UTF16FromUTF8(svu8, &ws[0], len16);
245 return ws;
246 } else {
247 const size_t len32 = UTF32Length(svu8);
248 std::wstring ws(len32, 0);
249 UTF32FromUTF8(svu8, reinterpret_cast<unsigned int *>(&ws[0]), len32);
250 return ws;
251 }
252}
253
254unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept {
255 if (val < SUPPLEMENTAL_PLANE_FIRST) {
256 tbuf[0] = static_cast<wchar_t>(val);
257 return 1;
258 } else {
259 tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST);
260 tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
261 return 2;
262 }
263}
264
// Length in bytes of the UTF-8 sequence introduced by each possible byte value,
// indexed by that byte.
// Entries are 1 for 00-7F, 2 for C2-DF, 3 for E0-EF and 4 for F0-F4.
// Bytes that cannot start a multi-byte sequence in this table (80-BF, C0, C1 and
// F5-FF) also map to 1, so callers stepping through text always advance.
const unsigned char UTF8BytesOfLead[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20 - 2F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30 - 3F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40 - 4F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50 - 5F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60 - 6F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70 - 7F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80 - 8F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 90 - 9F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A0 - AF
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B0 - BF
1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0 - CF
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0 - DF
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0 - EF
4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // F0 - FF
};
283
// Return both the width of the first character in the string and a status
// saying whether it is valid or invalid.
// Most invalid sequences return a width of 1 so are treated as isolated bytes but
// the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
// reasonably treated as code points in some circumstances. They will, however,
// not have associated glyphs.
//  us  - bytes to examine; us[0] is read before len is consulted, so at least
//        one byte must be readable.
//  len - number of bytes available starting at us.
// The result is the width (1 to 4), ORed with UTF8MaskInvalid when the sequence
// is not valid.
int UTF8Classify(const unsigned char *us, size_t len) noexcept {
	// For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
	if (us[0] < 0x80) {
		// ASCII
		return 1;
	}

	const size_t byteCount = UTF8BytesOfLead[us[0]];
	if (byteCount == 1 || byteCount > len) {
		// Invalid lead byte, or the sequence it announces does not fit in len bytes
		return UTF8MaskInvalid | 1;
	}

	// byteCount is at least 2 and no more than len here, so us[1] is within range
	if (!UTF8IsTrailByte(us[1])) {
		// Invalid trail byte
		return UTF8MaskInvalid | 1;
	}

	switch (byteCount) {
	case 2:
		return 2;

	case 3:
		if (UTF8IsTrailByte(us[2])) {
			if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
				// Overlong
				return UTF8MaskInvalid | 1;
			}
			if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
				// Surrogate
				return UTF8MaskInvalid | 1;
			}
			if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
				// U+FFFE non-character - 3 bytes long
				return UTF8MaskInvalid | 3;
			}
			if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
				// U+FFFF non-character - 3 bytes long
				return UTF8MaskInvalid | 3;
			}
			if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
				// U+FDD0 .. U+FDEF
				return UTF8MaskInvalid | 3;
			}
			return 3;
		}
		// Third byte is not a trail byte: falls out to the invalid return at the end
		break;

	default:
		if (UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
			// This test runs before the range and overlong checks below, so a
			// matching sequence is always reported with a width of 4.
			if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
				// *FFFE or *FFFF non-character
				return UTF8MaskInvalid | 4;
			}
			if (*us == 0xf4) {
				// Check if encoding a value beyond the last Unicode character 10FFFF
				if (us[1] > 0x8f) {
					return UTF8MaskInvalid | 1;
				}
			} else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
				// Overlong
				return UTF8MaskInvalid | 1;
			}
			return 4;
		}
		// Third or fourth byte is not a trail byte
		break;
	}

	// Reached only when a later trail byte was missing
	return UTF8MaskInvalid | 1;
}
360
361int UTF8DrawBytes(const unsigned char *us, int len) noexcept {
362 const int utf8StatusNext = UTF8Classify(us, len);
363 return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth);
364}
365
366bool UTF8IsValid(std::string_view svu8) noexcept {
367 const unsigned char *us = reinterpret_cast<const unsigned char *>(svu8.data());
368 size_t remaining = svu8.length();
369 while (remaining > 0) {
370 const int utf8Status = UTF8Classify(us, remaining);
371 if (utf8Status & UTF8MaskInvalid) {
372 return false;
373 } else {
374 const int lenChar = utf8Status & UTF8MaskWidth;
375 us += lenChar;
376 remaining -= lenChar;
377 }
378 }
379 return remaining == 0;
380}
381
382// Replace invalid bytes in UTF-8 with the replacement character
383std::string FixInvalidUTF8(const std::string &text) {
384 std::string result;
385 const char *s = text.c_str();
386 size_t remaining = text.size();
387 while (remaining > 0) {
388 const int utf8Status = UTF8Classify(reinterpret_cast<const unsigned char *>(s), remaining);
389 if (utf8Status & UTF8MaskInvalid) {
390 // Replacement character 0xFFFD = UTF8:"efbfbd".
391 result.append("\xef\xbf\xbd");
392 s++;
393 remaining--;
394 } else {
395 const size_t len = utf8Status & UTF8MaskWidth;
396 result.append(s, len);
397 s += len;
398 remaining -= len;
399 }
400 }
401 return result;
402}
403
404}
405