1//************************************ bs::framework - Copyright 2018 Marko Pintera **************************************//
2//*********** Licensed under the MIT license. See LICENSE.md for full terms. This notice is not to be removed. ***********//
3#include "BsUnicode.h"
4
5namespace bs
6{
/**
 * Decodes a single (possibly multi-byte) UTF-8 encoded character into a UTF-32 character.
 *
 * @param[in]	begin		Iterator to the first byte of the character to decode.
 * @param[in]	end			Iterator past the last byte that may be read.
 * @param[out]	output		Receives the decoded character, or @p invalidChar if the sequence is cut short by
 *							@p end. Left untouched if the range is empty.
 * @param[in]	invalidChar	Value written to @p output when the sequence is truncated.
 * @return					Iterator past the last byte consumed. For a truncated sequence this is @p end.
 *
 * @note	The sequence length is derived from the lead byte alone. Continuation bytes are not validated, and a
 *			byte in the range 0x80-0xBF found in lead position is decoded as a one byte character.
 */
template<typename T>
T UTF8To32(T begin, T end, char32_t& output, char32_t invalidChar = 0)
{
	// Nothing to parse
	if (begin >= end)
		return begin;

	// The lead byte determines how many bytes the character occupies
	const UINT8 leadByte = (UINT8)*begin;

	UINT32 numBytes;
	if (leadByte < 192)
		numBytes = 1;
	else if (leadByte < 224)
		numBytes = 2;
	else if (leadByte < 240)
		numBytes = 3;
	else if (leadByte < 248)
		numBytes = 4;
	else if (leadByte < 252)
		numBytes = 5;
	else // < 256
		numBytes = 6;

	// Not enough bytes were provided, invalid character. The check compares against the remaining distance rather
	// than computing (begin + numBytes), because that iterator could lie past the end of the range, which is
	// undefined behaviour.
	const auto numAvailable = end - begin;
	if (numAvailable < static_cast<decltype(numAvailable)>(numBytes))
	{
		output = invalidChar;
		return end;
	}

	// Accumulate the raw bytes, six bits apart. Marker bits are still included at this point.
	output = 0;
	for (UINT32 i = 0; i < numBytes; ++i)
	{
		output <<= 6;
		output += (UINT8)(*begin);
		++begin;
	}

	// Subtract the lead/continuation marker bits accumulated above, indexed by sequence length
	constexpr UINT32 offsets[6] = { 0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };
	output -= offsets[numBytes - 1];

	return begin;
}
57
/**
 * Encodes a UTF-32 character as a (possibly multi-byte) UTF-8 sequence.
 *
 * @param[in]	input		Character to encode.
 * @param[in]	output		Output iterator the encoded bytes are written to.
 * @param[in]	maxElems	Maximum number of bytes that may be written to @p output.
 * @param[in]	invalidChar	Byte written in place of the character if @p input is not a valid code point (larger
 *							than 0x10FFFF or within the surrogate range 0xD800-0xDFFF), or if the encoded
 *							sequence does not fit in @p maxElems.
 * @return					Iterator past the last byte written. Equal to @p output if @p maxElems is zero.
 */
template<typename T>
T UTF32To8(char32_t input, T output, UINT32 maxElems, char invalidChar = 0)
{
	// No place to write the character
	if (maxElems == 0)
		return output;

	// Values past the last code point and the whole surrogate range (high AND low halves) have no UTF-8 encoding.
	// This matches the range UTF32To16() rejects.
	const bool isValid = (input <= 0x0010FFFF) && ((input < 0xD800) || (input > 0xDFFF));

	// Determine the number of bytes used by the character
	UINT32 numBytes = 0;
	if (isValid)
	{
		if (input < 0x80)
			numBytes = 1;
		else if (input < 0x800)
			numBytes = 2;
		else if (input < 0x10000)
			numBytes = 3;
		else // <= 0x0010FFFF
			numBytes = 4;
	}

	// Invalid character, or not enough space for the full sequence
	if (!isValid || (numBytes > maxElems))
	{
		*output = invalidChar;
		++output;

		return output;
	}

	// Lead byte marker bits, indexed by sequence length
	constexpr UINT8 headers[5] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };

	// Fill continuation bytes back to front, six payload bits each, then the lead byte with what remains
	char bytes[4];
	for (UINT32 i = numBytes - 1; i > 0; --i)
	{
		bytes[i] = (char)((input | 0x80) & 0xBF);
		input >>= 6;
	}

	bytes[0] = (char)(input | headers[numBytes]);

	return std::copy(bytes, bytes + numBytes, output);
}
111
/**
 * Decodes a single UTF-16 encoded character (one element, or a surrogate pair) into a UTF-32 character.
 *
 * @param[in]	begin		Iterator to the first element of the character to decode.
 * @param[in]	end			Iterator past the last element that may be read.
 * @param[out]	output		Receives the decoded character, or @p invalidChar if the input is an unpaired
 *							surrogate. Left untouched if the range is empty.
 * @param[in]	invalidChar	Value written to @p output for an unpaired surrogate.
 * @return					Iterator past the last element consumed. An element that follows a high surrogate
 *							but is not a low surrogate is NOT consumed, so it gets decoded by the next call.
 */
template<typename T>
T UTF16To32(T begin, T end, char32_t& output, char32_t invalidChar = 0)
{
	// Nothing to parse
	if (begin >= end)
		return begin;

	const char16_t firstElem = (char16_t)*begin;
	++begin;

	const bool isHighSurrogate = (firstElem >= 0xD800) && (firstElem <= 0xDBFF);
	const bool isLowSurrogate = (firstElem >= 0xDC00) && (firstElem <= 0xDFFF);

	// Regular character, encoded as a single element
	if (!isHighSurrogate && !isLowSurrogate)
	{
		output = (char32_t)firstElem;
		return begin;
	}

	// Low surrogate without a preceding high surrogate, invalid character
	if (isLowSurrogate)
	{
		output = invalidChar;
		return begin;
	}

	// High surrogate at the very end of the input, invalid character
	if (begin >= end)
	{
		output = invalidChar;
		return end;
	}

	const char32_t secondElem = (char32_t)*begin;

	// High surrogate not followed by a low surrogate. Only the high surrogate is invalid, so the second element is
	// left in place rather than being swallowed together with it.
	if ((secondElem < 0xDC00) || (secondElem > 0xDFFF))
	{
		output = invalidChar;
		return begin;
	}

	++begin;
	output = (char32_t)(((firstElem - 0xD800) << 10) + (secondElem - 0xDC00) + 0x0010000);

	return begin;
}
149
/**
 * Encodes a UTF-32 character as UTF-16, using either a single element or a surrogate pair.
 *
 * @param[in]	input		Character to encode.
 * @param[in]	output		Output iterator the encoded elements are written to.
 * @param[in]	maxElems	Maximum number of elements that may be written to @p output.
 * @param[in]	invalidChar	Element written in place of the character if @p input is larger than 0x10FFFF, lies
 *							in the surrogate range 0xD800-0xDFFF, or needs two elements while @p maxElems is
 *							less than two.
 * @return					Iterator past the last element written. Equal to @p output if @p maxElems is zero.
 */
template<typename T>
T UTF32To16(char32_t input, T output, UINT32 maxElems, char16_t invalidChar = 0)
{
	// Without any room there is nothing to do
	if (maxElems == 0)
		return output;

	const bool pastLastCodePoint = input > 0x0010FFFF;
	const bool inSurrogateRange = (input >= 0xD800) && (input <= 0xDFFF);
	const bool needsSurrogatePair = input > 0xFFFF;

	// Every failure case produces exactly one replacement element
	if (pastLastCodePoint || inSurrogateRange || (needsSurrogatePair && (maxElems < 2)))
	{
		*output = invalidChar;
		++output;

		return output;
	}

	// Fits in a single element
	if (!needsSurrogatePair)
	{
		*output = (char16_t)input;
		++output;

		return output;
	}

	// Split the 20 bits above 0x10000 into a high and a low surrogate, 10 bits each
	const char32_t pairBits = input - 0x0010000;

	*output = (char16_t)((pairBits >> 10) + 0xD800);
	++output;

	*output = (char16_t)((pairBits & 0x3FFUL) + 0xDC00);
	++output;

	return output;
}
204
/**
 * Decodes a single character from a wide character sequence into a UTF-32 character. The sequence is treated as
 * UTF-32 when wchar_t is four bytes wide and as UTF-16 otherwise.
 *
 * @param[in]	begin		Iterator to the first element of the character to decode.
 * @param[in]	end			Iterator past the last element that may be read.
 * @param[out]	output		Receives the decoded character. Left untouched if the range is empty.
 * @param[in]	invalidChar	Forwarded to UTF16To32() on the UTF-16 path. Unused on the UTF-32 path, where the
 *							element is copied without validation.
 * @return					Iterator past the last element consumed.
 */
template<typename T>
T wideToUTF32(T begin, T end, char32_t& output, char32_t invalidChar = 0)
{
	// Nothing to parse. UTF16To32() performs the same check, but the UTF-32 path below would otherwise dereference
	// the end of an empty range.
	if (begin >= end)
		return begin;

	if (sizeof(wchar_t) == 4) // Assuming UTF-32 (i.e. Unix)
	{
		output = (char32_t)*begin;
		++begin;

		return begin;
	}

	// Assuming UTF-16 (i.e. Windows)
	return UTF16To32(begin, end, output, invalidChar);
}
219
220 char32_t ANSIToUTF32(char input, const std::locale& locale = std::locale(""))
221 {
222 const std::ctype<wchar_t>& facet = std::use_facet<std::ctype<wchar_t>>(locale);
223
224 // Note: Not exactly valid on Windows, since the input character could require a surrogate pair.
225 // Consider improving this if it ever becomes an issue.
226 wchar_t wideChar = facet.widen(input);
227
228 char32_t output;
229 wideToUTF32(&wideChar, &wideChar + 1, output);
230
231 return output;
232 }
233
/**
 * Encodes a UTF-32 character into a wide character sequence. The character is written as-is when wchar_t is four
 * bytes wide and is encoded as UTF-16 otherwise.
 *
 * @param[in]	input		Character to encode.
 * @param[in]	output		Output iterator the wide character(s) are written to.
 * @param[in]	maxElems	Maximum number of elements that may be written to @p output.
 * @param[in]	invalidChar	Forwarded to UTF32To16() on the UTF-16 path. Unused on the UTF-32 path, where the
 *							character is copied without validation.
 * @return					Iterator past the last element written. Equal to @p output if @p maxElems is zero.
 */
template<typename T>
T UTF32ToWide(char32_t input, T output, UINT32 maxElems, wchar_t invalidChar = 0)
{
	// No place to write the character. UTF32To16() performs the same check, but the UTF-32 path below would
	// otherwise write regardless of the limit.
	if (maxElems == 0)
		return output;

	if (sizeof(wchar_t) == 4) // Assuming UTF-32 (i.e. Unix)
	{
		*output = (wchar_t)input;
		++output;

		return output;
	}

	// Assuming UTF-16 (i.e. Windows)
	return UTF32To16(input, output, maxElems, invalidChar);
}
247
/**
 * Converts a UTF-32 character into a narrow character using the ctype facet of the provided locale.
 *
 * @param[in]	input		Character to convert.
 * @param[in]	invalidChar	Character returned when @p input cannot be held by a single wchar_t. It is also passed
 *							to narrow() as its default value.
 * @param[in]	locale		Locale whose std::ctype<wchar_t> facet performs the narrowing.
 * @return					Narrow character, or @p invalidChar.
 */
char UTF32ToANSI(char32_t input, char invalidChar = 0, const std::locale& locale = std::locale(""))
{
	// narrow() accepts a single wchar_t and therefore no surrogate pair. When wchar_t is only 16 bits wide (i.e.
	// Windows) the cast below would truncate a code point above 0xFFFF into an unrelated character, so report such
	// input as invalid instead.
	if ((sizeof(wchar_t) < 4) && (input > 0xFFFF))
		return invalidChar;

	const std::ctype<wchar_t>& facet = std::use_facet<std::ctype<wchar_t>>(locale);
	return facet.narrow((wchar_t)input, invalidChar);
}
255
256 String UTF8::fromANSI(const String& input, const std::locale& locale)
257 {
258 String output;
259 output.reserve(input.size());
260
261 auto backInserter = std::back_inserter(output);
262
263 auto iter = input.begin();
264 while(iter != input.end())
265 {
266 char32_t u32char = ANSIToUTF32(*iter, locale);
267 UTF32To8(u32char, backInserter, 4);
268
269 ++iter;
270 }
271
272 return output;
273 }
274
275 String UTF8::toANSI(const String& input, const std::locale& locale, char invalidChar)
276 {
277 String output;
278
279 auto iter = input.begin();
280 while(iter != input.end())
281 {
282 char32_t u32char;
283 iter = UTF8To32(iter, input.end(), u32char, invalidChar);
284
285 output.push_back(UTF32ToANSI(u32char, invalidChar, locale));
286 }
287
288 return output;
289 }
290
291 String UTF8::fromWide(const WString& input)
292 {
293 String output;
294 output.reserve(input.size());
295
296 auto backInserter = std::back_inserter(output);
297
298 auto iter = input.begin();
299 while(iter != input.end())
300 {
301 char32_t u32char;
302 iter = wideToUTF32(iter, input.end(), u32char);
303 UTF32To8(u32char, backInserter, 4);
304 }
305
306 return output;
307 }
308
309 WString UTF8::toWide(const String& input)
310 {
311 WString output;
312 auto backInserter = std::back_inserter(output);
313
314 auto iter = input.begin();
315 while(iter != input.end())
316 {
317 char32_t u32char;
318 iter = UTF8To32(iter, input.end(), u32char);
319
320 UTF32ToWide(u32char, backInserter, 2);
321 }
322
323 return output;
324 }
325
326 String UTF8::fromUTF16(const U16String& input)
327 {
328 String output;
329 output.reserve(input.size());
330
331 auto backInserter = std::back_inserter(output);
332
333 auto iter = input.begin();
334 while(iter != input.end())
335 {
336 char32_t u32char = 0;
337 iter = UTF16To32(iter, input.end(), u32char);
338 UTF32To8(u32char, backInserter, 4);
339 }
340
341 return output;
342 }
343
344 U16String UTF8::toUTF16(const String& input)
345 {
346 U16String output;
347 auto backInserter = std::back_inserter(output);
348
349 auto iter = input.begin();
350 while(iter != input.end())
351 {
352 char32_t u32char;
353 iter = UTF8To32(iter, input.end(), u32char);
354
355 UTF32To16(u32char, backInserter, 2);
356 }
357
358 return output;
359 }
360
361 String UTF8::fromUTF32(const U32String& input)
362 {
363 String output;
364 output.reserve(input.size());
365
366 auto backInserter = std::back_inserter(output);
367
368 auto iter = input.begin();
369 while(iter != input.end())
370 {
371 UTF32To8(*iter, backInserter, 4);
372
373 ++iter;
374 }
375
376 return output;
377 }
378
379 U32String UTF8::toUTF32(const String& input)
380 {
381 U32String output;
382
383 auto iter = input.begin();
384 while(iter != input.end())
385 {
386 char32_t u32char;
387 iter = UTF8To32(iter, input.end(), u32char);
388
389 output.push_back(u32char);
390 }
391
392 return output;
393 }
394
395 UINT32 UTF8::count(const String& input)
396 {
397 UINT32 length = 0;
398 for (char i : input)
399 {
400 // Include only characters that don't start with bits 10
401 length += (i & 0xc0) != 0x80;
402 }
403
404 return length;
405 }
406
407 UINT32 UTF8::charToByteIndex(const String& input, UINT32 charIdx)
408 {
409 UINT32 curChar = 0;
410 UINT32 curByte = 0;
411 for (char i : input)
412 {
413 // Include only characters that don't start with bits 10
414 if((i & 0xc0) != 0x80)
415 {
416 if(curChar == charIdx)
417 return curByte;
418
419 curChar++;
420 }
421
422 curByte++;
423 }
424
425 return (UINT32)input.size();
426 }
427
428 UINT32 UTF8::charByteCount(const String& input, UINT32 charIdx)
429 {
430 const UINT32 byteIdx = charToByteIndex(input, charIdx);
431
432 UINT32 count = 1;
433 for(auto i = (size_t)byteIdx + 1; i < input.size(); i++)
434 {
435 if((i & 0xc0) != 0x80)
436 break;
437
438 count++;
439 }
440
441 return count;
442 }
443
444 String UTF8::toLower(const String& input)
445 {
446 return PlatformUtility::convertCaseUTF8(input, false);
447 }
448
449 String UTF8::toUpper(const String& input)
450 {
451 return PlatformUtility::convertCaseUTF8(input, true);
452 }
453}
454