1 | //************************************ bs::framework - Copyright 2018 Marko Pintera **************************************// |
2 | //*********** Licensed under the MIT license. See LICENSE.md for full terms. This notice is not to be removed. ***********// |
3 | #include "BsUnicode.h" |
4 | |
5 | namespace bs |
6 | { |
/**
 * Decodes a single UTF-8 encoded character (possibly multibyte) into an UTF-32 character.
 *
 * @param[in]	begin		Iterator to the first byte of the character. Must be a random access iterator.
 * @param[in]	end			Iterator one past the last readable byte.
 * @param[out]	output		Receives the decoded character, or @p invalidChar if the range ends before the sequence
 *							announced by the lead byte is complete. Left untouched if the range is empty.
 * @param[in]	invalidChar	Value written to @p output when the sequence is truncated.
 * @return					Iterator one past the consumed bytes: @p begin for an empty range, @p end for a
 *							truncated sequence.
 *
 * @note	Only the lead byte is inspected. Continuation bytes are not validated, a lead byte below 0xC0 is always
 *			consumed as a single-byte character, and lead bytes of 0xF8 and above are decoded as 5 and 6 byte
 *			sequences.
 */
template<typename T>
T UTF8To32(T begin, T end, char32_t& output, char32_t invalidChar = 0)
{
	// Nothing to parse
	if (begin >= end)
		return begin;

	using Distance = decltype(end - begin);

	// The lead byte announces the total length of the sequence
	const auto leadByte = static_cast<unsigned char>(*begin);

	Distance numBytes;
	if (leadByte < 192)
		numBytes = 1;
	else if (leadByte < 224)
		numBytes = 2;
	else if (leadByte < 240)
		numBytes = 3;
	else if (leadByte < 248)
		numBytes = 4;
	else if (leadByte < 252)
		numBytes = 5;
	else // < 256
		numBytes = 6;

	// Not enough bytes were provided, invalid character. Compare against the remaining distance instead of
	// forming (begin + numBytes), as advancing an iterator beyond `end` is undefined behaviour.
	if ((end - begin) < numBytes)
	{
		output = invalidChar;
		return end;
	}

	// Accumulate the raw bytes, making room for six more bits before every byte but the first
	char32_t decoded = 0;
	for (Distance i = 0; i < numBytes; ++i)
	{
		if (i != 0)
			decoded <<= 6;

		decoded += static_cast<unsigned char>(*begin);
		++begin;
	}

	// The accumulated value still contains the length marker of the lead byte and the 10xxxxxx marker of every
	// continuation byte. Subtract their combined contribution, which depends only on the sequence length.
	constexpr char32_t offsets[6] = { 0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };
	decoded -= offsets[numBytes - 1];

	output = decoded;
	return begin;
}
57 | |
58 | /** Converts an UTF-32 encoded character into an (possibly multibyte) UTF-8 character. */ |
59 | template<typename T> |
60 | T UTF32To8(char32_t input, T output, UINT32 maxElems, char invalidChar = 0) |
61 | { |
62 | // No place to write the character |
63 | if (maxElems == 0) |
64 | return output; |
65 | |
66 | // Check if character is valid |
67 | if ((input > 0x0010FFFF) || ((input >= 0xD800) && (input <= 0xDBFF))) |
68 | { |
69 | *output = invalidChar; |
70 | ++output; |
71 | |
72 | return output; |
73 | } |
74 | |
75 | // Determine the number of bytes used by the character |
76 | UINT32 numBytes; |
77 | if (input < 0x80) |
78 | numBytes = 1; |
79 | else if (input < 0x800) |
80 | numBytes = 2; |
81 | else if (input < 0x10000) |
82 | numBytes = 3; |
83 | else // <= 0x0010FFFF |
84 | numBytes = 4; |
85 | |
86 | // Check if we have enough space |
87 | if(numBytes > maxElems) |
88 | { |
89 | *output = invalidChar; |
90 | ++output; |
91 | |
92 | return output; |
93 | } |
94 | |
95 | // Encode the character |
96 | constexpr UINT8 [7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; |
97 | |
98 | char bytes[4]; |
99 | switch (numBytes) |
100 | { |
101 | case 4: bytes[3] = (char)((input | 0x80) & 0xBF); input >>= 6; BS_FALLTHROUGH; |
102 | case 3: bytes[2] = (char)((input | 0x80) & 0xBF); input >>= 6; BS_FALLTHROUGH; |
103 | case 2: bytes[1] = (char)((input | 0x80) & 0xBF); input >>= 6; BS_FALLTHROUGH; |
104 | case 1: bytes[0] = (char)(input | headers[numBytes]); BS_FALLTHROUGH; |
105 | default: break; |
106 | } |
107 | |
108 | output = std::copy(bytes, bytes + numBytes, output); |
109 | return output; |
110 | } |
111 | |
/**
 * Decodes a single UTF-16 encoded character (one element, or a surrogate pair) into an UTF-32 character.
 *
 * @param[in]	begin		Iterator to the first element of the character.
 * @param[in]	end			Iterator one past the last readable element.
 * @param[out]	output		Receives the decoded character. Left untouched if the range is empty.
 * @param[in]	invalidChar	Value written to @p output when a high surrogate is not followed by a low surrogate.
 * @return					Iterator one past the consumed elements.
 */
template<typename T>
T UTF16To32(T begin, T end, char32_t& output, char32_t invalidChar = 0)
{
	// Empty range, nothing to decode
	if (begin >= end)
		return begin;

	const char16_t lead = (char16_t)*begin;
	++begin;

	// Anything that is not a high surrogate is passed through as-is
	const bool isHighSurrogate = (lead >= 0xD800) && (lead <= 0xDBFF);
	if (!isHighSurrogate)
	{
		output = (char32_t)lead;
		return begin;
	}

	// High surrogate with nothing following it
	if (begin >= end)
	{
		output = invalidChar;
		return end;
	}

	// The second element is consumed even if it turns out not to be a low surrogate
	const char32_t trail = (char32_t)*begin;
	++begin;

	const bool isLowSurrogate = (trail >= 0xDC00) && (trail <= 0xDFFF);
	output = isLowSurrogate
		? (char32_t)(((lead - 0xD800) << 10) + (trail - 0xDC00) + 0x0010000)
		: invalidChar;

	return begin;
}
149 | |
150 | /** Converts an UTF-32 encoded character into an UTF-16 character. */ |
151 | template<typename T> |
152 | T UTF32To16(char32_t input, T output, UINT32 maxElems, char16_t invalidChar = 0) |
153 | { |
154 | // No place to write the character |
155 | if (maxElems == 0) |
156 | return output; |
157 | |
158 | // Invalid character |
159 | if (input > 0x0010FFFF) |
160 | { |
161 | *output = invalidChar; |
162 | ++output; |
163 | |
164 | return output; |
165 | } |
166 | |
167 | // Can be encoded as a single element |
168 | if (input <= 0xFFFF) |
169 | { |
170 | // Check if in valid range |
171 | if ((input >= 0xD800) && (input <= 0xDFFF)) |
172 | { |
173 | *output = invalidChar; |
174 | ++output; |
175 | |
176 | return output; |
177 | } |
178 | |
179 | *output = (char16_t)input; |
180 | ++output; |
181 | } |
182 | else // Must be encoded as two elements |
183 | { |
184 | // Two elements won't fit |
185 | if (maxElems < 2) |
186 | { |
187 | *output = invalidChar; |
188 | ++output; |
189 | |
190 | return output; |
191 | } |
192 | |
193 | input -= 0x0010000; |
194 | |
195 | *output = (char16_t)((input >> 10) + 0xD800); |
196 | ++output; |
197 | |
198 | *output = (char16_t)((input & 0x3FFUL) + 0xDC00); |
199 | ++output; |
200 | } |
201 | |
202 | return output; |
203 | } |
204 | |
/**
 * Decodes a single wide character into an UTF-32 character. A four byte wchar_t is assumed to hold UTF-32 and is
 * copied over directly, otherwise the input is assumed to be UTF-16 and is forwarded to UTF16To32.
 *
 * @param[in]	begin		Iterator to the first element of the character.
 * @param[in]	end			Iterator one past the last readable element.
 * @param[out]	output		Receives the decoded character. Left untouched if the range is empty.
 * @param[in]	invalidChar	Value forwarded to UTF16To32 for reporting invalid input. Unused when wchar_t is four
 *							bytes wide.
 * @return					Iterator one past the consumed elements.
 */
template<typename T>
T wideToUTF32(T begin, T end, char32_t& output, char32_t invalidChar = 0)
{
	if (sizeof(wchar_t) == 4) // Assuming UTF-32 (i.e. Unix)
	{
		// Nothing to parse. Same guard as the UTF-16 path, without it `begin` would be dereferenced even though
		// it points past the range.
		if (begin >= end)
			return begin;

		output = (char32_t)*begin;
		++begin;

		return begin;
	}

	// Assuming UTF-16 (i.e. Windows)
	return UTF16To32(begin, end, output, invalidChar);
}
219 | |
220 | char32_t ANSIToUTF32(char input, const std::locale& locale = std::locale("" )) |
221 | { |
222 | const std::ctype<wchar_t>& facet = std::use_facet<std::ctype<wchar_t>>(locale); |
223 | |
224 | // Note: Not exactly valid on Windows, since the input character could require a surrogate pair. |
225 | // Consider improving this if it ever becomes an issue. |
226 | wchar_t wideChar = facet.widen(input); |
227 | |
228 | char32_t output; |
229 | wideToUTF32(&wideChar, &wideChar + 1, output); |
230 | |
231 | return output; |
232 | } |
233 | |
234 | template<typename T> |
235 | T UTF32ToWide(char32_t input, T output, UINT32 maxElems, wchar_t invalidChar = 0) |
236 | { |
237 | if(sizeof(wchar_t) == 4) // Assuming UTF-32 (i.e. Unix) |
238 | { |
239 | *output = (wchar_t)input; |
240 | ++output; |
241 | |
242 | return output; |
243 | } |
244 | else // Assuming UTF-16 (i.e. Windows) |
245 | return UTF32To16(input, output, maxElems, invalidChar); |
246 | } |
247 | |
/**
 * Converts an UTF-32 character into a single narrow character, using the std::ctype<wchar_t> facet of the provided
 * locale.
 *
 * @param[in]	input		Character to convert.
 * @param[in]	invalidChar	Character returned when @p input is above U+10FFFF, when it does not fit into a single
 *							wchar_t, or when the facet is unable to narrow it.
 * @param[in]	locale		Locale providing the facet used for narrowing.
 * @return					Narrow character, or @p invalidChar.
 */
char UTF32ToANSI(char32_t input, char invalidChar = 0, const std::locale& locale = std::locale(""))
{
	// Not a valid code point, same upper limit as used by the UTF-8 and UTF-16 encoders
	if (input > 0x0010FFFF)
		return invalidChar;

	// With a wchar_t narrower than four bytes (i.e. Windows) the character would need a surrogate pair, which
	// narrow() cannot accept. Casting it regardless would truncate it into an unrelated character.
	if ((sizeof(wchar_t) < sizeof(char32_t)) && (input > 0xFFFF))
		return invalidChar;

	const std::ctype<wchar_t>& facet = std::use_facet<std::ctype<wchar_t>>(locale);
	return facet.narrow((wchar_t)input, invalidChar);
}
255 | |
256 | String UTF8::fromANSI(const String& input, const std::locale& locale) |
257 | { |
258 | String output; |
259 | output.reserve(input.size()); |
260 | |
261 | auto backInserter = std::back_inserter(output); |
262 | |
263 | auto iter = input.begin(); |
264 | while(iter != input.end()) |
265 | { |
266 | char32_t u32char = ANSIToUTF32(*iter, locale); |
267 | UTF32To8(u32char, backInserter, 4); |
268 | |
269 | ++iter; |
270 | } |
271 | |
272 | return output; |
273 | } |
274 | |
275 | String UTF8::toANSI(const String& input, const std::locale& locale, char invalidChar) |
276 | { |
277 | String output; |
278 | |
279 | auto iter = input.begin(); |
280 | while(iter != input.end()) |
281 | { |
282 | char32_t u32char; |
283 | iter = UTF8To32(iter, input.end(), u32char, invalidChar); |
284 | |
285 | output.push_back(UTF32ToANSI(u32char, invalidChar, locale)); |
286 | } |
287 | |
288 | return output; |
289 | } |
290 | |
291 | String UTF8::fromWide(const WString& input) |
292 | { |
293 | String output; |
294 | output.reserve(input.size()); |
295 | |
296 | auto backInserter = std::back_inserter(output); |
297 | |
298 | auto iter = input.begin(); |
299 | while(iter != input.end()) |
300 | { |
301 | char32_t u32char; |
302 | iter = wideToUTF32(iter, input.end(), u32char); |
303 | UTF32To8(u32char, backInserter, 4); |
304 | } |
305 | |
306 | return output; |
307 | } |
308 | |
309 | WString UTF8::toWide(const String& input) |
310 | { |
311 | WString output; |
312 | auto backInserter = std::back_inserter(output); |
313 | |
314 | auto iter = input.begin(); |
315 | while(iter != input.end()) |
316 | { |
317 | char32_t u32char; |
318 | iter = UTF8To32(iter, input.end(), u32char); |
319 | |
320 | UTF32ToWide(u32char, backInserter, 2); |
321 | } |
322 | |
323 | return output; |
324 | } |
325 | |
326 | String UTF8::fromUTF16(const U16String& input) |
327 | { |
328 | String output; |
329 | output.reserve(input.size()); |
330 | |
331 | auto backInserter = std::back_inserter(output); |
332 | |
333 | auto iter = input.begin(); |
334 | while(iter != input.end()) |
335 | { |
336 | char32_t u32char = 0; |
337 | iter = UTF16To32(iter, input.end(), u32char); |
338 | UTF32To8(u32char, backInserter, 4); |
339 | } |
340 | |
341 | return output; |
342 | } |
343 | |
344 | U16String UTF8::toUTF16(const String& input) |
345 | { |
346 | U16String output; |
347 | auto backInserter = std::back_inserter(output); |
348 | |
349 | auto iter = input.begin(); |
350 | while(iter != input.end()) |
351 | { |
352 | char32_t u32char; |
353 | iter = UTF8To32(iter, input.end(), u32char); |
354 | |
355 | UTF32To16(u32char, backInserter, 2); |
356 | } |
357 | |
358 | return output; |
359 | } |
360 | |
361 | String UTF8::fromUTF32(const U32String& input) |
362 | { |
363 | String output; |
364 | output.reserve(input.size()); |
365 | |
366 | auto backInserter = std::back_inserter(output); |
367 | |
368 | auto iter = input.begin(); |
369 | while(iter != input.end()) |
370 | { |
371 | UTF32To8(*iter, backInserter, 4); |
372 | |
373 | ++iter; |
374 | } |
375 | |
376 | return output; |
377 | } |
378 | |
379 | U32String UTF8::toUTF32(const String& input) |
380 | { |
381 | U32String output; |
382 | |
383 | auto iter = input.begin(); |
384 | while(iter != input.end()) |
385 | { |
386 | char32_t u32char; |
387 | iter = UTF8To32(iter, input.end(), u32char); |
388 | |
389 | output.push_back(u32char); |
390 | } |
391 | |
392 | return output; |
393 | } |
394 | |
395 | UINT32 UTF8::count(const String& input) |
396 | { |
397 | UINT32 length = 0; |
398 | for (char i : input) |
399 | { |
400 | // Include only characters that don't start with bits 10 |
401 | length += (i & 0xc0) != 0x80; |
402 | } |
403 | |
404 | return length; |
405 | } |
406 | |
407 | UINT32 UTF8::charToByteIndex(const String& input, UINT32 charIdx) |
408 | { |
409 | UINT32 curChar = 0; |
410 | UINT32 curByte = 0; |
411 | for (char i : input) |
412 | { |
413 | // Include only characters that don't start with bits 10 |
414 | if((i & 0xc0) != 0x80) |
415 | { |
416 | if(curChar == charIdx) |
417 | return curByte; |
418 | |
419 | curChar++; |
420 | } |
421 | |
422 | curByte++; |
423 | } |
424 | |
425 | return (UINT32)input.size(); |
426 | } |
427 | |
428 | UINT32 UTF8::charByteCount(const String& input, UINT32 charIdx) |
429 | { |
430 | const UINT32 byteIdx = charToByteIndex(input, charIdx); |
431 | |
432 | UINT32 count = 1; |
433 | for(auto i = (size_t)byteIdx + 1; i < input.size(); i++) |
434 | { |
435 | if((i & 0xc0) != 0x80) |
436 | break; |
437 | |
438 | count++; |
439 | } |
440 | |
441 | return count; |
442 | } |
443 | |
444 | String UTF8::toLower(const String& input) |
445 | { |
446 | return PlatformUtility::convertCaseUTF8(input, false); |
447 | } |
448 | |
449 | String UTF8::toUpper(const String& input) |
450 | { |
451 | return PlatformUtility::convertCaseUTF8(input, true); |
452 | } |
453 | } |
454 | |