BsUnicode.cpp source code [bsFramework/Source/Foundation/bsfUtility/String/BsUnicode.cpp]

1	//********************************* bs::framework - Copyright 2018 Marko Pintera ***********************************//
2	//******** Licensed under the MIT license. See LICENSE.md for full terms. This notice is not to be removed. ********//
3	#include "BsUnicode.h"
4
5	namespace bs
6	{
/**
 * Converts an UTF-8 encoded character (possibly multibyte) into an UTF-32 character.
 *
 * @param[in]	begin		Iterator to the first byte of the character to decode.
 * @param[in]	end			Iterator one past the last readable byte.
 * @param[out]	output		Receives the decoded character, or @p invalidChar if the sequence is cut short by
 *							@p end. Left untouched if the range is empty.
 * @param[in]	invalidChar	Value written to @p output when the sequence is truncated.
 * @return					Iterator past the consumed bytes, or @p end if the sequence was truncated.
 *
 * @note	The sequence length is derived from the lead byte alone. Continuation bytes are not validated, and
 *			a byte in the range 0x80-0xBF found in lead position is decoded as a one-byte character.
 */
template<typename T>
T UTF8To32(T begin, T end, char32_t& output, char32_t invalidChar = 0)
{
	// Nothing to parse
	if (begin >= end)
		return begin;

	// Determine the number of bytes used by the character from its lead byte
	const auto firstByte = static_cast<unsigned char>(*begin);

	unsigned int numBytes;
	if (firstByte < 192)
		numBytes = 1;
	else if (firstByte < 224)
		numBytes = 2;
	else if (firstByte < 240)
		numBytes = 3;
	else if (firstByte < 248)
		numBytes = 4;
	else if (firstByte < 252)
		numBytes = 5;
	else // < 256
		numBytes = 6;

	// Not enough bytes were provided, invalid character. The remaining distance is compared instead of forming
	// (begin + numBytes), because advancing an iterator beyond the end of its range is undefined behaviour.
	using Distance = decltype(end - begin);
	if ((end - begin) < static_cast<Distance>(numBytes))
	{
		output = invalidChar;
		return end;
	}

	// Decode the character: accumulate the raw bytes six bits at a time, marker bits included
	char32_t decoded = 0;
	for (unsigned int i = 0; i < numBytes; ++i)
	{
		decoded = (decoded << 6) + static_cast<unsigned char>(*begin);
		++begin;
	}

	// Subtracting the per-length constant removes the accumulated lead/continuation marker bits in one step
	constexpr char32_t offsets[6] = { 0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };
	output = decoded - offsets[numBytes - 1];

	return begin;
}
57
58	/* Converts an UTF-32 encoded character into an (possibly multibyte) UTF-8 character. /
59	template<typename T>
60	T UTF32To8(char32_t input, T output, UINT32 maxElems, char invalidChar = `0`)
61	{
62	// No place to write the character
63	if (maxElems == `0`)
64	return output;
65
66	// Check if character is valid
67	if ((input > `0x0010FFFF`) \|\| ((input >= `0xD800`) && (input <= `0xDBFF`)))
68	{
69	*output = invalidChar;
70	++output;
71
72	return output;
73	}
74
75	// Determine the number of bytes used by the character
76	UINT32 numBytes;
77	if (input < `0x80`)
78	numBytes = `1`;
79	else if (input < `0x800`)
80	numBytes = `2`;
81	else if (input < `0x10000`)
82	numBytes = `3`;
83	else // <= 0x0010FFFF
84	numBytes = `4`;
85
86	// Check if we have enough space
87	if(numBytes > maxElems)
88	{
89	*output = invalidChar;
90	++output;
91
92	return output;
93	}
94
95	// Encode the character
96	constexpr UINT8 headers[`7`] = { `0x00`, `0x00`, `0xC0`, `0xE0`, `0xF0`, `0xF8`, `0xFC` };
97
98	char bytes[`4`];
99	switch (numBytes)
100	{
101	case `4`: bytes[`3`] = (char)((input \| `0x80`) & `0xBF`); input >>= `6`; BS_FALLTHROUGH;
102	case `3`: bytes[`2`] = (char)((input \| `0x80`) & `0xBF`); input >>= `6`; BS_FALLTHROUGH;
103	case `2`: bytes[`1`] = (char)((input \| `0x80`) & `0xBF`); input >>= `6`; BS_FALLTHROUGH;
104	case `1`: bytes[`0`] = (char)(input \| headers[numBytes]); BS_FALLTHROUGH;
105	default: break;
106	}
107
108	output = std::copy(bytes, bytes + numBytes, output);
109	return output;
110	}
111
/**
 * Converts an UTF-16 encoded character into an UTF-32 character.
 *
 * @param[in]	begin		Iterator to the first UTF-16 element of the character to decode.
 * @param[in]	end			Iterator one past the last readable element.
 * @param[out]	output		Receives the decoded character, or @p invalidChar if the elements at @p begin do
 *							not form a valid character. Left untouched if the range is empty.
 * @param[in]	invalidChar	Value written to @p output for an unpaired surrogate.
 * @return					Iterator past the consumed elements.
 */
template<typename T>
T UTF16To32(T begin, T end, char32_t& output, char32_t invalidChar = 0)
{
	// Nothing to parse
	if (begin >= end)
		return begin;

	const auto firstElem = static_cast<char16_t>(*begin);
	++begin;

	const bool isHighSurrogate = (firstElem >= 0xD800) && (firstElem <= 0xDBFF);
	const bool isLowSurrogate = (firstElem >= 0xDC00) && (firstElem <= 0xDFFF);

	// A low surrogate that is not preceded by a high surrogate is not a character
	if (isLowSurrogate)
	{
		output = invalidChar;
		return begin;
	}

	// Not part of a surrogate pair, the element is the character
	if (!isHighSurrogate)
	{
		output = static_cast<char32_t>(firstElem);
		return begin;
	}

	// High surrogate with nothing following it, invalid character
	if (begin >= end)
	{
		output = invalidChar;
		return end;
	}

	const auto secondElem = static_cast<char32_t>(*begin);
	if ((secondElem < 0xDC00) || (secondElem > 0xDFFF))
	{
		// Unpaired high surrogate. The following element is deliberately not consumed so that the next call
		// decodes it on its own instead of it being lost.
		output = invalidChar;
		return begin;
	}

	++begin;
	output = static_cast<char32_t>(((firstElem - 0xD800) << 10) + (secondElem - 0xDC00) + 0x0010000);

	return begin;
}
149
150	/* Converts an UTF-32 encoded character into an UTF-16 character. /
151	template<typename T>
152	T UTF32To16(char32_t input, T output, UINT32 maxElems, char16_t invalidChar = `0`)
153	{
154	// No place to write the character
155	if (maxElems == `0`)
156	return output;
157
158	// Invalid character
159	if (input > `0x0010FFFF`)
160	{
161	*output = invalidChar;
162	++output;
163
164	return output;
165	}
166
167	// Can be encoded as a single element
168	if (input <= `0xFFFF`)
169	{
170	// Check if in valid range
171	if ((input >= `0xD800`) && (input <= `0xDFFF`))
172	{
173	*output = invalidChar;
174	++output;
175
176	return output;
177	}
178
179	output = (char16_t*)input;
180	++output;
181	}
182	else // Must be encoded as two elements
183	{
184	// Two elements won't fit
185	if (maxElems < `2`)
186	{
187	*output = invalidChar;
188	++output;
189
190	return output;
191	}
192
193	input -= `0x0010000`;
194
195	output = (char16_t*)((input >> `10`) + `0xD800`);
196	++output;
197
198	output = (char16_t*)((input & `0x3FFUL`) + `0xDC00`);
199	++output;
200	}
201
202	return output;
203	}
204
/**
 * Converts a wide character into an UTF-32 character. When wchar_t is four bytes the element is assumed to be
 * UTF-32 already and is copied as-is, otherwise the input is assumed to be UTF-16 and is decoded by UTF16To32().
 *
 * @param[in]	begin		Iterator to the first wide element of the character to decode.
 * @param[in]	end			Iterator one past the last readable element.
 * @param[out]	output		Receives the decoded character. Left untouched if the range is empty.
 * @param[in]	invalidChar	Forwarded to UTF16To32() on the UTF-16 path; unused on the UTF-32 path.
 * @return					Iterator past the consumed elements.
 */
template<typename T>
T wideToUTF32(T begin, T end, char32_t& output, char32_t invalidChar = 0)
{
	if (sizeof(wchar_t) == 4) // Assuming UTF-32 (i.e. Unix)
	{
		// Nothing to parse. Mirrors the empty-range check the UTF-16 path performs, so that the end iterator
		// is never dereferenced.
		if (begin >= end)
			return begin;

		output = static_cast<char32_t>(*begin);
		++begin;

		return begin;
	}
	else // Assuming UTF-16 (i.e. Windows)
		return UTF16To32(begin, end, output, invalidChar);
}
219
220	char32_t ANSIToUTF32(char input, const std::locale& locale = std::locale (""))
221	{
222	const std::ctype<wchar_t>& facet = std::use_facet<std::ctype<wchar_t>>(locale);
223
224	// Note: Not exactly valid on Windows, since the input character could require a surrogate pair.
225	// Consider improving this if it ever becomes an issue.
226	wchar_t wideChar = facet.widen(input);
227
228	char32_t output;
229	wideToUTF32(&wideChar, &wideChar + `1`, output);
230
231	return output;
232	}
233
234	template<typename T>
235	T UTF32ToWide(char32_t input, T output, UINT32 maxElems, wchar_t invalidChar = `0`)
236	{
237	if(sizeof(wchar_t) == `4`) // Assuming UTF-32 (i.e. Unix)
238	{
239	output = (wchar_t*)input;
240	++output;
241
242	return output;
243	}
244	else // Assuming UTF-16 (i.e. Windows)
245	return UTF32To16(input, output, maxElems, invalidChar);
246	}
247
/**
 * Converts an UTF-32 character into an ANSI character by narrowing it with the std::ctype<wchar_t> facet of
 * @p locale. @p invalidChar is handed to narrow() as the fallback value.
 */
char UTF32ToANSI(char32_t input, char invalidChar = 0, const std::locale& locale = std::locale (""))
{
	// Note: Not exactly correct, as narrow() operates on a single wchar_t and so cannot accept a surrogate pair
	const auto wideChar = static_cast<wchar_t>(input);

	return std::use_facet<std::ctype<wchar_t>>(locale).narrow(wideChar, invalidChar);
}
255
256	String UTF8::fromANSI(const String& input, const std::locale& locale)
257	{
258	String output;
259	output.reserve(input.size());
260
261	auto backInserter = std::back_inserter(output);
262
263	auto iter = input.begin();
264	while(iter != input.end())
265	{
266	char32_t u32char = ANSIToUTF32(*iter, locale);
267	UTF32To8(u32char, backInserter, `4`);
268
269	++iter;
270	}
271
272	return output;
273	}
274
275	String UTF8::toANSI(const String& input, const std::locale& locale, char invalidChar)
276	{
277	String output;
278
279	auto iter = input.begin();
280	while(iter != input.end())
281	{
282	char32_t u32char;
283	iter = UTF8To32(iter, input.end(), u32char, invalidChar);
284
285	output.push_back(UTF32ToANSI(u32char, invalidChar, locale));
286	}
287
288	return output;
289	}
290
291	String UTF8::fromWide(const WString& input)
292	{
293	String output;
294	output.reserve(input.size());
295
296	auto backInserter = std::back_inserter(output);
297
298	auto iter = input.begin();
299	while(iter != input.end())
300	{
301	char32_t u32char;
302	iter = wideToUTF32(iter, input.end(), u32char);
303	UTF32To8(u32char, backInserter, `4`);
304	}
305
306	return output;
307	}
308
309	WString UTF8::toWide(const String& input)
310	{
311	WString output;
312	auto backInserter = std::back_inserter(output);
313
314	auto iter = input.begin();
315	while(iter != input.end())
316	{
317	char32_t u32char;
318	iter = UTF8To32(iter, input.end(), u32char);
319
320	UTF32ToWide(u32char, backInserter, `2`);
321	}
322
323	return output;
324	}
325
326	String UTF8::fromUTF16(const U16String& input)
327	{
328	String output;
329	output.reserve(input.size());
330
331	auto backInserter = std::back_inserter(output);
332
333	auto iter = input.begin();
334	while(iter != input.end())
335	{
336	char32_t u32char = `0`;
337	iter = UTF16To32(iter, input.end(), u32char);
338	UTF32To8(u32char, backInserter, `4`);
339	}
340
341	return output;
342	}
343
344	U16String UTF8::toUTF16(const String& input)
345	{
346	U16String output;
347	auto backInserter = std::back_inserter(output);
348
349	auto iter = input.begin();
350	while(iter != input.end())
351	{
352	char32_t u32char;
353	iter = UTF8To32(iter, input.end(), u32char);
354
355	UTF32To16(u32char, backInserter, `2`);
356	}
357
358	return output;
359	}
360
361	String UTF8::fromUTF32(const U32String& input)
362	{
363	String output;
364	output.reserve(input.size());
365
366	auto backInserter = std::back_inserter(output);
367
368	auto iter = input.begin();
369	while(iter != input.end())
370	{
371	UTF32To8(*iter, backInserter, `4`);
372
373	++iter;
374	}
375
376	return output;
377	}
378
379	U32String UTF8::toUTF32(const String& input)
380	{
381	U32String output;
382
383	auto iter = input.begin();
384	while(iter != input.end())
385	{
386	char32_t u32char;
387	iter = UTF8To32(iter, input.end(), u32char);
388
389	output.push_back(u32char);
390	}
391
392	return output;
393	}
394
395	UINT32 UTF8::count(const String& input)
396	{
397	UINT32 length = `0`;
398	for (char i : input)
399	{
400	// Include only characters that don't start with bits 10
401	length += (i & `0xc0`) != `0x80`;
402	}
403
404	return length;
405	}
406
407	UINT32 UTF8::charToByteIndex(const String& input, UINT32 charIdx)
408	{
409	UINT32 curChar = `0`;
410	UINT32 curByte = `0`;
411	for (char i : input)
412	{
413	// Include only characters that don't start with bits 10
414	if((i & `0xc0`) != `0x80`)
415	{
416	if(curChar == charIdx)
417	return curByte;
418
419	curChar++;
420	}
421
422	curByte++;
423	}
424
425	return (UINT32)input.size();
426	}
427
428	UINT32 UTF8::charByteCount(const String& input, UINT32 charIdx)
429	{
430	const UINT32 byteIdx = charToByteIndex(input, charIdx);
431
432	UINT32 count = `1`;
433	for(auto i = (size_t)byteIdx + `1`; i < input.size(); i++)
434	{
435	if((i & `0xc0`) != `0x80`)
436	break;
437
438	count++;
439	}
440
441	return count;
442	}
443
444	String UTF8::toLower(const String& input)
445	{
446	return PlatformUtility::convertCaseUTF8(input, false);
447	}
448
449	String UTF8::toUpper(const String& input)
450	{
451	return PlatformUtility::convertCaseUTF8(input, true);
452	}
453	}
454

Browse the source code of bsFramework/Source/Foundation/bsfUtility/String/BsUnicode.cpp