blunicode.cpp source code [Blend2d/src/blend2d/blunicode.cpp]

1	// [Blend2D]
2	// 2D Vector Graphics Powered by a JIT Compiler.
3	//
4	// [License]
5	// Zlib - See LICENSE.md file in the package.
6
7	#include "./blapi-build_p.h"
8	#include "./blsupport_p.h"
9	#include "./blunicode_p.h"
10
11	// ============================================================================
12	// [Unicode Data]
13	// ============================================================================
14
// NOTE: Theoretically UTF-8 sequence can be extended to support sequences up
// to 6 bytes, however, since UCS-4 code-point's maximum value is 0x10FFFF it
// also limits the maximum length of a UTF-8 sequence to 4 bytes.
//
// Table of UTF-8 sequence lengths keyed by byte value (presumably indexed by
// the leading byte of a sequence - confirm against the readers that use it).
// A zero entry marks a byte that cannot start a sequence:
//   - 0x80..0xBF (128..191) - continuation bytes.
//   - 0xC0..0xC1 (192..193) - would only encode overlong 2-byte forms.
//   - 0xF5..0xFF (245..255) - would encode values above 0x10FFFF.
const uint8_t blUtf8SizeData[256] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0   - 15
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 16  - 31
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 32  - 47
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 48  - 63
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 64  - 79
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80  - 95
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 96  - 111
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 112 - 127
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 128 - 143
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 144 - 159
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 160 - 175
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 176 - 191
  0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 192 - 207
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 208 - 223
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 224 - 239
  4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  // 240 - 255
};
36
37	// ============================================================================
38	// [BLUnicode - Validation]
39	// ============================================================================
40
41	// Not really anything to validate, we just want to calculate a corresponding UTf-8 size.
42	static BL_INLINE BLResult validateLatin1String(const char* data, size_t size, BLUnicodeValidationState& state) noexcept {
43	size_t extra = `0`;
44	state.utf16Index = size;
45	state.utf32Index = size;
46
47	for (size_t i = `0`; i < size; i++)
48	extra += size_t(uint8_t(data[i])) >> `7`;
49
50	BLOverflowFlag of = `0`;
51	size_t utf8Size = blAddOverflow(size, extra, &of);
52
53	if (BL_UNLIKELY(of))
54	return blTraceError(BL_ERROR_DATA_TOO_LARGE);
55
56	state.utf8Index = utf8Size;
57	return BL_SUCCESS;
58	}
59
60	template<typename Iterator, uint32_t Flags>
61	static BL_INLINE BLResult validateUnicodeString(const void* data, size_t size, BLUnicodeValidationState& state) noexcept {
62	Iterator it(data, size);
63	BLResult result = it.template validate<Flags \| BL_UNICODE_IO_CALC_INDEX>();
64	state.utf8Index = it.utf8Index(data);
65	state.utf16Index = it.utf16Index(data);
66	state.utf32Index = it.utf32Index(data);
67	return result;
68	}
69
70	BLResult blValidateUnicode(const void* data, size_t sizeInBytes, uint32_t encoding, BLUnicodeValidationState& state) noexcept {
71	BLResult result;
72	state.reset();
73
74	switch (encoding) {
75	case BL_TEXT_ENCODING_LATIN1:
76	return validateLatin1String(static_cast<const char*>(data), sizeInBytes, state);
77
78	case BL_TEXT_ENCODING_UTF8:
79	return validateUnicodeString<BLUtf8Reader, BL_UNICODE_IO_STRICT>(data, sizeInBytes, state);
80
81	case BL_TEXT_ENCODING_UTF16:
82	// This will make sure we won't compile specialized code for architectures that don't penalize unaligned reads.
83	if (BL_UNALIGNED_IO_16 \|\| !blIsAligned(data, `2`))
84	result = validateUnicodeString<BLUtf16Reader, BL_UNICODE_IO_STRICT \| BL_UNICODE_IO_UNALIGNED>(data, sizeInBytes, state);
85	else
86	result = validateUnicodeString<BLUtf16Reader, BL_UNICODE_IO_STRICT>(data, sizeInBytes, state);
87
88	if (result == BL_SUCCESS && BL_UNLIKELY(sizeInBytes & `0x1`))
89	result = blTraceError(BL_ERROR_DATA_TRUNCATED);
90	return result;
91
92	case BL_TEXT_ENCODING_UTF32:
93	// This will make sure we won't compile specialized code for architectures that don't penalize unaligned reads.
94	if (BL_UNALIGNED_IO_32 \|\| !blIsAligned(data, `4`))
95	result = validateUnicodeString<BLUtf32Reader, BL_UNICODE_IO_STRICT \| BL_UNICODE_IO_UNALIGNED>(data, sizeInBytes, state);
96	else
97	result = validateUnicodeString<BLUtf32Reader, BL_UNICODE_IO_STRICT>(data, sizeInBytes, state);
98
99	if (result == BL_SUCCESS && BL_UNLIKELY(sizeInBytes & `0x3`))
100	result = blTraceError(BL_ERROR_DATA_TRUNCATED);
101	return result;
102
103	default:
104	return blTraceError(BL_ERROR_INVALID_VALUE);
105	}
106	}
107
108	// ============================================================================
109	// [BLUnicode - Conversion]
110	// ============================================================================
111
112	static BL_INLINE size_t offsetOfPtr(const void* base, const void* advanced) noexcept {
113	return (size_t)(static_cast<const char>(advanced) - static_cast<const* char*>(base));
114	}
115
116	// A simple implementation. It iterates `src` char-by-char and writes it to the
117	// destination. The advantage of this implementation is that switching `Writer`
118	// and `Iterator` can customize strictness, endianness, etc, so we don't have
119	// to repeat the code for different variations of UTF16 and UTF32.
120	template<typename Writer, typename Iterator, uint32_t IterFlags>
121	static BL_INLINE BLResult blConvertUnicodeImpl(void* dst, size_t dstSizeInBytes, const void* src, size_t srcSizeInBytes, BLUnicodeConversionState& state) noexcept {
122	typedef typename Writer::CharType DstChar;
123
124	Writer writer(static_cast<DstChar>(dst), dstSizeInBytes / sizeof*(DstChar));
125	Iterator iter(src, blAlignDown(srcSizeInBytes, Iterator::kCharSize));
126
127	BLResult result = BL_SUCCESS;
128	while (iter.hasNext()) {
129	uint32_t uc;
130	size_t ucSizeInBytes;
131
132	result = iter.template next<IterFlags>(uc, ucSizeInBytes);
133	if (BL_UNLIKELY(result != BL_SUCCESS))
134	break;
135
136	result = writer.write(uc);
137	if (BL_UNLIKELY(result != BL_SUCCESS)) {
138	state.dstIndex = offsetOfPtr(dst, writer._ptr);
139	state.srcIndex = offsetOfPtr(src, iter._ptr) - ucSizeInBytes;
140	return result;
141	}
142	}
143
144	state.dstIndex = offsetOfPtr(dst, writer._ptr);
145	state.srcIndex = offsetOfPtr(src, iter._ptr);
146
147	if (Iterator::kCharSize > `1` && result == BL_SUCCESS && !blIsAligned(state.srcIndex, Iterator::kCharSize))
148	return blTraceError(BL_ERROR_DATA_TRUNCATED);
149	else
150	return result;
151	}
152
153	BLResult blConvertUnicode(
154	void* dst, size_t dstSizeInBytes, uint32_t dstEncoding,
155	const void* src, size_t srcSizeInBytes, uint32_t srcEncoding,
156	BLUnicodeConversionState& state) noexcept {
157
158	constexpr bool BL_UNALIGNED_IO_Any = BL_UNALIGNED_IO_16 && BL_UNALIGNED_IO_32;
159
160	BLResult result = BL_SUCCESS;
161	state.reset();
162
163	uint32_t encodingCombined = (dstEncoding << `2`) \| srcEncoding;
164	switch (encodingCombined) {
165	// ------------------------------------------------------------------------
166	// [MemCpy]
167	// ------------------------------------------------------------------------
168
169	case (BL_TEXT_ENCODING_LATIN1 << `2`) \| BL_TEXT_ENCODING_LATIN1: {
170	size_t copySize = blMin(dstSizeInBytes, srcSizeInBytes);
171	memcpy(dst, src, copySize);
172
173	state.dstIndex = copySize;
174	state.srcIndex = copySize;
175
176	if (dstSizeInBytes < srcSizeInBytes)
177	result = blTraceError(BL_ERROR_NO_SPACE_LEFT);
178	break;
179	}
180
181	// ------------------------------------------------------------------------
182	// [Utf8 <- Latin1]
183	// ------------------------------------------------------------------------
184
185	case (BL_TEXT_ENCODING_UTF8 << `2`) \| BL_TEXT_ENCODING_LATIN1: {
186	BLUtf8Writer writer(static_cast<char*>(dst), dstSizeInBytes);
187	const uint8_t* src8 = static_cast<const uint8_t*>(src);
188
189	if (dstSizeInBytes / `2` >= srcSizeInBytes) {
190	// Fast case, there is enough space in `dst` even for the worst-case scenario.
191	for (size_t i = `0`; i < srcSizeInBytes; i++) {
192	uint32_t uc = src8[i];
193	if (uc <= `0x7F`)
194	writer.writeByteUnsafe(uc);
195	else
196	writer.write2BytesUnsafe(uc);
197	}
198
199	state.srcIndex = srcSizeInBytes;
200	state.dstIndex = writer.index(static_cast<char*>(dst));
201	}
202	else {
203	for (size_t i = `0`; i < srcSizeInBytes; i++) {
204	uint32_t uc = src8[i];
205	if (uc <= `0x7F`)
206	result = writer.writeByte(uc);
207	else
208	result = writer.write2Bytes(uc);
209
210	if (BL_UNLIKELY(result != BL_SUCCESS)) {
211	state.dstIndex = writer.index(static_cast<char*>(dst));
212	state.srcIndex = i;
213	break;
214	}
215	}
216	}
217
218	break;
219	}
220
221	// ------------------------------------------------------------------------
222	// [Utf8 <- Utf8]
223	// ------------------------------------------------------------------------
224
225	case (BL_TEXT_ENCODING_UTF8 << `2`) \| BL_TEXT_ENCODING_UTF8: {
226	size_t copySize = blMin(dstSizeInBytes, srcSizeInBytes);
227	BLUnicodeValidationState validationState;
228
229	result = blValidateUnicode(src, copySize, BL_TEXT_ENCODING_UTF8, validationState);
230	size_t validatedSize = validationState.utf8Index;
231
232	if (validatedSize)
233	memcpy(dst, src, validatedSize);
234
235	// Prevent `BL_ERROR_DATA_TRUNCATED` in case there is not enough space in destination.
236	if (copySize < srcSizeInBytes && (result == BL_SUCCESS \|\| result == BL_ERROR_DATA_TRUNCATED))
237	result = blTraceError(BL_ERROR_NO_SPACE_LEFT);
238
239	state.dstIndex = validatedSize;
240	state.srcIndex = validatedSize;
241	break;
242	}
243
244	// ------------------------------------------------------------------------
245	// [Utf8 <- Utf16]
246	// ------------------------------------------------------------------------
247
248	case (BL_TEXT_ENCODING_UTF8 << `2`) \| BL_TEXT_ENCODING_UTF16:
249	if (BL_UNALIGNED_IO_16 \|\| !blIsAligned(src, `2`))
250	result = blConvertUnicodeImpl<BLUtf8Writer, BLUtf16Reader, BL_UNICODE_IO_STRICT \| BL_UNICODE_IO_UNALIGNED>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
251	else
252	result = blConvertUnicodeImpl<BLUtf8Writer, BLUtf16Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
253	break;
254
255	// ------------------------------------------------------------------------
256	// [Utf8 <- Utf32]
257	// ------------------------------------------------------------------------
258
259	case (BL_TEXT_ENCODING_UTF8 << `2`) \| BL_TEXT_ENCODING_UTF32: {
260	if (BL_UNALIGNED_IO_32 \|\| !blIsAligned(src, `4`))
261	result = blConvertUnicodeImpl<BLUtf8Writer, BLUtf32Reader, BL_UNICODE_IO_STRICT \| BL_UNICODE_IO_UNALIGNED>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
262	else
263	result = blConvertUnicodeImpl<BLUtf8Writer, BLUtf32Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
264	break;
265	}
266
267	// ------------------------------------------------------------------------
268	// [Utf16 <- Latin1]
269	// ------------------------------------------------------------------------
270
271	case (BL_TEXT_ENCODING_UTF16 << `2`) \| BL_TEXT_ENCODING_LATIN1: {
272	size_t count = blMin(dstSizeInBytes / `2`, srcSizeInBytes);
273
274	if (BL_UNALIGNED_IO_16 \|\| blIsAligned(dst, `2`)) {
275	for (size_t i = `0`; i < count; i++)
276	blMemWriteU16aLE(static_cast<uint8_t>(dst) + i `2u`, static_cast<const uint8_t*>(src)[i]);
277	}
278	else {
279	for (size_t i = `0`; i < count; i++)
280	blMemWriteU16uLE(static_cast<uint8_t>(dst) + i `2u`, static_cast<const uint8_t*>(src)[i]);
281	}
282
283	if (count < srcSizeInBytes)
284	result = blTraceError(BL_ERROR_NO_SPACE_LEFT);
285
286	state.dstIndex = count * `2u`;
287	state.srcIndex = count;
288	break;
289	}
290
291	// ------------------------------------------------------------------------
292	// [Utf16 <- Utf8]
293	// ------------------------------------------------------------------------
294
295	case (BL_TEXT_ENCODING_UTF16 << `2`) \| BL_TEXT_ENCODING_UTF8: {
296	if (BL_UNALIGNED_IO_16 \|\| !blIsAligned(dst, `2`))
297	result = blConvertUnicodeImpl<BLUtf16Writer<BL_BYTE_ORDER_NATIVE, `1`>, BLUtf8Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
298	else
299	result = blConvertUnicodeImpl<BLUtf16Writer<BL_BYTE_ORDER_NATIVE, `2`>, BLUtf8Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
300	break;
301	}
302
303	// ------------------------------------------------------------------------
304	// [Utf16 <- Utf16]
305	// ------------------------------------------------------------------------
306
307	case (BL_TEXT_ENCODING_UTF16 << `2`) \| BL_TEXT_ENCODING_UTF16: {
308	size_t copySize = blAlignDown(blMin(dstSizeInBytes, srcSizeInBytes), `2`);
309	BLUnicodeValidationState validationState;
310
311	result = blValidateUnicode(src, copySize, BL_TEXT_ENCODING_UTF16, validationState);
312	size_t validatedSize = validationState.utf16Index * `2`;
313
314	memmove(dst, src, validatedSize);
315
316	// Prevent `BL_ERROR_DATA_TRUNCATED` in case there is not enough space in destination.
317	if (copySize < srcSizeInBytes && (result == BL_SUCCESS \|\| result == BL_ERROR_DATA_TRUNCATED))
318	result = blTraceError(BL_ERROR_NO_SPACE_LEFT);
319
320	// Report `BL_ERROR_DATA_TRUNCATED` is everything went right, but the
321	// source size was not aligned to 2 bytes.
322	if (result == BL_SUCCESS && !blIsAligned(srcSizeInBytes, `2`))
323	result = blTraceError(BL_ERROR_DATA_TRUNCATED);
324
325	state.dstIndex = validatedSize;
326	state.srcIndex = validatedSize;
327	break;
328	}
329
330	// ------------------------------------------------------------------------
331	// [Utf16 <- Utf32]
332	// ------------------------------------------------------------------------
333
334	case (BL_TEXT_ENCODING_UTF16 << `2`) \| BL_TEXT_ENCODING_UTF32: {
335	if (BL_UNALIGNED_IO_Any \|\| !blIsAligned(dst, `2`) \|\| !blIsAligned(src, `4`))
336	result = blConvertUnicodeImpl<BLUtf16Writer<BL_BYTE_ORDER_NATIVE, `1`>, BLUtf32Reader, BL_UNICODE_IO_STRICT \| BL_UNICODE_IO_UNALIGNED>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
337	else
338	result = blConvertUnicodeImpl<BLUtf16Writer<BL_BYTE_ORDER_NATIVE, `2`>, BLUtf32Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
339	break;
340	}
341
342	// ------------------------------------------------------------------------
343	// [Utf32 <- Latin1]
344	// ------------------------------------------------------------------------
345
346	case (BL_TEXT_ENCODING_UTF32 << `2`) \| BL_TEXT_ENCODING_LATIN1: {
347	size_t count = blMin(dstSizeInBytes / `4`, srcSizeInBytes);
348
349	if (BL_UNALIGNED_IO_32 \|\| blIsAligned(dst, `4`)) {
350	for (size_t i = `0`; i < count; i++)
351	blMemWriteU32a(static_cast<uint8_t>(dst) + i `4u`, static_cast<const uint8_t*>(src)[i]);
352	}
353	else {
354	for (size_t i = `0`; i < count; i++)
355	blMemWriteU32u(static_cast<uint8_t>(dst) + i `4u`, static_cast<const uint8_t*>(src)[i]);
356	}
357
358	if (count < srcSizeInBytes)
359	result = blTraceError(BL_ERROR_NO_SPACE_LEFT);
360
361	state.dstIndex = count * `4u`;
362	state.srcIndex = count;
363	break;
364	}
365
366	// ------------------------------------------------------------------------
367	// [Utf32 <- Utf8]
368	// ------------------------------------------------------------------------
369
370	case (BL_TEXT_ENCODING_UTF32 << `2`) \| BL_TEXT_ENCODING_UTF8: {
371	if (BL_UNALIGNED_IO_32 \|\| !blIsAligned(dst, `4`))
372	result = blConvertUnicodeImpl<BLUtf32Writer<BL_BYTE_ORDER_NATIVE, `1`>, BLUtf8Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
373	else
374	result = blConvertUnicodeImpl<BLUtf32Writer<BL_BYTE_ORDER_NATIVE, `4`>, BLUtf8Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
375	break;
376	}
377
378	// ------------------------------------------------------------------------
379	// [Utf32 <- Utf16]
380	// ------------------------------------------------------------------------
381
382	case (BL_TEXT_ENCODING_UTF32 << `2`) \| BL_TEXT_ENCODING_UTF16: {
383	if (BL_UNALIGNED_IO_Any \|\| !blIsAligned(dst, `4`) \|\| !blIsAligned(src, `2`))
384	result = blConvertUnicodeImpl<BLUtf32Writer<BL_BYTE_ORDER_NATIVE, `1`>, BLUtf16Reader, BL_UNICODE_IO_STRICT \| BL_UNICODE_IO_UNALIGNED>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
385	else
386	result = blConvertUnicodeImpl<BLUtf32Writer<BL_BYTE_ORDER_NATIVE, `4`>, BLUtf16Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
387	break;
388	}
389
390	// ------------------------------------------------------------------------
391	// [Utf32 <- Utf32]
392	// ------------------------------------------------------------------------
393
394	case (BL_TEXT_ENCODING_UTF32 << `2`) \| BL_TEXT_ENCODING_UTF32: {
395	if (BL_UNALIGNED_IO_32 \|\| !blIsAligned(dst, `4`) \|\| !blIsAligned(src, `4`))
396	result = blConvertUnicodeImpl<BLUtf32Writer<BL_BYTE_ORDER_NATIVE, `1`>, BLUtf32Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
397	else
398	result = blConvertUnicodeImpl<BLUtf32Writer<BL_BYTE_ORDER_NATIVE, `4`>, BLUtf32Reader, BL_UNICODE_IO_STRICT \| BL_UNICODE_IO_UNALIGNED>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
399	break;
400	}
401
402	// ------------------------------------------------------------------------
403	// [Invalid]
404	// ------------------------------------------------------------------------
405
406	default: {
407	return blTraceError(BL_ERROR_INVALID_VALUE);
408	}
409	}
410
411	return result;
412	}
413
414	// ============================================================================
415	// [BLUnicode - Test]
416	// ============================================================================
417
418	#ifdef BL_TEST
419	UNIT(blend2d_unicode) {
420	struct TestEntry {
421	char dst[`28`];
422	char src[`28`];
423	uint8_t dstSize;
424	uint8_t srcSize;
425	uint8_t dstEncoding;
426	uint8_t srcEncoding;
427	BLResult result;
428	};
429
430	static const TestEntry testEntries[] = {
431	#define ENTRY(DST, DST_ENC, SRC, SRC_ENC, ERROR_CODE) { \
432	DST, \
433	SRC, \
434	uint8_t(sizeof(DST) - 1), \
435	uint8_t(sizeof(SRC) - 1), \
436	uint8_t(BL_TEXT_ENCODING_##DST_ENC), \
437	uint8_t(BL_TEXT_ENCODING_##SRC_ENC), \
438	ERROR_CODE \
439	}
440
441	ENTRY("Test" , LATIN1, "Test" , LATIN1, BL_SUCCESS),
442	ENTRY("Test" , UTF8 , "Test" , LATIN1, BL_SUCCESS),
443	ENTRY("Test" , UTF8 , "Test" , UTF8 , BL_SUCCESS),
444	#if BL_BYTE_ORDER == 1234
445	ENTRY("Test" , UTF8 , "T\0e\0s\0t\0" , UTF16 , BL_SUCCESS),
446	ENTRY("T\0e\0s\0t\0" , UTF16 , "Test" , UTF8 , BL_SUCCESS),
447	#else
448	ENTRY("Test" , UTF8 , "\0T\0e\0s\0t" , UTF16 , BL_SUCCESS),
449	ENTRY("\0T\0e\0s\0t" , UTF16 , "Test" , UTF8 , BL_SUCCESS),
450	#endif
451
452	// Tests a Czech word (Rain in english) with diacritic marks, at most 2 BYTEs per character.
453	#if BL_BYTE_ORDER == 1234
454	ENTRY("\x44\xC3\xA9\xC5\xA1\xC5\xA5" , UTF8 , "\x44\x00\xE9\x00\x61\x01\x65\x01", UTF16 , BL_SUCCESS),
455	ENTRY("\x44\x00\xE9\x00\x61\x01\x65\x01", UTF16 , "\x44\xC3\xA9\xC5\xA1\xC5\xA5" , UTF8 , BL_SUCCESS),
456	#else
457	ENTRY("\x44\xC3\xA9\xC5\xA1\xC5\xA5" , UTF8 , "\x00\x44\x00\xE9\x01\x61\x01\x65", UTF16 , BL_SUCCESS),
458	ENTRY("\x00\x44\x00\xE9\x01\x61\x01\x65", UTF16 , "\x44\xC3\xA9\xC5\xA1\xC5\xA5" , UTF8 , BL_SUCCESS),
459	#endif
460
461	// Tests full-width digit zero (3 BYTEs per UTF-8 character).
462	#if BL_BYTE_ORDER == 1234
463	ENTRY("\xEF\xBC\x90" , UTF8 , "\x10\xFF" , UTF16 , BL_SUCCESS),
464	ENTRY("\x10\xFF" , UTF16 , "\xEF\xBC\x90" , UTF8 , BL_SUCCESS),
465	#else
466	ENTRY("\xEF\xBC\x90" , UTF8 , "\xFF\x10" , UTF16 , BL_SUCCESS),
467	ENTRY("\xFF\x10" , UTF16 , "\xEF\xBC\x90" , UTF8 , BL_SUCCESS),
468	#endif
469
470	// Tests `BL_CHAR_MAX` character (4 BYTEs per UTF-8 character, the highest possible unicode code-point).
471	#if BL_BYTE_ORDER == 1234
472	ENTRY("\xF4\x8F\xBF\xBF" , UTF8 , "\xFF\xDB\xFF\xDF" , UTF16 , BL_SUCCESS),
473	ENTRY("\xFF\xDB\xFF\xDF" , UTF16 , "\xF4\x8F\xBF\xBF" , UTF8 , BL_SUCCESS),
474	#else
475	ENTRY("\xF4\x8F\xBF\xBF" , UTF8 , "\xDB\xFF\xDF\xFF" , UTF16 , BL_SUCCESS),
476	ENTRY("\xDB\xFF\xDF\xFF" , UTF16 , "\xF4\x8F\xBF\xBF" , UTF8 , BL_SUCCESS),
477	#endif
478
479	#if BL_BYTE_ORDER == 1234
480	ENTRY("Test" , UTF8 , "T\0\0\0e\0\0\0s\0\0\0t\0\0\0" , UTF32 , BL_SUCCESS),
481	ENTRY("T\0e\0s\0t\0" , UTF16 , "T\0\0\0e\0\0\0s\0\0\0t\0\0\0" , UTF32 , BL_SUCCESS),
482	ENTRY("T\0\0\0e\0\0\0s\0\0\0t\0\0\0" , UTF32 , "T\0\0\0e\0\0\0s\0\0\0t\0\0\0" , UTF32 , BL_SUCCESS),
483	ENTRY("T\0\0\0e\0\0\0s\0\0\0t\0\0\0" , UTF32 , "T\0e\0s\0t\0" , UTF16 , BL_SUCCESS),
484	ENTRY("T\0\0\0e\0\0\0s\0\0\0t\0\0\0" , UTF32 , "Test" , LATIN1, BL_SUCCESS),
485	ENTRY("T\0\0\0e\0\0\0s\0\0\0t\0\0\0" , UTF32 , "Test" , UTF8 , BL_SUCCESS),
486	#else
487	ENTRY("Test" , UTF8 , "\0\0\0T\0\0\0e\0\0\0s\0\0\0t" , UTF32 , BL_SUCCESS),
488	ENTRY("\0T\0e\0s\0t" , UTF16 , "\0\0\0T\0\0\0e\0\0\0s\0\0\0t" , UTF32 , BL_SUCCESS),
489	ENTRY("\0\0\0T\0\0\0e\0\0\0s\0\0\0t" , UTF32 , "\0\0\0T\0\0\0e\0\0\0s\0\0\0t" , UTF32 , BL_SUCCESS),
490	ENTRY("\0\0\0T\0\0\0e\0\0\0s\0\0\0t" , UTF32 , "\0T\0e\0s\0t" , UTF16 , BL_SUCCESS)
491	ENTRY("\0\0\0T\0\0\0e\0\0\0s\0\0\0t" , UTF32 , "Test" , LATIN1, BL_SUCCESS),
492	ENTRY("\0\0\0T\0\0\0e\0\0\0s\0\0\0t" , UTF32 , "Test" , UTF8 , BL_SUCCESS),
493	#endif
494
495	// Truncated characters.
496	ENTRY("" , UTF8 , "\xC5" , UTF8 , BL_ERROR_DATA_TRUNCATED),
497	ENTRY("" , UTF8 , "\xEF" , UTF8 , BL_ERROR_DATA_TRUNCATED),
498	ENTRY("" , UTF8 , "\xEF\xBC" , UTF8 , BL_ERROR_DATA_TRUNCATED),
499	ENTRY("" , UTF8 , "\xF4" , UTF8 , BL_ERROR_DATA_TRUNCATED),
500	ENTRY("" , UTF8 , "\xF4\x8F" , UTF8 , BL_ERROR_DATA_TRUNCATED),
501	ENTRY("" , UTF8 , "\xF4\x8F\xBF" , UTF8 , BL_ERROR_DATA_TRUNCATED),
502
503	// Truncated character at the end (the converter must output the content, which was correct).
504	ENTRY("a" , UTF8 , "a\xF4\x8F\xBF" , UTF8 , BL_ERROR_DATA_TRUNCATED),
505	ENTRY("ab" , UTF8 , "ab\xF4\x8F\xBF" , UTF8 , BL_ERROR_DATA_TRUNCATED),
506	ENTRY("TestString" , UTF8 , "TestString\xC5" , UTF8 , BL_ERROR_DATA_TRUNCATED),
507	#if BL_BYTE_ORDER == 1234
508	ENTRY("T\0e\0s\0t\0S\0t\0r\0i\0n\0g\0" , UTF16 , "TestString\xC5" , UTF8 , BL_ERROR_DATA_TRUNCATED),
509	#else
510	ENTRY("\0T\0e\0s\0t\0S\0t\0r\0i\0n\0g" , UTF16 , "TestString\xC5" , UTF8 , BL_ERROR_DATA_TRUNCATED),
511	#endif
512
513	// Invalid UTf-8 characters.
514	ENTRY("" , UTF8 , "\x80" , UTF8 , BL_ERROR_INVALID_STRING),
515	ENTRY("" , UTF8 , "\xC1" , UTF8 , BL_ERROR_INVALID_STRING),
516	ENTRY("" , UTF8 , "\xF5\x8F\xBF\xBF" , UTF8 , BL_ERROR_INVALID_STRING),
517	ENTRY("" , UTF8 , "\x91\x8F\xBF\xBF" , UTF8 , BL_ERROR_INVALID_STRING),
518	ENTRY("" , UTF8 , "\xF6\x8F\xBF\xBF" , UTF8 , BL_ERROR_INVALID_STRING),
519	ENTRY("" , UTF8 , "\xF4\xFF\xBF\xBF" , UTF8 , BL_ERROR_INVALID_STRING),
520
521	// Overlong UTF-8 characters.
522	ENTRY("" , UTF8 , "\xC0\xA0" , UTF8 , BL_ERROR_INVALID_STRING)
523
524	#undef ENTRY
525	};
526
527	for (size_t i = `0`; i < BL_ARRAY_SIZE(testEntries); i++) {
528	const TestEntry& entry = testEntries[i];
529	char output[`32`];
530
531	BLUnicodeConversionState state;
532	BLResult result = blConvertUnicode(output, `32`, entry.dstEncoding, entry.src, entry.srcSize, entry.srcEncoding, state);
533
534	bool failed = (result != entry.result) \|\|
535	(state.dstIndex != entry.dstSize) \|\|
536	(memcmp(output, entry.dst, state.dstIndex) != `0`);
537
538	if (failed) {
539	size_t inputSize = entry.srcSize;
540	size_t outputSize = state.dstIndex;
541	size_t expectedSize = entry.dstSize;
542
543	printf(" Failed Entry #%u\n", unsigned(i));
544
545	printf(" Input :");
546	for (size_t j = `0`; j < inputSize; j++)
547	printf(" %02X", uint8_t(entry.src[j]));
548	printf("%s\n", inputSize ? "" : " (Nothing)");
549
550	printf(" Output :");
551	for (size_t j = `0`; j < outputSize; j++)
552	printf(" %02X", uint8_t(output[j]));
553	printf("%s\n", outputSize ? "" : " (Nothing)");
554
555	printf(" Expected:");
556	for (size_t j = `0`; j < expectedSize; j++)
557	printf(" %02X", uint8_t(entry.dst[j]));
558	printf("%s\n", expectedSize ? "" : " (Nothing)");
559	printf(" ErrorCode: Actual(%u) %s Expected(%u)\n", result, (result == entry.result) ? "==" : "!=", entry.result);
560	}
561
562	EXPECT(!failed);
563	}
564	}
565
566	UNIT(blend2d_unicode_io) {
567	INFO("BLUtf8Reader");
568	{
569	const uint8_t data[] = {
570	`0xE2`, `0x82`, `0xAC`, // U+0020AC
571	`0xF0`, `0x90`, `0x8D`, `0x88` // U+010348
572	};
573
574	BLUtf8Reader it(data, BL_ARRAY_SIZE(data));
575	uint32_t uc;
576
577	EXPECT(it.hasNext());
578	EXPECT(it.next<BL_UNICODE_IO_CALC_INDEX>(uc) == BL_SUCCESS);
579	EXPECT(uc == `0x0020AC`);
580
581	EXPECT(it.hasNext());
582	EXPECT(it.next<BL_UNICODE_IO_CALC_INDEX>(uc) == BL_SUCCESS);
583	EXPECT(uc == `0x010348`);
584
585	EXPECT(!it.hasNext());
586
587	// Verify that sizes were calculated correctly.
588	EXPECT(it.byteIndex(data) == `7`);
589	EXPECT(it.utf8Index(data) == `7`);
590	EXPECT(it.utf16Index(data) == `3`); // 3 code-points (1 BMP and 1 SMP).
591	EXPECT(it.utf32Index(data) == `2`); // 2 code-points.
592
593	const uint8_t invalidData[] = { `0xE2`, `0x82` };
594	it.reset(invalidData, BL_ARRAY_SIZE(invalidData));
595
596	EXPECT(it.hasNext());
597	EXPECT(it.next(uc) == BL_ERROR_DATA_TRUNCATED);
598
599	// After error the iterator should not move.
600	EXPECT(it.hasNext());
601	EXPECT(it.byteIndex(invalidData) == `0`);
602	EXPECT(it.utf8Index(invalidData) == `0`);
603	EXPECT(it.utf16Index(invalidData) == `0`);
604	EXPECT(it.utf32Index(invalidData) == `0`);
605	}
606
607	INFO("BLUtf16Reader");
608	{
609	const uint16_t data[] = {
610	`0x20AC`, // U+0020AC
611	`0xD800`, `0xDF48` // U+010348
612	};
613
614	BLUtf16Reader it(data, BL_ARRAY_SIZE(data) * sizeof(uint16_t));
615	uint32_t uc;
616
617	EXPECT(it.hasNext());
618	EXPECT(it.next<BL_UNICODE_IO_CALC_INDEX>(uc) == BL_SUCCESS);
619	EXPECT(uc == `0x0020AC`);
620
621	EXPECT(it.hasNext());
622	EXPECT(it.next<BL_UNICODE_IO_CALC_INDEX>(uc) == BL_SUCCESS);
623	EXPECT(uc == `0x010348`);
624
625	EXPECT(!it.hasNext());
626
627	// Verify that sizes were calculated correctly.
628	EXPECT(it.byteIndex(data) == `6`);
629	EXPECT(it.utf8Index(data) == `7`);
630	EXPECT(it.utf16Index(data) == `3`); // 3 code-points (1 BMP and 1 SMP).
631	EXPECT(it.utf32Index(data) == `2`); // 2 code-points.
632
633	const uint16_t invalidData[] = { `0xD800` };
634	it.reset(invalidData, BL_ARRAY_SIZE(invalidData) * sizeof(uint16_t));
635
636	EXPECT(it.hasNext());
637	EXPECT(it.next<BL_UNICODE_IO_CALC_INDEX \| BL_UNICODE_IO_STRICT>(uc) == BL_ERROR_DATA_TRUNCATED);
638
639	// After an error the iterator should not move.
640	EXPECT(it.hasNext());
641	EXPECT(it.byteIndex(invalidData) == `0`);
642	EXPECT(it.utf8Index(invalidData) == `0`);
643	EXPECT(it.utf16Index(invalidData) == `0`);
644	EXPECT(it.utf32Index(invalidData) == `0`);
645
646	// However, this should pass in non-strict mode.
647	EXPECT(it.next(uc) == BL_SUCCESS);
648	EXPECT(!it.hasNext());
649	}
650
651	INFO("BLUtf32Reader");
652	{
653	const uint32_t data[] = {
654	`0x0020AC`,
655	`0x010348`
656	};
657
658	BLUtf32Reader it(data, BL_ARRAY_SIZE(data) * sizeof(uint32_t));
659	uint32_t uc;
660
661	EXPECT(it.hasNext());
662	EXPECT(it.next<BL_UNICODE_IO_CALC_INDEX>(uc) == BL_SUCCESS);
663	EXPECT(uc == `0x0020AC`);
664
665	EXPECT(it.hasNext());
666	EXPECT(it.next<BL_UNICODE_IO_CALC_INDEX>(uc) == BL_SUCCESS);
667	EXPECT(uc == `0x010348`);
668
669	EXPECT(!it.hasNext());
670
671	// Verify that sizes were calculated correctly.
672	EXPECT(it.byteIndex(data) == `8`);
673	EXPECT(it.utf8Index(data) == `7`);
674	EXPECT(it.utf16Index(data) == `3`); // 3 code-points (1 BMP and 1 SMP).
675	EXPECT(it.utf32Index(data) == `2`); // 2 code-points.
676
677	const uint32_t invalidData[] = { `0xD800` };
678	it.reset(invalidData, BL_ARRAY_SIZE(invalidData) * sizeof(uint32_t));
679
680	EXPECT(it.hasNext());
681	EXPECT(it.next<BL_UNICODE_IO_CALC_INDEX \| BL_UNICODE_IO_STRICT>(uc) == BL_ERROR_INVALID_STRING);
682
683	// After an error the iterator should not move.
684	EXPECT(it.hasNext());
685	EXPECT(it.byteIndex(invalidData) == `0`);
686	EXPECT(it.utf8Index(invalidData) == `0`);
687	EXPECT(it.utf16Index(invalidData) == `0`);
688	EXPECT(it.utf32Index(invalidData) == `0`);
689
690	// However, this should pass in non-strict mode.
691	EXPECT(it.next(uc) == BL_SUCCESS);
692	EXPECT(!it.hasNext());
693	}
694
695	INFO("BLUtf8Writer");
696	{
697	char dst[`7`];
698	BLUtf8Writer writer(dst, BL_ARRAY_SIZE(dst));
699
700	EXPECT(writer.write(`0x20ACu`) == BL_SUCCESS);
701	EXPECT(uint8_t(dst[`0`]) == `0xE2`);
702	EXPECT(uint8_t(dst[`1`]) == `0x82`);
703	EXPECT(uint8_t(dst[`2`]) == `0xAC`);
704
705	EXPECT(writer.write(`0x010348u`) == BL_SUCCESS);
706	EXPECT(uint8_t(dst[`3`]) == `0xF0`);
707	EXPECT(uint8_t(dst[`4`]) == `0x90`);
708	EXPECT(uint8_t(dst[`5`]) == `0x8D`);
709	EXPECT(uint8_t(dst[`6`]) == `0x88`);
710	EXPECT(writer.atEnd());
711
712	writer.reset(dst, `1`);
713	EXPECT(writer.write(`0x20ACu`) == BL_ERROR_NO_SPACE_LEFT);
714	EXPECT(writer.write(`0x0080u`) == BL_ERROR_NO_SPACE_LEFT);
715	EXPECT(writer.write(`0x00C1u`) == BL_ERROR_NO_SPACE_LEFT);
716
717	// We have only one byte left so this must pass...
718	EXPECT(writer.write(`'a'`) == BL_SUCCESS);
719	EXPECT(writer.atEnd());
720
721	writer.reset(dst, `2`);
722	EXPECT(writer.write(`0x20ACu`) == BL_ERROR_NO_SPACE_LEFT);
723	EXPECT(writer.write(`0x00C1u`) == BL_SUCCESS);
724	EXPECT(uint8_t(dst[`0`]) == `0xC3u`);
725	EXPECT(uint8_t(dst[`1`]) == `0x81u`);
726	EXPECT(writer.atEnd());
727	EXPECT(writer.write(`'a'`) == BL_ERROR_NO_SPACE_LEFT);
728	}
729
730	INFO("BLUtf16Writer");
731	{
732	uint16_t dst[`3`];
733	BLUtf16Writer<> writer(dst, BL_ARRAY_SIZE(dst));
734
735	EXPECT(writer.write(`0x010348u`) == BL_SUCCESS);
736	EXPECT(dst[`0`] == `0xD800u`);
737	EXPECT(dst[`1`] == `0xDF48u`);
738
739	EXPECT(writer.write(`0x010348u`) == BL_ERROR_NO_SPACE_LEFT);
740	EXPECT(writer.write(`0x20ACu`) == BL_SUCCESS);
741	EXPECT(dst[`2`] == `0x20ACu`);
742	EXPECT(writer.atEnd());
743	}
744	}
745	#endif
746

Browse the source code of Blend2d/src/blend2d/blunicode.cpp