1// [Blend2D]
2// 2D Vector Graphics Powered by a JIT Compiler.
3//
4// [License]
5// Zlib - See LICENSE.md file in the package.
6
7#include "./blapi-build_p.h"
8#include "./blsupport_p.h"
9#include "./blunicode_p.h"
10
11// ============================================================================
12// [Unicode Data]
13// ============================================================================
14
15// NOTE: Theoretically UTF-8 sequence can be extended to support sequences up
16// to 6 bytes, however, since UCS-4 code-point's maximum value is 0x10FFFF it
17// also limits the maximum length of a UTF-8 sequence to 4 bytes.
// Lookup table indexed by a byte value. Each entry is the total size in bytes
// (1 to 4) of a UTF-8 sequence that starts with that byte, or 0 when the byte
// is not accepted as the first byte of a sequence.
const uint8_t blUtf8SizeData[256] = {
  // 0x00..0x7F - Single-byte sequences.
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00..0x0F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10..0x1F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20..0x2F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30..0x3F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40..0x4F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50..0x5F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60..0x6F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70..0x7F

  // 0x80..0xBF - Not accepted as the first byte of a sequence.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80..0x8F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90..0x9F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0..0xAF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0..0xBF

  // 0xC0..0xDF - Two-byte sequences, except 0xC0 and 0xC1, which are rejected.
  0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0..0xCF
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0..0xDF

  // 0xE0..0xEF - Three-byte sequences.
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0..0xEF

  // 0xF0..0xF4 - Four-byte sequences; 0xF5..0xFF are rejected.
  4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  // 0xF0..0xFF
};
36
37// ============================================================================
38// [BLUnicode - Validation]
39// ============================================================================
40
41// Not really anything to validate, we just want to calculate a corresponding UTf-8 size.
42static BL_INLINE BLResult validateLatin1String(const char* data, size_t size, BLUnicodeValidationState& state) noexcept {
43 size_t extra = 0;
44 state.utf16Index = size;
45 state.utf32Index = size;
46
47 for (size_t i = 0; i < size; i++)
48 extra += size_t(uint8_t(data[i])) >> 7;
49
50 BLOverflowFlag of = 0;
51 size_t utf8Size = blAddOverflow(size, extra, &of);
52
53 if (BL_UNLIKELY(of))
54 return blTraceError(BL_ERROR_DATA_TOO_LARGE);
55
56 state.utf8Index = utf8Size;
57 return BL_SUCCESS;
58}
59
60template<typename Iterator, uint32_t Flags>
61static BL_INLINE BLResult validateUnicodeString(const void* data, size_t size, BLUnicodeValidationState& state) noexcept {
62 Iterator it(data, size);
63 BLResult result = it.template validate<Flags | BL_UNICODE_IO_CALC_INDEX>();
64 state.utf8Index = it.utf8Index(data);
65 state.utf16Index = it.utf16Index(data);
66 state.utf32Index = it.utf32Index(data);
67 return result;
68}
69
70BLResult blValidateUnicode(const void* data, size_t sizeInBytes, uint32_t encoding, BLUnicodeValidationState& state) noexcept {
71 BLResult result;
72 state.reset();
73
74 switch (encoding) {
75 case BL_TEXT_ENCODING_LATIN1:
76 return validateLatin1String(static_cast<const char*>(data), sizeInBytes, state);
77
78 case BL_TEXT_ENCODING_UTF8:
79 return validateUnicodeString<BLUtf8Reader, BL_UNICODE_IO_STRICT>(data, sizeInBytes, state);
80
81 case BL_TEXT_ENCODING_UTF16:
82 // This will make sure we won't compile specialized code for architectures that don't penalize unaligned reads.
83 if (BL_UNALIGNED_IO_16 || !blIsAligned(data, 2))
84 result = validateUnicodeString<BLUtf16Reader, BL_UNICODE_IO_STRICT | BL_UNICODE_IO_UNALIGNED>(data, sizeInBytes, state);
85 else
86 result = validateUnicodeString<BLUtf16Reader, BL_UNICODE_IO_STRICT>(data, sizeInBytes, state);
87
88 if (result == BL_SUCCESS && BL_UNLIKELY(sizeInBytes & 0x1))
89 result = blTraceError(BL_ERROR_DATA_TRUNCATED);
90 return result;
91
92 case BL_TEXT_ENCODING_UTF32:
93 // This will make sure we won't compile specialized code for architectures that don't penalize unaligned reads.
94 if (BL_UNALIGNED_IO_32 || !blIsAligned(data, 4))
95 result = validateUnicodeString<BLUtf32Reader, BL_UNICODE_IO_STRICT | BL_UNICODE_IO_UNALIGNED>(data, sizeInBytes, state);
96 else
97 result = validateUnicodeString<BLUtf32Reader, BL_UNICODE_IO_STRICT>(data, sizeInBytes, state);
98
99 if (result == BL_SUCCESS && BL_UNLIKELY(sizeInBytes & 0x3))
100 result = blTraceError(BL_ERROR_DATA_TRUNCATED);
101 return result;
102
103 default:
104 return blTraceError(BL_ERROR_INVALID_VALUE);
105 }
106}
107
108// ============================================================================
109// [BLUnicode - Conversion]
110// ============================================================================
111
112static BL_INLINE size_t offsetOfPtr(const void* base, const void* advanced) noexcept {
113 return (size_t)(static_cast<const char*>(advanced) - static_cast<const char*>(base));
114}
115
116// A simple implementation. It iterates `src` char-by-char and writes it to the
117// destination. The advantage of this implementation is that switching `Writer`
118// and `Iterator` can customize strictness, endianness, etc, so we don't have
119// to repeat the code for different variations of UTF16 and UTF32.
120template<typename Writer, typename Iterator, uint32_t IterFlags>
121static BL_INLINE BLResult blConvertUnicodeImpl(void* dst, size_t dstSizeInBytes, const void* src, size_t srcSizeInBytes, BLUnicodeConversionState& state) noexcept {
122 typedef typename Writer::CharType DstChar;
123
124 Writer writer(static_cast<DstChar*>(dst), dstSizeInBytes / sizeof(DstChar));
125 Iterator iter(src, blAlignDown(srcSizeInBytes, Iterator::kCharSize));
126
127 BLResult result = BL_SUCCESS;
128 while (iter.hasNext()) {
129 uint32_t uc;
130 size_t ucSizeInBytes;
131
132 result = iter.template next<IterFlags>(uc, ucSizeInBytes);
133 if (BL_UNLIKELY(result != BL_SUCCESS))
134 break;
135
136 result = writer.write(uc);
137 if (BL_UNLIKELY(result != BL_SUCCESS)) {
138 state.dstIndex = offsetOfPtr(dst, writer._ptr);
139 state.srcIndex = offsetOfPtr(src, iter._ptr) - ucSizeInBytes;
140 return result;
141 }
142 }
143
144 state.dstIndex = offsetOfPtr(dst, writer._ptr);
145 state.srcIndex = offsetOfPtr(src, iter._ptr);
146
147 if (Iterator::kCharSize > 1 && result == BL_SUCCESS && !blIsAligned(state.srcIndex, Iterator::kCharSize))
148 return blTraceError(BL_ERROR_DATA_TRUNCATED);
149 else
150 return result;
151}
152
153BLResult blConvertUnicode(
154 void* dst, size_t dstSizeInBytes, uint32_t dstEncoding,
155 const void* src, size_t srcSizeInBytes, uint32_t srcEncoding,
156 BLUnicodeConversionState& state) noexcept {
157
158 constexpr bool BL_UNALIGNED_IO_Any = BL_UNALIGNED_IO_16 && BL_UNALIGNED_IO_32;
159
160 BLResult result = BL_SUCCESS;
161 state.reset();
162
163 uint32_t encodingCombined = (dstEncoding << 2) | srcEncoding;
164 switch (encodingCombined) {
165 // ------------------------------------------------------------------------
166 // [MemCpy]
167 // ------------------------------------------------------------------------
168
169 case (BL_TEXT_ENCODING_LATIN1 << 2) | BL_TEXT_ENCODING_LATIN1: {
170 size_t copySize = blMin(dstSizeInBytes, srcSizeInBytes);
171 memcpy(dst, src, copySize);
172
173 state.dstIndex = copySize;
174 state.srcIndex = copySize;
175
176 if (dstSizeInBytes < srcSizeInBytes)
177 result = blTraceError(BL_ERROR_NO_SPACE_LEFT);
178 break;
179 }
180
181 // ------------------------------------------------------------------------
182 // [Utf8 <- Latin1]
183 // ------------------------------------------------------------------------
184
185 case (BL_TEXT_ENCODING_UTF8 << 2) | BL_TEXT_ENCODING_LATIN1: {
186 BLUtf8Writer writer(static_cast<char*>(dst), dstSizeInBytes);
187 const uint8_t* src8 = static_cast<const uint8_t*>(src);
188
189 if (dstSizeInBytes / 2 >= srcSizeInBytes) {
190 // Fast case, there is enough space in `dst` even for the worst-case scenario.
191 for (size_t i = 0; i < srcSizeInBytes; i++) {
192 uint32_t uc = src8[i];
193 if (uc <= 0x7F)
194 writer.writeByteUnsafe(uc);
195 else
196 writer.write2BytesUnsafe(uc);
197 }
198
199 state.srcIndex = srcSizeInBytes;
200 state.dstIndex = writer.index(static_cast<char*>(dst));
201 }
202 else {
203 for (size_t i = 0; i < srcSizeInBytes; i++) {
204 uint32_t uc = src8[i];
205 if (uc <= 0x7F)
206 result = writer.writeByte(uc);
207 else
208 result = writer.write2Bytes(uc);
209
210 if (BL_UNLIKELY(result != BL_SUCCESS)) {
211 state.dstIndex = writer.index(static_cast<char*>(dst));
212 state.srcIndex = i;
213 break;
214 }
215 }
216 }
217
218 break;
219 }
220
221 // ------------------------------------------------------------------------
222 // [Utf8 <- Utf8]
223 // ------------------------------------------------------------------------
224
225 case (BL_TEXT_ENCODING_UTF8 << 2) | BL_TEXT_ENCODING_UTF8: {
226 size_t copySize = blMin(dstSizeInBytes, srcSizeInBytes);
227 BLUnicodeValidationState validationState;
228
229 result = blValidateUnicode(src, copySize, BL_TEXT_ENCODING_UTF8, validationState);
230 size_t validatedSize = validationState.utf8Index;
231
232 if (validatedSize)
233 memcpy(dst, src, validatedSize);
234
235 // Prevent `BL_ERROR_DATA_TRUNCATED` in case there is not enough space in destination.
236 if (copySize < srcSizeInBytes && (result == BL_SUCCESS || result == BL_ERROR_DATA_TRUNCATED))
237 result = blTraceError(BL_ERROR_NO_SPACE_LEFT);
238
239 state.dstIndex = validatedSize;
240 state.srcIndex = validatedSize;
241 break;
242 }
243
244 // ------------------------------------------------------------------------
245 // [Utf8 <- Utf16]
246 // ------------------------------------------------------------------------
247
248 case (BL_TEXT_ENCODING_UTF8 << 2) | BL_TEXT_ENCODING_UTF16:
249 if (BL_UNALIGNED_IO_16 || !blIsAligned(src, 2))
250 result = blConvertUnicodeImpl<BLUtf8Writer, BLUtf16Reader, BL_UNICODE_IO_STRICT | BL_UNICODE_IO_UNALIGNED>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
251 else
252 result = blConvertUnicodeImpl<BLUtf8Writer, BLUtf16Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
253 break;
254
255 // ------------------------------------------------------------------------
256 // [Utf8 <- Utf32]
257 // ------------------------------------------------------------------------
258
259 case (BL_TEXT_ENCODING_UTF8 << 2) | BL_TEXT_ENCODING_UTF32: {
260 if (BL_UNALIGNED_IO_32 || !blIsAligned(src, 4))
261 result = blConvertUnicodeImpl<BLUtf8Writer, BLUtf32Reader, BL_UNICODE_IO_STRICT | BL_UNICODE_IO_UNALIGNED>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
262 else
263 result = blConvertUnicodeImpl<BLUtf8Writer, BLUtf32Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
264 break;
265 }
266
267 // ------------------------------------------------------------------------
268 // [Utf16 <- Latin1]
269 // ------------------------------------------------------------------------
270
271 case (BL_TEXT_ENCODING_UTF16 << 2) | BL_TEXT_ENCODING_LATIN1: {
272 size_t count = blMin(dstSizeInBytes / 2, srcSizeInBytes);
273
274 if (BL_UNALIGNED_IO_16 || blIsAligned(dst, 2)) {
275 for (size_t i = 0; i < count; i++)
276 blMemWriteU16aLE(static_cast<uint8_t*>(dst) + i * 2u, static_cast<const uint8_t*>(src)[i]);
277 }
278 else {
279 for (size_t i = 0; i < count; i++)
280 blMemWriteU16uLE(static_cast<uint8_t*>(dst) + i * 2u, static_cast<const uint8_t*>(src)[i]);
281 }
282
283 if (count < srcSizeInBytes)
284 result = blTraceError(BL_ERROR_NO_SPACE_LEFT);
285
286 state.dstIndex = count * 2u;
287 state.srcIndex = count;
288 break;
289 }
290
291 // ------------------------------------------------------------------------
292 // [Utf16 <- Utf8]
293 // ------------------------------------------------------------------------
294
295 case (BL_TEXT_ENCODING_UTF16 << 2) | BL_TEXT_ENCODING_UTF8: {
296 if (BL_UNALIGNED_IO_16 || !blIsAligned(dst, 2))
297 result = blConvertUnicodeImpl<BLUtf16Writer<BL_BYTE_ORDER_NATIVE, 1>, BLUtf8Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
298 else
299 result = blConvertUnicodeImpl<BLUtf16Writer<BL_BYTE_ORDER_NATIVE, 2>, BLUtf8Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
300 break;
301 }
302
303 // ------------------------------------------------------------------------
304 // [Utf16 <- Utf16]
305 // ------------------------------------------------------------------------
306
307 case (BL_TEXT_ENCODING_UTF16 << 2) | BL_TEXT_ENCODING_UTF16: {
308 size_t copySize = blAlignDown(blMin(dstSizeInBytes, srcSizeInBytes), 2);
309 BLUnicodeValidationState validationState;
310
311 result = blValidateUnicode(src, copySize, BL_TEXT_ENCODING_UTF16, validationState);
312 size_t validatedSize = validationState.utf16Index * 2;
313
314 memmove(dst, src, validatedSize);
315
316 // Prevent `BL_ERROR_DATA_TRUNCATED` in case there is not enough space in destination.
317 if (copySize < srcSizeInBytes && (result == BL_SUCCESS || result == BL_ERROR_DATA_TRUNCATED))
318 result = blTraceError(BL_ERROR_NO_SPACE_LEFT);
319
320 // Report `BL_ERROR_DATA_TRUNCATED` is everything went right, but the
321 // source size was not aligned to 2 bytes.
322 if (result == BL_SUCCESS && !blIsAligned(srcSizeInBytes, 2))
323 result = blTraceError(BL_ERROR_DATA_TRUNCATED);
324
325 state.dstIndex = validatedSize;
326 state.srcIndex = validatedSize;
327 break;
328 }
329
330 // ------------------------------------------------------------------------
331 // [Utf16 <- Utf32]
332 // ------------------------------------------------------------------------
333
334 case (BL_TEXT_ENCODING_UTF16 << 2) | BL_TEXT_ENCODING_UTF32: {
335 if (BL_UNALIGNED_IO_Any || !blIsAligned(dst, 2) || !blIsAligned(src, 4))
336 result = blConvertUnicodeImpl<BLUtf16Writer<BL_BYTE_ORDER_NATIVE, 1>, BLUtf32Reader, BL_UNICODE_IO_STRICT | BL_UNICODE_IO_UNALIGNED>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
337 else
338 result = blConvertUnicodeImpl<BLUtf16Writer<BL_BYTE_ORDER_NATIVE, 2>, BLUtf32Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
339 break;
340 }
341
342 // ------------------------------------------------------------------------
343 // [Utf32 <- Latin1]
344 // ------------------------------------------------------------------------
345
346 case (BL_TEXT_ENCODING_UTF32 << 2) | BL_TEXT_ENCODING_LATIN1: {
347 size_t count = blMin(dstSizeInBytes / 4, srcSizeInBytes);
348
349 if (BL_UNALIGNED_IO_32 || blIsAligned(dst, 4)) {
350 for (size_t i = 0; i < count; i++)
351 blMemWriteU32a(static_cast<uint8_t*>(dst) + i * 4u, static_cast<const uint8_t*>(src)[i]);
352 }
353 else {
354 for (size_t i = 0; i < count; i++)
355 blMemWriteU32u(static_cast<uint8_t*>(dst) + i * 4u, static_cast<const uint8_t*>(src)[i]);
356 }
357
358 if (count < srcSizeInBytes)
359 result = blTraceError(BL_ERROR_NO_SPACE_LEFT);
360
361 state.dstIndex = count * 4u;
362 state.srcIndex = count;
363 break;
364 }
365
366 // ------------------------------------------------------------------------
367 // [Utf32 <- Utf8]
368 // ------------------------------------------------------------------------
369
370 case (BL_TEXT_ENCODING_UTF32 << 2) | BL_TEXT_ENCODING_UTF8: {
371 if (BL_UNALIGNED_IO_32 || !blIsAligned(dst, 4))
372 result = blConvertUnicodeImpl<BLUtf32Writer<BL_BYTE_ORDER_NATIVE, 1>, BLUtf8Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
373 else
374 result = blConvertUnicodeImpl<BLUtf32Writer<BL_BYTE_ORDER_NATIVE, 4>, BLUtf8Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
375 break;
376 }
377
378 // ------------------------------------------------------------------------
379 // [Utf32 <- Utf16]
380 // ------------------------------------------------------------------------
381
382 case (BL_TEXT_ENCODING_UTF32 << 2) | BL_TEXT_ENCODING_UTF16: {
383 if (BL_UNALIGNED_IO_Any || !blIsAligned(dst, 4) || !blIsAligned(src, 2))
384 result = blConvertUnicodeImpl<BLUtf32Writer<BL_BYTE_ORDER_NATIVE, 1>, BLUtf16Reader, BL_UNICODE_IO_STRICT | BL_UNICODE_IO_UNALIGNED>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
385 else
386 result = blConvertUnicodeImpl<BLUtf32Writer<BL_BYTE_ORDER_NATIVE, 4>, BLUtf16Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
387 break;
388 }
389
390 // ------------------------------------------------------------------------
391 // [Utf32 <- Utf32]
392 // ------------------------------------------------------------------------
393
394 case (BL_TEXT_ENCODING_UTF32 << 2) | BL_TEXT_ENCODING_UTF32: {
395 if (BL_UNALIGNED_IO_32 || !blIsAligned(dst, 4) || !blIsAligned(src, 4))
396 result = blConvertUnicodeImpl<BLUtf32Writer<BL_BYTE_ORDER_NATIVE, 1>, BLUtf32Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
397 else
398 result = blConvertUnicodeImpl<BLUtf32Writer<BL_BYTE_ORDER_NATIVE, 4>, BLUtf32Reader, BL_UNICODE_IO_STRICT | BL_UNICODE_IO_UNALIGNED>(dst, dstSizeInBytes, src, srcSizeInBytes, state);
399 break;
400 }
401
402 // ------------------------------------------------------------------------
403 // [Invalid]
404 // ------------------------------------------------------------------------
405
406 default: {
407 return blTraceError(BL_ERROR_INVALID_VALUE);
408 }
409 }
410
411 return result;
412}
413
414// ============================================================================
415// [BLUnicode - Test]
416// ============================================================================
417
418#ifdef BL_TEST
419UNIT(blend2d_unicode) {
420 struct TestEntry {
421 char dst[28];
422 char src[28];
423 uint8_t dstSize;
424 uint8_t srcSize;
425 uint8_t dstEncoding;
426 uint8_t srcEncoding;
427 BLResult result;
428 };
429
430 static const TestEntry testEntries[] = {
431 #define ENTRY(DST, DST_ENC, SRC, SRC_ENC, ERROR_CODE) { \
432 DST, \
433 SRC, \
434 uint8_t(sizeof(DST) - 1), \
435 uint8_t(sizeof(SRC) - 1), \
436 uint8_t(BL_TEXT_ENCODING_##DST_ENC), \
437 uint8_t(BL_TEXT_ENCODING_##SRC_ENC), \
438 ERROR_CODE \
439 }
440
441 ENTRY("Test" , LATIN1, "Test" , LATIN1, BL_SUCCESS),
442 ENTRY("Test" , UTF8 , "Test" , LATIN1, BL_SUCCESS),
443 ENTRY("Test" , UTF8 , "Test" , UTF8 , BL_SUCCESS),
444 #if BL_BYTE_ORDER == 1234
445 ENTRY("Test" , UTF8 , "T\0e\0s\0t\0" , UTF16 , BL_SUCCESS),
446 ENTRY("T\0e\0s\0t\0" , UTF16 , "Test" , UTF8 , BL_SUCCESS),
447 #else
448 ENTRY("Test" , UTF8 , "\0T\0e\0s\0t" , UTF16 , BL_SUCCESS),
449 ENTRY("\0T\0e\0s\0t" , UTF16 , "Test" , UTF8 , BL_SUCCESS),
450 #endif
451
452 // Tests a Czech word (Rain in english) with diacritic marks, at most 2 BYTEs per character.
453 #if BL_BYTE_ORDER == 1234
454 ENTRY("\x44\xC3\xA9\xC5\xA1\xC5\xA5" , UTF8 , "\x44\x00\xE9\x00\x61\x01\x65\x01", UTF16 , BL_SUCCESS),
455 ENTRY("\x44\x00\xE9\x00\x61\x01\x65\x01", UTF16 , "\x44\xC3\xA9\xC5\xA1\xC5\xA5" , UTF8 , BL_SUCCESS),
456 #else
457 ENTRY("\x44\xC3\xA9\xC5\xA1\xC5\xA5" , UTF8 , "\x00\x44\x00\xE9\x01\x61\x01\x65", UTF16 , BL_SUCCESS),
458 ENTRY("\x00\x44\x00\xE9\x01\x61\x01\x65", UTF16 , "\x44\xC3\xA9\xC5\xA1\xC5\xA5" , UTF8 , BL_SUCCESS),
459 #endif
460
461 // Tests full-width digit zero (3 BYTEs per UTF-8 character).
462 #if BL_BYTE_ORDER == 1234
463 ENTRY("\xEF\xBC\x90" , UTF8 , "\x10\xFF" , UTF16 , BL_SUCCESS),
464 ENTRY("\x10\xFF" , UTF16 , "\xEF\xBC\x90" , UTF8 , BL_SUCCESS),
465 #else
466 ENTRY("\xEF\xBC\x90" , UTF8 , "\xFF\x10" , UTF16 , BL_SUCCESS),
467 ENTRY("\xFF\x10" , UTF16 , "\xEF\xBC\x90" , UTF8 , BL_SUCCESS),
468 #endif
469
470 // Tests `BL_CHAR_MAX` character (4 BYTEs per UTF-8 character, the highest possible unicode code-point).
471 #if BL_BYTE_ORDER == 1234
472 ENTRY("\xF4\x8F\xBF\xBF" , UTF8 , "\xFF\xDB\xFF\xDF" , UTF16 , BL_SUCCESS),
473 ENTRY("\xFF\xDB\xFF\xDF" , UTF16 , "\xF4\x8F\xBF\xBF" , UTF8 , BL_SUCCESS),
474 #else
475 ENTRY("\xF4\x8F\xBF\xBF" , UTF8 , "\xDB\xFF\xDF\xFF" , UTF16 , BL_SUCCESS),
476 ENTRY("\xDB\xFF\xDF\xFF" , UTF16 , "\xF4\x8F\xBF\xBF" , UTF8 , BL_SUCCESS),
477 #endif
478
479 #if BL_BYTE_ORDER == 1234
480 ENTRY("Test" , UTF8 , "T\0\0\0e\0\0\0s\0\0\0t\0\0\0" , UTF32 , BL_SUCCESS),
481 ENTRY("T\0e\0s\0t\0" , UTF16 , "T\0\0\0e\0\0\0s\0\0\0t\0\0\0" , UTF32 , BL_SUCCESS),
482 ENTRY("T\0\0\0e\0\0\0s\0\0\0t\0\0\0" , UTF32 , "T\0\0\0e\0\0\0s\0\0\0t\0\0\0" , UTF32 , BL_SUCCESS),
483 ENTRY("T\0\0\0e\0\0\0s\0\0\0t\0\0\0" , UTF32 , "T\0e\0s\0t\0" , UTF16 , BL_SUCCESS),
484 ENTRY("T\0\0\0e\0\0\0s\0\0\0t\0\0\0" , UTF32 , "Test" , LATIN1, BL_SUCCESS),
485 ENTRY("T\0\0\0e\0\0\0s\0\0\0t\0\0\0" , UTF32 , "Test" , UTF8 , BL_SUCCESS),
486 #else
487 ENTRY("Test" , UTF8 , "\0\0\0T\0\0\0e\0\0\0s\0\0\0t" , UTF32 , BL_SUCCESS),
488 ENTRY("\0T\0e\0s\0t" , UTF16 , "\0\0\0T\0\0\0e\0\0\0s\0\0\0t" , UTF32 , BL_SUCCESS),
489 ENTRY("\0\0\0T\0\0\0e\0\0\0s\0\0\0t" , UTF32 , "\0\0\0T\0\0\0e\0\0\0s\0\0\0t" , UTF32 , BL_SUCCESS),
490 ENTRY("\0\0\0T\0\0\0e\0\0\0s\0\0\0t" , UTF32 , "\0T\0e\0s\0t" , UTF16 , BL_SUCCESS)
491 ENTRY("\0\0\0T\0\0\0e\0\0\0s\0\0\0t" , UTF32 , "Test" , LATIN1, BL_SUCCESS),
492 ENTRY("\0\0\0T\0\0\0e\0\0\0s\0\0\0t" , UTF32 , "Test" , UTF8 , BL_SUCCESS),
493 #endif
494
495 // Truncated characters.
496 ENTRY("" , UTF8 , "\xC5" , UTF8 , BL_ERROR_DATA_TRUNCATED),
497 ENTRY("" , UTF8 , "\xEF" , UTF8 , BL_ERROR_DATA_TRUNCATED),
498 ENTRY("" , UTF8 , "\xEF\xBC" , UTF8 , BL_ERROR_DATA_TRUNCATED),
499 ENTRY("" , UTF8 , "\xF4" , UTF8 , BL_ERROR_DATA_TRUNCATED),
500 ENTRY("" , UTF8 , "\xF4\x8F" , UTF8 , BL_ERROR_DATA_TRUNCATED),
501 ENTRY("" , UTF8 , "\xF4\x8F\xBF" , UTF8 , BL_ERROR_DATA_TRUNCATED),
502
503 // Truncated character at the end (the converter must output the content, which was correct).
504 ENTRY("a" , UTF8 , "a\xF4\x8F\xBF" , UTF8 , BL_ERROR_DATA_TRUNCATED),
505 ENTRY("ab" , UTF8 , "ab\xF4\x8F\xBF" , UTF8 , BL_ERROR_DATA_TRUNCATED),
506 ENTRY("TestString" , UTF8 , "TestString\xC5" , UTF8 , BL_ERROR_DATA_TRUNCATED),
507 #if BL_BYTE_ORDER == 1234
508 ENTRY("T\0e\0s\0t\0S\0t\0r\0i\0n\0g\0" , UTF16 , "TestString\xC5" , UTF8 , BL_ERROR_DATA_TRUNCATED),
509 #else
510 ENTRY("\0T\0e\0s\0t\0S\0t\0r\0i\0n\0g" , UTF16 , "TestString\xC5" , UTF8 , BL_ERROR_DATA_TRUNCATED),
511 #endif
512
513 // Invalid UTf-8 characters.
514 ENTRY("" , UTF8 , "\x80" , UTF8 , BL_ERROR_INVALID_STRING),
515 ENTRY("" , UTF8 , "\xC1" , UTF8 , BL_ERROR_INVALID_STRING),
516 ENTRY("" , UTF8 , "\xF5\x8F\xBF\xBF" , UTF8 , BL_ERROR_INVALID_STRING),
517 ENTRY("" , UTF8 , "\x91\x8F\xBF\xBF" , UTF8 , BL_ERROR_INVALID_STRING),
518 ENTRY("" , UTF8 , "\xF6\x8F\xBF\xBF" , UTF8 , BL_ERROR_INVALID_STRING),
519 ENTRY("" , UTF8 , "\xF4\xFF\xBF\xBF" , UTF8 , BL_ERROR_INVALID_STRING),
520
521 // Overlong UTF-8 characters.
522 ENTRY("" , UTF8 , "\xC0\xA0" , UTF8 , BL_ERROR_INVALID_STRING)
523
524 #undef ENTRY
525 };
526
527 for (size_t i = 0; i < BL_ARRAY_SIZE(testEntries); i++) {
528 const TestEntry& entry = testEntries[i];
529 char output[32];
530
531 BLUnicodeConversionState state;
532 BLResult result = blConvertUnicode(output, 32, entry.dstEncoding, entry.src, entry.srcSize, entry.srcEncoding, state);
533
534 bool failed = (result != entry.result) ||
535 (state.dstIndex != entry.dstSize) ||
536 (memcmp(output, entry.dst, state.dstIndex) != 0);
537
538 if (failed) {
539 size_t inputSize = entry.srcSize;
540 size_t outputSize = state.dstIndex;
541 size_t expectedSize = entry.dstSize;
542
543 printf(" Failed Entry #%u\n", unsigned(i));
544
545 printf(" Input :");
546 for (size_t j = 0; j < inputSize; j++)
547 printf(" %02X", uint8_t(entry.src[j]));
548 printf("%s\n", inputSize ? "" : " (Nothing)");
549
550 printf(" Output :");
551 for (size_t j = 0; j < outputSize; j++)
552 printf(" %02X", uint8_t(output[j]));
553 printf("%s\n", outputSize ? "" : " (Nothing)");
554
555 printf(" Expected:");
556 for (size_t j = 0; j < expectedSize; j++)
557 printf(" %02X", uint8_t(entry.dst[j]));
558 printf("%s\n", expectedSize ? "" : " (Nothing)");
559 printf(" ErrorCode: Actual(%u) %s Expected(%u)\n", result, (result == entry.result) ? "==" : "!=", entry.result);
560 }
561
562 EXPECT(!failed);
563 }
564}
565
566UNIT(blend2d_unicode_io) {
567 INFO("BLUtf8Reader");
568 {
569 const uint8_t data[] = {
570 0xE2, 0x82, 0xAC, // U+0020AC
571 0xF0, 0x90, 0x8D, 0x88 // U+010348
572 };
573
574 BLUtf8Reader it(data, BL_ARRAY_SIZE(data));
575 uint32_t uc;
576
577 EXPECT(it.hasNext());
578 EXPECT(it.next<BL_UNICODE_IO_CALC_INDEX>(uc) == BL_SUCCESS);
579 EXPECT(uc == 0x0020AC);
580
581 EXPECT(it.hasNext());
582 EXPECT(it.next<BL_UNICODE_IO_CALC_INDEX>(uc) == BL_SUCCESS);
583 EXPECT(uc == 0x010348);
584
585 EXPECT(!it.hasNext());
586
587 // Verify that sizes were calculated correctly.
588 EXPECT(it.byteIndex(data) == 7);
589 EXPECT(it.utf8Index(data) == 7);
590 EXPECT(it.utf16Index(data) == 3); // 3 code-points (1 BMP and 1 SMP).
591 EXPECT(it.utf32Index(data) == 2); // 2 code-points.
592
593 const uint8_t invalidData[] = { 0xE2, 0x82 };
594 it.reset(invalidData, BL_ARRAY_SIZE(invalidData));
595
596 EXPECT(it.hasNext());
597 EXPECT(it.next(uc) == BL_ERROR_DATA_TRUNCATED);
598
599 // After error the iterator should not move.
600 EXPECT(it.hasNext());
601 EXPECT(it.byteIndex(invalidData) == 0);
602 EXPECT(it.utf8Index(invalidData) == 0);
603 EXPECT(it.utf16Index(invalidData) == 0);
604 EXPECT(it.utf32Index(invalidData) == 0);
605 }
606
607 INFO("BLUtf16Reader");
608 {
609 const uint16_t data[] = {
610 0x20AC, // U+0020AC
611 0xD800, 0xDF48 // U+010348
612 };
613
614 BLUtf16Reader it(data, BL_ARRAY_SIZE(data) * sizeof(uint16_t));
615 uint32_t uc;
616
617 EXPECT(it.hasNext());
618 EXPECT(it.next<BL_UNICODE_IO_CALC_INDEX>(uc) == BL_SUCCESS);
619 EXPECT(uc == 0x0020AC);
620
621 EXPECT(it.hasNext());
622 EXPECT(it.next<BL_UNICODE_IO_CALC_INDEX>(uc) == BL_SUCCESS);
623 EXPECT(uc == 0x010348);
624
625 EXPECT(!it.hasNext());
626
627 // Verify that sizes were calculated correctly.
628 EXPECT(it.byteIndex(data) == 6);
629 EXPECT(it.utf8Index(data) == 7);
630 EXPECT(it.utf16Index(data) == 3); // 3 code-points (1 BMP and 1 SMP).
631 EXPECT(it.utf32Index(data) == 2); // 2 code-points.
632
633 const uint16_t invalidData[] = { 0xD800 };
634 it.reset(invalidData, BL_ARRAY_SIZE(invalidData) * sizeof(uint16_t));
635
636 EXPECT(it.hasNext());
637 EXPECT(it.next<BL_UNICODE_IO_CALC_INDEX | BL_UNICODE_IO_STRICT>(uc) == BL_ERROR_DATA_TRUNCATED);
638
639 // After an error the iterator should not move.
640 EXPECT(it.hasNext());
641 EXPECT(it.byteIndex(invalidData) == 0);
642 EXPECT(it.utf8Index(invalidData) == 0);
643 EXPECT(it.utf16Index(invalidData) == 0);
644 EXPECT(it.utf32Index(invalidData) == 0);
645
646 // However, this should pass in non-strict mode.
647 EXPECT(it.next(uc) == BL_SUCCESS);
648 EXPECT(!it.hasNext());
649 }
650
651 INFO("BLUtf32Reader");
652 {
653 const uint32_t data[] = {
654 0x0020AC,
655 0x010348
656 };
657
658 BLUtf32Reader it(data, BL_ARRAY_SIZE(data) * sizeof(uint32_t));
659 uint32_t uc;
660
661 EXPECT(it.hasNext());
662 EXPECT(it.next<BL_UNICODE_IO_CALC_INDEX>(uc) == BL_SUCCESS);
663 EXPECT(uc == 0x0020AC);
664
665 EXPECT(it.hasNext());
666 EXPECT(it.next<BL_UNICODE_IO_CALC_INDEX>(uc) == BL_SUCCESS);
667 EXPECT(uc == 0x010348);
668
669 EXPECT(!it.hasNext());
670
671 // Verify that sizes were calculated correctly.
672 EXPECT(it.byteIndex(data) == 8);
673 EXPECT(it.utf8Index(data) == 7);
674 EXPECT(it.utf16Index(data) == 3); // 3 code-points (1 BMP and 1 SMP).
675 EXPECT(it.utf32Index(data) == 2); // 2 code-points.
676
677 const uint32_t invalidData[] = { 0xD800 };
678 it.reset(invalidData, BL_ARRAY_SIZE(invalidData) * sizeof(uint32_t));
679
680 EXPECT(it.hasNext());
681 EXPECT(it.next<BL_UNICODE_IO_CALC_INDEX | BL_UNICODE_IO_STRICT>(uc) == BL_ERROR_INVALID_STRING);
682
683 // After an error the iterator should not move.
684 EXPECT(it.hasNext());
685 EXPECT(it.byteIndex(invalidData) == 0);
686 EXPECT(it.utf8Index(invalidData) == 0);
687 EXPECT(it.utf16Index(invalidData) == 0);
688 EXPECT(it.utf32Index(invalidData) == 0);
689
690 // However, this should pass in non-strict mode.
691 EXPECT(it.next(uc) == BL_SUCCESS);
692 EXPECT(!it.hasNext());
693 }
694
695 INFO("BLUtf8Writer");
696 {
697 char dst[7];
698 BLUtf8Writer writer(dst, BL_ARRAY_SIZE(dst));
699
700 EXPECT(writer.write(0x20ACu) == BL_SUCCESS);
701 EXPECT(uint8_t(dst[0]) == 0xE2);
702 EXPECT(uint8_t(dst[1]) == 0x82);
703 EXPECT(uint8_t(dst[2]) == 0xAC);
704
705 EXPECT(writer.write(0x010348u) == BL_SUCCESS);
706 EXPECT(uint8_t(dst[3]) == 0xF0);
707 EXPECT(uint8_t(dst[4]) == 0x90);
708 EXPECT(uint8_t(dst[5]) == 0x8D);
709 EXPECT(uint8_t(dst[6]) == 0x88);
710 EXPECT(writer.atEnd());
711
712 writer.reset(dst, 1);
713 EXPECT(writer.write(0x20ACu) == BL_ERROR_NO_SPACE_LEFT);
714 EXPECT(writer.write(0x0080u) == BL_ERROR_NO_SPACE_LEFT);
715 EXPECT(writer.write(0x00C1u) == BL_ERROR_NO_SPACE_LEFT);
716
717 // We have only one byte left so this must pass...
718 EXPECT(writer.write('a') == BL_SUCCESS);
719 EXPECT(writer.atEnd());
720
721 writer.reset(dst, 2);
722 EXPECT(writer.write(0x20ACu) == BL_ERROR_NO_SPACE_LEFT);
723 EXPECT(writer.write(0x00C1u) == BL_SUCCESS);
724 EXPECT(uint8_t(dst[0]) == 0xC3u);
725 EXPECT(uint8_t(dst[1]) == 0x81u);
726 EXPECT(writer.atEnd());
727 EXPECT(writer.write('a') == BL_ERROR_NO_SPACE_LEFT);
728 }
729
730 INFO("BLUtf16Writer");
731 {
732 uint16_t dst[3];
733 BLUtf16Writer<> writer(dst, BL_ARRAY_SIZE(dst));
734
735 EXPECT(writer.write(0x010348u) == BL_SUCCESS);
736 EXPECT(dst[0] == 0xD800u);
737 EXPECT(dst[1] == 0xDF48u);
738
739 EXPECT(writer.write(0x010348u) == BL_ERROR_NO_SPACE_LEFT);
740 EXPECT(writer.write(0x20ACu) == BL_SUCCESS);
741 EXPECT(dst[2] == 0x20ACu);
742 EXPECT(writer.atEnd());
743 }
744}
745#endif
746