1 | // [Blend2D] |
2 | // 2D Vector Graphics Powered by a JIT Compiler. |
3 | // |
4 | // [License] |
5 | // Zlib - See LICENSE.md file in the package. |
6 | |
7 | #include "./blapi-build_p.h" |
8 | #include "./blsupport_p.h" |
9 | #include "./blunicode_p.h" |
10 | |
11 | // ============================================================================ |
12 | // [Unicode Data] |
13 | // ============================================================================ |
14 | |
// Lookup table with one entry per possible byte value (0..255). The value is
// the length, in bytes, of a UTF-8 sequence introduced by that byte, or 0 when
// the byte is not a valid lead byte:
//
//   - 0x00..0x7F (  0..127) -> 1.
//   - 0x80..0xBF (128..191) -> 0 (continuation-byte range).
//   - 0xC0..0xC1 (192..193) -> 0 (could only introduce an overlong 2-byte form).
//   - 0xC2..0xDF (194..223) -> 2.
//   - 0xE0..0xEF (224..239) -> 3.
//   - 0xF0..0xF4 (240..244) -> 4.
//   - 0xF5..0xFF (245..255) -> 0 (would encode values above 0x10FFFF).
//
// NOTE: Theoretically UTF-8 sequence can be extended to support sequences up
// to 6 bytes, however, since UCS-4 code-point's maximum value is 0x10FFFF it
// also limits the maximum length of a UTF-8 sequence to 4 bytes.
const uint8_t blUtf8SizeData[256] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 - 15
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 16 - 31
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 32 - 47
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 48 - 63
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 64 - 79
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80 - 95
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 96 - 111
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 112 - 127
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 128 - 143
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 144 - 159
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 160 - 175
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 176 - 191
  0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 192 - 207
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 208 - 223
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 224 - 239
  4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 240 - 255
};
36 | |
37 | // ============================================================================ |
38 | // [BLUnicode - Validation] |
39 | // ============================================================================ |
40 | |
41 | // Not really anything to validate, we just want to calculate a corresponding UTf-8 size. |
42 | static BL_INLINE BLResult validateLatin1String(const char* data, size_t size, BLUnicodeValidationState& state) noexcept { |
43 | size_t = 0; |
44 | state.utf16Index = size; |
45 | state.utf32Index = size; |
46 | |
47 | for (size_t i = 0; i < size; i++) |
48 | extra += size_t(uint8_t(data[i])) >> 7; |
49 | |
50 | BLOverflowFlag of = 0; |
51 | size_t utf8Size = blAddOverflow(size, extra, &of); |
52 | |
53 | if (BL_UNLIKELY(of)) |
54 | return blTraceError(BL_ERROR_DATA_TOO_LARGE); |
55 | |
56 | state.utf8Index = utf8Size; |
57 | return BL_SUCCESS; |
58 | } |
59 | |
60 | template<typename Iterator, uint32_t Flags> |
61 | static BL_INLINE BLResult validateUnicodeString(const void* data, size_t size, BLUnicodeValidationState& state) noexcept { |
62 | Iterator it(data, size); |
63 | BLResult result = it.template validate<Flags | BL_UNICODE_IO_CALC_INDEX>(); |
64 | state.utf8Index = it.utf8Index(data); |
65 | state.utf16Index = it.utf16Index(data); |
66 | state.utf32Index = it.utf32Index(data); |
67 | return result; |
68 | } |
69 | |
70 | BLResult blValidateUnicode(const void* data, size_t sizeInBytes, uint32_t encoding, BLUnicodeValidationState& state) noexcept { |
71 | BLResult result; |
72 | state.reset(); |
73 | |
74 | switch (encoding) { |
75 | case BL_TEXT_ENCODING_LATIN1: |
76 | return validateLatin1String(static_cast<const char*>(data), sizeInBytes, state); |
77 | |
78 | case BL_TEXT_ENCODING_UTF8: |
79 | return validateUnicodeString<BLUtf8Reader, BL_UNICODE_IO_STRICT>(data, sizeInBytes, state); |
80 | |
81 | case BL_TEXT_ENCODING_UTF16: |
82 | // This will make sure we won't compile specialized code for architectures that don't penalize unaligned reads. |
83 | if (BL_UNALIGNED_IO_16 || !blIsAligned(data, 2)) |
84 | result = validateUnicodeString<BLUtf16Reader, BL_UNICODE_IO_STRICT | BL_UNICODE_IO_UNALIGNED>(data, sizeInBytes, state); |
85 | else |
86 | result = validateUnicodeString<BLUtf16Reader, BL_UNICODE_IO_STRICT>(data, sizeInBytes, state); |
87 | |
88 | if (result == BL_SUCCESS && BL_UNLIKELY(sizeInBytes & 0x1)) |
89 | result = blTraceError(BL_ERROR_DATA_TRUNCATED); |
90 | return result; |
91 | |
92 | case BL_TEXT_ENCODING_UTF32: |
93 | // This will make sure we won't compile specialized code for architectures that don't penalize unaligned reads. |
94 | if (BL_UNALIGNED_IO_32 || !blIsAligned(data, 4)) |
95 | result = validateUnicodeString<BLUtf32Reader, BL_UNICODE_IO_STRICT | BL_UNICODE_IO_UNALIGNED>(data, sizeInBytes, state); |
96 | else |
97 | result = validateUnicodeString<BLUtf32Reader, BL_UNICODE_IO_STRICT>(data, sizeInBytes, state); |
98 | |
99 | if (result == BL_SUCCESS && BL_UNLIKELY(sizeInBytes & 0x3)) |
100 | result = blTraceError(BL_ERROR_DATA_TRUNCATED); |
101 | return result; |
102 | |
103 | default: |
104 | return blTraceError(BL_ERROR_INVALID_VALUE); |
105 | } |
106 | } |
107 | |
108 | // ============================================================================ |
109 | // [BLUnicode - Conversion] |
110 | // ============================================================================ |
111 | |
// Returns the distance in bytes from `base` to `advanced`.
//
// Both pointers are compared as `const char*`; the signed difference is
// converted to `size_t`.
static BL_INLINE size_t offsetOfPtr(const void* base, const void* advanced) noexcept {
  const char* const first = static_cast<const char*>(base);
  const char* const last = static_cast<const char*>(advanced);
  return static_cast<size_t>(last - first);
}
115 | |
116 | // A simple implementation. It iterates `src` char-by-char and writes it to the |
117 | // destination. The advantage of this implementation is that switching `Writer` |
118 | // and `Iterator` can customize strictness, endianness, etc, so we don't have |
119 | // to repeat the code for different variations of UTF16 and UTF32. |
120 | template<typename Writer, typename Iterator, uint32_t IterFlags> |
121 | static BL_INLINE BLResult blConvertUnicodeImpl(void* dst, size_t dstSizeInBytes, const void* src, size_t srcSizeInBytes, BLUnicodeConversionState& state) noexcept { |
122 | typedef typename Writer::CharType DstChar; |
123 | |
124 | Writer writer(static_cast<DstChar*>(dst), dstSizeInBytes / sizeof(DstChar)); |
125 | Iterator iter(src, blAlignDown(srcSizeInBytes, Iterator::kCharSize)); |
126 | |
127 | BLResult result = BL_SUCCESS; |
128 | while (iter.hasNext()) { |
129 | uint32_t uc; |
130 | size_t ucSizeInBytes; |
131 | |
132 | result = iter.template next<IterFlags>(uc, ucSizeInBytes); |
133 | if (BL_UNLIKELY(result != BL_SUCCESS)) |
134 | break; |
135 | |
136 | result = writer.write(uc); |
137 | if (BL_UNLIKELY(result != BL_SUCCESS)) { |
138 | state.dstIndex = offsetOfPtr(dst, writer._ptr); |
139 | state.srcIndex = offsetOfPtr(src, iter._ptr) - ucSizeInBytes; |
140 | return result; |
141 | } |
142 | } |
143 | |
144 | state.dstIndex = offsetOfPtr(dst, writer._ptr); |
145 | state.srcIndex = offsetOfPtr(src, iter._ptr); |
146 | |
147 | if (Iterator::kCharSize > 1 && result == BL_SUCCESS && !blIsAligned(state.srcIndex, Iterator::kCharSize)) |
148 | return blTraceError(BL_ERROR_DATA_TRUNCATED); |
149 | else |
150 | return result; |
151 | } |
152 | |
153 | BLResult blConvertUnicode( |
154 | void* dst, size_t dstSizeInBytes, uint32_t dstEncoding, |
155 | const void* src, size_t srcSizeInBytes, uint32_t srcEncoding, |
156 | BLUnicodeConversionState& state) noexcept { |
157 | |
158 | constexpr bool BL_UNALIGNED_IO_Any = BL_UNALIGNED_IO_16 && BL_UNALIGNED_IO_32; |
159 | |
160 | BLResult result = BL_SUCCESS; |
161 | state.reset(); |
162 | |
163 | uint32_t encodingCombined = (dstEncoding << 2) | srcEncoding; |
164 | switch (encodingCombined) { |
165 | // ------------------------------------------------------------------------ |
166 | // [MemCpy] |
167 | // ------------------------------------------------------------------------ |
168 | |
169 | case (BL_TEXT_ENCODING_LATIN1 << 2) | BL_TEXT_ENCODING_LATIN1: { |
170 | size_t copySize = blMin(dstSizeInBytes, srcSizeInBytes); |
171 | memcpy(dst, src, copySize); |
172 | |
173 | state.dstIndex = copySize; |
174 | state.srcIndex = copySize; |
175 | |
176 | if (dstSizeInBytes < srcSizeInBytes) |
177 | result = blTraceError(BL_ERROR_NO_SPACE_LEFT); |
178 | break; |
179 | } |
180 | |
181 | // ------------------------------------------------------------------------ |
182 | // [Utf8 <- Latin1] |
183 | // ------------------------------------------------------------------------ |
184 | |
185 | case (BL_TEXT_ENCODING_UTF8 << 2) | BL_TEXT_ENCODING_LATIN1: { |
186 | BLUtf8Writer writer(static_cast<char*>(dst), dstSizeInBytes); |
187 | const uint8_t* src8 = static_cast<const uint8_t*>(src); |
188 | |
189 | if (dstSizeInBytes / 2 >= srcSizeInBytes) { |
190 | // Fast case, there is enough space in `dst` even for the worst-case scenario. |
191 | for (size_t i = 0; i < srcSizeInBytes; i++) { |
192 | uint32_t uc = src8[i]; |
193 | if (uc <= 0x7F) |
194 | writer.writeByteUnsafe(uc); |
195 | else |
196 | writer.write2BytesUnsafe(uc); |
197 | } |
198 | |
199 | state.srcIndex = srcSizeInBytes; |
200 | state.dstIndex = writer.index(static_cast<char*>(dst)); |
201 | } |
202 | else { |
203 | for (size_t i = 0; i < srcSizeInBytes; i++) { |
204 | uint32_t uc = src8[i]; |
205 | if (uc <= 0x7F) |
206 | result = writer.writeByte(uc); |
207 | else |
208 | result = writer.write2Bytes(uc); |
209 | |
210 | if (BL_UNLIKELY(result != BL_SUCCESS)) { |
211 | state.dstIndex = writer.index(static_cast<char*>(dst)); |
212 | state.srcIndex = i; |
213 | break; |
214 | } |
215 | } |
216 | } |
217 | |
218 | break; |
219 | } |
220 | |
221 | // ------------------------------------------------------------------------ |
222 | // [Utf8 <- Utf8] |
223 | // ------------------------------------------------------------------------ |
224 | |
225 | case (BL_TEXT_ENCODING_UTF8 << 2) | BL_TEXT_ENCODING_UTF8: { |
226 | size_t copySize = blMin(dstSizeInBytes, srcSizeInBytes); |
227 | BLUnicodeValidationState validationState; |
228 | |
229 | result = blValidateUnicode(src, copySize, BL_TEXT_ENCODING_UTF8, validationState); |
230 | size_t validatedSize = validationState.utf8Index; |
231 | |
232 | if (validatedSize) |
233 | memcpy(dst, src, validatedSize); |
234 | |
235 | // Prevent `BL_ERROR_DATA_TRUNCATED` in case there is not enough space in destination. |
236 | if (copySize < srcSizeInBytes && (result == BL_SUCCESS || result == BL_ERROR_DATA_TRUNCATED)) |
237 | result = blTraceError(BL_ERROR_NO_SPACE_LEFT); |
238 | |
239 | state.dstIndex = validatedSize; |
240 | state.srcIndex = validatedSize; |
241 | break; |
242 | } |
243 | |
244 | // ------------------------------------------------------------------------ |
245 | // [Utf8 <- Utf16] |
246 | // ------------------------------------------------------------------------ |
247 | |
248 | case (BL_TEXT_ENCODING_UTF8 << 2) | BL_TEXT_ENCODING_UTF16: |
249 | if (BL_UNALIGNED_IO_16 || !blIsAligned(src, 2)) |
250 | result = blConvertUnicodeImpl<BLUtf8Writer, BLUtf16Reader, BL_UNICODE_IO_STRICT | BL_UNICODE_IO_UNALIGNED>(dst, dstSizeInBytes, src, srcSizeInBytes, state); |
251 | else |
252 | result = blConvertUnicodeImpl<BLUtf8Writer, BLUtf16Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state); |
253 | break; |
254 | |
255 | // ------------------------------------------------------------------------ |
256 | // [Utf8 <- Utf32] |
257 | // ------------------------------------------------------------------------ |
258 | |
259 | case (BL_TEXT_ENCODING_UTF8 << 2) | BL_TEXT_ENCODING_UTF32: { |
260 | if (BL_UNALIGNED_IO_32 || !blIsAligned(src, 4)) |
261 | result = blConvertUnicodeImpl<BLUtf8Writer, BLUtf32Reader, BL_UNICODE_IO_STRICT | BL_UNICODE_IO_UNALIGNED>(dst, dstSizeInBytes, src, srcSizeInBytes, state); |
262 | else |
263 | result = blConvertUnicodeImpl<BLUtf8Writer, BLUtf32Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state); |
264 | break; |
265 | } |
266 | |
267 | // ------------------------------------------------------------------------ |
268 | // [Utf16 <- Latin1] |
269 | // ------------------------------------------------------------------------ |
270 | |
271 | case (BL_TEXT_ENCODING_UTF16 << 2) | BL_TEXT_ENCODING_LATIN1: { |
272 | size_t count = blMin(dstSizeInBytes / 2, srcSizeInBytes); |
273 | |
274 | if (BL_UNALIGNED_IO_16 || blIsAligned(dst, 2)) { |
275 | for (size_t i = 0; i < count; i++) |
276 | blMemWriteU16aLE(static_cast<uint8_t*>(dst) + i * 2u, static_cast<const uint8_t*>(src)[i]); |
277 | } |
278 | else { |
279 | for (size_t i = 0; i < count; i++) |
280 | blMemWriteU16uLE(static_cast<uint8_t*>(dst) + i * 2u, static_cast<const uint8_t*>(src)[i]); |
281 | } |
282 | |
283 | if (count < srcSizeInBytes) |
284 | result = blTraceError(BL_ERROR_NO_SPACE_LEFT); |
285 | |
286 | state.dstIndex = count * 2u; |
287 | state.srcIndex = count; |
288 | break; |
289 | } |
290 | |
291 | // ------------------------------------------------------------------------ |
292 | // [Utf16 <- Utf8] |
293 | // ------------------------------------------------------------------------ |
294 | |
295 | case (BL_TEXT_ENCODING_UTF16 << 2) | BL_TEXT_ENCODING_UTF8: { |
296 | if (BL_UNALIGNED_IO_16 || !blIsAligned(dst, 2)) |
297 | result = blConvertUnicodeImpl<BLUtf16Writer<BL_BYTE_ORDER_NATIVE, 1>, BLUtf8Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state); |
298 | else |
299 | result = blConvertUnicodeImpl<BLUtf16Writer<BL_BYTE_ORDER_NATIVE, 2>, BLUtf8Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state); |
300 | break; |
301 | } |
302 | |
303 | // ------------------------------------------------------------------------ |
304 | // [Utf16 <- Utf16] |
305 | // ------------------------------------------------------------------------ |
306 | |
307 | case (BL_TEXT_ENCODING_UTF16 << 2) | BL_TEXT_ENCODING_UTF16: { |
308 | size_t copySize = blAlignDown(blMin(dstSizeInBytes, srcSizeInBytes), 2); |
309 | BLUnicodeValidationState validationState; |
310 | |
311 | result = blValidateUnicode(src, copySize, BL_TEXT_ENCODING_UTF16, validationState); |
312 | size_t validatedSize = validationState.utf16Index * 2; |
313 | |
314 | memmove(dst, src, validatedSize); |
315 | |
316 | // Prevent `BL_ERROR_DATA_TRUNCATED` in case there is not enough space in destination. |
317 | if (copySize < srcSizeInBytes && (result == BL_SUCCESS || result == BL_ERROR_DATA_TRUNCATED)) |
318 | result = blTraceError(BL_ERROR_NO_SPACE_LEFT); |
319 | |
320 | // Report `BL_ERROR_DATA_TRUNCATED` is everything went right, but the |
321 | // source size was not aligned to 2 bytes. |
322 | if (result == BL_SUCCESS && !blIsAligned(srcSizeInBytes, 2)) |
323 | result = blTraceError(BL_ERROR_DATA_TRUNCATED); |
324 | |
325 | state.dstIndex = validatedSize; |
326 | state.srcIndex = validatedSize; |
327 | break; |
328 | } |
329 | |
330 | // ------------------------------------------------------------------------ |
331 | // [Utf16 <- Utf32] |
332 | // ------------------------------------------------------------------------ |
333 | |
334 | case (BL_TEXT_ENCODING_UTF16 << 2) | BL_TEXT_ENCODING_UTF32: { |
335 | if (BL_UNALIGNED_IO_Any || !blIsAligned(dst, 2) || !blIsAligned(src, 4)) |
336 | result = blConvertUnicodeImpl<BLUtf16Writer<BL_BYTE_ORDER_NATIVE, 1>, BLUtf32Reader, BL_UNICODE_IO_STRICT | BL_UNICODE_IO_UNALIGNED>(dst, dstSizeInBytes, src, srcSizeInBytes, state); |
337 | else |
338 | result = blConvertUnicodeImpl<BLUtf16Writer<BL_BYTE_ORDER_NATIVE, 2>, BLUtf32Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state); |
339 | break; |
340 | } |
341 | |
342 | // ------------------------------------------------------------------------ |
343 | // [Utf32 <- Latin1] |
344 | // ------------------------------------------------------------------------ |
345 | |
346 | case (BL_TEXT_ENCODING_UTF32 << 2) | BL_TEXT_ENCODING_LATIN1: { |
347 | size_t count = blMin(dstSizeInBytes / 4, srcSizeInBytes); |
348 | |
349 | if (BL_UNALIGNED_IO_32 || blIsAligned(dst, 4)) { |
350 | for (size_t i = 0; i < count; i++) |
351 | blMemWriteU32a(static_cast<uint8_t*>(dst) + i * 4u, static_cast<const uint8_t*>(src)[i]); |
352 | } |
353 | else { |
354 | for (size_t i = 0; i < count; i++) |
355 | blMemWriteU32u(static_cast<uint8_t*>(dst) + i * 4u, static_cast<const uint8_t*>(src)[i]); |
356 | } |
357 | |
358 | if (count < srcSizeInBytes) |
359 | result = blTraceError(BL_ERROR_NO_SPACE_LEFT); |
360 | |
361 | state.dstIndex = count * 4u; |
362 | state.srcIndex = count; |
363 | break; |
364 | } |
365 | |
366 | // ------------------------------------------------------------------------ |
367 | // [Utf32 <- Utf8] |
368 | // ------------------------------------------------------------------------ |
369 | |
370 | case (BL_TEXT_ENCODING_UTF32 << 2) | BL_TEXT_ENCODING_UTF8: { |
371 | if (BL_UNALIGNED_IO_32 || !blIsAligned(dst, 4)) |
372 | result = blConvertUnicodeImpl<BLUtf32Writer<BL_BYTE_ORDER_NATIVE, 1>, BLUtf8Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state); |
373 | else |
374 | result = blConvertUnicodeImpl<BLUtf32Writer<BL_BYTE_ORDER_NATIVE, 4>, BLUtf8Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state); |
375 | break; |
376 | } |
377 | |
378 | // ------------------------------------------------------------------------ |
379 | // [Utf32 <- Utf16] |
380 | // ------------------------------------------------------------------------ |
381 | |
382 | case (BL_TEXT_ENCODING_UTF32 << 2) | BL_TEXT_ENCODING_UTF16: { |
383 | if (BL_UNALIGNED_IO_Any || !blIsAligned(dst, 4) || !blIsAligned(src, 2)) |
384 | result = blConvertUnicodeImpl<BLUtf32Writer<BL_BYTE_ORDER_NATIVE, 1>, BLUtf16Reader, BL_UNICODE_IO_STRICT | BL_UNICODE_IO_UNALIGNED>(dst, dstSizeInBytes, src, srcSizeInBytes, state); |
385 | else |
386 | result = blConvertUnicodeImpl<BLUtf32Writer<BL_BYTE_ORDER_NATIVE, 4>, BLUtf16Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state); |
387 | break; |
388 | } |
389 | |
390 | // ------------------------------------------------------------------------ |
391 | // [Utf32 <- Utf32] |
392 | // ------------------------------------------------------------------------ |
393 | |
394 | case (BL_TEXT_ENCODING_UTF32 << 2) | BL_TEXT_ENCODING_UTF32: { |
395 | if (BL_UNALIGNED_IO_32 || !blIsAligned(dst, 4) || !blIsAligned(src, 4)) |
396 | result = blConvertUnicodeImpl<BLUtf32Writer<BL_BYTE_ORDER_NATIVE, 1>, BLUtf32Reader, BL_UNICODE_IO_STRICT>(dst, dstSizeInBytes, src, srcSizeInBytes, state); |
397 | else |
398 | result = blConvertUnicodeImpl<BLUtf32Writer<BL_BYTE_ORDER_NATIVE, 4>, BLUtf32Reader, BL_UNICODE_IO_STRICT | BL_UNICODE_IO_UNALIGNED>(dst, dstSizeInBytes, src, srcSizeInBytes, state); |
399 | break; |
400 | } |
401 | |
402 | // ------------------------------------------------------------------------ |
403 | // [Invalid] |
404 | // ------------------------------------------------------------------------ |
405 | |
406 | default: { |
407 | return blTraceError(BL_ERROR_INVALID_VALUE); |
408 | } |
409 | } |
410 | |
411 | return result; |
412 | } |
413 | |
414 | // ============================================================================ |
415 | // [BLUnicode - Test] |
416 | // ============================================================================ |
417 | |
418 | #ifdef BL_TEST |
419 | UNIT(blend2d_unicode) { |
420 | struct TestEntry { |
421 | char dst[28]; |
422 | char src[28]; |
423 | uint8_t dstSize; |
424 | uint8_t srcSize; |
425 | uint8_t dstEncoding; |
426 | uint8_t srcEncoding; |
427 | BLResult result; |
428 | }; |
429 | |
430 | static const TestEntry testEntries[] = { |
431 | #define ENTRY(DST, DST_ENC, SRC, SRC_ENC, ERROR_CODE) { \ |
432 | DST, \ |
433 | SRC, \ |
434 | uint8_t(sizeof(DST) - 1), \ |
435 | uint8_t(sizeof(SRC) - 1), \ |
436 | uint8_t(BL_TEXT_ENCODING_##DST_ENC), \ |
437 | uint8_t(BL_TEXT_ENCODING_##SRC_ENC), \ |
438 | ERROR_CODE \ |
439 | } |
440 | |
441 | ENTRY("Test" , LATIN1, "Test" , LATIN1, BL_SUCCESS), |
442 | ENTRY("Test" , UTF8 , "Test" , LATIN1, BL_SUCCESS), |
443 | ENTRY("Test" , UTF8 , "Test" , UTF8 , BL_SUCCESS), |
444 | #if BL_BYTE_ORDER == 1234 |
445 | ENTRY("Test" , UTF8 , "T\0e\0s\0t\0" , UTF16 , BL_SUCCESS), |
446 | ENTRY("T\0e\0s\0t\0" , UTF16 , "Test" , UTF8 , BL_SUCCESS), |
447 | #else |
448 | ENTRY("Test" , UTF8 , "\0T\0e\0s\0t" , UTF16 , BL_SUCCESS), |
449 | ENTRY("\0T\0e\0s\0t" , UTF16 , "Test" , UTF8 , BL_SUCCESS), |
450 | #endif |
451 | |
452 | // Tests a Czech word (Rain in english) with diacritic marks, at most 2 BYTEs per character. |
453 | #if BL_BYTE_ORDER == 1234 |
454 | ENTRY("\x44\xC3\xA9\xC5\xA1\xC5\xA5" , UTF8 , "\x44\x00\xE9\x00\x61\x01\x65\x01" , UTF16 , BL_SUCCESS), |
455 | ENTRY("\x44\x00\xE9\x00\x61\x01\x65\x01" , UTF16 , "\x44\xC3\xA9\xC5\xA1\xC5\xA5" , UTF8 , BL_SUCCESS), |
456 | #else |
457 | ENTRY("\x44\xC3\xA9\xC5\xA1\xC5\xA5" , UTF8 , "\x00\x44\x00\xE9\x01\x61\x01\x65" , UTF16 , BL_SUCCESS), |
458 | ENTRY("\x00\x44\x00\xE9\x01\x61\x01\x65" , UTF16 , "\x44\xC3\xA9\xC5\xA1\xC5\xA5" , UTF8 , BL_SUCCESS), |
459 | #endif |
460 | |
461 | // Tests full-width digit zero (3 BYTEs per UTF-8 character). |
462 | #if BL_BYTE_ORDER == 1234 |
463 | ENTRY("\xEF\xBC\x90" , UTF8 , "\x10\xFF" , UTF16 , BL_SUCCESS), |
464 | ENTRY("\x10\xFF" , UTF16 , "\xEF\xBC\x90" , UTF8 , BL_SUCCESS), |
465 | #else |
466 | ENTRY("\xEF\xBC\x90" , UTF8 , "\xFF\x10" , UTF16 , BL_SUCCESS), |
467 | ENTRY("\xFF\x10" , UTF16 , "\xEF\xBC\x90" , UTF8 , BL_SUCCESS), |
468 | #endif |
469 | |
470 | // Tests `BL_CHAR_MAX` character (4 BYTEs per UTF-8 character, the highest possible unicode code-point). |
471 | #if BL_BYTE_ORDER == 1234 |
472 | ENTRY("\xF4\x8F\xBF\xBF" , UTF8 , "\xFF\xDB\xFF\xDF" , UTF16 , BL_SUCCESS), |
473 | ENTRY("\xFF\xDB\xFF\xDF" , UTF16 , "\xF4\x8F\xBF\xBF" , UTF8 , BL_SUCCESS), |
474 | #else |
475 | ENTRY("\xF4\x8F\xBF\xBF" , UTF8 , "\xDB\xFF\xDF\xFF" , UTF16 , BL_SUCCESS), |
476 | ENTRY("\xDB\xFF\xDF\xFF" , UTF16 , "\xF4\x8F\xBF\xBF" , UTF8 , BL_SUCCESS), |
477 | #endif |
478 | |
479 | #if BL_BYTE_ORDER == 1234 |
480 | ENTRY("Test" , UTF8 , "T\0\0\0e\0\0\0s\0\0\0t\0\0\0" , UTF32 , BL_SUCCESS), |
481 | ENTRY("T\0e\0s\0t\0" , UTF16 , "T\0\0\0e\0\0\0s\0\0\0t\0\0\0" , UTF32 , BL_SUCCESS), |
482 | ENTRY("T\0\0\0e\0\0\0s\0\0\0t\0\0\0" , UTF32 , "T\0\0\0e\0\0\0s\0\0\0t\0\0\0" , UTF32 , BL_SUCCESS), |
483 | ENTRY("T\0\0\0e\0\0\0s\0\0\0t\0\0\0" , UTF32 , "T\0e\0s\0t\0" , UTF16 , BL_SUCCESS), |
484 | ENTRY("T\0\0\0e\0\0\0s\0\0\0t\0\0\0" , UTF32 , "Test" , LATIN1, BL_SUCCESS), |
485 | ENTRY("T\0\0\0e\0\0\0s\0\0\0t\0\0\0" , UTF32 , "Test" , UTF8 , BL_SUCCESS), |
486 | #else |
487 | ENTRY("Test" , UTF8 , "\0\0\0T\0\0\0e\0\0\0s\0\0\0t" , UTF32 , BL_SUCCESS), |
488 | ENTRY("\0T\0e\0s\0t" , UTF16 , "\0\0\0T\0\0\0e\0\0\0s\0\0\0t" , UTF32 , BL_SUCCESS), |
489 | ENTRY("\0\0\0T\0\0\0e\0\0\0s\0\0\0t" , UTF32 , "\0\0\0T\0\0\0e\0\0\0s\0\0\0t" , UTF32 , BL_SUCCESS), |
490 | ENTRY("\0\0\0T\0\0\0e\0\0\0s\0\0\0t" , UTF32 , "\0T\0e\0s\0t" , UTF16 , BL_SUCCESS) |
491 | ENTRY("\0\0\0T\0\0\0e\0\0\0s\0\0\0t" , UTF32 , "Test" , LATIN1, BL_SUCCESS), |
492 | ENTRY("\0\0\0T\0\0\0e\0\0\0s\0\0\0t" , UTF32 , "Test" , UTF8 , BL_SUCCESS), |
493 | #endif |
494 | |
495 | // Truncated characters. |
496 | ENTRY("" , UTF8 , "\xC5" , UTF8 , BL_ERROR_DATA_TRUNCATED), |
497 | ENTRY("" , UTF8 , "\xEF" , UTF8 , BL_ERROR_DATA_TRUNCATED), |
498 | ENTRY("" , UTF8 , "\xEF\xBC" , UTF8 , BL_ERROR_DATA_TRUNCATED), |
499 | ENTRY("" , UTF8 , "\xF4" , UTF8 , BL_ERROR_DATA_TRUNCATED), |
500 | ENTRY("" , UTF8 , "\xF4\x8F" , UTF8 , BL_ERROR_DATA_TRUNCATED), |
501 | ENTRY("" , UTF8 , "\xF4\x8F\xBF" , UTF8 , BL_ERROR_DATA_TRUNCATED), |
502 | |
503 | // Truncated character at the end (the converter must output the content, which was correct). |
504 | ENTRY("a" , UTF8 , "a\xF4\x8F\xBF" , UTF8 , BL_ERROR_DATA_TRUNCATED), |
505 | ENTRY("ab" , UTF8 , "ab\xF4\x8F\xBF" , UTF8 , BL_ERROR_DATA_TRUNCATED), |
506 | ENTRY("TestString" , UTF8 , "TestString\xC5" , UTF8 , BL_ERROR_DATA_TRUNCATED), |
507 | #if BL_BYTE_ORDER == 1234 |
508 | ENTRY("T\0e\0s\0t\0S\0t\0r\0i\0n\0g\0" , UTF16 , "TestString\xC5" , UTF8 , BL_ERROR_DATA_TRUNCATED), |
509 | #else |
510 | ENTRY("\0T\0e\0s\0t\0S\0t\0r\0i\0n\0g" , UTF16 , "TestString\xC5" , UTF8 , BL_ERROR_DATA_TRUNCATED), |
511 | #endif |
512 | |
513 | // Invalid UTf-8 characters. |
514 | ENTRY("" , UTF8 , "\x80" , UTF8 , BL_ERROR_INVALID_STRING), |
515 | ENTRY("" , UTF8 , "\xC1" , UTF8 , BL_ERROR_INVALID_STRING), |
516 | ENTRY("" , UTF8 , "\xF5\x8F\xBF\xBF" , UTF8 , BL_ERROR_INVALID_STRING), |
517 | ENTRY("" , UTF8 , "\x91\x8F\xBF\xBF" , UTF8 , BL_ERROR_INVALID_STRING), |
518 | ENTRY("" , UTF8 , "\xF6\x8F\xBF\xBF" , UTF8 , BL_ERROR_INVALID_STRING), |
519 | ENTRY("" , UTF8 , "\xF4\xFF\xBF\xBF" , UTF8 , BL_ERROR_INVALID_STRING), |
520 | |
521 | // Overlong UTF-8 characters. |
522 | ENTRY("" , UTF8 , "\xC0\xA0" , UTF8 , BL_ERROR_INVALID_STRING) |
523 | |
524 | #undef ENTRY |
525 | }; |
526 | |
527 | for (size_t i = 0; i < BL_ARRAY_SIZE(testEntries); i++) { |
528 | const TestEntry& entry = testEntries[i]; |
529 | char output[32]; |
530 | |
531 | BLUnicodeConversionState state; |
532 | BLResult result = blConvertUnicode(output, 32, entry.dstEncoding, entry.src, entry.srcSize, entry.srcEncoding, state); |
533 | |
534 | bool failed = (result != entry.result) || |
535 | (state.dstIndex != entry.dstSize) || |
536 | (memcmp(output, entry.dst, state.dstIndex) != 0); |
537 | |
538 | if (failed) { |
539 | size_t inputSize = entry.srcSize; |
540 | size_t outputSize = state.dstIndex; |
541 | size_t expectedSize = entry.dstSize; |
542 | |
543 | printf(" Failed Entry #%u\n" , unsigned(i)); |
544 | |
545 | printf(" Input :" ); |
546 | for (size_t j = 0; j < inputSize; j++) |
547 | printf(" %02X" , uint8_t(entry.src[j])); |
548 | printf("%s\n" , inputSize ? "" : " (Nothing)" ); |
549 | |
550 | printf(" Output :" ); |
551 | for (size_t j = 0; j < outputSize; j++) |
552 | printf(" %02X" , uint8_t(output[j])); |
553 | printf("%s\n" , outputSize ? "" : " (Nothing)" ); |
554 | |
555 | printf(" Expected:" ); |
556 | for (size_t j = 0; j < expectedSize; j++) |
557 | printf(" %02X" , uint8_t(entry.dst[j])); |
558 | printf("%s\n" , expectedSize ? "" : " (Nothing)" ); |
559 | printf(" ErrorCode: Actual(%u) %s Expected(%u)\n" , result, (result == entry.result) ? "==" : "!=" , entry.result); |
560 | } |
561 | |
562 | EXPECT(!failed); |
563 | } |
564 | } |
565 | |
566 | UNIT(blend2d_unicode_io) { |
567 | INFO("BLUtf8Reader" ); |
568 | { |
569 | const uint8_t data[] = { |
570 | 0xE2, 0x82, 0xAC, // U+0020AC |
571 | 0xF0, 0x90, 0x8D, 0x88 // U+010348 |
572 | }; |
573 | |
574 | BLUtf8Reader it(data, BL_ARRAY_SIZE(data)); |
575 | uint32_t uc; |
576 | |
577 | EXPECT(it.hasNext()); |
578 | EXPECT(it.next<BL_UNICODE_IO_CALC_INDEX>(uc) == BL_SUCCESS); |
579 | EXPECT(uc == 0x0020AC); |
580 | |
581 | EXPECT(it.hasNext()); |
582 | EXPECT(it.next<BL_UNICODE_IO_CALC_INDEX>(uc) == BL_SUCCESS); |
583 | EXPECT(uc == 0x010348); |
584 | |
585 | EXPECT(!it.hasNext()); |
586 | |
587 | // Verify that sizes were calculated correctly. |
588 | EXPECT(it.byteIndex(data) == 7); |
589 | EXPECT(it.utf8Index(data) == 7); |
590 | EXPECT(it.utf16Index(data) == 3); // 3 code-points (1 BMP and 1 SMP). |
591 | EXPECT(it.utf32Index(data) == 2); // 2 code-points. |
592 | |
593 | const uint8_t invalidData[] = { 0xE2, 0x82 }; |
594 | it.reset(invalidData, BL_ARRAY_SIZE(invalidData)); |
595 | |
596 | EXPECT(it.hasNext()); |
597 | EXPECT(it.next(uc) == BL_ERROR_DATA_TRUNCATED); |
598 | |
599 | // After error the iterator should not move. |
600 | EXPECT(it.hasNext()); |
601 | EXPECT(it.byteIndex(invalidData) == 0); |
602 | EXPECT(it.utf8Index(invalidData) == 0); |
603 | EXPECT(it.utf16Index(invalidData) == 0); |
604 | EXPECT(it.utf32Index(invalidData) == 0); |
605 | } |
606 | |
607 | INFO("BLUtf16Reader" ); |
608 | { |
609 | const uint16_t data[] = { |
610 | 0x20AC, // U+0020AC |
611 | 0xD800, 0xDF48 // U+010348 |
612 | }; |
613 | |
614 | BLUtf16Reader it(data, BL_ARRAY_SIZE(data) * sizeof(uint16_t)); |
615 | uint32_t uc; |
616 | |
617 | EXPECT(it.hasNext()); |
618 | EXPECT(it.next<BL_UNICODE_IO_CALC_INDEX>(uc) == BL_SUCCESS); |
619 | EXPECT(uc == 0x0020AC); |
620 | |
621 | EXPECT(it.hasNext()); |
622 | EXPECT(it.next<BL_UNICODE_IO_CALC_INDEX>(uc) == BL_SUCCESS); |
623 | EXPECT(uc == 0x010348); |
624 | |
625 | EXPECT(!it.hasNext()); |
626 | |
627 | // Verify that sizes were calculated correctly. |
628 | EXPECT(it.byteIndex(data) == 6); |
629 | EXPECT(it.utf8Index(data) == 7); |
630 | EXPECT(it.utf16Index(data) == 3); // 3 code-points (1 BMP and 1 SMP). |
631 | EXPECT(it.utf32Index(data) == 2); // 2 code-points. |
632 | |
633 | const uint16_t invalidData[] = { 0xD800 }; |
634 | it.reset(invalidData, BL_ARRAY_SIZE(invalidData) * sizeof(uint16_t)); |
635 | |
636 | EXPECT(it.hasNext()); |
637 | EXPECT(it.next<BL_UNICODE_IO_CALC_INDEX | BL_UNICODE_IO_STRICT>(uc) == BL_ERROR_DATA_TRUNCATED); |
638 | |
639 | // After an error the iterator should not move. |
640 | EXPECT(it.hasNext()); |
641 | EXPECT(it.byteIndex(invalidData) == 0); |
642 | EXPECT(it.utf8Index(invalidData) == 0); |
643 | EXPECT(it.utf16Index(invalidData) == 0); |
644 | EXPECT(it.utf32Index(invalidData) == 0); |
645 | |
646 | // However, this should pass in non-strict mode. |
647 | EXPECT(it.next(uc) == BL_SUCCESS); |
648 | EXPECT(!it.hasNext()); |
649 | } |
650 | |
651 | INFO("BLUtf32Reader" ); |
652 | { |
653 | const uint32_t data[] = { |
654 | 0x0020AC, |
655 | 0x010348 |
656 | }; |
657 | |
658 | BLUtf32Reader it(data, BL_ARRAY_SIZE(data) * sizeof(uint32_t)); |
659 | uint32_t uc; |
660 | |
661 | EXPECT(it.hasNext()); |
662 | EXPECT(it.next<BL_UNICODE_IO_CALC_INDEX>(uc) == BL_SUCCESS); |
663 | EXPECT(uc == 0x0020AC); |
664 | |
665 | EXPECT(it.hasNext()); |
666 | EXPECT(it.next<BL_UNICODE_IO_CALC_INDEX>(uc) == BL_SUCCESS); |
667 | EXPECT(uc == 0x010348); |
668 | |
669 | EXPECT(!it.hasNext()); |
670 | |
671 | // Verify that sizes were calculated correctly. |
672 | EXPECT(it.byteIndex(data) == 8); |
673 | EXPECT(it.utf8Index(data) == 7); |
674 | EXPECT(it.utf16Index(data) == 3); // 3 code-points (1 BMP and 1 SMP). |
675 | EXPECT(it.utf32Index(data) == 2); // 2 code-points. |
676 | |
677 | const uint32_t invalidData[] = { 0xD800 }; |
678 | it.reset(invalidData, BL_ARRAY_SIZE(invalidData) * sizeof(uint32_t)); |
679 | |
680 | EXPECT(it.hasNext()); |
681 | EXPECT(it.next<BL_UNICODE_IO_CALC_INDEX | BL_UNICODE_IO_STRICT>(uc) == BL_ERROR_INVALID_STRING); |
682 | |
683 | // After an error the iterator should not move. |
684 | EXPECT(it.hasNext()); |
685 | EXPECT(it.byteIndex(invalidData) == 0); |
686 | EXPECT(it.utf8Index(invalidData) == 0); |
687 | EXPECT(it.utf16Index(invalidData) == 0); |
688 | EXPECT(it.utf32Index(invalidData) == 0); |
689 | |
690 | // However, this should pass in non-strict mode. |
691 | EXPECT(it.next(uc) == BL_SUCCESS); |
692 | EXPECT(!it.hasNext()); |
693 | } |
694 | |
695 | INFO("BLUtf8Writer" ); |
696 | { |
697 | char dst[7]; |
698 | BLUtf8Writer writer(dst, BL_ARRAY_SIZE(dst)); |
699 | |
700 | EXPECT(writer.write(0x20ACu) == BL_SUCCESS); |
701 | EXPECT(uint8_t(dst[0]) == 0xE2); |
702 | EXPECT(uint8_t(dst[1]) == 0x82); |
703 | EXPECT(uint8_t(dst[2]) == 0xAC); |
704 | |
705 | EXPECT(writer.write(0x010348u) == BL_SUCCESS); |
706 | EXPECT(uint8_t(dst[3]) == 0xF0); |
707 | EXPECT(uint8_t(dst[4]) == 0x90); |
708 | EXPECT(uint8_t(dst[5]) == 0x8D); |
709 | EXPECT(uint8_t(dst[6]) == 0x88); |
710 | EXPECT(writer.atEnd()); |
711 | |
712 | writer.reset(dst, 1); |
713 | EXPECT(writer.write(0x20ACu) == BL_ERROR_NO_SPACE_LEFT); |
714 | EXPECT(writer.write(0x0080u) == BL_ERROR_NO_SPACE_LEFT); |
715 | EXPECT(writer.write(0x00C1u) == BL_ERROR_NO_SPACE_LEFT); |
716 | |
717 | // We have only one byte left so this must pass... |
718 | EXPECT(writer.write('a') == BL_SUCCESS); |
719 | EXPECT(writer.atEnd()); |
720 | |
721 | writer.reset(dst, 2); |
722 | EXPECT(writer.write(0x20ACu) == BL_ERROR_NO_SPACE_LEFT); |
723 | EXPECT(writer.write(0x00C1u) == BL_SUCCESS); |
724 | EXPECT(uint8_t(dst[0]) == 0xC3u); |
725 | EXPECT(uint8_t(dst[1]) == 0x81u); |
726 | EXPECT(writer.atEnd()); |
727 | EXPECT(writer.write('a') == BL_ERROR_NO_SPACE_LEFT); |
728 | } |
729 | |
730 | INFO("BLUtf16Writer" ); |
731 | { |
732 | uint16_t dst[3]; |
733 | BLUtf16Writer<> writer(dst, BL_ARRAY_SIZE(dst)); |
734 | |
735 | EXPECT(writer.write(0x010348u) == BL_SUCCESS); |
736 | EXPECT(dst[0] == 0xD800u); |
737 | EXPECT(dst[1] == 0xDF48u); |
738 | |
739 | EXPECT(writer.write(0x010348u) == BL_ERROR_NO_SPACE_LEFT); |
740 | EXPECT(writer.write(0x20ACu) == BL_SUCCESS); |
741 | EXPECT(dst[2] == 0x20ACu); |
742 | EXPECT(writer.atEnd()); |
743 | } |
744 | } |
745 | #endif |
746 | |