1 | // [Blend2D] |
2 | // 2D Vector Graphics Powered by a JIT Compiler. |
3 | // |
4 | // [License] |
5 | // Zlib - See LICENSE.md file in the package. |
6 | |
7 | #ifndef BLEND2D_BLUNICODE_P_H |
8 | #define BLEND2D_BLUNICODE_P_H |
9 | |
10 | #include "./blsupport_p.h" |
11 | |
12 | //! \cond INTERNAL |
13 | //! \addtogroup blend2d_internal |
14 | //! \{ |
15 | |
16 | // ============================================================================ |
17 | // [Unicode Data] |
18 | // ============================================================================ |
19 | |
//! Table of 256 entries that is indexed by a byte value, see `blUtf8CharSize()`.
//! Only declared here; the data itself is defined outside of this header.
BL_HIDDEN extern const uint8_t blUtf8SizeData[256];
21 | |
22 | // ============================================================================ |
23 | // [Unicode Constants] |
24 | // ============================================================================ |
25 | |
//! Unicode code-points that have a special meaning to the unicode utilities.
enum BLCharCode : uint32_t {
  // High (leading) surrogates.
  BL_CHAR_HI_SURROGATE_FIRST = 0xD800u,                     //!< First high-surrogate code-point.
  BL_CHAR_HI_SURROGATE_LAST  = 0xDBFFu,                     //!< Last high-surrogate code-point.

  // Low (trailing) surrogates.
  BL_CHAR_LO_SURROGATE_FIRST = 0xDC00u,                     //!< First low-surrogate code-point.
  BL_CHAR_LO_SURROGATE_LAST  = 0xDFFFu,                     //!< Last low-surrogate code-point.

  // The whole surrogate range spans from the first high to the last low surrogate.
  BL_CHAR_SURROGATE_FIRST    = BL_CHAR_HI_SURROGATE_FIRST,  //!< First surrogate code-point.
  BL_CHAR_SURROGATE_LAST     = BL_CHAR_LO_SURROGATE_LAST,   //!< Last surrogate code-point.

  // Mongolian 'free variation selectors' FVS1..FVS3.
  BL_CHAR_FVS1               = 0x180Bu,                     //!< First free variation selector.
  BL_CHAR_FVS3               = 0x180Du,                     //!< Last free variation selector.

  // 'Variation selectors' VS1..VS16.
  BL_CHAR_VS1                = 0xFE00u,                     //!< First variation selector.
  BL_CHAR_VS16               = 0xFE0Fu,                     //!< Last variation selector.

  // 'Variation selectors supplement' VS17..VS256.
  BL_CHAR_VS17               = 0xE0100u,                    //!< First supplementary variation selector.
  BL_CHAR_VS256              = 0xE01EFu,                    //!< Last supplementary variation selector.

  BL_CHAR_BOM                = 0xFEFFu,                     //!< Native Byte-Order-Mark.
  BL_CHAR_REPLACEMENT        = 0xFFFDu,                     //!< Replacement character.
  BL_CHAR_MAX                = 0x10FFFFu                    //!< Last code-point.
};
51 | |
52 | //! Flags that can be used to parametrize unicode I/O iterators. |
53 | enum BLUnicodeIOFlags : uint32_t { |
54 | BL_UNICODE_IO_UNALIGNED = 0x00000001u, |
55 | BL_UNICODE_IO_BYTE_SWAP = 0x00000002u, |
56 | BL_UNICODE_IO_STRICT = 0x00000004u, |
57 | BL_UNICODE_IO_CALC_INDEX = 0x00000008u, |
58 | |
59 | BL_UNICODE_IO_BYTE_ORDER_LE = BL_BYTE_ORDER_NATIVE == BL_BYTE_ORDER_LE ? 0 : BL_UNICODE_IO_BYTE_SWAP, |
60 | BL_UNICODE_IO_BYTE_ORDER_BE = BL_BYTE_ORDER_NATIVE == BL_BYTE_ORDER_BE ? 0 : BL_UNICODE_IO_BYTE_SWAP |
61 | }; |
62 | |
63 | // ============================================================================ |
64 | // [Unicode Utilities] |
65 | // ============================================================================ |
66 | |
67 | namespace { |
68 | |
69 | template<typename T> |
70 | BL_INLINE uint32_t blUtf8CharSize(const T& c) noexcept { |
71 | typedef typename std::make_unsigned<T>::type U; |
72 | return blUtf8SizeData[U(c)]; |
73 | } |
74 | |
//! Tests whether `c` is a byte that can start a UTF-8 sequence - either an
//! ASCII character [0x00..0x7F] or a multi-byte lead byte [0xC2..0xF4].
//!
//! Continuation bytes [0x80..0xBF], overlong lead bytes [0xC0..0xC1], and
//! bytes [0xF5..0xFF] yield `false`.
template<typename T>
constexpr bool blIsValidUtf8(const T& c) noexcept {
  typedef typename std::make_unsigned<T>::type U;

  // The range is tested explicitly. A `U(c) - U(194) < U(51)` test is not
  // reliable here: when `T` is narrower than `int` both operands are promoted
  // to `int`, so bytes below 194 produce a negative difference that compares
  // as less than 51 instead of wrapping around.
  return U(c) < U(128) || (U(c) >= U(194) && U(c) < U(245));
}
80 | |
//! Tests whether `x` is an ASCII letter ('A'..'Z' or 'a'..'z').
template<typename T>
constexpr bool blIsAsciiAlpha(const T& x) noexcept {
  return (x >= T('a') && x <= T('z')) ||
         (x >= T('A') && x <= T('Z'));
}
83 | |
//! Tests whether `x` is an ASCII decimal digit ('0'..'9').
template<typename T>
constexpr bool blIsAsciiDigit(const T& x) noexcept {
  return !(x < T('0') || x > T('9'));
}
86 | |
//! Tests whether `x` is either an ASCII decimal digit or an ASCII letter.
template<typename T>
constexpr bool blIsAsciiAlnum(const T& x) noexcept {
  return blIsAsciiDigit(x) || blIsAsciiAlpha(x);
}
89 | |
//! Converts an ASCII upper-case letter to lower-case; any other value is
//! returned unchanged.
template<typename T>
constexpr T blAsciiToLower(const T& x) noexcept {
  // Bit 0x20 is clear in 'A'..'Z', flipping it selects the lower-case letter.
  return (x < T('A') || x > T('Z')) ? x : T(x ^ T(0x20));
}
92 | |
//! Converts an ASCII lower-case letter to upper-case; any other value is
//! returned unchanged.
template<typename T>
constexpr T blAsciiToUpper(const T& x) noexcept {
  // Bit 0x20 is set in 'a'..'z', flipping it selects the upper-case letter.
  return (x < T('a') || x > T('z')) ? x : T(x ^ T(0x20));
}
95 | |
96 | //! Tests whether the unicode character `uc` is high or low surrogate. |
97 | template<typename T> |
98 | constexpr bool blIsSurrogate(const T& uc) noexcept { return uc >= BL_CHAR_SURROGATE_FIRST && uc <= BL_CHAR_SURROGATE_LAST; } |
99 | |
100 | //! Tests whether the unicode character `uc` is a high (leading) surrogate. |
101 | template<typename T> |
102 | constexpr bool blIsHiSurrogate(const T& uc) noexcept { return uc >= BL_CHAR_HI_SURROGATE_FIRST && uc <= BL_CHAR_HI_SURROGATE_LAST; } |
103 | |
104 | //! Tests whether the unicode character `uc` is a low (trailing) surrogate. |
105 | template<typename T> |
106 | constexpr bool blIsLoSurrogate(const T& uc) noexcept { return uc >= BL_CHAR_LO_SURROGATE_FIRST && uc <= BL_CHAR_LO_SURROGATE_LAST; } |
107 | |
108 | //! Composes `hi` and `lo` surrogates into a unicode code-point. |
109 | template<typename T> |
110 | constexpr uint32_t blCharFromSurrogate(const T& hi, const T& lo) noexcept { |
111 | return (uint32_t(hi) << 10) + uint32_t(lo) - uint32_t((BL_CHAR_SURROGATE_FIRST << 10) + BL_CHAR_LO_SURROGATE_FIRST - 0x10000u); |
112 | } |
113 | |
114 | //! Decomposes a unicode code-point into `hi` and `lo` surrogates. |
115 | template<typename T> |
116 | BL_INLINE void blCharToSurrogate(uint32_t uc, T& hi, T& lo) noexcept { |
117 | uc -= 0x10000u; |
118 | hi = T(BL_CHAR_HI_SURROGATE_FIRST | (uc >> 10)); |
119 | lo = T(BL_CHAR_LO_SURROGATE_FIRST | (uc & 0x3FFu)); |
120 | } |
121 | |
122 | } // {anonymous} |
123 | |
124 | // ============================================================================ |
125 | // [Unicode Validation] |
126 | // ============================================================================ |
127 | |
128 | struct BLUnicodeValidationState { |
129 | size_t utf8Index; |
130 | size_t utf16Index; |
131 | size_t utf32Index; |
132 | |
133 | BL_INLINE void reset() noexcept { |
134 | utf8Index = 0; |
135 | utf16Index = 0; |
136 | utf32Index = 0; |
137 | } |
138 | |
139 | BL_INLINE bool hasSMP() const noexcept { return utf16Index != utf32Index; } |
140 | }; |
141 | |
//! Validates `sizeInBytes` bytes of `data` interpreted according to `encoding`
//! (the typed wrappers pass `BL_TEXT_ENCODING_UTF8`, `BL_TEXT_ENCODING_UTF16`,
//! or `BL_TEXT_ENCODING_UTF32`) and uses `state` to report indexes.
//!
//! NOTE(review): only declared here - what `state` contains when the function
//! fails should be confirmed against the implementation.
BL_HIDDEN BLResult blValidateUnicode(const void* data, size_t sizeInBytes, uint32_t encoding, BLUnicodeValidationState& state) noexcept;
143 | |
144 | static BL_INLINE BLResult blValidateUtf8(const char* data, size_t size, BLUnicodeValidationState& state) noexcept { |
145 | return blValidateUnicode(data, size, BL_TEXT_ENCODING_UTF8, state); |
146 | } |
147 | |
148 | static BL_INLINE BLResult blValidateUtf16(const uint16_t* data, size_t size, BLUnicodeValidationState& state) noexcept { |
149 | return blValidateUnicode(data, size * 2u, BL_TEXT_ENCODING_UTF16, state); |
150 | } |
151 | |
152 | static BL_INLINE BLResult blValidateUtf32(const uint32_t* data, size_t size, BLUnicodeValidationState& state) noexcept { |
153 | return blValidateUnicode(data, size * 4u, BL_TEXT_ENCODING_UTF32, state); |
154 | } |
155 | |
156 | // ============================================================================ |
157 | // [Conversion] |
158 | // ============================================================================ |
159 | |
160 | struct BLUnicodeConversionState { |
161 | size_t dstIndex; |
162 | size_t srcIndex; |
163 | |
164 | BL_INLINE void reset() noexcept { |
165 | this->dstIndex = 0; |
166 | this->srcIndex = 0; |
167 | } |
168 | }; |
169 | |
170 | //! Converts a string from one encoding to another. |
171 | //! |
//! The conversion works at a byte level. All sizes, including those stored in
//! `BLUnicodeConversionState`, are byte quantities. For example, to convert a
//! single UTF-16 BMP character the source size must be 2.
175 | BL_HIDDEN BLResult blConvertUnicode( |
176 | void* dst, size_t dstSizeInBytes, uint32_t dstEncoding, |
177 | const void* src, size_t srcSizeInBytes, uint32_t srcEncoding, BLUnicodeConversionState& state) noexcept; |
178 | |
179 | // ============================================================================ |
180 | // [BLUtf8Reader] |
181 | // ============================================================================ |
182 | |
183 | //! UTF-8 reader. |
184 | class BLUtf8Reader { |
185 | public: |
186 | enum : uint32_t { kCharSize = 1 }; |
187 | |
188 | //! Current pointer. |
189 | const char* _ptr; |
190 | //! End of input. |
191 | const char* _end; |
192 | //! `index() - _utf32IndexSubtract` yields the current `utf32Index`. |
193 | size_t _utf32IndexSubtract; |
194 | //! Number of surrogates is required to calculate `utf16Index`. |
195 | size_t _utf16SurrogateCount; |
196 | |
197 | BL_INLINE BLUtf8Reader(const void* data, size_t byteSize) noexcept { |
198 | reset(data, byteSize); |
199 | } |
200 | |
201 | BL_INLINE void reset(const void* data, size_t byteSize) noexcept { |
202 | _ptr = static_cast<const char*>(data); |
203 | _end = static_cast<const char*>(data) + byteSize; |
204 | _utf32IndexSubtract = 0; |
205 | _utf16SurrogateCount = 0; |
206 | } |
207 | |
208 | BL_INLINE bool hasNext() const noexcept { return _ptr != _end; } |
209 | BL_INLINE size_t remainingByteSize() const noexcept { return (size_t)(_end - _ptr); } |
210 | |
211 | BL_INLINE size_t byteIndex(const void* start) const noexcept { return (size_t)(_ptr - static_cast<const char*>(start)); } |
212 | BL_INLINE size_t utf8Index(const void* start) const noexcept { return byteIndex(start); } |
213 | BL_INLINE size_t utf16Index(const void* start) const noexcept { return utf32Index(start) + _utf16SurrogateCount; } |
214 | BL_INLINE size_t utf32Index(const void* start) const noexcept { return byteIndex(start) - _utf32IndexSubtract; } |
215 | BL_INLINE size_t nativeIndex(const void* start) const noexcept { return utf8Index(start); } |
216 | |
217 | template<uint32_t Flags = 0> |
218 | BL_INLINE BLResult next(uint32_t& uc) noexcept { |
219 | size_t ucSizeInBytes; |
220 | return next<Flags>(uc, ucSizeInBytes); |
221 | } |
222 | |
223 | template<uint32_t Flags = 0> |
224 | BL_INLINE BLResult next(uint32_t& uc, size_t& ucSizeInBytes) noexcept { |
225 | BL_ASSERT(hasNext()); |
226 | |
227 | uc = blMemReadU8(_ptr); |
228 | ucSizeInBytes = 1; |
229 | |
230 | _ptr++; |
231 | if (uc < 0x80u) { |
232 | // 1-Byte UTF-8 Sequence -> [0x00..0x7F]. |
233 | // ...nothing to do... |
234 | } |
235 | else { |
236 | // Start of MultiByte. |
237 | const uint32_t kMultiByte = 0xC2u; |
238 | |
239 | uc -= kMultiByte; |
240 | if (uc < 0xE0u - kMultiByte) { |
241 | // 2-Byte UTF-8 Sequence -> [0x80-0x7FF]. |
242 | _ptr++; |
243 | ucSizeInBytes = 2; |
244 | |
245 | // Truncated input. |
246 | if (BL_UNLIKELY(_ptr > _end)) |
247 | goto TruncatedString; |
248 | |
249 | // All consecutive bytes must be '10xxxxxx'. |
250 | uint32_t b1 = blMemReadU8(_ptr - 1) ^ 0x80u; |
251 | uc = ((uc + kMultiByte - 0xC0u) << 6) + b1; |
252 | |
253 | if (BL_UNLIKELY(b1 > 0x3Fu)) |
254 | goto InvalidString; |
255 | |
256 | // 2-Byte UTF-8 maps to one UTF-16 or UTF-32 code-point, so subtract 1. |
257 | if (Flags & BL_UNICODE_IO_CALC_INDEX) _utf32IndexSubtract += 1; |
258 | } |
259 | else if (uc < 0xF0u - kMultiByte) { |
260 | // 3-Byte UTF-8 Sequence -> [0x800-0xFFFF]. |
261 | _ptr += 2; |
262 | ucSizeInBytes = 3; |
263 | |
264 | // Truncated input. |
265 | if (BL_UNLIKELY(_ptr > _end)) |
266 | goto TruncatedString; |
267 | |
268 | uint32_t b1 = blMemReadU8(_ptr - 2) ^ 0x80u; |
269 | uint32_t b2 = blMemReadU8(_ptr - 1) ^ 0x80u; |
270 | uc = ((uc + kMultiByte - 0xE0u) << 12) + (b1 << 6) + b2; |
271 | |
272 | // 1. All consecutive bytes must be '10xxxxxx'. |
273 | // 2. Refuse overlong UTF-8. |
274 | if (BL_UNLIKELY((b1 | b2) > 0x3Fu || uc < 0x800u)) |
275 | goto InvalidString; |
276 | |
277 | // 3-Byte UTF-8 maps to one UTF-16 or UTF-32 code-point, so subtract 2. |
278 | if (Flags & BL_UNICODE_IO_CALC_INDEX) _utf32IndexSubtract += 2; |
279 | } |
280 | else { |
281 | // 4-Byte UTF-8 Sequence -> [0x010000-0x10FFFF]. |
282 | _ptr += 3; |
283 | ucSizeInBytes = 4; |
284 | |
285 | // Truncated input. |
286 | if (BL_UNLIKELY(_ptr > _end)) { |
287 | // If this happens we want to report a correct error, bytes 0xF5 |
288 | // and above are always invalid and normally caught later. |
289 | if (uc >= 0xF5u - kMultiByte) |
290 | goto InvalidString; |
291 | else |
292 | goto TruncatedString; |
293 | } |
294 | |
295 | uint32_t b1 = blMemReadU8(_ptr - 3) ^ 0x80u; |
296 | uint32_t b2 = blMemReadU8(_ptr - 2) ^ 0x80u; |
297 | uint32_t b3 = blMemReadU8(_ptr - 1) ^ 0x80u; |
298 | uc = ((uc + kMultiByte - 0xF0u) << 18) + (b1 << 12) + (b2 << 6) + b3; |
299 | |
300 | // 1. All consecutive bytes must be '10xxxxxx'. |
301 | // 2. Refuse overlong UTF-8. |
302 | // 3. Make sure the final character is <= U+10FFFF. |
303 | if (BL_UNLIKELY((b1 | b2 | b3) > 0x3Fu || uc < 0x010000u || uc > BL_CHAR_MAX)) |
304 | goto InvalidString; |
305 | |
306 | // 4-Byte UTF-8 maps to one UTF-16 or UTF-32 code-point, so subtract 3. |
307 | if (Flags & BL_UNICODE_IO_CALC_INDEX) _utf32IndexSubtract += 3; |
308 | if (Flags & BL_UNICODE_IO_CALC_INDEX) _utf16SurrogateCount += 1; |
309 | } |
310 | } |
311 | return BL_SUCCESS; |
312 | |
313 | InvalidString: |
314 | _ptr -= ucSizeInBytes; |
315 | return blTraceError(BL_ERROR_INVALID_STRING); |
316 | |
317 | TruncatedString: |
318 | _ptr -= ucSizeInBytes; |
319 | return blTraceError(BL_ERROR_DATA_TRUNCATED); |
320 | } |
321 | |
322 | BL_INLINE void skipOneUnit() noexcept { |
323 | BL_ASSERT(hasNext()); |
324 | _ptr++; |
325 | } |
326 | |
327 | template<uint32_t Flags = 0> |
328 | BL_INLINE BLResult validate() noexcept { |
329 | BLResult result = BL_SUCCESS; |
330 | while (hasNext()) { |
331 | uint32_t uc; |
332 | result = next<Flags>(uc); |
333 | if (result) |
334 | break; |
335 | } |
336 | return result; |
337 | } |
338 | }; |
339 | |
340 | // ============================================================================ |
341 | // [BLUtf16Reader] |
342 | // ============================================================================ |
343 | |
344 | //! UTF-16 reader. |
345 | class BLUtf16Reader { |
346 | public: |
347 | enum : uint32_t { kCharSize = 2 }; |
348 | |
349 | const char* _ptr; |
350 | const char* _end; |
351 | |
352 | size_t _utf8IndexAdd; |
353 | size_t _utf16SurrogateCount; |
354 | |
355 | BL_INLINE BLUtf16Reader(const void* data, size_t byteSize) noexcept { |
356 | reset(data, byteSize); |
357 | } |
358 | |
359 | // -------------------------------------------------------------------------- |
360 | // [Reset] |
361 | // -------------------------------------------------------------------------- |
362 | |
363 | BL_INLINE void reset(const void* data, size_t byteSize) noexcept { |
364 | _ptr = static_cast<const char*>(data); |
365 | _end = static_cast<const char*>(data) + blAlignDown(byteSize, 2); |
366 | _utf8IndexAdd = 0; |
367 | _utf16SurrogateCount = 0; |
368 | } |
369 | |
370 | // -------------------------------------------------------------------------- |
371 | // [Accessors] |
372 | // -------------------------------------------------------------------------- |
373 | |
374 | BL_INLINE bool hasNext() const noexcept { return _ptr != _end; } |
375 | BL_INLINE size_t remainingByteSize() const noexcept { return (size_t)(_end - _ptr); } |
376 | |
377 | BL_INLINE size_t byteIndex(const void* start) const noexcept { return (size_t)(_ptr - static_cast<const char*>(start)); } |
378 | BL_INLINE size_t utf8Index(const void* start) const noexcept { return utf16Index(start) + _utf8IndexAdd; } |
379 | BL_INLINE size_t utf16Index(const void* start) const noexcept { return byteIndex(start) / 2u; } |
380 | BL_INLINE size_t utf32Index(const void* start) const noexcept { return utf16Index(start) - _utf16SurrogateCount; } |
381 | BL_INLINE size_t nativeIndex(const void* start) const noexcept { return utf16Index(start); } |
382 | |
383 | // -------------------------------------------------------------------------- |
384 | // [Iterator] |
385 | // -------------------------------------------------------------------------- |
386 | |
387 | template<uint32_t Flags = 0> |
388 | BL_INLINE BLResult next(uint32_t& uc) noexcept { |
389 | size_t ucSizeInBytes; |
390 | return next<Flags>(uc, ucSizeInBytes); |
391 | } |
392 | |
393 | template<uint32_t Flags = 0> |
394 | BL_INLINE BLResult next(uint32_t& uc, size_t& ucSizeInBytes) noexcept { |
395 | BL_ASSERT(hasNext()); |
396 | |
397 | uc = readU16<Flags>(_ptr); |
398 | _ptr += 2; |
399 | |
400 | if (blIsSurrogate(uc)) { |
401 | if (BL_LIKELY(blIsHiSurrogate(uc))) { |
402 | if (BL_LIKELY(_ptr != _end)) { |
403 | uint32_t lo = readU16<Flags>(_ptr); |
404 | if (BL_LIKELY(blIsLoSurrogate(lo))) { |
405 | uc = blCharFromSurrogate(uc, lo); |
406 | _ptr += 2; |
407 | |
408 | // Add two to `_utf8IndexAdd` as two surrogates count as 2, so we |
409 | // have to add 2 more to have UTF-8 length of a valid surrogate. |
410 | if (Flags & BL_UNICODE_IO_CALC_INDEX) _utf8IndexAdd += 2; |
411 | if (Flags & BL_UNICODE_IO_CALC_INDEX) _utf16SurrogateCount += 1; |
412 | |
413 | ucSizeInBytes = 4; |
414 | return BL_SUCCESS; |
415 | } |
416 | else { |
417 | if (Flags & BL_UNICODE_IO_STRICT) |
418 | goto InvalidString; |
419 | } |
420 | } |
421 | else { |
422 | if (Flags & BL_UNICODE_IO_STRICT) |
423 | goto TruncatedString; |
424 | } |
425 | } |
426 | else { |
427 | if (Flags & BL_UNICODE_IO_STRICT) |
428 | goto InvalidString; |
429 | } |
430 | } |
431 | |
432 | // Either not surrogate or fallback in non-strict mode. |
433 | if (Flags & BL_UNICODE_IO_CALC_INDEX) |
434 | _utf8IndexAdd += size_t(uc >= 0x0080u) + size_t(uc >= 0x0800u); |
435 | |
436 | ucSizeInBytes = 2; |
437 | return BL_SUCCESS; |
438 | |
439 | InvalidString: |
440 | _ptr -= 2; |
441 | return blTraceError(BL_ERROR_INVALID_STRING); |
442 | |
443 | TruncatedString: |
444 | _ptr -= 2; |
445 | return blTraceError(BL_ERROR_DATA_TRUNCATED); |
446 | } |
447 | |
448 | BL_INLINE void skipOneUnit() noexcept { |
449 | BL_ASSERT(hasNext()); |
450 | _ptr += 2; |
451 | } |
452 | |
453 | // -------------------------------------------------------------------------- |
454 | // [Validator] |
455 | // -------------------------------------------------------------------------- |
456 | |
457 | template<uint32_t Flags = 0> |
458 | BL_INLINE BLResult validate() noexcept { |
459 | BLResult result = BL_SUCCESS; |
460 | while (hasNext()) { |
461 | uint32_t uc; |
462 | result = next<Flags>(uc); |
463 | if (result) |
464 | break; |
465 | } |
466 | return result; |
467 | } |
468 | |
469 | // -------------------------------------------------------------------------- |
470 | // [Utilities] |
471 | // -------------------------------------------------------------------------- |
472 | |
473 | template<uint32_t Flags = 0> |
474 | static BL_INLINE uint32_t readU16(const char* ptr) noexcept { |
475 | constexpr uint32_t kByteOrder = Flags & BL_UNICODE_IO_BYTE_SWAP ? BL_BYTE_ORDER_SWAPPED : BL_BYTE_ORDER_NATIVE; |
476 | constexpr uint32_t kAlignment = Flags & BL_UNICODE_IO_UNALIGNED ? 1 : 2; |
477 | return blMemReadU16<kByteOrder, kAlignment>(ptr); |
478 | } |
479 | }; |
480 | |
481 | // ============================================================================ |
482 | // [BLUtf32Reader] |
483 | // ============================================================================ |
484 | |
485 | //! UTF-32 reader. |
486 | class BLUtf32Reader { |
487 | public: |
488 | enum : uint32_t { kCharSize = 4 }; |
489 | |
490 | const char* _ptr; |
491 | const char* _end; |
492 | |
493 | size_t _utf8IndexAdd; |
494 | size_t _utf16SurrogateCount; |
495 | |
496 | BL_INLINE BLUtf32Reader(const void* data, size_t byteSize) noexcept { |
497 | reset(data, byteSize); |
498 | } |
499 | |
500 | BL_INLINE void reset(const void* data, size_t byteSize) noexcept { |
501 | _ptr = static_cast<const char*>(data); |
502 | _end = static_cast<const char*>(data) + blAlignDown(byteSize, 4); |
503 | _utf8IndexAdd = 0; |
504 | _utf16SurrogateCount = 0; |
505 | } |
506 | |
507 | BL_INLINE bool hasNext() const noexcept { return _ptr != _end; } |
508 | BL_INLINE size_t remainingByteSize() const noexcept { return (size_t)(_end - _ptr); } |
509 | |
510 | BL_INLINE size_t byteIndex(const void* start) const noexcept { return (size_t)(_ptr - static_cast<const char*>(start)); } |
511 | BL_INLINE size_t utf8Index(const void* start) const noexcept { return utf32Index(start) + _utf16SurrogateCount + _utf8IndexAdd; } |
512 | BL_INLINE size_t utf16Index(const void* start) const noexcept { return utf32Index(start) + _utf16SurrogateCount; } |
513 | BL_INLINE size_t utf32Index(const void* start) const noexcept { return byteIndex(start) / 4u; } |
514 | BL_INLINE size_t nativeIndex(const void* start) const noexcept { return utf32Index(start); } |
515 | |
516 | template<uint32_t Flags = 0> |
517 | BL_INLINE BLResult next(uint32_t& uc) noexcept { |
518 | size_t ucSizeInBytes; |
519 | return next<Flags>(uc, ucSizeInBytes); |
520 | } |
521 | |
522 | template<uint32_t Flags = 0> |
523 | BL_INLINE BLResult next(uint32_t& uc, size_t& ucSizeInBytes) noexcept { |
524 | BL_ASSERT(hasNext()); |
525 | |
526 | uc = readU32<Flags>(_ptr); |
527 | if (BL_UNLIKELY(uc > BL_CHAR_MAX)) |
528 | return blTraceError(BL_ERROR_INVALID_STRING); |
529 | |
530 | if (Flags & BL_UNICODE_IO_STRICT) { |
531 | if (BL_UNLIKELY(blIsSurrogate(uc))) |
532 | return blTraceError(BL_ERROR_INVALID_STRING); |
533 | } |
534 | |
535 | if (Flags & BL_UNICODE_IO_CALC_INDEX) _utf8IndexAdd += size_t(uc >= 0x800u) + size_t(uc >= 0x80u); |
536 | if (Flags & BL_UNICODE_IO_CALC_INDEX) _utf16SurrogateCount += size_t(uc >= 0x10000u); |
537 | |
538 | _ptr += 4; |
539 | ucSizeInBytes = 4; |
540 | return BL_SUCCESS; |
541 | } |
542 | |
543 | BL_INLINE void skipOneUnit() noexcept { |
544 | BL_ASSERT(hasNext()); |
545 | _ptr += 4; |
546 | } |
547 | |
548 | template<uint32_t Flags = 0> |
549 | BL_INLINE BLResult validate() noexcept { |
550 | BLResult result = BL_SUCCESS; |
551 | while (hasNext()) { |
552 | uint32_t uc; |
553 | result = next<Flags>(uc); |
554 | if (result) |
555 | break; |
556 | } |
557 | return result; |
558 | } |
559 | |
560 | template<uint32_t Flags = 0> |
561 | static BL_INLINE uint32_t readU32(const char* ptr) noexcept { |
562 | constexpr uint32_t kByteOrder = Flags & BL_UNICODE_IO_BYTE_SWAP ? BL_BYTE_ORDER_SWAPPED : BL_BYTE_ORDER_NATIVE; |
563 | constexpr uint32_t kAlignment = Flags & BL_UNICODE_IO_UNALIGNED ? 1 : 4; |
564 | return blMemReadU32<kByteOrder, kAlignment>(ptr); |
565 | } |
566 | }; |
567 | |
568 | // ============================================================================ |
569 | // [BLUtf8Writer] |
570 | // ============================================================================ |
571 | |
572 | //! UTF8 writer. |
573 | class BLUtf8Writer { |
574 | public: |
575 | typedef char CharType; |
576 | |
577 | char* _ptr; |
578 | char* _end; |
579 | |
580 | BL_INLINE BLUtf8Writer(char* dst, size_t size) noexcept { |
581 | reset(dst, size); |
582 | } |
583 | |
584 | BL_INLINE void reset(char* dst, size_t size) noexcept { |
585 | _ptr = dst; |
586 | _end = dst + size; |
587 | } |
588 | |
589 | BL_INLINE size_t index(const char* start) const noexcept { |
590 | return (size_t)(_ptr - start); |
591 | } |
592 | |
593 | BL_INLINE bool atEnd() const noexcept { return _ptr == _end; } |
594 | BL_INLINE size_t remainingSize() const noexcept { return (size_t)(_end - _ptr); } |
595 | |
596 | BL_INLINE BLResult write(uint32_t uc) noexcept { |
597 | if (uc <= 0x7F) |
598 | return writeByte(uc); |
599 | else if (uc <= 0x7FFu) |
600 | return write2Bytes(uc); |
601 | else if (uc <= 0xFFFFu) |
602 | return write3Bytes(uc); |
603 | else |
604 | return write4Bytes(uc); |
605 | } |
606 | |
607 | BL_INLINE BLResult writeUnsafe(uint32_t uc) noexcept { |
608 | if (uc <= 0x7F) |
609 | return writeByteUnsafe(uc); |
610 | else if (uc <= 0x7FFu) |
611 | return write2BytesUnsafe(uc); |
612 | else if (uc <= 0xFFFFu) |
613 | return write3BytesUnsafe(uc); |
614 | else |
615 | return write4BytesUnsafe(uc); |
616 | } |
617 | |
618 | BL_INLINE BLResult writeByte(uint32_t uc) noexcept { |
619 | BL_ASSERT(uc <= 0x7Fu); |
620 | if (BL_UNLIKELY(atEnd())) |
621 | return blTraceError(BL_ERROR_NO_SPACE_LEFT); |
622 | |
623 | _ptr[0] = char(uint8_t(uc)); |
624 | _ptr++; |
625 | return BL_SUCCESS; |
626 | } |
627 | |
628 | BL_INLINE BLResult writeByteUnsafe(uint32_t uc) noexcept { |
629 | BL_ASSERT(remainingSize() >= 1); |
630 | _ptr[0] = char(uint8_t(uc)); |
631 | _ptr++; |
632 | return BL_SUCCESS; |
633 | } |
634 | |
635 | BL_INLINE BLResult write2Bytes(uint32_t uc) noexcept { |
636 | BL_ASSERT(uc >= 0x80u && uc <= 0x7FFu); |
637 | |
638 | _ptr += 2; |
639 | if (BL_UNLIKELY(_ptr > _end)) { |
640 | _ptr -= 2; |
641 | return blTraceError(BL_ERROR_NO_SPACE_LEFT); |
642 | } |
643 | |
644 | _ptr[-2] = char(uint8_t(0xC0u | (uc >> 6))); |
645 | _ptr[-1] = char(uint8_t(0x80u | (uc & 63))); |
646 | return BL_SUCCESS; |
647 | } |
648 | |
649 | BL_INLINE BLResult write2BytesUnsafe(uint32_t uc) noexcept { |
650 | BL_ASSERT(remainingSize() >= 2); |
651 | BL_ASSERT(uc >= 0x80u && uc <= 0x7FFu); |
652 | |
653 | _ptr[0] = char(uint8_t(0xC0u | (uc >> 6))); |
654 | _ptr[1] = char(uint8_t(0x80u | (uc & 63))); |
655 | |
656 | _ptr += 2; |
657 | return BL_SUCCESS; |
658 | } |
659 | |
660 | BL_INLINE BLResult write3Bytes(uint32_t uc) noexcept { |
661 | BL_ASSERT(uc >= 0x800u && uc <= 0xFFFFu); |
662 | |
663 | _ptr += 3; |
664 | if (BL_UNLIKELY(_ptr > _end)) { |
665 | _ptr -= 3; |
666 | return blTraceError(BL_ERROR_NO_SPACE_LEFT); |
667 | } |
668 | |
669 | _ptr[-3] = char(uint8_t(0xE0u | ((uc >> 12) ))); |
670 | _ptr[-2] = char(uint8_t(0x80u | ((uc >> 6) & 63))); |
671 | _ptr[-1] = char(uint8_t(0x80u | ((uc ) & 63))); |
672 | return BL_SUCCESS; |
673 | } |
674 | |
675 | BL_INLINE BLResult write3BytesUnsafe(uint32_t uc) noexcept { |
676 | BL_ASSERT(remainingSize() >= 3); |
677 | BL_ASSERT(uc >= 0x800u && uc <= 0xFFFFu); |
678 | |
679 | _ptr[0] = char(uint8_t(0xE0u | ((uc >> 12) ))); |
680 | _ptr[1] = char(uint8_t(0x80u | ((uc >> 6) & 63))); |
681 | _ptr[2] = char(uint8_t(0x80u | ((uc ) & 63))); |
682 | |
683 | _ptr += 3; |
684 | return BL_SUCCESS; |
685 | } |
686 | |
687 | BL_INLINE BLResult write4Bytes(uint32_t uc) noexcept { |
688 | BL_ASSERT(uc >= 0x10000u && uc <= 0x10FFFFu); |
689 | |
690 | _ptr += 4; |
691 | if (BL_UNLIKELY(_ptr > _end)) { |
692 | _ptr -= 4; |
693 | return blTraceError(BL_ERROR_NO_SPACE_LEFT); |
694 | } |
695 | |
696 | _ptr[-4] = char(uint8_t(0xF0u | ((uc >> 18) ))); |
697 | _ptr[-3] = char(uint8_t(0x80u | ((uc >> 12) & 63))); |
698 | _ptr[-2] = char(uint8_t(0x80u | ((uc >> 6) & 63))); |
699 | _ptr[-1] = char(uint8_t(0x80u | ((uc ) & 63))); |
700 | return BL_SUCCESS; |
701 | } |
702 | |
703 | BL_INLINE BLResult write4BytesUnsafe(uint32_t uc) noexcept { |
704 | BL_ASSERT(remainingSize() >= 4); |
705 | BL_ASSERT(uc >= 0x10000u && uc <= 0x10FFFFu); |
706 | |
707 | _ptr[0] = char(uint8_t(0xF0u | ((uc >> 18) ))); |
708 | _ptr[1] = char(uint8_t(0x80u | ((uc >> 12) & 63))); |
709 | _ptr[2] = char(uint8_t(0x80u | ((uc >> 6) & 63))); |
710 | _ptr[3] = char(uint8_t(0x80u | ((uc ) & 63))); |
711 | |
712 | _ptr += 4; |
713 | return BL_SUCCESS; |
714 | } |
715 | }; |
716 | |
717 | // ============================================================================ |
718 | // [BLUtf16Writer] |
719 | // ============================================================================ |
720 | |
721 | //! UTF16 writer that can be parametrized by `ByteOrder` and `Alignment`. |
722 | template<uint32_t ByteOrder = BL_BYTE_ORDER_NATIVE, uint32_t Alignment = 2> |
723 | class BLUtf16Writer { |
724 | public: |
725 | typedef uint16_t CharType; |
726 | |
727 | uint16_t* _ptr; |
728 | uint16_t* _end; |
729 | |
730 | BL_INLINE BLUtf16Writer(uint16_t* dst, size_t size) noexcept { |
731 | reset(dst, size); |
732 | } |
733 | |
734 | BL_INLINE void reset(uint16_t* dst, size_t size) noexcept { |
735 | _ptr = dst; |
736 | _end = dst + size; |
737 | } |
738 | |
739 | BL_INLINE size_t index(const uint16_t* start) const noexcept { |
740 | return (size_t)(_ptr - start); |
741 | } |
742 | |
743 | BL_INLINE bool atEnd() const noexcept { return _ptr == _end; } |
744 | BL_INLINE size_t remainingSize() const noexcept { return (size_t)(_end - _ptr); } |
745 | |
746 | BL_INLINE BLResult write(uint32_t uc) noexcept { |
747 | if (uc <= 0xFFFFu) |
748 | return writeBMP(uc); |
749 | else |
750 | return writeSMP(uc); |
751 | } |
752 | |
753 | BL_INLINE BLResult writeBMP(uint32_t uc) noexcept { |
754 | BL_ASSERT(uc <= 0xFFFFu); |
755 | |
756 | if (BL_UNLIKELY(atEnd())) |
757 | return blTraceError(BL_ERROR_NO_SPACE_LEFT); |
758 | |
759 | _blMemWriteU16(_ptr, uc); |
760 | _ptr++; |
761 | return BL_SUCCESS; |
762 | } |
763 | |
764 | BL_INLINE BLResult writeBMPUnsafe(uint32_t uc) noexcept { |
765 | BL_ASSERT(remainingSize() >= 1); |
766 | |
767 | _blMemWriteU16(_ptr, uc); |
768 | _ptr++; |
769 | return BL_SUCCESS; |
770 | } |
771 | |
772 | BL_INLINE BLResult writeSMP(uint32_t uc) noexcept { |
773 | BL_ASSERT(uc >= 0x10000u && uc <= 0x10FFFFu); |
774 | |
775 | _ptr += 2; |
776 | if (BL_UNLIKELY(_ptr > _end)) { |
777 | _ptr -= 2; |
778 | return blTraceError(BL_ERROR_NO_SPACE_LEFT); |
779 | } |
780 | |
781 | uint32_t hi, lo; |
782 | blCharToSurrogate(uc, hi, lo); |
783 | |
784 | _blMemWriteU16(_ptr - 2, hi); |
785 | _blMemWriteU16(_ptr - 1, lo); |
786 | return BL_SUCCESS; |
787 | } |
788 | |
789 | BL_INLINE BLResult writeSMPUnsafe(uint32_t uc) noexcept { |
790 | BL_ASSERT(remainingSize() >= 2); |
791 | BL_ASSERT(uc >= 0x10000u && uc <= 0x10FFFFu); |
792 | |
793 | uint32_t hi, lo; |
794 | blCharToSurrogate(uc, hi, lo); |
795 | |
796 | _blMemWriteU16(_ptr + 0, hi); |
797 | _blMemWriteU16(_ptr + 1, lo); |
798 | |
799 | _ptr += 2; |
800 | return BL_SUCCESS; |
801 | } |
802 | |
803 | // -------------------------------------------------------------------------- |
804 | // [Utilities] |
805 | // -------------------------------------------------------------------------- |
806 | |
807 | static BL_INLINE void _blMemWriteU16(void* dst, uint32_t value) noexcept { |
808 | blMemWriteU16<ByteOrder, Alignment>(dst, value); |
809 | } |
810 | }; |
811 | |
812 | // ============================================================================ |
813 | // [BLUtf32Writer] |
814 | // ============================================================================ |
815 | |
816 | //! UTF32 writer that can be parametrized by `ByteOrder` and `Alignment`. |
817 | template<uint32_t ByteOrder = BL_BYTE_ORDER_NATIVE, uint32_t Alignment = 4> |
818 | class BLUtf32Writer { |
819 | public: |
820 | typedef uint32_t CharType; |
821 | |
822 | uint32_t* _ptr; |
823 | uint32_t* _end; |
824 | |
825 | BL_INLINE BLUtf32Writer(uint32_t* dst, size_t size) noexcept { |
826 | reset(dst, size); |
827 | } |
828 | |
829 | BL_INLINE void reset(uint32_t* dst, size_t size) noexcept { |
830 | _ptr = dst; |
831 | _end = dst + size; |
832 | } |
833 | |
834 | BL_INLINE size_t index(const uint32_t* start) const noexcept { |
835 | return (size_t)(_ptr - start); |
836 | } |
837 | |
838 | BL_INLINE bool atEnd() const noexcept { return _ptr == _end; } |
839 | BL_INLINE size_t remainingSize() const noexcept { return (size_t)(_end - _ptr); } |
840 | |
841 | BL_INLINE BLResult write(uint32_t uc) noexcept { |
842 | if (BL_UNLIKELY(atEnd())) |
843 | return blTraceError(BL_ERROR_NO_SPACE_LEFT); |
844 | |
845 | _blMemWriteU32(_ptr, uc); |
846 | _ptr++; |
847 | return BL_SUCCESS; |
848 | } |
849 | |
850 | static BL_INLINE void _blMemWriteU32(void* dst, uint32_t value) noexcept { |
851 | blMemWriteU32<ByteOrder, Alignment>(dst, value); |
852 | } |
853 | }; |
854 | |
855 | //! \} |
856 | //! \endcond |
857 | |
858 | #endif // BLEND2D_BLUNICODE_P_H |
859 | |