1// LAF Base Library
2// Copyright (c) 2022 Igara Studio S.A.
3//
4// This file is released under the terms of the MIT license.
5// Read LICENSE.txt for more information.
6
7#ifndef BASE_UTF8_DECODE_H_INCLUDED
8#define BASE_UTF8_DECODE_H_INCLUDED
9#pragma once
10
11#include <string>
12
13namespace base {
14
15 class utf8_decode {
16 public:
17 using string = std::string;
18 using string_ref = const std::string&;
19 using iterator = std::string::const_iterator;
20
21 utf8_decode() { }
22 utf8_decode(const utf8_decode&) = default;
23 utf8_decode& operator=(const utf8_decode&) = default;
24
25 explicit utf8_decode(string_ref str)
26 : m_it(str.begin())
27 , m_end(str.end()) {
28 }
29
30 iterator pos() const {
31 return m_it;
32 }
33
34 bool is_end() const {
35 return m_it == m_end;
36 }
37
38 bool is_valid() const {
39 return m_valid;
40 }
41
42 int next() {
43 if (m_it == m_end)
44 return 0;
45
46 int c = *m_it;
47 ++m_it;
48
49 // UTF-8 escape bit 0x80 to encode larger code points
50 if (c & 0b1000'0000) {
51 // Get the number of bits following the first one 0b1xxx'xxxx,
52 // which indicates the number of extra bytes in the input
53 // string following this one, and that will be part of the
54 // final Unicode code point.
55 //
56 // This is like "number of leading ones", similar to a
57 // __builtin_clz(~x)-24 (for 8 bits), anyway doing some tests,
58 // the CLZ intrinsic is not faster than this code in x86_64.
59 int n = 0;
60 int f = 0b0100'0000;
61 while (c & f) {
62 ++n;
63 f >>= 1;
64 }
65
66 if (n == 0) {
67 // Invalid UTF-8: 0b10xx'xxxx alone, i.e. not inside a
68 // escaped sequence (e.g. after 0b110xx'xxx
69 m_valid = false;
70 return 0;
71 }
72
73 // Keep only the few initial data bits from the first byte (6
74 // first bits if we have only one extra char, then for each
75 // extra char we have less useful data in this first byte).
76 c &= (0b0001'1111 >> (n-1));
77
78 while (n--) {
79 if (m_it == m_end) {
80 // Invalid UTF-8: missing 0b10xx'xxxx bytes
81 m_valid = false;
82 return 0;
83 }
84 const int chr = *m_it;
85 ++m_it;
86 if ((chr & 0b1100'0000) != 0b1000'0000) {
87 // Invalid UTF-8: Extra byte doesn't contain 0b10xx'xxxx
88 m_valid = false;
89 return 0;
90 }
91 // Each extra byte in the encoded string adds 6 bits of
92 // information for the final Unicode code point.
93 c = (c << 6) | (chr & 0b0011'1111);
94 }
95 }
96
97 return c;
98 }
99
100 private:
101 iterator m_it;
102 iterator m_end;
103 bool m_valid = true;
104 };
105
106}
107
108#endif
109