utf8_decode.h source code [Aseprite/laf/base/utf8_decode.h]

1	// LAF Base Library
2	// Copyright (c) 2022 Igara Studio S.A.
3	//
4	// This file is released under the terms of the MIT license.
5	// Read LICENSE.txt for more information.
6
7	#ifndef BASE_UTF8_DECODE_H_INCLUDED
8	#define BASE_UTF8_DECODE_H_INCLUDED
9	#pragma once
10
11	#include <string>
12
13	namespace base {
14
// Incremental UTF-8 decoder that walks a std::string one code point at
// a time.
//
// The decoder only stores iterators into the string given to the
// constructor (it doesn't copy the string), so that string must stay
// alive and unmodified while the decoder is being used.
//
// Typical usage:
//
//   utf8_decode decode(str);
//   while (!decode.is_end()) {
//     int chr = decode.next();
//     if (!decode.is_valid())
//       break;
//     ...
//   }
//
// Limitations: overlong encodings, UTF-16 surrogates and values above
// U+10FFFF are not rejected by this decoder.
class utf8_decode {
public:
  using string = std::string;
  using string_ref = const std::string&;
  using iterator = std::string::const_iterator;

  // Creates a decoder with no input: is_end() is true and next()
  // returns 0.
  utf8_decode() { }
  utf8_decode(const utf8_decode&) = default;
  utf8_decode& operator=(const utf8_decode&) = default;

  // Starts decoding from the beginning of "str". Only the begin/end
  // iterators of the string are stored.
  explicit utf8_decode(string_ref str)
    : m_it(str.begin())
    , m_end(str.end()) {
  }

  // Returns the position of the next byte that will be read.
  iterator pos() const {
    return m_it;
  }

  // Returns true when all the input was consumed.
  bool is_end() const {
    return m_it == m_end;
  }

  // Returns false if any previous call to next() found an invalid
  // UTF-8 sequence. The flag is never reset to true.
  bool is_valid() const {
    return m_valid;
  }

  // Decodes and returns the next Unicode code point, advancing the
  // current position.
  //
  // Returns 0 when the end of the input was reached or when an invalid
  // sequence is found (in which case is_valid() becomes false). As an
  // encoded NUL character also returns 0, use is_end()/is_valid() to
  // distinguish between these cases.
  int next() {
    if (m_it == m_end)
      return 0;

    // Read the byte as unsigned so the bit tests below don't depend on
    // the signedness of "char".
    int c = static_cast<unsigned char>(*m_it);
    ++m_it;

    // UTF-8 escape bit 0x80 to encode larger code points
    if (c & 0b1000'0000) {
      // Get the number of bits following the first one 0b1xxx'xxxx,
      // which indicates the number of extra bytes in the input
      // string following this one, and that will be part of the
      // final Unicode code point.
      //
      // This is like "number of leading ones", similar to a
      // __builtin_clz(~x)-24 (for 8 bits), anyway doing some tests,
      // the CLZ intrinsic is not faster than this code in x86_64.
      int n = 0;
      int f = 0b0100'0000;
      while (c & f) {
        ++n;
        f >>= 1;
      }

      if (n == 0) {
        // Invalid UTF-8: 0b10xx'xxxx alone, i.e. not inside an
        // escaped sequence (e.g. after 0b110x'xxxx)
        m_valid = false;
        return 0;
      }

      if (n > 3) {
        // Invalid UTF-8: lead bytes 0b1111'1xxx (0xF8..0xFF) would
        // announce 4 or more extra bytes, which UTF-8 doesn't allow
        // (a sequence has at most 4 bytes in total). Accepting them
        // could also shift more than 31 bits into "c" and overflow
        // the int.
        m_valid = false;
        return 0;
      }

      // Keep only the few initial data bits from the first byte (5
      // first bits if we have only one extra char, then for each
      // extra char we have less useful data in this first byte).
      c &= (0b0001'1111 >> (n-1));

      while (n--) {
        if (m_it == m_end) {
          // Invalid UTF-8: missing 0b10xx'xxxx bytes
          m_valid = false;
          return 0;
        }
        const int chr = static_cast<unsigned char>(*m_it);
        ++m_it;
        if ((chr & 0b1100'0000) != 0b1000'0000) {
          // Invalid UTF-8: Extra byte doesn't contain 0b10xx'xxxx
          m_valid = false;
          return 0;
        }
        // Each extra byte in the encoded string adds 6 bits of
        // information for the final Unicode code point.
        c = (c << 6) | (chr & 0b0011'1111);
      }
    }

    return c;
  }

private:
  // Next byte to read and end of the input. Value-initialized so a
  // default-constructed decoder compares them safely in is_end().
  iterator m_it{};
  iterator m_end{};
  // Becomes false (and stays false) after the first decoding error.
  bool m_valid = true;
};
105
106	}
107
108	#endif
109

Browse the source code of Aseprite/laf/base/utf8_decode.h