utf8_iterator.cpp source code [SuperTux/src/util/utf8_iterator.cpp]

1	// SuperTux
2	// Copyright (C) 2009 Ingo Ruhnke <grumbel@gmail.com>
3	//
4	// This program is free software: you can redistribute it and/or modify
5	// it under the terms of the GNU General Public License as published by
6	// the Free Software Foundation, either version 3 of the License, or
7	// (at your option) any later version.
8	//
9	// This program is distributed in the hope that it will be useful,
10	// but WITHOUT ANY WARRANTY; without even the implied warranty of
11	// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12	// GNU General Public License for more details.
13	//
14	// You should have received a copy of the GNU General Public License
15	// along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17	#include "util/utf8_iterator.hpp"
18
19	#include <stdexcept>
20
21	#include "util/log.hpp"
22
namespace {

bool has_multibyte_mark(unsigned char c);
uint32_t continuation_bits(const std::string& text, size_t index);
uint32_t decode_utf8(const std::string& text, size_t& p);

/**
 * Returns true if @a c matches the bit pattern 10xx.xxxx, i.e. it is a
 * continuation byte (the 2nd, 3rd or 4th byte) of a multibyte UTF-8 sequence.
 */
bool has_multibyte_mark(unsigned char c)
{
  return (c & 0xC0) == 0x80;
}

/**
 * Reads the byte at @a index of @a text and returns its six payload bits.
 * The caller must guarantee that @a index is smaller than text.size().
 *
 * @throws std::runtime_error if the byte is not a continuation byte.
 */
uint32_t continuation_bits(const std::string& text, size_t index)
{
  const unsigned char byte = static_cast<unsigned char>(text[index]);
  if (!has_multibyte_mark(byte)) {
    throw std::runtime_error("Malformed utf-8 sequence");
  }
  return byte & 0x3Fu;
}

/**
 * Gets the unicode character at byte position @a p of the UTF-8 encoded
 * @a text, then advances @a p to the next character.
 *
 * Calling this with @a p equal to text.size() decodes the string's
 * terminating zero byte: the result is 0 and @a p becomes text.size() + 1.
 * @a p is left unchanged whenever an exception is thrown.
 *
 * Only the structure of the sequence (lead byte followed by the right number
 * of continuation bytes) is validated; overlong forms and surrogate code
 * points are not rejected.
 *
 * @throws std::range_error if @a p lies beyond the end of @a text or the
 *         sequence is cut off by the end of @a text.
 * @throws std::runtime_error if the lead byte or one of the continuation
 *         bytes is invalid.
 * See unicode standard section 3.10 table 3-5 and 3-6 for details.
 */
uint32_t decode_utf8(const std::string& text, size_t& p)
{
  // Indexing beyond the terminator would be an out-of-bounds read.
  if (p > text.size()) {
    throw std::range_error("Malformed utf-8 sequence");
  }

  const uint32_t lead = static_cast<unsigned char>(text[p]);

  size_t length = 0;
  uint32_t value = 0;

  if ((lead & 0x80) == 0x00) {
    // 0xxx.xxxx: 1 byte sequence
    p += 1;
    return lead;
  }
  else if ((lead & 0xE0) == 0xC0) {
    // 110x.xxxx: 2 byte sequence
    length = 2;
    value = lead & 0x1F;
  }
  else if ((lead & 0xF0) == 0xE0) {
    // 1110.xxxx: 3 byte sequence
    length = 3;
    value = lead & 0x0F;
  }
  else if ((lead & 0xF8) == 0xF0) {
    // 1111.0xxx: 4 byte sequence
    length = 4;
    value = lead & 0x07;
  }
  else {
    // Either a stray continuation byte (10xx.xxxx) or an invalid lead byte
    // (1111.1xxx).
    throw std::runtime_error("Malformed utf-8 sequence");
  }

  // The whole sequence has to fit into the string.
  if (p + length > text.size()) {
    throw std::range_error("Malformed utf-8 sequence");
  }

  // Fold in the continuation bytes at p+1 .. p+length-1. Deriving the index
  // from the loop counter rules out reading a wrong offset for any of them.
  for (size_t i = 1; i < length; ++i) {
    value = (value << 6) | continuation_bits(text, p + i);
  }

  p += length;
  return value;
}

} // namespace
89
90
91	UTF8Iterator::UTF8Iterator(const std::string& text_) :
92	text(text_),
93	pos(`0`),
94	chr()
95	{
96	try {
97	chr = decode_utf8(text, pos);
98	} catch (std::exception&) {
99	log_debug << "Malformed utf-8 sequence beginning with " << (reinterpret_cast<const* uint32_t*>(text.c_str() + pos)) << " found " << std::endl;
100	chr = `0`;
101	}
102	}
103
104	bool
105	UTF8Iterator::done() const
106	{
107	return pos > text.size();
108	}
109
110	UTF8Iterator&
111	UTF8Iterator::operator++() {
112	try {
113	chr = decode_utf8(text, pos);
114	} catch (std::exception&) {
115	log_debug << "Malformed utf-8 sequence beginning with " << (reinterpret_cast<const* uint32_t*>(text.c_str() + pos)) << " found " << std::endl;
116	chr = `0`;
117	++pos;
118	}
119
120	return *this;
121	}
122
123	uint32_t
124	UTF8Iterator::operator() const* {
125	return chr;
126	}
127
/* EOF */
129

Browse the source code of SuperTux/src/util/utf8_iterator.cpp