1// SuperTux
2// Copyright (C) 2009 Ingo Ruhnke <grumbel@gmail.com>
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17#include "util/utf8_iterator.hpp"
18
19#include <stdexcept>
20
21#include "util/log.hpp"
22
namespace {

bool has_multibyte_mark(unsigned char c);
uint32_t decode_utf8(const std::string& text, size_t& p);

/**
 * Returns true if @a c matches the bit pattern 10xx.xxxx, i.e. it is a
 * continuation byte (the 2nd, 3rd or 4th byte) of a multibyte UTF-8
 * sequence.
 */
bool has_multibyte_mark(unsigned char c) {
  return ((c & 0300) == 0200);
}

/**
 * Gets the unicode character at byte position @a p of UTF-8 encoded
 * @a text, then advances @a p to the next character.
 *
 * Decoding at @a p == text.size() reads the string's terminating NUL,
 * returns 0 and moves @a p one past the end of the string.
 *
 * @a p is only modified when decoding succeeds.
 *
 * @throws std::runtime_error if decoding fails; the more specific
 *         std::range_error (a subclass) is thrown when the sequence is
 *         cut short by the end of the string or @a p lies beyond it.
 * See unicode standard section 3.10 table 3-5 and 3-6 for details.
 */
uint32_t decode_utf8(const std::string& text, size_t& p)
{
  // Indexing past text.size() is undefined, so reject it explicitly.
  if (p > text.size()) throw std::range_error("Malformed utf-8 sequence");

  const uint32_t c1 = static_cast<unsigned char>(text[p]);

  // A sequence must never start with a continuation byte.
  if (has_multibyte_mark(static_cast<unsigned char>(c1))) {
    throw std::runtime_error("Malformed utf-8 sequence");
  }

  if ((c1 & 0200) == 0000) {
    // 0xxx.xxxx: 1 byte sequence
    p += 1;
    return c1;
  }
  else if ((c1 & 0340) == 0300) {
    // 110x.xxxx: 2 byte sequence
    if (p+1 >= text.size()) throw std::range_error("Malformed utf-8 sequence");
    const uint32_t c2 = static_cast<unsigned char>(text[p+1]);
    if (!has_multibyte_mark(static_cast<unsigned char>(c2))) throw std::runtime_error("Malformed utf-8 sequence");
    p += 2;
    return (c1 & 0037) << 6 | (c2 & 0077);
  }
  else if ((c1 & 0360) == 0340) {
    // 1110.xxxx: 3 byte sequence
    if (p+2 >= text.size()) throw std::range_error("Malformed utf-8 sequence");
    const uint32_t c2 = static_cast<unsigned char>(text[p+1]);
    const uint32_t c3 = static_cast<unsigned char>(text[p+2]);
    if (!has_multibyte_mark(static_cast<unsigned char>(c2))) throw std::runtime_error("Malformed utf-8 sequence");
    if (!has_multibyte_mark(static_cast<unsigned char>(c3))) throw std::runtime_error("Malformed utf-8 sequence");
    p += 3;
    return (c1 & 0017) << 12 | (c2 & 0077) << 6 | (c3 & 0077);
  }
  else if ((c1 & 0370) == 0360) {
    // 1111.0xxx: 4 byte sequence
    if (p+3 >= text.size()) throw std::range_error("Malformed utf-8 sequence");
    const uint32_t c2 = static_cast<unsigned char>(text[p+1]);
    const uint32_t c3 = static_cast<unsigned char>(text[p+2]);
    // The fourth byte of the sequence lives at offset 3, not 4.
    const uint32_t c4 = static_cast<unsigned char>(text[p+3]);
    if (!has_multibyte_mark(static_cast<unsigned char>(c2))) throw std::runtime_error("Malformed utf-8 sequence");
    if (!has_multibyte_mark(static_cast<unsigned char>(c3))) throw std::runtime_error("Malformed utf-8 sequence");
    if (!has_multibyte_mark(static_cast<unsigned char>(c4))) throw std::runtime_error("Malformed utf-8 sequence");
    p += 4;
    return (c1 & 0007) << 18 | (c2 & 0077) << 12 | (c3 & 0077) << 6 | (c4 & 0077);
  }

  // 1111.1xxx: not a valid lead byte
  throw std::runtime_error("Malformed utf-8 sequence");
}

} // namespace
89
90
91UTF8Iterator::UTF8Iterator(const std::string& text_) :
92 text(text_),
93 pos(0),
94 chr()
95{
96 try {
97 chr = decode_utf8(text, pos);
98 } catch (std::exception&) {
99 log_debug << "Malformed utf-8 sequence beginning with " << *(reinterpret_cast<const uint32_t*>(text.c_str() + pos)) << " found " << std::endl;
100 chr = 0;
101 }
102}
103
104 bool
105UTF8Iterator::done() const
106 {
107 return pos > text.size();
108 }
109
110 UTF8Iterator&
111UTF8Iterator::operator++() {
112 try {
113 chr = decode_utf8(text, pos);
114 } catch (std::exception&) {
115 log_debug << "Malformed utf-8 sequence beginning with " << *(reinterpret_cast<const uint32_t*>(text.c_str() + pos)) << " found " << std::endl;
116 chr = 0;
117 ++pos;
118 }
119
120 return *this;
121 }
122
123 uint32_t
124 UTF8Iterator::operator*() const {
125 return chr;
126 }
127
128/* EOF */
129