| 1 | // Copyright 2006 Nemanja Trifunovic | 
|---|
| 2 |  | 
|---|
| 3 | /* | 
|---|
| 4 | Permission is hereby granted, free of charge, to any person or organization | 
|---|
| 5 | obtaining a copy of the software and accompanying documentation covered by | 
|---|
| 6 | this license (the "Software") to use, reproduce, display, distribute, | 
|---|
| 7 | execute, and transmit the Software, and to prepare derivative works of the | 
|---|
| 8 | Software, and to permit third-parties to whom the Software is furnished to | 
|---|
| 9 | do so, all subject to the following: | 
|---|
| 10 |  | 
|---|
| 11 | The copyright notices in the Software and this entire statement, including | 
|---|
| 12 | the above license grant, this restriction and the following disclaimer, | 
|---|
| 13 | must be included in all copies of the Software, in whole or in part, and | 
|---|
| 14 | all derivative works of the Software, unless such copies or derivative | 
|---|
| 15 | works are solely in the form of machine-executable object code generated by | 
|---|
| 16 | a source language processor. | 
|---|
| 17 |  | 
|---|
| 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | 
|---|
| 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | 
|---|
| 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT | 
|---|
| 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE | 
|---|
| 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, | 
|---|
| 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | 
|---|
| 24 | DEALINGS IN THE SOFTWARE. | 
|---|
| 25 | */ | 
|---|
| 26 |  | 
|---|
| 27 |  | 
|---|
| 28 | #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 | 
|---|
| 29 | #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 | 
|---|
| 30 |  | 
|---|
| 31 | #include "core.h" | 
|---|
| 32 | #include <stdexcept> | 
|---|
| 33 |  | 
|---|
| 34 | namespace utf8 | 
|---|
| 35 | { | 
|---|
// Common base class of every exception type declared in this header.
class exception : public ::std::exception {
};

// Concrete exception types thrown by the library functions.

// Reports a rejected code point value and remembers it for the handler.
class invalid_code_point : public exception {
public:
    invalid_code_point(uint32_t codepoint) : cp(codepoint) {}
    virtual const char* what() const throw() { return "Invalid code point"; }
    // The code point value this exception was constructed with.
    uint32_t code_point() const { return cp; }
private:
    uint32_t cp;
};

// Reports malformed UTF-8 input and remembers one offending octet.
class invalid_utf8 : public exception {
public:
    invalid_utf8 (uint8_t octet) : u8(octet) {}
    virtual const char* what() const throw() { return "Invalid UTF-8"; }
    // The octet this exception was constructed with.
    uint8_t utf8_octet() const { return u8; }
private:
    uint8_t u8;
};

// Reports malformed UTF-16 input and remembers one offending 16-bit word.
class invalid_utf16 : public exception {
public:
    invalid_utf16 (uint16_t word) : u16(word) {}
    virtual const char* what() const throw() { return "Invalid UTF-16"; }
    // The 16-bit word this exception was constructed with.
    uint16_t utf16_word() const { return u16; }
private:
    uint16_t u16;
};

// Reports that a range ended before a complete sequence could be read.
class not_enough_room : public exception {
public:
    virtual const char* what() const throw() { return "Not enough space"; }
};
|---|
| 69 |  | 
|---|
| 70 | /// The library API - functions intended to be called by the users | 
|---|
| 71 |  | 
|---|
| 72 | template <typename octet_iterator> | 
|---|
| 73 | octet_iterator append(uint32_t cp, octet_iterator result) | 
|---|
| 74 | { | 
|---|
| 75 | if (!utf8::internal::is_code_point_valid(cp)) | 
|---|
| 76 | throw invalid_code_point(cp); | 
|---|
| 77 |  | 
|---|
| 78 | if (cp < 0x80)                        // one octet | 
|---|
| 79 | *(result++) = static_cast<uint8_t>(cp); | 
|---|
| 80 | else if (cp < 0x800) {                // two octets | 
|---|
| 81 | *(result++) = static_cast<uint8_t>((cp >> 6)            | 0xc0); | 
|---|
| 82 | *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80); | 
|---|
| 83 | } | 
|---|
| 84 | else if (cp < 0x10000) {              // three octets | 
|---|
| 85 | *(result++) = static_cast<uint8_t>((cp >> 12)           | 0xe0); | 
|---|
| 86 | *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80); | 
|---|
| 87 | *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80); | 
|---|
| 88 | } | 
|---|
| 89 | else {                                // four octets | 
|---|
| 90 | *(result++) = static_cast<uint8_t>((cp >> 18)           | 0xf0); | 
|---|
| 91 | *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)  | 0x80); | 
|---|
| 92 | *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80); | 
|---|
| 93 | *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80); | 
|---|
| 94 | } | 
|---|
| 95 | return result; | 
|---|
| 96 | } | 
|---|
| 97 |  | 
|---|
| 98 | template <typename octet_iterator, typename output_iterator> | 
|---|
| 99 | output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) | 
|---|
| 100 | { | 
|---|
| 101 | while (start != end) { | 
|---|
| 102 | octet_iterator sequence_start = start; | 
|---|
| 103 | internal::utf_error err_code = utf8::internal::validate_next(start, end); | 
|---|
| 104 | switch (err_code) { | 
|---|
| 105 | case internal::UTF8_OK : | 
|---|
| 106 | for (octet_iterator it = sequence_start; it != start; ++it) | 
|---|
| 107 | *out++ = *it; | 
|---|
| 108 | break; | 
|---|
| 109 | case internal::NOT_ENOUGH_ROOM: | 
|---|
| 110 | throw not_enough_room(); | 
|---|
| 111 | case internal::INVALID_LEAD: | 
|---|
| 112 | out = utf8::append (replacement, out); | 
|---|
| 113 | ++start; | 
|---|
| 114 | break; | 
|---|
| 115 | case internal::INCOMPLETE_SEQUENCE: | 
|---|
| 116 | case internal::OVERLONG_SEQUENCE: | 
|---|
| 117 | case internal::INVALID_CODE_POINT: | 
|---|
| 118 | out = utf8::append (replacement, out); | 
|---|
| 119 | ++start; | 
|---|
| 120 | // just one replacement mark for the sequence | 
|---|
| 121 | while (start != end && utf8::internal::is_trail(*start)) | 
|---|
| 122 | ++start; | 
|---|
| 123 | break; | 
|---|
| 124 | } | 
|---|
| 125 | } | 
|---|
| 126 | return out; | 
|---|
| 127 | } | 
|---|
| 128 |  | 
|---|
| 129 | template <typename octet_iterator, typename output_iterator> | 
|---|
| 130 | inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) | 
|---|
| 131 | { | 
|---|
| 132 | static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); | 
|---|
| 133 | return utf8::replace_invalid(start, end, out, replacement_marker); | 
|---|
| 134 | } | 
|---|
| 135 |  | 
|---|
| 136 | template <typename octet_iterator> | 
|---|
| 137 | uint32_t next(octet_iterator& it, octet_iterator end) | 
|---|
| 138 | { | 
|---|
| 139 | uint32_t cp = 0; | 
|---|
| 140 | internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); | 
|---|
| 141 | switch (err_code) { | 
|---|
| 142 | case internal::UTF8_OK : | 
|---|
| 143 | break; | 
|---|
| 144 | case internal::NOT_ENOUGH_ROOM : | 
|---|
| 145 | throw not_enough_room(); | 
|---|
| 146 | case internal::INVALID_LEAD : | 
|---|
| 147 | case internal::INCOMPLETE_SEQUENCE : | 
|---|
| 148 | case internal::OVERLONG_SEQUENCE : | 
|---|
| 149 | throw invalid_utf8(*it); | 
|---|
| 150 | case internal::INVALID_CODE_POINT : | 
|---|
| 151 | throw invalid_code_point(cp); | 
|---|
| 152 | } | 
|---|
| 153 | return cp; | 
|---|
| 154 | } | 
|---|
| 155 |  | 
|---|
| 156 | template <typename octet_iterator> | 
|---|
| 157 | uint32_t peek_next(octet_iterator it, octet_iterator end) | 
|---|
| 158 | { | 
|---|
| 159 | return utf8::next(it, end); | 
|---|
| 160 | } | 
|---|
| 161 |  | 
|---|
| 162 | template <typename octet_iterator> | 
|---|
| 163 | uint32_t prior(octet_iterator& it, octet_iterator start) | 
|---|
| 164 | { | 
|---|
| 165 | // can't do much if it == start | 
|---|
| 166 | if (it == start) | 
|---|
| 167 | throw not_enough_room(); | 
|---|
| 168 |  | 
|---|
| 169 | octet_iterator end = it; | 
|---|
| 170 | // Go back until we hit either a lead octet or start | 
|---|
| 171 | while (utf8::internal::is_trail(*(--it))) | 
|---|
| 172 | if (it == start) | 
|---|
| 173 | throw invalid_utf8(*it); // error - no lead byte in the sequence | 
|---|
| 174 | return utf8::peek_next(it, end); | 
|---|
| 175 | } | 
|---|
| 176 |  | 
|---|
| 177 | /// Deprecated in versions that include "prior" | 
|---|
| 178 | template <typename octet_iterator> | 
|---|
| 179 | uint32_t previous(octet_iterator& it, octet_iterator pass_start) | 
|---|
| 180 | { | 
|---|
| 181 | octet_iterator end = it; | 
|---|
| 182 | while (utf8::internal::is_trail(*(--it))) | 
|---|
| 183 | if (it == pass_start) | 
|---|
| 184 | throw invalid_utf8(*it); // error - no lead byte in the sequence | 
|---|
| 185 | octet_iterator temp = it; | 
|---|
| 186 | return utf8::next(temp, end); | 
|---|
| 187 | } | 
|---|
| 188 |  | 
|---|
| 189 | template <typename octet_iterator, typename distance_type> | 
|---|
| 190 | void advance (octet_iterator& it, distance_type n, octet_iterator end) | 
|---|
| 191 | { | 
|---|
| 192 | for (distance_type i = 0; i < n; ++i) | 
|---|
| 193 | utf8::next(it, end); | 
|---|
| 194 | } | 
|---|
| 195 |  | 
|---|
| 196 | template <typename octet_iterator> | 
|---|
| 197 | typename std::iterator_traits<octet_iterator>::difference_type | 
|---|
| 198 | distance (octet_iterator first, octet_iterator last) | 
|---|
| 199 | { | 
|---|
| 200 | typename std::iterator_traits<octet_iterator>::difference_type dist; | 
|---|
| 201 | for (dist = 0; first < last; ++dist) | 
|---|
| 202 | utf8::next(first, last); | 
|---|
| 203 | return dist; | 
|---|
| 204 | } | 
|---|
| 205 |  | 
|---|
| 206 | template <typename u16bit_iterator, typename octet_iterator> | 
|---|
| 207 | octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) | 
|---|
| 208 | { | 
|---|
| 209 | while (start != end) { | 
|---|
| 210 | uint32_t cp = utf8::internal::mask16(*start++); | 
|---|
| 211 | // Take care of surrogate pairs first | 
|---|
| 212 | if (utf8::internal::is_lead_surrogate(cp)) { | 
|---|
| 213 | if (start != end) { | 
|---|
| 214 | uint32_t trail_surrogate = utf8::internal::mask16(*start++); | 
|---|
| 215 | if (utf8::internal::is_trail_surrogate(trail_surrogate)) | 
|---|
| 216 | cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; | 
|---|
| 217 | else | 
|---|
| 218 | throw invalid_utf16(static_cast<uint16_t>(trail_surrogate)); | 
|---|
| 219 | } | 
|---|
| 220 | else | 
|---|
| 221 | throw invalid_utf16(static_cast<uint16_t>(cp)); | 
|---|
| 222 |  | 
|---|
| 223 | } | 
|---|
| 224 | // Lone trail surrogate | 
|---|
| 225 | else if (utf8::internal::is_trail_surrogate(cp)) | 
|---|
| 226 | throw invalid_utf16(static_cast<uint16_t>(cp)); | 
|---|
| 227 |  | 
|---|
| 228 | result = utf8::append(cp, result); | 
|---|
| 229 | } | 
|---|
| 230 | return result; | 
|---|
| 231 | } | 
|---|
| 232 |  | 
|---|
| 233 | template <typename u16bit_iterator, typename octet_iterator> | 
|---|
| 234 | u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) | 
|---|
| 235 | { | 
|---|
| 236 | while (start != end) { | 
|---|
| 237 | uint32_t cp = utf8::next(start, end); | 
|---|
| 238 | if (cp > 0xffff) { //make a surrogate pair | 
|---|
| 239 | *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET); | 
|---|
| 240 | *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); | 
|---|
| 241 | } | 
|---|
| 242 | else | 
|---|
| 243 | *result++ = static_cast<uint16_t>(cp); | 
|---|
| 244 | } | 
|---|
| 245 | return result; | 
|---|
| 246 | } | 
|---|
| 247 |  | 
|---|
| 248 | template <typename octet_iterator, typename u32bit_iterator> | 
|---|
| 249 | octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) | 
|---|
| 250 | { | 
|---|
| 251 | while (start != end) | 
|---|
| 252 | result = utf8::append(*(start++), result); | 
|---|
| 253 |  | 
|---|
| 254 | return result; | 
|---|
| 255 | } | 
|---|
| 256 |  | 
|---|
| 257 | template <typename octet_iterator, typename u32bit_iterator> | 
|---|
| 258 | u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) | 
|---|
| 259 | { | 
|---|
| 260 | while (start != end) | 
|---|
| 261 | (*result++) = utf8::next(start, end); | 
|---|
| 262 |  | 
|---|
| 263 | return result; | 
|---|
| 264 | } | 
|---|
| 265 |  | 
|---|
| 266 | // The iterator class | 
|---|
| 267 | template <typename octet_iterator> | 
|---|
| 268 | class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> { | 
|---|
| 269 | octet_iterator it; | 
|---|
| 270 | octet_iterator range_start; | 
|---|
| 271 | octet_iterator range_end; | 
|---|
| 272 | public: | 
|---|
| 273 | iterator () {} | 
|---|
| 274 | explicit iterator (const octet_iterator& octet_it, | 
|---|
| 275 | const octet_iterator& range_start, | 
|---|
| 276 | const octet_iterator& range_end) : | 
|---|
| 277 | it(octet_it), range_start(range_start), range_end(range_end) | 
|---|
| 278 | { | 
|---|
| 279 | if (it < range_start || it > range_end) | 
|---|
| 280 | throw std::out_of_range( "Invalid utf-8 iterator position"); | 
|---|
| 281 | } | 
|---|
| 282 | // the default "big three" are OK | 
|---|
| 283 | octet_iterator base () const { return it; } | 
|---|
| 284 | uint32_t operator * () const | 
|---|
| 285 | { | 
|---|
| 286 | octet_iterator temp = it; | 
|---|
| 287 | return utf8::next(temp, range_end); | 
|---|
| 288 | } | 
|---|
| 289 | bool operator == (const iterator& rhs) const | 
|---|
| 290 | { | 
|---|
| 291 | if (range_start != rhs.range_start || range_end != rhs.range_end) | 
|---|
| 292 | throw std::logic_error( "Comparing utf-8 iterators defined with different ranges"); | 
|---|
| 293 | return (it == rhs.it); | 
|---|
| 294 | } | 
|---|
| 295 | bool operator != (const iterator& rhs) const | 
|---|
| 296 | { | 
|---|
| 297 | return !(operator == (rhs)); | 
|---|
| 298 | } | 
|---|
| 299 | iterator& operator ++ () | 
|---|
| 300 | { | 
|---|
| 301 | utf8::next(it, range_end); | 
|---|
| 302 | return *this; | 
|---|
| 303 | } | 
|---|
| 304 | iterator operator ++ (int) | 
|---|
| 305 | { | 
|---|
| 306 | iterator temp = *this; | 
|---|
| 307 | utf8::next(it, range_end); | 
|---|
| 308 | return temp; | 
|---|
| 309 | } | 
|---|
| 310 | iterator& operator -- () | 
|---|
| 311 | { | 
|---|
| 312 | utf8::prior(it, range_start); | 
|---|
| 313 | return *this; | 
|---|
| 314 | } | 
|---|
| 315 | iterator operator -- (int) | 
|---|
| 316 | { | 
|---|
| 317 | iterator temp = *this; | 
|---|
| 318 | utf8::prior(it, range_start); | 
|---|
| 319 | return temp; | 
|---|
| 320 | } | 
|---|
| 321 | }; // class iterator | 
|---|
| 322 |  | 
|---|
| 323 | } // namespace utf8 | 
|---|
| 324 |  | 
|---|
| 325 | #endif //header guard | 
|---|
| 326 |  | 
|---|
| 327 |  | 
|---|
| 328 |  | 
|---|