| 1 | // |
| 2 | // UTF8Encoding.cpp |
| 3 | // |
| 4 | // Library: Foundation |
| 5 | // Package: Text |
| 6 | // Module: UTF8Encoding |
| 7 | // |
| 8 | // Copyright (c) 2004-2007, Applied Informatics Software Engineering GmbH. |
| 9 | // and Contributors. |
| 10 | // |
| 11 | // SPDX-License-Identifier: BSL-1.0 |
| 12 | // |
| 13 | |
| 14 | |
| 15 | #include "Poco/UTF8Encoding.h" |
| 16 | #include "Poco/String.h" |
| 17 | |
| 18 | |
| 19 | namespace Poco { |
| 20 | |
| 21 | |
| 22 | const char* UTF8Encoding::_names[] = |
| 23 | { |
| 24 | "UTF-8" , |
| 25 | "UTF8" , |
| 26 | NULL |
| 27 | }; |
| 28 | |
| 29 | |
| 30 | const TextEncoding::CharacterMap UTF8Encoding::_charMap = |
| 31 | { |
| 32 | /* 00 */ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, |
| 33 | /* 10 */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, |
| 34 | /* 20 */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, |
| 35 | /* 30 */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, |
| 36 | /* 40 */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, |
| 37 | /* 50 */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, |
| 38 | /* 60 */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, |
| 39 | /* 70 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, |
| 40 | /* 80 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
| 41 | /* 90 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
| 42 | /* a0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
| 43 | /* b0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
| 44 | /* c0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, |
| 45 | /* d0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, |
| 46 | /* e0 */ -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, |
| 47 | /* f0 */ -4, -4, -4, -4, -4, -4, -4, -4, -1, -1, -1, -1, -1, -1, -1, -1, |
| 48 | }; |
| 49 | |
| 50 | |
| 51 | UTF8Encoding::UTF8Encoding() |
| 52 | { |
| 53 | } |
| 54 | |
| 55 | |
| 56 | UTF8Encoding::~UTF8Encoding() |
| 57 | { |
| 58 | } |
| 59 | |
| 60 | |
| 61 | const char* UTF8Encoding::canonicalName() const |
| 62 | { |
| 63 | return _names[0]; |
| 64 | } |
| 65 | |
| 66 | |
| 67 | bool UTF8Encoding::isA(const std::string& encodingName) const |
| 68 | { |
| 69 | for (const char** name = _names; *name; ++name) |
| 70 | { |
| 71 | if (Poco::icompare(encodingName, *name) == 0) |
| 72 | return true; |
| 73 | } |
| 74 | return false; |
| 75 | } |
| 76 | |
| 77 | |
| 78 | const TextEncoding::CharacterMap& UTF8Encoding::characterMap() const |
| 79 | { |
| 80 | return _charMap; |
| 81 | } |
| 82 | |
| 83 | |
| 84 | int UTF8Encoding::convert(const unsigned char* bytes) const |
| 85 | { |
| 86 | int n = _charMap[*bytes]; |
| 87 | int uc; |
| 88 | |
| 89 | switch (n) |
| 90 | { |
| 91 | case -1: |
| 92 | return -1; |
| 93 | case -4: |
| 94 | case -3: |
| 95 | case -2: |
| 96 | if (!isLegal(bytes, -n)) return -1; |
| 97 | uc = *bytes & ((0x07 << (n + 4)) | 0x03); |
| 98 | break; |
| 99 | default: |
| 100 | return n; |
| 101 | } |
| 102 | |
| 103 | while (n++ < -1) |
| 104 | { |
| 105 | uc <<= 6; |
| 106 | uc |= (*++bytes & 0x3F); |
| 107 | } |
| 108 | return uc; |
| 109 | } |
| 110 | |
| 111 | |
| 112 | int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const |
| 113 | { |
| 114 | if (ch <= 0x7F) |
| 115 | { |
| 116 | if (bytes && length >= 1) |
| 117 | *bytes = (unsigned char) ch; |
| 118 | return 1; |
| 119 | } |
| 120 | else if (ch <= 0x7FF) |
| 121 | { |
| 122 | if (bytes && length >= 2) |
| 123 | { |
| 124 | *bytes++ = (unsigned char) (((ch >> 6) & 0x1F) | 0xC0); |
| 125 | *bytes = (unsigned char) ((ch & 0x3F) | 0x80); |
| 126 | } |
| 127 | return 2; |
| 128 | } |
| 129 | else if (ch <= 0xFFFF) |
| 130 | { |
| 131 | if (bytes && length >= 3) |
| 132 | { |
| 133 | *bytes++ = (unsigned char) (((ch >> 12) & 0x0F) | 0xE0); |
| 134 | *bytes++ = (unsigned char) (((ch >> 6) & 0x3F) | 0x80); |
| 135 | *bytes = (unsigned char) ((ch & 0x3F) | 0x80); |
| 136 | } |
| 137 | return 3; |
| 138 | } |
| 139 | else if (ch <= 0x10FFFF) |
| 140 | { |
| 141 | if (bytes && length >= 4) |
| 142 | { |
| 143 | *bytes++ = (unsigned char) (((ch >> 18) & 0x07) | 0xF0); |
| 144 | *bytes++ = (unsigned char) (((ch >> 12) & 0x3F) | 0x80); |
| 145 | *bytes++ = (unsigned char) (((ch >> 6) & 0x3F) | 0x80); |
| 146 | *bytes = (unsigned char) ((ch & 0x3F) | 0x80); |
| 147 | } |
| 148 | return 4; |
| 149 | } |
| 150 | else return 0; |
| 151 | } |
| 152 | |
| 153 | |
| 154 | int UTF8Encoding::queryConvert(const unsigned char* bytes, int length) const |
| 155 | { |
| 156 | int n = _charMap[*bytes]; |
| 157 | int uc; |
| 158 | if (-n > length) |
| 159 | { |
| 160 | return n; |
| 161 | } |
| 162 | else |
| 163 | { |
| 164 | switch (n) |
| 165 | { |
| 166 | case -1: |
| 167 | return -1; |
| 168 | case -4: |
| 169 | case -3: |
| 170 | case -2: |
| 171 | if (!isLegal(bytes, -n)) return -1; |
| 172 | uc = *bytes & ((0x07 << (n + 4)) | 0x03); |
| 173 | break; |
| 174 | default: |
| 175 | return n; |
| 176 | } |
| 177 | while (n++ < -1) |
| 178 | { |
| 179 | uc <<= 6; |
| 180 | uc |= (*++bytes & 0x3F); |
| 181 | } |
| 182 | return uc; |
| 183 | } |
| 184 | } |
| 185 | |
| 186 | |
| 187 | int UTF8Encoding::sequenceLength(const unsigned char* bytes, int length) const |
| 188 | { |
| 189 | if (1 <= length) |
| 190 | { |
| 191 | int cc = _charMap[*bytes]; |
| 192 | if (cc >= 0) |
| 193 | return 1; |
| 194 | else |
| 195 | return -cc; |
| 196 | } |
| 197 | else return -1; |
| 198 | } |
| 199 | |
| 200 | |
| 201 | bool UTF8Encoding::isLegal(const unsigned char *bytes, int length) |
| 202 | { |
| 203 | if (0 == bytes || 0 == length) return false; |
| 204 | |
| 205 | unsigned char a; |
| 206 | const unsigned char* srcptr = bytes + length; |
| 207 | switch (length) |
| 208 | { |
| 209 | default: |
| 210 | return false; |
| 211 | // Everything else falls through when true. |
| 212 | case 4: |
| 213 | if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; |
| 214 | case 3: |
| 215 | if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; |
| 216 | case 2: |
| 217 | a = (*--srcptr); |
| 218 | switch (*bytes) |
| 219 | { |
| 220 | case 0xE0: |
| 221 | if (a < 0xA0 || a > 0xBF) return false; |
| 222 | break; |
| 223 | case 0xED: |
| 224 | if (a < 0x80 || a > 0x9F) return false; |
| 225 | break; |
| 226 | case 0xF0: |
| 227 | if (a < 0x90 || a > 0xBF) return false; |
| 228 | break; |
| 229 | case 0xF4: |
| 230 | if (a < 0x80 || a > 0x8F) return false; |
| 231 | break; |
| 232 | default: |
| 233 | if (a < 0x80 || a > 0xBF) return false; |
| 234 | } |
| 235 | case 1: |
| 236 | if (*bytes >= 0x80 && *bytes < 0xC2) return false; |
| 237 | } |
| 238 | return *bytes <= 0xF4; |
| 239 | } |
| 240 | |
| 241 | |
| 242 | } // namespace Poco |
| 243 | |