1 | // |
2 | // UTF8Encoding.cpp |
3 | // |
4 | // Library: Foundation |
5 | // Package: Text |
6 | // Module: UTF8Encoding |
7 | // |
8 | // Copyright (c) 2004-2007, Applied Informatics Software Engineering GmbH. |
9 | // and Contributors. |
10 | // |
11 | // SPDX-License-Identifier: BSL-1.0 |
12 | // |
13 | |
14 | |
15 | #include "Poco/UTF8Encoding.h" |
16 | #include "Poco/String.h" |
17 | |
18 | |
19 | namespace Poco { |
20 | |
21 | |
22 | const char* UTF8Encoding::_names[] = |
23 | { |
24 | "UTF-8" , |
25 | "UTF8" , |
26 | NULL |
27 | }; |
28 | |
29 | |
30 | const TextEncoding::CharacterMap UTF8Encoding::_charMap = |
31 | { |
32 | /* 00 */ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, |
33 | /* 10 */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, |
34 | /* 20 */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, |
35 | /* 30 */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, |
36 | /* 40 */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, |
37 | /* 50 */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, |
38 | /* 60 */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, |
39 | /* 70 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, |
40 | /* 80 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
41 | /* 90 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
42 | /* a0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
43 | /* b0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
44 | /* c0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, |
45 | /* d0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, |
46 | /* e0 */ -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, |
47 | /* f0 */ -4, -4, -4, -4, -4, -4, -4, -4, -1, -1, -1, -1, -1, -1, -1, -1, |
48 | }; |
49 | |
50 | |
51 | UTF8Encoding::UTF8Encoding() |
52 | { |
53 | } |
54 | |
55 | |
56 | UTF8Encoding::~UTF8Encoding() |
57 | { |
58 | } |
59 | |
60 | |
61 | const char* UTF8Encoding::canonicalName() const |
62 | { |
63 | return _names[0]; |
64 | } |
65 | |
66 | |
67 | bool UTF8Encoding::isA(const std::string& encodingName) const |
68 | { |
69 | for (const char** name = _names; *name; ++name) |
70 | { |
71 | if (Poco::icompare(encodingName, *name) == 0) |
72 | return true; |
73 | } |
74 | return false; |
75 | } |
76 | |
77 | |
78 | const TextEncoding::CharacterMap& UTF8Encoding::characterMap() const |
79 | { |
80 | return _charMap; |
81 | } |
82 | |
83 | |
84 | int UTF8Encoding::convert(const unsigned char* bytes) const |
85 | { |
86 | int n = _charMap[*bytes]; |
87 | int uc; |
88 | |
89 | switch (n) |
90 | { |
91 | case -1: |
92 | return -1; |
93 | case -4: |
94 | case -3: |
95 | case -2: |
96 | if (!isLegal(bytes, -n)) return -1; |
97 | uc = *bytes & ((0x07 << (n + 4)) | 0x03); |
98 | break; |
99 | default: |
100 | return n; |
101 | } |
102 | |
103 | while (n++ < -1) |
104 | { |
105 | uc <<= 6; |
106 | uc |= (*++bytes & 0x3F); |
107 | } |
108 | return uc; |
109 | } |
110 | |
111 | |
112 | int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const |
113 | { |
114 | if (ch <= 0x7F) |
115 | { |
116 | if (bytes && length >= 1) |
117 | *bytes = (unsigned char) ch; |
118 | return 1; |
119 | } |
120 | else if (ch <= 0x7FF) |
121 | { |
122 | if (bytes && length >= 2) |
123 | { |
124 | *bytes++ = (unsigned char) (((ch >> 6) & 0x1F) | 0xC0); |
125 | *bytes = (unsigned char) ((ch & 0x3F) | 0x80); |
126 | } |
127 | return 2; |
128 | } |
129 | else if (ch <= 0xFFFF) |
130 | { |
131 | if (bytes && length >= 3) |
132 | { |
133 | *bytes++ = (unsigned char) (((ch >> 12) & 0x0F) | 0xE0); |
134 | *bytes++ = (unsigned char) (((ch >> 6) & 0x3F) | 0x80); |
135 | *bytes = (unsigned char) ((ch & 0x3F) | 0x80); |
136 | } |
137 | return 3; |
138 | } |
139 | else if (ch <= 0x10FFFF) |
140 | { |
141 | if (bytes && length >= 4) |
142 | { |
143 | *bytes++ = (unsigned char) (((ch >> 18) & 0x07) | 0xF0); |
144 | *bytes++ = (unsigned char) (((ch >> 12) & 0x3F) | 0x80); |
145 | *bytes++ = (unsigned char) (((ch >> 6) & 0x3F) | 0x80); |
146 | *bytes = (unsigned char) ((ch & 0x3F) | 0x80); |
147 | } |
148 | return 4; |
149 | } |
150 | else return 0; |
151 | } |
152 | |
153 | |
154 | int UTF8Encoding::queryConvert(const unsigned char* bytes, int length) const |
155 | { |
156 | int n = _charMap[*bytes]; |
157 | int uc; |
158 | if (-n > length) |
159 | { |
160 | return n; |
161 | } |
162 | else |
163 | { |
164 | switch (n) |
165 | { |
166 | case -1: |
167 | return -1; |
168 | case -4: |
169 | case -3: |
170 | case -2: |
171 | if (!isLegal(bytes, -n)) return -1; |
172 | uc = *bytes & ((0x07 << (n + 4)) | 0x03); |
173 | break; |
174 | default: |
175 | return n; |
176 | } |
177 | while (n++ < -1) |
178 | { |
179 | uc <<= 6; |
180 | uc |= (*++bytes & 0x3F); |
181 | } |
182 | return uc; |
183 | } |
184 | } |
185 | |
186 | |
187 | int UTF8Encoding::sequenceLength(const unsigned char* bytes, int length) const |
188 | { |
189 | if (1 <= length) |
190 | { |
191 | int cc = _charMap[*bytes]; |
192 | if (cc >= 0) |
193 | return 1; |
194 | else |
195 | return -cc; |
196 | } |
197 | else return -1; |
198 | } |
199 | |
200 | |
201 | bool UTF8Encoding::isLegal(const unsigned char *bytes, int length) |
202 | { |
203 | if (0 == bytes || 0 == length) return false; |
204 | |
205 | unsigned char a; |
206 | const unsigned char* srcptr = bytes + length; |
207 | switch (length) |
208 | { |
209 | default: |
210 | return false; |
211 | // Everything else falls through when true. |
212 | case 4: |
213 | if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; |
214 | case 3: |
215 | if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; |
216 | case 2: |
217 | a = (*--srcptr); |
218 | switch (*bytes) |
219 | { |
220 | case 0xE0: |
221 | if (a < 0xA0 || a > 0xBF) return false; |
222 | break; |
223 | case 0xED: |
224 | if (a < 0x80 || a > 0x9F) return false; |
225 | break; |
226 | case 0xF0: |
227 | if (a < 0x90 || a > 0xBF) return false; |
228 | break; |
229 | case 0xF4: |
230 | if (a < 0x80 || a > 0x8F) return false; |
231 | break; |
232 | default: |
233 | if (a < 0x80 || a > 0xBF) return false; |
234 | } |
235 | case 1: |
236 | if (*bytes >= 0x80 && *bytes < 0xC2) return false; |
237 | } |
238 | return *bytes <= 0xF4; |
239 | } |
240 | |
241 | |
242 | } // namespace Poco |
243 | |