1 | // |
2 | // UTF8Encoding.cpp |
3 | // |
4 | // Library: Foundation |
5 | // Package: Text |
6 | // Module: UTF8Encoding |
7 | // |
8 | // Copyright (c) 2004-2007, Applied Informatics Software Engineering GmbH. |
9 | // and Contributors. |
10 | // |
11 | // SPDX-License-Identifier: BSL-1.0 |
12 | // |
13 | |
14 | |
15 | #include "Poco/UTF8Encoding.h" |
16 | #include "Poco/String.h" |
17 | |
18 | |
19 | namespace Poco { |
20 | |
21 | |
// Names under which this encoding is recognized. The first entry is the one
// returned by canonicalName(); isA() compares its argument against every
// entry. The trailing NULL is the sentinel that terminates the scan in isA().
const char* UTF8Encoding::_names[] =
{
	"UTF-8",
	"UTF8",
	NULL
};
28 | |
29 | |
// Per-byte lookup table indexed by the first byte of a sequence.
//   - Entries 0x00..0x7F hold the byte value itself; the decoders in this
//     file return such non-negative entries directly as the character.
//   - Negative entries: sequenceLength() returns the magnitude as the number
//     of bytes in the sequence (-2, -3, -4 for lead bytes 0xC0..0xF7).
//   - -1 marks bytes 0x80..0xBF and 0xFE/0xFF; -5 and -6 mark 0xF8..0xFD.
//     convert() and queryConvert() report all three as invalid (-1) once
//     enough bytes are available.
const TextEncoding::CharacterMap UTF8Encoding::_charMap =
{
	/* 00 */ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	/* 10 */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
	/* 20 */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
	/* 30 */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
	/* 40 */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
	/* 50 */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
	/* 60 */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
	/* 70 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
	/* 80 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 90 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* a0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* b0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* c0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* d0 */ -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
	/* e0 */ -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
	/* f0 */ -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -5, -6, -6, -1, -1,
};
49 | |
50 | |
// Default constructor. The body is empty: the lookup tables defined in this
// file (_names, _charMap) are static data and need no per-instance setup.
UTF8Encoding::UTF8Encoding()
{
}
54 | |
55 | |
// Destructor. The body is empty; nothing in this file acquires a resource
// that would have to be released here.
UTF8Encoding::~UTF8Encoding()
{
}
59 | |
60 | |
61 | const char* UTF8Encoding::canonicalName() const |
62 | { |
63 | return _names[0]; |
64 | } |
65 | |
66 | |
67 | bool UTF8Encoding::isA(const std::string& encodingName) const |
68 | { |
69 | for (const char** name = _names; *name; ++name) |
70 | { |
71 | if (Poco::icompare(encodingName, *name) == 0) |
72 | return true; |
73 | } |
74 | return false; |
75 | } |
76 | |
77 | |
78 | const TextEncoding::CharacterMap& UTF8Encoding::characterMap() const |
79 | { |
80 | return _charMap; |
81 | } |
82 | |
83 | |
84 | int UTF8Encoding::convert(const unsigned char* bytes) const |
85 | { |
86 | int n = _charMap[*bytes]; |
87 | int uc; |
88 | |
89 | switch (n) |
90 | { |
91 | case -6: |
92 | case -5: |
93 | case -1: |
94 | return -1; |
95 | case -4: |
96 | case -3: |
97 | case -2: |
98 | if (!isLegal(bytes, -n)) return -1; |
99 | uc = *bytes & ((0x07 << (n + 4)) | 0x03); |
100 | break; |
101 | default: |
102 | return n; |
103 | } |
104 | |
105 | while (n++ < -1) |
106 | { |
107 | uc <<= 6; |
108 | uc |= (*++bytes & 0x3F); |
109 | } |
110 | return uc; |
111 | } |
112 | |
113 | |
114 | int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const |
115 | { |
116 | if (ch <= 0x7F) |
117 | { |
118 | if (bytes && length >= 1) |
119 | *bytes = (unsigned char) ch; |
120 | return 1; |
121 | } |
122 | else if (ch <= 0x7FF) |
123 | { |
124 | if (bytes && length >= 2) |
125 | { |
126 | *bytes++ = (unsigned char) (((ch >> 6) & 0x1F) | 0xC0); |
127 | *bytes = (unsigned char) ((ch & 0x3F) | 0x80); |
128 | } |
129 | return 2; |
130 | } |
131 | else if (ch <= 0xFFFF) |
132 | { |
133 | if (bytes && length >= 3) |
134 | { |
135 | *bytes++ = (unsigned char) (((ch >> 12) & 0x0F) | 0xE0); |
136 | *bytes++ = (unsigned char) (((ch >> 6) & 0x3F) | 0x80); |
137 | *bytes = (unsigned char) ((ch & 0x3F) | 0x80); |
138 | } |
139 | return 3; |
140 | } |
141 | else if (ch <= 0x10FFFF) |
142 | { |
143 | if (bytes && length >= 4) |
144 | { |
145 | *bytes++ = (unsigned char) (((ch >> 18) & 0x07) | 0xF0); |
146 | *bytes++ = (unsigned char) (((ch >> 12) & 0x3F) | 0x80); |
147 | *bytes++ = (unsigned char) (((ch >> 6) & 0x3F) | 0x80); |
148 | *bytes = (unsigned char) ((ch & 0x3F) | 0x80); |
149 | } |
150 | return 4; |
151 | } |
152 | else return 0; |
153 | } |
154 | |
155 | |
156 | int UTF8Encoding::queryConvert(const unsigned char* bytes, int length) const |
157 | { |
158 | int n = _charMap[*bytes]; |
159 | int uc; |
160 | if (-n > length) |
161 | { |
162 | return n; |
163 | } |
164 | else |
165 | { |
166 | switch (n) |
167 | { |
168 | case -6: |
169 | case -5: |
170 | case -1: |
171 | return -1; |
172 | case -4: |
173 | case -3: |
174 | case -2: |
175 | if (!isLegal(bytes, -n)) return -1; |
176 | uc = *bytes & ((0x07 << (n + 4)) | 0x03); |
177 | break; |
178 | default: |
179 | return n; |
180 | } |
181 | while (n++ < -1) |
182 | { |
183 | uc <<= 6; |
184 | uc |= (*++bytes & 0x3F); |
185 | } |
186 | return uc; |
187 | } |
188 | } |
189 | |
190 | |
191 | int UTF8Encoding::sequenceLength(const unsigned char* bytes, int length) const |
192 | { |
193 | if (1 <= length) |
194 | { |
195 | int cc = _charMap[*bytes]; |
196 | if (cc >= 0) |
197 | return 1; |
198 | else |
199 | return -cc; |
200 | } |
201 | else return -1; |
202 | } |
203 | |
204 | |
205 | bool UTF8Encoding::isLegal(const unsigned char *bytes, int length) |
206 | { |
207 | if (0 == bytes || 0 == length) return false; |
208 | |
209 | unsigned char a; |
210 | const unsigned char* srcptr = bytes + length; |
211 | switch (length) |
212 | { |
213 | default: |
214 | return false; |
215 | // Everything else falls through when true. |
216 | case 4: |
217 | if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; |
218 | case 3: |
219 | if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; |
220 | case 2: |
221 | a = (*--srcptr); |
222 | switch (*bytes) |
223 | { |
224 | case 0xE0: |
225 | if (a < 0xA0 || a > 0xBF) return false; |
226 | break; |
227 | case 0xED: |
228 | if (a < 0x80 || a > 0x9F) return false; |
229 | break; |
230 | case 0xF0: |
231 | if (a < 0x90 || a > 0xBF) return false; |
232 | break; |
233 | case 0xF4: |
234 | if (a < 0x80 || a > 0x8F) return false; |
235 | break; |
236 | default: |
237 | if (a < 0x80 || a > 0xBF) return false; |
238 | } |
239 | case 1: |
240 | if (*bytes >= 0x80 && *bytes < 0xC2) return false; |
241 | } |
242 | return *bytes <= 0xF4; |
243 | } |
244 | |
245 | |
246 | } // namespace Poco |
247 | |