1 | // |
2 | // UTF8String.cpp |
3 | // |
4 | // Library: Foundation |
5 | // Package: Text |
6 | // Module: UTF8String |
7 | // |
8 | // Copyright (c) 2007, Applied Informatics Software Engineering GmbH. |
9 | // and Contributors. |
10 | // |
11 | // SPDX-License-Identifier: BSL-1.0 |
12 | // |
13 | |
14 | |
15 | #include "Poco/UTF8String.h" |
16 | #include "Poco/Unicode.h" |
17 | #include "Poco/TextIterator.h" |
18 | #include "Poco/TextConverter.h" |
19 | #include "Poco/UTF8Encoding.h" |
20 | #include "Poco/NumberFormatter.h" |
21 | #include "Poco/Ascii.h" |
22 | #include <algorithm> |
23 | |
24 | |
25 | namespace Poco { |
26 | |
27 | |
28 | namespace |
29 | { |
30 | static UTF8Encoding utf8; |
31 | } |
32 | |
33 | |
34 | int UTF8::icompare(const std::string& str, std::string::size_type pos, std::string::size_type n, std::string::const_iterator it2, std::string::const_iterator end2) |
35 | { |
36 | std::string::size_type sz = str.size(); |
37 | if (pos > sz) pos = sz; |
38 | if (pos + n > sz) n = sz - pos; |
39 | TextIterator uit1(str.begin() + pos, str.begin() + pos + n, utf8); |
40 | TextIterator uend1(str.begin() + pos + n); |
41 | TextIterator uit2(it2, end2, utf8); |
42 | TextIterator uend2(end2); |
43 | while (uit1 != uend1 && uit2 != uend2) |
44 | { |
45 | int c1 = Unicode::toLower(*uit1); |
46 | int c2 = Unicode::toLower(*uit2); |
47 | if (c1 < c2) |
48 | return -1; |
49 | else if (c1 > c2) |
50 | return 1; |
51 | ++uit1; ++uit2; |
52 | } |
53 | |
54 | if (uit1 == uend1) |
55 | return uit2 == uend2 ? 0 : -1; |
56 | else |
57 | return 1; |
58 | } |
59 | |
60 | |
61 | int UTF8::icompare(const std::string& str1, const std::string& str2) |
62 | { |
63 | return icompare(str1, 0, str1.size(), str2.begin(), str2.end()); |
64 | } |
65 | |
66 | |
67 | int UTF8::icompare(const std::string& str1, std::string::size_type n1, const std::string& str2, std::string::size_type n2) |
68 | { |
69 | if (n2 > str2.size()) n2 = str2.size(); |
70 | return icompare(str1, 0, n1, str2.begin(), str2.begin() + n2); |
71 | } |
72 | |
73 | |
74 | int UTF8::icompare(const std::string& str1, std::string::size_type n, const std::string& str2) |
75 | { |
76 | if (n > str2.size()) n = str2.size(); |
77 | return icompare(str1, 0, n, str2.begin(), str2.begin() + n); |
78 | } |
79 | |
80 | |
81 | int UTF8::icompare(const std::string& str1, std::string::size_type pos, std::string::size_type n, const std::string& str2) |
82 | { |
83 | return icompare(str1, pos, n, str2.begin(), str2.end()); |
84 | } |
85 | |
86 | |
87 | int UTF8::icompare(const std::string& str1, std::string::size_type pos1, std::string::size_type n1, const std::string& str2, std::string::size_type pos2, std::string::size_type n2) |
88 | { |
89 | std::string::size_type sz2 = str2.size(); |
90 | if (pos2 > sz2) pos2 = sz2; |
91 | if (pos2 + n2 > sz2) n2 = sz2 - pos2; |
92 | return icompare(str1, pos1, n1, str2.begin() + pos2, str2.begin() + pos2 + n2); |
93 | } |
94 | |
95 | |
96 | int UTF8::icompare(const std::string& str1, std::string::size_type pos1, std::string::size_type n, const std::string& str2, std::string::size_type pos2) |
97 | { |
98 | std::string::size_type sz2 = str2.size(); |
99 | if (pos2 > sz2) pos2 = sz2; |
100 | if (pos2 + n > sz2) n = sz2 - pos2; |
101 | return icompare(str1, pos1, n, str2.begin() + pos2, str2.begin() + pos2 + n); |
102 | } |
103 | |
104 | |
105 | int UTF8::icompare(const std::string& str, std::string::size_type pos, std::string::size_type n, const std::string::value_type* ptr) |
106 | { |
107 | poco_check_ptr (ptr); |
108 | std::string str2(ptr); // TODO: optimize |
109 | return icompare(str, pos, n, str2.begin(), str2.end()); |
110 | } |
111 | |
112 | |
113 | int UTF8::icompare(const std::string& str, std::string::size_type pos, const std::string::value_type* ptr) |
114 | { |
115 | return icompare(str, pos, str.size() - pos, ptr); |
116 | } |
117 | |
118 | |
119 | int UTF8::icompare(const std::string& str, const std::string::value_type* ptr) |
120 | { |
121 | return icompare(str, 0, str.size(), ptr); |
122 | } |
123 | |
124 | |
125 | std::string UTF8::toUpper(const std::string& str) |
126 | { |
127 | std::string result; |
128 | TextConverter converter(utf8, utf8); |
129 | converter.convert(str, result, Unicode::toUpper); |
130 | return result; |
131 | } |
132 | |
133 | |
134 | std::string& UTF8::toUpperInPlace(std::string& str) |
135 | { |
136 | std::string result; |
137 | TextConverter converter(utf8, utf8); |
138 | converter.convert(str, result, Unicode::toUpper); |
139 | std::swap(str, result); |
140 | return str; |
141 | } |
142 | |
143 | |
144 | std::string UTF8::toLower(const std::string& str) |
145 | { |
146 | std::string result; |
147 | TextConverter converter(utf8, utf8); |
148 | converter.convert(str, result, Unicode::toLower); |
149 | return result; |
150 | } |
151 | |
152 | |
153 | std::string& UTF8::toLowerInPlace(std::string& str) |
154 | { |
155 | std::string result; |
156 | TextConverter converter(utf8, utf8); |
157 | converter.convert(str, result, Unicode::toLower); |
158 | std::swap(str, result); |
159 | return str; |
160 | } |
161 | |
162 | |
163 | void UTF8::removeBOM(std::string& str) |
164 | { |
165 | if (str.size() >= 3 |
166 | && static_cast<unsigned char>(str[0]) == 0xEF |
167 | && static_cast<unsigned char>(str[1]) == 0xBB |
168 | && static_cast<unsigned char>(str[2]) == 0xBF) |
169 | { |
170 | str.erase(0, 3); |
171 | } |
172 | } |
173 | |
174 | std::string UTF8::escape(const std::string &s) |
175 | { |
176 | return escape(s.begin(), s.end()); |
177 | } |
178 | |
179 | std::string UTF8::escape(const std::string::const_iterator& begin, const std::string::const_iterator& end) |
180 | { |
181 | static Poco::UInt32 offsetsFromUTF8[6] = { |
182 | 0x00000000UL, 0x00003080UL, 0x000E2080UL, |
183 | 0x03C82080UL, 0xFA082080UL, 0x82082080UL |
184 | }; |
185 | |
186 | std::string result; |
187 | |
188 | std::string::const_iterator it = begin; |
189 | |
190 | while(it != end) |
191 | { |
192 | Poco::UInt32 ch = 0; |
193 | unsigned int sz = 0; |
194 | |
195 | do |
196 | { |
197 | ch <<= 6; |
198 | ch += (unsigned char)*it++; |
199 | sz++; |
200 | } |
201 | while (it != end && (*it & 0xC0) == 0x80 && sz < 6); |
202 | ch -= offsetsFromUTF8[sz-1]; |
203 | |
204 | if (ch == '\n') result += "\\n" ; |
205 | else if (ch == '\t') result += "\\t" ; |
206 | else if (ch == '\r') result += "\\r" ; |
207 | else if (ch == '\b') result += "\\b" ; |
208 | else if (ch == '\f') result += "\\f" ; |
209 | else if (ch == '\v') result += "\\v" ; |
210 | else if (ch == '\a') result += "\\a" ; |
211 | else if (ch == '\\') result += "\\\\" ; |
212 | else if (ch == '\"') result += "\\\"" ; |
213 | else if (ch == '/') result += "\\/" ; |
214 | else if (ch == '\0') result += "\\u0000" ; |
215 | else if (ch < 32 || ch == 0x7f) |
216 | { |
217 | result += "\\u" ; |
218 | NumberFormatter::appendHex(result, (unsigned short) ch, 4); |
219 | } |
220 | else if (ch > 0xFFFF) |
221 | { |
222 | ch -= 0x10000; |
223 | result += "\\u" ; |
224 | NumberFormatter::appendHex(result, (unsigned short) (( ch >> 10 ) & 0x03ff ) + 0xd800, 4); |
225 | result += "\\u" ; |
226 | NumberFormatter::appendHex(result, (unsigned short) (ch & 0x03ff ) + 0xdc00, 4); |
227 | } |
228 | else if (ch >= 0x80 && ch <= 0xFFFF) |
229 | { |
230 | result += "\\u" ; |
231 | NumberFormatter::appendHex(result, (unsigned short) ch, 4); |
232 | } |
233 | else |
234 | { |
235 | result += (char) ch; |
236 | } |
237 | } |
238 | return result; |
239 | } |
240 | |
241 | std::string UTF8::unescape(const std::string &s) |
242 | { |
243 | return unescape(s.begin(), s.end()); |
244 | } |
245 | |
246 | std::string UTF8::unescape(const std::string::const_iterator& begin, const std::string::const_iterator& end) |
247 | { |
248 | std::string result; |
249 | |
250 | std::string::const_iterator it = begin; |
251 | |
252 | while (it != end) |
253 | { |
254 | Poco::UInt32 ch = (Poco::UInt32) *it++; |
255 | |
256 | if (ch == '\\') |
257 | { |
258 | if ( it == end ) |
259 | { |
260 | //Invalid sequence! |
261 | } |
262 | |
263 | if (*it == 'n') |
264 | { |
265 | ch = '\n'; |
266 | it++; |
267 | } |
268 | else if (*it == 't') |
269 | { |
270 | ch = '\t'; |
271 | it++; |
272 | } |
273 | else if (*it == 'r') |
274 | { |
275 | ch = '\r'; |
276 | it++; |
277 | } |
278 | else if (*it == 'b') |
279 | { |
280 | ch = '\b'; |
281 | it++; |
282 | } |
283 | else if (*it == 'f') |
284 | { |
285 | ch = '\f'; |
286 | it++; |
287 | } |
288 | else if (*it == 'v') |
289 | { |
290 | ch = '\v'; |
291 | it++; |
292 | } |
293 | else if (*it == 'a') |
294 | { |
295 | ch = '\a'; |
296 | it++; |
297 | } |
298 | else if (*it == 'u') |
299 | { |
300 | char digs[5]; |
301 | memset(digs, 0, 5); |
302 | unsigned int dno = 0; |
303 | |
304 | it++; |
305 | |
306 | while (it != end && Ascii::isHexDigit(*it) && dno < 4) digs[dno++] = *it++; |
307 | if (dno > 0) |
308 | { |
309 | ch = strtol(digs, NULL, 16); |
310 | } |
311 | |
312 | if( ch >= 0xD800 && ch <= 0xDBFF ) |
313 | { |
314 | if ( it == end || *it != '\\' ) |
315 | { |
316 | //Invalid sequence! |
317 | } |
318 | else |
319 | { |
320 | it++; |
321 | if ( it == end || *it != 'u' ) |
322 | { |
323 | //Invalid sequence! |
324 | } |
325 | else |
326 | { |
327 | it++; |
328 | } |
329 | } |
330 | |
331 | // UTF-16 surrogate pair. Go fetch other half |
332 | memset(digs, 0, 5); |
333 | dno = 0; |
334 | while (it != end && Ascii::isHexDigit(*it) && dno < 4) digs[dno++] = *it++; |
335 | if (dno > 0) |
336 | { |
337 | Poco::UInt32 temp = strtol(digs, NULL, 16); |
338 | if( temp >= 0xDC00 && temp <= 0xDFFF ) |
339 | { |
340 | ch = ( ( ( ch - 0xD800 ) << 10 ) | ( temp - 0xDC00 ) ) + 0x10000; |
341 | } |
342 | } |
343 | } |
344 | } |
345 | else if (*it == 'U') |
346 | { |
347 | char digs[9]; |
348 | memset(digs, 0, 9); |
349 | unsigned int dno = 0; |
350 | |
351 | it++; |
352 | while (it != end && Ascii::isHexDigit(*it) && dno < 8) |
353 | { |
354 | digs[dno++] = *it++; |
355 | } |
356 | if (dno > 0) |
357 | { |
358 | ch = strtol(digs, NULL, 16); |
359 | } |
360 | } |
361 | } |
362 | |
363 | unsigned char utf8[4]; |
364 | UTF8Encoding encoding; |
365 | int sz = encoding.convert(ch, utf8, 4); |
366 | result.append((char*) utf8, sz); |
367 | } |
368 | |
369 | return result; |
370 | } |
371 | |
372 | } // namespace Poco |
373 | |