1//
2// UTF8String.cpp
3//
4// Library: Foundation
5// Package: Text
6// Module: UTF8String
7//
8// Copyright (c) 2007, Applied Informatics Software Engineering GmbH.
9// and Contributors.
10//
11// SPDX-License-Identifier: BSL-1.0
12//
13
14
15#include "Poco/UTF8String.h"
16#include "Poco/Unicode.h"
17#include "Poco/TextIterator.h"
18#include "Poco/TextConverter.h"
19#include "Poco/UTF8Encoding.h"
20#include "Poco/NumberFormatter.h"
21#include "Poco/Ascii.h"
22#include <algorithm>
23
24
25namespace Poco {
26
27
28namespace
29{
30 static UTF8Encoding utf8;
31}
32
33
34int UTF8::icompare(const std::string& str, std::string::size_type pos, std::string::size_type n, std::string::const_iterator it2, std::string::const_iterator end2)
35{
36 std::string::size_type sz = str.size();
37 if (pos > sz) pos = sz;
38 if (pos + n > sz) n = sz - pos;
39 TextIterator uit1(str.begin() + pos, str.begin() + pos + n, utf8);
40 TextIterator uend1(str.begin() + pos + n);
41 TextIterator uit2(it2, end2, utf8);
42 TextIterator uend2(end2);
43 while (uit1 != uend1 && uit2 != uend2)
44 {
45 int c1 = Unicode::toLower(*uit1);
46 int c2 = Unicode::toLower(*uit2);
47 if (c1 < c2)
48 return -1;
49 else if (c1 > c2)
50 return 1;
51 ++uit1; ++uit2;
52 }
53
54 if (uit1 == uend1)
55 return uit2 == uend2 ? 0 : -1;
56 else
57 return 1;
58}
59
60
61int UTF8::icompare(const std::string& str1, const std::string& str2)
62{
63 return icompare(str1, 0, str1.size(), str2.begin(), str2.end());
64}
65
66
67int UTF8::icompare(const std::string& str1, std::string::size_type n1, const std::string& str2, std::string::size_type n2)
68{
69 if (n2 > str2.size()) n2 = str2.size();
70 return icompare(str1, 0, n1, str2.begin(), str2.begin() + n2);
71}
72
73
74int UTF8::icompare(const std::string& str1, std::string::size_type n, const std::string& str2)
75{
76 if (n > str2.size()) n = str2.size();
77 return icompare(str1, 0, n, str2.begin(), str2.begin() + n);
78}
79
80
81int UTF8::icompare(const std::string& str1, std::string::size_type pos, std::string::size_type n, const std::string& str2)
82{
83 return icompare(str1, pos, n, str2.begin(), str2.end());
84}
85
86
87int UTF8::icompare(const std::string& str1, std::string::size_type pos1, std::string::size_type n1, const std::string& str2, std::string::size_type pos2, std::string::size_type n2)
88{
89 std::string::size_type sz2 = str2.size();
90 if (pos2 > sz2) pos2 = sz2;
91 if (pos2 + n2 > sz2) n2 = sz2 - pos2;
92 return icompare(str1, pos1, n1, str2.begin() + pos2, str2.begin() + pos2 + n2);
93}
94
95
96int UTF8::icompare(const std::string& str1, std::string::size_type pos1, std::string::size_type n, const std::string& str2, std::string::size_type pos2)
97{
98 std::string::size_type sz2 = str2.size();
99 if (pos2 > sz2) pos2 = sz2;
100 if (pos2 + n > sz2) n = sz2 - pos2;
101 return icompare(str1, pos1, n, str2.begin() + pos2, str2.begin() + pos2 + n);
102}
103
104
105int UTF8::icompare(const std::string& str, std::string::size_type pos, std::string::size_type n, const std::string::value_type* ptr)
106{
107 poco_check_ptr (ptr);
108 std::string str2(ptr); // TODO: optimize
109 return icompare(str, pos, n, str2.begin(), str2.end());
110}
111
112
113int UTF8::icompare(const std::string& str, std::string::size_type pos, const std::string::value_type* ptr)
114{
115 return icompare(str, pos, str.size() - pos, ptr);
116}
117
118
119int UTF8::icompare(const std::string& str, const std::string::value_type* ptr)
120{
121 return icompare(str, 0, str.size(), ptr);
122}
123
124
125std::string UTF8::toUpper(const std::string& str)
126{
127 std::string result;
128 TextConverter converter(utf8, utf8);
129 converter.convert(str, result, Unicode::toUpper);
130 return result;
131}
132
133
134std::string& UTF8::toUpperInPlace(std::string& str)
135{
136 std::string result;
137 TextConverter converter(utf8, utf8);
138 converter.convert(str, result, Unicode::toUpper);
139 std::swap(str, result);
140 return str;
141}
142
143
144std::string UTF8::toLower(const std::string& str)
145{
146 std::string result;
147 TextConverter converter(utf8, utf8);
148 converter.convert(str, result, Unicode::toLower);
149 return result;
150}
151
152
153std::string& UTF8::toLowerInPlace(std::string& str)
154{
155 std::string result;
156 TextConverter converter(utf8, utf8);
157 converter.convert(str, result, Unicode::toLower);
158 std::swap(str, result);
159 return str;
160}
161
162
163void UTF8::removeBOM(std::string& str)
164{
165 if (str.size() >= 3
166 && static_cast<unsigned char>(str[0]) == 0xEF
167 && static_cast<unsigned char>(str[1]) == 0xBB
168 && static_cast<unsigned char>(str[2]) == 0xBF)
169 {
170 str.erase(0, 3);
171 }
172}
173
174std::string UTF8::escape(const std::string &s)
175{
176 return escape(s.begin(), s.end());
177}
178
179std::string UTF8::escape(const std::string::const_iterator& begin, const std::string::const_iterator& end)
180{
181 static Poco::UInt32 offsetsFromUTF8[6] = {
182 0x00000000UL, 0x00003080UL, 0x000E2080UL,
183 0x03C82080UL, 0xFA082080UL, 0x82082080UL
184 };
185
186 std::string result;
187
188 std::string::const_iterator it = begin;
189
190 while(it != end)
191 {
192 Poco::UInt32 ch = 0;
193 unsigned int sz = 0;
194
195 do
196 {
197 ch <<= 6;
198 ch += (unsigned char)*it++;
199 sz++;
200 }
201 while (it != end && (*it & 0xC0) == 0x80 && sz < 6);
202 ch -= offsetsFromUTF8[sz-1];
203
204 if (ch == '\n') result += "\\n";
205 else if (ch == '\t') result += "\\t";
206 else if (ch == '\r') result += "\\r";
207 else if (ch == '\b') result += "\\b";
208 else if (ch == '\f') result += "\\f";
209 else if (ch == '\v') result += "\\v";
210 else if (ch == '\a') result += "\\a";
211 else if (ch == '\\') result += "\\\\";
212 else if (ch == '\"') result += "\\\"";
213 else if (ch == '/') result += "\\/";
214 else if (ch == '\0') result += "\\u0000";
215 else if (ch < 32 || ch == 0x7f)
216 {
217 result += "\\u";
218 NumberFormatter::appendHex(result, (unsigned short) ch, 4);
219 }
220 else if (ch > 0xFFFF)
221 {
222 ch -= 0x10000;
223 result += "\\u";
224 NumberFormatter::appendHex(result, (unsigned short) (( ch >> 10 ) & 0x03ff ) + 0xd800, 4);
225 result += "\\u";
226 NumberFormatter::appendHex(result, (unsigned short) (ch & 0x03ff ) + 0xdc00, 4);
227 }
228 else if (ch >= 0x80 && ch <= 0xFFFF)
229 {
230 result += "\\u";
231 NumberFormatter::appendHex(result, (unsigned short) ch, 4);
232 }
233 else
234 {
235 result += (char) ch;
236 }
237 }
238 return result;
239}
240
241std::string UTF8::unescape(const std::string &s)
242{
243 return unescape(s.begin(), s.end());
244}
245
246std::string UTF8::unescape(const std::string::const_iterator& begin, const std::string::const_iterator& end)
247{
248 std::string result;
249
250 std::string::const_iterator it = begin;
251
252 while (it != end)
253 {
254 Poco::UInt32 ch = (Poco::UInt32) *it++;
255
256 if (ch == '\\')
257 {
258 if ( it == end )
259 {
260 //Invalid sequence!
261 }
262
263 if (*it == 'n')
264 {
265 ch = '\n';
266 it++;
267 }
268 else if (*it == 't')
269 {
270 ch = '\t';
271 it++;
272 }
273 else if (*it == 'r')
274 {
275 ch = '\r';
276 it++;
277 }
278 else if (*it == 'b')
279 {
280 ch = '\b';
281 it++;
282 }
283 else if (*it == 'f')
284 {
285 ch = '\f';
286 it++;
287 }
288 else if (*it == 'v')
289 {
290 ch = '\v';
291 it++;
292 }
293 else if (*it == 'a')
294 {
295 ch = '\a';
296 it++;
297 }
298 else if (*it == 'u')
299 {
300 char digs[5];
301 memset(digs, 0, 5);
302 unsigned int dno = 0;
303
304 it++;
305
306 while (it != end && Ascii::isHexDigit(*it) && dno < 4) digs[dno++] = *it++;
307 if (dno > 0)
308 {
309 ch = strtol(digs, NULL, 16);
310 }
311
312 if( ch >= 0xD800 && ch <= 0xDBFF )
313 {
314 if ( it == end || *it != '\\' )
315 {
316 //Invalid sequence!
317 }
318 else
319 {
320 it++;
321 if ( it == end || *it != 'u' )
322 {
323 //Invalid sequence!
324 }
325 else
326 {
327 it++;
328 }
329 }
330
331 // UTF-16 surrogate pair. Go fetch other half
332 memset(digs, 0, 5);
333 dno = 0;
334 while (it != end && Ascii::isHexDigit(*it) && dno < 4) digs[dno++] = *it++;
335 if (dno > 0)
336 {
337 Poco::UInt32 temp = strtol(digs, NULL, 16);
338 if( temp >= 0xDC00 && temp <= 0xDFFF )
339 {
340 ch = ( ( ( ch - 0xD800 ) << 10 ) | ( temp - 0xDC00 ) ) + 0x10000;
341 }
342 }
343 }
344 }
345 else if (*it == 'U')
346 {
347 char digs[9];
348 memset(digs, 0, 9);
349 unsigned int dno = 0;
350
351 it++;
352 while (it != end && Ascii::isHexDigit(*it) && dno < 8)
353 {
354 digs[dno++] = *it++;
355 }
356 if (dno > 0)
357 {
358 ch = strtol(digs, NULL, 16);
359 }
360 }
361 }
362
363 unsigned char utf8[4];
364 UTF8Encoding encoding;
365 int sz = encoding.convert(ch, utf8, 4);
366 result.append((char*) utf8, sz);
367 }
368
369 return result;
370}
371
372} // namespace Poco
373