UTF8String.cpp source code [ClickHouse/contrib/poco/Foundation/src/UTF8String.cpp]

1	//
2	// UTF8String.cpp
3	//
4	// Library: Foundation
5	// Package: Text
6	// Module: UTF8String
7	//
8	// Copyright (c) 2007, Applied Informatics Software Engineering GmbH.
9	// and Contributors.
10	//
11	// SPDX-License-Identifier: BSL-1.0
12	//
13
14
15	#include "Poco/UTF8String.h"
16	#include "Poco/Unicode.h"
17	#include "Poco/TextIterator.h"
18	#include "Poco/TextConverter.h"
19	#include "Poco/UTF8Encoding.h"
20	#include "Poco/NumberFormatter.h"
21	#include "Poco/Ascii.h"
22	#include <algorithm>
23
24
25	namespace Poco {
26
27
28	namespace
29	{
30	static UTF8Encoding utf8;
31	}
32
33
34	int UTF8::icompare(const std::string& str, std::string::size_type pos, std::string::size_type n, std::string::const_iterator it2, std::string::const_iterator end2)
35	{
36	std::string::size_type sz = str.size();
37	if (pos > sz) pos = sz;
38	if (pos + n > sz) n = sz - pos;
39	TextIterator uit1(str.begin() + pos, str.begin() + pos + n, utf8);
40	TextIterator uend1(str.begin() + pos + n);
41	TextIterator uit2(it2, end2, utf8);
42	TextIterator uend2(end2);
43	while (uit1 != uend1 && uit2 != uend2)
44	{
45	int c1 = Unicode::toLower(*uit1);
46	int c2 = Unicode::toLower(*uit2);
47	if (c1 < c2)
48	return -`1`;
49	else if (c1 > c2)
50	return `1`;
51	++uit1; ++uit2;
52	}
53
54	if (uit1 == uend1)
55	return uit2 == uend2 ? `0` : -`1`;
56	else
57	return `1`;
58	}
59
60
61	int UTF8::icompare(const std::string& str1, const std::string& str2)
62	{
63	return icompare(str1, `0`, str1.size(), str2.begin(), str2.end());
64	}
65
66
67	int UTF8::icompare(const std::string& str1, std::string::size_type n1, const std::string& str2, std::string::size_type n2)
68	{
69	if (n2 > str2.size()) n2 = str2.size();
70	return icompare(str1, `0`, n1, str2.begin(), str2.begin() + n2);
71	}
72
73
74	int UTF8::icompare(const std::string& str1, std::string::size_type n, const std::string& str2)
75	{
76	if (n > str2.size()) n = str2.size();
77	return icompare(str1, `0`, n, str2.begin(), str2.begin() + n);
78	}
79
80
81	int UTF8::icompare(const std::string& str1, std::string::size_type pos, std::string::size_type n, const std::string& str2)
82	{
83	return icompare(str1, pos, n, str2.begin(), str2.end());
84	}
85
86
87	int UTF8::icompare(const std::string& str1, std::string::size_type pos1, std::string::size_type n1, const std::string& str2, std::string::size_type pos2, std::string::size_type n2)
88	{
89	std::string::size_type sz2 = str2.size();
90	if (pos2 > sz2) pos2 = sz2;
91	if (pos2 + n2 > sz2) n2 = sz2 - pos2;
92	return icompare(str1, pos1, n1, str2.begin() + pos2, str2.begin() + pos2 + n2);
93	}
94
95
96	int UTF8::icompare(const std::string& str1, std::string::size_type pos1, std::string::size_type n, const std::string& str2, std::string::size_type pos2)
97	{
98	std::string::size_type sz2 = str2.size();
99	if (pos2 > sz2) pos2 = sz2;
100	if (pos2 + n > sz2) n = sz2 - pos2;
101	return icompare(str1, pos1, n, str2.begin() + pos2, str2.begin() + pos2 + n);
102	}
103
104
105	int UTF8::icompare(const std::string& str, std::string::size_type pos, std::string::size_type n, const std::string::value_type* ptr)
106	{
107	poco_check_ptr (ptr);
108	std::string str2(ptr); // TODO: optimize
109	return icompare(str, pos, n, str2.begin(), str2.end());
110	}
111
112
113	int UTF8::icompare(const std::string& str, std::string::size_type pos, const std::string::value_type* ptr)
114	{
115	return icompare(str, pos, str.size() - pos, ptr);
116	}
117
118
119	int UTF8::icompare(const std::string& str, const std::string::value_type* ptr)
120	{
121	return icompare(str, `0`, str.size(), ptr);
122	}
123
124
125	std::string UTF8::toUpper(const std::string& str)
126	{
127	std::string result;
128	TextConverter converter(utf8, utf8);
129	converter.convert(str, result, Unicode::toUpper);
130	return result;
131	}
132
133
134	std::string& UTF8::toUpperInPlace(std::string& str)
135	{
136	std::string result;
137	TextConverter converter(utf8, utf8);
138	converter.convert(str, result, Unicode::toUpper);
139	std::swap(str, result);
140	return str;
141	}
142
143
144	std::string UTF8::toLower(const std::string& str)
145	{
146	std::string result;
147	TextConverter converter(utf8, utf8);
148	converter.convert(str, result, Unicode::toLower);
149	return result;
150	}
151
152
153	std::string& UTF8::toLowerInPlace(std::string& str)
154	{
155	std::string result;
156	TextConverter converter(utf8, utf8);
157	converter.convert(str, result, Unicode::toLower);
158	std::swap(str, result);
159	return str;
160	}
161
162
163	void UTF8::removeBOM(std::string& str)
164	{
165	if (str.size() >= `3`
166	&& static_cast<unsigned char>(str [`0`]) == `0xEF`
167	&& static_cast<unsigned char>(str [`1`]) == `0xBB`
168	&& static_cast<unsigned char>(str [`2`]) == `0xBF`)
169	{
170	str.erase(`0`, `3`);
171	}
172	}
173
174	std::string UTF8::escape(const std::string &s)
175	{
176	return escape(s.begin(), s.end());
177	}
178
179	std::string UTF8::escape(const std::string::const_iterator& begin, const std::string::const_iterator& end)
180	{
181	static Poco::UInt32 offsetsFromUTF8[`6`] = {
182	`0x00000000UL`, `0x00003080UL`, `0x000E2080UL`,
183	`0x03C82080UL`, `0xFA082080UL`, `0x82082080UL`
184	};
185
186	std::string result;
187
188	std::string::const_iterator it = begin;
189
190	while(it != end)
191	{
192	Poco::UInt32 ch = `0`;
193	unsigned int sz = `0`;
194
195	do
196	{
197	ch <<= `6`;
198	ch += (unsigned char)*it ++;
199	sz++;
200	}
201	while (it != end && (*it & `0xC0`) == `0x80` && sz < `6`);
202	ch -= offsetsFromUTF8[sz-`1`];
203
204	if (ch == `'\n'`) result += "\\n";
205	else if (ch == `'\t'`) result += "\\t";
206	else if (ch == `'\r'`) result += "\\r";
207	else if (ch == `'\b'`) result += "\\b";
208	else if (ch == `'\f'`) result += "\\f";
209	else if (ch == `'\v'`) result += "\\v";
210	else if (ch == `'\a'`) result += "\\a";
211	else if (ch == `'\\'`) result += "\\\\";
212	else if (ch == `'\"'`) result += "\\\"";
213	else if (ch == `'/'`) result += "\\/";
214	else if (ch == `'\0'`) result += "\\u0000";
215	else if (ch < `32` \|\| ch == `0x7f`)
216	{
217	result += "\\u";
218	NumberFormatter::appendHex(result, (unsigned short) ch, `4`);
219	}
220	else if (ch > `0xFFFF`)
221	{
222	ch -= `0x10000`;
223	result += "\\u";
224	NumberFormatter::appendHex(result, (unsigned short) (( ch >> `10` ) & `0x03ff` ) + `0xd800`, `4`);
225	result += "\\u";
226	NumberFormatter::appendHex(result, (unsigned short) (ch & `0x03ff` ) + `0xdc00`, `4`);
227	}
228	else if (ch >= `0x80` && ch <= `0xFFFF`)
229	{
230	result += "\\u";
231	NumberFormatter::appendHex(result, (unsigned short) ch, `4`);
232	}
233	else
234	{
235	result += (char) ch;
236	}
237	}
238	return result;
239	}
240
241	std::string UTF8::unescape(const std::string &s)
242	{
243	return unescape(s.begin(), s.end());
244	}
245
246	std::string UTF8::unescape(const std::string::const_iterator& begin, const std::string::const_iterator& end)
247	{
248	std::string result;
249
250	std::string::const_iterator it = begin;
251
252	while (it != end)
253	{
254	Poco::UInt32 ch = (Poco::UInt32) *it ++;
255
256	if (ch == `'\\'`)
257	{
258	if ( it == end )
259	{
260	//Invalid sequence!
261	}
262
263	if (*it == `'n'`)
264	{
265	ch = `'\n'`;
266	it ++;
267	}
268	else if (*it == `'t'`)
269	{
270	ch = `'\t'`;
271	it ++;
272	}
273	else if (*it == `'r'`)
274	{
275	ch = `'\r'`;
276	it ++;
277	}
278	else if (*it == `'b'`)
279	{
280	ch = `'\b'`;
281	it ++;
282	}
283	else if (*it == `'f'`)
284	{
285	ch = `'\f'`;
286	it ++;
287	}
288	else if (*it == `'v'`)
289	{
290	ch = `'\v'`;
291	it ++;
292	}
293	else if (*it == `'a'`)
294	{
295	ch = `'\a'`;
296	it ++;
297	}
298	else if (*it == `'u'`)
299	{
300	char digs[`5`];
301	memset(digs, `0`, `5`);
302	unsigned int dno = `0`;
303
304	it ++;
305
306	while (it != end && Ascii::isHexDigit(it) && dno < `4`) digs[dno++] = it ++;
307	if (dno > `0`)
308	{
309	ch = strtol(digs, NULL, `16`);
310	}
311
312	if( ch >= `0xD800` && ch <= `0xDBFF` )
313	{
314	if ( it == end \|\| *it != `'\\'` )
315	{
316	//Invalid sequence!
317	}
318	else
319	{
320	it ++;
321	if ( it == end \|\| *it != `'u'` )
322	{
323	//Invalid sequence!
324	}
325	else
326	{
327	it ++;
328	}
329	}
330
331	// UTF-16 surrogate pair. Go fetch other half
332	memset(digs, `0`, `5`);
333	dno = `0`;
334	while (it != end && Ascii::isHexDigit(it) && dno < `4`) digs[dno++] = it ++;
335	if (dno > `0`)
336	{
337	Poco::UInt32 temp = strtol(digs, NULL, `16`);
338	if( temp >= `0xDC00` && temp <= `0xDFFF` )
339	{
340	ch = ( ( ( ch - `0xD800` ) << `10` ) \| ( temp - `0xDC00` ) ) + `0x10000`;
341	}
342	}
343	}
344	}
345	else if (*it == `'U'`)
346	{
347	char digs[`9`];
348	memset(digs, `0`, `9`);
349	unsigned int dno = `0`;
350
351	it ++;
352	while (it != end && Ascii::isHexDigit(*it) && dno < `8`)
353	{
354	digs[dno++] = *it ++;
355	}
356	if (dno > `0`)
357	{
358	ch = strtol(digs, NULL, `16`);
359	}
360	}
361	}
362
363	unsigned char utf8[`4`];
364	UTF8Encoding encoding;
365	int sz = encoding.convert(ch, utf8, `4`);
366	result.append((char*) utf8, sz);
367	}
368
369	return result;
370	}
371
372	} // namespace Poco
373

Browse the source code of ClickHouse/contrib/poco/Foundation/src/UTF8String.cpp