UTF8Encoding.cpp source code [ClickHouse/contrib/poco/Foundation/src/UTF8Encoding.cpp]

1	//
2	// UTF8Encoding.cpp
3	//
4	// Library: Foundation
5	// Package: Text
6	// Module: UTF8Encoding
7	//
8	// Copyright (c) 2004-2007, Applied Informatics Software Engineering GmbH.
9	// and Contributors.
10	//
11	// SPDX-License-Identifier: BSL-1.0
12	//
13
14
15	#include "Poco/UTF8Encoding.h"
16	#include "Poco/String.h"
17
18
19	namespace Poco {
20
21
22	const char* UTF8Encoding::_names[] =
23	{
24	"UTF-8",
25	"UTF8",
26	NULL
27	};
28
29
30	const TextEncoding::CharacterMap UTF8Encoding::_charMap =
31	{
32	/ 00 / `0x00`, `0x01`, `0x02`, `0x03`, `0x04`, `0x05`, `0x06`, `0x07`, `0x08`, `0x09`, `0x0a`, `0x0b`, `0x0c`, `0x0d`, `0x0e`, `0x0f`,
33	/ 10 / `0x10`, `0x11`, `0x12`, `0x13`, `0x14`, `0x15`, `0x16`, `0x17`, `0x18`, `0x19`, `0x1a`, `0x1b`, `0x1c`, `0x1d`, `0x1e`, `0x1f`,
34	/ 20 / `0x20`, `0x21`, `0x22`, `0x23`, `0x24`, `0x25`, `0x26`, `0x27`, `0x28`, `0x29`, `0x2a`, `0x2b`, `0x2c`, `0x2d`, `0x2e`, `0x2f`,
35	/ 30 / `0x30`, `0x31`, `0x32`, `0x33`, `0x34`, `0x35`, `0x36`, `0x37`, `0x38`, `0x39`, `0x3a`, `0x3b`, `0x3c`, `0x3d`, `0x3e`, `0x3f`,
36	/ 40 / `0x40`, `0x41`, `0x42`, `0x43`, `0x44`, `0x45`, `0x46`, `0x47`, `0x48`, `0x49`, `0x4a`, `0x4b`, `0x4c`, `0x4d`, `0x4e`, `0x4f`,
37	/ 50 / `0x50`, `0x51`, `0x52`, `0x53`, `0x54`, `0x55`, `0x56`, `0x57`, `0x58`, `0x59`, `0x5a`, `0x5b`, `0x5c`, `0x5d`, `0x5e`, `0x5f`,
38	/ 60 / `0x60`, `0x61`, `0x62`, `0x63`, `0x64`, `0x65`, `0x66`, `0x67`, `0x68`, `0x69`, `0x6a`, `0x6b`, `0x6c`, `0x6d`, `0x6e`, `0x6f`,
39	/ 70 / `0x70`, `0x71`, `0x72`, `0x73`, `0x74`, `0x75`, `0x76`, `0x77`, `0x78`, `0x79`, `0x7a`, `0x7b`, `0x7c`, `0x7d`, `0x7e`, `0x7f`,
40	/ 80 / -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
41	/ 90 / -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
42	/ a0 / -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
43	/ b0 / -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
44	/ c0 / -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`,
45	/ d0 / -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`,
46	/ e0 / -`3`, -`3`, -`3`, -`3`, -`3`, -`3`, -`3`, -`3`, -`3`, -`3`, -`3`, -`3`, -`3`, -`3`, -`3`, -`3`,
47	/ f0 / -`4`, -`4`, -`4`, -`4`, -`4`, -`4`, -`4`, -`4`, -`5`, -`5`, -`5`, -`5`, -`6`, -`6`, -`1`, -`1`,
48	};
49
50
51	UTF8Encoding::UTF8Encoding()
52	{
53	}
54
55
56	UTF8Encoding::~UTF8Encoding()
57	{
58	}
59
60
61	const char* UTF8Encoding::canonicalName() const
62	{
63	return _names[`0`];
64	}
65
66
67	bool UTF8Encoding::isA(const std::string& encodingName) const
68	{
69	for (const char** name = _names; *name; ++name)
70	{
71	if (Poco::icompare(encodingName, *name) == `0`)
72	return true;
73	}
74	return false;
75	}
76
77
78	const TextEncoding::CharacterMap& UTF8Encoding::characterMap() const
79	{
80	return _charMap;
81	}
82
83
84	int UTF8Encoding::convert(const unsigned char* bytes) const
85	{
86	int n = _charMap[*bytes];
87	int uc;
88
89	switch (n)
90	{
91	case -`6`:
92	case -`5`:
93	case -`1`:
94	return -`1`;
95	case -`4`:
96	case -`3`:
97	case -`2`:
98	if (!isLegal(bytes, -n)) return -`1`;
99	uc = *bytes & ((`0x07` << (n + `4`)) \| `0x03`);
100	break;
101	default:
102	return n;
103	}
104
105	while (n++ < -`1`)
106	{
107	uc <<= `6`;
108	uc \|= (*++bytes & `0x3F`);
109	}
110	return uc;
111	}
112
113
114	int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const
115	{
116	if (ch <= `0x7F`)
117	{
118	if (bytes && length >= `1`)
119	bytes = (unsigned* char) ch;
120	return `1`;
121	}
122	else if (ch <= `0x7FF`)
123	{
124	if (bytes && length >= `2`)
125	{
126	bytes++ = (unsigned* char) (((ch >> `6`) & `0x1F`) \| `0xC0`);
127	bytes = (unsigned* char) ((ch & `0x3F`) \| `0x80`);
128	}
129	return `2`;
130	}
131	else if (ch <= `0xFFFF`)
132	{
133	if (bytes && length >= `3`)
134	{
135	bytes++ = (unsigned* char) (((ch >> `12`) & `0x0F`) \| `0xE0`);
136	bytes++ = (unsigned* char) (((ch >> `6`) & `0x3F`) \| `0x80`);
137	bytes = (unsigned* char) ((ch & `0x3F`) \| `0x80`);
138	}
139	return `3`;
140	}
141	else if (ch <= `0x10FFFF`)
142	{
143	if (bytes && length >= `4`)
144	{
145	bytes++ = (unsigned* char) (((ch >> `18`) & `0x07`) \| `0xF0`);
146	bytes++ = (unsigned* char) (((ch >> `12`) & `0x3F`) \| `0x80`);
147	bytes++ = (unsigned* char) (((ch >> `6`) & `0x3F`) \| `0x80`);
148	bytes = (unsigned* char) ((ch & `0x3F`) \| `0x80`);
149	}
150	return `4`;
151	}
152	else return `0`;
153	}
154
155
156	int UTF8Encoding::queryConvert(const unsigned char* bytes, int length) const
157	{
158	int n = _charMap[*bytes];
159	int uc;
160	if (-n > length)
161	{
162	return n;
163	}
164	else
165	{
166	switch (n)
167	{
168	case -`6`:
169	case -`5`:
170	case -`1`:
171	return -`1`;
172	case -`4`:
173	case -`3`:
174	case -`2`:
175	if (!isLegal(bytes, -n)) return -`1`;
176	uc = *bytes & ((`0x07` << (n + `4`)) \| `0x03`);
177	break;
178	default:
179	return n;
180	}
181	while (n++ < -`1`)
182	{
183	uc <<= `6`;
184	uc \|= (*++bytes & `0x3F`);
185	}
186	return uc;
187	}
188	}
189
190
191	int UTF8Encoding::sequenceLength(const unsigned char* bytes, int length) const
192	{
193	if (`1` <= length)
194	{
195	int cc = _charMap[*bytes];
196	if (cc >= `0`)
197	return `1`;
198	else
199	return -cc;
200	}
201	else return -`1`;
202	}
203
204
205	bool UTF8Encoding::isLegal(const unsigned char bytes, int* length)
206	{
207	if (`0` == bytes \|\| `0` == length) return false;
208
209	unsigned char a;
210	const unsigned char* srcptr = bytes + length;
211	switch (length)
212	{
213	default:
214	return false;
215	// Everything else falls through when true.
216	case `4`:
217	if ((a = (--srcptr)) < `0x80` \|\| a > `0xBF`) return* false;
218	case `3`:
219	if ((a = (--srcptr)) < `0x80` \|\| a > `0xBF`) return* false;
220	case `2`:
221	a = (*--srcptr);
222	switch (*bytes)
223	{
224	case `0xE0`:
225	if (a < `0xA0` \|\| a > `0xBF`) return false;
226	break;
227	case `0xED`:
228	if (a < `0x80` \|\| a > `0x9F`) return false;
229	break;
230	case `0xF0`:
231	if (a < `0x90` \|\| a > `0xBF`) return false;
232	break;
233	case `0xF4`:
234	if (a < `0x80` \|\| a > `0x8F`) return false;
235	break;
236	default:
237	if (a < `0x80` \|\| a > `0xBF`) return false;
238	}
239	case `1`:
240	if (bytes >= `0x80` && bytes < `0xC2`) return false;
241	}
242	return *bytes <= `0xF4`;
243	}
244
245
246	} // namespace Poco
247

Browse the source code of ClickHouse/contrib/poco/Foundation/src/UTF8Encoding.cpp