UTF8Encoding.cpp source code [POCO/Foundation/src/UTF8Encoding.cpp]

1	//
2	// UTF8Encoding.cpp
3	//
4	// Library: Foundation
5	// Package: Text
6	// Module: UTF8Encoding
7	//
8	// Copyright (c) 2004-2007, Applied Informatics Software Engineering GmbH.
9	// and Contributors.
10	//
11	// SPDX-License-Identifier: BSL-1.0
12	//
13
14
15	#include "Poco/UTF8Encoding.h"
16	#include "Poco/String.h"
17
18
19	namespace Poco {
20
21
22	const char* UTF8Encoding::_names[] =
23	{
24	"UTF-8",
25	"UTF8",
26	NULL
27	};
28
29
30	const TextEncoding::CharacterMap UTF8Encoding::_charMap =
31	{
32	/ 00 / `0x00`, `0x01`, `0x02`, `0x03`, `0x04`, `0x05`, `0x06`, `0x07`, `0x08`, `0x09`, `0x0a`, `0x0b`, `0x0c`, `0x0d`, `0x0e`, `0x0f`,
33	/ 10 / `0x10`, `0x11`, `0x12`, `0x13`, `0x14`, `0x15`, `0x16`, `0x17`, `0x18`, `0x19`, `0x1a`, `0x1b`, `0x1c`, `0x1d`, `0x1e`, `0x1f`,
34	/ 20 / `0x20`, `0x21`, `0x22`, `0x23`, `0x24`, `0x25`, `0x26`, `0x27`, `0x28`, `0x29`, `0x2a`, `0x2b`, `0x2c`, `0x2d`, `0x2e`, `0x2f`,
35	/ 30 / `0x30`, `0x31`, `0x32`, `0x33`, `0x34`, `0x35`, `0x36`, `0x37`, `0x38`, `0x39`, `0x3a`, `0x3b`, `0x3c`, `0x3d`, `0x3e`, `0x3f`,
36	/ 40 / `0x40`, `0x41`, `0x42`, `0x43`, `0x44`, `0x45`, `0x46`, `0x47`, `0x48`, `0x49`, `0x4a`, `0x4b`, `0x4c`, `0x4d`, `0x4e`, `0x4f`,
37	/ 50 / `0x50`, `0x51`, `0x52`, `0x53`, `0x54`, `0x55`, `0x56`, `0x57`, `0x58`, `0x59`, `0x5a`, `0x5b`, `0x5c`, `0x5d`, `0x5e`, `0x5f`,
38	/ 60 / `0x60`, `0x61`, `0x62`, `0x63`, `0x64`, `0x65`, `0x66`, `0x67`, `0x68`, `0x69`, `0x6a`, `0x6b`, `0x6c`, `0x6d`, `0x6e`, `0x6f`,
39	/ 70 / `0x70`, `0x71`, `0x72`, `0x73`, `0x74`, `0x75`, `0x76`, `0x77`, `0x78`, `0x79`, `0x7a`, `0x7b`, `0x7c`, `0x7d`, `0x7e`, `0x7f`,
40	/ 80 / -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
41	/ 90 / -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
42	/ a0 / -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
43	/ b0 / -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
44	/ c0 / -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`,
45	/ d0 / -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`,
46	/ e0 / -`3`, -`3`, -`3`, -`3`, -`3`, -`3`, -`3`, -`3`, -`3`, -`3`, -`3`, -`3`, -`3`, -`3`, -`3`, -`3`,
47	/ f0 / -`4`, -`4`, -`4`, -`4`, -`4`, -`4`, -`4`, -`4`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
48	};
49
50
51	UTF8Encoding::UTF8Encoding()
52	{
53	}
54
55
56	UTF8Encoding::~UTF8Encoding()
57	{
58	}
59
60
61	const char* UTF8Encoding::canonicalName() const
62	{
63	return _names[`0`];
64	}
65
66
67	bool UTF8Encoding::isA(const std::string& encodingName) const
68	{
69	for (const char** name = _names; *name; ++name)
70	{
71	if (Poco::icompare(encodingName, *name) == `0`)
72	return true;
73	}
74	return false;
75	}
76
77
78	const TextEncoding::CharacterMap& UTF8Encoding::characterMap() const
79	{
80	return _charMap;
81	}
82
83
84	int UTF8Encoding::convert(const unsigned char* bytes) const
85	{
86	int n = _charMap[*bytes];
87	int uc;
88
89	switch (n)
90	{
91	case -`1`:
92	return -`1`;
93	case -`4`:
94	case -`3`:
95	case -`2`:
96	if (!isLegal(bytes, -n)) return -`1`;
97	uc = *bytes & ((`0x07` << (n + `4`)) \| `0x03`);
98	break;
99	default:
100	return n;
101	}
102
103	while (n++ < -`1`)
104	{
105	uc <<= `6`;
106	uc \|= (*++bytes & `0x3F`);
107	}
108	return uc;
109	}
110
111
112	int UTF8Encoding::convert(int ch, unsigned char* bytes, int length) const
113	{
114	if (ch <= `0x7F`)
115	{
116	if (bytes && length >= `1`)
117	bytes = (unsigned* char) ch;
118	return `1`;
119	}
120	else if (ch <= `0x7FF`)
121	{
122	if (bytes && length >= `2`)
123	{
124	bytes++ = (unsigned* char) (((ch >> `6`) & `0x1F`) \| `0xC0`);
125	bytes = (unsigned* char) ((ch & `0x3F`) \| `0x80`);
126	}
127	return `2`;
128	}
129	else if (ch <= `0xFFFF`)
130	{
131	if (bytes && length >= `3`)
132	{
133	bytes++ = (unsigned* char) (((ch >> `12`) & `0x0F`) \| `0xE0`);
134	bytes++ = (unsigned* char) (((ch >> `6`) & `0x3F`) \| `0x80`);
135	bytes = (unsigned* char) ((ch & `0x3F`) \| `0x80`);
136	}
137	return `3`;
138	}
139	else if (ch <= `0x10FFFF`)
140	{
141	if (bytes && length >= `4`)
142	{
143	bytes++ = (unsigned* char) (((ch >> `18`) & `0x07`) \| `0xF0`);
144	bytes++ = (unsigned* char) (((ch >> `12`) & `0x3F`) \| `0x80`);
145	bytes++ = (unsigned* char) (((ch >> `6`) & `0x3F`) \| `0x80`);
146	bytes = (unsigned* char) ((ch & `0x3F`) \| `0x80`);
147	}
148	return `4`;
149	}
150	else return `0`;
151	}
152
153
154	int UTF8Encoding::queryConvert(const unsigned char* bytes, int length) const
155	{
156	int n = _charMap[*bytes];
157	int uc;
158	if (-n > length)
159	{
160	return n;
161	}
162	else
163	{
164	switch (n)
165	{
166	case -`1`:
167	return -`1`;
168	case -`4`:
169	case -`3`:
170	case -`2`:
171	if (!isLegal(bytes, -n)) return -`1`;
172	uc = *bytes & ((`0x07` << (n + `4`)) \| `0x03`);
173	break;
174	default:
175	return n;
176	}
177	while (n++ < -`1`)
178	{
179	uc <<= `6`;
180	uc \|= (*++bytes & `0x3F`);
181	}
182	return uc;
183	}
184	}
185
186
187	int UTF8Encoding::sequenceLength(const unsigned char* bytes, int length) const
188	{
189	if (`1` <= length)
190	{
191	int cc = _charMap[*bytes];
192	if (cc >= `0`)
193	return `1`;
194	else
195	return -cc;
196	}
197	else return -`1`;
198	}
199
200
201	bool UTF8Encoding::isLegal(const unsigned char bytes, int* length)
202	{
203	if (`0` == bytes \|\| `0` == length) return false;
204
205	unsigned char a;
206	const unsigned char* srcptr = bytes + length;
207	switch (length)
208	{
209	default:
210	return false;
211	// Everything else falls through when true.
212	case `4`:
213	if ((a = (--srcptr)) < `0x80` \|\| a > `0xBF`) return* false;
214	case `3`:
215	if ((a = (--srcptr)) < `0x80` \|\| a > `0xBF`) return* false;
216	case `2`:
217	a = (*--srcptr);
218	switch (*bytes)
219	{
220	case `0xE0`:
221	if (a < `0xA0` \|\| a > `0xBF`) return false;
222	break;
223	case `0xED`:
224	if (a < `0x80` \|\| a > `0x9F`) return false;
225	break;
226	case `0xF0`:
227	if (a < `0x90` \|\| a > `0xBF`) return false;
228	break;
229	case `0xF4`:
230	if (a < `0x80` \|\| a > `0x8F`) return false;
231	break;
232	default:
233	if (a < `0x80` \|\| a > `0xBF`) return false;
234	}
235	case `1`:
236	if (bytes >= `0x80` && bytes < `0xC2`) return false;
237	}
238	return *bytes <= `0xF4`;
239	}
240
241
242	} // namespace Poco
243

Browse the source code of POCO/Foundation/src/UTF8Encoding.cpp