utf8.cpp source code [OpenJDK/src/hotspot/share/utilities/utf8.cpp]

1	/*
2	* Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
3	* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4	*
5	* This code is free software; you can redistribute it and/or modify it
6	* under the terms of the GNU General Public License version 2 only, as
7	* published by the Free Software Foundation.
8	*
9	* This code is distributed in the hope that it will be useful, but WITHOUT
10	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12	* version 2 for more details (a copy is included in the LICENSE file that
13	* accompanied this code).
14	*
15	* You should have received a copy of the GNU General Public License version
16	* 2 along with this work; if not, write to the Free Software Foundation,
17	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18	*
19	* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20	* or visit www.oracle.com if you need additional information or have any
21	* questions.
22	*
23	*/
24
25	#include "precompiled.hpp"
26	#include "utilities/utf8.hpp"
27
28	// Assume the utf8 string is in legal form and has been
29	// checked in the class file parser/format checker.
30	template<typename T> char* UTF8::next(const char* str, T* value) {
31	unsigned const char ptr = (const* unsigned char *)str;
32	unsigned char ch, ch2, ch3;
33	int length = -`1`; / bad length /
34	jchar result;
35	switch ((ch = ptr[`0`]) >> `4`) {
36	default:
37	result = ch;
38	length = `1`;
39	break;
40
41	case `0x8`: case `0x9`: case `0xA`: case `0xB`: case `0xF`:
42	/ Shouldn't happen. /
43	break;
44
45	case `0xC`: case `0xD`:
46	/ 110xxxxx 10xxxxxx /
47	if (((ch2 = ptr[`1`]) & `0xC0`) == `0x80`) {
48	unsigned char high_five = ch & `0x1F`;
49	unsigned char low_six = ch2 & `0x3F`;
50	result = (high_five << `6`) + low_six;
51	length = `2`;
52	break;
53	}
54	break;
55
56	case `0xE`:
57	/ 1110xxxx 10xxxxxx 10xxxxxx /
58	if (((ch2 = ptr[`1`]) & `0xC0`) == `0x80`) {
59	if (((ch3 = ptr[`2`]) & `0xC0`) == `0x80`) {
60	unsigned char high_four = ch & `0x0f`;
61	unsigned char mid_six = ch2 & `0x3f`;
62	unsigned char low_six = ch3 & `0x3f`;
63	result = (((high_four << `6`) + mid_six) << `6`) + low_six;
64	length = `3`;
65	}
66	}
67	break;
68	} / end of switch /
69
70	if (length <= `0`) {
71	value = (T)ptr[`0`]; /* default bad result; /
72	return (char)(ptr + `1`); // make progress somehow*
73	}
74
75	*value = (T)result;
76
77	// The assert is correct but the .class file is wrong
78	// assert(UNICODE::utf8_size(result) == length, "checking reverse computation");
79	return (char *)(ptr + length);
80	}
81
82	char* UTF8::next_character(const char* str, jint* value) {
83	unsigned const char ptr = (const* unsigned char *)str;
84	/ See if it's legal supplementary character:*
85	11101101 1010xxxx 10xxxxxx 11101101 1011xxxx 10xxxxxx /*
86	if (is_supplementary_character(ptr)) {
87	*value = get_supplementary_character(ptr);
88	return (char *)(ptr + `6`);
89	}
90	jchar result;
91	char* next_ch = next(str, &result);
92	*value = result;
93	return next_ch;
94	}
95
96	// Count bytes of the form 10xxxxxx and deduct this count
97	// from the total byte count. The utf8 string must be in
98	// legal form which has been verified in the format checker.
99	int UTF8::unicode_length(const char* str, int len, bool& is_latin1, bool& has_multibyte) {
100	int num_chars = len;
101	has_multibyte = false;
102	is_latin1 = true;
103	unsigned char prev = `0`;
104	for (int i = `0`; i < len; i++) {
105	unsigned char c = str[i];
106	if ((c & `0xC0`) == `0x80`) {
107	// Multibyte, check if valid latin1 character.
108	has_multibyte = true;
109	if (prev > `0xC3`) {
110	is_latin1 = false;
111	}
112	--num_chars;
113	}
114	prev = c;
115	}
116	return num_chars;
117	}
118
119	// Count bytes of the utf8 string except those in form
120	// 10xxxxxx which only appear in multibyte characters.
121	// The utf8 string must be in legal form and has been
122	// verified in the format checker.
123	int UTF8::unicode_length(const char* str, bool& is_latin1, bool& has_multibyte) {
124	int num_chars = `0`;
125	has_multibyte = false;
126	is_latin1 = true;
127	unsigned char prev = `0`;
128	for (const char* p = str; *p; p++) {
129	unsigned char c = (*p);
130	if ((c & `0xC0`) == `0x80`) {
131	// Multibyte, check if valid latin1 character.
132	has_multibyte = true;
133	if (prev > `0xC3`) {
134	is_latin1 = false;
135	}
136	} else {
137	num_chars++;
138	}
139	prev = c;
140	}
141	return num_chars;
142	}
143
144	// Writes a jchar as utf8 and returns the end
145	static u_char* utf8_write(u_char* base, jchar ch) {
146	if ((ch != `0`) && (ch <=`0x7f`)) {
147	base[`0`] = (u_char) ch;
148	return base + `1`;
149	}
150
151	if (ch <= `0x7FF`) {
152	/ 11 bits or less. /
153	unsigned char high_five = ch >> `6`;
154	unsigned char low_six = ch & `0x3F`;
155	base[`0`] = high_five \| `0xC0`; / 110xxxxx /
156	base[`1`] = low_six \| `0x80`; / 10xxxxxx /
157	return base + `2`;
158	}
159	/ possibly full 16 bits. /
160	char high_four = ch >> `12`;
161	char mid_six = (ch >> `6`) & `0x3F`;
162	char low_six = ch & `0x3f`;
163	base[`0`] = high_four \| `0xE0`; / 1110xxxx /
164	base[`1`] = mid_six \| `0x80`; / 10xxxxxx /
165	base[`2`] = low_six \| `0x80`; / 10xxxxxx /
166	return base + `3`;
167	}
168
169	template<typename T> void UTF8::convert_to_unicode(const char* utf8_str, T* unicode_str, int unicode_length) {
170	unsigned char ch;
171	const char *ptr = utf8_str;
172	int index = `0`;
173
174	/ ASCII case loop optimization /
175	for (; index < unicode_length; index++) {
176	if((ch = ptr[`0`]) > `0x7F`) { break; }
177	unicode_str[index] = (T)ch;
178	ptr = (const char *)(ptr + `1`);
179	}
180
181	for (; index < unicode_length; index++) {
182	ptr = UTF8::next(ptr, &unicode_str[index]);
183	}
184	}
185
186	// Explicit instantiation for all supported string types.
187	template char* UTF8::next<jchar>(const char* str, jchar* value);
188	template char* UTF8::next<jbyte>(const char* str, jbyte* value);
189	template void UTF8::convert_to_unicode<jchar>(const char* utf8_str, jchar* unicode_str, int unicode_length);
190	template void UTF8::convert_to_unicode<jbyte>(const char* utf8_str, jbyte* unicode_str, int unicode_length);
191
192	// returns the quoted ascii length of a 0-terminated utf8 string
193	int UTF8::quoted_ascii_length(const char* utf8_str, int utf8_length) {
194	const char *ptr = utf8_str;
195	const char* end = ptr + utf8_length;
196	int result = `0`;
197	while (ptr < end) {
198	jchar c;
199	ptr = UTF8::next(ptr, &c);
200	if (c >= `32` && c < `127`) {
201	result++;
202	} else {
203	result += `6`;
204	}
205	}
206	return result;
207	}
208
209	// converts a utf8 string to quoted ascii
210	void UTF8::as_quoted_ascii(const char* utf8_str, int utf8_length, char* buf, int buflen) {
211	const char *ptr = utf8_str;
212	const char *utf8_end = ptr + utf8_length;
213	char* p = buf;
214	char* end = buf + buflen;
215	while (ptr < utf8_end) {
216	jchar c;
217	ptr = UTF8::next(ptr, &c);
218	if (c >= `32` && c < `127`) {
219	if (p + `1` >= end) break; // string is truncated
220	p++ = (char*)c;
221	} else {
222	if (p + `6` >= end) break; // string is truncated
223	sprintf(p, "\\u%04x", c);
224	p += `6`;
225	}
226	}
227	assert(p < end, "sanity");
228	*p = `'\0'`;
229	}
230
231	#ifndef PRODUCT
232	// converts a quoted ascii string back to utf8
233	// no longer used, but could be useful to test output of UTF8::as_quoted_ascii
234	const char* UTF8::from_quoted_ascii(const char* quoted_ascii_str) {
235	const char *ptr = quoted_ascii_str;
236	char* result = NULL;
237	while (*ptr != `'\0'`) {
238	char c = *ptr;
239	if (c < `32` \|\| c >= `127`) break;
240	}
241	if (*ptr == `'\0'`) {
242	// nothing to do so return original string
243	return quoted_ascii_str;
244	}
245	// everything up to this point was ok.
246	int length = ptr - quoted_ascii_str;
247	char* buffer = NULL;
248	for (int round = `0`; round < `2`; round++) {
249	while (*ptr != `'\0'`) {
250	if (*ptr != `'\\'`) {
251	if (buffer != NULL) {
252	buffer[length] = *ptr;
253	}
254	length++;
255	} else {
256	switch (ptr[`1`]) {
257	case `'u'`: {
258	ptr += `2`;
259	jchar value=`0`;
260	for (int i=`0`; i<`4`; i++) {
261	char c = *ptr++;
262	switch (c) {
263	case `'0'`: case `'1'`: case `'2'`: case `'3'`: case `'4'`:
264	case `'5'`: case `'6'`: case `'7'`: case `'8'`: case `'9'`:
265	value = (value << `4`) + c - `'0'`;
266	break;
267	case `'a'`: case `'b'`: case `'c'`:
268	case `'d'`: case `'e'`: case `'f'`:
269	value = (value << `4`) + `10` + c - `'a'`;
270	break;
271	case `'A'`: case `'B'`: case `'C'`:
272	case `'D'`: case `'E'`: case `'F'`:
273	value = (value << `4`) + `10` + c - `'A'`;
274	break;
275	default:
276	ShouldNotReachHere();
277	}
278	}
279	if (buffer == NULL) {
280	char utf8_buffer[`4`];
281	char* next = (char)utf8_write((u_char)utf8_buffer, value);
282	length += next - utf8_buffer;
283	} else {
284	char* next = (char)utf8_write((u_char)&buffer[length], value);
285	length += next - &buffer[length];
286	}
287	break;
288	}
289	case `'t'`: if (buffer != NULL) buffer[length] = `'\t'`; ptr += `2`; length++; break;
290	case `'n'`: if (buffer != NULL) buffer[length] = `'\n'`; ptr += `2`; length++; break;
291	case `'r'`: if (buffer != NULL) buffer[length] = `'\r'`; ptr += `2`; length++; break;
292	case `'f'`: if (buffer != NULL) buffer[length] = `'\f'`; ptr += `2`; length++; break;
293	default:
294	ShouldNotReachHere();
295	}
296	}
297	}
298	if (round == `0`) {
299	buffer = NEW_RESOURCE_ARRAY(char, length + `1`);
300	ptr = quoted_ascii_str;
301	} else {
302	buffer[length] = `'\0'`;
303	}
304	}
305	return buffer;
306	}
307	#endif // !PRODUCT
308
309	// Returns NULL if 'c' it not found. This only works as long
310	// as 'c' is an ASCII character
311	const jbyte* UTF8::strrchr(const jbyte* base, int length, jbyte c) {
312	assert(length >= `0`, "sanity check");
313	assert(c >= `0`, "does not work for non-ASCII characters");
314	// Skip backwards in string until 'c' is found or end is reached
315	while(--length >= `0` && base[length] != c);
316	return (length < `0`) ? NULL : &base[length];
317	}
318
319	bool UTF8::equal(const jbyte* base1, int length1, const jbyte* base2, int length2) {
320	// Length must be the same
321	if (length1 != length2) return false;
322	for (int i = `0`; i < length1; i++) {
323	if (base1[i] != base2[i]) return false;
324	}
325	return true;
326	}
327
328	bool UTF8::is_supplementary_character(const unsigned char* str) {
329	return ((str[`0`] & `0xFF`) == `0xED`) && ((str[`1`] & `0xF0`) == `0xA0`) && ((str[`2`] & `0xC0`) == `0x80`)
330	&& ((str[`3`] & `0xFF`) == `0xED`) && ((str[`4`] & `0xF0`) == `0xB0`) && ((str[`5`] & `0xC0`) == `0x80`);
331	}
332
333	jint UTF8::get_supplementary_character(const unsigned char* str) {
334	return `0x10000` + ((str[`1`] & `0x0f`) << `16`) + ((str[`2`] & `0x3f`) << `10`)
335	+ ((str[`4`] & `0x0f`) << `6`) + (str[`5`] & `0x3f`);
336	}
337
338	bool UTF8::is_legal_utf8(const unsigned char* buffer, int length,
339	bool version_leq_47) {
340	int i = `0`;
341	int count = length >> `2`;
342	for (int k=`0`; k<count; k++) {
343	unsigned char b0 = buffer[i];
344	unsigned char b1 = buffer[i+`1`];
345	unsigned char b2 = buffer[i+`2`];
346	unsigned char b3 = buffer[i+`3`];
347	// For an unsigned char v,
348	// (v \| v - 1) is < 128 (highest bit 0) for 0 < v < 128;
349	// (v \| v - 1) is >= 128 (highest bit 1) for v == 0 or v >= 128.
350	unsigned char res = b0 \| b0 - `1` \|
351	b1 \| b1 - `1` \|
352	b2 \| b2 - `1` \|
353	b3 \| b3 - `1`;
354	if (res >= `128`) break;
355	i += `4`;
356	}
357	for(; i < length; i++) {
358	unsigned short c;
359	// no embedded zeros
360	if (buffer[i] == `0`) return false;
361	if(buffer[i] < `128`) {
362	continue;
363	}
364	if ((i + `5`) < length) { // see if it's legal supplementary character
365	if (UTF8::is_supplementary_character(&buffer[i])) {
366	c = UTF8::get_supplementary_character(&buffer[i]);
367	i += `5`;
368	continue;
369	}
370	}
371	switch (buffer[i] >> `4`) {
372	default: break;
373	case `0x8`: case `0x9`: case `0xA`: case `0xB`: case `0xF`:
374	return false;
375	case `0xC`: case `0xD`: // 110xxxxx 10xxxxxx
376	c = (buffer[i] & `0x1F`) << `6`;
377	i++;
378	if ((i < length) && ((buffer[i] & `0xC0`) == `0x80`)) {
379	c += buffer[i] & `0x3F`;
380	if (version_leq_47 \|\| c == `0` \|\| c >= `0x80`) {
381	break;
382	}
383	}
384	return false;
385	case `0xE`: // 1110xxxx 10xxxxxx 10xxxxxx
386	c = (buffer[i] & `0xF`) << `12`;
387	i += `2`;
388	if ((i < length) && ((buffer[i-`1`] & `0xC0`) == `0x80`) && ((buffer[i] & `0xC0`) == `0x80`)) {
389	c += ((buffer[i-`1`] & `0x3F`) << `6`) + (buffer[i] & `0x3F`);
390	if (version_leq_47 \|\| c >= `0x800`) {
391	break;
392	}
393	}
394	return false;
395	} // end of switch
396	} // end of for
397	return true;
398	}
399
400	//-------------------------------------------------------------------------------------
401
402	bool UNICODE::is_latin1(jchar c) {
403	return (c <= `0x00FF`);
404	}
405
406	bool UNICODE::is_latin1(const jchar* base, int length) {
407	for (int index = `0`; index < length; index++) {
408	if (base[index] > `0x00FF`) {
409	return false;
410	}
411	}
412	return true;
413	}
414
415	int UNICODE::utf8_size(jchar c) {
416	if ((`0x0001` <= c) && (c <= `0x007F`)) {
417	// ASCII character
418	return `1`;
419	} else if (c <= `0x07FF`) {
420	return `2`;
421	} else {
422	return `3`;
423	}
424	}
425
426	int UNICODE::utf8_size(jbyte c) {
427	if (c >= `0x01`) {
428	// ASCII character. Check is equivalent to
429	// (0x01 <= c) && (c <= 0x7F) because c is signed.
430	return `1`;
431	} else {
432	// Non-ASCII character or 0x00 which needs to be
433	// two-byte encoded as 0xC080 in modified UTF-8.
434	return `2`;
435	}
436	}
437
438	template<typename T>
439	int UNICODE::utf8_length(const T* base, int length) {
440	int result = `0`;
441	for (int index = `0`; index < length; index++) {
442	T c = base[index];
443	result += utf8_size(c);
444	}
445	return result;
446	}
447
448	template<typename T>
449	char* UNICODE::as_utf8(const T* base, int& length) {
450	int utf8_len = utf8_length(base, length);
451	u_char* buf = NEW_RESOURCE_ARRAY(u_char, utf8_len + `1`);
452	char* result = as_utf8(base, length, (char*) buf, utf8_len + `1`);
453	assert((int) strlen(result) == utf8_len, "length prediction must be correct");
454	// Set string length to uft8 length
455	length = utf8_len;
456	return (char*) result;
457	}
458
459	char* UNICODE::as_utf8(const jchar* base, int length, char* buf, int buflen) {
460	u_char* p = (u_char*)buf;
461	for (int index = `0`; index < length; index++) {
462	jchar c = base[index];
463	buflen -= utf8_size(c);
464	if (buflen <= `0`) break; // string is truncated
465	p = utf8_write(p, c);
466	}
467	*p = `'\0'`;
468	return buf;
469	}
470
471	char* UNICODE::as_utf8(const jbyte* base, int length, char* buf, int buflen) {
472	u_char* p = (u_char*)buf;
473	for (int index = `0`; index < length; index++) {
474	jbyte c = base[index];
475	int sz = utf8_size(c);
476	buflen -= sz;
477	if (buflen <= `0`) break; // string is truncated
478	if (sz == `1`) {
479	// Copy ASCII characters (UTF-8 is ASCII compatible)
480	*p++ = c;
481	} else {
482	// Non-ASCII character or 0x00 which should
483	// be encoded as 0xC080 in "modified" UTF8.
484	p = utf8_write(p, ((jchar) c) & `0xff`);
485	}
486	}
487	*p = `'\0'`;
488	return buf;
489	}
490
491	void UNICODE::convert_to_utf8(const jchar* base, int length, char* utf8_buffer) {
492	for(int index = `0`; index < length; index++) {
493	utf8_buffer = (char)utf8_write((u_char)utf8_buffer, base[index]);
494	}
495	*utf8_buffer = `'\0'`;
496	}
497
498	// returns the quoted ascii length of a unicode string
499	template<typename T>
500	int UNICODE::quoted_ascii_length(const T* base, int length) {
501	int result = `0`;
502	for (int i = `0`; i < length; i++) {
503	T c = base[i];
504	if (c >= `32` && c < `127`) {
505	result++;
506	} else {
507	result += `6`;
508	}
509	}
510	return result;
511	}
512
513	// converts a unicode string to quoted ascii
514	template<typename T>
515	void UNICODE::as_quoted_ascii(const T* base, int length, char* buf, int buflen) {
516	char* p = buf;
517	char* end = buf + buflen;
518	for (int index = `0`; index < length; index++) {
519	T c = base[index];
520	if (c >= `32` && c < `127`) {
521	if (p + `1` >= end) break; // string is truncated
522	p++ = (char*)c;
523	} else {
524	if (p + `6` >= end) break; // string is truncated
525	sprintf(p, "\\u%04x", c);
526	p += `6`;
527	}
528	}
529	*p = `'\0'`;
530	}
531
532	// Explicit instantiation for all supported types.
533	template int UNICODE::utf8_length(const jbyte* base, int length);
534	template int UNICODE::utf8_length(const jchar* base, int length);
535	template char* UNICODE::as_utf8(const jbyte* base, int& length);
536	template char* UNICODE::as_utf8(const jchar* base, int& length);
537	template int UNICODE::quoted_ascii_length<jbyte>(const jbyte* base, int length);
538	template int UNICODE::quoted_ascii_length<jchar>(const jchar* base, int length);
539	template void UNICODE::as_quoted_ascii<jbyte>(const jbyte* base, int length, char* buf, int buflen);
540	template void UNICODE::as_quoted_ascii<jchar>(const jchar* base, int length, char* buf, int buflen);
541

Browse the source code of OpenJDK/src/hotspot/share/utilities/utf8.cpp