EncodingSupport.c source code [OpenJDK/src/java.instrument/share/native/libinstrument/EncodingSupport.c]

/*
 * Copyright (c) 2004, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
25
26
/**
 * Determine how many bytes a Standard UTF-8 byte sequence occupies once
 * re-encoded as Modified UTF-8.
 *
 * Relative to the input:
 *   - every zero byte grows from 1 byte to 2 bytes,
 *   - every 4-byte sequence grows to 6 bytes (two 3-byte encodings),
 *   - 1-, 2- and 3-byte sequences keep their size.
 *
 * Only the basic structure is validated: the lead byte must match one of
 * the 1/2/3/4-byte patterns and every following byte of the sequence must
 * be a continuation byte (10xxxxxx) located inside the first `length`
 * bytes.  No further check of Standard UTF-8 correctness is made, so
 * Modified UTF-8 input is accepted as well (e.g. 0xC0 0x80 is ok).
 *
 * @param string  bytes to examine (does not need to be NUL terminated)
 * @param length  number of bytes of `string` to examine
 * @return the Modified UTF-8 length, or `length` unchanged when an encoding
 *         error is detected (or when the loop never runs, e.g. length <= 0).
 *         A return value equal to `length` is therefore ambiguous: it is
 *         also what well-formed input that needs no growth produces.
 */
int
modifiedUtf8LengthOfUtf8(char *string, int length) {
    int new_length = 0;
    int i;

    for (i = 0; i < length; i++) {
        unsigned lead = (unsigned char)string[i];
        int trailing;   /* continuation bytes that must follow the lead  */
        int encoded;    /* bytes this sequence takes in Modified UTF-8   */
        int k;

        if ((lead & 0x80) == 0) {               /* 1byte encoding */
            trailing = 0;
            /* We gain one byte in length on NULL bytes */
            encoded = (lead == 0) ? 2 : 1;
        } else if ((lead & 0xE0) == 0xC0) {     /* 2byte encoding */
            trailing = 1;
            encoded = 2;
        } else if ((lead & 0xF0) == 0xE0) {     /* 3byte encoding */
            trailing = 2;
            encoded = 3;
        } else if ((lead & 0xF8) == 0xF0) {     /* 4byte encoding */
            trailing = 3;
            encoded = 6;    /* 4byte encoding turns into 2 3byte ones */
        } else {
            break;                              /* Error condition */
        }

        /*
         * The whole sequence must lie inside the buffer.  Written as a
         * subtraction (i < length holds here) so the test cannot overflow.
         */
        if (length - i <= trailing) {
            break;                              /* Error condition */
        }

        /* Check encoding of following bytes */
        for (k = 1; k <= trailing; k++) {
            if ((string[i + k] & 0xC0) != 0x80) {
                break;
            }
        }
        if (k <= trailing) {
            break;                              /* Error condition */
        }

        i += trailing;                          /* Skip the following bytes */
        new_length += encoded;
    }

    if (i != length) {
        /* Error in finding new length, return old length so no conversion */
        /* FIXUP: ERROR_MESSAGE? */
        return length;
    }
    return new_length;
}
85
/*
 * Convert Standard UTF-8 to Modified UTF-8.
 *    Assumes the UTF-8 encoding was validated by modifiedUtf8LengthOfUtf8().
 *
 *    - A zero byte is written as the 2-byte pair 0xC0 0x80.
 *    - A 4-byte sequence is written as two 3-byte encodings (a surrogate
 *      pair, 6 bytes in total).
 *    - 1-, 2- and 3-byte sequences are copied unchanged.
 *    - A byte that matches none of the lead-byte patterns is skipped
 *      without producing output.
 *
 *    Note: Accepts Modified UTF-8 also, no verification on the
 *          correctness of Standard UTF-8 is done. e.g. 0xC080 input is ok.
 *
 *    string      input bytes
 *    length      number of input bytes to convert
 *    new_string  output buffer; receives the converted bytes followed by a
 *                terminating 0 byte
 *    new_length  not read by this function: the caller must make sure
 *                new_string can hold the converted bytes plus the
 *                terminating 0 byte
 *
 *    If a multi-byte lead byte is not followed by enough bytes inside
 *    `length`, conversion stops there (the output is still terminated)
 *    instead of reading past the end of the input.
 */
void
convertUtf8ToModifiedUtf8(char *string, int length, char *new_string, int new_length)
{
    int i;
    int j = 0;

    (void)new_length;   /* see header comment: output size is not enforced */

    for (i = 0; i < length; i++) {
        unsigned byte1 = (unsigned char)string[i];
        int remaining = length - i;     /* input bytes left, lead included */

        /* NULL bytes and bytes starting with 11110xxx are special */
        if ((byte1 & 0x80) == 0) {                  /* 1byte encoding */
            if (byte1 == 0) {
                /* Bits out: 11000000 10000000 */
                new_string[j++] = (char)0xC0;
                new_string[j++] = (char)0x80;
            } else {
                /* Single byte */
                new_string[j++] = (char)byte1;
            }
        } else if ((byte1 & 0xE0) == 0xC0) {        /* 2byte encoding */
            if (remaining < 2) {
                break;                              /* truncated input */
            }
            new_string[j++] = (char)byte1;
            new_string[j++] = string[++i];
        } else if ((byte1 & 0xF0) == 0xE0) {        /* 3byte encoding */
            if (remaining < 3) {
                break;                              /* truncated input */
            }
            new_string[j++] = (char)byte1;
            new_string[j++] = string[++i];
            new_string[j++] = string[++i];
        } else if ((byte1 & 0xF8) == 0xF0) {        /* 4byte encoding */
            /* Beginning of 4byte encoding, turn into 2 3byte encodings */
            unsigned byte2, byte3, byte4, u21;

            if (remaining < 4) {
                break;                              /* truncated input */
            }

            /* Bits in: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
            byte2 = (unsigned char)string[++i];
            byte3 = (unsigned char)string[++i];
            byte4 = (unsigned char)string[++i];

            /* Reconstruct full 21bit value */
            u21  = (byte1 & 0x07) << 18;
            u21 += (byte2 & 0x3F) << 12;
            u21 += (byte3 & 0x3F) << 6;
            u21 += (byte4 & 0x3F);

            /* Bits out: 11101101 1010xxxx 10xxxxxx */
            new_string[j++] = (char)0xED;
            new_string[j++] = (char)(0xA0 + (((u21 >> 16) - 1) & 0x0F));
            new_string[j++] = (char)(0x80 + ((u21 >> 10) & 0x3F));

            /* Bits out: 11101101 1011xxxx 10xxxxxx */
            new_string[j++] = (char)0xED;
            new_string[j++] = (char)(0xB0 + ((u21 >> 6) & 0x0F));
            /* byte4 is already 10xxxxxx carrying the low 6 bits */
            new_string[j++] = (char)byte4;
        }
    }
    new_string[j] = 0;
}
147

Browse the source code of OpenJDK/src/java.instrument/share/native/libinstrument/EncodingSupport.c