1 | /* |
2 | * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved. |
3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
4 | * |
5 | * This code is free software; you can redistribute it and/or modify it |
6 | * under the terms of the GNU General Public License version 2 only, as |
7 | * published by the Free Software Foundation. |
8 | * |
9 | * This code is distributed in the hope that it will be useful, but WITHOUT |
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
12 | * version 2 for more details (a copy is included in the LICENSE file that |
13 | * accompanied this code). |
14 | * |
15 | * You should have received a copy of the GNU General Public License version |
16 | * 2 along with this work; if not, write to the Free Software Foundation, |
17 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
18 | * |
19 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
20 | * or visit www.oracle.com if you need additional information or have any |
21 | * questions. |
22 | * |
23 | */ |
24 | |
25 | #include "precompiled.hpp" |
26 | #include "utilities/utf8.hpp" |
27 | |
28 | // Assume the utf8 string is in legal form and has been |
29 | // checked in the class file parser/format checker. |
30 | template<typename T> char* UTF8::next(const char* str, T* value) { |
31 | unsigned const char *ptr = (const unsigned char *)str; |
32 | unsigned char ch, ch2, ch3; |
33 | int length = -1; /* bad length */ |
34 | jchar result; |
35 | switch ((ch = ptr[0]) >> 4) { |
36 | default: |
37 | result = ch; |
38 | length = 1; |
39 | break; |
40 | |
41 | case 0x8: case 0x9: case 0xA: case 0xB: case 0xF: |
42 | /* Shouldn't happen. */ |
43 | break; |
44 | |
45 | case 0xC: case 0xD: |
46 | /* 110xxxxx 10xxxxxx */ |
47 | if (((ch2 = ptr[1]) & 0xC0) == 0x80) { |
48 | unsigned char high_five = ch & 0x1F; |
49 | unsigned char low_six = ch2 & 0x3F; |
50 | result = (high_five << 6) + low_six; |
51 | length = 2; |
52 | break; |
53 | } |
54 | break; |
55 | |
56 | case 0xE: |
57 | /* 1110xxxx 10xxxxxx 10xxxxxx */ |
58 | if (((ch2 = ptr[1]) & 0xC0) == 0x80) { |
59 | if (((ch3 = ptr[2]) & 0xC0) == 0x80) { |
60 | unsigned char high_four = ch & 0x0f; |
61 | unsigned char mid_six = ch2 & 0x3f; |
62 | unsigned char low_six = ch3 & 0x3f; |
63 | result = (((high_four << 6) + mid_six) << 6) + low_six; |
64 | length = 3; |
65 | } |
66 | } |
67 | break; |
68 | } /* end of switch */ |
69 | |
70 | if (length <= 0) { |
71 | *value = (T)ptr[0]; /* default bad result; */ |
72 | return (char*)(ptr + 1); // make progress somehow |
73 | } |
74 | |
75 | *value = (T)result; |
76 | |
77 | // The assert is correct but the .class file is wrong |
78 | // assert(UNICODE::utf8_size(result) == length, "checking reverse computation"); |
79 | return (char *)(ptr + length); |
80 | } |
81 | |
82 | char* UTF8::next_character(const char* str, jint* value) { |
83 | unsigned const char *ptr = (const unsigned char *)str; |
84 | /* See if it's legal supplementary character: |
85 | 11101101 1010xxxx 10xxxxxx 11101101 1011xxxx 10xxxxxx */ |
86 | if (is_supplementary_character(ptr)) { |
87 | *value = get_supplementary_character(ptr); |
88 | return (char *)(ptr + 6); |
89 | } |
90 | jchar result; |
91 | char* next_ch = next(str, &result); |
92 | *value = result; |
93 | return next_ch; |
94 | } |
95 | |
96 | // Count bytes of the form 10xxxxxx and deduct this count |
97 | // from the total byte count. The utf8 string must be in |
98 | // legal form which has been verified in the format checker. |
99 | int UTF8::unicode_length(const char* str, int len, bool& is_latin1, bool& has_multibyte) { |
100 | int num_chars = len; |
101 | has_multibyte = false; |
102 | is_latin1 = true; |
103 | unsigned char prev = 0; |
104 | for (int i = 0; i < len; i++) { |
105 | unsigned char c = str[i]; |
106 | if ((c & 0xC0) == 0x80) { |
107 | // Multibyte, check if valid latin1 character. |
108 | has_multibyte = true; |
109 | if (prev > 0xC3) { |
110 | is_latin1 = false; |
111 | } |
112 | --num_chars; |
113 | } |
114 | prev = c; |
115 | } |
116 | return num_chars; |
117 | } |
118 | |
119 | // Count bytes of the utf8 string except those in form |
120 | // 10xxxxxx which only appear in multibyte characters. |
121 | // The utf8 string must be in legal form and has been |
122 | // verified in the format checker. |
123 | int UTF8::unicode_length(const char* str, bool& is_latin1, bool& has_multibyte) { |
124 | int num_chars = 0; |
125 | has_multibyte = false; |
126 | is_latin1 = true; |
127 | unsigned char prev = 0; |
128 | for (const char* p = str; *p; p++) { |
129 | unsigned char c = (*p); |
130 | if ((c & 0xC0) == 0x80) { |
131 | // Multibyte, check if valid latin1 character. |
132 | has_multibyte = true; |
133 | if (prev > 0xC3) { |
134 | is_latin1 = false; |
135 | } |
136 | } else { |
137 | num_chars++; |
138 | } |
139 | prev = c; |
140 | } |
141 | return num_chars; |
142 | } |
143 | |
144 | // Writes a jchar as utf8 and returns the end |
145 | static u_char* utf8_write(u_char* base, jchar ch) { |
146 | if ((ch != 0) && (ch <=0x7f)) { |
147 | base[0] = (u_char) ch; |
148 | return base + 1; |
149 | } |
150 | |
151 | if (ch <= 0x7FF) { |
152 | /* 11 bits or less. */ |
153 | unsigned char high_five = ch >> 6; |
154 | unsigned char low_six = ch & 0x3F; |
155 | base[0] = high_five | 0xC0; /* 110xxxxx */ |
156 | base[1] = low_six | 0x80; /* 10xxxxxx */ |
157 | return base + 2; |
158 | } |
159 | /* possibly full 16 bits. */ |
160 | char high_four = ch >> 12; |
161 | char mid_six = (ch >> 6) & 0x3F; |
162 | char low_six = ch & 0x3f; |
163 | base[0] = high_four | 0xE0; /* 1110xxxx */ |
164 | base[1] = mid_six | 0x80; /* 10xxxxxx */ |
165 | base[2] = low_six | 0x80; /* 10xxxxxx */ |
166 | return base + 3; |
167 | } |
168 | |
169 | template<typename T> void UTF8::convert_to_unicode(const char* utf8_str, T* unicode_str, int unicode_length) { |
170 | unsigned char ch; |
171 | const char *ptr = utf8_str; |
172 | int index = 0; |
173 | |
174 | /* ASCII case loop optimization */ |
175 | for (; index < unicode_length; index++) { |
176 | if((ch = ptr[0]) > 0x7F) { break; } |
177 | unicode_str[index] = (T)ch; |
178 | ptr = (const char *)(ptr + 1); |
179 | } |
180 | |
181 | for (; index < unicode_length; index++) { |
182 | ptr = UTF8::next(ptr, &unicode_str[index]); |
183 | } |
184 | } |
185 | |
// Explicit instantiation for all supported string types: the templates above
// are defined in this file, so the jchar and jbyte variants are instantiated
// here.
template char* UTF8::next<jchar>(const char* str, jchar* value);
template char* UTF8::next<jbyte>(const char* str, jbyte* value);
template void UTF8::convert_to_unicode<jchar>(const char* utf8_str, jchar* unicode_str, int unicode_length);
template void UTF8::convert_to_unicode<jbyte>(const char* utf8_str, jbyte* unicode_str, int unicode_length);
191 | |
192 | // returns the quoted ascii length of a 0-terminated utf8 string |
193 | int UTF8::quoted_ascii_length(const char* utf8_str, int utf8_length) { |
194 | const char *ptr = utf8_str; |
195 | const char* end = ptr + utf8_length; |
196 | int result = 0; |
197 | while (ptr < end) { |
198 | jchar c; |
199 | ptr = UTF8::next(ptr, &c); |
200 | if (c >= 32 && c < 127) { |
201 | result++; |
202 | } else { |
203 | result += 6; |
204 | } |
205 | } |
206 | return result; |
207 | } |
208 | |
209 | // converts a utf8 string to quoted ascii |
210 | void UTF8::as_quoted_ascii(const char* utf8_str, int utf8_length, char* buf, int buflen) { |
211 | const char *ptr = utf8_str; |
212 | const char *utf8_end = ptr + utf8_length; |
213 | char* p = buf; |
214 | char* end = buf + buflen; |
215 | while (ptr < utf8_end) { |
216 | jchar c; |
217 | ptr = UTF8::next(ptr, &c); |
218 | if (c >= 32 && c < 127) { |
219 | if (p + 1 >= end) break; // string is truncated |
220 | *p++ = (char)c; |
221 | } else { |
222 | if (p + 6 >= end) break; // string is truncated |
223 | sprintf(p, "\\u%04x" , c); |
224 | p += 6; |
225 | } |
226 | } |
227 | assert(p < end, "sanity" ); |
228 | *p = '\0'; |
229 | } |
230 | |
231 | #ifndef PRODUCT |
232 | // converts a quoted ascii string back to utf8 |
233 | // no longer used, but could be useful to test output of UTF8::as_quoted_ascii |
234 | const char* UTF8::from_quoted_ascii(const char* quoted_ascii_str) { |
235 | const char *ptr = quoted_ascii_str; |
236 | char* result = NULL; |
237 | while (*ptr != '\0') { |
238 | char c = *ptr; |
239 | if (c < 32 || c >= 127) break; |
240 | } |
241 | if (*ptr == '\0') { |
242 | // nothing to do so return original string |
243 | return quoted_ascii_str; |
244 | } |
245 | // everything up to this point was ok. |
246 | int length = ptr - quoted_ascii_str; |
247 | char* buffer = NULL; |
248 | for (int round = 0; round < 2; round++) { |
249 | while (*ptr != '\0') { |
250 | if (*ptr != '\\') { |
251 | if (buffer != NULL) { |
252 | buffer[length] = *ptr; |
253 | } |
254 | length++; |
255 | } else { |
256 | switch (ptr[1]) { |
257 | case 'u': { |
258 | ptr += 2; |
259 | jchar value=0; |
260 | for (int i=0; i<4; i++) { |
261 | char c = *ptr++; |
262 | switch (c) { |
263 | case '0': case '1': case '2': case '3': case '4': |
264 | case '5': case '6': case '7': case '8': case '9': |
265 | value = (value << 4) + c - '0'; |
266 | break; |
267 | case 'a': case 'b': case 'c': |
268 | case 'd': case 'e': case 'f': |
269 | value = (value << 4) + 10 + c - 'a'; |
270 | break; |
271 | case 'A': case 'B': case 'C': |
272 | case 'D': case 'E': case 'F': |
273 | value = (value << 4) + 10 + c - 'A'; |
274 | break; |
275 | default: |
276 | ShouldNotReachHere(); |
277 | } |
278 | } |
279 | if (buffer == NULL) { |
280 | char utf8_buffer[4]; |
281 | char* next = (char*)utf8_write((u_char*)utf8_buffer, value); |
282 | length += next - utf8_buffer; |
283 | } else { |
284 | char* next = (char*)utf8_write((u_char*)&buffer[length], value); |
285 | length += next - &buffer[length]; |
286 | } |
287 | break; |
288 | } |
289 | case 't': if (buffer != NULL) buffer[length] = '\t'; ptr += 2; length++; break; |
290 | case 'n': if (buffer != NULL) buffer[length] = '\n'; ptr += 2; length++; break; |
291 | case 'r': if (buffer != NULL) buffer[length] = '\r'; ptr += 2; length++; break; |
292 | case 'f': if (buffer != NULL) buffer[length] = '\f'; ptr += 2; length++; break; |
293 | default: |
294 | ShouldNotReachHere(); |
295 | } |
296 | } |
297 | } |
298 | if (round == 0) { |
299 | buffer = NEW_RESOURCE_ARRAY(char, length + 1); |
300 | ptr = quoted_ascii_str; |
301 | } else { |
302 | buffer[length] = '\0'; |
303 | } |
304 | } |
305 | return buffer; |
306 | } |
307 | #endif // !PRODUCT |
308 | |
309 | // Returns NULL if 'c' it not found. This only works as long |
310 | // as 'c' is an ASCII character |
311 | const jbyte* UTF8::strrchr(const jbyte* base, int length, jbyte c) { |
312 | assert(length >= 0, "sanity check" ); |
313 | assert(c >= 0, "does not work for non-ASCII characters" ); |
314 | // Skip backwards in string until 'c' is found or end is reached |
315 | while(--length >= 0 && base[length] != c); |
316 | return (length < 0) ? NULL : &base[length]; |
317 | } |
318 | |
319 | bool UTF8::equal(const jbyte* base1, int length1, const jbyte* base2, int length2) { |
320 | // Length must be the same |
321 | if (length1 != length2) return false; |
322 | for (int i = 0; i < length1; i++) { |
323 | if (base1[i] != base2[i]) return false; |
324 | } |
325 | return true; |
326 | } |
327 | |
328 | bool UTF8::is_supplementary_character(const unsigned char* str) { |
329 | return ((str[0] & 0xFF) == 0xED) && ((str[1] & 0xF0) == 0xA0) && ((str[2] & 0xC0) == 0x80) |
330 | && ((str[3] & 0xFF) == 0xED) && ((str[4] & 0xF0) == 0xB0) && ((str[5] & 0xC0) == 0x80); |
331 | } |
332 | |
333 | jint UTF8::get_supplementary_character(const unsigned char* str) { |
334 | return 0x10000 + ((str[1] & 0x0f) << 16) + ((str[2] & 0x3f) << 10) |
335 | + ((str[4] & 0x0f) << 6) + (str[5] & 0x3f); |
336 | } |
337 | |
338 | bool UTF8::is_legal_utf8(const unsigned char* buffer, int length, |
339 | bool version_leq_47) { |
340 | int i = 0; |
341 | int count = length >> 2; |
342 | for (int k=0; k<count; k++) { |
343 | unsigned char b0 = buffer[i]; |
344 | unsigned char b1 = buffer[i+1]; |
345 | unsigned char b2 = buffer[i+2]; |
346 | unsigned char b3 = buffer[i+3]; |
347 | // For an unsigned char v, |
348 | // (v | v - 1) is < 128 (highest bit 0) for 0 < v < 128; |
349 | // (v | v - 1) is >= 128 (highest bit 1) for v == 0 or v >= 128. |
350 | unsigned char res = b0 | b0 - 1 | |
351 | b1 | b1 - 1 | |
352 | b2 | b2 - 1 | |
353 | b3 | b3 - 1; |
354 | if (res >= 128) break; |
355 | i += 4; |
356 | } |
357 | for(; i < length; i++) { |
358 | unsigned short c; |
359 | // no embedded zeros |
360 | if (buffer[i] == 0) return false; |
361 | if(buffer[i] < 128) { |
362 | continue; |
363 | } |
364 | if ((i + 5) < length) { // see if it's legal supplementary character |
365 | if (UTF8::is_supplementary_character(&buffer[i])) { |
366 | c = UTF8::get_supplementary_character(&buffer[i]); |
367 | i += 5; |
368 | continue; |
369 | } |
370 | } |
371 | switch (buffer[i] >> 4) { |
372 | default: break; |
373 | case 0x8: case 0x9: case 0xA: case 0xB: case 0xF: |
374 | return false; |
375 | case 0xC: case 0xD: // 110xxxxx 10xxxxxx |
376 | c = (buffer[i] & 0x1F) << 6; |
377 | i++; |
378 | if ((i < length) && ((buffer[i] & 0xC0) == 0x80)) { |
379 | c += buffer[i] & 0x3F; |
380 | if (version_leq_47 || c == 0 || c >= 0x80) { |
381 | break; |
382 | } |
383 | } |
384 | return false; |
385 | case 0xE: // 1110xxxx 10xxxxxx 10xxxxxx |
386 | c = (buffer[i] & 0xF) << 12; |
387 | i += 2; |
388 | if ((i < length) && ((buffer[i-1] & 0xC0) == 0x80) && ((buffer[i] & 0xC0) == 0x80)) { |
389 | c += ((buffer[i-1] & 0x3F) << 6) + (buffer[i] & 0x3F); |
390 | if (version_leq_47 || c >= 0x800) { |
391 | break; |
392 | } |
393 | } |
394 | return false; |
395 | } // end of switch |
396 | } // end of for |
397 | return true; |
398 | } |
399 | |
400 | //------------------------------------------------------------------------------------- |
401 | |
402 | bool UNICODE::is_latin1(jchar c) { |
403 | return (c <= 0x00FF); |
404 | } |
405 | |
406 | bool UNICODE::is_latin1(const jchar* base, int length) { |
407 | for (int index = 0; index < length; index++) { |
408 | if (base[index] > 0x00FF) { |
409 | return false; |
410 | } |
411 | } |
412 | return true; |
413 | } |
414 | |
415 | int UNICODE::utf8_size(jchar c) { |
416 | if ((0x0001 <= c) && (c <= 0x007F)) { |
417 | // ASCII character |
418 | return 1; |
419 | } else if (c <= 0x07FF) { |
420 | return 2; |
421 | } else { |
422 | return 3; |
423 | } |
424 | } |
425 | |
426 | int UNICODE::utf8_size(jbyte c) { |
427 | if (c >= 0x01) { |
428 | // ASCII character. Check is equivalent to |
429 | // (0x01 <= c) && (c <= 0x7F) because c is signed. |
430 | return 1; |
431 | } else { |
432 | // Non-ASCII character or 0x00 which needs to be |
433 | // two-byte encoded as 0xC080 in modified UTF-8. |
434 | return 2; |
435 | } |
436 | } |
437 | |
438 | template<typename T> |
439 | int UNICODE::utf8_length(const T* base, int length) { |
440 | int result = 0; |
441 | for (int index = 0; index < length; index++) { |
442 | T c = base[index]; |
443 | result += utf8_size(c); |
444 | } |
445 | return result; |
446 | } |
447 | |
448 | template<typename T> |
449 | char* UNICODE::as_utf8(const T* base, int& length) { |
450 | int utf8_len = utf8_length(base, length); |
451 | u_char* buf = NEW_RESOURCE_ARRAY(u_char, utf8_len + 1); |
452 | char* result = as_utf8(base, length, (char*) buf, utf8_len + 1); |
453 | assert((int) strlen(result) == utf8_len, "length prediction must be correct" ); |
454 | // Set string length to uft8 length |
455 | length = utf8_len; |
456 | return (char*) result; |
457 | } |
458 | |
459 | char* UNICODE::as_utf8(const jchar* base, int length, char* buf, int buflen) { |
460 | u_char* p = (u_char*)buf; |
461 | for (int index = 0; index < length; index++) { |
462 | jchar c = base[index]; |
463 | buflen -= utf8_size(c); |
464 | if (buflen <= 0) break; // string is truncated |
465 | p = utf8_write(p, c); |
466 | } |
467 | *p = '\0'; |
468 | return buf; |
469 | } |
470 | |
471 | char* UNICODE::as_utf8(const jbyte* base, int length, char* buf, int buflen) { |
472 | u_char* p = (u_char*)buf; |
473 | for (int index = 0; index < length; index++) { |
474 | jbyte c = base[index]; |
475 | int sz = utf8_size(c); |
476 | buflen -= sz; |
477 | if (buflen <= 0) break; // string is truncated |
478 | if (sz == 1) { |
479 | // Copy ASCII characters (UTF-8 is ASCII compatible) |
480 | *p++ = c; |
481 | } else { |
482 | // Non-ASCII character or 0x00 which should |
483 | // be encoded as 0xC080 in "modified" UTF8. |
484 | p = utf8_write(p, ((jchar) c) & 0xff); |
485 | } |
486 | } |
487 | *p = '\0'; |
488 | return buf; |
489 | } |
490 | |
491 | void UNICODE::convert_to_utf8(const jchar* base, int length, char* utf8_buffer) { |
492 | for(int index = 0; index < length; index++) { |
493 | utf8_buffer = (char*)utf8_write((u_char*)utf8_buffer, base[index]); |
494 | } |
495 | *utf8_buffer = '\0'; |
496 | } |
497 | |
498 | // returns the quoted ascii length of a unicode string |
499 | template<typename T> |
500 | int UNICODE::quoted_ascii_length(const T* base, int length) { |
501 | int result = 0; |
502 | for (int i = 0; i < length; i++) { |
503 | T c = base[i]; |
504 | if (c >= 32 && c < 127) { |
505 | result++; |
506 | } else { |
507 | result += 6; |
508 | } |
509 | } |
510 | return result; |
511 | } |
512 | |
513 | // converts a unicode string to quoted ascii |
514 | template<typename T> |
515 | void UNICODE::as_quoted_ascii(const T* base, int length, char* buf, int buflen) { |
516 | char* p = buf; |
517 | char* end = buf + buflen; |
518 | for (int index = 0; index < length; index++) { |
519 | T c = base[index]; |
520 | if (c >= 32 && c < 127) { |
521 | if (p + 1 >= end) break; // string is truncated |
522 | *p++ = (char)c; |
523 | } else { |
524 | if (p + 6 >= end) break; // string is truncated |
525 | sprintf(p, "\\u%04x" , c); |
526 | p += 6; |
527 | } |
528 | } |
529 | *p = '\0'; |
530 | } |
531 | |
// Explicit instantiation for all supported types: the templates above are
// defined in this file, so the jbyte and jchar variants are instantiated
// here.
template int UNICODE::utf8_length(const jbyte* base, int length);
template int UNICODE::utf8_length(const jchar* base, int length);
template char* UNICODE::as_utf8(const jbyte* base, int& length);
template char* UNICODE::as_utf8(const jchar* base, int& length);
template int UNICODE::quoted_ascii_length<jbyte>(const jbyte* base, int length);
template int UNICODE::quoted_ascii_length<jchar>(const jchar* base, int length);
template void UNICODE::as_quoted_ascii<jbyte>(const jbyte* base, int length, char* buf, int buflen);
template void UNICODE::as_quoted_ascii<jchar>(const jchar* base, int length, char* buf, int buflen);
541 | |