1 | /* Convert multibyte character to wide character. |
2 | Copyright (C) 1999-2002, 2005-2019 Free Software Foundation, Inc. |
3 | Written by Bruno Haible <bruno@clisp.org>, 2008. |
4 | |
5 | This program is free software: you can redistribute it and/or modify |
6 | it under the terms of the GNU General Public License as published by |
7 | the Free Software Foundation; either version 3 of the License, or |
8 | (at your option) any later version. |
9 | |
10 | This program is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | GNU General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU General Public License |
16 | along with this program. If not, see <https://www.gnu.org/licenses/>. */ |
17 | |
18 | #include <config.h> |
19 | |
20 | /* Specification. */ |
21 | #include <wchar.h> |
22 | |
23 | #if C_LOCALE_MAYBE_EILSEQ |
24 | # include "hard-locale.h" |
25 | # include <locale.h> |
26 | #endif |
27 | |
28 | #if GNULIB_defined_mbstate_t |
29 | /* Implement mbrtowc() on top of mbtowc(). */ |
30 | |
31 | # include <errno.h> |
32 | # include <stdlib.h> |
33 | |
34 | # include "localcharset.h" |
35 | # include "streq.h" |
36 | # include "verify.h" |
37 | # include "glthread/lock.h" |
38 | |
/* FALLTHROUGH marks an intentional fall-through between switch cases.
   Compilers that announce themselves as GCC 7 or newer get the statement
   attribute; everything else gets a no-op expression.  A definition supplied
   earlier takes precedence.  */
# ifndef FALLTHROUGH
#  if defined __GNUC__ && __GNUC__ >= 7
#   define FALLTHROUGH __attribute__ ((__fallthrough__))
#  else
#   define FALLTHROUGH ((void) 0)
#  endif
# endif
46 | |
/* Classification of the encoding of the current locale.  Only the encodings
   that get dedicated handling have their own value; all the others are
   lumped together as enc_other.  */
typedef enum
{
  enc_other   = 0,  /* any other encoding */
  enc_utf8    = 1,  /* UTF-8 */
  enc_eucjp   = 2,  /* EUC-JP */
  enc_94      = 3,  /* EUC-KR, GB2312, BIG5 */
  enc_euctw   = 4,  /* EUC-TW */
  enc_gb18030 = 5,  /* GB18030 */
  enc_sjis    = 6   /* SJIS */
} enc_t;
58 | static inline enc_t |
59 | locale_enc (void) |
60 | { |
61 | const char *encoding = locale_charset (); |
62 | if (STREQ_OPT (encoding, "UTF-8" , 'U', 'T', 'F', '-', '8', 0, 0, 0, 0)) |
63 | return enc_utf8; |
64 | if (STREQ_OPT (encoding, "EUC-JP" , 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)) |
65 | return enc_eucjp; |
66 | if (STREQ_OPT (encoding, "EUC-KR" , 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0) |
67 | || STREQ_OPT (encoding, "GB2312" , 'G', 'B', '2', '3', '1', '2', 0, 0, 0) |
68 | || STREQ_OPT (encoding, "BIG5" , 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)) |
69 | return enc_94; |
70 | if (STREQ_OPT (encoding, "EUC-TW" , 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)) |
71 | return enc_euctw; |
72 | if (STREQ_OPT (encoding, "GB18030" , 'G', 'B', '1', '8', '0', '3', '0', 0, 0)) |
73 | return enc_gb18030; |
74 | if (STREQ_OPT (encoding, "SJIS" , 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0)) |
75 | return enc_sjis; |
76 | return enc_other; |
77 | } |
78 | |
# if GNULIB_WCHAR_SINGLE
/* The locale is known not to change, so locale_enc () needs to be evaluated
   only once.  A negative value means "not computed yet".  */
static int cached_locale_enc = -1;

/* Returns locale_enc (), computing it on the first call only.  */
static inline enc_t
locale_enc_cached (void)
{
  int known = cached_locale_enc;

  if (known < 0)
    {
      known = locale_enc ();
      cached_locale_enc = known;
    }
  return known;
}
# else
/* No assumption about the locale can be made here: look the encoding up on
   every call.  */
#  define locale_enc_cached locale_enc
# endif
94 | |
95 | /* This lock protects the internal state of mbtowc against multiple simultaneous |
96 | calls of mbrtowc. */ |
97 | gl_lock_define_initialized(static, mbtowc_lock) |
98 | |
99 | verify (sizeof (mbstate_t) >= 4); |
100 | |
101 | static char internal_state[4]; |
102 | |
103 | size_t |
104 | mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) |
105 | { |
106 | char *pstate = (char *)ps; |
107 | |
108 | if (s == NULL) |
109 | { |
110 | pwc = NULL; |
111 | s = "" ; |
112 | n = 1; |
113 | } |
114 | |
115 | if (n == 0) |
116 | return (size_t)(-2); |
117 | |
118 | /* Here n > 0. */ |
119 | |
120 | if (pstate == NULL) |
121 | pstate = internal_state; |
122 | |
123 | { |
124 | size_t nstate = pstate[0]; |
125 | char buf[4]; |
126 | const char *p; |
127 | size_t m; |
128 | enc_t enc; |
129 | int res; |
130 | |
131 | switch (nstate) |
132 | { |
133 | case 0: |
134 | p = s; |
135 | m = n; |
136 | break; |
137 | case 3: |
138 | buf[2] = pstate[3]; |
139 | FALLTHROUGH; |
140 | case 2: |
141 | buf[1] = pstate[2]; |
142 | FALLTHROUGH; |
143 | case 1: |
144 | buf[0] = pstate[1]; |
145 | p = buf; |
146 | m = nstate; |
147 | buf[m++] = s[0]; |
148 | if (n >= 2 && m < 4) |
149 | { |
150 | buf[m++] = s[1]; |
151 | if (n >= 3 && m < 4) |
152 | buf[m++] = s[2]; |
153 | } |
154 | break; |
155 | default: |
156 | errno = EINVAL; |
157 | return (size_t)(-1); |
158 | } |
159 | |
160 | /* Here m > 0. */ |
161 | |
162 | enc = locale_enc_cached (); |
163 | |
164 | if (enc == enc_utf8) /* UTF-8 */ |
165 | { |
166 | /* Achieve multi-thread safety by not calling mbtowc() at all. */ |
167 | /* Cf. unistr/u8-mbtouc.c. */ |
168 | unsigned char c = (unsigned char) p[0]; |
169 | |
170 | if (c < 0x80) |
171 | { |
172 | if (pwc != NULL) |
173 | *pwc = c; |
174 | res = (c == 0 ? 0 : 1); |
175 | goto success; |
176 | } |
177 | if (c >= 0xc2) |
178 | { |
179 | if (c < 0xe0) |
180 | { |
181 | if (m == 1) |
182 | goto incomplete; |
183 | else /* m >= 2 */ |
184 | { |
185 | unsigned char c2 = (unsigned char) p[1]; |
186 | |
187 | if ((c2 ^ 0x80) < 0x40) |
188 | { |
189 | if (pwc != NULL) |
190 | *pwc = ((unsigned int) (c & 0x1f) << 6) |
191 | | (unsigned int) (c2 ^ 0x80); |
192 | res = 2; |
193 | goto success; |
194 | } |
195 | } |
196 | } |
197 | else if (c < 0xf0) |
198 | { |
199 | if (m == 1) |
200 | goto incomplete; |
201 | else |
202 | { |
203 | unsigned char c2 = (unsigned char) p[1]; |
204 | |
205 | if ((c2 ^ 0x80) < 0x40 |
206 | && (c >= 0xe1 || c2 >= 0xa0) |
207 | && (c != 0xed || c2 < 0xa0)) |
208 | { |
209 | if (m == 2) |
210 | goto incomplete; |
211 | else /* m >= 3 */ |
212 | { |
213 | unsigned char c3 = (unsigned char) p[2]; |
214 | |
215 | if ((c3 ^ 0x80) < 0x40) |
216 | { |
217 | if (pwc != NULL) |
218 | *pwc = ((unsigned int) (c & 0x0f) << 12) |
219 | | ((unsigned int) (c2 ^ 0x80) << 6) |
220 | | (unsigned int) (c3 ^ 0x80); |
221 | res = 3; |
222 | goto success; |
223 | } |
224 | } |
225 | } |
226 | } |
227 | } |
228 | else if (c <= 0xf4) |
229 | { |
230 | if (m == 1) |
231 | goto incomplete; |
232 | else |
233 | { |
234 | unsigned char c2 = (unsigned char) p[1]; |
235 | |
236 | if ((c2 ^ 0x80) < 0x40 |
237 | && (c >= 0xf1 || c2 >= 0x90) |
238 | && (c < 0xf4 || (c == 0xf4 && c2 < 0x90))) |
239 | { |
240 | if (m == 2) |
241 | goto incomplete; |
242 | else |
243 | { |
244 | unsigned char c3 = (unsigned char) p[2]; |
245 | |
246 | if ((c3 ^ 0x80) < 0x40) |
247 | { |
248 | if (m == 3) |
249 | goto incomplete; |
250 | else /* m >= 4 */ |
251 | { |
252 | unsigned char c4 = (unsigned char) p[3]; |
253 | |
254 | if ((c4 ^ 0x80) < 0x40) |
255 | { |
256 | if (pwc != NULL) |
257 | *pwc = ((unsigned int) (c & 0x07) << 18) |
258 | | ((unsigned int) (c2 ^ 0x80) << 12) |
259 | | ((unsigned int) (c3 ^ 0x80) << 6) |
260 | | (unsigned int) (c4 ^ 0x80); |
261 | res = 4; |
262 | goto success; |
263 | } |
264 | } |
265 | } |
266 | } |
267 | } |
268 | } |
269 | } |
270 | } |
271 | goto invalid; |
272 | } |
273 | else |
274 | { |
275 | /* The hidden internal state of mbtowc would make this function not |
276 | multi-thread safe. Achieve multi-thread safety through a lock. */ |
277 | gl_lock_lock (mbtowc_lock); |
278 | |
279 | /* Put the hidden internal state of mbtowc into its initial state. |
280 | This is needed at least with glibc, uClibc, and MSVC CRT. |
281 | See <https://sourceware.org/bugzilla/show_bug.cgi?id=9674>. */ |
282 | mbtowc (NULL, NULL, 0); |
283 | |
284 | res = mbtowc (pwc, p, m); |
285 | |
286 | gl_lock_unlock (mbtowc_lock); |
287 | |
288 | if (res >= 0) |
289 | { |
290 | if (pwc != NULL && ((*pwc == 0) != (res == 0))) |
291 | abort (); |
292 | goto success; |
293 | } |
294 | |
295 | /* mbtowc does not distinguish between invalid and incomplete multibyte |
296 | sequences. But mbrtowc needs to make this distinction. |
297 | There are two possible approaches: |
298 | - Use iconv() and its return value. |
299 | - Use built-in knowledge about the possible encodings. |
300 | Given the low quality of implementation of iconv() on the systems |
301 | that lack mbrtowc(), we use the second approach. |
302 | The possible encodings are: |
303 | - 8-bit encodings, |
304 | - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS, |
305 | - UTF-8 (already handled above). |
306 | Use specialized code for each. */ |
307 | if (m >= 4 || m >= MB_CUR_MAX) |
308 | goto invalid; |
309 | /* Here MB_CUR_MAX > 1 and 0 < m < 4. */ |
310 | switch (enc) |
311 | { |
312 | /* As a reference for this code, you can use the GNU libiconv |
313 | implementation. Look for uses of the RET_TOOFEW macro. */ |
314 | |
315 | case enc_eucjp: /* EUC-JP */ |
316 | { |
317 | if (m == 1) |
318 | { |
319 | unsigned char c = (unsigned char) p[0]; |
320 | |
321 | if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f) |
322 | goto incomplete; |
323 | } |
324 | if (m == 2) |
325 | { |
326 | unsigned char c = (unsigned char) p[0]; |
327 | |
328 | if (c == 0x8f) |
329 | { |
330 | unsigned char c2 = (unsigned char) p[1]; |
331 | |
332 | if (c2 >= 0xa1 && c2 < 0xff) |
333 | goto incomplete; |
334 | } |
335 | } |
336 | goto invalid; |
337 | } |
338 | |
339 | case enc_94: /* EUC-KR, GB2312, BIG5 */ |
340 | { |
341 | if (m == 1) |
342 | { |
343 | unsigned char c = (unsigned char) p[0]; |
344 | |
345 | if (c >= 0xa1 && c < 0xff) |
346 | goto incomplete; |
347 | } |
348 | goto invalid; |
349 | } |
350 | |
351 | case enc_euctw: /* EUC-TW */ |
352 | { |
353 | if (m == 1) |
354 | { |
355 | unsigned char c = (unsigned char) p[0]; |
356 | |
357 | if ((c >= 0xa1 && c < 0xff) || c == 0x8e) |
358 | goto incomplete; |
359 | } |
360 | else /* m == 2 || m == 3 */ |
361 | { |
362 | unsigned char c = (unsigned char) p[0]; |
363 | |
364 | if (c == 0x8e) |
365 | goto incomplete; |
366 | } |
367 | goto invalid; |
368 | } |
369 | |
370 | case enc_gb18030: /* GB18030 */ |
371 | { |
372 | if (m == 1) |
373 | { |
374 | unsigned char c = (unsigned char) p[0]; |
375 | |
376 | if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe)) |
377 | goto incomplete; |
378 | } |
379 | else /* m == 2 || m == 3 */ |
380 | { |
381 | unsigned char c = (unsigned char) p[0]; |
382 | |
383 | if (c >= 0x90 && c <= 0xe3) |
384 | { |
385 | unsigned char c2 = (unsigned char) p[1]; |
386 | |
387 | if (c2 >= 0x30 && c2 <= 0x39) |
388 | { |
389 | if (m == 2) |
390 | goto incomplete; |
391 | else /* m == 3 */ |
392 | { |
393 | unsigned char c3 = (unsigned char) p[2]; |
394 | |
395 | if (c3 >= 0x81 && c3 <= 0xfe) |
396 | goto incomplete; |
397 | } |
398 | } |
399 | } |
400 | } |
401 | goto invalid; |
402 | } |
403 | |
404 | case enc_sjis: /* SJIS */ |
405 | { |
406 | if (m == 1) |
407 | { |
408 | unsigned char c = (unsigned char) p[0]; |
409 | |
410 | if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea) |
411 | || (c >= 0xf0 && c <= 0xf9)) |
412 | goto incomplete; |
413 | } |
414 | goto invalid; |
415 | } |
416 | |
417 | default: |
418 | /* An unknown multibyte encoding. */ |
419 | goto incomplete; |
420 | } |
421 | } |
422 | |
423 | success: |
424 | /* res >= 0 is the corrected return value of mbtowc (pwc, p, m). */ |
425 | if (nstate >= (res > 0 ? res : 1)) |
426 | abort (); |
427 | res -= nstate; |
428 | pstate[0] = 0; |
429 | return res; |
430 | |
431 | incomplete: |
432 | { |
433 | size_t k = nstate; |
434 | /* Here 0 <= k < m < 4. */ |
435 | pstate[++k] = s[0]; |
436 | if (k < m) |
437 | { |
438 | pstate[++k] = s[1]; |
439 | if (k < m) |
440 | pstate[++k] = s[2]; |
441 | } |
442 | if (k != m) |
443 | abort (); |
444 | } |
445 | pstate[0] = m; |
446 | return (size_t)(-2); |
447 | |
448 | invalid: |
449 | errno = EILSEQ; |
450 | /* The conversion state is undefined, says POSIX. */ |
451 | return (size_t)(-1); |
452 | } |
453 | } |
454 | |
455 | #else |
456 | /* Override the system's mbrtowc() function. */ |
457 | |
458 | # undef mbrtowc |
459 | |
/* Wrapper around the system's mbrtowc() that compensates for the defects
   selected at configuration time through MBRTOWC_NULL_ARG2_BUG,
   MBRTOWC_RETVAL_BUG, MBRTOWC_EMPTY_INPUT_BUG, MBRTOWC_NUL_RETVAL_BUG and
   C_LOCALE_MAYBE_EILSEQ.  Arguments and return value are those of
   mbrtowc().  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  wchar_t spare;
  size_t rc;

# if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
  /* A NULL string means: convert the empty string, storing nothing.  */
  if (s == NULL)
    {
      n = 1;
      s = "";
      pwc = NULL;
    }
# endif

# if MBRTOWC_EMPTY_INPUT_BUG
  if (n == 0)
    return (size_t) -2;
# endif

  /* Make sure there is always a place where the result can be stored and
     inspected afterwards.  */
  if (pwc == NULL)
    pwc = &spare;

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  mbsinit() cannot be called on the
       hidden internal state, but it can be called on this variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* A character is pending: feed the remaining bytes one at a time,
           counting how many were needed to complete it.  */
        size_t i;

        for (i = 0; i < n; i++)
          {
            rc = mbrtowc (&spare, s + i, 1, ps);

            if (rc == (size_t) -1)
              return rc;
            if (rc != (size_t) -2)
              {
                /* The multibyte character has been completed.  */
                *pwc = spare;
                return (spare != 0 ? i + 1 : 0);
              }
          }
        return (size_t) -2;
      }
  }
# endif

  rc = mbrtowc (pwc, s, n, ps);

# if MBRTOWC_NUL_RETVAL_BUG
  /* A successfully converted null wide character must yield 0.  */
  if (rc < (size_t) -2 && *pwc == 0)
    return 0;
# endif

# if C_LOCALE_MAYBE_EILSEQ
  /* When hard_locale (LC_CTYPE) is false, a byte that the system function
     rejected or considered incomplete is returned as a character of its
     own.  */
  if (rc >= (size_t) -2 && n != 0 && !hard_locale (LC_CTYPE))
    {
      *pwc = (unsigned char) *s;
      return 1;
    }
# endif

  return rc;
}
533 | |
534 | #endif |
535 | |