1/* Convert multibyte character to wide character.
2 Copyright (C) 1999-2002, 2005-2019 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2008.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17
18#include <config.h>
19
20/* Specification. */
21#include <wchar.h>
22
23#if C_LOCALE_MAYBE_EILSEQ
24# include "hard-locale.h"
25# include <locale.h>
26#endif
27
28#if GNULIB_defined_mbstate_t
29/* Implement mbrtowc() on top of mbtowc(). */
30
31# include <errno.h>
32# include <stdlib.h>
33
34# include "localcharset.h"
35# include "streq.h"
36# include "verify.h"
37# include "glthread/lock.h"
38
/* FALLTHROUGH marks a deliberate fall through from one 'case' into the
   next.  GCC 7 and newer get the __fallthrough__ statement attribute;
   every other compiler gets a no-op expression.  A definition supplied
   earlier by the includer takes precedence.  */
# ifndef FALLTHROUGH
#  if 7 <= __GNUC__
#   define FALLTHROUGH __attribute__ ((__fallthrough__))
#  else
#   define FALLTHROUGH ((void) 0)
#  endif
# endif
46
/* Classification of the locale encodings that mbrtowc() has to tell apart.
   locale_enc () returns the member that matches the current locale.  */
typedef enum
{
  enc_other,    /* any encoding not listed below */
  enc_utf8,     /* UTF-8 */
  enc_eucjp,    /* EUC-JP */
  enc_94,       /* EUC-KR, GB2312, BIG5 */
  enc_euctw,    /* EUC-TW */
  enc_gb18030,  /* GB18030 */
  enc_sjis      /* SJIS */
} enc_t;
58static inline enc_t
59locale_enc (void)
60{
61 const char *encoding = locale_charset ();
62 if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
63 return enc_utf8;
64 if (STREQ_OPT (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
65 return enc_eucjp;
66 if (STREQ_OPT (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
67 || STREQ_OPT (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
68 || STREQ_OPT (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
69 return enc_94;
70 if (STREQ_OPT (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
71 return enc_euctw;
72 if (STREQ_OPT (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
73 return enc_gb18030;
74 if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
75 return enc_sjis;
76 return enc_other;
77}
78
79# if GNULIB_WCHAR_SINGLE
80/* When we know that the locale does not change, provide a speedup by
81 caching the value of locale_enc. */
82static int cached_locale_enc = -1;
83static inline enc_t
84locale_enc_cached (void)
85{
86 if (cached_locale_enc < 0)
87 cached_locale_enc = locale_enc ();
88 return cached_locale_enc;
89}
90# else
91/* By default, don't make assumptions, hence no caching. */
92# define locale_enc_cached locale_enc
93# endif
94
95/* This lock protects the internal state of mbtowc against multiple simultaneous
96 calls of mbrtowc. */
97gl_lock_define_initialized(static, mbtowc_lock)
98
99verify (sizeof (mbstate_t) >= 4);
100
101static char internal_state[4];
102
103size_t
104mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
105{
106 char *pstate = (char *)ps;
107
108 if (s == NULL)
109 {
110 pwc = NULL;
111 s = "";
112 n = 1;
113 }
114
115 if (n == 0)
116 return (size_t)(-2);
117
118 /* Here n > 0. */
119
120 if (pstate == NULL)
121 pstate = internal_state;
122
123 {
124 size_t nstate = pstate[0];
125 char buf[4];
126 const char *p;
127 size_t m;
128 enc_t enc;
129 int res;
130
131 switch (nstate)
132 {
133 case 0:
134 p = s;
135 m = n;
136 break;
137 case 3:
138 buf[2] = pstate[3];
139 FALLTHROUGH;
140 case 2:
141 buf[1] = pstate[2];
142 FALLTHROUGH;
143 case 1:
144 buf[0] = pstate[1];
145 p = buf;
146 m = nstate;
147 buf[m++] = s[0];
148 if (n >= 2 && m < 4)
149 {
150 buf[m++] = s[1];
151 if (n >= 3 && m < 4)
152 buf[m++] = s[2];
153 }
154 break;
155 default:
156 errno = EINVAL;
157 return (size_t)(-1);
158 }
159
160 /* Here m > 0. */
161
162 enc = locale_enc_cached ();
163
164 if (enc == enc_utf8) /* UTF-8 */
165 {
166 /* Achieve multi-thread safety by not calling mbtowc() at all. */
167 /* Cf. unistr/u8-mbtouc.c. */
168 unsigned char c = (unsigned char) p[0];
169
170 if (c < 0x80)
171 {
172 if (pwc != NULL)
173 *pwc = c;
174 res = (c == 0 ? 0 : 1);
175 goto success;
176 }
177 if (c >= 0xc2)
178 {
179 if (c < 0xe0)
180 {
181 if (m == 1)
182 goto incomplete;
183 else /* m >= 2 */
184 {
185 unsigned char c2 = (unsigned char) p[1];
186
187 if ((c2 ^ 0x80) < 0x40)
188 {
189 if (pwc != NULL)
190 *pwc = ((unsigned int) (c & 0x1f) << 6)
191 | (unsigned int) (c2 ^ 0x80);
192 res = 2;
193 goto success;
194 }
195 }
196 }
197 else if (c < 0xf0)
198 {
199 if (m == 1)
200 goto incomplete;
201 else
202 {
203 unsigned char c2 = (unsigned char) p[1];
204
205 if ((c2 ^ 0x80) < 0x40
206 && (c >= 0xe1 || c2 >= 0xa0)
207 && (c != 0xed || c2 < 0xa0))
208 {
209 if (m == 2)
210 goto incomplete;
211 else /* m >= 3 */
212 {
213 unsigned char c3 = (unsigned char) p[2];
214
215 if ((c3 ^ 0x80) < 0x40)
216 {
217 if (pwc != NULL)
218 *pwc = ((unsigned int) (c & 0x0f) << 12)
219 | ((unsigned int) (c2 ^ 0x80) << 6)
220 | (unsigned int) (c3 ^ 0x80);
221 res = 3;
222 goto success;
223 }
224 }
225 }
226 }
227 }
228 else if (c <= 0xf4)
229 {
230 if (m == 1)
231 goto incomplete;
232 else
233 {
234 unsigned char c2 = (unsigned char) p[1];
235
236 if ((c2 ^ 0x80) < 0x40
237 && (c >= 0xf1 || c2 >= 0x90)
238 && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
239 {
240 if (m == 2)
241 goto incomplete;
242 else
243 {
244 unsigned char c3 = (unsigned char) p[2];
245
246 if ((c3 ^ 0x80) < 0x40)
247 {
248 if (m == 3)
249 goto incomplete;
250 else /* m >= 4 */
251 {
252 unsigned char c4 = (unsigned char) p[3];
253
254 if ((c4 ^ 0x80) < 0x40)
255 {
256 if (pwc != NULL)
257 *pwc = ((unsigned int) (c & 0x07) << 18)
258 | ((unsigned int) (c2 ^ 0x80) << 12)
259 | ((unsigned int) (c3 ^ 0x80) << 6)
260 | (unsigned int) (c4 ^ 0x80);
261 res = 4;
262 goto success;
263 }
264 }
265 }
266 }
267 }
268 }
269 }
270 }
271 goto invalid;
272 }
273 else
274 {
275 /* The hidden internal state of mbtowc would make this function not
276 multi-thread safe. Achieve multi-thread safety through a lock. */
277 gl_lock_lock (mbtowc_lock);
278
279 /* Put the hidden internal state of mbtowc into its initial state.
280 This is needed at least with glibc, uClibc, and MSVC CRT.
281 See <https://sourceware.org/bugzilla/show_bug.cgi?id=9674>. */
282 mbtowc (NULL, NULL, 0);
283
284 res = mbtowc (pwc, p, m);
285
286 gl_lock_unlock (mbtowc_lock);
287
288 if (res >= 0)
289 {
290 if (pwc != NULL && ((*pwc == 0) != (res == 0)))
291 abort ();
292 goto success;
293 }
294
295 /* mbtowc does not distinguish between invalid and incomplete multibyte
296 sequences. But mbrtowc needs to make this distinction.
297 There are two possible approaches:
298 - Use iconv() and its return value.
299 - Use built-in knowledge about the possible encodings.
300 Given the low quality of implementation of iconv() on the systems
301 that lack mbrtowc(), we use the second approach.
302 The possible encodings are:
303 - 8-bit encodings,
304 - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
305 - UTF-8 (already handled above).
306 Use specialized code for each. */
307 if (m >= 4 || m >= MB_CUR_MAX)
308 goto invalid;
309 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */
310 switch (enc)
311 {
312 /* As a reference for this code, you can use the GNU libiconv
313 implementation. Look for uses of the RET_TOOFEW macro. */
314
315 case enc_eucjp: /* EUC-JP */
316 {
317 if (m == 1)
318 {
319 unsigned char c = (unsigned char) p[0];
320
321 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
322 goto incomplete;
323 }
324 if (m == 2)
325 {
326 unsigned char c = (unsigned char) p[0];
327
328 if (c == 0x8f)
329 {
330 unsigned char c2 = (unsigned char) p[1];
331
332 if (c2 >= 0xa1 && c2 < 0xff)
333 goto incomplete;
334 }
335 }
336 goto invalid;
337 }
338
339 case enc_94: /* EUC-KR, GB2312, BIG5 */
340 {
341 if (m == 1)
342 {
343 unsigned char c = (unsigned char) p[0];
344
345 if (c >= 0xa1 && c < 0xff)
346 goto incomplete;
347 }
348 goto invalid;
349 }
350
351 case enc_euctw: /* EUC-TW */
352 {
353 if (m == 1)
354 {
355 unsigned char c = (unsigned char) p[0];
356
357 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
358 goto incomplete;
359 }
360 else /* m == 2 || m == 3 */
361 {
362 unsigned char c = (unsigned char) p[0];
363
364 if (c == 0x8e)
365 goto incomplete;
366 }
367 goto invalid;
368 }
369
370 case enc_gb18030: /* GB18030 */
371 {
372 if (m == 1)
373 {
374 unsigned char c = (unsigned char) p[0];
375
376 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
377 goto incomplete;
378 }
379 else /* m == 2 || m == 3 */
380 {
381 unsigned char c = (unsigned char) p[0];
382
383 if (c >= 0x90 && c <= 0xe3)
384 {
385 unsigned char c2 = (unsigned char) p[1];
386
387 if (c2 >= 0x30 && c2 <= 0x39)
388 {
389 if (m == 2)
390 goto incomplete;
391 else /* m == 3 */
392 {
393 unsigned char c3 = (unsigned char) p[2];
394
395 if (c3 >= 0x81 && c3 <= 0xfe)
396 goto incomplete;
397 }
398 }
399 }
400 }
401 goto invalid;
402 }
403
404 case enc_sjis: /* SJIS */
405 {
406 if (m == 1)
407 {
408 unsigned char c = (unsigned char) p[0];
409
410 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
411 || (c >= 0xf0 && c <= 0xf9))
412 goto incomplete;
413 }
414 goto invalid;
415 }
416
417 default:
418 /* An unknown multibyte encoding. */
419 goto incomplete;
420 }
421 }
422
423 success:
424 /* res >= 0 is the corrected return value of mbtowc (pwc, p, m). */
425 if (nstate >= (res > 0 ? res : 1))
426 abort ();
427 res -= nstate;
428 pstate[0] = 0;
429 return res;
430
431 incomplete:
432 {
433 size_t k = nstate;
434 /* Here 0 <= k < m < 4. */
435 pstate[++k] = s[0];
436 if (k < m)
437 {
438 pstate[++k] = s[1];
439 if (k < m)
440 pstate[++k] = s[2];
441 }
442 if (k != m)
443 abort ();
444 }
445 pstate[0] = m;
446 return (size_t)(-2);
447
448 invalid:
449 errno = EILSEQ;
450 /* The conversion state is undefined, says POSIX. */
451 return (size_t)(-1);
452 }
453}
454
455#else
456/* Override the system's mbrtowc() function. */
457
458# undef mbrtowc
459
/* Wrapper around the system's mbrtowc() that applies the workarounds
   selected by the MBRTOWC_*_BUG and C_LOCALE_MAYBE_EILSEQ macros.
   Arguments and return value are those of mbrtowc():
     PWC  where to store the wide character, or NULL to discard it;
     S    the multibyte input, or NULL;
     N    the number of bytes of S that may be examined;
     PS   the conversion state, or NULL.
   Returns the number of bytes consumed (0 for the NUL character),
   (size_t) -2 for an incomplete character, or (size_t) -1 on error.  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  size_t ret;
  wchar_t wc;

  /* ISO C defines mbrtowc (pwc, NULL, n, ps) as mbrtowc (NULL, "", 1, ps).
     Do this substitution in every configuration, not only when the system
     function is known to mishandle a NULL s: the workarounds below read *s
     and *pwc, which must therefore never be the caller's NULL s or a *pwc
     that was never written.  */
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }

# if MBRTOWC_EMPTY_INPUT_BUG
  if (n == 0)
    return (size_t) -2;
# endif

  /* From here on pwc is always valid, so the result can be inspected.  */
  if (! pwc)
    pwc = &wc;

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* Parse the rest of the multibyte character byte for byte.  */
        size_t count = 0;
        for (; n > 0; s++, n--)
          {
            ret = mbrtowc (&wc, s, 1, ps);

            if (ret == (size_t)(-1))
              return (size_t)(-1);
            count++;
            if (ret != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                *pwc = wc;
                return (wc == 0 ? 0 : count);
              }
          }
        /* All n bytes were consumed without completing the character.  */
        return (size_t)(-2);
      }
  }
# endif

  ret = mbrtowc (pwc, s, n, ps);

# if MBRTOWC_NUL_RETVAL_BUG
  /* A successful conversion that produced the NUL character must
     return 0.  */
  if (ret < (size_t) -2 && !*pwc)
    return 0;
# endif

# if C_LOCALE_MAYBE_EILSEQ
  /* When hard_locale (LC_CTYPE) is false, do not report a failure; return
     the first byte as a character instead.  */
  if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE))
    {
      unsigned char uc = *s;
      *pwc = uc;
      return 1;
    }
# endif

  return ret;
}
533
534#endif
535