1// This is an open source non-commercial project. Dear PVS-Studio, please check
2// it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com
3
4/// mbyte.c: Code specifically for handling multi-byte characters.
5/// Multibyte extensions partly by Sung-Hoon Baek
6///
7/// Strings internal to Nvim are always encoded as UTF-8 (thus the legacy
8/// 'encoding' option is always "utf-8").
9///
10/// The cell width on the display needs to be determined from the character
11/// value. Recognizing UTF-8 bytes is easy: 0xxx.xxxx is a single-byte char,
12/// 10xx.xxxx is a trailing byte, 11xx.xxxx is a leading byte of a multi-byte
13/// character. To make things complicated, up to six composing characters
14/// are allowed. These are drawn on top of the first char. For most editing
15/// the sequence of bytes with composing characters included is considered to
16/// be one character.
17///
18/// UTF-8 is used everywhere in the core. This is in registers, text
19/// manipulation, buffers, etc. Nvim core communicates with external plugins
20/// and GUIs in this encoding.
21///
22/// The encoding of a file is specified with 'fileencoding'. Conversion
23/// is to be done when it's different from "utf-8".
24///
25/// Vim scripts may contain an ":scriptencoding" command. This has an effect
26/// for some commands, like ":menutrans".
27
28#include <inttypes.h>
29#include <stdbool.h>
30#include <string.h>
31#include <wchar.h>
32#include <wctype.h>
33
34#include "nvim/vim.h"
35#include "nvim/ascii.h"
36#ifdef HAVE_LOCALE_H
37# include <locale.h>
38#endif
39#include "nvim/eval.h"
40#include "nvim/path.h"
41#include "nvim/iconv.h"
42#include "nvim/mbyte.h"
43#include "nvim/charset.h"
44#include "nvim/cursor.h"
45#include "nvim/fileio.h"
46#include "nvim/func_attr.h"
47#include "nvim/memline.h"
48#include "nvim/message.h"
49#include "nvim/misc1.h"
50#include "nvim/memory.h"
51#include "nvim/option.h"
52#include "nvim/screen.h"
53#include "nvim/spell.h"
54#include "nvim/strings.h"
55#include "nvim/os/os.h"
56#include "nvim/arabic.h"
57#include "nvim/mark.h"
58
// One table entry describing a range of values with an associated step and
// offset.
// NOTE(review): no user of this type is visible in this part of the file;
// presumably it drives case-conversion tables (every `step`-th value in
// [rangeStart, rangeEnd] maps to itself plus `offset`) -- confirm against
// the code that consumes it.
typedef struct {
  int rangeStart;
  int rangeEnd;
  int step;
  int offset;
} convertStruct;
65
// One inclusive range [first, last] of character values. intable() does a
// binary search over an array of these, so such arrays must be sorted and
// non-overlapping.
struct interval {
  long first;
  long last;
};
70
71#ifdef INCLUDE_GENERATED_DECLARATIONS
72# include "mbyte.c.generated.h"
73# include "unicode_tables.generated.h"
74#endif
75
// Error message format strings; each one takes a single "%s" argument.
// NOTE(review): no use is visible in this part of the file -- presumably they
// are reported when loading a library or one of its functions fails; confirm.
char_u e_loadlib[] = "E370: Could not load library %s";
char_u e_loadfunc[] = "E448: Could not load library function %s";
78
// Lookup table to speed up BYTELEN(): indexed by the first byte of a UTF-8
// sequence, it gives the length in bytes of that sequence. Bytes which are
// illegal when used as the first byte (0x80-0xBF, 0xFE and 0xFF) have a 1.
// The NUL byte has length 1.
const uint8_t utf8len_tab[] = {
  // ?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9 ?A ?B ?C ?D ?E ?F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 1?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 2?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 3?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 4?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 5?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 6?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 7?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 8?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 9?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // A?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // B?
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C?
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D?
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E?
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1,  // F?
};
102
// Like utf8len_tab above, but using a zero for illegal lead bytes
// (0x80-0xBF, 0xFE and 0xFF), so callers can tell them apart from real
// single-byte characters.
const uint8_t utf8len_tab_zero[] = {
  // ?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9 ?A ?B ?C ?D ?E ?F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 1?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 2?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 3?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 4?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 5?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 6?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 7?
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 8?
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 9?
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A?
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B?
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C?
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D?
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E?
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0,  // F?
};
123
/*
 * Canonical encoding names and their properties.
 * "iso-8859-n" is handled by enc_canonize() directly.
 *
 * Each entry holds:
 *   name      canonical encoding name
 *   prop      sum of ENC_* property flags
 *   codepage  a codepage number (0 when none is given) for the 8-bit
 *             entries, a DBCS_* constant for the ENC_DBCS entries
 *
 * Every IDX_* macro is the array index of the entry that follows it, so the
 * macro values must stay in step with the entry order. IDX_COUNT is the
 * total number of entries.
 */
static struct
{ const char *name; int prop; int codepage; }
enc_canon_table[] =
{
#define IDX_LATIN_1 0
  {"latin1", ENC_8BIT + ENC_LATIN1, 1252},
#define IDX_ISO_2 1
  {"iso-8859-2", ENC_8BIT, 0},
#define IDX_ISO_3 2
  {"iso-8859-3", ENC_8BIT, 0},
#define IDX_ISO_4 3
  {"iso-8859-4", ENC_8BIT, 0},
#define IDX_ISO_5 4
  {"iso-8859-5", ENC_8BIT, 0},
#define IDX_ISO_6 5
  {"iso-8859-6", ENC_8BIT, 0},
#define IDX_ISO_7 6
  {"iso-8859-7", ENC_8BIT, 0},
#define IDX_ISO_8 7
  {"iso-8859-8", ENC_8BIT, 0},
#define IDX_ISO_9 8
  {"iso-8859-9", ENC_8BIT, 0},
#define IDX_ISO_10 9
  {"iso-8859-10", ENC_8BIT, 0},
#define IDX_ISO_11 10
  {"iso-8859-11", ENC_8BIT, 0},
#define IDX_ISO_13 11
  {"iso-8859-13", ENC_8BIT, 0},
#define IDX_ISO_14 12
  {"iso-8859-14", ENC_8BIT, 0},
#define IDX_ISO_15 13
  {"iso-8859-15", ENC_8BIT + ENC_LATIN9, 0},
#define IDX_KOI8_R 14
  {"koi8-r", ENC_8BIT, 0},
#define IDX_KOI8_U 15
  {"koi8-u", ENC_8BIT, 0},
#define IDX_UTF8 16
  {"utf-8", ENC_UNICODE, 0},
#define IDX_UCS2 17
  {"ucs-2", ENC_UNICODE + ENC_ENDIAN_B + ENC_2BYTE, 0},
#define IDX_UCS2LE 18
  {"ucs-2le", ENC_UNICODE + ENC_ENDIAN_L + ENC_2BYTE, 0},
#define IDX_UTF16 19
  {"utf-16", ENC_UNICODE + ENC_ENDIAN_B + ENC_2WORD, 0},
#define IDX_UTF16LE 20
  {"utf-16le", ENC_UNICODE + ENC_ENDIAN_L + ENC_2WORD, 0},
#define IDX_UCS4 21
  {"ucs-4", ENC_UNICODE + ENC_ENDIAN_B + ENC_4BYTE, 0},
#define IDX_UCS4LE 22
  {"ucs-4le", ENC_UNICODE + ENC_ENDIAN_L + ENC_4BYTE, 0},

  /* For debugging DBCS encoding on Unix. */
#define IDX_DEBUG 23
  {"debug", ENC_DBCS, DBCS_DEBUG},
#define IDX_EUC_JP 24
  {"euc-jp", ENC_DBCS, DBCS_JPNU},
#define IDX_SJIS 25
  {"sjis", ENC_DBCS, DBCS_JPN},
#define IDX_EUC_KR 26
  {"euc-kr", ENC_DBCS, DBCS_KORU},
#define IDX_EUC_CN 27
  {"euc-cn", ENC_DBCS, DBCS_CHSU},
#define IDX_EUC_TW 28
  {"euc-tw", ENC_DBCS, DBCS_CHTU},
#define IDX_BIG5 29
  {"big5", ENC_DBCS, DBCS_CHT},

  /* MS-DOS and MS-Windows codepages are included here, so that they can be
   * used on Unix too. Most of them are similar to ISO-8859 encodings, but
   * not exactly the same. */
#define IDX_CP437 30
  {"cp437", ENC_8BIT, 437},   /* like iso-8859-1 */
#define IDX_CP737 31
  {"cp737", ENC_8BIT, 737},   /* like iso-8859-7 */
#define IDX_CP775 32
  {"cp775", ENC_8BIT, 775},   /* Baltic */
#define IDX_CP850 33
  {"cp850", ENC_8BIT, 850},   /* like iso-8859-4 */
#define IDX_CP852 34
  {"cp852", ENC_8BIT, 852},   /* like iso-8859-1 */
#define IDX_CP855 35
  {"cp855", ENC_8BIT, 855},   /* like iso-8859-2 */
#define IDX_CP857 36
  {"cp857", ENC_8BIT, 857},   /* like iso-8859-5 */
#define IDX_CP860 37
  {"cp860", ENC_8BIT, 860},   /* like iso-8859-9 */
#define IDX_CP861 38
  {"cp861", ENC_8BIT, 861},   /* like iso-8859-1 */
#define IDX_CP862 39
  {"cp862", ENC_8BIT, 862},   /* like iso-8859-1 */
#define IDX_CP863 40
  {"cp863", ENC_8BIT, 863},   /* like iso-8859-8 */
#define IDX_CP865 41
  {"cp865", ENC_8BIT, 865},   /* like iso-8859-1 */
#define IDX_CP866 42
  {"cp866", ENC_8BIT, 866},   /* like iso-8859-5 */
#define IDX_CP869 43
  {"cp869", ENC_8BIT, 869},   /* like iso-8859-7 */
#define IDX_CP874 44
  {"cp874", ENC_8BIT, 874},   /* Thai */
#define IDX_CP932 45
  {"cp932", ENC_DBCS, DBCS_JPN},
#define IDX_CP936 46
  {"cp936", ENC_DBCS, DBCS_CHS},
#define IDX_CP949 47
  {"cp949", ENC_DBCS, DBCS_KOR},
#define IDX_CP950 48
  {"cp950", ENC_DBCS, DBCS_CHT},
#define IDX_CP1250 49
  {"cp1250", ENC_8BIT, 1250},   /* Czech, Polish, etc. */
#define IDX_CP1251 50
  {"cp1251", ENC_8BIT, 1251},   /* Cyrillic */
  /* cp1252 is considered to be equal to latin1 */
#define IDX_CP1253 51
  {"cp1253", ENC_8BIT, 1253},   /* Greek */
#define IDX_CP1254 52
  {"cp1254", ENC_8BIT, 1254},   /* Turkish */
#define IDX_CP1255 53
  {"cp1255", ENC_8BIT, 1255},   /* Hebrew */
#define IDX_CP1256 54
  {"cp1256", ENC_8BIT, 1256},   /* Arabic */
#define IDX_CP1257 55
  {"cp1257", ENC_8BIT, 1257},   /* Baltic */
#define IDX_CP1258 56
  {"cp1258", ENC_8BIT, 1258},   /* Vietnamese */

#define IDX_MACROMAN 57
  {"macroman", ENC_8BIT + ENC_MACROMAN, 0},   /* Mac OS */
#define IDX_HPROMAN8 58
  {"hp-roman8", ENC_8BIT, 0},   /* HP Roman8 */
#define IDX_COUNT 59
};
260
/*
 * Aliases for encoding names.
 *
 * Each entry maps an alternative name to "canon", the IDX_* index of the
 * canonical entry in enc_canon_table[]. The list is terminated by an entry
 * whose name is NULL.
 *
 * NOTE(review): "950" is listed twice (IDX_CP950 and later IDX_BIG5), and
 * "cp950" appears here although it is also a canonical name. Which entry
 * wins depends on the lookup code, which is not visible in this part of the
 * file -- confirm the intended mapping.
 */
static struct
{ const char *name; int canon; }
enc_alias_table[] =
{
  {"ansi", IDX_LATIN_1},
  {"iso-8859-1", IDX_LATIN_1},
  {"latin2", IDX_ISO_2},
  {"latin3", IDX_ISO_3},
  {"latin4", IDX_ISO_4},
  {"cyrillic", IDX_ISO_5},
  {"arabic", IDX_ISO_6},
  {"greek", IDX_ISO_7},
  {"hebrew", IDX_ISO_8},
  {"latin5", IDX_ISO_9},
  {"turkish", IDX_ISO_9},   /* ? */
  {"latin6", IDX_ISO_10},
  {"nordic", IDX_ISO_10},   /* ? */
  {"thai", IDX_ISO_11},   /* ? */
  {"latin7", IDX_ISO_13},
  {"latin8", IDX_ISO_14},
  {"latin9", IDX_ISO_15},
  {"utf8", IDX_UTF8},
  {"unicode", IDX_UCS2},
  {"ucs2", IDX_UCS2},
  {"ucs2be", IDX_UCS2},
  {"ucs-2be", IDX_UCS2},
  {"ucs2le", IDX_UCS2LE},
  {"utf16", IDX_UTF16},
  {"utf16be", IDX_UTF16},
  {"utf-16be", IDX_UTF16},
  {"utf16le", IDX_UTF16LE},
  {"ucs4", IDX_UCS4},
  {"ucs4be", IDX_UCS4},
  {"ucs-4be", IDX_UCS4},
  {"ucs4le", IDX_UCS4LE},
  {"utf32", IDX_UCS4},
  {"utf-32", IDX_UCS4},
  {"utf32be", IDX_UCS4},
  {"utf-32be", IDX_UCS4},
  {"utf32le", IDX_UCS4LE},
  {"utf-32le", IDX_UCS4LE},
  {"932", IDX_CP932},
  {"949", IDX_CP949},
  {"936", IDX_CP936},
  {"gbk", IDX_CP936},
  {"950", IDX_CP950},
  {"eucjp", IDX_EUC_JP},
  {"unix-jis", IDX_EUC_JP},
  {"ujis", IDX_EUC_JP},
  {"shift-jis", IDX_SJIS},
  {"pck", IDX_SJIS},   /* Sun: PCK */
  {"euckr", IDX_EUC_KR},
  {"5601", IDX_EUC_KR},   /* Sun: KS C 5601 */
  {"euccn", IDX_EUC_CN},
  {"gb2312", IDX_EUC_CN},
  {"euctw", IDX_EUC_TW},
  {"japan", IDX_EUC_JP},
  {"korea", IDX_EUC_KR},
  {"prc", IDX_EUC_CN},
  {"chinese", IDX_EUC_CN},
  {"taiwan", IDX_EUC_TW},
  {"cp950", IDX_BIG5},
  {"950", IDX_BIG5},
  {"mac", IDX_MACROMAN},
  {"mac-roman", IDX_MACROMAN},
  {NULL, 0}
};
331
332/*
333 * Find encoding "name" in the list of canonical encoding names.
334 * Returns -1 if not found.
335 */
336static int enc_canon_search(const char_u *name)
337{
338 int i;
339
340 for (i = 0; i < IDX_COUNT; ++i)
341 if (STRCMP(name, enc_canon_table[i].name) == 0)
342 return i;
343 return -1;
344}
345
346
347
348/*
349 * Find canonical encoding "name" in the list and return its properties.
350 * Returns 0 if not found.
351 */
352int enc_canon_props(const char_u *name)
353{
354 int i;
355
356 i = enc_canon_search(name);
357 if (i >= 0)
358 return enc_canon_table[i].prop;
359 if (STRNCMP(name, "2byte-", 6) == 0)
360 return ENC_DBCS;
361 if (STRNCMP(name, "8bit-", 5) == 0 || STRNCMP(name, "iso-8859-", 9) == 0)
362 return ENC_8BIT;
363 return 0;
364}
365
366/*
367 * Return the size of the BOM for the current buffer:
368 * 0 - no BOM
369 * 2 - UCS-2 or UTF-16 BOM
370 * 4 - UCS-4 BOM
371 * 3 - UTF-8 BOM
372 */
373int bomb_size(void)
374{
375 int n = 0;
376
377 if (curbuf->b_p_bomb && !curbuf->b_p_bin) {
378 if (*curbuf->b_p_fenc == NUL
379 || STRCMP(curbuf->b_p_fenc, "utf-8") == 0) {
380 n = 3;
381 } else if (STRNCMP(curbuf->b_p_fenc, "ucs-2", 5) == 0
382 || STRNCMP(curbuf->b_p_fenc, "utf-16", 6) == 0) {
383 n = 2;
384 } else if (STRNCMP(curbuf->b_p_fenc, "ucs-4", 5) == 0) {
385 n = 4;
386 }
387 }
388 return n;
389}
390
391/*
392 * Remove all BOM from "s" by moving remaining text.
393 */
394void remove_bom(char_u *s)
395{
396 char *p = (char *)s;
397
398 while ((p = strchr(p, 0xef)) != NULL) {
399 if ((uint8_t)p[1] == 0xbb && (uint8_t)p[2] == 0xbf) {
400 STRMOVE(p, p + 3);
401 } else {
402 p++;
403 }
404 }
405}
406
407/*
408 * Get class of pointer:
409 * 0 for blank or NUL
410 * 1 for punctuation
411 * 2 for an (ASCII) word character
412 * >2 for other word characters
413 */
414int mb_get_class(const char_u *p)
415{
416 return mb_get_class_tab(p, curbuf->b_chartab);
417}
418
419int mb_get_class_tab(const char_u *p, const uint64_t *const chartab)
420{
421 if (MB_BYTE2LEN(p[0]) == 1) {
422 if (p[0] == NUL || ascii_iswhite(p[0])) {
423 return 0;
424 }
425 if (vim_iswordc_tab(p[0], chartab)) {
426 return 2;
427 }
428 return 1;
429 }
430 return utf_class_tab(utf_ptr2char(p), chartab);
431}
432
433/*
434 * Return true if "c" is in "table".
435 */
436static bool intable(const struct interval *table, size_t n_items, int c)
437{
438 int mid, bot, top;
439
440 /* first quick check for Latin1 etc. characters */
441 if (c < table[0].first)
442 return false;
443
444 /* binary search in table */
445 bot = 0;
446 top = (int)(n_items - 1);
447 while (top >= bot) {
448 mid = (bot + top) / 2;
449 if (table[mid].last < c)
450 bot = mid + 1;
451 else if (table[mid].first > c)
452 top = mid - 1;
453 else
454 return true;
455 }
456 return false;
457}
458
/// Return the number of display cells that UTF-8 character "c" occupies.
///
/// Is only correct for characters >= 0x80.
///
/// @note Tables `doublewidth` and `ambiguous` are generated by
///       gen_unicode_tables.lua, which must be manually invoked as needed.
///
/// @param[in]  c  Character value.
///
/// @return 2 for a double-width character (with USE_WCHAR_FUNCTIONS: any
///         width above 1 that wcwidth() reports), for a character in
///         `emoji_width` when 'emoji' is set, and for a character in
///         `ambiguous` (East Asian Width class 'A') when 'ambiwidth' starts
///         with 'd'.
///         4 for an unprintable character below 0x100, 6 for an unprintable
///         character of 0x100 and above.
///         1 for everything else.
int utf_char2cells(int c)
{
  if (c >= 0x100) {
#ifdef USE_WCHAR_FUNCTIONS
    //
    // Assume the library function wcwidth() works better than our own
    // stuff.  It should return 1 for ambiguous width chars!
    //
    int n = wcwidth(c);

    if (n < 0) {
      return 6;                 // unprintable, displays <xxxx>
    }
    if (n > 1) {
      return n;
    }
#else
    if (!utf_printable(c)) {
      return 6;                 // unprintable, displays <xxxx>
    }
    if (intable(doublewidth, ARRAY_SIZE(doublewidth), c)) {
      return 2;
    }
#endif
    // Checked in both configurations, after the width lookups above.
    if (p_emoji && intable(emoji_width, ARRAY_SIZE(emoji_width), c)) {
      return 2;
    }
  } else if (c >= 0x80 && !vim_isprintc(c)) {
    // Characters below 0x100 are influenced by 'isprint' option.
    return 4;                   // unprintable, displays <xx>
  }

  // Only reached for characters that were not already given a width above.
  if (c >= 0x80 && *p_ambw == 'd'
      && intable(ambiguous, ARRAY_SIZE(ambiguous), c)) {
    return 2;
  }

  return 1;
}
506
507/// Return the number of display cells character at "*p" occupies.
508/// This doesn't take care of unprintable characters, use ptr2cells() for that.
509int utf_ptr2cells(const char_u *p)
510{
511 int c;
512
513 /* Need to convert to a wide character. */
514 if (*p >= 0x80) {
515 c = utf_ptr2char(p);
516 /* An illegal byte is displayed as <xx>. */
517 if (utf_ptr2len(p) == 1 || c == NUL)
518 return 4;
519 /* If the char is ASCII it must be an overlong sequence. */
520 if (c < 0x80)
521 return char2cells(c);
522 return utf_char2cells(c);
523 }
524 return 1;
525}
526
527/// Like utf_ptr2cells(), but limit string length to "size".
528/// For an empty string or truncated character returns 1.
529int utf_ptr2cells_len(const char_u *p, int size)
530{
531 int c;
532
533 /* Need to convert to a wide character. */
534 if (size > 0 && *p >= 0x80) {
535 if (utf_ptr2len_len(p, size) < utf8len_tab[*p])
536 return 1; /* truncated */
537 c = utf_ptr2char(p);
538 /* An illegal byte is displayed as <xx>. */
539 if (utf_ptr2len(p) == 1 || c == NUL)
540 return 4;
541 /* If the char is ASCII it must be an overlong sequence. */
542 if (c < 0x80)
543 return char2cells(c);
544 return utf_char2cells(c);
545 }
546 return 1;
547}
548
549/// Calculate the number of cells occupied by string `str`.
550///
551/// @param str The source string, may not be NULL, must be a NUL-terminated
552/// string.
553/// @return The number of cells occupied by string `str`
554size_t mb_string2cells(const char_u *str)
555{
556 size_t clen = 0;
557
558 for (const char_u *p = str; *p != NUL; p += (*mb_ptr2len)(p)) {
559 clen += utf_ptr2cells(p);
560 }
561
562 return clen;
563}
564
565/// Get the number of cells occupied by string `str` with maximum length `size`
566///
567/// @param str The source string, may not be NULL, must be a NUL-terminated
568/// string.
569/// @param size maximum length of string. It will terminate on earlier NUL.
570/// @return The number of cells occupied by string `str`
571size_t mb_string2cells_len(const char_u *str, size_t size)
572{
573 size_t clen = 0;
574
575 for (const char_u *p = str; *p != NUL && p < str+size;
576 p += utf_ptr2len_len(p, size+(p-str))) {
577 clen += utf_ptr2cells(p);
578 }
579
580 return clen;
581}
582
/// Convert a UTF-8 byte sequence to a wide character
///
/// If the sequence is illegal or truncated by a NUL then the first byte is
/// returned.
/// For an overlong sequence this may return zero.
/// Does not include composing characters for obvious reasons.
///
/// Bytes after the first are only read while every earlier byte was a valid
/// trailing byte (10xx.xxxx), so reading stops at a terminating NUL.
///
/// @param[in]  p  String to convert.
///
/// @return Unicode codepoint or byte value.
int utf_ptr2char(const char_u *const p)
  FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
{
  if (p[0] < 0x80) {  // Be quick for ASCII.
    return p[0];
  }

  // Zero for an illegal lead byte, which then fails the "len > 1" test.
  const uint8_t len = utf8len_tab_zero[p[0]];
  if (len > 1 && (p[1] & 0xc0) == 0x80) {
    if (len == 2) {
      return ((p[0] & 0x1f) << 6) + (p[1] & 0x3f);
    }
    if ((p[2] & 0xc0) == 0x80) {
      if (len == 3) {
        return (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6)
                + (p[2] & 0x3f));
      }
      if ((p[3] & 0xc0) == 0x80) {
        if (len == 4) {
          return (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
                  + ((p[2] & 0x3f) << 6) + (p[3] & 0x3f));
        }
        if ((p[4] & 0xc0) == 0x80) {
          if (len == 5) {
            return (((p[0] & 0x03) << 24) + ((p[1] & 0x3f) << 18)
                    + ((p[2] & 0x3f) << 12) + ((p[3] & 0x3f) << 6)
                    + (p[4] & 0x3f));
          }
          if ((p[5] & 0xc0) == 0x80 && len == 6) {
            return (((p[0] & 0x01) << 30) + ((p[1] & 0x3f) << 24)
                    + ((p[2] & 0x3f) << 18) + ((p[3] & 0x3f) << 12)
                    + ((p[4] & 0x3f) << 6) + (p[5] & 0x3f));
          }
        }
      }
    }
  }
  // Illegal value: just return the first byte.
  return p[0];
}
633
/*
 * Convert a UTF-8 byte sequence to a wide character.
 * String is assumed to be terminated by NUL or after "n" bytes, whichever
 * comes first.
 * The function is safe in the sense that it never accesses memory beyond the
 * first "n" bytes of "s".
 *
 * On success, returns decoded codepoint, advances "s" to the beginning of
 * next character and decreases "n" accordingly.
 *
 * If end of string was reached, returns 0 and, if "n" > 0, advances "s" past
 * NUL byte.
 *
 * If byte sequence is illegal or incomplete, returns -1 and does not advance
 * "s".
 *
 * "s" and "n" are both in/out parameters: the read position and the number
 * of bytes left in the buffer.
 */
static int utf_safe_read_char_adv(const char_u **s, size_t *n)
{
  int c;

  if (*n == 0)   /* end of buffer */
    return 0;

  /* 0 for an illegal lead byte, 1 for ASCII/NUL, 2..6 for a lead byte. */
  uint8_t k = utf8len_tab_zero[**s];

  if (k == 1) {
    /* ASCII character or NUL */
    (*n)--;
    return *(*s)++;
  }

  if (k <= *n) {
    /* We have a multibyte sequence and it isn't truncated by buffer
     * limits so utf_ptr2char() is safe to use. Or the first byte is
     * illegal (k=0), and it's also safe to use utf_ptr2char(). */
    c = utf_ptr2char(*s);

    /* On failure, utf_ptr2char() returns the first byte, so here we
     * check equality with the first byte. The only non-ASCII character
     * which equals the first byte of its own UTF-8 representation is
     * U+00C3 (UTF-8: 0xC3 0x83), so need to check that special case too.
     * It's safe even if n=1, else we would have k=2 > n. */
    if (c != (int)(**s) || (c == 0xC3 && (*s)[1] == 0x83)) {
      /* byte sequence was successfully decoded */
      *s += k;
      *n -= k;
      return c;
    }
  }

  /* byte sequence is incomplete or illegal */
  return -1;
}
687
688/*
689 * Get character at **pp and advance *pp to the next character.
690 * Note: composing characters are skipped!
691 */
692int mb_ptr2char_adv(const char_u **const pp)
693{
694 int c;
695
696 c = utf_ptr2char(*pp);
697 *pp += (*mb_ptr2len)(*pp);
698 return c;
699}
700
701/*
702 * Get character at **pp and advance *pp to the next character.
703 * Note: composing characters are returned as separate characters.
704 */
705int mb_cptr2char_adv(const char_u **pp)
706{
707 int c;
708
709 c = utf_ptr2char(*pp);
710 *pp += utf_ptr2len(*pp);
711 return c;
712}
713
714/*
715 * Check if the character pointed to by "p2" is a composing character when it
716 * comes after "p1". For Arabic sometimes "ab" is replaced with "c", which
717 * behaves like a composing character.
718 */
719bool utf_composinglike(const char_u *p1, const char_u *p2)
720{
721 int c2;
722
723 c2 = utf_ptr2char(p2);
724 if (utf_iscomposing(c2))
725 return true;
726 if (!arabic_maycombine(c2))
727 return false;
728 return arabic_combine(utf_ptr2char(p1), c2);
729}
730
/// Convert a UTF-8 string to a wide character
///
/// Also gets up to #MAX_MCO composing characters.
///
/// @param[in]   p    String to convert.
/// @param[out]  pcc  Location where to store composing characters. Must have
///                   space at least for #MAX_MCO + 1 elements. When fewer
///                   than #MAX_MCO are found the list is terminated by a 0;
///                   when exactly #MAX_MCO are found no terminator is
///                   written.
///
/// @return leading character.
int utfc_ptr2char(const char_u *p, int *pcc)
{
  int len;
  int c;
  int cc;
  int i = 0;

  c = utf_ptr2char(p);
  len = utf_ptr2len(p);

  /* Only accept a composing char when the first char isn't illegal. */
  if ((len > 1 || *p < 0x80)
      && p[len] >= 0x80
      && UTF_COMPOSINGLIKE(p, p + len)) {
    cc = utf_ptr2char(p + len);
    for (;; ) {
      pcc[i++] = cc;
      if (i == MAX_MCO)
        break;
      /* Step over the composing char just stored, then look at the next
       * one; "cc" is updated as a side effect of the condition. */
      len += utf_ptr2len(p + len);
      if (p[len] < 0x80 || !utf_iscomposing(cc = utf_ptr2char(p + len)))
        break;
    }
  }

  if (i < MAX_MCO)      /* last composing char must be 0 */
    pcc[i] = 0;

  return c;
}
769
770/*
771 * Convert a UTF-8 byte string to a wide character. Also get up to MAX_MCO
772 * composing characters. Use no more than p[maxlen].
773 *
774 * @param [out] pcc: composing chars, last one is 0
775 */
776int utfc_ptr2char_len(const char_u *p, int *pcc, int maxlen)
777{
778#define IS_COMPOSING(s1, s2, s3) \
779 (i == 0 ? UTF_COMPOSINGLIKE((s1), (s2)) : utf_iscomposing((s3)))
780
781 assert(maxlen > 0);
782
783 int i = 0;
784
785 int len = utf_ptr2len_len(p, maxlen);
786 // Is it safe to use utf_ptr2char()?
787 bool safe = len > 1 && len <= maxlen;
788 int c = safe ? utf_ptr2char(p) : *p;
789
790 // Only accept a composing char when the first char isn't illegal.
791 if ((safe || c < 0x80) && len < maxlen && p[len] >= 0x80) {
792 for (; i < MAX_MCO; i++) {
793 int len_cc = utf_ptr2len_len(p + len, maxlen - len);
794 safe = len_cc > 1 && len_cc <= maxlen - len;
795 if (!safe || (pcc[i] = utf_ptr2char(p + len)) < 0x80
796 || !IS_COMPOSING(p, p + len, pcc[i])) {
797 break;
798 }
799 len += len_cc;
800 }
801 }
802
803 if (i < MAX_MCO) {
804 // last composing char must be 0
805 pcc[i] = 0;
806 }
807
808 return c;
809#undef ISCOMPOSING
810}
811
812/// Get the length of a UTF-8 byte sequence representing a single codepoint
813///
814/// @param[in] p UTF-8 string.
815///
816/// @return Sequence length, 0 for empty string and 1 for non-UTF-8 byte
817/// sequence.
818int utf_ptr2len(const char_u *const p)
819 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
820{
821 if (*p == NUL) {
822 return 0;
823 }
824 const int len = utf8len_tab[*p];
825 for (int i = 1; i < len; i++) {
826 if ((p[i] & 0xc0) != 0x80) {
827 return 1;
828 }
829 }
830 return len;
831}
832
833/*
834 * Return length of UTF-8 character, obtained from the first byte.
835 * "b" must be between 0 and 255!
836 * Returns 1 for an invalid first byte value.
837 */
838int utf_byte2len(int b)
839{
840 return utf8len_tab[b];
841}
842
843/*
844 * Get the length of UTF-8 byte sequence "p[size]". Does not include any
845 * following composing characters.
846 * Returns 1 for "".
847 * Returns 1 for an illegal byte sequence (also in incomplete byte seq.).
848 * Returns number > "size" for an incomplete byte sequence.
849 * Never returns zero.
850 */
851int utf_ptr2len_len(const char_u *p, int size)
852{
853 int len;
854 int i;
855 int m;
856
857 len = utf8len_tab[*p];
858 if (len == 1)
859 return 1; /* NUL, ascii or illegal lead byte */
860 if (len > size)
861 m = size; /* incomplete byte sequence. */
862 else
863 m = len;
864 for (i = 1; i < m; ++i)
865 if ((p[i] & 0xc0) != 0x80)
866 return 1;
867 return len;
868}
869
/// Return the number of bytes occupied by a UTF-8 character in a string
///
/// This includes following composing characters.
///
/// @param[in]  p  NUL-terminated UTF-8 string.
///
/// @return Length in bytes of the character at "p" plus all composing
///         characters that follow it; 0 for an empty string and 1 for an
///         illegal byte.
int utfc_ptr2len(const char_u *const p)
  FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
{
  uint8_t b0 = (uint8_t)(*p);

  if (b0 == NUL) {
    return 0;
  }
  // An ASCII char followed by ASCII (or NUL) cannot have composing chars.
  if (b0 < 0x80 && p[1] < 0x80) {  // be quick for ASCII
    return 1;
  }

  // Skip over first UTF-8 char, stopping at a NUL byte.
  int len = utf_ptr2len(p);

  // Check for illegal byte.
  if (len == 1 && b0 >= 0x80) {
    return 1;
  }

  // Check for composing characters.  We can handle only the first six, but
  // skip all of them (otherwise the cursor would get stuck).
  int prevlen = 0;
  for (;;) {
    // "p + prevlen" is the character just before the candidate at "p + len".
    if (p[len] < 0x80 || !UTF_COMPOSINGLIKE(p + prevlen, p + len)) {
      return len;
    }

    // Skip over composing char.
    prevlen = len;
    len += utf_ptr2len(p + len);
  }
}
906
/*
 * Return the number of bytes the UTF-8 encoding of the character at "p[size]"
 * takes.  This includes following composing characters.
 * "size" is the number of bytes available at "p".
 * Returns 0 for an empty string.
 * Returns 1 for an illegal char or an incomplete byte sequence.
 */
int utfc_ptr2len_len(const char_u *p, int size)
{
  int len;
  int prevlen;

  if (size < 1 || *p == NUL)
    return 0;
  /* p[1] is only looked at when it is inside "size". */
  if (p[0] < 0x80 && (size == 1 || p[1] < 0x80))   /* be quick for ASCII */
    return 1;

  /* Skip over first UTF-8 char, stopping at a NUL byte. */
  len = utf_ptr2len_len(p, size);

  /* Check for illegal byte and incomplete byte sequence. */
  if ((len == 1 && p[0] >= 0x80) || len > size)
    return 1;

  /*
   * Check for composing characters.  We can handle only the first six, but
   * skip all of them (otherwise the cursor would get stuck).
   */
  prevlen = 0;
  while (len < size) {
    int len_next_char;

    if (p[len] < 0x80)
      break;

    /*
     * Next character length should not go beyond size to ensure that
     * UTF_COMPOSINGLIKE(...) does not read beyond size.
     */
    len_next_char = utf_ptr2len_len(p + len, size - len);
    if (len_next_char > size - len)
      break;

    /* "p + prevlen" is the character just before the one at "p + len". */
    if (!UTF_COMPOSINGLIKE(p + prevlen, p + len))
      break;

    /* Skip over composing char */
    prevlen = len;
    len += len_next_char;
  }
  return len;
}
958
/// Determine how many bytes certain unicode codepoint will occupy
///
/// @param[in]  c  Character value.
///
/// @return Number of bytes (1-6) of the UTF-8 encoding of "c". Values below
///         0x80, including negative ones, give 1.
int utf_char2len(const int c)
{
  // limits[n - 1] is the first value that no longer fits in n bytes.
  static const int limits[] = { 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
  int len = 1;

  while (len <= 5 && c >= limits[len - 1]) {
    len++;
  }
  return len;
}
976
977/// Convert Unicode character to UTF-8 string
978///
979/// @param c character to convert to \p buf
980/// @param[out] buf UTF-8 string generated from \p c, does not add \0
981/// @return Number of bytes (1-6).
982int utf_char2bytes(const int c, char_u *const buf)
983{
984 if (c < 0x80) { // 7 bits
985 buf[0] = c;
986 return 1;
987 } else if (c < 0x800) { // 11 bits
988 buf[0] = 0xc0 + ((unsigned)c >> 6);
989 buf[1] = 0x80 + (c & 0x3f);
990 return 2;
991 } else if (c < 0x10000) { // 16 bits
992 buf[0] = 0xe0 + ((unsigned)c >> 12);
993 buf[1] = 0x80 + (((unsigned)c >> 6) & 0x3f);
994 buf[2] = 0x80 + (c & 0x3f);
995 return 3;
996 } else if (c < 0x200000) { // 21 bits
997 buf[0] = 0xf0 + ((unsigned)c >> 18);
998 buf[1] = 0x80 + (((unsigned)c >> 12) & 0x3f);
999 buf[2] = 0x80 + (((unsigned)c >> 6) & 0x3f);
1000 buf[3] = 0x80 + (c & 0x3f);
1001 return 4;
1002 } else if (c < 0x4000000) { // 26 bits
1003 buf[0] = 0xf8 + ((unsigned)c >> 24);
1004 buf[1] = 0x80 + (((unsigned)c >> 18) & 0x3f);
1005 buf[2] = 0x80 + (((unsigned)c >> 12) & 0x3f);
1006 buf[3] = 0x80 + (((unsigned)c >> 6) & 0x3f);
1007 buf[4] = 0x80 + (c & 0x3f);
1008 return 5;
1009 } else { // 31 bits
1010 buf[0] = 0xfc + ((unsigned)c >> 30);
1011 buf[1] = 0x80 + (((unsigned)c >> 24) & 0x3f);
1012 buf[2] = 0x80 + (((unsigned)c >> 18) & 0x3f);
1013 buf[3] = 0x80 + (((unsigned)c >> 12) & 0x3f);
1014 buf[4] = 0x80 + (((unsigned)c >> 6) & 0x3f);
1015 buf[5] = 0x80 + (c & 0x3f);
1016 return 6;
1017 }
1018}
1019
1020/*
1021 * Return true if "c" is a composing UTF-8 character. This means it will be
1022 * drawn on top of the preceding character.
1023 * Based on code from Markus Kuhn.
1024 */
1025bool utf_iscomposing(int c)
1026{
1027 return intable(combining, ARRAY_SIZE(combining), c);
1028}
1029
1030/*
1031 * Return true for characters that can be displayed in a normal way.
1032 * Only for characters of 0x100 and above!
1033 */
1034bool utf_printable(int c)
1035{
1036#ifdef USE_WCHAR_FUNCTIONS
1037 /*
1038 * Assume the iswprint() library function works better than our own stuff.
1039 */
1040 return iswprint(c);
1041#else
1042 /* Sorted list of non-overlapping intervals.
1043 * 0xd800-0xdfff is reserved for UTF-16, actually illegal. */
1044 static struct interval nonprint[] =
1045 {
1046 {0x070f, 0x070f}, {0x180b, 0x180e}, {0x200b, 0x200f}, {0x202a, 0x202e},
1047 {0x206a, 0x206f}, {0xd800, 0xdfff}, {0xfeff, 0xfeff}, {0xfff9, 0xfffb},
1048 {0xfffe, 0xffff}
1049 };
1050
1051 return !intable(nonprint, ARRAY_SIZE(nonprint), c);
1052#endif
1053}
1054
1055/*
1056 * Get class of a Unicode character.
1057 * 0: white space
1058 * 1: punctuation
1059 * 2 or bigger: some class of word character.
1060 */
1061int utf_class(const int c)
1062{
1063 return utf_class_tab(c, curbuf->b_chartab);
1064}
1065
1066int utf_class_tab(const int c, const uint64_t *const chartab)
1067{
1068 /* sorted list of non-overlapping intervals */
1069 static struct clinterval {
1070 unsigned int first;
1071 unsigned int last;
1072 unsigned int class;
1073 } classes[] = {
1074 { 0x037e, 0x037e, 1 }, // Greek question mark
1075 { 0x0387, 0x0387, 1 }, // Greek ano teleia
1076 { 0x055a, 0x055f, 1 }, // Armenian punctuation
1077 { 0x0589, 0x0589, 1 }, // Armenian full stop
1078 { 0x05be, 0x05be, 1 },
1079 { 0x05c0, 0x05c0, 1 },
1080 { 0x05c3, 0x05c3, 1 },
1081 { 0x05f3, 0x05f4, 1 },
1082 { 0x060c, 0x060c, 1 },
1083 { 0x061b, 0x061b, 1 },
1084 { 0x061f, 0x061f, 1 },
1085 { 0x066a, 0x066d, 1 },
1086 { 0x06d4, 0x06d4, 1 },
1087 { 0x0700, 0x070d, 1 }, // Syriac punctuation
1088 { 0x0964, 0x0965, 1 },
1089 { 0x0970, 0x0970, 1 },
1090 { 0x0df4, 0x0df4, 1 },
1091 { 0x0e4f, 0x0e4f, 1 },
1092 { 0x0e5a, 0x0e5b, 1 },
1093 { 0x0f04, 0x0f12, 1 },
1094 { 0x0f3a, 0x0f3d, 1 },
1095 { 0x0f85, 0x0f85, 1 },
1096 { 0x104a, 0x104f, 1 }, // Myanmar punctuation
1097 { 0x10fb, 0x10fb, 1 }, // Georgian punctuation
1098 { 0x1361, 0x1368, 1 }, // Ethiopic punctuation
1099 { 0x166d, 0x166e, 1 }, // Canadian Syl. punctuation
1100 { 0x1680, 0x1680, 0 },
1101 { 0x169b, 0x169c, 1 },
1102 { 0x16eb, 0x16ed, 1 },
1103 { 0x1735, 0x1736, 1 },
1104 { 0x17d4, 0x17dc, 1 }, // Khmer punctuation
1105 { 0x1800, 0x180a, 1 }, // Mongolian punctuation
1106 { 0x2000, 0x200b, 0 }, // spaces
1107 { 0x200c, 0x2027, 1 }, // punctuation and symbols
1108 { 0x2028, 0x2029, 0 },
1109 { 0x202a, 0x202e, 1 }, // punctuation and symbols
1110 { 0x202f, 0x202f, 0 },
1111 { 0x2030, 0x205e, 1 }, // punctuation and symbols
1112 { 0x205f, 0x205f, 0 },
1113 { 0x2060, 0x27ff, 1 }, // punctuation and symbols
1114 { 0x2070, 0x207f, 0x2070 }, // superscript
1115 { 0x2080, 0x2094, 0x2080 }, // subscript
1116 { 0x20a0, 0x27ff, 1 }, // all kinds of symbols
1117 { 0x2800, 0x28ff, 0x2800 }, // braille
1118 { 0x2900, 0x2998, 1 }, // arrows, brackets, etc.
1119 { 0x29d8, 0x29db, 1 },
1120 { 0x29fc, 0x29fd, 1 },
1121 { 0x2e00, 0x2e7f, 1 }, // supplemental punctuation
1122 { 0x3000, 0x3000, 0 }, // ideographic space
1123 { 0x3001, 0x3020, 1 }, // ideographic punctuation
1124 { 0x3030, 0x3030, 1 },
1125 { 0x303d, 0x303d, 1 },
1126 { 0x3040, 0x309f, 0x3040 }, // Hiragana
1127 { 0x30a0, 0x30ff, 0x30a0 }, // Katakana
1128 { 0x3300, 0x9fff, 0x4e00 }, // CJK Ideographs
1129 { 0xac00, 0xd7a3, 0xac00 }, // Hangul Syllables
1130 { 0xf900, 0xfaff, 0x4e00 }, // CJK Ideographs
1131 { 0xfd3e, 0xfd3f, 1 },
1132 { 0xfe30, 0xfe6b, 1 }, // punctuation forms
1133 { 0xff00, 0xff0f, 1 }, // half/fullwidth ASCII
1134 { 0xff1a, 0xff20, 1 }, // half/fullwidth ASCII
1135 { 0xff3b, 0xff40, 1 }, // half/fullwidth ASCII
1136 { 0xff5b, 0xff65, 1 }, // half/fullwidth ASCII
1137 { 0x1d000, 0x1d24f, 1 }, // Musical notation
1138 { 0x1d400, 0x1d7ff, 1 }, // Mathematical Alphanumeric Symbols
1139 { 0x1f000, 0x1f2ff, 1 }, // Game pieces; enclosed characters
1140 { 0x1f300, 0x1f9ff, 1 }, // Many symbol blocks
1141 { 0x20000, 0x2a6df, 0x4e00 }, // CJK Ideographs
1142 { 0x2a700, 0x2b73f, 0x4e00 }, // CJK Ideographs
1143 { 0x2b740, 0x2b81f, 0x4e00 }, // CJK Ideographs
1144 { 0x2f800, 0x2fa1f, 0x4e00 }, // CJK Ideographs
1145 };
1146 int bot = 0;
1147 int top = ARRAY_SIZE(classes) - 1;
1148 int mid;
1149
1150 /* First quick check for Latin1 characters, use 'iskeyword'. */
1151 if (c < 0x100) {
1152 if (c == ' ' || c == '\t' || c == NUL || c == 0xa0) {
1153 return 0; // blank
1154 }
1155 if (vim_iswordc_tab(c, chartab)) {
1156 return 2; // word character
1157 }
1158 return 1; // punctuation
1159 }
1160
1161 /* binary search in table */
1162 while (top >= bot) {
1163 mid = (bot + top) / 2;
1164 if (classes[mid].last < (unsigned int)c)
1165 bot = mid + 1;
1166 else if (classes[mid].first > (unsigned int)c)
1167 top = mid - 1;
1168 else
1169 return (int)classes[mid].class;
1170 }
1171
1172 // emoji
1173 if (intable(emoji_all, ARRAY_SIZE(emoji_all), c)) {
1174 return 3;
1175 }
1176
1177 /* most other characters are "word" characters */
1178 return 2;
1179}
1180
1181bool utf_ambiguous_width(int c)
1182{
1183 return c >= 0x80 && (intable(ambiguous, ARRAY_SIZE(ambiguous), c)
1184 || intable(emoji_all, ARRAY_SIZE(emoji_all), c));
1185}
1186
1187/*
1188 * Generic conversion function for case operations.
1189 * Return the converted equivalent of "a", which is a UCS-4 character. Use
1190 * the given conversion "table". Uses binary search on "table".
1191 */
1192static int utf_convert(int a, const convertStruct *const table, size_t n_items)
1193{
1194 size_t start, mid, end; /* indices into table */
1195
1196 start = 0;
1197 end = n_items;
1198 while (start < end) {
1199 /* need to search further */
1200 mid = (end + start) / 2;
1201 if (table[mid].rangeEnd < a)
1202 start = mid + 1;
1203 else
1204 end = mid;
1205 }
1206 if (start < n_items
1207 && table[start].rangeStart <= a
1208 && a <= table[start].rangeEnd
1209 && (a - table[start].rangeStart) % table[start].step == 0)
1210 return a + table[start].offset;
1211 else
1212 return a;
1213}
1214
1215/*
1216 * Return the folded-case equivalent of "a", which is a UCS-4 character. Uses
1217 * simple case folding.
1218 */
1219int utf_fold(int a)
1220{
1221 if (a < 0x80) {
1222 // be fast for ASCII
1223 return a >= 0x41 && a <= 0x5a ? a + 32 : a;
1224 }
1225 return utf_convert(a, foldCase, ARRAY_SIZE(foldCase));
1226}
1227
1228// Vim's own character class functions. These exist because many library
1229// islower()/toupper() etc. do not work properly: they crash when used with
1230// invalid values or can't handle latin1 when the locale is C.
1231// Speed is most important here.
1232
1233/// Return the upper-case equivalent of "a", which is a UCS-4 character. Use
1234/// simple case folding.
1235int mb_toupper(int a)
1236{
1237 /* If 'casemap' contains "keepascii" use ASCII style toupper(). */
1238 if (a < 128 && (cmp_flags & CMP_KEEPASCII))
1239 return TOUPPER_ASC(a);
1240
1241#if defined(__STDC_ISO_10646__)
1242 /* If towupper() is available and handles Unicode, use it. */
1243 if (!(cmp_flags & CMP_INTERNAL))
1244 return towupper(a);
1245#endif
1246
1247 /* For characters below 128 use locale sensitive toupper(). */
1248 if (a < 128)
1249 return TOUPPER_LOC(a);
1250
1251 /* For any other characters use the above mapping table. */
1252 return utf_convert(a, toUpper, ARRAY_SIZE(toUpper));
1253}
1254
/// Check whether "a" is a lower-case character: one that mb_toupper()
/// changes, or the German sharp s.
bool mb_islower(int a)
{
  const bool changes_when_uppercased = mb_toupper(a) != a;

  // German sharp s is lower case but has no upper case equivalent.
  return changes_when_uppercased || a == 0xdf;
}
1260
1261/// Return the lower-case equivalent of "a", which is a UCS-4 character. Use
1262/// simple case folding.
1263int mb_tolower(int a)
1264{
1265 /* If 'casemap' contains "keepascii" use ASCII style tolower(). */
1266 if (a < 128 && (cmp_flags & CMP_KEEPASCII))
1267 return TOLOWER_ASC(a);
1268
1269#if defined(__STDC_ISO_10646__)
1270 /* If towlower() is available and handles Unicode, use it. */
1271 if (!(cmp_flags & CMP_INTERNAL))
1272 return towlower(a);
1273#endif
1274
1275 /* For characters below 128 use locale sensitive tolower(). */
1276 if (a < 128)
1277 return TOLOWER_LOC(a);
1278
1279 /* For any other characters use the above mapping table. */
1280 return utf_convert(a, toLower, ARRAY_SIZE(toLower));
1281}
1282
/// Check whether "a" is an upper-case character, i.e. one that mb_tolower()
/// changes.
bool mb_isupper(int a)
{
  const int lowered = mb_tolower(a);

  return lowered != a;
}
1287
1288static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1,
1289 size_t n2)
1290{
1291 int c1, c2, cdiff;
1292 char_u buffer[6];
1293
1294 for (;; ) {
1295 c1 = utf_safe_read_char_adv(&s1, &n1);
1296 c2 = utf_safe_read_char_adv(&s2, &n2);
1297
1298 if (c1 <= 0 || c2 <= 0)
1299 break;
1300
1301 if (c1 == c2)
1302 continue;
1303
1304 cdiff = utf_fold(c1) - utf_fold(c2);
1305 if (cdiff != 0)
1306 return cdiff;
1307 }
1308
1309 /* some string ended or has an incomplete/illegal character sequence */
1310
1311 if (c1 == 0 || c2 == 0) {
1312 /* some string ended. shorter string is smaller */
1313 if (c1 == 0 && c2 == 0)
1314 return 0;
1315 return c1 == 0 ? -1 : 1;
1316 }
1317
1318 /* Continue with bytewise comparison to produce some result that
1319 * would make comparison operations involving this function transitive.
1320 *
1321 * If only one string had an error, comparison should be made with
1322 * folded version of the other string. In this case it is enough
1323 * to fold just one character to determine the result of comparison. */
1324
1325 if (c1 != -1 && c2 == -1) {
1326 n1 = utf_char2bytes(utf_fold(c1), buffer);
1327 s1 = buffer;
1328 } else if (c2 != -1 && c1 == -1) {
1329 n2 = utf_char2bytes(utf_fold(c2), buffer);
1330 s2 = buffer;
1331 }
1332
1333 while (n1 > 0 && n2 > 0 && *s1 != NUL && *s2 != NUL) {
1334 cdiff = (int)(*s1) - (int)(*s2);
1335 if (cdiff != 0)
1336 return cdiff;
1337
1338 s1++;
1339 s2++;
1340 n1--;
1341 n2--;
1342 }
1343
1344 if (n1 > 0 && *s1 == NUL)
1345 n1 = 0;
1346 if (n2 > 0 && *s2 == NUL)
1347 n2 = 0;
1348
1349 if (n1 == 0 && n2 == 0)
1350 return 0;
1351 return n1 == 0 ? -1 : 1;
1352}
1353
1354#ifdef WIN32
1355#ifndef CP_UTF8
1356# define CP_UTF8 65001 /* magic number from winnls.h */
1357#endif
1358
1359/// Converts string from UTF-8 to UTF-16.
1360///
1361/// @param utf8 UTF-8 string.
1362/// @param utf8len Length of `utf8`. May be -1 if `utf8` is NUL-terminated.
1363/// @param utf16[out,allocated] NUL-terminated UTF-16 string, or NULL on error
1364/// @return 0 on success, or libuv error code
1365int utf8_to_utf16(const char *utf8, int utf8len, wchar_t **utf16)
1366 FUNC_ATTR_NONNULL_ALL
1367{
1368 // Compute the length needed for the converted UTF-16 string.
1369 int bufsize = MultiByteToWideChar(CP_UTF8,
1370 0, // dwFlags: must be 0 for UTF-8
1371 utf8, // -1: process up to NUL
1372 utf8len,
1373 NULL,
1374 0); // 0: get length, don't convert
1375 if (bufsize == 0) {
1376 *utf16 = NULL;
1377 return uv_translate_sys_error(GetLastError());
1378 }
1379
1380 // Allocate the destination buffer adding an extra byte for the terminating
1381 // NULL. If `utf8len` is not -1 MultiByteToWideChar will not add it, so
1382 // we do it ourselves always, just in case.
1383 *utf16 = xmalloc(sizeof(wchar_t) * (bufsize + 1));
1384
1385 // Convert to UTF-16.
1386 bufsize = MultiByteToWideChar(CP_UTF8, 0, utf8, utf8len, *utf16, bufsize);
1387 if (bufsize == 0) {
1388 XFREE_CLEAR(*utf16);
1389 return uv_translate_sys_error(GetLastError());
1390 }
1391
1392 (*utf16)[bufsize] = L'\0';
1393 return 0;
1394}
1395
1396/// Converts string from UTF-16 to UTF-8.
1397///
1398/// @param utf16 UTF-16 string.
1399/// @param utf16len Length of `utf16`. May be -1 if `utf16` is NUL-terminated.
1400/// @param utf8[out,allocated] NUL-terminated UTF-8 string, or NULL on error
1401/// @return 0 on success, or libuv error code
1402int utf16_to_utf8(const wchar_t *utf16, int utf16len, char **utf8)
1403 FUNC_ATTR_NONNULL_ALL
1404{
1405 // Compute the space needed for the converted UTF-8 string.
1406 DWORD bufsize = WideCharToMultiByte(CP_UTF8,
1407 0,
1408 utf16,
1409 utf16len,
1410 NULL,
1411 0,
1412 NULL,
1413 NULL);
1414 if (bufsize == 0) {
1415 *utf8 = NULL;
1416 return uv_translate_sys_error(GetLastError());
1417 }
1418
1419 // Allocate the destination buffer adding an extra byte for the terminating
1420 // NULL. If `utf16len` is not -1 WideCharToMultiByte will not add it, so
1421 // we do it ourselves always, just in case.
1422 *utf8 = xmalloc(bufsize + 1);
1423
1424 // Convert to UTF-8.
1425 bufsize = WideCharToMultiByte(CP_UTF8,
1426 0,
1427 utf16,
1428 utf16len,
1429 *utf8,
1430 bufsize,
1431 NULL,
1432 NULL);
1433 if (bufsize == 0) {
1434 XFREE_CLEAR(*utf8);
1435 return uv_translate_sys_error(GetLastError());
1436 }
1437
1438 (*utf8)[bufsize] = '\0';
1439 return 0;
1440}
1441
1442#endif
1443
1444/// Measure the length of a string in corresponding UTF-32 and UTF-16 units.
1445///
1446/// Invalid UTF-8 bytes, or embedded surrogates, count as one code point/unit
1447/// each.
1448///
1449/// The out parameters are incremented. This is used to measure the size of
1450/// a buffer region consisting of multiple line segments.
1451///
1452/// @param s the string
1453/// @param len maximum length (an earlier NUL terminates)
1454/// @param[out] codepoints incremented with UTF-32 code point size
1455/// @param[out] codeunits incremented with UTF-16 code unit size
1456void mb_utflen(const char_u *s, size_t len, size_t *codepoints,
1457 size_t *codeunits)
1458 FUNC_ATTR_NONNULL_ALL
1459{
1460 size_t count = 0, extra = 0;
1461 size_t clen;
1462 for (size_t i = 0; i < len && s[i] != NUL; i += clen) {
1463 clen = utf_ptr2len_len(s+i, len-i);
1464 // NB: gets the byte value of invalid sequence bytes.
1465 // we only care whether the char fits in the BMP or not
1466 int c = (clen > 1) ? utf_ptr2char(s+i) : s[i];
1467 count++;
1468 if (c > 0xFFFF) {
1469 extra++;
1470 }
1471 }
1472 *codepoints += count;
1473 *codeunits += count + extra;
1474}
1475
1476ssize_t mb_utf_index_to_bytes(const char_u *s, size_t len,
1477 size_t index, bool use_utf16_units)
1478 FUNC_ATTR_NONNULL_ALL
1479{
1480 size_t count = 0;
1481 size_t clen, i;
1482 if (index == 0) {
1483 return 0;
1484 }
1485 for (i = 0; i < len && s[i] != NUL; i += clen) {
1486 clen = utf_ptr2len_len(s+i, len-i);
1487 // NB: gets the byte value of invalid sequence bytes.
1488 // we only care whether the char fits in the BMP or not
1489 int c = (clen > 1) ? utf_ptr2char(s+i) : s[i];
1490 count++;
1491 if (use_utf16_units && c > 0xFFFF) {
1492 count++;
1493 }
1494 if (count >= index) {
1495 return i+clen;
1496 }
1497 }
1498 return -1;
1499}
1500
1501
1502/*
1503 * Version of strnicmp() that handles multi-byte characters.
1504 * Needed for Big5, Shift-JIS and UTF-8 encoding. Other DBCS encodings can
1505 * probably use strnicmp(), because there are no ASCII characters in the
1506 * second byte.
1507 * Returns zero if s1 and s2 are equal (ignoring case), the difference between
1508 * two characters otherwise.
1509 */
1510int mb_strnicmp(const char_u *s1, const char_u *s2, const size_t nn)
1511{
1512 return utf_strnicmp(s1, s2, nn, nn);
1513}
1514
1515/// Compare strings case-insensitively
1516///
1517/// @note We need to call mb_stricmp() even when we aren't dealing with
1518/// a multi-byte encoding because mb_stricmp() takes care of all ASCII and
1519/// non-ascii encodings, including characters with umlauts in latin1,
1520/// etc., while STRICMP() only handles the system locale version, which
1521/// often does not handle non-ascii properly.
1522///
1523/// @param[in] s1 First string to compare, not more then #MAXCOL characters.
1524/// @param[in] s2 Second string to compare, not more then #MAXCOL characters.
1525///
1526/// @return 0 if strings are equal, <0 if s1 < s2, >0 if s1 > s2.
1527int mb_stricmp(const char *s1, const char *s2)
1528{
1529 return mb_strnicmp((const char_u *)s1, (const char_u *)s2, MAXCOL);
1530}
1531
1532/*
1533 * "g8": show bytes of the UTF-8 char under the cursor. Doesn't matter what
1534 * 'encoding' has been set to.
1535 */
1536void show_utf8(void)
1537{
1538 int len;
1539 int rlen = 0;
1540 char_u *line;
1541 int clen;
1542 int i;
1543
1544 /* Get the byte length of the char under the cursor, including composing
1545 * characters. */
1546 line = get_cursor_pos_ptr();
1547 len = utfc_ptr2len(line);
1548 if (len == 0) {
1549 MSG("NUL");
1550 return;
1551 }
1552
1553 clen = 0;
1554 for (i = 0; i < len; ++i) {
1555 if (clen == 0) {
1556 /* start of (composing) character, get its length */
1557 if (i > 0) {
1558 STRCPY(IObuff + rlen, "+ ");
1559 rlen += 2;
1560 }
1561 clen = utf_ptr2len(line + i);
1562 }
1563 sprintf((char *)IObuff + rlen, "%02x ",
1564 (line[i] == NL) ? NUL : line[i]); /* NUL is stored as NL */
1565 --clen;
1566 rlen += (int)STRLEN(IObuff + rlen);
1567 if (rlen > IOSIZE - 20)
1568 break;
1569 }
1570
1571 msg(IObuff);
1572}
1573
/// Return offset from "p" to the first byte of the character it points into.
/// If "p" points to the NUL at the end of the string return 0.
/// Returns 0 when already at the first byte of a character.
///
/// The scan also steps back over composing characters (and Arabic characters
/// that combine with the one before them), so the offset is measured to the
/// start of the character they are attached to.
///
/// @param[in]  base  Start of the string; the scan never moves before it.
/// @param[in]  p     Position inside the string.
///
/// @return Number of bytes between the head byte found and "p"; 0 when an
///         illegal sequence is detected.
int utf_head_off(const char_u *base, const char_u *p)
{
  int c;
  int len;

  if (*p < 0x80)                // be quick for ASCII
    return 0;

  // Skip backwards over trailing bytes: 10xx.xxxx
  // Skip backwards again if on a composing char.
  const char_u *q;
  for (q = p;; --q) {
    // Move s to the last byte of this char.
    const char_u *s;
    for (s = q; (s[1] & 0xc0) == 0x80; ++s) {}

    // Move q to the first byte of this char.
    while (q > base && (*q & 0xc0) == 0x80)
      --q;
    // Check for illegal sequence.  Do allow an illegal byte after where we
    // started: the length from the lead byte must match either the full
    // run of trail bytes (up to s) or the run up to "p".
    len = utf8len_tab[*q];
    if (len != (int)(s - q + 1) && len != (int)(p - q + 1))
      return 0;

    // Reached the start of the string, nothing before it to combine with.
    if (q <= base)
      break;

    c = utf_ptr2char(q);
    // A composing char belongs to the char before it: keep going back.
    if (utf_iscomposing(c))
      continue;

    if (arabic_maycombine(c)) {
      // Step back to take a sneak peek at the previous char.
      const char_u *j = q;
      --j;
      // Move j to the first byte of this char.
      while (j > base && (*j & 0xc0) == 0x80)
        --j;
      // When the previous char combines with "c", keep going back.
      if (arabic_combine(utf_ptr2char(j), c))
        continue;
    }
    break;
  }

  return (int)(p - q);
}
1624
1625/// Copy a character, advancing the pointers
1626///
1627/// @param[in,out] fp Source of the character to copy.
1628/// @param[in,out] tp Destination to copy to.
1629void mb_copy_char(const char_u **const fp, char_u **const tp)
1630{
1631 const size_t l = (size_t)utfc_ptr2len(*fp);
1632
1633 memmove(*tp, *fp, l);
1634 *tp += l;
1635 *fp += l;
1636}
1637
1638/*
1639 * Return the offset from "p" to the first byte of a character. When "p" is
1640 * at the start of a character 0 is returned, otherwise the offset to the next
1641 * character. Can start anywhere in a stream of bytes.
1642 */
1643int mb_off_next(char_u *base, char_u *p)
1644{
1645 int i;
1646 int j;
1647
1648 if (*p < 0x80) { // be quick for ASCII
1649 return 0;
1650 }
1651
1652 // Find the next character that isn't 10xx.xxxx
1653 for (i = 0; (p[i] & 0xc0) == 0x80; i++) {}
1654 if (i > 0) {
1655 // Check for illegal sequence.
1656 for (j = 0; p - j > base; j++) {
1657 if ((p[-j] & 0xc0) != 0x80) {
1658 break;
1659 }
1660 }
1661 if (utf8len_tab[p[-j]] != i + j) {
1662 return 0;
1663 }
1664 }
1665 return i;
1666}
1667
1668/*
1669 * Return the offset from "p" to the last byte of the character it points
1670 * into. Can start anywhere in a stream of bytes.
1671 */
1672int mb_tail_off(char_u *base, char_u *p)
1673{
1674 int i;
1675 int j;
1676
1677 if (*p == NUL)
1678 return 0;
1679
1680 // Find the last character that is 10xx.xxxx
1681 for (i = 0; (p[i + 1] & 0xc0) == 0x80; i++) {}
1682
1683 // Check for illegal sequence.
1684 for (j = 0; p - j > base; j++) {
1685 if ((p[-j] & 0xc0) != 0x80) {
1686 break;
1687 }
1688 }
1689
1690 if (utf8len_tab[p[-j]] != i + j + 1) {
1691 return 0;
1692 }
1693 return i;
1694}
1695
/// Find the next illegal byte sequence.
///
/// Searches from the cursor position to the end of the buffer.  When an
/// illegal sequence is found the cursor is placed on it; otherwise the
/// cursor is restored and beep_flush() is called.
void utf_find_illegal(void)
{
  pos_T pos = curwin->w_cursor;  // saved to restore when nothing is found
  char_u      *p;
  int len;
  vimconv_T vimconv;
  char_u      *tofree = NULL;    // converted copy of the current line, if any

  vimconv.vc_type = CONV_NONE;
  if (enc_canon_props(curbuf->b_p_fenc) & ENC_8BIT) {
    // 'encoding' is "utf-8" but we are editing a 8-bit encoded file,
    // possibly a utf-8 file with illegal bytes.  Setup for conversion
    // from utf-8 to 'fileencoding'.
    convert_setup(&vimconv, p_enc, curbuf->b_p_fenc);
  }

  curwin->w_cursor.coladd = 0;
  for (;; ) {
    p = get_cursor_pos_ptr();
    if (vimconv.vc_type != CONV_NONE) {
      // Scan the converted text instead of the buffer text.
      xfree(tofree);
      tofree = string_convert(&vimconv, p, NULL);
      if (tofree == NULL)
        break;
      p = tofree;
    }

    while (*p != NUL) {
      // Illegal means that there are not enough trail bytes (checked by
      // utf_ptr2len()) or too many of them (overlong sequence).
      len = utf_ptr2len(p);
      if (*p >= 0x80 && (len == 1
                         || utf_char2len(utf_ptr2char(p)) != len)) {
        if (vimconv.vc_type == CONV_NONE)
          // "p" points into the buffer line: advance by the byte distance.
          curwin->w_cursor.col += (colnr_T)(p - get_cursor_pos_ptr());
        else {
          int l;

          // "p" points into the converted copy.  Use the byte offset found
          // there as the number of characters to advance in the buffer line.
          len = (int)(p - tofree);
          for (p = get_cursor_pos_ptr(); *p != NUL && len-- > 0; p += l) {
            l = utf_ptr2len(p);
            curwin->w_cursor.col += l;
          }
        }
        goto theend;
      }
      p += len;
    }
    // Nothing on this line: continue at the start of the next one, unless
    // this was the last line.
    if (curwin->w_cursor.lnum == curbuf->b_ml.ml_line_count)
      break;
    ++curwin->w_cursor.lnum;
    curwin->w_cursor.col = 0;
  }

  // didn't find it: don't move and beep
  curwin->w_cursor = pos;
  beep_flush();

theend:
  xfree(tofree);
  // Calling with NULL names cleans up the conversion.
  convert_setup(&vimconv, NULL, NULL);
}
1761
1762/*
1763 * If the cursor moves on an trail byte, set the cursor on the lead byte.
1764 * Thus it moves left if necessary.
1765 */
1766void mb_adjust_cursor(void)
1767{
1768 mark_mb_adjustpos(curbuf, &curwin->w_cursor);
1769}
1770
1771/// Checks and adjusts cursor column. Not mode-dependent.
1772/// @see check_cursor_col_win
1773///
1774/// @param win_ Places cursor on a valid column for this window.
1775void mb_check_adjust_col(void *win_)
1776{
1777 win_T *win = (win_T *)win_;
1778 colnr_T oldcol = win->w_cursor.col;
1779
1780 // Column 0 is always valid.
1781 if (oldcol != 0) {
1782 char_u *p = ml_get_buf(win->w_buffer, win->w_cursor.lnum, false);
1783 colnr_T len = (colnr_T)STRLEN(p);
1784
1785 // Empty line or invalid column?
1786 if (len == 0 || oldcol < 0) {
1787 win->w_cursor.col = 0;
1788 } else {
1789 // Cursor column too big for line?
1790 if (oldcol > len) {
1791 win->w_cursor.col = len - 1;
1792 }
1793 // Move the cursor to the head byte.
1794 win->w_cursor.col -= utf_head_off(p, p + win->w_cursor.col);
1795 }
1796
1797 // Reset `coladd` when the cursor would be on the right half of a
1798 // double-wide character.
1799 if (win->w_cursor.coladd == 1 && p[win->w_cursor.col] != TAB
1800 && vim_isprintc(utf_ptr2char(p + win->w_cursor.col))
1801 && ptr2cells(p + win->w_cursor.col) > 1) {
1802 win->w_cursor.coladd = 0;
1803 }
1804 }
1805}
1806
1807/*
1808 * Return a pointer to the character before "*p", if there is one.
1809 */
1810char_u * mb_prevptr(
1811 char_u *line, /* start of the string */
1812 char_u *p
1813 )
1814{
1815 if (p > line) {
1816 MB_PTR_BACK(line, p);
1817 }
1818 return p;
1819}
1820
1821/*
1822 * Return the character length of "str". Each multi-byte character (with
1823 * following composing characters) counts as one.
1824 */
1825int mb_charlen(char_u *str)
1826{
1827 char_u *p = str;
1828 int count;
1829
1830 if (p == NULL)
1831 return 0;
1832
1833 for (count = 0; *p != NUL; count++)
1834 p += (*mb_ptr2len)(p);
1835
1836 return count;
1837}
1838
1839/*
1840 * Like mb_charlen() but for a string with specified length.
1841 */
1842int mb_charlen_len(char_u *str, int len)
1843{
1844 char_u *p = str;
1845 int count;
1846
1847 for (count = 0; *p != NUL && p < str + len; count++)
1848 p += (*mb_ptr2len)(p);
1849
1850 return count;
1851}
1852
/// Try to unescape a multibyte character
///
/// Used for the rhs and lhs of the mappings.
///
/// @param[in,out]  pp  String to unescape. Is advanced to just after the bytes
///                     that form a multibyte character.  Left untouched when
///                     NULL is returned.
///
/// @return Unescaped string if it is a multibyte character, NULL if no
///         multibyte character was found. Returns a static buffer, always one
///         and the same.
const char *mb_unescape(const char **const pp)
  FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
{
  static char buf[6];   // result buffer, reused by every call
  size_t buf_idx = 0;   // number of unescaped bytes collected so far
  uint8_t *str = (uint8_t *)(*pp);

  // Must translate K_SPECIAL KS_SPECIAL KE_FILLER to K_SPECIAL and CSI
  // KS_EXTRA KE_CSI to CSI.
  // Maximum length of a utf-8 character is 4 bytes.
  for (size_t str_idx = 0; str[str_idx] != NUL && buf_idx < 4; str_idx++) {
    if (str[str_idx] == K_SPECIAL
        && str[str_idx + 1] == KS_SPECIAL
        && str[str_idx + 2] == KE_FILLER) {
      // Escaped K_SPECIAL byte: three input bytes give one output byte.
      buf[buf_idx++] = (char)K_SPECIAL;
      str_idx += 2;
    } else if ((str[str_idx] == K_SPECIAL)
               && str[str_idx + 1] == KS_EXTRA
               && str[str_idx + 2] == KE_CSI) {
      // Escaped CSI byte: three input bytes give one output byte.
      buf[buf_idx++] = (char)CSI;
      str_idx += 2;
    } else if (str[str_idx] == K_SPECIAL) {
      break;  // A special key can't be a multibyte char.
    } else {
      buf[buf_idx++] = (char)str[str_idx];
    }
    buf[buf_idx] = NUL;

    // Return a multi-byte character if it's found.  An illegal sequence
    // will result in a 1 here.
    if (utf_ptr2len((const char_u *)buf) > 1) {
      *pp = (const char *)str + str_idx + 1;
      return buf;
    }

    // Bail out quickly for ASCII.
    if ((uint8_t)buf[0] < 128) {
      break;
    }
  }
  return NULL;
}
1905
1906
1907/*
1908 * Skip the Vim specific head of a 'encoding' name.
1909 */
1910char_u * enc_skip(char_u *p)
1911{
1912 if (STRNCMP(p, "2byte-", 6) == 0)
1913 return p + 6;
1914 if (STRNCMP(p, "8bit-", 5) == 0)
1915 return p + 5;
1916 return p;
1917}
1918
1919/*
1920 * Find the canonical name for encoding "enc".
1921 * When the name isn't recognized, returns "enc" itself, but with all lower
1922 * case characters and '_' replaced with '-'.
1923 * Returns an allocated string.
1924 */
1925char_u *enc_canonize(char_u *enc) FUNC_ATTR_NONNULL_RET
1926{
1927 char_u *p, *s;
1928 int i;
1929
1930 if (STRCMP(enc, "default") == 0) {
1931 // Use the default encoding as found by set_init_1().
1932 return vim_strsave(fenc_default);
1933 }
1934
1935 /* copy "enc" to allocated memory, with room for two '-' */
1936 char_u *r = xmalloc(STRLEN(enc) + 3);
1937 /* Make it all lower case and replace '_' with '-'. */
1938 p = r;
1939 for (s = enc; *s != NUL; ++s) {
1940 if (*s == '_')
1941 *p++ = '-';
1942 else
1943 *p++ = TOLOWER_ASC(*s);
1944 }
1945 *p = NUL;
1946
1947 /* Skip "2byte-" and "8bit-". */
1948 p = enc_skip(r);
1949
1950 /* Change "microsoft-cp" to "cp". Used in some spell files. */
1951 if (STRNCMP(p, "microsoft-cp", 12) == 0)
1952 STRMOVE(p, p + 10);
1953
1954 /* "iso8859" -> "iso-8859" */
1955 if (STRNCMP(p, "iso8859", 7) == 0) {
1956 STRMOVE(p + 4, p + 3);
1957 p[3] = '-';
1958 }
1959
1960 /* "iso-8859n" -> "iso-8859-n" */
1961 if (STRNCMP(p, "iso-8859", 8) == 0 && p[8] != '-') {
1962 STRMOVE(p + 9, p + 8);
1963 p[8] = '-';
1964 }
1965
1966 /* "latin-N" -> "latinN" */
1967 if (STRNCMP(p, "latin-", 6) == 0)
1968 STRMOVE(p + 5, p + 6);
1969
1970 if (enc_canon_search(p) >= 0) {
1971 /* canonical name can be used unmodified */
1972 if (p != r)
1973 STRMOVE(r, p);
1974 } else if ((i = enc_alias_search(p)) >= 0) {
1975 /* alias recognized, get canonical name */
1976 xfree(r);
1977 r = vim_strsave((char_u *)enc_canon_table[i].name);
1978 }
1979 return r;
1980}
1981
1982/*
1983 * Search for an encoding alias of "name".
1984 * Returns -1 when not found.
1985 */
1986static int enc_alias_search(char_u *name)
1987{
1988 int i;
1989
1990 for (i = 0; enc_alias_table[i].name != NULL; ++i)
1991 if (STRCMP(name, enc_alias_table[i].name) == 0)
1992 return enc_alias_table[i].canon;
1993 return -1;
1994}
1995
1996
1997#ifdef HAVE_LANGINFO_H
1998# include <langinfo.h>
1999#endif
2000
2001/*
2002 * Get the canonicalized encoding of the current locale.
2003 * Returns an allocated string when successful, NULL when not.
2004 */
2005char_u * enc_locale(void)
2006{
2007 int i;
2008 char buf[50];
2009
2010 const char *s;
2011# ifdef HAVE_NL_LANGINFO_CODESET
2012 if (!(s = nl_langinfo(CODESET)) || *s == NUL)
2013# endif
2014 {
2015# if defined(HAVE_LOCALE_H)
2016 if (!(s = setlocale(LC_CTYPE, NULL)) || *s == NUL)
2017# endif
2018 {
2019 if ((s = os_getenv("LC_ALL"))) {
2020 if ((s = os_getenv("LC_CTYPE"))) {
2021 s = os_getenv("LANG");
2022 }
2023 }
2024 }
2025 }
2026
2027 if (!s) {
2028 return NULL;
2029 }
2030
2031 // The most generic locale format is:
2032 // language[_territory][.codeset][@modifier][+special][,[sponsor][_revision]]
2033 // If there is a '.' remove the part before it.
2034 // if there is something after the codeset, remove it.
2035 // Make the name lowercase and replace '_' with '-'.
2036 // Exception: "ja_JP.EUC" == "euc-jp", "zh_CN.EUC" = "euc-cn",
2037 // "ko_KR.EUC" == "euc-kr"
2038 const char *p = (char *)vim_strchr((char_u *)s, '.');
2039 if (p != NULL) {
2040 if (p > s + 2 && !STRNICMP(p + 1, "EUC", 3)
2041 && !isalnum((int)p[4]) && p[4] != '-' && p[-3] == '_') {
2042 // Copy "XY.EUC" to "euc-XY" to buf[10].
2043 memmove(buf, "euc-", 4);
2044 buf[4] = (ASCII_ISALNUM(p[-2]) ? TOLOWER_ASC(p[-2]) : 0);
2045 buf[5] = (ASCII_ISALNUM(p[-1]) ? TOLOWER_ASC(p[-1]) : 0);
2046 buf[6] = NUL;
2047 } else {
2048 s = p + 1;
2049 goto enc_locale_copy_enc;
2050 }
2051 } else {
2052enc_locale_copy_enc:
2053 for (i = 0; i < (int)sizeof(buf) - 1 && s[i] != NUL; i++) {
2054 if (s[i] == '_' || s[i] == '-') {
2055 buf[i] = '-';
2056 } else if (ASCII_ISALNUM((uint8_t)s[i])) {
2057 buf[i] = TOLOWER_ASC(s[i]);
2058 } else {
2059 break;
2060 }
2061 }
2062 buf[i] = NUL;
2063 }
2064
2065 return enc_canonize((char_u *)buf);
2066}
2067
2068# if defined(HAVE_ICONV)
2069
2070
2071/*
2072 * Call iconv_open() with a check if iconv() works properly (there are broken
2073 * versions).
2074 * Returns (void *)-1 if failed.
2075 * (should return iconv_t, but that causes problems with prototypes).
2076 */
2077void * my_iconv_open(char_u *to, char_u *from)
2078{
2079 iconv_t fd;
2080#define ICONV_TESTLEN 400
2081 char_u tobuf[ICONV_TESTLEN];
2082 char *p;
2083 size_t tolen;
2084 static WorkingStatus iconv_working = kUnknown;
2085
2086 if (iconv_working == kBroken)
2087 return (void *)-1; /* detected a broken iconv() previously */
2088
2089 fd = iconv_open((char *)enc_skip(to), (char *)enc_skip(from));
2090
2091 if (fd != (iconv_t)-1 && iconv_working == kUnknown) {
2092 /*
2093 * Do a dummy iconv() call to check if it actually works. There is a
2094 * version of iconv() on Linux that is broken. We can't ignore it,
2095 * because it's wide-spread. The symptoms are that after outputting
2096 * the initial shift state the "to" pointer is NULL and conversion
2097 * stops for no apparent reason after about 8160 characters.
2098 */
2099 p = (char *)tobuf;
2100 tolen = ICONV_TESTLEN;
2101 (void)iconv(fd, NULL, NULL, &p, &tolen);
2102 if (p == NULL) {
2103 iconv_working = kBroken;
2104 iconv_close(fd);
2105 fd = (iconv_t)-1;
2106 } else
2107 iconv_working = kWorking;
2108 }
2109
2110 return (void *)fd;
2111}
2112
/// Convert the string "str[slen]" with iconv().
///
/// If "unconvlenp" is not NULL handle the string ending in an incomplete
/// sequence and set "*unconvlenp" to the length of it.
///
/// @param vcp         Conversion context; vc_fd is the iconv descriptor and
///                    vc_fail disables the "insert '?'" and
///                    incomplete-sequence handling.
/// @param str         String to convert.
/// @param slen        Length of "str" in bytes.
/// @param unconvlenp  When not NULL, receives the number of unconverted
///                    bytes of an incomplete sequence at the end.
/// @param resultlenp  When not NULL, receives the result length in bytes
///                    (only when a result is returned).
///
/// @return The converted string in allocated memory.  NULL for an error.
static char_u *iconv_string(const vimconv_T *const vcp, char_u *str,
                            size_t slen, size_t *unconvlenp,
                            size_t *resultlenp)
{
  const char  *from;     // next input byte to convert
  size_t fromlen;        // input bytes left
  char        *to;       // next free byte in "result"
  size_t tolen;          // output bytes offered to iconv()
  size_t len = 0;        // allocated size of "result"
  size_t done = 0;       // bytes of "result" filled so far
  char_u      *result = NULL;
  char_u      *p;
  int l;

  from = (char *)str;
  fromlen = slen;
  for (;; ) {
    // NOTE(review): on later iterations ICONV_ERRNO is read after other
    // calls were made; this assumes they leave it untouched -- confirm.
    if (len == 0 || ICONV_ERRNO == ICONV_E2BIG) {
      // Allocate enough room for most conversions.  When re-allocating
      // increase the buffer size.
      len = len + fromlen * 2 + 40;
      p = xmalloc(len);
      if (done > 0)
        memmove(p, result, done);
      xfree(result);
      result = p;
    }

    to = (char *)result + done;
    // Two bytes are held back from iconv(); presumably room for the bytes
    // appended below ('?' and NUL) -- TODO confirm this is always enough.
    tolen = len - done - 2;
    // Avoid a warning for systems with a wrong iconv() prototype by
    // casting the second argument to void *.
    if (iconv(vcp->vc_fd, (void *)&from, &fromlen, &to, &tolen) != SIZE_MAX) {
      // Finished, append a NUL.
      *to = NUL;
      break;
    }

    // Check both ICONV_EINVAL and EINVAL, because the dynamically loaded
    // iconv library may use one of them.
    if (!vcp->vc_fail && unconvlenp != NULL
        && (ICONV_ERRNO == ICONV_EINVAL || ICONV_ERRNO == EINVAL)) {
      // Handle an incomplete sequence at the end.
      *to = NUL;
      *unconvlenp = fromlen;
      break;
    } else if (!vcp->vc_fail
               && (ICONV_ERRNO == ICONV_EILSEQ || ICONV_ERRNO == EILSEQ
                   || ICONV_ERRNO == ICONV_EINVAL || ICONV_ERRNO == EINVAL)) {
      // Check both ICONV_EILSEQ and EILSEQ, because the dynamically loaded
      // iconv library may use one of them.

      // Can't convert: insert a '?' and skip a character.  This assumes
      // conversion from 'encoding' to something else.  In other
      // situations we don't know what to skip anyway.
      *to++ = '?';
      // A character taking more than one cell gets a second '?'.
      if (utf_ptr2cells((char_u *)from) > 1) {
        *to++ = '?';
      }
      l = utfc_ptr2len_len((const char_u *)from, (int)fromlen);
      from += l;
      fromlen -= l;
    } else if (ICONV_ERRNO != ICONV_E2BIG) {
      // conversion failed
      XFREE_CLEAR(result);
      break;
    }
    // Not enough room or skipping illegal sequence.
    done = to - (char *)result;
  }

  if (resultlenp != NULL && result != NULL)
    *resultlenp = (size_t)(to - (char *)result);
  return result;
}
2194
2195# endif // HAVE_ICONV
2196
2197
2198
2199
2200/*
2201 * Setup "vcp" for conversion from "from" to "to".
2202 * The names must have been made canonical with enc_canonize().
2203 * vcp->vc_type must have been initialized to CONV_NONE.
2204 * Note: cannot be used for conversion from/to ucs-2 and ucs-4 (will use utf-8
2205 * instead).
2206 * Afterwards invoke with "from" and "to" equal to NULL to cleanup.
2207 * Return FAIL when conversion is not supported, OK otherwise.
2208 */
2209int convert_setup(vimconv_T *vcp, char_u *from, char_u *to)
2210{
2211 return convert_setup_ext(vcp, from, true, to, true);
2212}
2213
2214/*
2215 * As convert_setup(), but only when from_unicode_is_utf8 is TRUE will all
2216 * "from" unicode charsets be considered utf-8. Same for "to".
2217 */
2218int convert_setup_ext(vimconv_T *vcp, char_u *from, bool from_unicode_is_utf8,
2219 char_u *to, bool to_unicode_is_utf8)
2220{
2221 int from_prop;
2222 int to_prop;
2223 int from_is_utf8;
2224 int to_is_utf8;
2225
2226 // Reset to no conversion.
2227# ifdef HAVE_ICONV
2228 if (vcp->vc_type == CONV_ICONV && vcp->vc_fd != (iconv_t)-1) {
2229 iconv_close(vcp->vc_fd);
2230 }
2231# endif
2232 *vcp = (vimconv_T)MBYTE_NONE_CONV;
2233
2234 /* No conversion when one of the names is empty or they are equal. */
2235 if (from == NULL || *from == NUL || to == NULL || *to == NUL
2236 || STRCMP(from, to) == 0)
2237 return OK;
2238
2239 from_prop = enc_canon_props(from);
2240 to_prop = enc_canon_props(to);
2241 if (from_unicode_is_utf8)
2242 from_is_utf8 = from_prop & ENC_UNICODE;
2243 else
2244 from_is_utf8 = from_prop == ENC_UNICODE;
2245 if (to_unicode_is_utf8)
2246 to_is_utf8 = to_prop & ENC_UNICODE;
2247 else
2248 to_is_utf8 = to_prop == ENC_UNICODE;
2249
2250 if ((from_prop & ENC_LATIN1) && to_is_utf8) {
2251 /* Internal latin1 -> utf-8 conversion. */
2252 vcp->vc_type = CONV_TO_UTF8;
2253 vcp->vc_factor = 2; /* up to twice as long */
2254 } else if ((from_prop & ENC_LATIN9) && to_is_utf8) {
2255 /* Internal latin9 -> utf-8 conversion. */
2256 vcp->vc_type = CONV_9_TO_UTF8;
2257 vcp->vc_factor = 3; /* up to three as long (euro sign) */
2258 } else if (from_is_utf8 && (to_prop & ENC_LATIN1)) {
2259 /* Internal utf-8 -> latin1 conversion. */
2260 vcp->vc_type = CONV_TO_LATIN1;
2261 } else if (from_is_utf8 && (to_prop & ENC_LATIN9)) {
2262 /* Internal utf-8 -> latin9 conversion. */
2263 vcp->vc_type = CONV_TO_LATIN9;
2264 }
2265# ifdef HAVE_ICONV
2266 else { // NOLINT(readability/braces)
2267 // Use iconv() for conversion.
2268 vcp->vc_fd = (iconv_t)my_iconv_open(
2269 to_is_utf8 ? (char_u *)"utf-8" : to,
2270 from_is_utf8 ? (char_u *)"utf-8" : from);
2271 if (vcp->vc_fd != (iconv_t)-1) {
2272 vcp->vc_type = CONV_ICONV;
2273 vcp->vc_factor = 4; /* could be longer too... */
2274 }
2275 }
2276# endif
2277 if (vcp->vc_type == CONV_NONE)
2278 return FAIL;
2279
2280 return OK;
2281}
2282
2283/*
2284 * Convert text "ptr[*lenp]" according to "vcp".
2285 * Returns the result in allocated memory and sets "*lenp".
2286 * When "lenp" is NULL, use NUL terminated strings.
2287 * Illegal chars are often changed to "?", unless vcp->vc_fail is set.
2288 * When something goes wrong, NULL is returned and "*lenp" is unchanged.
2289 */
2290char_u *string_convert(const vimconv_T *const vcp, char_u *ptr, size_t *lenp)
2291{
2292 return string_convert_ext(vcp, ptr, lenp, NULL);
2293}
2294
2295/*
2296 * Like string_convert(), but when "unconvlenp" is not NULL and there are is
2297 * an incomplete sequence at the end it is not converted and "*unconvlenp" is
2298 * set to the number of remaining bytes.
2299 */
2300char_u * string_convert_ext(const vimconv_T *const vcp, char_u *ptr,
2301 size_t *lenp, size_t *unconvlenp)
2302{
2303 char_u *retval = NULL;
2304 char_u *d;
2305 int l;
2306 int c;
2307
2308 size_t len;
2309 if (lenp == NULL)
2310 len = STRLEN(ptr);
2311 else
2312 len = *lenp;
2313 if (len == 0)
2314 return vim_strsave((char_u *)"");
2315
2316 switch (vcp->vc_type) {
2317 case CONV_TO_UTF8: /* latin1 to utf-8 conversion */
2318 retval = xmalloc(len * 2 + 1);
2319 d = retval;
2320 for (size_t i = 0; i < len; ++i) {
2321 c = ptr[i];
2322 if (c < 0x80)
2323 *d++ = c;
2324 else {
2325 *d++ = 0xc0 + ((unsigned)c >> 6);
2326 *d++ = 0x80 + (c & 0x3f);
2327 }
2328 }
2329 *d = NUL;
2330 if (lenp != NULL)
2331 *lenp = (size_t)(d - retval);
2332 break;
2333
2334 case CONV_9_TO_UTF8: /* latin9 to utf-8 conversion */
2335 retval = xmalloc(len * 3 + 1);
2336 d = retval;
2337 for (size_t i = 0; i < len; ++i) {
2338 c = ptr[i];
2339 switch (c) {
2340 case 0xa4: c = 0x20ac; break; /* euro */
2341 case 0xa6: c = 0x0160; break; /* S hat */
2342 case 0xa8: c = 0x0161; break; /* S -hat */
2343 case 0xb4: c = 0x017d; break; /* Z hat */
2344 case 0xb8: c = 0x017e; break; /* Z -hat */
2345 case 0xbc: c = 0x0152; break; /* OE */
2346 case 0xbd: c = 0x0153; break; /* oe */
2347 case 0xbe: c = 0x0178; break; /* Y */
2348 }
2349 d += utf_char2bytes(c, d);
2350 }
2351 *d = NUL;
2352 if (lenp != NULL)
2353 *lenp = (size_t)(d - retval);
2354 break;
2355
2356 case CONV_TO_LATIN1: /* utf-8 to latin1 conversion */
2357 case CONV_TO_LATIN9: /* utf-8 to latin9 conversion */
2358 retval = xmalloc(len + 1);
2359 d = retval;
2360 for (size_t i = 0; i < len; ++i) {
2361 l = utf_ptr2len_len(ptr + i, len - i);
2362 if (l == 0)
2363 *d++ = NUL;
2364 else if (l == 1) {
2365 uint8_t l_w = utf8len_tab_zero[ptr[i]];
2366
2367 if (l_w == 0) {
2368 /* Illegal utf-8 byte cannot be converted */
2369 xfree(retval);
2370 return NULL;
2371 }
2372 if (unconvlenp != NULL && l_w > len - i) {
2373 /* Incomplete sequence at the end. */
2374 *unconvlenp = len - i;
2375 break;
2376 }
2377 *d++ = ptr[i];
2378 } else {
2379 c = utf_ptr2char(ptr + i);
2380 if (vcp->vc_type == CONV_TO_LATIN9)
2381 switch (c) {
2382 case 0x20ac: c = 0xa4; break; /* euro */
2383 case 0x0160: c = 0xa6; break; /* S hat */
2384 case 0x0161: c = 0xa8; break; /* S -hat */
2385 case 0x017d: c = 0xb4; break; /* Z hat */
2386 case 0x017e: c = 0xb8; break; /* Z -hat */
2387 case 0x0152: c = 0xbc; break; /* OE */
2388 case 0x0153: c = 0xbd; break; /* oe */
2389 case 0x0178: c = 0xbe; break; /* Y */
2390 case 0xa4:
2391 case 0xa6:
2392 case 0xa8:
2393 case 0xb4:
2394 case 0xb8:
2395 case 0xbc:
2396 case 0xbd:
2397 case 0xbe: c = 0x100; break; /* not in latin9 */
2398 }
2399 if (!utf_iscomposing(c)) { /* skip composing chars */
2400 if (c < 0x100)
2401 *d++ = c;
2402 else if (vcp->vc_fail) {
2403 xfree(retval);
2404 return NULL;
2405 } else {
2406 *d++ = 0xbf;
2407 if (utf_char2cells(c) > 1)
2408 *d++ = '?';
2409 }
2410 }
2411 i += l - 1;
2412 }
2413 }
2414 *d = NUL;
2415 if (lenp != NULL)
2416 *lenp = (size_t)(d - retval);
2417 break;
2418
2419# ifdef HAVE_ICONV
2420 case CONV_ICONV: // conversion with vcp->vc_fd
2421 retval = iconv_string(vcp, ptr, len, unconvlenp, lenp);
2422 break;
2423# endif
2424 }
2425
2426 return retval;
2427}
2428