// mbyte.c source code [neovim/src/nvim/mbyte.c]

// This is an open source non-commercial project. Dear PVS-Studio, please check
// it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com

/// mbyte.c: Code specifically for handling multi-byte characters.
/// Multibyte extensions partly by Sung-Hoon Baek
///
/// Strings internal to Nvim are always encoded as UTF-8 (thus the legacy
/// 'encoding' option is always "utf-8").
///
/// The cell width on the display needs to be determined from the character
/// value. Recognizing UTF-8 bytes is easy: 0xxx.xxxx is a single-byte char,
/// 10xx.xxxx is a trailing byte, 11xx.xxxx is a leading byte of a multi-byte
/// character. To make things complicated, up to six composing characters
/// are allowed. These are drawn on top of the first char. For most editing
/// the sequence of bytes with composing characters included is considered to
/// be one character.
///
/// UTF-8 is used everywhere in the core. This is in registers, text
/// manipulation, buffers, etc. Nvim core communicates with external plugins
/// and GUIs in this encoding.
///
/// The encoding of a file is specified with 'fileencoding'. Conversion
/// is to be done when it's different from "utf-8".
///
/// Vim scripts may contain a ":scriptencoding" command. This has an effect
/// for some commands, like ":menutrans".
27
28	#include <inttypes.h>
29	#include <stdbool.h>
30	#include <string.h>
31	#include <wchar.h>
32	#include <wctype.h>
33
34	#include "nvim/vim.h"
35	#include "nvim/ascii.h"
36	#ifdef HAVE_LOCALE_H
37	# include <locale.h>
38	#endif
39	#include "nvim/eval.h"
40	#include "nvim/path.h"
41	#include "nvim/iconv.h"
42	#include "nvim/mbyte.h"
43	#include "nvim/charset.h"
44	#include "nvim/cursor.h"
45	#include "nvim/fileio.h"
46	#include "nvim/func_attr.h"
47	#include "nvim/memline.h"
48	#include "nvim/message.h"
49	#include "nvim/misc1.h"
50	#include "nvim/memory.h"
51	#include "nvim/option.h"
52	#include "nvim/screen.h"
53	#include "nvim/spell.h"
54	#include "nvim/strings.h"
55	#include "nvim/os/os.h"
56	#include "nvim/arabic.h"
57	#include "nvim/mark.h"
58
/// One row of a range-based conversion table.
///
/// NOTE(review): the tables that use this type are not visible in this part
/// of the file; the field meanings below are inferred from the field names
/// and should be confirmed against those tables.
typedef struct {
  int rangeStart;  ///< presumably the first code point of the range
  int rangeEnd;    ///< presumably the last code point of the range
  int step;        ///< presumably the distance between affected code points
  int offset;      ///< presumably the value added to convert a code point
} convertStruct;
65
/// A range of character values, used as the element type of the sorted
/// lookup tables searched by intable(). Both ends are inclusive: intable()
/// reports a match when `first <= c <= last`.
struct interval {
  long first;  ///< lowest character value in the range
  long last;   ///< highest character value in the range
};
70
71	#ifdef INCLUDE_GENERATED_DECLARATIONS
72	# include "mbyte.c.generated.h"
73	# include "unicode_tables.generated.h"
74	#endif
75
76	char_u e_loadlib[] = "E370: Could not load library %s";
77	char_u e_loadfunc[] = "E448: Could not load library function %s";
78
// To speed up BYTELEN(); keep a lookup table to quickly get the length in
// bytes of a UTF-8 character from the first byte of a UTF-8 string.  Bytes
// which are illegal when used as the first byte have a 1.  The NUL byte has
// length 1.
//
// The table is indexed by the lead byte: the row is the high nibble, the
// column is the low nibble.
const uint8_t utf8len_tab[] = {
  // ?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9 ?A ?B ?C ?D ?E ?F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 1?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 2?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 3?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 4?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 5?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 6?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 7?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 8?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 9?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // A?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // B?
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C?
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D?
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E?
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1,  // F?
};
102
// Like utf8len_tab above, but using a zero for illegal lead bytes
// (0x80..0xBF, 0xFE and 0xFF).
//
// The table is indexed by the lead byte: the row is the high nibble, the
// column is the low nibble.
const uint8_t utf8len_tab_zero[] = {
  // ?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9 ?A ?B ?C ?D ?E ?F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 1?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 2?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 3?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 4?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 5?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 6?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 7?
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 8?
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 9?
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A?
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B?
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C?
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D?
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E?
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0,  // F?
};
123
124	/*
125	* Canonical encoding names and their properties.
126	* "iso-8859-n" is handled by enc_canonize() directly.
127	*/
128	static struct
129	{ const char name; int* prop; int codepage; }
130	enc_canon_table[] =
131	{
132	#define IDX_LATIN_1 0
133	{"latin1", ENC_8BIT + ENC_LATIN1, `1252`},
134	#define IDX_ISO_2 1
135	{"iso-8859-2", ENC_8BIT, `0`},
136	#define IDX_ISO_3 2
137	{"iso-8859-3", ENC_8BIT, `0`},
138	#define IDX_ISO_4 3
139	{"iso-8859-4", ENC_8BIT, `0`},
140	#define IDX_ISO_5 4
141	{"iso-8859-5", ENC_8BIT, `0`},
142	#define IDX_ISO_6 5
143	{"iso-8859-6", ENC_8BIT, `0`},
144	#define IDX_ISO_7 6
145	{"iso-8859-7", ENC_8BIT, `0`},
146	#define IDX_ISO_8 7
147	{"iso-8859-8", ENC_8BIT, `0`},
148	#define IDX_ISO_9 8
149	{"iso-8859-9", ENC_8BIT, `0`},
150	#define IDX_ISO_10 9
151	{"iso-8859-10", ENC_8BIT, `0`},
152	#define IDX_ISO_11 10
153	{"iso-8859-11", ENC_8BIT, `0`},
154	#define IDX_ISO_13 11
155	{"iso-8859-13", ENC_8BIT, `0`},
156	#define IDX_ISO_14 12
157	{"iso-8859-14", ENC_8BIT, `0`},
158	#define IDX_ISO_15 13
159	{"iso-8859-15", ENC_8BIT + ENC_LATIN9, `0`},
160	#define IDX_KOI8_R 14
161	{"koi8-r", ENC_8BIT, `0`},
162	#define IDX_KOI8_U 15
163	{"koi8-u", ENC_8BIT, `0`},
164	#define IDX_UTF8 16
165	{"utf-8", ENC_UNICODE, `0`},
166	#define IDX_UCS2 17
167	{"ucs-2", ENC_UNICODE + ENC_ENDIAN_B + ENC_2BYTE, `0`},
168	#define IDX_UCS2LE 18
169	{"ucs-2le", ENC_UNICODE + ENC_ENDIAN_L + ENC_2BYTE, `0`},
170	#define IDX_UTF16 19
171	{"utf-16", ENC_UNICODE + ENC_ENDIAN_B + ENC_2WORD, `0`},
172	#define IDX_UTF16LE 20
173	{"utf-16le", ENC_UNICODE + ENC_ENDIAN_L + ENC_2WORD, `0`},
174	#define IDX_UCS4 21
175	{"ucs-4", ENC_UNICODE + ENC_ENDIAN_B + ENC_4BYTE, `0`},
176	#define IDX_UCS4LE 22
177	{"ucs-4le", ENC_UNICODE + ENC_ENDIAN_L + ENC_4BYTE, `0`},
178
179	/ For debugging DBCS encoding on Unix. /
180	#define IDX_DEBUG 23
181	{"debug", ENC_DBCS, DBCS_DEBUG},
182	#define IDX_EUC_JP 24
183	{"euc-jp", ENC_DBCS, DBCS_JPNU},
184	#define IDX_SJIS 25
185	{"sjis", ENC_DBCS, DBCS_JPN},
186	#define IDX_EUC_KR 26
187	{"euc-kr", ENC_DBCS, DBCS_KORU},
188	#define IDX_EUC_CN 27
189	{"euc-cn", ENC_DBCS, DBCS_CHSU},
190	#define IDX_EUC_TW 28
191	{"euc-tw", ENC_DBCS, DBCS_CHTU},
192	#define IDX_BIG5 29
193	{"big5", ENC_DBCS, DBCS_CHT},
194
195	/ MS-DOS and MS-Windows codepages are included here, so that they can be*
196	* used on Unix too. Most of them are similar to ISO-8859 encodings, but
197	* not exactly the same. */
198	#define IDX_CP437 30
199	{"cp437", ENC_8BIT, `437`}, / like iso-8859-1 /
200	#define IDX_CP737 31
201	{"cp737", ENC_8BIT, `737`}, / like iso-8859-7 /
202	#define IDX_CP775 32
203	{"cp775", ENC_8BIT, `775`}, / Baltic /
204	#define IDX_CP850 33
205	{"cp850", ENC_8BIT, `850`}, / like iso-8859-4 /
206	#define IDX_CP852 34
207	{"cp852", ENC_8BIT, `852`}, / like iso-8859-1 /
208	#define IDX_CP855 35
209	{"cp855", ENC_8BIT, `855`}, / like iso-8859-2 /
210	#define IDX_CP857 36
211	{"cp857", ENC_8BIT, `857`}, / like iso-8859-5 /
212	#define IDX_CP860 37
213	{"cp860", ENC_8BIT, `860`}, / like iso-8859-9 /
214	#define IDX_CP861 38
215	{"cp861", ENC_8BIT, `861`}, / like iso-8859-1 /
216	#define IDX_CP862 39
217	{"cp862", ENC_8BIT, `862`}, / like iso-8859-1 /
218	#define IDX_CP863 40
219	{"cp863", ENC_8BIT, `863`}, / like iso-8859-8 /
220	#define IDX_CP865 41
221	{"cp865", ENC_8BIT, `865`}, / like iso-8859-1 /
222	#define IDX_CP866 42
223	{"cp866", ENC_8BIT, `866`}, / like iso-8859-5 /
224	#define IDX_CP869 43
225	{"cp869", ENC_8BIT, `869`}, / like iso-8859-7 /
226	#define IDX_CP874 44
227	{"cp874", ENC_8BIT, `874`}, / Thai /
228	#define IDX_CP932 45
229	{"cp932", ENC_DBCS, DBCS_JPN},
230	#define IDX_CP936 46
231	{"cp936", ENC_DBCS, DBCS_CHS},
232	#define IDX_CP949 47
233	{"cp949", ENC_DBCS, DBCS_KOR},
234	#define IDX_CP950 48
235	{"cp950", ENC_DBCS, DBCS_CHT},
236	#define IDX_CP1250 49
237	{"cp1250", ENC_8BIT, `1250`}, / Czech, Polish, etc. /
238	#define IDX_CP1251 50
239	{"cp1251", ENC_8BIT, `1251`}, / Cyrillic /
240	/ cp1252 is considered to be equal to latin1 /
241	#define IDX_CP1253 51
242	{"cp1253", ENC_8BIT, `1253`}, / Greek /
243	#define IDX_CP1254 52
244	{"cp1254", ENC_8BIT, `1254`}, / Turkish /
245	#define IDX_CP1255 53
246	{"cp1255", ENC_8BIT, `1255`}, / Hebrew /
247	#define IDX_CP1256 54
248	{"cp1256", ENC_8BIT, `1256`}, / Arabic /
249	#define IDX_CP1257 55
250	{"cp1257", ENC_8BIT, `1257`}, / Baltic /
251	#define IDX_CP1258 56
252	{"cp1258", ENC_8BIT, `1258`}, / Vietnamese /
253
254	#define IDX_MACROMAN 57
255	{"macroman", ENC_8BIT + ENC_MACROMAN, `0`}, / Mac OS /
256	#define IDX_HPROMAN8 58
257	{"hp-roman8", ENC_8BIT, `0`}, / HP Roman8 /
258	#define IDX_COUNT 59
259	};
260
261	/*
262	* Aliases for encoding names.
263	*/
264	static struct
265	{ const char name; int* canon; }
266	enc_alias_table[] =
267	{
268	{"ansi", IDX_LATIN_1},
269	{"iso-8859-1", IDX_LATIN_1},
270	{"latin2", IDX_ISO_2},
271	{"latin3", IDX_ISO_3},
272	{"latin4", IDX_ISO_4},
273	{"cyrillic", IDX_ISO_5},
274	{"arabic", IDX_ISO_6},
275	{"greek", IDX_ISO_7},
276	{"hebrew", IDX_ISO_8},
277	{"latin5", IDX_ISO_9},
278	{"turkish", IDX_ISO_9}, / ? /
279	{"latin6", IDX_ISO_10},
280	{"nordic", IDX_ISO_10}, / ? /
281	{"thai", IDX_ISO_11}, / ? /
282	{"latin7", IDX_ISO_13},
283	{"latin8", IDX_ISO_14},
284	{"latin9", IDX_ISO_15},
285	{"utf8", IDX_UTF8},
286	{"unicode", IDX_UCS2},
287	{"ucs2", IDX_UCS2},
288	{"ucs2be", IDX_UCS2},
289	{"ucs-2be", IDX_UCS2},
290	{"ucs2le", IDX_UCS2LE},
291	{"utf16", IDX_UTF16},
292	{"utf16be", IDX_UTF16},
293	{"utf-16be", IDX_UTF16},
294	{"utf16le", IDX_UTF16LE},
295	{"ucs4", IDX_UCS4},
296	{"ucs4be", IDX_UCS4},
297	{"ucs-4be", IDX_UCS4},
298	{"ucs4le", IDX_UCS4LE},
299	{"utf32", IDX_UCS4},
300	{"utf-32", IDX_UCS4},
301	{"utf32be", IDX_UCS4},
302	{"utf-32be", IDX_UCS4},
303	{"utf32le", IDX_UCS4LE},
304	{"utf-32le", IDX_UCS4LE},
305	{"932", IDX_CP932},
306	{"949", IDX_CP949},
307	{"936", IDX_CP936},
308	{"gbk", IDX_CP936},
309	{"950", IDX_CP950},
310	{"eucjp", IDX_EUC_JP},
311	{"unix-jis", IDX_EUC_JP},
312	{"ujis", IDX_EUC_JP},
313	{"shift-jis", IDX_SJIS},
314	{"pck", IDX_SJIS}, / Sun: PCK /
315	{"euckr", IDX_EUC_KR},
316	{"5601", IDX_EUC_KR}, / Sun: KS C 5601 /
317	{"euccn", IDX_EUC_CN},
318	{"gb2312", IDX_EUC_CN},
319	{"euctw", IDX_EUC_TW},
320	{"japan", IDX_EUC_JP},
321	{"korea", IDX_EUC_KR},
322	{"prc", IDX_EUC_CN},
323	{"chinese", IDX_EUC_CN},
324	{"taiwan", IDX_EUC_TW},
325	{"cp950", IDX_BIG5},
326	{"950", IDX_BIG5},
327	{"mac", IDX_MACROMAN},
328	{"mac-roman", IDX_MACROMAN},
329	{NULL, `0`}
330	};
331
332	/*
333	* Find encoding "name" in the list of canonical encoding names.
334	* Returns -1 if not found.
335	*/
336	static int enc_canon_search(const char_u *name)
337	{
338	int i;
339
340	for (i = `0`; i < IDX_COUNT; ++i)
341	if (STRCMP(name, enc_canon_table[i].name) == `0`)
342	return i;
343	return -`1`;
344	}
345
346
347
348	/*
349	* Find canonical encoding "name" in the list and return its properties.
350	* Returns 0 if not found.
351	*/
352	int enc_canon_props(const char_u *name)
353	{
354	int i;
355
356	i = enc_canon_search(name);
357	if (i >= `0`)
358	return enc_canon_table[i].prop;
359	if (STRNCMP(name, "2byte-", `6`) == `0`)
360	return ENC_DBCS;
361	if (STRNCMP(name, "8bit-", `5`) == `0` \|\| STRNCMP(name, "iso-8859-", `9`) == `0`)
362	return ENC_8BIT;
363	return `0`;
364	}
365
366	/*
367	* Return the size of the BOM for the current buffer:
368	* 0 - no BOM
369	* 2 - UCS-2 or UTF-16 BOM
370	* 4 - UCS-4 BOM
371	* 3 - UTF-8 BOM
372	*/
373	int bomb_size(void)
374	{
375	int n = `0`;
376
377	if (curbuf->b_p_bomb && !curbuf->b_p_bin) {
378	if (*curbuf->b_p_fenc == NUL
379	\|\| STRCMP(curbuf->b_p_fenc, "utf-8") == `0`) {
380	n = `3`;
381	} else if (STRNCMP(curbuf->b_p_fenc, "ucs-2", `5`) == `0`
382	\|\| STRNCMP(curbuf->b_p_fenc, "utf-16", `6`) == `0`) {
383	n = `2`;
384	} else if (STRNCMP(curbuf->b_p_fenc, "ucs-4", `5`) == `0`) {
385	n = `4`;
386	}
387	}
388	return n;
389	}
390
391	/*
392	* Remove all BOM from "s" by moving remaining text.
393	*/
394	void remove_bom(char_u *s)
395	{
396	char p = (char* *)s;
397
398	while ((p = strchr(p, `0xef`)) != NULL) {
399	if ((uint8_t)p[`1`] == `0xbb` && (uint8_t)p[`2`] == `0xbf`) {
400	STRMOVE(p, p + `3`);
401	} else {
402	p++;
403	}
404	}
405	}
406
407	/*
408	* Get class of pointer:
409	* 0 for blank or NUL
410	* 1 for punctuation
411	* 2 for an (ASCII) word character
412	* >2 for other word characters
413	*/
414	int mb_get_class(const char_u *p)
415	{
416	return mb_get_class_tab(p, curbuf->b_chartab);
417	}
418
419	int mb_get_class_tab(const char_u p, const* uint64_t *const chartab)
420	{
421	if (MB_BYTE2LEN(p[`0`]) == `1`) {
422	if (p[`0`] == NUL \|\| ascii_iswhite(p[`0`])) {
423	return `0`;
424	}
425	if (vim_iswordc_tab(p[`0`], chartab)) {
426	return `2`;
427	}
428	return `1`;
429	}
430	return utf_class_tab(utf_ptr2char(p), chartab);
431	}
432
433	/*
434	* Return true if "c" is in "table".
435	*/
436	static bool intable(const struct interval table, size_t n_items, int* c)
437	{
438	int mid, bot, top;
439
440	/ first quick check for Latin1 etc. characters /
441	if (c < table[`0`].first)
442	return false;
443
444	/ binary search in table /
445	bot = `0`;
446	top = (int)(n_items - `1`);
447	while (top >= bot) {
448	mid = (bot + top) / `2`;
449	if (table[mid].last < c)
450	bot = mid + `1`;
451	else if (table[mid].first > c)
452	top = mid - `1`;
453	else
454	return true;
455	}
456	return false;
457	}
458
459	/// For UTF-8 character "c" return 2 for a double-width character, 1 for others.
460	/// Returns 4 or 6 for an unprintable character.
461	/// Is only correct for characters >= 0x80.
462	/// When p_ambw is "double", return 2 for a character with East Asian Width
463	/// class 'A'(mbiguous).
464	///
465	/// @note Tables `doublewidth` and `ambiguous` are generated by
466	/// gen_unicode_tables.lua, which must be manually invoked as needed.
467	int utf_char2cells(int c)
468	{
469	if (c >= `0x100`) {
470	#ifdef USE_WCHAR_FUNCTIONS
471	//
472	// Assume the library function wcwidth() works better than our own
473	// stuff. It should return 1 for ambiguous width chars!
474	//
475	int n = wcwidth(c);
476
477	if (n < `0`) {
478	return `6`; // unprintable, displays <xxxx>
479	}
480	if (n > `1`) {
481	return n;
482	}
483	#else
484	if (!utf_printable(c)) {
485	return `6`; // unprintable, displays <xxxx>
486	}
487	if (intable(doublewidth, ARRAY_SIZE(doublewidth), c)) {
488	return `2`;
489	}
490	#endif
491	if (p_emoji && intable(emoji_width, ARRAY_SIZE(emoji_width), c)) {
492	return `2`;
493	}
494	} else if (c >= `0x80` && !vim_isprintc(c)) {
495	// Characters below 0x100 are influenced by 'isprint' option.
496	return `4`; // unprintable, displays <xx>
497	}
498
499	if (c >= `0x80` && *p_ambw == `'d'`
500	&& intable(ambiguous, ARRAY_SIZE(ambiguous), c)) {
501	return `2`;
502	}
503
504	return `1`;
505	}
506
507	/// Return the number of display cells character at "p" occupies.*
508	/// This doesn't take care of unprintable characters, use ptr2cells() for that.
509	int utf_ptr2cells(const char_u *p)
510	{
511	int c;
512
513	/ Need to convert to a wide character. /
514	if (*p >= `0x80`) {
515	c = utf_ptr2char(p);
516	/ An illegal byte is displayed as <xx>. /
517	if (utf_ptr2len(p) == `1` \|\| c == NUL)
518	return `4`;
519	/ If the char is ASCII it must be an overlong sequence. /
520	if (c < `0x80`)
521	return char2cells(c);
522	return utf_char2cells(c);
523	}
524	return `1`;
525	}
526
527	/// Like utf_ptr2cells(), but limit string length to "size".
528	/// For an empty string or truncated character returns 1.
529	int utf_ptr2cells_len(const char_u p, int* size)
530	{
531	int c;
532
533	/ Need to convert to a wide character. /
534	if (size > `0` && *p >= `0x80`) {
535	if (utf_ptr2len_len(p, size) < utf8len_tab[*p])
536	return `1`; / truncated /
537	c = utf_ptr2char(p);
538	/ An illegal byte is displayed as <xx>. /
539	if (utf_ptr2len(p) == `1` \|\| c == NUL)
540	return `4`;
541	/ If the char is ASCII it must be an overlong sequence. /
542	if (c < `0x80`)
543	return char2cells(c);
544	return utf_char2cells(c);
545	}
546	return `1`;
547	}
548
549	/// Calculate the number of cells occupied by string `str`.
550	///
551	/// @param str The source string, may not be NULL, must be a NUL-terminated
552	/// string.
553	/// @return The number of cells occupied by string `str`
554	size_t mb_string2cells(const char_u *str)
555	{
556	size_t clen = `0`;
557
558	for (const char_u p = str; p != NUL; p += (*mb_ptr2len)(p)) {
559	clen += utf_ptr2cells(p);
560	}
561
562	return clen;
563	}
564
565	/// Get the number of cells occupied by string `str` with maximum length `size`
566	///
567	/// @param str The source string, may not be NULL, must be a NUL-terminated
568	/// string.
569	/// @param size maximum length of string. It will terminate on earlier NUL.
570	/// @return The number of cells occupied by string `str`
571	size_t mb_string2cells_len(const char_u *str, size_t size)
572	{
573	size_t clen = `0`;
574
575	for (const char_u p = str; p != NUL && p < str+size;
576	p += utf_ptr2len_len(p, size+(p-str))) {
577	clen += utf_ptr2cells(p);
578	}
579
580	return clen;
581	}
582
583	/// Convert a UTF-8 byte sequence to a wide character
584	///
585	/// If the sequence is illegal or truncated by a NUL then the first byte is
586	/// returned.
587	/// For an overlong sequence this may return zero.
588	/// Does not include composing characters for obvious reasons.
589	///
590	/// @param[in] p String to convert.
591	///
592	/// @return Unicode codepoint or byte value.
593	int utf_ptr2char(const char_u *const p)
594	FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
595	{
596	if (p[`0`] < `0x80`) { // Be quick for ASCII.
597	return p[`0`];
598	}
599
600	const uint8_t len = utf8len_tab_zero[p[`0`]];
601	if (len > `1` && (p[`1`] & `0xc0`) == `0x80`) {
602	if (len == `2`) {
603	return ((p[`0`] & `0x1f`) << `6`) + (p[`1`] & `0x3f`);
604	}
605	if ((p[`2`] & `0xc0`) == `0x80`) {
606	if (len == `3`) {
607	return (((p[`0`] & `0x0f`) << `12`) + ((p[`1`] & `0x3f`) << `6`)
608	+ (p[`2`] & `0x3f`));
609	}
610	if ((p[`3`] & `0xc0`) == `0x80`) {
611	if (len == `4`) {
612	return (((p[`0`] & `0x07`) << `18`) + ((p[`1`] & `0x3f`) << `12`)
613	+ ((p[`2`] & `0x3f`) << `6`) + (p[`3`] & `0x3f`));
614	}
615	if ((p[`4`] & `0xc0`) == `0x80`) {
616	if (len == `5`) {
617	return (((p[`0`] & `0x03`) << `24`) + ((p[`1`] & `0x3f`) << `18`)
618	+ ((p[`2`] & `0x3f`) << `12`) + ((p[`3`] & `0x3f`) << `6`)
619	+ (p[`4`] & `0x3f`));
620	}
621	if ((p[`5`] & `0xc0`) == `0x80` && len == `6`) {
622	return (((p[`0`] & `0x01`) << `30`) + ((p[`1`] & `0x3f`) << `24`)
623	+ ((p[`2`] & `0x3f`) << `18`) + ((p[`3`] & `0x3f`) << `12`)
624	+ ((p[`4`] & `0x3f`) << `6`) + (p[`5`] & `0x3f`));
625	}
626	}
627	}
628	}
629	}
630	// Illegal value: just return the first byte.
631	return p[`0`];
632	}
633
634	/*
635	* Convert a UTF-8 byte sequence to a wide character.
636	* String is assumed to be terminated by NUL or after "n" bytes, whichever
637	* comes first.
638	* The function is safe in the sense that it never accesses memory beyond the
639	* first "n" bytes of "s".
640	*
641	* On success, returns decoded codepoint, advances "s" to the beginning of
642	* next character and decreases "n" accordingly.
643	*
644	* If end of string was reached, returns 0 and, if "n" > 0, advances "s" past
645	* NUL byte.
646	*
647	* If byte sequence is illegal or incomplete, returns -1 and does not advance
648	* "s".
649	*/
650	static int utf_safe_read_char_adv(const char_u *s, size_t n)
651	{
652	int c;
653
654	if (n == `0`) /* end of buffer /
655	return `0`;
656
657	uint8_t k = utf8len_tab_zero[**s];
658
659	if (k == `1`) {
660	/ ASCII character or NUL /
661	(*n)--;
662	return (s)++;
663	}
664
665	if (k <= *n) {
666	/ We have a multibyte sequence and it isn't truncated by buffer*
667	* limits so utf_ptr2char() is safe to use. Or the first byte is
668	* illegal (k=0), and it's also safe to use utf_ptr2char(). */
669	c = utf_ptr2char(*s);
670
671	/ On failure, utf_ptr2char() returns the first byte, so here we*
672	* check equality with the first byte. The only non-ASCII character
673	* which equals the first byte of its own UTF-8 representation is
674	* U+00C3 (UTF-8: 0xC3 0x83), so need to check that special case too.
675	* It's safe even if n=1, else we would have k=2 > n. */
676	if (c != (int)(*s) \|\| (c == `0xC3` && (s)[`1`] == `0x83`)) {
677	/ byte sequence was successfully decoded /
678	*s += k;
679	*n -= k;
680	return c;
681	}
682	}
683
684	/ byte sequence is incomplete or illegal /
685	return -`1`;
686	}
687
688	/*
689	* Get character at *pp and advance pp to the next character.
690	* Note: composing characters are skipped!
691	*/
692	int mb_ptr2char_adv(const char_u **const pp)
693	{
694	int c;
695
696	c = utf_ptr2char(*pp);
697	pp += (mb_ptr2len)(*pp);
698	return c;
699	}
700
701	/*
702	* Get character at *pp and advance pp to the next character.
703	* Note: composing characters are returned as separate characters.
704	*/
705	int mb_cptr2char_adv(const char_u **pp)
706	{
707	int c;
708
709	c = utf_ptr2char(*pp);
710	pp += utf_ptr2len(pp);
711	return c;
712	}
713
714	/*
715	* Check if the character pointed to by "p2" is a composing character when it
716	* comes after "p1". For Arabic sometimes "ab" is replaced with "c", which
717	* behaves like a composing character.
718	*/
719	bool utf_composinglike(const char_u p1, const* char_u *p2)
720	{
721	int c2;
722
723	c2 = utf_ptr2char(p2);
724	if (utf_iscomposing(c2))
725	return true;
726	if (!arabic_maycombine(c2))
727	return false;
728	return arabic_combine(utf_ptr2char(p1), c2);
729	}
730
731	/// Convert a UTF-8 string to a wide character
732	///
733	/// Also gets up to #MAX_MCO composing characters.
734	///
735	/// @param[out] pcc Location where to store composing characters. Must have
736	/// space at least for #MAX_MCO + 1 elements.
737	///
738	/// @return leading character.
739	int utfc_ptr2char(const char_u p, int* *pcc)
740	{
741	int len;
742	int c;
743	int cc;
744	int i = `0`;
745
746	c = utf_ptr2char(p);
747	len = utf_ptr2len(p);
748
749	/ Only accept a composing char when the first char isn't illegal. /
750	if ((len > `1` \|\| *p < `0x80`)
751	&& p[len] >= `0x80`
752	&& UTF_COMPOSINGLIKE(p, p + len)) {
753	cc = utf_ptr2char(p + len);
754	for (;; ) {
755	pcc[i++] = cc;
756	if (i == MAX_MCO)
757	break;
758	len += utf_ptr2len(p + len);
759	if (p[len] < `0x80` \|\| !utf_iscomposing(cc = utf_ptr2char(p + len)))
760	break;
761	}
762	}
763
764	if (i < MAX_MCO) / last composing char must be 0 /
765	pcc[i] = `0`;
766
767	return c;
768	}
769
770	/*
771	* Convert a UTF-8 byte string to a wide character. Also get up to MAX_MCO
772	* composing characters. Use no more than p[maxlen].
773	*
774	* @param [out] pcc: composing chars, last one is 0
775	*/
776	int utfc_ptr2char_len(const char_u p, int* pcc, int* maxlen)
777	{
778	#define IS_COMPOSING(s1, s2, s3) \
779	(i == 0 ? UTF_COMPOSINGLIKE((s1), (s2)) : utf_iscomposing((s3)))
780
781	assert(maxlen > `0`);
782
783	int i = `0`;
784
785	int len = utf_ptr2len_len(p, maxlen);
786	// Is it safe to use utf_ptr2char()?
787	bool safe = len > `1` && len <= maxlen;
788	int c = safe ? utf_ptr2char(p) : *p;
789
790	// Only accept a composing char when the first char isn't illegal.
791	if ((safe \|\| c < `0x80`) && len < maxlen && p[len] >= `0x80`) {
792	for (; i < MAX_MCO; i++) {
793	int len_cc = utf_ptr2len_len(p + len, maxlen - len);
794	safe = len_cc > `1` && len_cc <= maxlen - len;
795	if (!safe \|\| (pcc[i] = utf_ptr2char(p + len)) < `0x80`
796	\|\| !IS_COMPOSING(p, p + len, pcc[i])) {
797	break;
798	}
799	len += len_cc;
800	}
801	}
802
803	if (i < MAX_MCO) {
804	// last composing char must be 0
805	pcc[i] = `0`;
806	}
807
808	return c;
809	#undef ISCOMPOSING
810	}
811
812	/// Get the length of a UTF-8 byte sequence representing a single codepoint
813	///
814	/// @param[in] p UTF-8 string.
815	///
816	/// @return Sequence length, 0 for empty string and 1 for non-UTF-8 byte
817	/// sequence.
818	int utf_ptr2len(const char_u *const p)
819	FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
820	{
821	if (*p == NUL) {
822	return `0`;
823	}
824	const int len = utf8len_tab[*p];
825	for (int i = `1`; i < len; i++) {
826	if ((p[i] & `0xc0`) != `0x80`) {
827	return `1`;
828	}
829	}
830	return len;
831	}
832
833	/*
834	* Return length of UTF-8 character, obtained from the first byte.
835	* "b" must be between 0 and 255!
836	* Returns 1 for an invalid first byte value.
837	*/
838	int utf_byte2len(int b)
839	{
840	return utf8len_tab[b];
841	}
842
843	/*
844	* Get the length of UTF-8 byte sequence "p[size]". Does not include any
845	* following composing characters.
846	* Returns 1 for "".
847	* Returns 1 for an illegal byte sequence (also in incomplete byte seq.).
848	* Returns number > "size" for an incomplete byte sequence.
849	* Never returns zero.
850	*/
851	int utf_ptr2len_len(const char_u p, int* size)
852	{
853	int len;
854	int i;
855	int m;
856
857	len = utf8len_tab[*p];
858	if (len == `1`)
859	return `1`; / NUL, ascii or illegal lead byte /
860	if (len > size)
861	m = size; / incomplete byte sequence. /
862	else
863	m = len;
864	for (i = `1`; i < m; ++i)
865	if ((p[i] & `0xc0`) != `0x80`)
866	return `1`;
867	return len;
868	}
869
870	/// Return the number of bytes occupied by a UTF-8 character in a string
871	///
872	/// This includes following composing characters.
873	int utfc_ptr2len(const char_u *const p)
874	FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
875	{
876	uint8_t b0 = (uint8_t)(*p);
877
878	if (b0 == NUL) {
879	return `0`;
880	}
881	if (b0 < `0x80` && p[`1`] < `0x80`) { // be quick for ASCII
882	return `1`;
883	}
884
885	// Skip over first UTF-8 char, stopping at a NUL byte.
886	int len = utf_ptr2len(p);
887
888	// Check for illegal byte.
889	if (len == `1` && b0 >= `0x80`) {
890	return `1`;
891	}
892
893	// Check for composing characters. We can handle only the first six, but
894	// skip all of them (otherwise the cursor would get stuck).
895	int prevlen = `0`;
896	for (;;) {
897	if (p[len] < `0x80` \|\| !UTF_COMPOSINGLIKE(p + prevlen, p + len)) {
898	return len;
899	}
900
901	// Skip over composing char.
902	prevlen = len;
903	len += utf_ptr2len(p + len);
904	}
905	}
906
907	/*
908	* Return the number of bytes the UTF-8 encoding of the character at "p[size]"
909	* takes. This includes following composing characters.
910	* Returns 0 for an empty string.
911	* Returns 1 for an illegal char or an incomplete byte sequence.
912	*/
913	int utfc_ptr2len_len(const char_u p, int* size)
914	{
915	int len;
916	int prevlen;
917
918	if (size < `1` \|\| *p == NUL)
919	return `0`;
920	if (p[`0`] < `0x80` && (size == `1` \|\| p[`1`] < `0x80`)) / be quick for ASCII /
921	return `1`;
922
923	/ Skip over first UTF-8 char, stopping at a NUL byte. /
924	len = utf_ptr2len_len(p, size);
925
926	/ Check for illegal byte and incomplete byte sequence. /
927	if ((len == `1` && p[`0`] >= `0x80`) \|\| len > size)
928	return `1`;
929
930	/*
931	* Check for composing characters. We can handle only the first six, but
932	* skip all of them (otherwise the cursor would get stuck).
933	*/
934	prevlen = `0`;
935	while (len < size) {
936	int len_next_char;
937
938	if (p[len] < `0x80`)
939	break;
940
941	/*
942	* Next character length should not go beyond size to ensure that
943	* UTF_COMPOSINGLIKE(...) does not read beyond size.
944	*/
945	len_next_char = utf_ptr2len_len(p + len, size - len);
946	if (len_next_char > size - len)
947	break;
948
949	if (!UTF_COMPOSINGLIKE(p + prevlen, p + len))
950	break;
951
952	/ Skip over composing char /
953	prevlen = len;
954	len += len_next_char;
955	}
956	return len;
957	}
958
/// Determine how many bytes certain unicode codepoint will occupy
///
/// The thresholds are the payload sizes of the 1- to 6-byte forms:
/// 7, 11, 16, 21 and 26 bits; everything above takes six bytes.
///
/// @param[in]  c  Codepoint.
///
/// @return Number of bytes (1-6).
int utf_char2len(const int c)
{
  if (c < 0x80) {
    return 1;
  }
  if (c < 0x800) {
    return 2;
  }
  if (c < 0x10000) {
    return 3;
  }
  if (c < 0x200000) {
    return 4;
  }
  if (c < 0x4000000) {
    return 5;
  }
  return 6;
}
976
977	/// Convert Unicode character to UTF-8 string
978	///
979	/// @param c character to convert to \p buf
980	/// @param[out] buf UTF-8 string generated from \p c, does not add \0
981	/// @return Number of bytes (1-6).
982	int utf_char2bytes(const int c, char_u *const buf)
983	{
984	if (c < `0x80`) { // 7 bits
985	buf[`0`] = c;
986	return `1`;
987	} else if (c < `0x800`) { // 11 bits
988	buf[`0`] = `0xc0` + ((unsigned)c >> `6`);
989	buf[`1`] = `0x80` + (c & `0x3f`);
990	return `2`;
991	} else if (c < `0x10000`) { // 16 bits
992	buf[`0`] = `0xe0` + ((unsigned)c >> `12`);
993	buf[`1`] = `0x80` + (((unsigned)c >> `6`) & `0x3f`);
994	buf[`2`] = `0x80` + (c & `0x3f`);
995	return `3`;
996	} else if (c < `0x200000`) { // 21 bits
997	buf[`0`] = `0xf0` + ((unsigned)c >> `18`);
998	buf[`1`] = `0x80` + (((unsigned)c >> `12`) & `0x3f`);
999	buf[`2`] = `0x80` + (((unsigned)c >> `6`) & `0x3f`);
1000	buf[`3`] = `0x80` + (c & `0x3f`);
1001	return `4`;
1002	} else if (c < `0x4000000`) { // 26 bits
1003	buf[`0`] = `0xf8` + ((unsigned)c >> `24`);
1004	buf[`1`] = `0x80` + (((unsigned)c >> `18`) & `0x3f`);
1005	buf[`2`] = `0x80` + (((unsigned)c >> `12`) & `0x3f`);
1006	buf[`3`] = `0x80` + (((unsigned)c >> `6`) & `0x3f`);
1007	buf[`4`] = `0x80` + (c & `0x3f`);
1008	return `5`;
1009	} else { // 31 bits
1010	buf[`0`] = `0xfc` + ((unsigned)c >> `30`);
1011	buf[`1`] = `0x80` + (((unsigned)c >> `24`) & `0x3f`);
1012	buf[`2`] = `0x80` + (((unsigned)c >> `18`) & `0x3f`);
1013	buf[`3`] = `0x80` + (((unsigned)c >> `12`) & `0x3f`);
1014	buf[`4`] = `0x80` + (((unsigned)c >> `6`) & `0x3f`);
1015	buf[`5`] = `0x80` + (c & `0x3f`);
1016	return `6`;
1017	}
1018	}
1019
1020	/*
1021	* Return true if "c" is a composing UTF-8 character. This means it will be
1022	* drawn on top of the preceding character.
1023	* Based on code from Markus Kuhn.
1024	*/
1025	bool utf_iscomposing(int c)
1026	{
1027	return intable(combining, ARRAY_SIZE(combining), c);
1028	}
1029
1030	/*
1031	* Return true for characters that can be displayed in a normal way.
1032	* Only for characters of 0x100 and above!
1033	*/
1034	bool utf_printable(int c)
1035	{
1036	#ifdef USE_WCHAR_FUNCTIONS
1037	/*
1038	* Assume the iswprint() library function works better than our own stuff.
1039	*/
1040	return iswprint(c);
1041	#else
1042	/ Sorted list of non-overlapping intervals.*
1043	* 0xd800-0xdfff is reserved for UTF-16, actually illegal. */
1044	static struct interval nonprint[] =
1045	{
1046	{`0x070f`, `0x070f`}, {`0x180b`, `0x180e`}, {`0x200b`, `0x200f`}, {`0x202a`, `0x202e`},
1047	{`0x206a`, `0x206f`}, {`0xd800`, `0xdfff`}, {`0xfeff`, `0xfeff`}, {`0xfff9`, `0xfffb`},
1048	{`0xfffe`, `0xffff`}
1049	};
1050
1051	return !intable(nonprint, ARRAY_SIZE(nonprint), c);
1052	#endif
1053	}
1054
1055	/*
1056	* Get class of a Unicode character.
1057	* 0: white space
1058	* 1: punctuation
1059	* 2 or bigger: some class of word character.
1060	*/
1061	int utf_class(const int c)
1062	{
1063	return utf_class_tab(c, curbuf->b_chartab);
1064	}
1065
1066	int utf_class_tab(const int c, const uint64_t *const chartab)
1067	{
1068	/ sorted list of non-overlapping intervals /
1069	static struct clinterval {
1070	unsigned int first;
1071	unsigned int last;
1072	unsigned int class;
1073	} classes[] = {
1074	{ `0x037e`, `0x037e`, `1` }, // Greek question mark
1075	{ `0x0387`, `0x0387`, `1` }, // Greek ano teleia
1076	{ `0x055a`, `0x055f`, `1` }, // Armenian punctuation
1077	{ `0x0589`, `0x0589`, `1` }, // Armenian full stop
1078	{ `0x05be`, `0x05be`, `1` },
1079	{ `0x05c0`, `0x05c0`, `1` },
1080	{ `0x05c3`, `0x05c3`, `1` },
1081	{ `0x05f3`, `0x05f4`, `1` },
1082	{ `0x060c`, `0x060c`, `1` },
1083	{ `0x061b`, `0x061b`, `1` },
1084	{ `0x061f`, `0x061f`, `1` },
1085	{ `0x066a`, `0x066d`, `1` },
1086	{ `0x06d4`, `0x06d4`, `1` },
1087	{ `0x0700`, `0x070d`, `1` }, // Syriac punctuation
1088	{ `0x0964`, `0x0965`, `1` },
1089	{ `0x0970`, `0x0970`, `1` },
1090	{ `0x0df4`, `0x0df4`, `1` },
1091	{ `0x0e4f`, `0x0e4f`, `1` },
1092	{ `0x0e5a`, `0x0e5b`, `1` },
1093	{ `0x0f04`, `0x0f12`, `1` },
1094	{ `0x0f3a`, `0x0f3d`, `1` },
1095	{ `0x0f85`, `0x0f85`, `1` },
1096	{ `0x104a`, `0x104f`, `1` }, // Myanmar punctuation
1097	{ `0x10fb`, `0x10fb`, `1` }, // Georgian punctuation
1098	{ `0x1361`, `0x1368`, `1` }, // Ethiopic punctuation
1099	{ `0x166d`, `0x166e`, `1` }, // Canadian Syl. punctuation
1100	{ `0x1680`, `0x1680`, `0` },
1101	{ `0x169b`, `0x169c`, `1` },
1102	{ `0x16eb`, `0x16ed`, `1` },
1103	{ `0x1735`, `0x1736`, `1` },
1104	{ `0x17d4`, `0x17dc`, `1` }, // Khmer punctuation
1105	{ `0x1800`, `0x180a`, `1` }, // Mongolian punctuation
1106	{ `0x2000`, `0x200b`, `0` }, // spaces
1107	{ `0x200c`, `0x2027`, `1` }, // punctuation and symbols
1108	{ `0x2028`, `0x2029`, `0` },
1109	{ `0x202a`, `0x202e`, `1` }, // punctuation and symbols
1110	{ `0x202f`, `0x202f`, `0` },
1111	{ `0x2030`, `0x205e`, `1` }, // punctuation and symbols
1112	{ `0x205f`, `0x205f`, `0` },
1113	{ `0x2060`, `0x27ff`, `1` }, // punctuation and symbols
1114	{ `0x2070`, `0x207f`, `0x2070` }, // superscript
1115	{ `0x2080`, `0x2094`, `0x2080` }, // subscript
1116	{ `0x20a0`, `0x27ff`, `1` }, // all kinds of symbols
1117	{ `0x2800`, `0x28ff`, `0x2800` }, // braille
1118	{ `0x2900`, `0x2998`, `1` }, // arrows, brackets, etc.
1119	{ `0x29d8`, `0x29db`, `1` },
1120	{ `0x29fc`, `0x29fd`, `1` },
1121	{ `0x2e00`, `0x2e7f`, `1` }, // supplemental punctuation
1122	{ `0x3000`, `0x3000`, `0` }, // ideographic space
1123	{ `0x3001`, `0x3020`, `1` }, // ideographic punctuation
1124	{ `0x3030`, `0x3030`, `1` },
1125	{ `0x303d`, `0x303d`, `1` },
1126	{ `0x3040`, `0x309f`, `0x3040` }, // Hiragana
1127	{ `0x30a0`, `0x30ff`, `0x30a0` }, // Katakana
1128	{ `0x3300`, `0x9fff`, `0x4e00` }, // CJK Ideographs
1129	{ `0xac00`, `0xd7a3`, `0xac00` }, // Hangul Syllables
1130	{ `0xf900`, `0xfaff`, `0x4e00` }, // CJK Ideographs
1131	{ `0xfd3e`, `0xfd3f`, `1` },
1132	{ `0xfe30`, `0xfe6b`, `1` }, // punctuation forms
1133	{ `0xff00`, `0xff0f`, `1` }, // half/fullwidth ASCII
1134	{ `0xff1a`, `0xff20`, `1` }, // half/fullwidth ASCII
1135	{ `0xff3b`, `0xff40`, `1` }, // half/fullwidth ASCII
1136	{ `0xff5b`, `0xff65`, `1` }, // half/fullwidth ASCII
1137	{ `0x1d000`, `0x1d24f`, `1` }, // Musical notation
1138	{ `0x1d400`, `0x1d7ff`, `1` }, // Mathematical Alphanumeric Symbols
1139	{ `0x1f000`, `0x1f2ff`, `1` }, // Game pieces; enclosed characters
1140	{ `0x1f300`, `0x1f9ff`, `1` }, // Many symbol blocks
1141	{ `0x20000`, `0x2a6df`, `0x4e00` }, // CJK Ideographs
1142	{ `0x2a700`, `0x2b73f`, `0x4e00` }, // CJK Ideographs
1143	{ `0x2b740`, `0x2b81f`, `0x4e00` }, // CJK Ideographs
1144	{ `0x2f800`, `0x2fa1f`, `0x4e00` }, // CJK Ideographs
1145	};
1146	int bot = `0`;
1147	int top = ARRAY_SIZE(classes) - `1`;
1148	int mid;
1149
1150	/ First quick check for Latin1 characters, use 'iskeyword'. /
1151	if (c < `0x100`) {
1152	if (c == `' '` \|\| c == `'\t'` \|\| c == NUL \|\| c == `0xa0`) {
1153	return `0`; // blank
1154	}
1155	if (vim_iswordc_tab(c, chartab)) {
1156	return `2`; // word character
1157	}
1158	return `1`; // punctuation
1159	}
1160
1161	/ binary search in table /
1162	while (top >= bot) {
1163	mid = (bot + top) / `2`;
1164	if (classes[mid].last < (unsigned int)c)
1165	bot = mid + `1`;
1166	else if (classes[mid].first > (unsigned int)c)
1167	top = mid - `1`;
1168	else
1169	return (int)classes[mid].class;
1170	}
1171
1172	// emoji
1173	if (intable(emoji_all, ARRAY_SIZE(emoji_all), c)) {
1174	return `3`;
1175	}
1176
1177	/ most other characters are "word" characters /
1178	return `2`;
1179	}
1180
1181	bool utf_ambiguous_width(int c)
1182	{
1183	return c >= `0x80` && (intable(ambiguous, ARRAY_SIZE(ambiguous), c)
1184	\|\| intable(emoji_all, ARRAY_SIZE(emoji_all), c));
1185	}
1186
1187	/*
1188	* Generic conversion function for case operations.
1189	* Return the converted equivalent of "a", which is a UCS-4 character. Use
1190	* the given conversion "table". Uses binary search on "table".
1191	*/
1192	static int utf_convert(int a, const convertStruct *const table, size_t n_items)
1193	{
1194	size_t start, mid, end; / indices into table /
1195
1196	start = `0`;
1197	end = n_items;
1198	while (start < end) {
1199	/ need to search further /
1200	mid = (end + start) / `2`;
1201	if (table[mid].rangeEnd < a)
1202	start = mid + `1`;
1203	else
1204	end = mid;
1205	}
1206	if (start < n_items
1207	&& table[start].rangeStart <= a
1208	&& a <= table[start].rangeEnd
1209	&& (a - table[start].rangeStart) % table[start].step == `0`)
1210	return a + table[start].offset;
1211	else
1212	return a;
1213	}
1214
1215	/*
1216	* Return the folded-case equivalent of "a", which is a UCS-4 character. Uses
1217	* simple case folding.
1218	*/
1219	int utf_fold(int a)
1220	{
1221	if (a < `0x80`) {
1222	// be fast for ASCII
1223	return a >= `0x41` && a <= `0x5a` ? a + `32` : a;
1224	}
1225	return utf_convert(a, foldCase, ARRAY_SIZE(foldCase));
1226	}
1227
1228	// Vim's own character class functions. These exist because many library
1229	// islower()/toupper() etc. do not work properly: they crash when used with
1230	// invalid values or can't handle latin1 when the locale is C.
1231	// Speed is most important here.
1232
1233	/// Return the upper-case equivalent of "a", which is a UCS-4 character. Use
1234	/// simple case folding.
1235	int mb_toupper(int a)
1236	{
1237	/ If 'casemap' contains "keepascii" use ASCII style toupper(). /
1238	if (a < `128` && (cmp_flags & CMP_KEEPASCII))
1239	return TOUPPER_ASC(a);
1240
1241	#if defined(__STDC_ISO_10646__)
1242	/ If towupper() is available and handles Unicode, use it. /
1243	if (!(cmp_flags & CMP_INTERNAL))
1244	return towupper(a);
1245	#endif
1246
1247	/ For characters below 128 use locale sensitive toupper(). /
1248	if (a < `128`)
1249	return TOUPPER_LOC(a);
1250
1251	/ For any other characters use the above mapping table. /
1252	return utf_convert(a, toUpper, ARRAY_SIZE(toUpper));
1253	}
1254
/// Check whether "a" is a lower-case character.
///
/// @param a  character to check
/// @return true when mb_toupper() changes "a", or when "a" is 0xdf
bool mb_islower(int a)
{
  const bool changes_when_uppercased = mb_toupper(a) != a;

  // German sharp s is lower case but has no upper case equivalent.
  return changes_when_uppercased || a == 0xdf;
}
1260
1261	/// Return the lower-case equivalent of "a", which is a UCS-4 character. Use
1262	/// simple case folding.
1263	int mb_tolower(int a)
1264	{
1265	/ If 'casemap' contains "keepascii" use ASCII style tolower(). /
1266	if (a < `128` && (cmp_flags & CMP_KEEPASCII))
1267	return TOLOWER_ASC(a);
1268
1269	#if defined(__STDC_ISO_10646__)
1270	/ If towlower() is available and handles Unicode, use it. /
1271	if (!(cmp_flags & CMP_INTERNAL))
1272	return towlower(a);
1273	#endif
1274
1275	/ For characters below 128 use locale sensitive tolower(). /
1276	if (a < `128`)
1277	return TOLOWER_LOC(a);
1278
1279	/ For any other characters use the above mapping table. /
1280	return utf_convert(a, toLower, ARRAY_SIZE(toLower));
1281	}
1282
/// Check whether "a" is an upper-case character.
///
/// @param a  character to check
/// @return true when mb_tolower() changes "a"
bool mb_isupper(int a)
{
  const int lowered = mb_tolower(a);
  return lowered != a;
}
1287
1288	static int utf_strnicmp(const char_u s1, const* char_u *s2, size_t n1,
1289	size_t n2)
1290	{
1291	int c1, c2, cdiff;
1292	char_u buffer[`6`];
1293
1294	for (;; ) {
1295	c1 = utf_safe_read_char_adv(&s1, &n1);
1296	c2 = utf_safe_read_char_adv(&s2, &n2);
1297
1298	if (c1 <= `0` \|\| c2 <= `0`)
1299	break;
1300
1301	if (c1 == c2)
1302	continue;
1303
1304	cdiff = utf_fold(c1) - utf_fold(c2);
1305	if (cdiff != `0`)
1306	return cdiff;
1307	}
1308
1309	/ some string ended or has an incomplete/illegal character sequence /
1310
1311	if (c1 == `0` \|\| c2 == `0`) {
1312	/ some string ended. shorter string is smaller /
1313	if (c1 == `0` && c2 == `0`)
1314	return `0`;
1315	return c1 == `0` ? -`1` : `1`;
1316	}
1317
1318	/ Continue with bytewise comparison to produce some result that*
1319	* would make comparison operations involving this function transitive.
1320	*
1321	* If only one string had an error, comparison should be made with
1322	* folded version of the other string. In this case it is enough
1323	* to fold just one character to determine the result of comparison. */
1324
1325	if (c1 != -`1` && c2 == -`1`) {
1326	n1 = utf_char2bytes(utf_fold(c1), buffer);
1327	s1 = buffer;
1328	} else if (c2 != -`1` && c1 == -`1`) {
1329	n2 = utf_char2bytes(utf_fold(c2), buffer);
1330	s2 = buffer;
1331	}
1332
1333	while (n1 > `0` && n2 > `0` && s1 != NUL && s2 != NUL) {
1334	cdiff = (int)(s1) - (int)(s2);
1335	if (cdiff != `0`)
1336	return cdiff;
1337
1338	s1++;
1339	s2++;
1340	n1--;
1341	n2--;
1342	}
1343
1344	if (n1 > `0` && *s1 == NUL)
1345	n1 = `0`;
1346	if (n2 > `0` && *s2 == NUL)
1347	n2 = `0`;
1348
1349	if (n1 == `0` && n2 == `0`)
1350	return `0`;
1351	return n1 == `0` ? -`1` : `1`;
1352	}
1353
1354	#ifdef WIN32
1355	#ifndef CP_UTF8
1356	# define CP_UTF8 65001 /* magic number from winnls.h */
1357	#endif
1358
1359	/// Converts string from UTF-8 to UTF-16.
1360	///
1361	/// @param utf8 UTF-8 string.
1362	/// @param utf8len Length of `utf8`. May be -1 if `utf8` is NUL-terminated.
1363	/// @param utf16[out,allocated] NUL-terminated UTF-16 string, or NULL on error
1364	/// @return 0 on success, or libuv error code
1365	int utf8_to_utf16(const char utf8, int* utf8len, wchar_t **utf16)
1366	FUNC_ATTR_NONNULL_ALL
1367	{
1368	// Compute the length needed for the converted UTF-16 string.
1369	int bufsize = MultiByteToWideChar(CP_UTF8,
1370	`0`, // dwFlags: must be 0 for UTF-8
1371	utf8, // -1: process up to NUL
1372	utf8len,
1373	NULL,
1374	`0`); // 0: get length, don't convert
1375	if (bufsize == `0`) {
1376	*utf16 = NULL;
1377	return uv_translate_sys_error(GetLastError());
1378	}
1379
1380	// Allocate the destination buffer adding an extra byte for the terminating
1381	// NULL. If `utf8len` is not -1 MultiByteToWideChar will not add it, so
1382	// we do it ourselves always, just in case.
1383	utf16 = xmalloc(sizeof(wchar_t) (bufsize + `1`));
1384
1385	// Convert to UTF-16.
1386	bufsize = MultiByteToWideChar(CP_UTF8, `0`, utf8, utf8len, *utf16, bufsize);
1387	if (bufsize == `0`) {
1388	XFREE_CLEAR(*utf16);
1389	return uv_translate_sys_error(GetLastError());
1390	}
1391
1392	(*utf16)[bufsize] = L`'\0'`;
1393	return `0`;
1394	}
1395
1396	/// Converts string from UTF-16 to UTF-8.
1397	///
1398	/// @param utf16 UTF-16 string.
1399	/// @param utf16len Length of `utf16`. May be -1 if `utf16` is NUL-terminated.
1400	/// @param utf8[out,allocated] NUL-terminated UTF-8 string, or NULL on error
1401	/// @return 0 on success, or libuv error code
1402	int utf16_to_utf8(const wchar_t utf16, int* utf16len, char **utf8)
1403	FUNC_ATTR_NONNULL_ALL
1404	{
1405	// Compute the space needed for the converted UTF-8 string.
1406	DWORD bufsize = WideCharToMultiByte(CP_UTF8,
1407	`0`,
1408	utf16,
1409	utf16len,
1410	NULL,
1411	`0`,
1412	NULL,
1413	NULL);
1414	if (bufsize == `0`) {
1415	*utf8 = NULL;
1416	return uv_translate_sys_error(GetLastError());
1417	}
1418
1419	// Allocate the destination buffer adding an extra byte for the terminating
1420	// NULL. If `utf16len` is not -1 WideCharToMultiByte will not add it, so
1421	// we do it ourselves always, just in case.
1422	*utf8 = xmalloc(bufsize + `1`);
1423
1424	// Convert to UTF-8.
1425	bufsize = WideCharToMultiByte(CP_UTF8,
1426	`0`,
1427	utf16,
1428	utf16len,
1429	*utf8,
1430	bufsize,
1431	NULL,
1432	NULL);
1433	if (bufsize == `0`) {
1434	XFREE_CLEAR(*utf8);
1435	return uv_translate_sys_error(GetLastError());
1436	}
1437
1438	(*utf8)[bufsize] = `'\0'`;
1439	return `0`;
1440	}
1441
1442	#endif
1443
1444	/// Measure the length of a string in corresponding UTF-32 and UTF-16 units.
1445	///
1446	/// Invalid UTF-8 bytes, or embedded surrogates, count as one code point/unit
1447	/// each.
1448	///
1449	/// The out parameters are incremented. This is used to measure the size of
1450	/// a buffer region consisting of multiple line segments.
1451	///
1452	/// @param s the string
1453	/// @param len maximum length (an earlier NUL terminates)
1454	/// @param[out] codepoints incremented with UTF-32 code point size
1455	/// @param[out] codeunits incremented with UTF-16 code unit size
1456	void mb_utflen(const char_u s, size_t len, size_t codepoints,
1457	size_t *codeunits)
1458	FUNC_ATTR_NONNULL_ALL
1459	{
1460	size_t count = `0`, extra = `0`;
1461	size_t clen;
1462	for (size_t i = `0`; i < len && s[i] != NUL; i += clen) {
1463	clen = utf_ptr2len_len(s+i, len-i);
1464	// NB: gets the byte value of invalid sequence bytes.
1465	// we only care whether the char fits in the BMP or not
1466	int c = (clen > `1`) ? utf_ptr2char(s+i) : s[i];
1467	count++;
1468	if (c > `0xFFFF`) {
1469	extra++;
1470	}
1471	}
1472	*codepoints += count;
1473	*codeunits += count + extra;
1474	}
1475
1476	ssize_t mb_utf_index_to_bytes(const char_u *s, size_t len,
1477	size_t index, bool use_utf16_units)
1478	FUNC_ATTR_NONNULL_ALL
1479	{
1480	size_t count = `0`;
1481	size_t clen, i;
1482	if (index == `0`) {
1483	return `0`;
1484	}
1485	for (i = `0`; i < len && s[i] != NUL; i += clen) {
1486	clen = utf_ptr2len_len(s+i, len-i);
1487	// NB: gets the byte value of invalid sequence bytes.
1488	// we only care whether the char fits in the BMP or not
1489	int c = (clen > `1`) ? utf_ptr2char(s+i) : s[i];
1490	count++;
1491	if (use_utf16_units && c > `0xFFFF`) {
1492	count++;
1493	}
1494	if (count >= index) {
1495	return i+clen;
1496	}
1497	}
1498	return -`1`;
1499	}
1500
1501
1502	/*
1503	* Version of strnicmp() that handles multi-byte characters.
1504	* Needed for Big5, Shift-JIS and UTF-8 encoding. Other DBCS encodings can
1505	* probably use strnicmp(), because there are no ASCII characters in the
1506	* second byte.
1507	* Returns zero if s1 and s2 are equal (ignoring case), the difference between
1508	* two characters otherwise.
1509	*/
1510	int mb_strnicmp(const char_u s1, const* char_u s2, const* size_t nn)
1511	{
1512	return utf_strnicmp(s1, s2, nn, nn);
1513	}
1514
1515	/// Compare strings case-insensitively
1516	///
1517	/// @note We need to call mb_stricmp() even when we aren't dealing with
1518	/// a multi-byte encoding because mb_stricmp() takes care of all ASCII and
1519	/// non-ascii encodings, including characters with umlauts in latin1,
1520	/// etc., while STRICMP() only handles the system locale version, which
1521	/// often does not handle non-ascii properly.
1522	///
1523	/// @param[in] s1 First string to compare, not more then #MAXCOL characters.
1524	/// @param[in] s2 Second string to compare, not more then #MAXCOL characters.
1525	///
1526	/// @return 0 if strings are equal, <0 if s1 < s2, >0 if s1 > s2.
1527	int mb_stricmp(const char s1, const* char *s2)
1528	{
1529	return mb_strnicmp((const char_u )s1, (const* char_u *)s2, MAXCOL);
1530	}
1531
1532	/*
1533	* "g8": show bytes of the UTF-8 char under the cursor. Doesn't matter what
1534	* 'encoding' has been set to.
1535	*/
1536	void show_utf8(void)
1537	{
1538	int len;
1539	int rlen = `0`;
1540	char_u *line;
1541	int clen;
1542	int i;
1543
1544	/ Get the byte length of the char under the cursor, including composing*
1545	* characters. */
1546	line = get_cursor_pos_ptr();
1547	len = utfc_ptr2len(line);
1548	if (len == `0`) {
1549	MSG("NUL");
1550	return;
1551	}
1552
1553	clen = `0`;
1554	for (i = `0`; i < len; ++i) {
1555	if (clen == `0`) {
1556	/ start of (composing) character, get its length /
1557	if (i > `0`) {
1558	STRCPY(IObuff + rlen, "+ ");
1559	rlen += `2`;
1560	}
1561	clen = utf_ptr2len(line + i);
1562	}
1563	sprintf((char *)IObuff + rlen, "%02x ",
1564	(line[i] == NL) ? NUL : line[i]); / NUL is stored as NL /
1565	--clen;
1566	rlen += (int)STRLEN(IObuff + rlen);
1567	if (rlen > IOSIZE - `20`)
1568	break;
1569	}
1570
1571	msg(IObuff);
1572	}
1573
1574	/// Return offset from "p" to the first byte of the character it points into.
1575	/// If "p" points to the NUL at the end of the string return 0.
1576	/// Returns 0 when already at the first byte of a character.
1577	int utf_head_off(const char_u base, const* char_u *p)
1578	{
1579	int c;
1580	int len;
1581
1582	if (p < `0x80`) /* be quick for ASCII /
1583	return `0`;
1584
1585	/ Skip backwards over trailing bytes: 10xx.xxxx*
1586	* Skip backwards again if on a composing char. */
1587	const char_u *q;
1588	for (q = p;; --q) {
1589	/ Move s to the last byte of this char. /
1590	const char_u *s;
1591	for (s = q; (s[`1`] & `0xc0`) == `0x80`; ++s) {}
1592
1593	/ Move q to the first byte of this char. /
1594	while (q > base && (*q & `0xc0`) == `0x80`)
1595	--q;
1596	/ Check for illegal sequence. Do allow an illegal byte after where we*
1597	* started. */
1598	len = utf8len_tab[*q];
1599	if (len != (int)(s - q + `1`) && len != (int)(p - q + `1`))
1600	return `0`;
1601
1602	if (q <= base)
1603	break;
1604
1605	c = utf_ptr2char(q);
1606	if (utf_iscomposing(c))
1607	continue;
1608
1609	if (arabic_maycombine(c)) {
1610	/ Advance to get a sneak-peak at the next char /
1611	const char_u *j = q;
1612	--j;
1613	/ Move j to the first byte of this char. /
1614	while (j > base && (*j & `0xc0`) == `0x80`)
1615	--j;
1616	if (arabic_combine(utf_ptr2char(j), c))
1617	continue;
1618	}
1619	break;
1620	}
1621
1622	return (int)(p - q);
1623	}
1624
1625	/// Copy a character, advancing the pointers
1626	///
1627	/// @param[in,out] fp Source of the character to copy.
1628	/// @param[in,out] tp Destination to copy to.
1629	void mb_copy_char(const char_u **const fp, char_u **const tp)
1630	{
1631	const size_t l = (size_t)utfc_ptr2len(*fp);
1632
1633	memmove(tp, fp, l);
1634	*tp += l;
1635	*fp += l;
1636	}
1637
1638	/*
1639	* Return the offset from "p" to the first byte of a character. When "p" is
1640	* at the start of a character 0 is returned, otherwise the offset to the next
1641	* character. Can start anywhere in a stream of bytes.
1642	*/
1643	int mb_off_next(char_u base, char_u p)
1644	{
1645	int i;
1646	int j;
1647
1648	if (p < `0x80`) { // be quick for ASCII*
1649	return `0`;
1650	}
1651
1652	// Find the next character that isn't 10xx.xxxx
1653	for (i = `0`; (p[i] & `0xc0`) == `0x80`; i++) {}
1654	if (i > `0`) {
1655	// Check for illegal sequence.
1656	for (j = `0`; p - j > base; j++) {
1657	if ((p[-j] & `0xc0`) != `0x80`) {
1658	break;
1659	}
1660	}
1661	if (utf8len_tab[p[-j]] != i + j) {
1662	return `0`;
1663	}
1664	}
1665	return i;
1666	}
1667
1668	/*
1669	* Return the offset from "p" to the last byte of the character it points
1670	* into. Can start anywhere in a stream of bytes.
1671	*/
1672	int mb_tail_off(char_u base, char_u p)
1673	{
1674	int i;
1675	int j;
1676
1677	if (*p == NUL)
1678	return `0`;
1679
1680	// Find the last character that is 10xx.xxxx
1681	for (i = `0`; (p[i + `1`] & `0xc0`) == `0x80`; i++) {}
1682
1683	// Check for illegal sequence.
1684	for (j = `0`; p - j > base; j++) {
1685	if ((p[-j] & `0xc0`) != `0x80`) {
1686	break;
1687	}
1688	}
1689
1690	if (utf8len_tab[p[-j]] != i + j + `1`) {
1691	return `0`;
1692	}
1693	return i;
1694	}
1695
1696	/*
1697	* Find the next illegal byte sequence.
1698	*/
1699	void utf_find_illegal(void)
1700	{
1701	pos_T pos = curwin->w_cursor;
1702	char_u *p;
1703	int len;
1704	vimconv_T vimconv;
1705	char_u *tofree = NULL;
1706
1707	vimconv.vc_type = CONV_NONE;
1708	if (enc_canon_props(curbuf->b_p_fenc) & ENC_8BIT) {
1709	// 'encoding' is "utf-8" but we are editing a 8-bit encoded file,
1710	// possibly a utf-8 file with illegal bytes. Setup for conversion
1711	// from utf-8 to 'fileencoding'.
1712	convert_setup(&vimconv, p_enc, curbuf->b_p_fenc);
1713	}
1714
1715	curwin->w_cursor.coladd = `0`;
1716	for (;; ) {
1717	p = get_cursor_pos_ptr();
1718	if (vimconv.vc_type != CONV_NONE) {
1719	xfree(tofree);
1720	tofree = string_convert(&vimconv, p, NULL);
1721	if (tofree == NULL)
1722	break;
1723	p = tofree;
1724	}
1725
1726	while (*p != NUL) {
1727	/ Illegal means that there are not enough trail bytes (checked by*
1728	* utf_ptr2len()) or too many of them (overlong sequence). */
1729	len = utf_ptr2len(p);
1730	if (*p >= `0x80` && (len == `1`
1731	\|\| utf_char2len(utf_ptr2char(p)) != len)) {
1732	if (vimconv.vc_type == CONV_NONE)
1733	curwin->w_cursor.col += (colnr_T)(p - get_cursor_pos_ptr());
1734	else {
1735	int l;
1736
1737	len = (int)(p - tofree);
1738	for (p = get_cursor_pos_ptr(); *p != NUL && len-- > `0`; p += l) {
1739	l = utf_ptr2len(p);
1740	curwin->w_cursor.col += l;
1741	}
1742	}
1743	goto theend;
1744	}
1745	p += len;
1746	}
1747	if (curwin->w_cursor.lnum == curbuf->b_ml.ml_line_count)
1748	break;
1749	++curwin->w_cursor.lnum;
1750	curwin->w_cursor.col = `0`;
1751	}
1752
1753	/ didn't find it: don't move and beep /
1754	curwin->w_cursor = pos;
1755	beep_flush();
1756
1757	theend:
1758	xfree(tofree);
1759	convert_setup(&vimconv, NULL, NULL);
1760	}
1761
1762	/*
1763	* If the cursor moves on an trail byte, set the cursor on the lead byte.
1764	* Thus it moves left if necessary.
1765	*/
1766	void mb_adjust_cursor(void)
1767	{
1768	mark_mb_adjustpos(curbuf, &curwin->w_cursor);
1769	}
1770
1771	/// Checks and adjusts cursor column. Not mode-dependent.
1772	/// @see check_cursor_col_win
1773	///
1774	/// @param win_ Places cursor on a valid column for this window.
1775	void mb_check_adjust_col(void *win_)
1776	{
1777	win_T win = (win_T )win_;
1778	colnr_T oldcol = win->w_cursor.col;
1779
1780	// Column 0 is always valid.
1781	if (oldcol != `0`) {
1782	char_u *p = ml_get_buf(win->w_buffer, win->w_cursor.lnum, false);
1783	colnr_T len = (colnr_T)STRLEN(p);
1784
1785	// Empty line or invalid column?
1786	if (len == `0` \|\| oldcol < `0`) {
1787	win->w_cursor.col = `0`;
1788	} else {
1789	// Cursor column too big for line?
1790	if (oldcol > len) {
1791	win->w_cursor.col = len - `1`;
1792	}
1793	// Move the cursor to the head byte.
1794	win->w_cursor.col -= utf_head_off(p, p + win->w_cursor.col);
1795	}
1796
1797	// Reset `coladd` when the cursor would be on the right half of a
1798	// double-wide character.
1799	if (win->w_cursor.coladd == `1` && p[win->w_cursor.col] != TAB
1800	&& vim_isprintc(utf_ptr2char(p + win->w_cursor.col))
1801	&& ptr2cells(p + win->w_cursor.col) > `1`) {
1802	win->w_cursor.coladd = `0`;
1803	}
1804	}
1805	}
1806
1807	/*
1808	* Return a pointer to the character before "*p", if there is one.
1809	*/
1810	char_u * mb_prevptr(
1811	char_u line, /* start of the string /
1812	char_u *p
1813	)
1814	{
1815	if (p > line) {
1816	MB_PTR_BACK(line, p);
1817	}
1818	return p;
1819	}
1820
1821	/*
1822	* Return the character length of "str". Each multi-byte character (with
1823	* following composing characters) counts as one.
1824	*/
1825	int mb_charlen(char_u *str)
1826	{
1827	char_u *p = str;
1828	int count;
1829
1830	if (p == NULL)
1831	return `0`;
1832
1833	for (count = `0`; *p != NUL; count++)
1834	p += (*mb_ptr2len)(p);
1835
1836	return count;
1837	}
1838
1839	/*
1840	* Like mb_charlen() but for a string with specified length.
1841	*/
1842	int mb_charlen_len(char_u str, int* len)
1843	{
1844	char_u *p = str;
1845	int count;
1846
1847	for (count = `0`; *p != NUL && p < str + len; count++)
1848	p += (*mb_ptr2len)(p);
1849
1850	return count;
1851	}
1852
1853	/// Try to unescape a multibyte character
1854	///
1855	/// Used for the rhs and lhs of the mappings.
1856	///
1857	/// @param[in,out] pp String to unescape. Is advanced to just after the bytes
1858	/// that form a multibyte character.
1859	///
1860	/// @return Unescaped string if it is a multibyte character, NULL if no
1861	/// multibyte character was found. Returns a static buffer, always one
1862	/// and the same.
1863	const char mb_unescape(const* char **const pp)
1864	FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
1865	{
1866	static char buf[`6`];
1867	size_t buf_idx = `0`;
1868	uint8_t str = (uint8_t )(*pp);
1869
1870	// Must translate K_SPECIAL KS_SPECIAL KE_FILLER to K_SPECIAL and CSI
1871	// KS_EXTRA KE_CSI to CSI.
1872	// Maximum length of a utf-8 character is 4 bytes.
1873	for (size_t str_idx = `0`; str[str_idx] != NUL && buf_idx < `4`; str_idx++) {
1874	if (str[str_idx] == K_SPECIAL
1875	&& str[str_idx + `1`] == KS_SPECIAL
1876	&& str[str_idx + `2`] == KE_FILLER) {
1877	buf[buf_idx++] = (char)K_SPECIAL;
1878	str_idx += `2`;
1879	} else if ((str[str_idx] == K_SPECIAL)
1880	&& str[str_idx + `1`] == KS_EXTRA
1881	&& str[str_idx + `2`] == KE_CSI) {
1882	buf[buf_idx++] = (char)CSI;
1883	str_idx += `2`;
1884	} else if (str[str_idx] == K_SPECIAL) {
1885	break; // A special key can't be a multibyte char.
1886	} else {
1887	buf[buf_idx++] = (char)str[str_idx];
1888	}
1889	buf[buf_idx] = NUL;
1890
1891	// Return a multi-byte character if it's found. An illegal sequence
1892	// will result in a 1 here.
1893	if (utf_ptr2len((const char_u *)buf) > `1`) {
1894	pp = (const* char *)str + str_idx + `1`;
1895	return buf;
1896	}
1897
1898	// Bail out quickly for ASCII.
1899	if ((uint8_t)buf[`0`] < `128`) {
1900	break;
1901	}
1902	}
1903	return NULL;
1904	}
1905
1906
1907	/*
1908	* Skip the Vim specific head of a 'encoding' name.
1909	*/
1910	char_u * enc_skip(char_u *p)
1911	{
1912	if (STRNCMP(p, "2byte-", `6`) == `0`)
1913	return p + `6`;
1914	if (STRNCMP(p, "8bit-", `5`) == `0`)
1915	return p + `5`;
1916	return p;
1917	}
1918
1919	/*
1920	* Find the canonical name for encoding "enc".
1921	* When the name isn't recognized, returns "enc" itself, but with all lower
1922	* case characters and '_' replaced with '-'.
1923	* Returns an allocated string.
1924	*/
1925	char_u enc_canonize(char_u enc) FUNC_ATTR_NONNULL_RET
1926	{
1927	char_u p, s;
1928	int i;
1929
1930	if (STRCMP(enc, "default") == `0`) {
1931	// Use the default encoding as found by set_init_1().
1932	return vim_strsave(fenc_default);
1933	}
1934
1935	/ copy "enc" to allocated memory, with room for two '-' /
1936	char_u *r = xmalloc(STRLEN(enc) + `3`);
1937	/ Make it all lower case and replace '_' with '-'. /
1938	p = r;
1939	for (s = enc; *s != NUL; ++s) {
1940	if (*s == `'_'`)
1941	*p++ = `'-'`;
1942	else
1943	p++ = TOLOWER_ASC(s);
1944	}
1945	*p = NUL;
1946
1947	/ Skip "2byte-" and "8bit-". /
1948	p = enc_skip(r);
1949
1950	/ Change "microsoft-cp" to "cp". Used in some spell files. /
1951	if (STRNCMP(p, "microsoft-cp", `12`) == `0`)
1952	STRMOVE(p, p + `10`);
1953
1954	/ "iso8859" -> "iso-8859" /
1955	if (STRNCMP(p, "iso8859", `7`) == `0`) {
1956	STRMOVE(p + `4`, p + `3`);
1957	p[`3`] = `'-'`;
1958	}
1959
1960	/ "iso-8859n" -> "iso-8859-n" /
1961	if (STRNCMP(p, "iso-8859", `8`) == `0` && p[`8`] != `'-'`) {
1962	STRMOVE(p + `9`, p + `8`);
1963	p[`8`] = `'-'`;
1964	}
1965
1966	/ "latin-N" -> "latinN" /
1967	if (STRNCMP(p, "latin-", `6`) == `0`)
1968	STRMOVE(p + `5`, p + `6`);
1969
1970	if (enc_canon_search(p) >= `0`) {
1971	/ canonical name can be used unmodified /
1972	if (p != r)
1973	STRMOVE(r, p);
1974	} else if ((i = enc_alias_search(p)) >= `0`) {
1975	/ alias recognized, get canonical name /
1976	xfree(r);
1977	r = vim_strsave((char_u *)enc_canon_table[i].name);
1978	}
1979	return r;
1980	}
1981
1982	/*
1983	* Search for an encoding alias of "name".
1984	* Returns -1 when not found.
1985	*/
1986	static int enc_alias_search(char_u *name)
1987	{
1988	int i;
1989
1990	for (i = `0`; enc_alias_table[i].name != NULL; ++i)
1991	if (STRCMP(name, enc_alias_table[i].name) == `0`)
1992	return enc_alias_table[i].canon;
1993	return -`1`;
1994	}
1995
1996
1997	#ifdef HAVE_LANGINFO_H
1998	# include <langinfo.h>
1999	#endif
2000
2001	/*
2002	* Get the canonicalized encoding of the current locale.
2003	* Returns an allocated string when successful, NULL when not.
2004	*/
2005	char_u * enc_locale(void)
2006	{
2007	int i;
2008	char buf[`50`];
2009
2010	const char *s;
2011	# ifdef HAVE_NL_LANGINFO_CODESET
2012	if (!(s = nl_langinfo(CODESET)) \|\| *s == NUL)
2013	# endif
2014	{
2015	# if defined(HAVE_LOCALE_H)
2016	if (!(s = setlocale(LC_CTYPE, NULL)) \|\| *s == NUL)
2017	# endif
2018	{
2019	if ((s = os_getenv("LC_ALL"))) {
2020	if ((s = os_getenv("LC_CTYPE"))) {
2021	s = os_getenv("LANG");
2022	}
2023	}
2024	}
2025	}
2026
2027	if (!s) {
2028	return NULL;
2029	}
2030
2031	// The most generic locale format is:
2032	// language[_territory][.codeset][@modifier][+special][,[sponsor][_revision]]
2033	// If there is a '.' remove the part before it.
2034	// if there is something after the codeset, remove it.
2035	// Make the name lowercase and replace '_' with '-'.
2036	// Exception: "ja_JP.EUC" == "euc-jp", "zh_CN.EUC" = "euc-cn",
2037	// "ko_KR.EUC" == "euc-kr"
2038	const char p = (char* )vim_strchr((char_u )s, `'.'`);
2039	if (p != NULL) {
2040	if (p > s + `2` && !STRNICMP(p + `1`, "EUC", `3`)
2041	&& !isalnum((int)p[`4`]) && p[`4`] != `'-'` && p[-`3`] == `'_'`) {
2042	// Copy "XY.EUC" to "euc-XY" to buf[10].
2043	memmove(buf, "euc-", `4`);
2044	buf[`4`] = (ASCII_ISALNUM(p[-`2`]) ? TOLOWER_ASC(p[-`2`]) : `0`);
2045	buf[`5`] = (ASCII_ISALNUM(p[-`1`]) ? TOLOWER_ASC(p[-`1`]) : `0`);
2046	buf[`6`] = NUL;
2047	} else {
2048	s = p + `1`;
2049	goto enc_locale_copy_enc;
2050	}
2051	} else {
2052	enc_locale_copy_enc:
2053	for (i = `0`; i < (int)sizeof(buf) - `1` && s[i] != NUL; i++) {
2054	if (s[i] == `'_'` \|\| s[i] == `'-'`) {
2055	buf[i] = `'-'`;
2056	} else if (ASCII_ISALNUM((uint8_t)s[i])) {
2057	buf[i] = TOLOWER_ASC(s[i]);
2058	} else {
2059	break;
2060	}
2061	}
2062	buf[i] = NUL;
2063	}
2064
2065	return enc_canonize((char_u *)buf);
2066	}
2067
2068	# if defined(HAVE_ICONV)
2069
2070
2071	/*
2072	* Call iconv_open() with a check if iconv() works properly (there are broken
2073	* versions).
2074	* Returns (void *)-1 if failed.
2075	* (should return iconv_t, but that causes problems with prototypes).
2076	*/
2077	void * my_iconv_open(char_u to, char_u from)
2078	{
2079	iconv_t fd;
2080	#define ICONV_TESTLEN 400
2081	char_u tobuf[ICONV_TESTLEN];
2082	char *p;
2083	size_t tolen;
2084	static WorkingStatus iconv_working = kUnknown;
2085
2086	if (iconv_working == kBroken)
2087	return (void )-`1`; /* detected a broken iconv() previously /
2088
2089	fd = iconv_open((char )enc_skip(to), (char* *)enc_skip(from));
2090
2091	if (fd != (iconv_t)-`1` && iconv_working == kUnknown) {
2092	/*
2093	* Do a dummy iconv() call to check if it actually works. There is a
2094	* version of iconv() on Linux that is broken. We can't ignore it,
2095	* because it's wide-spread. The symptoms are that after outputting
2096	* the initial shift state the "to" pointer is NULL and conversion
2097	* stops for no apparent reason after about 8160 characters.
2098	*/
2099	p = (char *)tobuf;
2100	tolen = ICONV_TESTLEN;
2101	(void)iconv(fd, NULL, NULL, &p, &tolen);
2102	if (p == NULL) {
2103	iconv_working = kBroken;
2104	iconv_close(fd);
2105	fd = (iconv_t)-`1`;
2106	} else
2107	iconv_working = kWorking;
2108	}
2109
2110	return (void *)fd;
2111	}
2112
2113	/*
2114	* Convert the string "str[slen]" with iconv().
2115	* If "unconvlenp" is not NULL handle the string ending in an incomplete
2116	* sequence and set "*unconvlenp" to the length of it.
2117	* Returns the converted string in allocated memory. NULL for an error.
2118	* If resultlenp is not NULL, sets it to the result length in bytes.
2119	*/
2120	static char_u iconv_string(const* vimconv_T *const vcp, char_u *str,
2121	size_t slen, size_t unconvlenp, size_t resultlenp)
2122	{
2123	const char *from;
2124	size_t fromlen;
2125	char *to;
2126	size_t tolen;
2127	size_t len = `0`;
2128	size_t done = `0`;
2129	char_u *result = NULL;
2130	char_u *p;
2131	int l;
2132
2133	from = (char *)str;
2134	fromlen = slen;
2135	for (;; ) {
2136	if (len == `0` \|\| ICONV_ERRNO == ICONV_E2BIG) {
2137	/ Allocate enough room for most conversions. When re-allocating*
2138	* increase the buffer size. */
2139	len = len + fromlen * `2` + `40`;
2140	p = xmalloc(len);
2141	if (done > `0`)
2142	memmove(p, result, done);
2143	xfree(result);
2144	result = p;
2145	}
2146
2147	to = (char *)result + done;
2148	tolen = len - done - `2`;
2149	// Avoid a warning for systems with a wrong iconv() prototype by
2150	// casting the second argument to void .*
2151	if (iconv(vcp->vc_fd, (void *)&from, &fromlen, &to, &tolen) != SIZE_MAX) {
2152	// Finished, append a NUL.
2153	*to = NUL;
2154	break;
2155	}
2156
2157	// Check both ICONV_EINVAL and EINVAL, because the dynamically loaded
2158	// iconv library may use one of them.
2159	if (!vcp->vc_fail && unconvlenp != NULL
2160	&& (ICONV_ERRNO == ICONV_EINVAL \|\| ICONV_ERRNO == EINVAL)) {
2161	// Handle an incomplete sequence at the end.
2162	*to = NUL;
2163	*unconvlenp = fromlen;
2164	break;
2165	} else if (!vcp->vc_fail
2166	&& (ICONV_ERRNO == ICONV_EILSEQ \|\| ICONV_ERRNO == EILSEQ
2167	\|\| ICONV_ERRNO == ICONV_EINVAL \|\| ICONV_ERRNO == EINVAL)) {
2168	// Check both ICONV_EILSEQ and EILSEQ, because the dynamically loaded
2169	// iconv library may use one of them.
2170
2171	// Can't convert: insert a '?' and skip a character. This assumes
2172	// conversion from 'encoding' to something else. In other
2173	// situations we don't know what to skip anyway.
2174	*to++ = `'?'`;
2175	if (utf_ptr2cells((char_u *)from) > `1`) {
2176	*to++ = `'?'`;
2177	}
2178	l = utfc_ptr2len_len((const char_u )from, (int*)fromlen);
2179	from += l;
2180	fromlen -= l;
2181	} else if (ICONV_ERRNO != ICONV_E2BIG) {
2182	// conversion failed
2183	XFREE_CLEAR(result);
2184	break;
2185	}
2186	// Not enough room or skipping illegal sequence.
2187	done = to - (char *)result;
2188	}
2189
2190	if (resultlenp != NULL && result != NULL)
2191	resultlenp = (size_t)(to - (char* *)result);
2192	return result;
2193	}
2194
2195	# endif // HAVE_ICONV
2196
2197
2198
2199
2200	/*
2201	* Setup "vcp" for conversion from "from" to "to".
2202	* The names must have been made canonical with enc_canonize().
2203	* vcp->vc_type must have been initialized to CONV_NONE.
2204	* Note: cannot be used for conversion from/to ucs-2 and ucs-4 (will use utf-8
2205	* instead).
2206	* Afterwards invoke with "from" and "to" equal to NULL to cleanup.
2207	* Return FAIL when conversion is not supported, OK otherwise.
2208	*/
2209	int convert_setup(vimconv_T vcp, char_u from, char_u *to)
2210	{
2211	return convert_setup_ext(vcp, from, true, to, true);
2212	}
2213
2214	/*
2215	* As convert_setup(), but only when from_unicode_is_utf8 is TRUE will all
2216	* "from" unicode charsets be considered utf-8. Same for "to".
2217	*/
2218	int convert_setup_ext(vimconv_T vcp, char_u from, bool from_unicode_is_utf8,
2219	char_u *to, bool to_unicode_is_utf8)
2220	{
2221	int from_prop;
2222	int to_prop;
2223	int from_is_utf8;
2224	int to_is_utf8;
2225
2226	// Reset to no conversion.
2227	# ifdef HAVE_ICONV
2228	if (vcp->vc_type == CONV_ICONV && vcp->vc_fd != (iconv_t)-`1`) {
2229	iconv_close(vcp->vc_fd);
2230	}
2231	# endif
2232	*vcp = (vimconv_T)MBYTE_NONE_CONV;
2233
2234	/ No conversion when one of the names is empty or they are equal. /
2235	if (from == NULL \|\| from == NUL \|\| to == NULL \|\| to == NUL
2236	\|\| STRCMP(from, to) == `0`)
2237	return OK;
2238
2239	from_prop = enc_canon_props(from);
2240	to_prop = enc_canon_props(to);
2241	if (from_unicode_is_utf8)
2242	from_is_utf8 = from_prop & ENC_UNICODE;
2243	else
2244	from_is_utf8 = from_prop == ENC_UNICODE;
2245	if (to_unicode_is_utf8)
2246	to_is_utf8 = to_prop & ENC_UNICODE;
2247	else
2248	to_is_utf8 = to_prop == ENC_UNICODE;
2249
2250	if ((from_prop & ENC_LATIN1) && to_is_utf8) {
2251	/ Internal latin1 -> utf-8 conversion. /
2252	vcp->vc_type = CONV_TO_UTF8;
2253	vcp->vc_factor = `2`; / up to twice as long /
2254	} else if ((from_prop & ENC_LATIN9) && to_is_utf8) {
2255	/ Internal latin9 -> utf-8 conversion. /
2256	vcp->vc_type = CONV_9_TO_UTF8;
2257	vcp->vc_factor = `3`; / up to three as long (euro sign) /
2258	} else if (from_is_utf8 && (to_prop & ENC_LATIN1)) {
2259	/ Internal utf-8 -> latin1 conversion. /
2260	vcp->vc_type = CONV_TO_LATIN1;
2261	} else if (from_is_utf8 && (to_prop & ENC_LATIN9)) {
2262	/ Internal utf-8 -> latin9 conversion. /
2263	vcp->vc_type = CONV_TO_LATIN9;
2264	}
2265	# ifdef HAVE_ICONV
2266	else { // NOLINT(readability/braces)
2267	// Use iconv() for conversion.
2268	vcp->vc_fd = (iconv_t)my_iconv_open(
2269	to_is_utf8 ? (char_u *)"utf-8" : to,
2270	from_is_utf8 ? (char_u *)"utf-8" : from);
2271	if (vcp->vc_fd != (iconv_t)-`1`) {
2272	vcp->vc_type = CONV_ICONV;
2273	vcp->vc_factor = `4`; / could be longer too... /
2274	}
2275	}
2276	# endif
2277	if (vcp->vc_type == CONV_NONE)
2278	return FAIL;
2279
2280	return OK;
2281	}
2282
2283	/*
2284	* Convert text "ptr[*lenp]" according to "vcp".
2285	* Returns the result in allocated memory and sets "*lenp".
2286	* When "lenp" is NULL, use NUL terminated strings.
2287	* Illegal chars are often changed to "?", unless vcp->vc_fail is set.
2288	* When something goes wrong, NULL is returned and "*lenp" is unchanged.
2289	*/
2290	char_u string_convert(const* vimconv_T *const vcp, char_u ptr, size_t lenp)
2291	{
2292	return string_convert_ext(vcp, ptr, lenp, NULL);
2293	}
2294
2295	/*
2296	* Like string_convert(), but when "unconvlenp" is not NULL and there are is
2297	* an incomplete sequence at the end it is not converted and "*unconvlenp" is
2298	* set to the number of remaining bytes.
2299	*/
2300	char_u * string_convert_ext(const vimconv_T *const vcp, char_u *ptr,
2301	size_t lenp, size_t unconvlenp)
2302	{
2303	char_u *retval = NULL;
2304	char_u *d;
2305	int l;
2306	int c;
2307
2308	size_t len;
2309	if (lenp == NULL)
2310	len = STRLEN(ptr);
2311	else
2312	len = *lenp;
2313	if (len == `0`)
2314	return vim_strsave((char_u *)"");
2315
2316	switch (vcp->vc_type) {
2317	case CONV_TO_UTF8: / latin1 to utf-8 conversion /
2318	retval = xmalloc(len * `2` + `1`);
2319	d = retval;
2320	for (size_t i = `0`; i < len; ++i) {
2321	c = ptr[i];
2322	if (c < `0x80`)
2323	*d++ = c;
2324	else {
2325	d++ = `0xc0` + ((unsigned*)c >> `6`);
2326	*d++ = `0x80` + (c & `0x3f`);
2327	}
2328	}
2329	*d = NUL;
2330	if (lenp != NULL)
2331	*lenp = (size_t)(d - retval);
2332	break;
2333
2334	case CONV_9_TO_UTF8: / latin9 to utf-8 conversion /
2335	retval = xmalloc(len * `3` + `1`);
2336	d = retval;
2337	for (size_t i = `0`; i < len; ++i) {
2338	c = ptr[i];
2339	switch (c) {
2340	case `0xa4`: c = `0x20ac`; break; / euro /
2341	case `0xa6`: c = `0x0160`; break; / S hat /
2342	case `0xa8`: c = `0x0161`; break; / S -hat /
2343	case `0xb4`: c = `0x017d`; break; / Z hat /
2344	case `0xb8`: c = `0x017e`; break; / Z -hat /
2345	case `0xbc`: c = `0x0152`; break; / OE /
2346	case `0xbd`: c = `0x0153`; break; / oe /
2347	case `0xbe`: c = `0x0178`; break; / Y /
2348	}
2349	d += utf_char2bytes(c, d);
2350	}
2351	*d = NUL;
2352	if (lenp != NULL)
2353	*lenp = (size_t)(d - retval);
2354	break;
2355
2356	case CONV_TO_LATIN1: / utf-8 to latin1 conversion /
2357	case CONV_TO_LATIN9: / utf-8 to latin9 conversion /
2358	retval = xmalloc(len + `1`);
2359	d = retval;
2360	for (size_t i = `0`; i < len; ++i) {
2361	l = utf_ptr2len_len(ptr + i, len - i);
2362	if (l == `0`)
2363	*d++ = NUL;
2364	else if (l == `1`) {
2365	uint8_t l_w = utf8len_tab_zero[ptr[i]];
2366
2367	if (l_w == `0`) {
2368	/ Illegal utf-8 byte cannot be converted /
2369	xfree(retval);
2370	return NULL;
2371	}
2372	if (unconvlenp != NULL && l_w > len - i) {
2373	/ Incomplete sequence at the end. /
2374	*unconvlenp = len - i;
2375	break;
2376	}
2377	*d++ = ptr[i];
2378	} else {
2379	c = utf_ptr2char(ptr + i);
2380	if (vcp->vc_type == CONV_TO_LATIN9)
2381	switch (c) {
2382	case `0x20ac`: c = `0xa4`; break; / euro /
2383	case `0x0160`: c = `0xa6`; break; / S hat /
2384	case `0x0161`: c = `0xa8`; break; / S -hat /
2385	case `0x017d`: c = `0xb4`; break; / Z hat /
2386	case `0x017e`: c = `0xb8`; break; / Z -hat /
2387	case `0x0152`: c = `0xbc`; break; / OE /
2388	case `0x0153`: c = `0xbd`; break; / oe /
2389	case `0x0178`: c = `0xbe`; break; / Y /
2390	case `0xa4`:
2391	case `0xa6`:
2392	case `0xa8`:
2393	case `0xb4`:
2394	case `0xb8`:
2395	case `0xbc`:
2396	case `0xbd`:
2397	case `0xbe`: c = `0x100`; break; / not in latin9 /
2398	}
2399	if (!utf_iscomposing(c)) { / skip composing chars /
2400	if (c < `0x100`)
2401	*d++ = c;
2402	else if (vcp->vc_fail) {
2403	xfree(retval);
2404	return NULL;
2405	} else {
2406	*d++ = `0xbf`;
2407	if (utf_char2cells(c) > `1`)
2408	*d++ = `'?'`;
2409	}
2410	}
2411	i += l - `1`;
2412	}
2413	}
2414	*d = NUL;
2415	if (lenp != NULL)
2416	*lenp = (size_t)(d - retval);
2417	break;
2418
2419	# ifdef HAVE_ICONV
2420	case CONV_ICONV: // conversion with vcp->vc_fd
2421	retval = iconv_string(vcp, ptr, len, unconvlenp, lenp);
2422	break;
2423	# endif
2424	}
2425
2426	return retval;
2427	}
2428

Browse the source code of neovim/src/nvim/mbyte.c