1 | // This is an open source non-commercial project. Dear PVS-Studio, please check |
2 | // it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com |
3 | |
4 | /// mbyte.c: Code specifically for handling multi-byte characters. |
5 | /// Multibyte extensions partly by Sung-Hoon Baek |
6 | /// |
7 | /// Strings internal to Nvim are always encoded as UTF-8 (thus the legacy |
8 | /// 'encoding' option is always "utf-8"). |
9 | /// |
10 | /// The cell width on the display needs to be determined from the character |
11 | /// value. Recognizing UTF-8 bytes is easy: 0xxx.xxxx is a single-byte char, |
12 | /// 10xx.xxxx is a trailing byte, 11xx.xxxx is a leading byte of a multi-byte |
13 | /// character. To make things complicated, up to six composing characters |
14 | /// are allowed. These are drawn on top of the first char. For most editing |
15 | /// the sequence of bytes with composing characters included is considered to |
16 | /// be one character. |
17 | /// |
18 | /// UTF-8 is used everywhere in the core. This is in registers, text |
19 | /// manipulation, buffers, etc. Nvim core communicates with external plugins |
20 | /// and GUIs in this encoding. |
21 | /// |
22 | /// The encoding of a file is specified with 'fileencoding'. Conversion |
23 | /// is to be done when it's different from "utf-8". |
24 | /// |
25 | /// Vim scripts may contain an ":scriptencoding" command. This has an effect |
26 | /// for some commands, like ":menutrans". |
27 | |
28 | #include <inttypes.h> |
29 | #include <stdbool.h> |
30 | #include <string.h> |
31 | #include <wchar.h> |
32 | #include <wctype.h> |
33 | |
34 | #include "nvim/vim.h" |
35 | #include "nvim/ascii.h" |
36 | #ifdef HAVE_LOCALE_H |
37 | # include <locale.h> |
38 | #endif |
39 | #include "nvim/eval.h" |
40 | #include "nvim/path.h" |
41 | #include "nvim/iconv.h" |
42 | #include "nvim/mbyte.h" |
43 | #include "nvim/charset.h" |
44 | #include "nvim/cursor.h" |
45 | #include "nvim/fileio.h" |
46 | #include "nvim/func_attr.h" |
47 | #include "nvim/memline.h" |
48 | #include "nvim/message.h" |
49 | #include "nvim/misc1.h" |
50 | #include "nvim/memory.h" |
51 | #include "nvim/option.h" |
52 | #include "nvim/screen.h" |
53 | #include "nvim/spell.h" |
54 | #include "nvim/strings.h" |
55 | #include "nvim/os/os.h" |
56 | #include "nvim/arabic.h" |
57 | #include "nvim/mark.h" |
58 | |
// One entry of a range-based conversion table.
// NOTE(review): no user of this type is visible in this part of the file; the
// field descriptions below are inferred from the names only -- confirm
// against the code that consumes these tables.
typedef struct {
  int rangeStart;  // presumably the first character value of the range
  int rangeEnd;    // presumably the last character value of the range
  int step;        // presumably the distance between affected values
  int offset;      // presumably the amount added to convert a value
} convertStruct;
65 | |
// A range of character values.  intable() treats both ends as inclusive and
// binary-searches arrays of these, so such arrays have to be sorted in
// ascending order.
struct interval {
  long first;  // lowest value in the range
  long last;   // highest value in the range
};
70 | |
71 | #ifdef INCLUDE_GENERATED_DECLARATIONS |
72 | # include "mbyte.c.generated.h" |
73 | # include "unicode_tables.generated.h" |
74 | #endif |
75 | |
// Error message format strings; each takes a single "%s" argument.
char_u e_loadlib[] = "E370: Could not load library %s";
char_u e_loadfunc[] = "E448: Could not load library function %s";
78 | |
// To speed up BYTELEN(); keep a lookup table to quickly get the length in
// bytes of a UTF-8 character from the first byte of a UTF-8 string.  Bytes
// which are illegal when used as the first byte have a 1.  The NUL byte has
// length 1.
// The table has one entry per byte value: the comment at the end of each row
// gives the high nibble, the column header gives the low nibble.
const uint8_t utf8len_tab[] = {
  // ?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9 ?A ?B ?C ?D ?E ?F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 1?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 2?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 3?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 4?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 5?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 6?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 7?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 8?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 9?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // A?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // B?
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C?
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D?
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E?
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1,  // F?
};
102 | |
// Like utf8len_tab above, but using a zero for illegal lead bytes.
// Bytes 0x80..0xBF (trailing bytes) and 0xFE/0xFF therefore map to 0, so a
// caller can tell "illegal lead byte" apart from "single-byte character".
const uint8_t utf8len_tab_zero[] = {
  // ?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9 ?A ?B ?C ?D ?E ?F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 1?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 2?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 3?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 4?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 5?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 6?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 7?
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 8?
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 9?
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A?
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B?
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C?
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D?
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E?
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0,  // F?
};
123 | |
/*
 * Canonical encoding names and their properties.
 * "iso-8859-n" is handled by enc_canonize() directly.
 *
 * Each entry holds the canonical name, a sum of ENC_* property flags and a
 * codepage field (a codepage number, a DBCS_* value, or 0).
 *
 * Every IDX_* macro must equal the position of the entry that follows it and
 * IDX_COUNT must equal the number of entries: enc_canon_search() walks the
 * indexes 0 .. IDX_COUNT - 1 and enc_alias_table[] refers to entries by
 * their IDX_* value.
 */
static struct
{ const char *name; int prop; int codepage; }
enc_canon_table[] =
{
#define IDX_LATIN_1 0
  {"latin1", ENC_8BIT + ENC_LATIN1, 1252},
#define IDX_ISO_2 1
  {"iso-8859-2", ENC_8BIT, 0},
#define IDX_ISO_3 2
  {"iso-8859-3", ENC_8BIT, 0},
#define IDX_ISO_4 3
  {"iso-8859-4", ENC_8BIT, 0},
#define IDX_ISO_5 4
  {"iso-8859-5", ENC_8BIT, 0},
#define IDX_ISO_6 5
  {"iso-8859-6", ENC_8BIT, 0},
#define IDX_ISO_7 6
  {"iso-8859-7", ENC_8BIT, 0},
#define IDX_ISO_8 7
  {"iso-8859-8", ENC_8BIT, 0},
#define IDX_ISO_9 8
  {"iso-8859-9", ENC_8BIT, 0},
#define IDX_ISO_10 9
  {"iso-8859-10", ENC_8BIT, 0},
#define IDX_ISO_11 10
  {"iso-8859-11", ENC_8BIT, 0},
#define IDX_ISO_13 11
  {"iso-8859-13", ENC_8BIT, 0},
#define IDX_ISO_14 12
  {"iso-8859-14", ENC_8BIT, 0},
#define IDX_ISO_15 13
  {"iso-8859-15", ENC_8BIT + ENC_LATIN9, 0},
#define IDX_KOI8_R 14
  {"koi8-r", ENC_8BIT, 0},
#define IDX_KOI8_U 15
  {"koi8-u", ENC_8BIT, 0},
#define IDX_UTF8 16
  {"utf-8", ENC_UNICODE, 0},
#define IDX_UCS2 17
  {"ucs-2", ENC_UNICODE + ENC_ENDIAN_B + ENC_2BYTE, 0},
#define IDX_UCS2LE 18
  {"ucs-2le", ENC_UNICODE + ENC_ENDIAN_L + ENC_2BYTE, 0},
#define IDX_UTF16 19
  {"utf-16", ENC_UNICODE + ENC_ENDIAN_B + ENC_2WORD, 0},
#define IDX_UTF16LE 20
  {"utf-16le", ENC_UNICODE + ENC_ENDIAN_L + ENC_2WORD, 0},
#define IDX_UCS4 21
  {"ucs-4", ENC_UNICODE + ENC_ENDIAN_B + ENC_4BYTE, 0},
#define IDX_UCS4LE 22
  {"ucs-4le", ENC_UNICODE + ENC_ENDIAN_L + ENC_4BYTE, 0},

  /* For debugging DBCS encoding on Unix. */
#define IDX_DEBUG 23
  {"debug", ENC_DBCS, DBCS_DEBUG},
#define IDX_EUC_JP 24
  {"euc-jp", ENC_DBCS, DBCS_JPNU},
#define IDX_SJIS 25
  {"sjis", ENC_DBCS, DBCS_JPN},
#define IDX_EUC_KR 26
  {"euc-kr", ENC_DBCS, DBCS_KORU},
#define IDX_EUC_CN 27
  {"euc-cn", ENC_DBCS, DBCS_CHSU},
#define IDX_EUC_TW 28
  {"euc-tw", ENC_DBCS, DBCS_CHTU},
#define IDX_BIG5 29
  {"big5", ENC_DBCS, DBCS_CHT},

  /* MS-DOS and MS-Windows codepages are included here, so that they can be
   * used on Unix too.  Most of them are similar to ISO-8859 encodings, but
   * not exactly the same. */
#define IDX_CP437 30
  {"cp437", ENC_8BIT, 437},  /* like iso-8859-1 */
#define IDX_CP737 31
  {"cp737", ENC_8BIT, 737},  /* like iso-8859-7 */
#define IDX_CP775 32
  {"cp775", ENC_8BIT, 775},  /* Baltic */
#define IDX_CP850 33
  {"cp850", ENC_8BIT, 850},  /* like iso-8859-4 */
#define IDX_CP852 34
  {"cp852", ENC_8BIT, 852},  /* like iso-8859-1 */
#define IDX_CP855 35
  {"cp855", ENC_8BIT, 855},  /* like iso-8859-2 */
#define IDX_CP857 36
  {"cp857", ENC_8BIT, 857},  /* like iso-8859-5 */
#define IDX_CP860 37
  {"cp860", ENC_8BIT, 860},  /* like iso-8859-9 */
#define IDX_CP861 38
  {"cp861", ENC_8BIT, 861},  /* like iso-8859-1 */
#define IDX_CP862 39
  {"cp862", ENC_8BIT, 862},  /* like iso-8859-1 */
#define IDX_CP863 40
  {"cp863", ENC_8BIT, 863},  /* like iso-8859-8 */
#define IDX_CP865 41
  {"cp865", ENC_8BIT, 865},  /* like iso-8859-1 */
#define IDX_CP866 42
  {"cp866", ENC_8BIT, 866},  /* like iso-8859-5 */
#define IDX_CP869 43
  {"cp869", ENC_8BIT, 869},  /* like iso-8859-7 */
#define IDX_CP874 44
  {"cp874", ENC_8BIT, 874},  /* Thai */
#define IDX_CP932 45
  {"cp932", ENC_DBCS, DBCS_JPN},
#define IDX_CP936 46
  {"cp936", ENC_DBCS, DBCS_CHS},
#define IDX_CP949 47
  {"cp949", ENC_DBCS, DBCS_KOR},
#define IDX_CP950 48
  {"cp950", ENC_DBCS, DBCS_CHT},
#define IDX_CP1250 49
  {"cp1250", ENC_8BIT, 1250},  /* Czech, Polish, etc. */
#define IDX_CP1251 50
  {"cp1251", ENC_8BIT, 1251},  /* Cyrillic */
  /* cp1252 is considered to be equal to latin1 */
#define IDX_CP1253 51
  {"cp1253", ENC_8BIT, 1253},  /* Greek */
#define IDX_CP1254 52
  {"cp1254", ENC_8BIT, 1254},  /* Turkish */
#define IDX_CP1255 53
  {"cp1255", ENC_8BIT, 1255},  /* Hebrew */
#define IDX_CP1256 54
  {"cp1256", ENC_8BIT, 1256},  /* Arabic */
#define IDX_CP1257 55
  {"cp1257", ENC_8BIT, 1257},  /* Baltic */
#define IDX_CP1258 56
  {"cp1258", ENC_8BIT, 1258},  /* Vietnamese */

#define IDX_MACROMAN 57
  {"macroman", ENC_8BIT + ENC_MACROMAN, 0},  /* Mac OS */
#define IDX_HPROMAN8 58
  {"hp-roman8", ENC_8BIT, 0},  /* HP Roman8 */
#define IDX_COUNT 59
};
260 | |
/*
 * Aliases for encoding names.
 *
 * Each entry maps an alternative name to the IDX_* index of the canonical
 * entry in enc_canon_table[].  The list is terminated by an entry whose name
 * is NULL.
 */
static struct
{ const char *name; int canon; }
enc_alias_table[] =
{
  {"ansi", IDX_LATIN_1},
  {"iso-8859-1", IDX_LATIN_1},
  {"latin2", IDX_ISO_2},
  {"latin3", IDX_ISO_3},
  {"latin4", IDX_ISO_4},
  {"cyrillic", IDX_ISO_5},
  {"arabic", IDX_ISO_6},
  {"greek", IDX_ISO_7},
  {"hebrew", IDX_ISO_8},
  {"latin5", IDX_ISO_9},
  {"turkish", IDX_ISO_9},  /* ? */
  {"latin6", IDX_ISO_10},
  {"nordic", IDX_ISO_10},  /* ? */
  {"thai", IDX_ISO_11},  /* ? */
  {"latin7", IDX_ISO_13},
  {"latin8", IDX_ISO_14},
  {"latin9", IDX_ISO_15},
  {"utf8", IDX_UTF8},
  {"unicode", IDX_UCS2},
  {"ucs2", IDX_UCS2},
  {"ucs2be", IDX_UCS2},
  {"ucs-2be", IDX_UCS2},
  {"ucs2le", IDX_UCS2LE},
  {"utf16", IDX_UTF16},
  {"utf16be", IDX_UTF16},
  {"utf-16be", IDX_UTF16},
  {"utf16le", IDX_UTF16LE},
  {"ucs4", IDX_UCS4},
  {"ucs4be", IDX_UCS4},
  {"ucs-4be", IDX_UCS4},
  {"ucs4le", IDX_UCS4LE},
  {"utf32", IDX_UCS4},
  {"utf-32", IDX_UCS4},
  {"utf32be", IDX_UCS4},
  {"utf-32be", IDX_UCS4},
  {"utf32le", IDX_UCS4LE},
  {"utf-32le", IDX_UCS4LE},
  {"932", IDX_CP932},
  {"949", IDX_CP949},
  {"936", IDX_CP936},
  {"gbk", IDX_CP936},
  {"950", IDX_CP950},
  {"eucjp", IDX_EUC_JP},
  {"unix-jis", IDX_EUC_JP},
  {"ujis", IDX_EUC_JP},
  {"shift-jis", IDX_SJIS},
  {"pck", IDX_SJIS},  /* Sun: PCK */
  {"euckr", IDX_EUC_KR},
  {"5601", IDX_EUC_KR},  /* Sun: KS C 5601 */
  {"euccn", IDX_EUC_CN},
  {"gb2312", IDX_EUC_CN},
  {"euctw", IDX_EUC_TW},
  {"japan", IDX_EUC_JP},
  {"korea", IDX_EUC_KR},
  {"prc", IDX_EUC_CN},
  {"chinese", IDX_EUC_CN},
  {"taiwan", IDX_EUC_TW},
  /* NOTE(review): "cp950" is also a canonical name in enc_canon_table[]
   * (IDX_CP950) and "950" is already listed above as an alias for
   * IDX_CP950.  Which mapping wins depends on the lookup order of the code
   * that searches these tables, which is not visible here -- confirm that
   * the two entries below are still intended. */
  {"cp950", IDX_BIG5},
  {"950", IDX_BIG5},
  {"mac", IDX_MACROMAN},
  {"mac-roman", IDX_MACROMAN},
  {NULL, 0}
};
331 | |
332 | /* |
333 | * Find encoding "name" in the list of canonical encoding names. |
334 | * Returns -1 if not found. |
335 | */ |
336 | static int enc_canon_search(const char_u *name) |
337 | { |
338 | int i; |
339 | |
340 | for (i = 0; i < IDX_COUNT; ++i) |
341 | if (STRCMP(name, enc_canon_table[i].name) == 0) |
342 | return i; |
343 | return -1; |
344 | } |
345 | |
346 | |
347 | |
348 | /* |
349 | * Find canonical encoding "name" in the list and return its properties. |
350 | * Returns 0 if not found. |
351 | */ |
352 | int enc_canon_props(const char_u *name) |
353 | { |
354 | int i; |
355 | |
356 | i = enc_canon_search(name); |
357 | if (i >= 0) |
358 | return enc_canon_table[i].prop; |
359 | if (STRNCMP(name, "2byte-" , 6) == 0) |
360 | return ENC_DBCS; |
361 | if (STRNCMP(name, "8bit-" , 5) == 0 || STRNCMP(name, "iso-8859-" , 9) == 0) |
362 | return ENC_8BIT; |
363 | return 0; |
364 | } |
365 | |
366 | /* |
367 | * Return the size of the BOM for the current buffer: |
368 | * 0 - no BOM |
369 | * 2 - UCS-2 or UTF-16 BOM |
370 | * 4 - UCS-4 BOM |
371 | * 3 - UTF-8 BOM |
372 | */ |
373 | int bomb_size(void) |
374 | { |
375 | int n = 0; |
376 | |
377 | if (curbuf->b_p_bomb && !curbuf->b_p_bin) { |
378 | if (*curbuf->b_p_fenc == NUL |
379 | || STRCMP(curbuf->b_p_fenc, "utf-8" ) == 0) { |
380 | n = 3; |
381 | } else if (STRNCMP(curbuf->b_p_fenc, "ucs-2" , 5) == 0 |
382 | || STRNCMP(curbuf->b_p_fenc, "utf-16" , 6) == 0) { |
383 | n = 2; |
384 | } else if (STRNCMP(curbuf->b_p_fenc, "ucs-4" , 5) == 0) { |
385 | n = 4; |
386 | } |
387 | } |
388 | return n; |
389 | } |
390 | |
391 | /* |
392 | * Remove all BOM from "s" by moving remaining text. |
393 | */ |
394 | void remove_bom(char_u *s) |
395 | { |
396 | char *p = (char *)s; |
397 | |
398 | while ((p = strchr(p, 0xef)) != NULL) { |
399 | if ((uint8_t)p[1] == 0xbb && (uint8_t)p[2] == 0xbf) { |
400 | STRMOVE(p, p + 3); |
401 | } else { |
402 | p++; |
403 | } |
404 | } |
405 | } |
406 | |
407 | /* |
408 | * Get class of pointer: |
409 | * 0 for blank or NUL |
410 | * 1 for punctuation |
411 | * 2 for an (ASCII) word character |
412 | * >2 for other word characters |
413 | */ |
414 | int mb_get_class(const char_u *p) |
415 | { |
416 | return mb_get_class_tab(p, curbuf->b_chartab); |
417 | } |
418 | |
419 | int mb_get_class_tab(const char_u *p, const uint64_t *const chartab) |
420 | { |
421 | if (MB_BYTE2LEN(p[0]) == 1) { |
422 | if (p[0] == NUL || ascii_iswhite(p[0])) { |
423 | return 0; |
424 | } |
425 | if (vim_iswordc_tab(p[0], chartab)) { |
426 | return 2; |
427 | } |
428 | return 1; |
429 | } |
430 | return utf_class_tab(utf_ptr2char(p), chartab); |
431 | } |
432 | |
433 | /* |
434 | * Return true if "c" is in "table". |
435 | */ |
436 | static bool intable(const struct interval *table, size_t n_items, int c) |
437 | { |
438 | int mid, bot, top; |
439 | |
440 | /* first quick check for Latin1 etc. characters */ |
441 | if (c < table[0].first) |
442 | return false; |
443 | |
444 | /* binary search in table */ |
445 | bot = 0; |
446 | top = (int)(n_items - 1); |
447 | while (top >= bot) { |
448 | mid = (bot + top) / 2; |
449 | if (table[mid].last < c) |
450 | bot = mid + 1; |
451 | else if (table[mid].first > c) |
452 | top = mid - 1; |
453 | else |
454 | return true; |
455 | } |
456 | return false; |
457 | } |
458 | |
459 | /// For UTF-8 character "c" return 2 for a double-width character, 1 for others. |
460 | /// Returns 4 or 6 for an unprintable character. |
461 | /// Is only correct for characters >= 0x80. |
462 | /// When p_ambw is "double", return 2 for a character with East Asian Width |
463 | /// class 'A'(mbiguous). |
464 | /// |
465 | /// @note Tables `doublewidth` and `ambiguous` are generated by |
466 | /// gen_unicode_tables.lua, which must be manually invoked as needed. |
467 | int utf_char2cells(int c) |
468 | { |
469 | if (c >= 0x100) { |
470 | #ifdef USE_WCHAR_FUNCTIONS |
471 | // |
472 | // Assume the library function wcwidth() works better than our own |
473 | // stuff. It should return 1 for ambiguous width chars! |
474 | // |
475 | int n = wcwidth(c); |
476 | |
477 | if (n < 0) { |
478 | return 6; // unprintable, displays <xxxx> |
479 | } |
480 | if (n > 1) { |
481 | return n; |
482 | } |
483 | #else |
484 | if (!utf_printable(c)) { |
485 | return 6; // unprintable, displays <xxxx> |
486 | } |
487 | if (intable(doublewidth, ARRAY_SIZE(doublewidth), c)) { |
488 | return 2; |
489 | } |
490 | #endif |
491 | if (p_emoji && intable(emoji_width, ARRAY_SIZE(emoji_width), c)) { |
492 | return 2; |
493 | } |
494 | } else if (c >= 0x80 && !vim_isprintc(c)) { |
495 | // Characters below 0x100 are influenced by 'isprint' option. |
496 | return 4; // unprintable, displays <xx> |
497 | } |
498 | |
499 | if (c >= 0x80 && *p_ambw == 'd' |
500 | && intable(ambiguous, ARRAY_SIZE(ambiguous), c)) { |
501 | return 2; |
502 | } |
503 | |
504 | return 1; |
505 | } |
506 | |
507 | /// Return the number of display cells character at "*p" occupies. |
508 | /// This doesn't take care of unprintable characters, use ptr2cells() for that. |
509 | int utf_ptr2cells(const char_u *p) |
510 | { |
511 | int c; |
512 | |
513 | /* Need to convert to a wide character. */ |
514 | if (*p >= 0x80) { |
515 | c = utf_ptr2char(p); |
516 | /* An illegal byte is displayed as <xx>. */ |
517 | if (utf_ptr2len(p) == 1 || c == NUL) |
518 | return 4; |
519 | /* If the char is ASCII it must be an overlong sequence. */ |
520 | if (c < 0x80) |
521 | return char2cells(c); |
522 | return utf_char2cells(c); |
523 | } |
524 | return 1; |
525 | } |
526 | |
527 | /// Like utf_ptr2cells(), but limit string length to "size". |
528 | /// For an empty string or truncated character returns 1. |
529 | int utf_ptr2cells_len(const char_u *p, int size) |
530 | { |
531 | int c; |
532 | |
533 | /* Need to convert to a wide character. */ |
534 | if (size > 0 && *p >= 0x80) { |
535 | if (utf_ptr2len_len(p, size) < utf8len_tab[*p]) |
536 | return 1; /* truncated */ |
537 | c = utf_ptr2char(p); |
538 | /* An illegal byte is displayed as <xx>. */ |
539 | if (utf_ptr2len(p) == 1 || c == NUL) |
540 | return 4; |
541 | /* If the char is ASCII it must be an overlong sequence. */ |
542 | if (c < 0x80) |
543 | return char2cells(c); |
544 | return utf_char2cells(c); |
545 | } |
546 | return 1; |
547 | } |
548 | |
549 | /// Calculate the number of cells occupied by string `str`. |
550 | /// |
551 | /// @param str The source string, may not be NULL, must be a NUL-terminated |
552 | /// string. |
553 | /// @return The number of cells occupied by string `str` |
554 | size_t mb_string2cells(const char_u *str) |
555 | { |
556 | size_t clen = 0; |
557 | |
558 | for (const char_u *p = str; *p != NUL; p += (*mb_ptr2len)(p)) { |
559 | clen += utf_ptr2cells(p); |
560 | } |
561 | |
562 | return clen; |
563 | } |
564 | |
565 | /// Get the number of cells occupied by string `str` with maximum length `size` |
566 | /// |
567 | /// @param str The source string, may not be NULL, must be a NUL-terminated |
568 | /// string. |
569 | /// @param size maximum length of string. It will terminate on earlier NUL. |
570 | /// @return The number of cells occupied by string `str` |
571 | size_t mb_string2cells_len(const char_u *str, size_t size) |
572 | { |
573 | size_t clen = 0; |
574 | |
575 | for (const char_u *p = str; *p != NUL && p < str+size; |
576 | p += utf_ptr2len_len(p, size+(p-str))) { |
577 | clen += utf_ptr2cells(p); |
578 | } |
579 | |
580 | return clen; |
581 | } |
582 | |
583 | /// Convert a UTF-8 byte sequence to a wide character |
584 | /// |
585 | /// If the sequence is illegal or truncated by a NUL then the first byte is |
586 | /// returned. |
587 | /// For an overlong sequence this may return zero. |
588 | /// Does not include composing characters for obvious reasons. |
589 | /// |
590 | /// @param[in] p String to convert. |
591 | /// |
592 | /// @return Unicode codepoint or byte value. |
593 | int utf_ptr2char(const char_u *const p) |
594 | FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT |
595 | { |
596 | if (p[0] < 0x80) { // Be quick for ASCII. |
597 | return p[0]; |
598 | } |
599 | |
600 | const uint8_t len = utf8len_tab_zero[p[0]]; |
601 | if (len > 1 && (p[1] & 0xc0) == 0x80) { |
602 | if (len == 2) { |
603 | return ((p[0] & 0x1f) << 6) + (p[1] & 0x3f); |
604 | } |
605 | if ((p[2] & 0xc0) == 0x80) { |
606 | if (len == 3) { |
607 | return (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) |
608 | + (p[2] & 0x3f)); |
609 | } |
610 | if ((p[3] & 0xc0) == 0x80) { |
611 | if (len == 4) { |
612 | return (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12) |
613 | + ((p[2] & 0x3f) << 6) + (p[3] & 0x3f)); |
614 | } |
615 | if ((p[4] & 0xc0) == 0x80) { |
616 | if (len == 5) { |
617 | return (((p[0] & 0x03) << 24) + ((p[1] & 0x3f) << 18) |
618 | + ((p[2] & 0x3f) << 12) + ((p[3] & 0x3f) << 6) |
619 | + (p[4] & 0x3f)); |
620 | } |
621 | if ((p[5] & 0xc0) == 0x80 && len == 6) { |
622 | return (((p[0] & 0x01) << 30) + ((p[1] & 0x3f) << 24) |
623 | + ((p[2] & 0x3f) << 18) + ((p[3] & 0x3f) << 12) |
624 | + ((p[4] & 0x3f) << 6) + (p[5] & 0x3f)); |
625 | } |
626 | } |
627 | } |
628 | } |
629 | } |
630 | // Illegal value: just return the first byte. |
631 | return p[0]; |
632 | } |
633 | |
634 | /* |
635 | * Convert a UTF-8 byte sequence to a wide character. |
636 | * String is assumed to be terminated by NUL or after "n" bytes, whichever |
637 | * comes first. |
638 | * The function is safe in the sense that it never accesses memory beyond the |
639 | * first "n" bytes of "s". |
640 | * |
641 | * On success, returns decoded codepoint, advances "s" to the beginning of |
642 | * next character and decreases "n" accordingly. |
643 | * |
644 | * If end of string was reached, returns 0 and, if "n" > 0, advances "s" past |
645 | * NUL byte. |
646 | * |
647 | * If byte sequence is illegal or incomplete, returns -1 and does not advance |
648 | * "s". |
649 | */ |
650 | static int utf_safe_read_char_adv(const char_u **s, size_t *n) |
651 | { |
652 | int c; |
653 | |
654 | if (*n == 0) /* end of buffer */ |
655 | return 0; |
656 | |
657 | uint8_t k = utf8len_tab_zero[**s]; |
658 | |
659 | if (k == 1) { |
660 | /* ASCII character or NUL */ |
661 | (*n)--; |
662 | return *(*s)++; |
663 | } |
664 | |
665 | if (k <= *n) { |
666 | /* We have a multibyte sequence and it isn't truncated by buffer |
667 | * limits so utf_ptr2char() is safe to use. Or the first byte is |
668 | * illegal (k=0), and it's also safe to use utf_ptr2char(). */ |
669 | c = utf_ptr2char(*s); |
670 | |
671 | /* On failure, utf_ptr2char() returns the first byte, so here we |
672 | * check equality with the first byte. The only non-ASCII character |
673 | * which equals the first byte of its own UTF-8 representation is |
674 | * U+00C3 (UTF-8: 0xC3 0x83), so need to check that special case too. |
675 | * It's safe even if n=1, else we would have k=2 > n. */ |
676 | if (c != (int)(**s) || (c == 0xC3 && (*s)[1] == 0x83)) { |
677 | /* byte sequence was successfully decoded */ |
678 | *s += k; |
679 | *n -= k; |
680 | return c; |
681 | } |
682 | } |
683 | |
684 | /* byte sequence is incomplete or illegal */ |
685 | return -1; |
686 | } |
687 | |
688 | /* |
689 | * Get character at **pp and advance *pp to the next character. |
690 | * Note: composing characters are skipped! |
691 | */ |
692 | int mb_ptr2char_adv(const char_u **const pp) |
693 | { |
694 | int c; |
695 | |
696 | c = utf_ptr2char(*pp); |
697 | *pp += (*mb_ptr2len)(*pp); |
698 | return c; |
699 | } |
700 | |
701 | /* |
702 | * Get character at **pp and advance *pp to the next character. |
703 | * Note: composing characters are returned as separate characters. |
704 | */ |
705 | int mb_cptr2char_adv(const char_u **pp) |
706 | { |
707 | int c; |
708 | |
709 | c = utf_ptr2char(*pp); |
710 | *pp += utf_ptr2len(*pp); |
711 | return c; |
712 | } |
713 | |
714 | /* |
715 | * Check if the character pointed to by "p2" is a composing character when it |
716 | * comes after "p1". For Arabic sometimes "ab" is replaced with "c", which |
717 | * behaves like a composing character. |
718 | */ |
719 | bool utf_composinglike(const char_u *p1, const char_u *p2) |
720 | { |
721 | int c2; |
722 | |
723 | c2 = utf_ptr2char(p2); |
724 | if (utf_iscomposing(c2)) |
725 | return true; |
726 | if (!arabic_maycombine(c2)) |
727 | return false; |
728 | return arabic_combine(utf_ptr2char(p1), c2); |
729 | } |
730 | |
731 | /// Convert a UTF-8 string to a wide character |
732 | /// |
733 | /// Also gets up to #MAX_MCO composing characters. |
734 | /// |
735 | /// @param[out] pcc Location where to store composing characters. Must have |
736 | /// space at least for #MAX_MCO + 1 elements. |
737 | /// |
738 | /// @return leading character. |
739 | int utfc_ptr2char(const char_u *p, int *pcc) |
740 | { |
741 | int len; |
742 | int c; |
743 | int cc; |
744 | int i = 0; |
745 | |
746 | c = utf_ptr2char(p); |
747 | len = utf_ptr2len(p); |
748 | |
749 | /* Only accept a composing char when the first char isn't illegal. */ |
750 | if ((len > 1 || *p < 0x80) |
751 | && p[len] >= 0x80 |
752 | && UTF_COMPOSINGLIKE(p, p + len)) { |
753 | cc = utf_ptr2char(p + len); |
754 | for (;; ) { |
755 | pcc[i++] = cc; |
756 | if (i == MAX_MCO) |
757 | break; |
758 | len += utf_ptr2len(p + len); |
759 | if (p[len] < 0x80 || !utf_iscomposing(cc = utf_ptr2char(p + len))) |
760 | break; |
761 | } |
762 | } |
763 | |
764 | if (i < MAX_MCO) /* last composing char must be 0 */ |
765 | pcc[i] = 0; |
766 | |
767 | return c; |
768 | } |
769 | |
770 | /* |
771 | * Convert a UTF-8 byte string to a wide character. Also get up to MAX_MCO |
772 | * composing characters. Use no more than p[maxlen]. |
773 | * |
774 | * @param [out] pcc: composing chars, last one is 0 |
775 | */ |
776 | int utfc_ptr2char_len(const char_u *p, int *pcc, int maxlen) |
777 | { |
778 | #define IS_COMPOSING(s1, s2, s3) \ |
779 | (i == 0 ? UTF_COMPOSINGLIKE((s1), (s2)) : utf_iscomposing((s3))) |
780 | |
781 | assert(maxlen > 0); |
782 | |
783 | int i = 0; |
784 | |
785 | int len = utf_ptr2len_len(p, maxlen); |
786 | // Is it safe to use utf_ptr2char()? |
787 | bool safe = len > 1 && len <= maxlen; |
788 | int c = safe ? utf_ptr2char(p) : *p; |
789 | |
790 | // Only accept a composing char when the first char isn't illegal. |
791 | if ((safe || c < 0x80) && len < maxlen && p[len] >= 0x80) { |
792 | for (; i < MAX_MCO; i++) { |
793 | int len_cc = utf_ptr2len_len(p + len, maxlen - len); |
794 | safe = len_cc > 1 && len_cc <= maxlen - len; |
795 | if (!safe || (pcc[i] = utf_ptr2char(p + len)) < 0x80 |
796 | || !IS_COMPOSING(p, p + len, pcc[i])) { |
797 | break; |
798 | } |
799 | len += len_cc; |
800 | } |
801 | } |
802 | |
803 | if (i < MAX_MCO) { |
804 | // last composing char must be 0 |
805 | pcc[i] = 0; |
806 | } |
807 | |
808 | return c; |
809 | #undef ISCOMPOSING |
810 | } |
811 | |
812 | /// Get the length of a UTF-8 byte sequence representing a single codepoint |
813 | /// |
814 | /// @param[in] p UTF-8 string. |
815 | /// |
816 | /// @return Sequence length, 0 for empty string and 1 for non-UTF-8 byte |
817 | /// sequence. |
818 | int utf_ptr2len(const char_u *const p) |
819 | FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL |
820 | { |
821 | if (*p == NUL) { |
822 | return 0; |
823 | } |
824 | const int len = utf8len_tab[*p]; |
825 | for (int i = 1; i < len; i++) { |
826 | if ((p[i] & 0xc0) != 0x80) { |
827 | return 1; |
828 | } |
829 | } |
830 | return len; |
831 | } |
832 | |
833 | /* |
834 | * Return length of UTF-8 character, obtained from the first byte. |
835 | * "b" must be between 0 and 255! |
836 | * Returns 1 for an invalid first byte value. |
837 | */ |
838 | int utf_byte2len(int b) |
839 | { |
840 | return utf8len_tab[b]; |
841 | } |
842 | |
843 | /* |
844 | * Get the length of UTF-8 byte sequence "p[size]". Does not include any |
845 | * following composing characters. |
846 | * Returns 1 for "". |
847 | * Returns 1 for an illegal byte sequence (also in incomplete byte seq.). |
848 | * Returns number > "size" for an incomplete byte sequence. |
849 | * Never returns zero. |
850 | */ |
851 | int utf_ptr2len_len(const char_u *p, int size) |
852 | { |
853 | int len; |
854 | int i; |
855 | int m; |
856 | |
857 | len = utf8len_tab[*p]; |
858 | if (len == 1) |
859 | return 1; /* NUL, ascii or illegal lead byte */ |
860 | if (len > size) |
861 | m = size; /* incomplete byte sequence. */ |
862 | else |
863 | m = len; |
864 | for (i = 1; i < m; ++i) |
865 | if ((p[i] & 0xc0) != 0x80) |
866 | return 1; |
867 | return len; |
868 | } |
869 | |
870 | /// Return the number of bytes occupied by a UTF-8 character in a string |
871 | /// |
872 | /// This includes following composing characters. |
873 | int utfc_ptr2len(const char_u *const p) |
874 | FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL |
875 | { |
876 | uint8_t b0 = (uint8_t)(*p); |
877 | |
878 | if (b0 == NUL) { |
879 | return 0; |
880 | } |
881 | if (b0 < 0x80 && p[1] < 0x80) { // be quick for ASCII |
882 | return 1; |
883 | } |
884 | |
885 | // Skip over first UTF-8 char, stopping at a NUL byte. |
886 | int len = utf_ptr2len(p); |
887 | |
888 | // Check for illegal byte. |
889 | if (len == 1 && b0 >= 0x80) { |
890 | return 1; |
891 | } |
892 | |
893 | // Check for composing characters. We can handle only the first six, but |
894 | // skip all of them (otherwise the cursor would get stuck). |
895 | int prevlen = 0; |
896 | for (;;) { |
897 | if (p[len] < 0x80 || !UTF_COMPOSINGLIKE(p + prevlen, p + len)) { |
898 | return len; |
899 | } |
900 | |
901 | // Skip over composing char. |
902 | prevlen = len; |
903 | len += utf_ptr2len(p + len); |
904 | } |
905 | } |
906 | |
907 | /* |
908 | * Return the number of bytes the UTF-8 encoding of the character at "p[size]" |
909 | * takes. This includes following composing characters. |
910 | * Returns 0 for an empty string. |
911 | * Returns 1 for an illegal char or an incomplete byte sequence. |
912 | */ |
913 | int utfc_ptr2len_len(const char_u *p, int size) |
914 | { |
915 | int len; |
916 | int prevlen; |
917 | |
918 | if (size < 1 || *p == NUL) |
919 | return 0; |
920 | if (p[0] < 0x80 && (size == 1 || p[1] < 0x80)) /* be quick for ASCII */ |
921 | return 1; |
922 | |
923 | /* Skip over first UTF-8 char, stopping at a NUL byte. */ |
924 | len = utf_ptr2len_len(p, size); |
925 | |
926 | /* Check for illegal byte and incomplete byte sequence. */ |
927 | if ((len == 1 && p[0] >= 0x80) || len > size) |
928 | return 1; |
929 | |
930 | /* |
931 | * Check for composing characters. We can handle only the first six, but |
932 | * skip all of them (otherwise the cursor would get stuck). |
933 | */ |
934 | prevlen = 0; |
935 | while (len < size) { |
936 | int len_next_char; |
937 | |
938 | if (p[len] < 0x80) |
939 | break; |
940 | |
941 | /* |
942 | * Next character length should not go beyond size to ensure that |
943 | * UTF_COMPOSINGLIKE(...) does not read beyond size. |
944 | */ |
945 | len_next_char = utf_ptr2len_len(p + len, size - len); |
946 | if (len_next_char > size - len) |
947 | break; |
948 | |
949 | if (!UTF_COMPOSINGLIKE(p + prevlen, p + len)) |
950 | break; |
951 | |
952 | /* Skip over composing char */ |
953 | prevlen = len; |
954 | len += len_next_char; |
955 | } |
956 | return len; |
957 | } |
958 | |
/// Determine how many bytes the UTF-8 encoding of a code point occupies.
///
/// @param c  Code point to measure.
///
/// @return Number of bytes (1-6). Every value below 0x80, negative values
///         included, yields 1.
int utf_char2len(const int c)
{
  // upper_bound[k] is the first code point that no longer fits in k + 1 bytes.
  static const int upper_bound[] = {
    0x80, 0x800, 0x10000, 0x200000, 0x4000000
  };
  const int n_bounds = (int)(sizeof(upper_bound) / sizeof(upper_bound[0]));

  int nbytes = 1;
  while (nbytes <= n_bounds && c >= upper_bound[nbytes - 1]) {
    nbytes++;
  }
  return nbytes;
}
976 | |
977 | /// Convert Unicode character to UTF-8 string |
978 | /// |
979 | /// @param c character to convert to \p buf |
980 | /// @param[out] buf UTF-8 string generated from \p c, does not add \0 |
981 | /// @return Number of bytes (1-6). |
982 | int utf_char2bytes(const int c, char_u *const buf) |
983 | { |
984 | if (c < 0x80) { // 7 bits |
985 | buf[0] = c; |
986 | return 1; |
987 | } else if (c < 0x800) { // 11 bits |
988 | buf[0] = 0xc0 + ((unsigned)c >> 6); |
989 | buf[1] = 0x80 + (c & 0x3f); |
990 | return 2; |
991 | } else if (c < 0x10000) { // 16 bits |
992 | buf[0] = 0xe0 + ((unsigned)c >> 12); |
993 | buf[1] = 0x80 + (((unsigned)c >> 6) & 0x3f); |
994 | buf[2] = 0x80 + (c & 0x3f); |
995 | return 3; |
996 | } else if (c < 0x200000) { // 21 bits |
997 | buf[0] = 0xf0 + ((unsigned)c >> 18); |
998 | buf[1] = 0x80 + (((unsigned)c >> 12) & 0x3f); |
999 | buf[2] = 0x80 + (((unsigned)c >> 6) & 0x3f); |
1000 | buf[3] = 0x80 + (c & 0x3f); |
1001 | return 4; |
1002 | } else if (c < 0x4000000) { // 26 bits |
1003 | buf[0] = 0xf8 + ((unsigned)c >> 24); |
1004 | buf[1] = 0x80 + (((unsigned)c >> 18) & 0x3f); |
1005 | buf[2] = 0x80 + (((unsigned)c >> 12) & 0x3f); |
1006 | buf[3] = 0x80 + (((unsigned)c >> 6) & 0x3f); |
1007 | buf[4] = 0x80 + (c & 0x3f); |
1008 | return 5; |
1009 | } else { // 31 bits |
1010 | buf[0] = 0xfc + ((unsigned)c >> 30); |
1011 | buf[1] = 0x80 + (((unsigned)c >> 24) & 0x3f); |
1012 | buf[2] = 0x80 + (((unsigned)c >> 18) & 0x3f); |
1013 | buf[3] = 0x80 + (((unsigned)c >> 12) & 0x3f); |
1014 | buf[4] = 0x80 + (((unsigned)c >> 6) & 0x3f); |
1015 | buf[5] = 0x80 + (c & 0x3f); |
1016 | return 6; |
1017 | } |
1018 | } |
1019 | |
1020 | /* |
1021 | * Return true if "c" is a composing UTF-8 character. This means it will be |
1022 | * drawn on top of the preceding character. |
1023 | * Based on code from Markus Kuhn. |
1024 | */ |
1025 | bool utf_iscomposing(int c) |
1026 | { |
1027 | return intable(combining, ARRAY_SIZE(combining), c); |
1028 | } |
1029 | |
1030 | /* |
1031 | * Return true for characters that can be displayed in a normal way. |
1032 | * Only for characters of 0x100 and above! |
1033 | */ |
1034 | bool utf_printable(int c) |
1035 | { |
1036 | #ifdef USE_WCHAR_FUNCTIONS |
1037 | /* |
1038 | * Assume the iswprint() library function works better than our own stuff. |
1039 | */ |
1040 | return iswprint(c); |
1041 | #else |
1042 | /* Sorted list of non-overlapping intervals. |
1043 | * 0xd800-0xdfff is reserved for UTF-16, actually illegal. */ |
1044 | static struct interval nonprint[] = |
1045 | { |
1046 | {0x070f, 0x070f}, {0x180b, 0x180e}, {0x200b, 0x200f}, {0x202a, 0x202e}, |
1047 | {0x206a, 0x206f}, {0xd800, 0xdfff}, {0xfeff, 0xfeff}, {0xfff9, 0xfffb}, |
1048 | {0xfffe, 0xffff} |
1049 | }; |
1050 | |
1051 | return !intable(nonprint, ARRAY_SIZE(nonprint), c); |
1052 | #endif |
1053 | } |
1054 | |
1055 | /* |
1056 | * Get class of a Unicode character. |
1057 | * 0: white space |
1058 | * 1: punctuation |
1059 | * 2 or bigger: some class of word character. |
1060 | */ |
1061 | int utf_class(const int c) |
1062 | { |
1063 | return utf_class_tab(c, curbuf->b_chartab); |
1064 | } |
1065 | |
1066 | int utf_class_tab(const int c, const uint64_t *const chartab) |
1067 | { |
1068 | /* sorted list of non-overlapping intervals */ |
1069 | static struct clinterval { |
1070 | unsigned int first; |
1071 | unsigned int last; |
1072 | unsigned int class; |
1073 | } classes[] = { |
1074 | { 0x037e, 0x037e, 1 }, // Greek question mark |
1075 | { 0x0387, 0x0387, 1 }, // Greek ano teleia |
1076 | { 0x055a, 0x055f, 1 }, // Armenian punctuation |
1077 | { 0x0589, 0x0589, 1 }, // Armenian full stop |
1078 | { 0x05be, 0x05be, 1 }, |
1079 | { 0x05c0, 0x05c0, 1 }, |
1080 | { 0x05c3, 0x05c3, 1 }, |
1081 | { 0x05f3, 0x05f4, 1 }, |
1082 | { 0x060c, 0x060c, 1 }, |
1083 | { 0x061b, 0x061b, 1 }, |
1084 | { 0x061f, 0x061f, 1 }, |
1085 | { 0x066a, 0x066d, 1 }, |
1086 | { 0x06d4, 0x06d4, 1 }, |
1087 | { 0x0700, 0x070d, 1 }, // Syriac punctuation |
1088 | { 0x0964, 0x0965, 1 }, |
1089 | { 0x0970, 0x0970, 1 }, |
1090 | { 0x0df4, 0x0df4, 1 }, |
1091 | { 0x0e4f, 0x0e4f, 1 }, |
1092 | { 0x0e5a, 0x0e5b, 1 }, |
1093 | { 0x0f04, 0x0f12, 1 }, |
1094 | { 0x0f3a, 0x0f3d, 1 }, |
1095 | { 0x0f85, 0x0f85, 1 }, |
1096 | { 0x104a, 0x104f, 1 }, // Myanmar punctuation |
1097 | { 0x10fb, 0x10fb, 1 }, // Georgian punctuation |
1098 | { 0x1361, 0x1368, 1 }, // Ethiopic punctuation |
1099 | { 0x166d, 0x166e, 1 }, // Canadian Syl. punctuation |
1100 | { 0x1680, 0x1680, 0 }, |
1101 | { 0x169b, 0x169c, 1 }, |
1102 | { 0x16eb, 0x16ed, 1 }, |
1103 | { 0x1735, 0x1736, 1 }, |
1104 | { 0x17d4, 0x17dc, 1 }, // Khmer punctuation |
1105 | { 0x1800, 0x180a, 1 }, // Mongolian punctuation |
1106 | { 0x2000, 0x200b, 0 }, // spaces |
1107 | { 0x200c, 0x2027, 1 }, // punctuation and symbols |
1108 | { 0x2028, 0x2029, 0 }, |
1109 | { 0x202a, 0x202e, 1 }, // punctuation and symbols |
1110 | { 0x202f, 0x202f, 0 }, |
1111 | { 0x2030, 0x205e, 1 }, // punctuation and symbols |
1112 | { 0x205f, 0x205f, 0 }, |
1113 | { 0x2060, 0x27ff, 1 }, // punctuation and symbols |
1114 | { 0x2070, 0x207f, 0x2070 }, // superscript |
1115 | { 0x2080, 0x2094, 0x2080 }, // subscript |
1116 | { 0x20a0, 0x27ff, 1 }, // all kinds of symbols |
1117 | { 0x2800, 0x28ff, 0x2800 }, // braille |
1118 | { 0x2900, 0x2998, 1 }, // arrows, brackets, etc. |
1119 | { 0x29d8, 0x29db, 1 }, |
1120 | { 0x29fc, 0x29fd, 1 }, |
1121 | { 0x2e00, 0x2e7f, 1 }, // supplemental punctuation |
1122 | { 0x3000, 0x3000, 0 }, // ideographic space |
1123 | { 0x3001, 0x3020, 1 }, // ideographic punctuation |
1124 | { 0x3030, 0x3030, 1 }, |
1125 | { 0x303d, 0x303d, 1 }, |
1126 | { 0x3040, 0x309f, 0x3040 }, // Hiragana |
1127 | { 0x30a0, 0x30ff, 0x30a0 }, // Katakana |
1128 | { 0x3300, 0x9fff, 0x4e00 }, // CJK Ideographs |
1129 | { 0xac00, 0xd7a3, 0xac00 }, // Hangul Syllables |
1130 | { 0xf900, 0xfaff, 0x4e00 }, // CJK Ideographs |
1131 | { 0xfd3e, 0xfd3f, 1 }, |
1132 | { 0xfe30, 0xfe6b, 1 }, // punctuation forms |
1133 | { 0xff00, 0xff0f, 1 }, // half/fullwidth ASCII |
1134 | { 0xff1a, 0xff20, 1 }, // half/fullwidth ASCII |
1135 | { 0xff3b, 0xff40, 1 }, // half/fullwidth ASCII |
1136 | { 0xff5b, 0xff65, 1 }, // half/fullwidth ASCII |
1137 | { 0x1d000, 0x1d24f, 1 }, // Musical notation |
1138 | { 0x1d400, 0x1d7ff, 1 }, // Mathematical Alphanumeric Symbols |
1139 | { 0x1f000, 0x1f2ff, 1 }, // Game pieces; enclosed characters |
1140 | { 0x1f300, 0x1f9ff, 1 }, // Many symbol blocks |
1141 | { 0x20000, 0x2a6df, 0x4e00 }, // CJK Ideographs |
1142 | { 0x2a700, 0x2b73f, 0x4e00 }, // CJK Ideographs |
1143 | { 0x2b740, 0x2b81f, 0x4e00 }, // CJK Ideographs |
1144 | { 0x2f800, 0x2fa1f, 0x4e00 }, // CJK Ideographs |
1145 | }; |
1146 | int bot = 0; |
1147 | int top = ARRAY_SIZE(classes) - 1; |
1148 | int mid; |
1149 | |
1150 | /* First quick check for Latin1 characters, use 'iskeyword'. */ |
1151 | if (c < 0x100) { |
1152 | if (c == ' ' || c == '\t' || c == NUL || c == 0xa0) { |
1153 | return 0; // blank |
1154 | } |
1155 | if (vim_iswordc_tab(c, chartab)) { |
1156 | return 2; // word character |
1157 | } |
1158 | return 1; // punctuation |
1159 | } |
1160 | |
1161 | /* binary search in table */ |
1162 | while (top >= bot) { |
1163 | mid = (bot + top) / 2; |
1164 | if (classes[mid].last < (unsigned int)c) |
1165 | bot = mid + 1; |
1166 | else if (classes[mid].first > (unsigned int)c) |
1167 | top = mid - 1; |
1168 | else |
1169 | return (int)classes[mid].class; |
1170 | } |
1171 | |
1172 | // emoji |
1173 | if (intable(emoji_all, ARRAY_SIZE(emoji_all), c)) { |
1174 | return 3; |
1175 | } |
1176 | |
1177 | /* most other characters are "word" characters */ |
1178 | return 2; |
1179 | } |
1180 | |
1181 | bool utf_ambiguous_width(int c) |
1182 | { |
1183 | return c >= 0x80 && (intable(ambiguous, ARRAY_SIZE(ambiguous), c) |
1184 | || intable(emoji_all, ARRAY_SIZE(emoji_all), c)); |
1185 | } |
1186 | |
1187 | /* |
1188 | * Generic conversion function for case operations. |
1189 | * Return the converted equivalent of "a", which is a UCS-4 character. Use |
1190 | * the given conversion "table". Uses binary search on "table". |
1191 | */ |
1192 | static int utf_convert(int a, const convertStruct *const table, size_t n_items) |
1193 | { |
1194 | size_t start, mid, end; /* indices into table */ |
1195 | |
1196 | start = 0; |
1197 | end = n_items; |
1198 | while (start < end) { |
1199 | /* need to search further */ |
1200 | mid = (end + start) / 2; |
1201 | if (table[mid].rangeEnd < a) |
1202 | start = mid + 1; |
1203 | else |
1204 | end = mid; |
1205 | } |
1206 | if (start < n_items |
1207 | && table[start].rangeStart <= a |
1208 | && a <= table[start].rangeEnd |
1209 | && (a - table[start].rangeStart) % table[start].step == 0) |
1210 | return a + table[start].offset; |
1211 | else |
1212 | return a; |
1213 | } |
1214 | |
1215 | /* |
1216 | * Return the folded-case equivalent of "a", which is a UCS-4 character. Uses |
1217 | * simple case folding. |
1218 | */ |
1219 | int utf_fold(int a) |
1220 | { |
1221 | if (a < 0x80) { |
1222 | // be fast for ASCII |
1223 | return a >= 0x41 && a <= 0x5a ? a + 32 : a; |
1224 | } |
1225 | return utf_convert(a, foldCase, ARRAY_SIZE(foldCase)); |
1226 | } |
1227 | |
1228 | // Vim's own character class functions. These exist because many library |
1229 | // islower()/toupper() etc. do not work properly: they crash when used with |
1230 | // invalid values or can't handle latin1 when the locale is C. |
1231 | // Speed is most important here. |
1232 | |
1233 | /// Return the upper-case equivalent of "a", which is a UCS-4 character. Use |
1234 | /// simple case folding. |
1235 | int mb_toupper(int a) |
1236 | { |
1237 | /* If 'casemap' contains "keepascii" use ASCII style toupper(). */ |
1238 | if (a < 128 && (cmp_flags & CMP_KEEPASCII)) |
1239 | return TOUPPER_ASC(a); |
1240 | |
1241 | #if defined(__STDC_ISO_10646__) |
1242 | /* If towupper() is available and handles Unicode, use it. */ |
1243 | if (!(cmp_flags & CMP_INTERNAL)) |
1244 | return towupper(a); |
1245 | #endif |
1246 | |
1247 | /* For characters below 128 use locale sensitive toupper(). */ |
1248 | if (a < 128) |
1249 | return TOUPPER_LOC(a); |
1250 | |
1251 | /* For any other characters use the above mapping table. */ |
1252 | return utf_convert(a, toUpper, ARRAY_SIZE(toUpper)); |
1253 | } |
1254 | |
/// Check whether "a" is a lower-case character.
///
/// @param a  Character to check.
///
/// @return true when mb_toupper() changes the character, or when it is the
///         German sharp s (0xdf), which is lower case but has no upper case
///         equivalent.
bool mb_islower(int a)
{
  if (a == 0xdf) {
    return true;
  }
  return mb_toupper(a) != a;
}
1260 | |
1261 | /// Return the lower-case equivalent of "a", which is a UCS-4 character. Use |
1262 | /// simple case folding. |
1263 | int mb_tolower(int a) |
1264 | { |
1265 | /* If 'casemap' contains "keepascii" use ASCII style tolower(). */ |
1266 | if (a < 128 && (cmp_flags & CMP_KEEPASCII)) |
1267 | return TOLOWER_ASC(a); |
1268 | |
1269 | #if defined(__STDC_ISO_10646__) |
1270 | /* If towlower() is available and handles Unicode, use it. */ |
1271 | if (!(cmp_flags & CMP_INTERNAL)) |
1272 | return towlower(a); |
1273 | #endif |
1274 | |
1275 | /* For characters below 128 use locale sensitive tolower(). */ |
1276 | if (a < 128) |
1277 | return TOLOWER_LOC(a); |
1278 | |
1279 | /* For any other characters use the above mapping table. */ |
1280 | return utf_convert(a, toLower, ARRAY_SIZE(toLower)); |
1281 | } |
1282 | |
/// Check whether "a" is an upper-case character.
///
/// @param a  Character to check.
///
/// @return true when mb_tolower() maps the character to a different one.
bool mb_isupper(int a)
{
  const int lowered = mb_tolower(a);

  return lowered != a;
}
1287 | |
1288 | static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1, |
1289 | size_t n2) |
1290 | { |
1291 | int c1, c2, cdiff; |
1292 | char_u buffer[6]; |
1293 | |
1294 | for (;; ) { |
1295 | c1 = utf_safe_read_char_adv(&s1, &n1); |
1296 | c2 = utf_safe_read_char_adv(&s2, &n2); |
1297 | |
1298 | if (c1 <= 0 || c2 <= 0) |
1299 | break; |
1300 | |
1301 | if (c1 == c2) |
1302 | continue; |
1303 | |
1304 | cdiff = utf_fold(c1) - utf_fold(c2); |
1305 | if (cdiff != 0) |
1306 | return cdiff; |
1307 | } |
1308 | |
1309 | /* some string ended or has an incomplete/illegal character sequence */ |
1310 | |
1311 | if (c1 == 0 || c2 == 0) { |
1312 | /* some string ended. shorter string is smaller */ |
1313 | if (c1 == 0 && c2 == 0) |
1314 | return 0; |
1315 | return c1 == 0 ? -1 : 1; |
1316 | } |
1317 | |
1318 | /* Continue with bytewise comparison to produce some result that |
1319 | * would make comparison operations involving this function transitive. |
1320 | * |
1321 | * If only one string had an error, comparison should be made with |
1322 | * folded version of the other string. In this case it is enough |
1323 | * to fold just one character to determine the result of comparison. */ |
1324 | |
1325 | if (c1 != -1 && c2 == -1) { |
1326 | n1 = utf_char2bytes(utf_fold(c1), buffer); |
1327 | s1 = buffer; |
1328 | } else if (c2 != -1 && c1 == -1) { |
1329 | n2 = utf_char2bytes(utf_fold(c2), buffer); |
1330 | s2 = buffer; |
1331 | } |
1332 | |
1333 | while (n1 > 0 && n2 > 0 && *s1 != NUL && *s2 != NUL) { |
1334 | cdiff = (int)(*s1) - (int)(*s2); |
1335 | if (cdiff != 0) |
1336 | return cdiff; |
1337 | |
1338 | s1++; |
1339 | s2++; |
1340 | n1--; |
1341 | n2--; |
1342 | } |
1343 | |
1344 | if (n1 > 0 && *s1 == NUL) |
1345 | n1 = 0; |
1346 | if (n2 > 0 && *s2 == NUL) |
1347 | n2 = 0; |
1348 | |
1349 | if (n1 == 0 && n2 == 0) |
1350 | return 0; |
1351 | return n1 == 0 ? -1 : 1; |
1352 | } |
1353 | |
1354 | #ifdef WIN32 |
1355 | #ifndef CP_UTF8 |
1356 | # define CP_UTF8 65001 /* magic number from winnls.h */ |
1357 | #endif |
1358 | |
1359 | /// Converts string from UTF-8 to UTF-16. |
1360 | /// |
1361 | /// @param utf8 UTF-8 string. |
1362 | /// @param utf8len Length of `utf8`. May be -1 if `utf8` is NUL-terminated. |
1363 | /// @param utf16[out,allocated] NUL-terminated UTF-16 string, or NULL on error |
1364 | /// @return 0 on success, or libuv error code |
1365 | int utf8_to_utf16(const char *utf8, int utf8len, wchar_t **utf16) |
1366 | FUNC_ATTR_NONNULL_ALL |
1367 | { |
1368 | // Compute the length needed for the converted UTF-16 string. |
1369 | int bufsize = MultiByteToWideChar(CP_UTF8, |
1370 | 0, // dwFlags: must be 0 for UTF-8 |
1371 | utf8, // -1: process up to NUL |
1372 | utf8len, |
1373 | NULL, |
1374 | 0); // 0: get length, don't convert |
1375 | if (bufsize == 0) { |
1376 | *utf16 = NULL; |
1377 | return uv_translate_sys_error(GetLastError()); |
1378 | } |
1379 | |
1380 | // Allocate the destination buffer adding an extra byte for the terminating |
1381 | // NULL. If `utf8len` is not -1 MultiByteToWideChar will not add it, so |
1382 | // we do it ourselves always, just in case. |
1383 | *utf16 = xmalloc(sizeof(wchar_t) * (bufsize + 1)); |
1384 | |
1385 | // Convert to UTF-16. |
1386 | bufsize = MultiByteToWideChar(CP_UTF8, 0, utf8, utf8len, *utf16, bufsize); |
1387 | if (bufsize == 0) { |
1388 | XFREE_CLEAR(*utf16); |
1389 | return uv_translate_sys_error(GetLastError()); |
1390 | } |
1391 | |
1392 | (*utf16)[bufsize] = L'\0'; |
1393 | return 0; |
1394 | } |
1395 | |
1396 | /// Converts string from UTF-16 to UTF-8. |
1397 | /// |
1398 | /// @param utf16 UTF-16 string. |
1399 | /// @param utf16len Length of `utf16`. May be -1 if `utf16` is NUL-terminated. |
1400 | /// @param utf8[out,allocated] NUL-terminated UTF-8 string, or NULL on error |
1401 | /// @return 0 on success, or libuv error code |
1402 | int utf16_to_utf8(const wchar_t *utf16, int utf16len, char **utf8) |
1403 | FUNC_ATTR_NONNULL_ALL |
1404 | { |
1405 | // Compute the space needed for the converted UTF-8 string. |
1406 | DWORD bufsize = WideCharToMultiByte(CP_UTF8, |
1407 | 0, |
1408 | utf16, |
1409 | utf16len, |
1410 | NULL, |
1411 | 0, |
1412 | NULL, |
1413 | NULL); |
1414 | if (bufsize == 0) { |
1415 | *utf8 = NULL; |
1416 | return uv_translate_sys_error(GetLastError()); |
1417 | } |
1418 | |
1419 | // Allocate the destination buffer adding an extra byte for the terminating |
1420 | // NULL. If `utf16len` is not -1 WideCharToMultiByte will not add it, so |
1421 | // we do it ourselves always, just in case. |
1422 | *utf8 = xmalloc(bufsize + 1); |
1423 | |
1424 | // Convert to UTF-8. |
1425 | bufsize = WideCharToMultiByte(CP_UTF8, |
1426 | 0, |
1427 | utf16, |
1428 | utf16len, |
1429 | *utf8, |
1430 | bufsize, |
1431 | NULL, |
1432 | NULL); |
1433 | if (bufsize == 0) { |
1434 | XFREE_CLEAR(*utf8); |
1435 | return uv_translate_sys_error(GetLastError()); |
1436 | } |
1437 | |
1438 | (*utf8)[bufsize] = '\0'; |
1439 | return 0; |
1440 | } |
1441 | |
1442 | #endif |
1443 | |
1444 | /// Measure the length of a string in corresponding UTF-32 and UTF-16 units. |
1445 | /// |
1446 | /// Invalid UTF-8 bytes, or embedded surrogates, count as one code point/unit |
1447 | /// each. |
1448 | /// |
1449 | /// The out parameters are incremented. This is used to measure the size of |
1450 | /// a buffer region consisting of multiple line segments. |
1451 | /// |
1452 | /// @param s the string |
1453 | /// @param len maximum length (an earlier NUL terminates) |
1454 | /// @param[out] codepoints incremented with UTF-32 code point size |
1455 | /// @param[out] codeunits incremented with UTF-16 code unit size |
1456 | void mb_utflen(const char_u *s, size_t len, size_t *codepoints, |
1457 | size_t *codeunits) |
1458 | FUNC_ATTR_NONNULL_ALL |
1459 | { |
1460 | size_t count = 0, = 0; |
1461 | size_t clen; |
1462 | for (size_t i = 0; i < len && s[i] != NUL; i += clen) { |
1463 | clen = utf_ptr2len_len(s+i, len-i); |
1464 | // NB: gets the byte value of invalid sequence bytes. |
1465 | // we only care whether the char fits in the BMP or not |
1466 | int c = (clen > 1) ? utf_ptr2char(s+i) : s[i]; |
1467 | count++; |
1468 | if (c > 0xFFFF) { |
1469 | extra++; |
1470 | } |
1471 | } |
1472 | *codepoints += count; |
1473 | *codeunits += count + extra; |
1474 | } |
1475 | |
1476 | ssize_t mb_utf_index_to_bytes(const char_u *s, size_t len, |
1477 | size_t index, bool use_utf16_units) |
1478 | FUNC_ATTR_NONNULL_ALL |
1479 | { |
1480 | size_t count = 0; |
1481 | size_t clen, i; |
1482 | if (index == 0) { |
1483 | return 0; |
1484 | } |
1485 | for (i = 0; i < len && s[i] != NUL; i += clen) { |
1486 | clen = utf_ptr2len_len(s+i, len-i); |
1487 | // NB: gets the byte value of invalid sequence bytes. |
1488 | // we only care whether the char fits in the BMP or not |
1489 | int c = (clen > 1) ? utf_ptr2char(s+i) : s[i]; |
1490 | count++; |
1491 | if (use_utf16_units && c > 0xFFFF) { |
1492 | count++; |
1493 | } |
1494 | if (count >= index) { |
1495 | return i+clen; |
1496 | } |
1497 | } |
1498 | return -1; |
1499 | } |
1500 | |
1501 | |
1502 | /* |
1503 | * Version of strnicmp() that handles multi-byte characters. |
1504 | * Needed for Big5, Shift-JIS and UTF-8 encoding. Other DBCS encodings can |
1505 | * probably use strnicmp(), because there are no ASCII characters in the |
1506 | * second byte. |
1507 | * Returns zero if s1 and s2 are equal (ignoring case), the difference between |
1508 | * two characters otherwise. |
1509 | */ |
1510 | int mb_strnicmp(const char_u *s1, const char_u *s2, const size_t nn) |
1511 | { |
1512 | return utf_strnicmp(s1, s2, nn, nn); |
1513 | } |
1514 | |
1515 | /// Compare strings case-insensitively |
1516 | /// |
1517 | /// @note We need to call mb_stricmp() even when we aren't dealing with |
1518 | /// a multi-byte encoding because mb_stricmp() takes care of all ASCII and |
1519 | /// non-ascii encodings, including characters with umlauts in latin1, |
1520 | /// etc., while STRICMP() only handles the system locale version, which |
1521 | /// often does not handle non-ascii properly. |
1522 | /// |
1523 | /// @param[in] s1 First string to compare, not more then #MAXCOL characters. |
1524 | /// @param[in] s2 Second string to compare, not more then #MAXCOL characters. |
1525 | /// |
1526 | /// @return 0 if strings are equal, <0 if s1 < s2, >0 if s1 > s2. |
1527 | int mb_stricmp(const char *s1, const char *s2) |
1528 | { |
1529 | return mb_strnicmp((const char_u *)s1, (const char_u *)s2, MAXCOL); |
1530 | } |
1531 | |
1532 | /* |
1533 | * "g8": show bytes of the UTF-8 char under the cursor. Doesn't matter what |
1534 | * 'encoding' has been set to. |
1535 | */ |
1536 | void show_utf8(void) |
1537 | { |
1538 | int len; |
1539 | int rlen = 0; |
1540 | char_u *line; |
1541 | int clen; |
1542 | int i; |
1543 | |
1544 | /* Get the byte length of the char under the cursor, including composing |
1545 | * characters. */ |
1546 | line = get_cursor_pos_ptr(); |
1547 | len = utfc_ptr2len(line); |
1548 | if (len == 0) { |
1549 | MSG("NUL" ); |
1550 | return; |
1551 | } |
1552 | |
1553 | clen = 0; |
1554 | for (i = 0; i < len; ++i) { |
1555 | if (clen == 0) { |
1556 | /* start of (composing) character, get its length */ |
1557 | if (i > 0) { |
1558 | STRCPY(IObuff + rlen, "+ " ); |
1559 | rlen += 2; |
1560 | } |
1561 | clen = utf_ptr2len(line + i); |
1562 | } |
1563 | sprintf((char *)IObuff + rlen, "%02x " , |
1564 | (line[i] == NL) ? NUL : line[i]); /* NUL is stored as NL */ |
1565 | --clen; |
1566 | rlen += (int)STRLEN(IObuff + rlen); |
1567 | if (rlen > IOSIZE - 20) |
1568 | break; |
1569 | } |
1570 | |
1571 | msg(IObuff); |
1572 | } |
1573 | |
/// Return offset from "p" to the first byte of the character it points into.
/// If "p" points to the NUL at the end of the string return 0.
/// Returns 0 when already at the first byte of a character.
///
/// The scan also steps back over characters for which utf_iscomposing() or
/// arabic_combine() reports that they combine with what precedes them, so the
/// offset leads to the first byte of the character they are attached to.
///
/// @param base  Start of the string; the backwards scan stops there.
/// @param p     Position in the string to examine.
///
/// @return Number of bytes between the head byte that was found and "p";
///         0 when "p" is on an ASCII byte or when the bytes around "p" do not
///         match the sequence length given by utf8len_tab.
int utf_head_off(const char_u *base, const char_u *p)
{
  int c;
  int len;

  if (*p < 0x80)                // be quick for ASCII (this includes NUL)
    return 0;

  // Skip backwards over trailing bytes: 10xx.xxxx
  // Skip backwards again if on a composing char.
  const char_u *q;
  for (q = p;; --q) {
    // Move s to the last byte of this char.
    const char_u *s;
    for (s = q; (s[1] & 0xc0) == 0x80; ++s) {}

    // Move q to the first byte of this char.
    while (q > base && (*q & 0xc0) == 0x80)
      --q;
    // Check for illegal sequence. Do allow an illegal byte after where we
    // started: the length may match either up to "s" or only up to "p".
    len = utf8len_tab[*q];
    if (len != (int)(s - q + 1) && len != (int)(p - q + 1))
      return 0;

    // Nothing before the start of the string to combine with.
    if (q <= base)
      break;

    // A composing char belongs to the character before it; "continue" runs
    // the --q of the for loop and inspects that previous character.
    c = utf_ptr2char(q);
    if (utf_iscomposing(c))
      continue;

    if (arabic_maycombine(c)) {
      // Step back to take a peek at the preceding char.
      const char_u *j = q;
      --j;
      // Move j to the first byte of this char.
      while (j > base && (*j & 0xc0) == 0x80)
        --j;
      if (arabic_combine(utf_ptr2char(j), c))
        continue;
    }
    break;
  }

  return (int)(p - q);
}
1624 | |
1625 | /// Copy a character, advancing the pointers |
1626 | /// |
1627 | /// @param[in,out] fp Source of the character to copy. |
1628 | /// @param[in,out] tp Destination to copy to. |
1629 | void mb_copy_char(const char_u **const fp, char_u **const tp) |
1630 | { |
1631 | const size_t l = (size_t)utfc_ptr2len(*fp); |
1632 | |
1633 | memmove(*tp, *fp, l); |
1634 | *tp += l; |
1635 | *fp += l; |
1636 | } |
1637 | |
1638 | /* |
1639 | * Return the offset from "p" to the first byte of a character. When "p" is |
1640 | * at the start of a character 0 is returned, otherwise the offset to the next |
1641 | * character. Can start anywhere in a stream of bytes. |
1642 | */ |
1643 | int mb_off_next(char_u *base, char_u *p) |
1644 | { |
1645 | int i; |
1646 | int j; |
1647 | |
1648 | if (*p < 0x80) { // be quick for ASCII |
1649 | return 0; |
1650 | } |
1651 | |
1652 | // Find the next character that isn't 10xx.xxxx |
1653 | for (i = 0; (p[i] & 0xc0) == 0x80; i++) {} |
1654 | if (i > 0) { |
1655 | // Check for illegal sequence. |
1656 | for (j = 0; p - j > base; j++) { |
1657 | if ((p[-j] & 0xc0) != 0x80) { |
1658 | break; |
1659 | } |
1660 | } |
1661 | if (utf8len_tab[p[-j]] != i + j) { |
1662 | return 0; |
1663 | } |
1664 | } |
1665 | return i; |
1666 | } |
1667 | |
1668 | /* |
1669 | * Return the offset from "p" to the last byte of the character it points |
1670 | * into. Can start anywhere in a stream of bytes. |
1671 | */ |
1672 | int mb_tail_off(char_u *base, char_u *p) |
1673 | { |
1674 | int i; |
1675 | int j; |
1676 | |
1677 | if (*p == NUL) |
1678 | return 0; |
1679 | |
1680 | // Find the last character that is 10xx.xxxx |
1681 | for (i = 0; (p[i + 1] & 0xc0) == 0x80; i++) {} |
1682 | |
1683 | // Check for illegal sequence. |
1684 | for (j = 0; p - j > base; j++) { |
1685 | if ((p[-j] & 0xc0) != 0x80) { |
1686 | break; |
1687 | } |
1688 | } |
1689 | |
1690 | if (utf8len_tab[p[-j]] != i + j + 1) { |
1691 | return 0; |
1692 | } |
1693 | return i; |
1694 | } |
1695 | |
/// Find the next illegal byte sequence, searching from the cursor position to
/// the last line of the current buffer.
///
/// When one is found the cursor is left on it.  Otherwise the cursor is put
/// back where it was and beep_flush() is called.
void utf_find_illegal(void)
{
  pos_T pos = curwin->w_cursor;  // start position, restored on failure
  char_u *p;
  int len;
  vimconv_T vimconv;
  char_u *tofree = NULL;  // converted copy of the text, when converting

  vimconv.vc_type = CONV_NONE;
  if (enc_canon_props(curbuf->b_p_fenc) & ENC_8BIT) {
    // 'encoding' is "utf-8" but we are editing a 8-bit encoded file,
    // possibly a utf-8 file with illegal bytes.  Setup for conversion
    // from utf-8 to 'fileencoding'.
    convert_setup(&vimconv, p_enc, curbuf->b_p_fenc);
  }

  curwin->w_cursor.coladd = 0;
  for (;; ) {
    // Text from the cursor to the end of the line.
    p = get_cursor_pos_ptr();
    if (vimconv.vc_type != CONV_NONE) {
      // Search in the converted text instead; a failed conversion ends the
      // search.
      xfree(tofree);
      tofree = string_convert(&vimconv, p, NULL);
      if (tofree == NULL)
        break;
      p = tofree;
    }

    while (*p != NUL) {
      // Illegal means that there are not enough trail bytes (checked by
      // utf_ptr2len()) or too many of them (overlong sequence).
      len = utf_ptr2len(p);
      if (*p >= 0x80 && (len == 1
                         || utf_char2len(utf_ptr2char(p)) != len)) {
        if (vimconv.vc_type == CONV_NONE)
          curwin->w_cursor.col += (colnr_T)(p - get_cursor_pos_ptr());
        else {
          int l;

          // "len" bytes of converted text come before the illegal sequence.
          // Advance the cursor by one character of the buffer line for each
          // of them (presumably one converted byte per character, as the
          // target is an 8-bit encoding).
          len = (int)(p - tofree);
          for (p = get_cursor_pos_ptr(); *p != NUL && len-- > 0; p += l) {
            l = utf_ptr2len(p);
            curwin->w_cursor.col += l;
          }
        }
        goto theend;
      }
      p += len;
    }
    // Nothing found in this line: continue at the start of the next one,
    // unless this was the last line.
    if (curwin->w_cursor.lnum == curbuf->b_ml.ml_line_count)
      break;
    ++curwin->w_cursor.lnum;
    curwin->w_cursor.col = 0;
  }

  // didn't find it: don't move and beep
  curwin->w_cursor = pos;
  beep_flush();

theend:
  // Common cleanup for both outcomes.
  xfree(tofree);
  convert_setup(&vimconv, NULL, NULL);
}
1761 | |
1762 | /* |
1763 | * If the cursor moves on an trail byte, set the cursor on the lead byte. |
1764 | * Thus it moves left if necessary. |
1765 | */ |
1766 | void mb_adjust_cursor(void) |
1767 | { |
1768 | mark_mb_adjustpos(curbuf, &curwin->w_cursor); |
1769 | } |
1770 | |
1771 | /// Checks and adjusts cursor column. Not mode-dependent. |
1772 | /// @see check_cursor_col_win |
1773 | /// |
1774 | /// @param win_ Places cursor on a valid column for this window. |
1775 | void mb_check_adjust_col(void *win_) |
1776 | { |
1777 | win_T *win = (win_T *)win_; |
1778 | colnr_T oldcol = win->w_cursor.col; |
1779 | |
1780 | // Column 0 is always valid. |
1781 | if (oldcol != 0) { |
1782 | char_u *p = ml_get_buf(win->w_buffer, win->w_cursor.lnum, false); |
1783 | colnr_T len = (colnr_T)STRLEN(p); |
1784 | |
1785 | // Empty line or invalid column? |
1786 | if (len == 0 || oldcol < 0) { |
1787 | win->w_cursor.col = 0; |
1788 | } else { |
1789 | // Cursor column too big for line? |
1790 | if (oldcol > len) { |
1791 | win->w_cursor.col = len - 1; |
1792 | } |
1793 | // Move the cursor to the head byte. |
1794 | win->w_cursor.col -= utf_head_off(p, p + win->w_cursor.col); |
1795 | } |
1796 | |
1797 | // Reset `coladd` when the cursor would be on the right half of a |
1798 | // double-wide character. |
1799 | if (win->w_cursor.coladd == 1 && p[win->w_cursor.col] != TAB |
1800 | && vim_isprintc(utf_ptr2char(p + win->w_cursor.col)) |
1801 | && ptr2cells(p + win->w_cursor.col) > 1) { |
1802 | win->w_cursor.coladd = 0; |
1803 | } |
1804 | } |
1805 | } |
1806 | |
1807 | /* |
1808 | * Return a pointer to the character before "*p", if there is one. |
1809 | */ |
1810 | char_u * mb_prevptr( |
1811 | char_u *line, /* start of the string */ |
1812 | char_u *p |
1813 | ) |
1814 | { |
1815 | if (p > line) { |
1816 | MB_PTR_BACK(line, p); |
1817 | } |
1818 | return p; |
1819 | } |
1820 | |
1821 | /* |
1822 | * Return the character length of "str". Each multi-byte character (with |
1823 | * following composing characters) counts as one. |
1824 | */ |
1825 | int mb_charlen(char_u *str) |
1826 | { |
1827 | char_u *p = str; |
1828 | int count; |
1829 | |
1830 | if (p == NULL) |
1831 | return 0; |
1832 | |
1833 | for (count = 0; *p != NUL; count++) |
1834 | p += (*mb_ptr2len)(p); |
1835 | |
1836 | return count; |
1837 | } |
1838 | |
1839 | /* |
1840 | * Like mb_charlen() but for a string with specified length. |
1841 | */ |
1842 | int mb_charlen_len(char_u *str, int len) |
1843 | { |
1844 | char_u *p = str; |
1845 | int count; |
1846 | |
1847 | for (count = 0; *p != NUL && p < str + len; count++) |
1848 | p += (*mb_ptr2len)(p); |
1849 | |
1850 | return count; |
1851 | } |
1852 | |
1853 | /// Try to unescape a multibyte character |
1854 | /// |
1855 | /// Used for the rhs and lhs of the mappings. |
1856 | /// |
1857 | /// @param[in,out] pp String to unescape. Is advanced to just after the bytes |
1858 | /// that form a multibyte character. |
1859 | /// |
1860 | /// @return Unescaped string if it is a multibyte character, NULL if no |
1861 | /// multibyte character was found. Returns a static buffer, always one |
1862 | /// and the same. |
1863 | const char *mb_unescape(const char **const pp) |
1864 | FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL |
1865 | { |
1866 | static char buf[6]; |
1867 | size_t buf_idx = 0; |
1868 | uint8_t *str = (uint8_t *)(*pp); |
1869 | |
1870 | // Must translate K_SPECIAL KS_SPECIAL KE_FILLER to K_SPECIAL and CSI |
1871 | // KS_EXTRA KE_CSI to CSI. |
1872 | // Maximum length of a utf-8 character is 4 bytes. |
1873 | for (size_t str_idx = 0; str[str_idx] != NUL && buf_idx < 4; str_idx++) { |
1874 | if (str[str_idx] == K_SPECIAL |
1875 | && str[str_idx + 1] == KS_SPECIAL |
1876 | && str[str_idx + 2] == KE_FILLER) { |
1877 | buf[buf_idx++] = (char)K_SPECIAL; |
1878 | str_idx += 2; |
1879 | } else if ((str[str_idx] == K_SPECIAL) |
1880 | && str[str_idx + 1] == KS_EXTRA |
1881 | && str[str_idx + 2] == KE_CSI) { |
1882 | buf[buf_idx++] = (char)CSI; |
1883 | str_idx += 2; |
1884 | } else if (str[str_idx] == K_SPECIAL) { |
1885 | break; // A special key can't be a multibyte char. |
1886 | } else { |
1887 | buf[buf_idx++] = (char)str[str_idx]; |
1888 | } |
1889 | buf[buf_idx] = NUL; |
1890 | |
1891 | // Return a multi-byte character if it's found. An illegal sequence |
1892 | // will result in a 1 here. |
1893 | if (utf_ptr2len((const char_u *)buf) > 1) { |
1894 | *pp = (const char *)str + str_idx + 1; |
1895 | return buf; |
1896 | } |
1897 | |
1898 | // Bail out quickly for ASCII. |
1899 | if ((uint8_t)buf[0] < 128) { |
1900 | break; |
1901 | } |
1902 | } |
1903 | return NULL; |
1904 | } |
1905 | |
1906 | |
1907 | /* |
1908 | * Skip the Vim specific head of a 'encoding' name. |
1909 | */ |
1910 | char_u * enc_skip(char_u *p) |
1911 | { |
1912 | if (STRNCMP(p, "2byte-" , 6) == 0) |
1913 | return p + 6; |
1914 | if (STRNCMP(p, "8bit-" , 5) == 0) |
1915 | return p + 5; |
1916 | return p; |
1917 | } |
1918 | |
1919 | /* |
1920 | * Find the canonical name for encoding "enc". |
1921 | * When the name isn't recognized, returns "enc" itself, but with all lower |
1922 | * case characters and '_' replaced with '-'. |
1923 | * Returns an allocated string. |
1924 | */ |
1925 | char_u *enc_canonize(char_u *enc) FUNC_ATTR_NONNULL_RET |
1926 | { |
1927 | char_u *p, *s; |
1928 | int i; |
1929 | |
1930 | if (STRCMP(enc, "default" ) == 0) { |
1931 | // Use the default encoding as found by set_init_1(). |
1932 | return vim_strsave(fenc_default); |
1933 | } |
1934 | |
1935 | /* copy "enc" to allocated memory, with room for two '-' */ |
1936 | char_u *r = xmalloc(STRLEN(enc) + 3); |
1937 | /* Make it all lower case and replace '_' with '-'. */ |
1938 | p = r; |
1939 | for (s = enc; *s != NUL; ++s) { |
1940 | if (*s == '_') |
1941 | *p++ = '-'; |
1942 | else |
1943 | *p++ = TOLOWER_ASC(*s); |
1944 | } |
1945 | *p = NUL; |
1946 | |
1947 | /* Skip "2byte-" and "8bit-". */ |
1948 | p = enc_skip(r); |
1949 | |
1950 | /* Change "microsoft-cp" to "cp". Used in some spell files. */ |
1951 | if (STRNCMP(p, "microsoft-cp" , 12) == 0) |
1952 | STRMOVE(p, p + 10); |
1953 | |
1954 | /* "iso8859" -> "iso-8859" */ |
1955 | if (STRNCMP(p, "iso8859" , 7) == 0) { |
1956 | STRMOVE(p + 4, p + 3); |
1957 | p[3] = '-'; |
1958 | } |
1959 | |
1960 | /* "iso-8859n" -> "iso-8859-n" */ |
1961 | if (STRNCMP(p, "iso-8859" , 8) == 0 && p[8] != '-') { |
1962 | STRMOVE(p + 9, p + 8); |
1963 | p[8] = '-'; |
1964 | } |
1965 | |
1966 | /* "latin-N" -> "latinN" */ |
1967 | if (STRNCMP(p, "latin-" , 6) == 0) |
1968 | STRMOVE(p + 5, p + 6); |
1969 | |
1970 | if (enc_canon_search(p) >= 0) { |
1971 | /* canonical name can be used unmodified */ |
1972 | if (p != r) |
1973 | STRMOVE(r, p); |
1974 | } else if ((i = enc_alias_search(p)) >= 0) { |
1975 | /* alias recognized, get canonical name */ |
1976 | xfree(r); |
1977 | r = vim_strsave((char_u *)enc_canon_table[i].name); |
1978 | } |
1979 | return r; |
1980 | } |
1981 | |
1982 | /* |
1983 | * Search for an encoding alias of "name". |
1984 | * Returns -1 when not found. |
1985 | */ |
1986 | static int enc_alias_search(char_u *name) |
1987 | { |
1988 | int i; |
1989 | |
1990 | for (i = 0; enc_alias_table[i].name != NULL; ++i) |
1991 | if (STRCMP(name, enc_alias_table[i].name) == 0) |
1992 | return enc_alias_table[i].canon; |
1993 | return -1; |
1994 | } |
1995 | |
1996 | |
1997 | #ifdef HAVE_LANGINFO_H |
1998 | # include <langinfo.h> |
1999 | #endif |
2000 | |
2001 | /* |
2002 | * Get the canonicalized encoding of the current locale. |
2003 | * Returns an allocated string when successful, NULL when not. |
2004 | */ |
2005 | char_u * enc_locale(void) |
2006 | { |
2007 | int i; |
2008 | char buf[50]; |
2009 | |
2010 | const char *s; |
2011 | # ifdef HAVE_NL_LANGINFO_CODESET |
2012 | if (!(s = nl_langinfo(CODESET)) || *s == NUL) |
2013 | # endif |
2014 | { |
2015 | # if defined(HAVE_LOCALE_H) |
2016 | if (!(s = setlocale(LC_CTYPE, NULL)) || *s == NUL) |
2017 | # endif |
2018 | { |
2019 | if ((s = os_getenv("LC_ALL" ))) { |
2020 | if ((s = os_getenv("LC_CTYPE" ))) { |
2021 | s = os_getenv("LANG" ); |
2022 | } |
2023 | } |
2024 | } |
2025 | } |
2026 | |
2027 | if (!s) { |
2028 | return NULL; |
2029 | } |
2030 | |
2031 | // The most generic locale format is: |
2032 | // language[_territory][.codeset][@modifier][+special][,[sponsor][_revision]] |
2033 | // If there is a '.' remove the part before it. |
2034 | // if there is something after the codeset, remove it. |
2035 | // Make the name lowercase and replace '_' with '-'. |
2036 | // Exception: "ja_JP.EUC" == "euc-jp", "zh_CN.EUC" = "euc-cn", |
2037 | // "ko_KR.EUC" == "euc-kr" |
2038 | const char *p = (char *)vim_strchr((char_u *)s, '.'); |
2039 | if (p != NULL) { |
2040 | if (p > s + 2 && !STRNICMP(p + 1, "EUC" , 3) |
2041 | && !isalnum((int)p[4]) && p[4] != '-' && p[-3] == '_') { |
2042 | // Copy "XY.EUC" to "euc-XY" to buf[10]. |
2043 | memmove(buf, "euc-" , 4); |
2044 | buf[4] = (ASCII_ISALNUM(p[-2]) ? TOLOWER_ASC(p[-2]) : 0); |
2045 | buf[5] = (ASCII_ISALNUM(p[-1]) ? TOLOWER_ASC(p[-1]) : 0); |
2046 | buf[6] = NUL; |
2047 | } else { |
2048 | s = p + 1; |
2049 | goto enc_locale_copy_enc; |
2050 | } |
2051 | } else { |
2052 | enc_locale_copy_enc: |
2053 | for (i = 0; i < (int)sizeof(buf) - 1 && s[i] != NUL; i++) { |
2054 | if (s[i] == '_' || s[i] == '-') { |
2055 | buf[i] = '-'; |
2056 | } else if (ASCII_ISALNUM((uint8_t)s[i])) { |
2057 | buf[i] = TOLOWER_ASC(s[i]); |
2058 | } else { |
2059 | break; |
2060 | } |
2061 | } |
2062 | buf[i] = NUL; |
2063 | } |
2064 | |
2065 | return enc_canonize((char_u *)buf); |
2066 | } |
2067 | |
2068 | # if defined(HAVE_ICONV) |
2069 | |
2070 | |
2071 | /* |
2072 | * Call iconv_open() with a check if iconv() works properly (there are broken |
2073 | * versions). |
2074 | * Returns (void *)-1 if failed. |
2075 | * (should return iconv_t, but that causes problems with prototypes). |
2076 | */ |
2077 | void * my_iconv_open(char_u *to, char_u *from) |
2078 | { |
2079 | iconv_t fd; |
2080 | #define ICONV_TESTLEN 400 |
2081 | char_u tobuf[ICONV_TESTLEN]; |
2082 | char *p; |
2083 | size_t tolen; |
2084 | static WorkingStatus iconv_working = kUnknown; |
2085 | |
2086 | if (iconv_working == kBroken) |
2087 | return (void *)-1; /* detected a broken iconv() previously */ |
2088 | |
2089 | fd = iconv_open((char *)enc_skip(to), (char *)enc_skip(from)); |
2090 | |
2091 | if (fd != (iconv_t)-1 && iconv_working == kUnknown) { |
2092 | /* |
2093 | * Do a dummy iconv() call to check if it actually works. There is a |
2094 | * version of iconv() on Linux that is broken. We can't ignore it, |
2095 | * because it's wide-spread. The symptoms are that after outputting |
2096 | * the initial shift state the "to" pointer is NULL and conversion |
2097 | * stops for no apparent reason after about 8160 characters. |
2098 | */ |
2099 | p = (char *)tobuf; |
2100 | tolen = ICONV_TESTLEN; |
2101 | (void)iconv(fd, NULL, NULL, &p, &tolen); |
2102 | if (p == NULL) { |
2103 | iconv_working = kBroken; |
2104 | iconv_close(fd); |
2105 | fd = (iconv_t)-1; |
2106 | } else |
2107 | iconv_working = kWorking; |
2108 | } |
2109 | |
2110 | return (void *)fd; |
2111 | } |
2112 | |
2113 | /* |
2114 | * Convert the string "str[slen]" with iconv(). |
2115 | * If "unconvlenp" is not NULL handle the string ending in an incomplete |
2116 | * sequence and set "*unconvlenp" to the length of it. |
2117 | * Returns the converted string in allocated memory. NULL for an error. |
2118 | * If resultlenp is not NULL, sets it to the result length in bytes. |
2119 | */ |
2120 | static char_u *iconv_string(const vimconv_T *const vcp, char_u *str, |
2121 | size_t slen, size_t *unconvlenp, size_t *resultlenp) |
2122 | { |
2123 | const char *from; |
2124 | size_t fromlen; |
2125 | char *to; |
2126 | size_t tolen; |
2127 | size_t len = 0; |
2128 | size_t done = 0; |
2129 | char_u *result = NULL; |
2130 | char_u *p; |
2131 | int l; |
2132 | |
2133 | from = (char *)str; |
2134 | fromlen = slen; |
2135 | for (;; ) { |
2136 | if (len == 0 || ICONV_ERRNO == ICONV_E2BIG) { |
2137 | /* Allocate enough room for most conversions. When re-allocating |
2138 | * increase the buffer size. */ |
2139 | len = len + fromlen * 2 + 40; |
2140 | p = xmalloc(len); |
2141 | if (done > 0) |
2142 | memmove(p, result, done); |
2143 | xfree(result); |
2144 | result = p; |
2145 | } |
2146 | |
2147 | to = (char *)result + done; |
2148 | tolen = len - done - 2; |
2149 | // Avoid a warning for systems with a wrong iconv() prototype by |
2150 | // casting the second argument to void *. |
2151 | if (iconv(vcp->vc_fd, (void *)&from, &fromlen, &to, &tolen) != SIZE_MAX) { |
2152 | // Finished, append a NUL. |
2153 | *to = NUL; |
2154 | break; |
2155 | } |
2156 | |
2157 | // Check both ICONV_EINVAL and EINVAL, because the dynamically loaded |
2158 | // iconv library may use one of them. |
2159 | if (!vcp->vc_fail && unconvlenp != NULL |
2160 | && (ICONV_ERRNO == ICONV_EINVAL || ICONV_ERRNO == EINVAL)) { |
2161 | // Handle an incomplete sequence at the end. |
2162 | *to = NUL; |
2163 | *unconvlenp = fromlen; |
2164 | break; |
2165 | } else if (!vcp->vc_fail |
2166 | && (ICONV_ERRNO == ICONV_EILSEQ || ICONV_ERRNO == EILSEQ |
2167 | || ICONV_ERRNO == ICONV_EINVAL || ICONV_ERRNO == EINVAL)) { |
2168 | // Check both ICONV_EILSEQ and EILSEQ, because the dynamically loaded |
2169 | // iconv library may use one of them. |
2170 | |
2171 | // Can't convert: insert a '?' and skip a character. This assumes |
2172 | // conversion from 'encoding' to something else. In other |
2173 | // situations we don't know what to skip anyway. |
2174 | *to++ = '?'; |
2175 | if (utf_ptr2cells((char_u *)from) > 1) { |
2176 | *to++ = '?'; |
2177 | } |
2178 | l = utfc_ptr2len_len((const char_u *)from, (int)fromlen); |
2179 | from += l; |
2180 | fromlen -= l; |
2181 | } else if (ICONV_ERRNO != ICONV_E2BIG) { |
2182 | // conversion failed |
2183 | XFREE_CLEAR(result); |
2184 | break; |
2185 | } |
2186 | // Not enough room or skipping illegal sequence. |
2187 | done = to - (char *)result; |
2188 | } |
2189 | |
2190 | if (resultlenp != NULL && result != NULL) |
2191 | *resultlenp = (size_t)(to - (char *)result); |
2192 | return result; |
2193 | } |
2194 | |
2195 | # endif // HAVE_ICONV |
2196 | |
2197 | |
2198 | |
2199 | |
2200 | /* |
2201 | * Setup "vcp" for conversion from "from" to "to". |
2202 | * The names must have been made canonical with enc_canonize(). |
2203 | * vcp->vc_type must have been initialized to CONV_NONE. |
2204 | * Note: cannot be used for conversion from/to ucs-2 and ucs-4 (will use utf-8 |
2205 | * instead). |
2206 | * Afterwards invoke with "from" and "to" equal to NULL to cleanup. |
2207 | * Return FAIL when conversion is not supported, OK otherwise. |
2208 | */ |
2209 | int convert_setup(vimconv_T *vcp, char_u *from, char_u *to) |
2210 | { |
2211 | return convert_setup_ext(vcp, from, true, to, true); |
2212 | } |
2213 | |
2214 | /* |
2215 | * As convert_setup(), but only when from_unicode_is_utf8 is TRUE will all |
2216 | * "from" unicode charsets be considered utf-8. Same for "to". |
2217 | */ |
2218 | int convert_setup_ext(vimconv_T *vcp, char_u *from, bool from_unicode_is_utf8, |
2219 | char_u *to, bool to_unicode_is_utf8) |
2220 | { |
2221 | int from_prop; |
2222 | int to_prop; |
2223 | int from_is_utf8; |
2224 | int to_is_utf8; |
2225 | |
2226 | // Reset to no conversion. |
2227 | # ifdef HAVE_ICONV |
2228 | if (vcp->vc_type == CONV_ICONV && vcp->vc_fd != (iconv_t)-1) { |
2229 | iconv_close(vcp->vc_fd); |
2230 | } |
2231 | # endif |
2232 | *vcp = (vimconv_T)MBYTE_NONE_CONV; |
2233 | |
2234 | /* No conversion when one of the names is empty or they are equal. */ |
2235 | if (from == NULL || *from == NUL || to == NULL || *to == NUL |
2236 | || STRCMP(from, to) == 0) |
2237 | return OK; |
2238 | |
2239 | from_prop = enc_canon_props(from); |
2240 | to_prop = enc_canon_props(to); |
2241 | if (from_unicode_is_utf8) |
2242 | from_is_utf8 = from_prop & ENC_UNICODE; |
2243 | else |
2244 | from_is_utf8 = from_prop == ENC_UNICODE; |
2245 | if (to_unicode_is_utf8) |
2246 | to_is_utf8 = to_prop & ENC_UNICODE; |
2247 | else |
2248 | to_is_utf8 = to_prop == ENC_UNICODE; |
2249 | |
2250 | if ((from_prop & ENC_LATIN1) && to_is_utf8) { |
2251 | /* Internal latin1 -> utf-8 conversion. */ |
2252 | vcp->vc_type = CONV_TO_UTF8; |
2253 | vcp->vc_factor = 2; /* up to twice as long */ |
2254 | } else if ((from_prop & ENC_LATIN9) && to_is_utf8) { |
2255 | /* Internal latin9 -> utf-8 conversion. */ |
2256 | vcp->vc_type = CONV_9_TO_UTF8; |
2257 | vcp->vc_factor = 3; /* up to three as long (euro sign) */ |
2258 | } else if (from_is_utf8 && (to_prop & ENC_LATIN1)) { |
2259 | /* Internal utf-8 -> latin1 conversion. */ |
2260 | vcp->vc_type = CONV_TO_LATIN1; |
2261 | } else if (from_is_utf8 && (to_prop & ENC_LATIN9)) { |
2262 | /* Internal utf-8 -> latin9 conversion. */ |
2263 | vcp->vc_type = CONV_TO_LATIN9; |
2264 | } |
2265 | # ifdef HAVE_ICONV |
2266 | else { // NOLINT(readability/braces) |
2267 | // Use iconv() for conversion. |
2268 | vcp->vc_fd = (iconv_t)my_iconv_open( |
2269 | to_is_utf8 ? (char_u *)"utf-8" : to, |
2270 | from_is_utf8 ? (char_u *)"utf-8" : from); |
2271 | if (vcp->vc_fd != (iconv_t)-1) { |
2272 | vcp->vc_type = CONV_ICONV; |
2273 | vcp->vc_factor = 4; /* could be longer too... */ |
2274 | } |
2275 | } |
2276 | # endif |
2277 | if (vcp->vc_type == CONV_NONE) |
2278 | return FAIL; |
2279 | |
2280 | return OK; |
2281 | } |
2282 | |
2283 | /* |
2284 | * Convert text "ptr[*lenp]" according to "vcp". |
2285 | * Returns the result in allocated memory and sets "*lenp". |
2286 | * When "lenp" is NULL, use NUL terminated strings. |
2287 | * Illegal chars are often changed to "?", unless vcp->vc_fail is set. |
2288 | * When something goes wrong, NULL is returned and "*lenp" is unchanged. |
2289 | */ |
2290 | char_u *string_convert(const vimconv_T *const vcp, char_u *ptr, size_t *lenp) |
2291 | { |
2292 | return string_convert_ext(vcp, ptr, lenp, NULL); |
2293 | } |
2294 | |
2295 | /* |
2296 | * Like string_convert(), but when "unconvlenp" is not NULL and there are is |
2297 | * an incomplete sequence at the end it is not converted and "*unconvlenp" is |
2298 | * set to the number of remaining bytes. |
2299 | */ |
2300 | char_u * string_convert_ext(const vimconv_T *const vcp, char_u *ptr, |
2301 | size_t *lenp, size_t *unconvlenp) |
2302 | { |
2303 | char_u *retval = NULL; |
2304 | char_u *d; |
2305 | int l; |
2306 | int c; |
2307 | |
2308 | size_t len; |
2309 | if (lenp == NULL) |
2310 | len = STRLEN(ptr); |
2311 | else |
2312 | len = *lenp; |
2313 | if (len == 0) |
2314 | return vim_strsave((char_u *)"" ); |
2315 | |
2316 | switch (vcp->vc_type) { |
2317 | case CONV_TO_UTF8: /* latin1 to utf-8 conversion */ |
2318 | retval = xmalloc(len * 2 + 1); |
2319 | d = retval; |
2320 | for (size_t i = 0; i < len; ++i) { |
2321 | c = ptr[i]; |
2322 | if (c < 0x80) |
2323 | *d++ = c; |
2324 | else { |
2325 | *d++ = 0xc0 + ((unsigned)c >> 6); |
2326 | *d++ = 0x80 + (c & 0x3f); |
2327 | } |
2328 | } |
2329 | *d = NUL; |
2330 | if (lenp != NULL) |
2331 | *lenp = (size_t)(d - retval); |
2332 | break; |
2333 | |
2334 | case CONV_9_TO_UTF8: /* latin9 to utf-8 conversion */ |
2335 | retval = xmalloc(len * 3 + 1); |
2336 | d = retval; |
2337 | for (size_t i = 0; i < len; ++i) { |
2338 | c = ptr[i]; |
2339 | switch (c) { |
2340 | case 0xa4: c = 0x20ac; break; /* euro */ |
2341 | case 0xa6: c = 0x0160; break; /* S hat */ |
2342 | case 0xa8: c = 0x0161; break; /* S -hat */ |
2343 | case 0xb4: c = 0x017d; break; /* Z hat */ |
2344 | case 0xb8: c = 0x017e; break; /* Z -hat */ |
2345 | case 0xbc: c = 0x0152; break; /* OE */ |
2346 | case 0xbd: c = 0x0153; break; /* oe */ |
2347 | case 0xbe: c = 0x0178; break; /* Y */ |
2348 | } |
2349 | d += utf_char2bytes(c, d); |
2350 | } |
2351 | *d = NUL; |
2352 | if (lenp != NULL) |
2353 | *lenp = (size_t)(d - retval); |
2354 | break; |
2355 | |
2356 | case CONV_TO_LATIN1: /* utf-8 to latin1 conversion */ |
2357 | case CONV_TO_LATIN9: /* utf-8 to latin9 conversion */ |
2358 | retval = xmalloc(len + 1); |
2359 | d = retval; |
2360 | for (size_t i = 0; i < len; ++i) { |
2361 | l = utf_ptr2len_len(ptr + i, len - i); |
2362 | if (l == 0) |
2363 | *d++ = NUL; |
2364 | else if (l == 1) { |
2365 | uint8_t l_w = utf8len_tab_zero[ptr[i]]; |
2366 | |
2367 | if (l_w == 0) { |
2368 | /* Illegal utf-8 byte cannot be converted */ |
2369 | xfree(retval); |
2370 | return NULL; |
2371 | } |
2372 | if (unconvlenp != NULL && l_w > len - i) { |
2373 | /* Incomplete sequence at the end. */ |
2374 | *unconvlenp = len - i; |
2375 | break; |
2376 | } |
2377 | *d++ = ptr[i]; |
2378 | } else { |
2379 | c = utf_ptr2char(ptr + i); |
2380 | if (vcp->vc_type == CONV_TO_LATIN9) |
2381 | switch (c) { |
2382 | case 0x20ac: c = 0xa4; break; /* euro */ |
2383 | case 0x0160: c = 0xa6; break; /* S hat */ |
2384 | case 0x0161: c = 0xa8; break; /* S -hat */ |
2385 | case 0x017d: c = 0xb4; break; /* Z hat */ |
2386 | case 0x017e: c = 0xb8; break; /* Z -hat */ |
2387 | case 0x0152: c = 0xbc; break; /* OE */ |
2388 | case 0x0153: c = 0xbd; break; /* oe */ |
2389 | case 0x0178: c = 0xbe; break; /* Y */ |
2390 | case 0xa4: |
2391 | case 0xa6: |
2392 | case 0xa8: |
2393 | case 0xb4: |
2394 | case 0xb8: |
2395 | case 0xbc: |
2396 | case 0xbd: |
2397 | case 0xbe: c = 0x100; break; /* not in latin9 */ |
2398 | } |
2399 | if (!utf_iscomposing(c)) { /* skip composing chars */ |
2400 | if (c < 0x100) |
2401 | *d++ = c; |
2402 | else if (vcp->vc_fail) { |
2403 | xfree(retval); |
2404 | return NULL; |
2405 | } else { |
2406 | *d++ = 0xbf; |
2407 | if (utf_char2cells(c) > 1) |
2408 | *d++ = '?'; |
2409 | } |
2410 | } |
2411 | i += l - 1; |
2412 | } |
2413 | } |
2414 | *d = NUL; |
2415 | if (lenp != NULL) |
2416 | *lenp = (size_t)(d - retval); |
2417 | break; |
2418 | |
2419 | # ifdef HAVE_ICONV |
2420 | case CONV_ICONV: // conversion with vcp->vc_fd |
2421 | retval = iconv_string(vcp, ptr, len, unconvlenp, lenp); |
2422 | break; |
2423 | # endif |
2424 | } |
2425 | |
2426 | return retval; |
2427 | } |
2428 | |