1 | // This is an open source non-commercial project. Dear PVS-Studio, please check |
2 | // it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com |
3 | |
4 | /* |
5 | * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub() |
6 | * |
7 | * NOTICE: |
8 | * |
9 | * This is NOT the original regular expression code as written by Henry |
10 | * Spencer. This code has been modified specifically for use with the VIM |
11 | * editor, and should not be used separately from Vim. If you want a good |
12 | * regular expression library, get the original code. The copyright notice |
13 | * that follows is from the original. |
14 | * |
15 | * END NOTICE |
16 | * |
17 | * Copyright (c) 1986 by University of Toronto. |
18 | * Written by Henry Spencer. Not derived from licensed software. |
19 | * |
20 | * Permission is granted to anyone to use this software for any |
21 | * purpose on any computer system, and to redistribute it freely, |
22 | * subject to the following restrictions: |
23 | * |
24 | * 1. The author is not responsible for the consequences of use of |
25 | * this software, no matter how awful, even if they arise |
26 | * from defects in it. |
27 | * |
28 | * 2. The origin of this software must not be misrepresented, either |
29 | * by explicit claim or by omission. |
30 | * |
31 | * 3. Altered versions must be plainly marked as such, and must not |
32 | * be misrepresented as being the original software. |
33 | * |
34 | * Beware that some of this code is subtly aware of the way operator |
35 | * precedence is structured in regular expressions. Serious changes in |
36 | * regular-expression syntax might require a total rethink. |
37 | * |
38 | * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert |
39 | * Webb, Ciaran McCreesh and Bram Moolenaar. |
40 | * Named character class support added by Walter Briscoe (1998 Jul 01) |
41 | */ |
42 | |
43 | /* Uncomment the first if you do not want to see debugging logs or files |
44 | * related to regular expressions, even when compiling with -DDEBUG. |
45 | * Uncomment the second to get the regexp debugging. */ |
46 | /* #undef REGEXP_DEBUG */ |
47 | /* #define REGEXP_DEBUG */ |
48 | |
49 | #include <assert.h> |
50 | #include <inttypes.h> |
51 | #include <stdbool.h> |
52 | #include <string.h> |
53 | |
54 | #include "nvim/vim.h" |
55 | #include "nvim/ascii.h" |
56 | #include "nvim/regexp.h" |
57 | #include "nvim/charset.h" |
58 | #include "nvim/eval.h" |
59 | #include "nvim/ex_cmds2.h" |
60 | #include "nvim/mark.h" |
61 | #include "nvim/memline.h" |
62 | #include "nvim/memory.h" |
63 | #include "nvim/message.h" |
64 | #include "nvim/misc1.h" |
65 | #include "nvim/garray.h" |
66 | #include "nvim/strings.h" |
67 | |
68 | #ifdef REGEXP_DEBUG |
69 | /* show/save debugging data when BT engine is used */ |
70 | # define BT_REGEXP_DUMP |
71 | /* save the debugging data to a file instead of displaying it */ |
72 | # define BT_REGEXP_LOG |
73 | # define BT_REGEXP_DEBUG_LOG |
74 | # define BT_REGEXP_DEBUG_LOG_NAME "bt_regexp_debug.log" |
75 | #endif |
76 | |
77 | /* |
78 | * The "internal use only" fields in regexp_defs.h are present to pass info from |
79 | * compile to execute that permits the execute phase to run lots faster on |
80 | * simple cases. They are: |
81 | * |
82 | * regstart char that must begin a match; NUL if none obvious; Can be a |
83 | * multi-byte character. |
84 | * reganch is the match anchored (at beginning-of-line only)? |
85 | * regmust string (pointer into program) that match must include, or NULL |
86 | * regmlen length of regmust string |
87 | * regflags RF_ values or'ed together |
88 | * |
89 | * Regstart and reganch permit very fast decisions on suitable starting points |
90 | * for a match, cutting down the work a lot. Regmust permits fast rejection |
91 | * of lines that cannot possibly match. The regmust tests are costly enough |
92 | * that vim_regcomp() supplies a regmust only if the r.e. contains something |
93 | * potentially expensive (at present, the only such thing detected is * or + |
94 | * at the start of the r.e., which can involve a lot of backup). Regmlen is |
95 | * supplied because the test in vim_regexec() needs it and vim_regcomp() is |
96 | * computing it anyway. |
97 | */ |
98 | |
99 | /* |
100 | * Structure for regexp "program". This is essentially a linear encoding |
101 | * of a nondeterministic finite-state machine (aka syntax charts or |
102 | * "railroad normal form" in parsing technology). Each node is an opcode |
103 | * plus a "next" pointer, possibly plus an operand. "Next" pointers of |
 * all nodes except BRANCH and BRACE_COMPLEX implement concatenation; a "next"
 * pointer with a BRANCH on both ends of it is connecting two alternatives.
 * (Here we have one of the subtle syntax dependencies: an individual BRANCH
 * (as opposed to a collection of them) is never concatenated with anything
 * because of operator precedence).  The "next" pointer of a BRACE_COMPLEX
109 | * node points to the node after the stuff to be repeated. |
110 | * The operand of some types of node is a literal string; for others, it is a |
111 | * node leading into a sub-FSM. In particular, the operand of a BRANCH node |
112 | * is the first node of the branch. |
113 | * (NB this is *not* a tree structure: the tail of the branch connects to the |
114 | * thing following the set of BRANCHes.) |
115 | * |
116 | * pattern is coded like: |
117 | * |
118 | * +-----------------+ |
119 | * | V |
120 | * <aa>\|<bb> BRANCH <aa> BRANCH <bb> --> END |
121 | * | ^ | ^ |
122 | * +------+ +----------+ |
123 | * |
124 | * |
125 | * +------------------+ |
126 | * V | |
127 | * <aa>* BRANCH BRANCH <aa> --> BACK BRANCH --> NOTHING --> END |
128 | * | | ^ ^ |
129 | * | +---------------+ | |
130 | * +---------------------------------------------+ |
131 | * |
132 | * |
133 | * +----------------------+ |
134 | * V | |
135 | * <aa>\+ BRANCH <aa> --> BRANCH --> BACK BRANCH --> NOTHING --> END |
136 | * | | ^ ^ |
137 | * | +-----------+ | |
138 | * +--------------------------------------------------+ |
139 | * |
140 | * |
141 | * +-------------------------+ |
142 | * V | |
143 | * <aa>\{} BRANCH BRACE_LIMITS --> BRACE_COMPLEX <aa> --> BACK END |
144 | * | | ^ |
145 | * | +----------------+ |
146 | * +-----------------------------------------------+ |
147 | * |
148 | * |
149 | * <aa>\@!<bb> BRANCH NOMATCH <aa> --> END <bb> --> END |
150 | * | | ^ ^ |
151 | * | +----------------+ | |
152 | * +--------------------------------+ |
153 | * |
154 | * +---------+ |
155 | * | V |
156 | * \z[abc] BRANCH BRANCH a BRANCH b BRANCH c BRANCH NOTHING --> END |
157 | * | | | | ^ ^ |
158 | * | | | +-----+ | |
159 | * | | +----------------+ | |
160 | * | +---------------------------+ | |
161 | * +------------------------------------------------------+ |
162 | * |
163 | * They all start with a BRANCH for "\|" alternatives, even when there is only |
164 | * one alternative. |
165 | */ |
166 | |
167 | /* |
168 | * The opcodes are: |
169 | */ |
170 | |
171 | /* definition number opnd? meaning */ |
172 | #define END 0 /* End of program or NOMATCH operand. */ |
173 | #define BOL 1 /* Match "" at beginning of line. */ |
174 | #define EOL 2 /* Match "" at end of line. */ |
175 | #define BRANCH 3 /* node Match this alternative, or the |
176 | * next... */ |
177 | #define BACK 4 /* Match "", "next" ptr points backward. */ |
178 | #define EXACTLY 5 /* str Match this string. */ |
179 | #define NOTHING 6 /* Match empty string. */ |
180 | #define STAR 7 /* node Match this (simple) thing 0 or more |
181 | * times. */ |
182 | #define PLUS 8 /* node Match this (simple) thing 1 or more |
183 | * times. */ |
184 | #define MATCH 9 /* node match the operand zero-width */ |
185 | #define NOMATCH 10 /* node check for no match with operand */ |
186 | #define BEHIND 11 /* node look behind for a match with operand */ |
187 | #define NOBEHIND 12 /* node look behind for no match with operand */ |
188 | #define SUBPAT 13 /* node match the operand here */ |
189 | #define BRACE_SIMPLE 14 /* node Match this (simple) thing between m and |
190 | * n times (\{m,n\}). */ |
191 | #define BOW 15 /* Match "" after [^a-zA-Z0-9_] */ |
192 | #define EOW 16 /* Match "" at [^a-zA-Z0-9_] */ |
193 | #define BRACE_LIMITS 17 /* nr nr define the min & max for BRACE_SIMPLE |
194 | * and BRACE_COMPLEX. */ |
195 | #define NEWL 18 /* Match line-break */ |
196 | #define BHPOS 19 /* End position for BEHIND or NOBEHIND */ |
197 | |
198 | |
/*
 * Character classes: 20-48 normal, 50-78 include a line-break.
 *
 * Adding ADD_NL to a class opcode gives the variant of that class that also
 * matches a line break.  FIRST_NL and LAST_NL are the first and last opcode
 * of that shifted range; WITH_NL(op) tests whether "op" lies inside it.
 *
 * FIRST_NL and LAST_NL are fully parenthesized so that they keep their value
 * when used inside a larger expression (e.g. "2 * FIRST_NL").
 */
#define ADD_NL          30
#define FIRST_NL        (ANY + ADD_NL)
#define ANY             20      /*	Match any one character. */
#define ANYOF           21      /* str	Match any character in this string. */
#define ANYBUT          22      /* str	Match any character not in this
                                 *	string. */
#define IDENT           23      /*	Match identifier char */
#define SIDENT          24      /*	Match identifier char but no digit */
#define KWORD           25      /*	Match keyword char */
#define SKWORD          26      /*	Match word char but no digit */
#define FNAME           27      /*	Match file name char */
#define SFNAME          28      /*	Match file name char but no digit */
#define PRINT           29      /*	Match printable char */
#define SPRINT          30      /*	Match printable char but no digit */
#define WHITE           31      /*	Match whitespace char */
#define NWHITE          32      /*	Match non-whitespace char */
#define DIGIT           33      /*	Match digit char */
#define NDIGIT          34      /*	Match non-digit char */
#define HEX             35      /*	Match hex char */
#define NHEX            36      /*	Match non-hex char */
#define OCTAL           37      /*	Match octal char */
#define NOCTAL          38      /*	Match non-octal char */
#define WORD            39      /*	Match word char */
#define NWORD           40      /*	Match non-word char */
#define HEAD            41      /*	Match head char */
#define NHEAD           42      /*	Match non-head char */
#define ALPHA           43      /*	Match alpha char */
#define NALPHA          44      /*	Match non-alpha char */
#define LOWER           45      /*	Match lowercase char */
#define NLOWER          46      /*	Match non-lowercase char */
#define UPPER           47      /*	Match uppercase char */
#define NUPPER          48      /*	Match non-uppercase char */
#define LAST_NL         (NUPPER + ADD_NL)
// -V:WITH_NL:560
#define WITH_NL(op)     ((op) >= FIRST_NL && (op) <= LAST_NL)
235 | |
236 | #define MOPEN 80 // -89 Mark this point in input as start of |
237 | // \( … \) subexpr. MOPEN + 0 marks start of |
238 | // match. |
239 | #define MCLOSE 90 // -99 Analogous to MOPEN. MCLOSE + 0 marks |
240 | // end of match. |
241 | #define BACKREF 100 // -109 node Match same string again \1-\9. |
242 | |
243 | # define ZOPEN 110 // -119 Mark this point in input as start of |
244 | // \z( … \) subexpr. |
245 | # define ZCLOSE 120 // -129 Analogous to ZOPEN. |
246 | # define ZREF 130 // -139 node Match external submatch \z1-\z9 |
247 | |
248 | #define BRACE_COMPLEX 140 /* -149 node Match nodes between m & n times */ |
249 | |
250 | #define NOPEN 150 // Mark this point in input as start of |
251 | // \%( subexpr. |
252 | #define NCLOSE 151 // Analogous to NOPEN. |
253 | |
254 | #define MULTIBYTECODE 200 /* mbc Match one multi-byte character */ |
255 | #define RE_BOF 201 /* Match "" at beginning of file. */ |
256 | #define RE_EOF 202 /* Match "" at end of file. */ |
257 | #define CURSOR 203 /* Match location of cursor. */ |
258 | |
259 | #define RE_LNUM 204 /* nr cmp Match line number */ |
260 | #define RE_COL 205 /* nr cmp Match column number */ |
261 | #define RE_VCOL 206 /* nr cmp Match virtual column number */ |
262 | |
263 | #define RE_MARK 207 /* mark cmp Match mark position */ |
264 | #define RE_VISUAL 208 /* Match Visual area */ |
265 | #define RE_COMPOSING 209 // any composing characters |
266 | |
267 | /* |
268 | * Magic characters have a special meaning, they don't match literally. |
269 | * Magic characters are negative. This separates them from literal characters |
270 | * (possibly multi-byte). Only ASCII characters can be Magic. |
271 | */ |
272 | #define Magic(x) ((int)(x) - 256) |
273 | #define un_Magic(x) ((x) + 256) |
274 | #define is_Magic(x) ((x) < 0) |
275 | |
/*
 * We should define fptr_T as a pointer to a function returning a pointer to
 * a function returning a pointer to a function ...
 * This is impossible, so we declare a pointer to a function returning a
 * pointer to a function returning void.  This should work for all compilers.
 *
 * Concretely: fptr_T points to a function taking (int *, int) whose result
 * is a pointer to a function taking no arguments and returning void.
 */
typedef void (*(*fptr_T)(int *, int))(void);
283 | |
// Snapshot of the pattern-scanning state.  Most fields carry the same name as
// a file-scope parser variable (regparse, curchr, ...); the comments below
// repeat what those variables are documented to hold.
// NOTE(review): the code that fills and restores this struct is not visible
// in this part of the file -- confirm usage there.
typedef struct {
  char_u *regparse;     // input-scan pointer
  int prevchr_len;      // byte length of previous char
  int curchr;           // currently parsed character
  int prevchr;          // previous character
  int prevprevchr;      // previous-previous character
  int nextchr;          // used for ungetchr()
  int at_start;         // NOTE(review): no matching variable visible here;
                        // meaning to be confirmed
  int prev_at_start;    // NOTE(review): as for at_start -- confirm
  int regnpar;          // () count
} parse_state_T;
295 | |
/*
 * Structure used to save the current input state, when it needs to be
 * restored after trying a match.  Used by reg_save() and reg_restore().
 * Also stores the length of "backpos".
 *
 * Only one member of rs_u is meaningful at a time: "ptr" when matching a
 * single-line regexp, "pos" when matching a multi-line regexp.
 */
typedef struct {
  union {
    char_u  *ptr;       /* reginput pointer, for single-line regexp */
    lpos_T pos;         /* reginput pos, for multi-line regexp */
  } rs_u;
  int rs_len;           /* length of "backpos" at the time of saving */
} regsave_T;
308 | |
/*
 * struct to save start/end pointer/position in for \(\)
 *
 * Holds either a pointer or a line/column position.
 * NOTE(review): presumably "ptr" is used for single-line and "pos" for
 * multi-line matching, as in regsave_T -- confirm against the users.
 */
typedef struct {
  union {
    char_u  *ptr;       /* saved pointer */
    lpos_T pos;         /* saved position */
  } se_u;
} save_se_T;
316 | |
/*
 * used for BEHIND and NOBEHIND matching
 *
 * Bundles two saved input states plus a saved start/end entry for each of
 * the NSUBEXP possible \(\) submatches.
 */
typedef struct regbehind_S {
  regsave_T save_after;         // saved input state; NOTE(review): presumably
                                // the position after the look-behind item --
                                // confirm
  regsave_T save_behind;        // saved input state; NOTE(review): presumably
                                // the position where the behind part is
                                // tried -- confirm
  int save_need_clear_subexpr;  // NOTE(review): presumably a saved copy of a
                                // "need_clear_subexpr" flag that is not
                                // visible here -- confirm
  save_se_T save_start[NSUBEXP];  // saved start of each \(\) submatch
  save_se_T save_end[NSUBEXP];    // saved end of each \(\) submatch
} regbehind_T;
325 | |
// Values for rs_state in regitem_T.
// The enumerators are numbered consecutively starting at 0; each one names
// the kind of program node whose matching is in progress.
typedef enum regstate_E {
  RS_NOPEN = 0,     // NOPEN and NCLOSE
  RS_MOPEN,         // MOPEN + [0-9]
  RS_MCLOSE,        // MCLOSE + [0-9]
  RS_ZOPEN,         // ZOPEN + [0-9]
  RS_ZCLOSE,        // ZCLOSE + [0-9]
  RS_BRANCH,        // BRANCH
  RS_BRCPLX_MORE,   // BRACE_COMPLEX and trying one more match
  RS_BRCPLX_LONG,   // BRACE_COMPLEX and trying longest match
  RS_BRCPLX_SHORT,  // BRACE_COMPLEX and trying shortest match
  RS_NOMATCH,       // NOMATCH
  RS_BEHIND1,       // BEHIND / NOBEHIND matching rest
  RS_BEHIND2,       // BEHIND / NOBEHIND matching behind part
  RS_STAR_LONG,     // STAR/PLUS/BRACE_SIMPLE longest match
  RS_STAR_SHORT,    // STAR/PLUS/BRACE_SIMPLE shortest match
} regstate_T;
343 | |
/*
 * When there are alternatives, an item of this type is put on the regstack
 * to remember what we are doing.
 * Depending on rs_state, it may be preceded on the stack by another type of
 * item that remembers more things.
 */
typedef struct regitem_S {
  regstate_T rs_state;  // what we are doing, one of RS_ above
  uint16_t rs_no;       // submatch nr or BEHIND/NOBEHIND
  char_u *rs_scan;      // current node in program
  union {
    save_se_T sesave;   // saved start/end of a submatch
    regsave_T regsave;  // saved input state
  } rs_un;              // room for saving reginput
} regitem_T;
359 | |
360 | |
/* used for STAR, PLUS and BRACE_SIMPLE matching */
typedef struct regstar_S {
  int nextb;                    /* next byte */
  int nextb_ic;                 /* next byte reverse case */
  long count;                   /* NOTE(review): presumably the number of
                                 * repetitions currently matched -- confirm */
  long minval;                  /* NOTE(review): presumably the minimum repeat
                                 * count allowed -- confirm */
  long maxval;                  /* NOTE(review): presumably the maximum repeat
                                 * count allowed -- confirm */
} regstar_T;
369 | |
/* used to store input position when a BACK was encountered, so that we know
 * if we made any progress since the last time. */
typedef struct backpos_S {
  char_u      *bp_scan;         /* "scan" where BACK was encountered */
  regsave_T bp_pos;             /* last input position */
} backpos_T;
376 | |
// Triple of plain ints.
// NOTE(review): the users of this type are not visible in this part of the
// file; the meaning of a/b/c is to be confirmed there.
typedef struct {
  int a;
  int b;
  int c;
} decomp_T;
380 | |
381 | |
382 | #ifdef INCLUDE_GENERATED_DECLARATIONS |
383 | # include "regexp.c.generated.h" |
384 | #endif |
/*
 * Strip the "magic" marking from a character.
 * Returns un_Magic(x) when "x" is Magic, otherwise "x" unchanged.
 */
static int no_Magic(int x)
{
  return is_Magic(x) ? un_Magic(x) : x;
}
391 | |
/*
 * Flip the "magic" marking of a character.
 * Returns un_Magic(x) when "x" is Magic, otherwise Magic(x).
 */
static int toggle_Magic(int x)
{
  return is_Magic(x) ? un_Magic(x) : Magic(x);
}
398 | |
399 | /* |
400 | * The first byte of the regexp internal "program" is actually this magic |
401 | * number; the start node begins in the second byte. It's used to catch the |
402 | * most severe mutilation of the program by the caller. |
403 | */ |
404 | |
405 | #define REGMAGIC 0234 |
406 | |
407 | /* |
408 | * Opcode notes: |
409 | * |
410 | * BRANCH The set of branches constituting a single choice are hooked |
411 | * together with their "next" pointers, since precedence prevents |
412 | * anything being concatenated to any individual branch. The |
413 | * "next" pointer of the last BRANCH in a choice points to the |
414 | * thing following the whole choice. This is also where the |
415 | * final "next" pointer of each individual branch points; each |
416 | * branch starts with the operand node of a BRANCH node. |
417 | * |
418 | * BACK Normal "next" pointers all implicitly point forward; BACK |
419 | * exists to make loop structures possible. |
420 | * |
421 | * STAR,PLUS '=', and complex '*' and '+', are implemented as circular |
422 | * BRANCH structures using BACK. Simple cases (one character |
423 | * per match) are implemented with STAR and PLUS for speed |
424 | * and to minimize recursive plunges. |
425 | * |
426 | * BRACE_LIMITS This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX |
427 | * node, and defines the min and max limits to be used for that |
428 | * node. |
429 | * |
430 | * MOPEN,MCLOSE ...are numbered at compile time. |
431 | * ZOPEN,ZCLOSE ...ditto |
432 | */ |
433 | |
434 | /* |
435 | * A node is one char of opcode followed by two chars of "next" pointer. |
436 | * "Next" pointers are stored as two 8-bit bytes, high order first. The |
437 | * value is a positive offset from the opcode of the node containing it. |
438 | * An operand, if any, simply follows the node. (Note that much of the |
439 | * code generation knows about this implicit relationship.) |
440 | * |
441 | * Using two bytes for the "next" pointer is vast overkill for most things, |
442 | * but allows patterns to get big without disasters. |
443 | */ |
444 | #define OP(p) ((int)*(p)) |
445 | #define NEXT(p) (((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377)) |
446 | #define OPERAND(p) ((p) + 3) |
447 | /* Obtain an operand that was stored as four bytes, MSB first. */ |
448 | #define OPERAND_MIN(p) (((long)(p)[3] << 24) + ((long)(p)[4] << 16) \ |
449 | + ((long)(p)[5] << 8) + (long)(p)[6]) |
450 | /* Obtain a second operand stored as four bytes. */ |
451 | #define OPERAND_MAX(p) OPERAND_MIN((p) + 4) |
452 | /* Obtain a second single-byte operand stored after a four bytes operand. */ |
453 | #define OPERAND_CMP(p) (p)[7] |
454 | |
455 | /* |
456 | * Utility definitions. |
457 | */ |
458 | #define UCHARAT(p) ((int)*(char_u *)(p)) |
459 | |
460 | /* Used for an error (down from) vim_regcomp(): give the error message, set |
461 | * rc_did_emsg and return NULL */ |
462 | #define EMSG_RET_NULL(m) return (EMSG(m), rc_did_emsg = true, (void *)NULL) |
463 | #define IEMSG_RET_NULL(m) return (IEMSG(m), rc_did_emsg = true, (void *)NULL) |
464 | #define EMSG_RET_FAIL(m) return (EMSG(m), rc_did_emsg = true, FAIL) |
465 | #define EMSG2_RET_NULL(m, c) \ |
466 | return (EMSG2((m), (c) ? "" : "\\"), rc_did_emsg = true, (void *)NULL) |
467 | #define EMSG2_RET_FAIL(m, c) \ |
468 | return (EMSG2((m), (c) ? "" : "\\"), rc_did_emsg = true, FAIL) |
469 | #define EMSG_ONE_RET_NULL EMSG2_RET_NULL(_( \ |
470 | "E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL) |
471 | |
472 | #define MAX_LIMIT (32767L << 16L) |
473 | |
474 | |
475 | #ifdef BT_REGEXP_DUMP |
476 | static void regdump(char_u *, bt_regprog_T *); |
477 | #endif |
478 | #ifdef REGEXP_DEBUG |
479 | static char_u *regprop(char_u *); |
480 | #endif |
481 | |
482 | static char_u e_missingbracket[] = N_("E769: Missing ] after %s[" ); |
483 | static char_u e_reverse_range[] = N_("E944: Reverse range in character class" ); |
484 | static char_u e_large_class[] = N_("E945: Range too large in character class" ); |
485 | static char_u e_unmatchedpp[] = N_("E53: Unmatched %s%%(" ); |
486 | static char_u e_unmatchedp[] = N_("E54: Unmatched %s(" ); |
487 | static char_u e_unmatchedpar[] = N_("E55: Unmatched %s)" ); |
488 | static char_u e_z_not_allowed[] = N_("E66: \\z( not allowed here" ); |
489 | static char_u e_z1_not_allowed[] = N_("E67: \\z1 - \\z9 not allowed here" ); |
490 | static char_u e_missing_sb[] = N_("E69: Missing ] after %s%%[" ); |
491 | static char_u e_empty_sb[] = N_("E70: Empty %s%%[]" ); |
492 | #define NOT_MULTI 0 |
493 | #define MULTI_ONE 1 |
494 | #define MULTI_MULT 2 |
495 | /* |
496 | * Return NOT_MULTI if c is not a "multi" operator. |
497 | * Return MULTI_ONE if c is a single "multi" operator. |
498 | * Return MULTI_MULT if c is a multi "multi" operator. |
499 | */ |
500 | static int re_multi_type(int c) |
501 | { |
502 | if (c == Magic('@') || c == Magic('=') || c == Magic('?')) |
503 | return MULTI_ONE; |
504 | if (c == Magic('*') || c == Magic('+') || c == Magic('{')) |
505 | return MULTI_MULT; |
506 | return NOT_MULTI; |
507 | } |
508 | |
509 | /* |
510 | * Flags to be passed up and down. |
511 | */ |
512 | #define HASWIDTH 0x1 /* Known never to match null string. */ |
513 | #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */ |
514 | #define SPSTART 0x4 /* Starts with * or +. */ |
515 | #define HASNL 0x8 /* Contains some \n. */ |
516 | #define HASLOOKBH 0x10 /* Contains "\@<=" or "\@<!". */ |
517 | #define WORST 0 /* Worst case. */ |
518 | |
519 | /* |
520 | * When regcode is set to this value, code is not emitted and size is computed |
521 | * instead. |
522 | */ |
523 | #define JUST_CALC_SIZE ((char_u *) -1) |
524 | |
525 | static char_u *reg_prev_sub = NULL; |
526 | |
527 | /* |
528 | * REGEXP_INRANGE contains all characters which are always special in a [] |
529 | * range after '\'. |
530 | * REGEXP_ABBR contains all characters which act as abbreviations after '\'. |
531 | * These are: |
532 | * \n - New line (NL). |
533 | * \r - Carriage Return (CR). |
534 | * \t - Tab (TAB). |
535 | * \e - Escape (ESC). |
536 | * \b - Backspace (Ctrl_H). |
537 | * \d - Character code in decimal, eg \d123 |
538 | * \o - Character code in octal, eg \o80 |
539 | * \x - Character code in hex, eg \x4a |
540 | * \u - Multibyte character code, eg \u20ac |
541 | * \U - Long multibyte character code, eg \U12345678 |
542 | */ |
543 | static char_u REGEXP_INRANGE[] = "]^-n\\" ; |
544 | static char_u REGEXP_ABBR[] = "nrtebdoxuU" ; |
545 | |
546 | |
547 | /* |
548 | * Translate '\x' to its control character, except "\n", which is Magic. |
549 | */ |
550 | static int backslash_trans(int c) |
551 | { |
552 | switch (c) { |
553 | case 'r': return CAR; |
554 | case 't': return TAB; |
555 | case 'e': return ESC; |
556 | case 'b': return BS; |
557 | } |
558 | return c; |
559 | } |
560 | |
561 | /* |
562 | * Check for a character class name "[:name:]". "pp" points to the '['. |
563 | * Returns one of the CLASS_ items. CLASS_NONE means that no item was |
564 | * recognized. Otherwise "pp" is advanced to after the item. |
565 | */ |
566 | static int get_char_class(char_u **pp) |
567 | { |
568 | static const char *(class_names[]) = |
569 | { |
570 | "alnum:]" , |
571 | #define CLASS_ALNUM 0 |
572 | "alpha:]" , |
573 | #define CLASS_ALPHA 1 |
574 | "blank:]" , |
575 | #define CLASS_BLANK 2 |
576 | "cntrl:]" , |
577 | #define CLASS_CNTRL 3 |
578 | "digit:]" , |
579 | #define CLASS_DIGIT 4 |
580 | "graph:]" , |
581 | #define CLASS_GRAPH 5 |
582 | "lower:]" , |
583 | #define CLASS_LOWER 6 |
584 | "print:]" , |
585 | #define CLASS_PRINT 7 |
586 | "punct:]" , |
587 | #define CLASS_PUNCT 8 |
588 | "space:]" , |
589 | #define CLASS_SPACE 9 |
590 | "upper:]" , |
591 | #define CLASS_UPPER 10 |
592 | "xdigit:]" , |
593 | #define CLASS_XDIGIT 11 |
594 | "tab:]" , |
595 | #define CLASS_TAB 12 |
596 | "return:]" , |
597 | #define CLASS_RETURN 13 |
598 | "backspace:]" , |
599 | #define CLASS_BACKSPACE 14 |
600 | "escape:]" , |
601 | #define CLASS_ESCAPE 15 |
602 | }; |
603 | #define CLASS_NONE 99 |
604 | int i; |
605 | |
606 | if ((*pp)[1] == ':') { |
607 | for (i = 0; i < (int)ARRAY_SIZE(class_names); ++i) |
608 | if (STRNCMP(*pp + 2, class_names[i], STRLEN(class_names[i])) == 0) { |
609 | *pp += STRLEN(class_names[i]) + 2; |
610 | return i; |
611 | } |
612 | } |
613 | return CLASS_NONE; |
614 | } |
615 | |
/*
 * Specific version of character class functions.
 * Using a table to keep this fast.
 *
 * class_tab[] holds, for every byte value 0-255, the RI_ bits of the classes
 * that byte belongs to.  It is filled in by init_class_tab().
 */
static short class_tab[256];

#define     RI_DIGIT    0x01
#define     RI_HEX      0x02
#define     RI_OCTAL    0x04
#define     RI_WORD     0x08
#define     RI_HEAD     0x10
#define     RI_ALPHA    0x20
#define     RI_LOWER    0x40
#define     RI_UPPER    0x80
#define     RI_WHITE    0x100

/*
 * Fill class_tab[].  Only the first call does the work; later calls return
 * immediately.
 */
static void init_class_tab(void)
{
  static bool done = false;

  if (done) {
    return;
  }

  for (int i = 0; i < 256; i++) {
    if (i >= '0' && i <= '7') {
      class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
    } else if (i >= '8' && i <= '9') {
      class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
    } else if (i >= 'a' && i <= 'f') {
      class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
    } else if (i >= 'g' && i <= 'z') {
      class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
    } else if (i >= 'A' && i <= 'F') {
      class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
    } else if (i >= 'G' && i <= 'Z') {
      class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
    } else if (i == '_') {
      class_tab[i] = RI_WORD + RI_HEAD;
    } else {
      class_tab[i] = 0;
    }
  }
  // Space and Tab are the only white space characters; they have no other
  // class bits, so OR-ing keeps whatever the loop stored.
  class_tab[' '] |= RI_WHITE;
  class_tab['\t'] |= RI_WHITE;
  done = true;
}

/*
 * Class tests for character "c"; init_class_tab() must have been called.
 * The result is non-zero when "c" is in the class.  Values outside 0-255
 * (multi-byte characters, negative values) are never in a class and do not
 * index class_tab[].
 * The argument is parenthesized so that any expression can be passed; it is
 * evaluated more than once, so it must not have side effects.
 */
# define ri_digit(c)    ((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_DIGIT))
# define ri_hex(c)      ((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_HEX))
# define ri_octal(c)    ((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_OCTAL))
# define ri_word(c)     ((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_WORD))
# define ri_head(c)     ((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_HEAD))
# define ri_alpha(c)    ((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_ALPHA))
# define ri_lower(c)    ((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_LOWER))
# define ri_upper(c)    ((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_UPPER))
# define ri_white(c)    ((c) >= 0 && (c) < 0x100 && (class_tab[(c)] & RI_WHITE))
672 | |
673 | /* flags for regflags */ |
674 | #define RF_ICASE 1 /* ignore case */ |
675 | #define RF_NOICASE 2 /* don't ignore case */ |
676 | #define RF_HASNL 4 /* can match a NL */ |
677 | #define RF_ICOMBINE 8 /* ignore combining characters */ |
678 | #define RF_LOOKBH 16 /* uses "\@<=" or "\@<!" */ |
679 | |
680 | /* |
681 | * Global work variables for vim_regcomp(). |
682 | */ |
683 | |
684 | static char_u *regparse; /* Input-scan pointer. */ |
685 | static int prevchr_len; /* byte length of previous char */ |
686 | static int num_complex_braces; /* Complex \{...} count */ |
687 | static int regnpar; /* () count. */ |
688 | static int regnzpar; /* \z() count. */ |
689 | static int re_has_z; /* \z item detected */ |
690 | static char_u *regcode; /* Code-emit pointer, or JUST_CALC_SIZE */ |
691 | static long regsize; /* Code size. */ |
692 | static int reg_toolong; /* TRUE when offset out of range */ |
693 | static char_u had_endbrace[NSUBEXP]; /* flags, TRUE if end of () found */ |
694 | static unsigned regflags; /* RF_ flags for prog */ |
695 | static long brace_min[10]; /* Minimums for complex brace repeats */ |
696 | static long brace_max[10]; /* Maximums for complex brace repeats */ |
697 | static int brace_count[10]; /* Current counts for complex brace repeats */ |
698 | static int had_eol; /* TRUE when EOL found by vim_regcomp() */ |
699 | static int one_exactly = FALSE; /* only do one char for EXACTLY */ |
700 | |
701 | static int reg_magic; /* magicness of the pattern: */ |
702 | #define MAGIC_NONE 1 /* "\V" very unmagic */ |
703 | #define MAGIC_OFF 2 /* "\M" or 'magic' off */ |
704 | #define MAGIC_ON 3 /* "\m" or 'magic' */ |
705 | #define MAGIC_ALL 4 /* "\v" very magic */ |
706 | |
707 | static int reg_string; /* matching with a string instead of a buffer |
708 | line */ |
709 | static int reg_strict; /* "[abc" is illegal */ |
710 | |
711 | /* |
712 | * META contains all characters that may be magic, except '^' and '$'. |
713 | */ |
714 | |
715 | /* META[] is used often enough to justify turning it into a table. */ |
716 | static char_u META_flags[] = { |
717 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
718 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
719 | /* % & ( ) * + . */ |
720 | 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, |
721 | /* 1 2 3 4 5 6 7 8 9 < = > ? */ |
722 | 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, |
723 | /* @ A C D F H I K L M O */ |
724 | 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, |
725 | /* P S U V W X Z [ _ */ |
726 | 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, |
727 | /* a c d f h i k l m n o */ |
728 | 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, |
729 | /* p s u v w x z { | ~ */ |
730 | 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1 |
731 | }; |
732 | |
733 | static int curchr; /* currently parsed character */ |
734 | /* Previous character. Note: prevchr is sometimes -1 when we are not at the |
735 | * start, eg in /[ ^I]^ the pattern was never found even if it existed, |
736 | * because ^ was taken to be magic -- webb */ |
737 | static int prevchr; |
738 | static int prevprevchr; /* previous-previous character */ |
739 | static int nextchr; /* used for ungetchr() */ |
740 | |
741 | /* arguments for reg() */ |
742 | #define REG_NOPAREN 0 /* toplevel reg() */ |
743 | #define REG_PAREN 1 /* \(\) */ |
744 | #define REG_ZPAREN 2 /* \z(\) */ |
745 | #define REG_NPAREN 3 /* \%(\) */ |
746 | |
747 | /* |
748 | * Forward declarations for vim_regcomp()'s friends. |
749 | */ |
750 | # define REGMBC(x) regmbc(x); |
751 | # define CASEMBC(x) case x: |
752 | |
753 | static regengine_T bt_regengine; |
754 | static regengine_T nfa_regengine; |
755 | |
756 | /* |
757 | * Return TRUE if compiled regular expression "prog" can match a line break. |
758 | */ |
759 | int re_multiline(regprog_T *prog) |
760 | { |
761 | return prog->regflags & RF_HASNL; |
762 | } |
763 | |
764 | /* |
765 | * Check for an equivalence class name "[=a=]". "pp" points to the '['. |
766 | * Returns a character representing the class. Zero means that no item was |
767 | * recognized. Otherwise "pp" is advanced to after the item. |
768 | */ |
769 | static int get_equi_class(char_u **pp) |
770 | { |
771 | int c; |
772 | int l = 1; |
773 | char_u *p = *pp; |
774 | |
775 | if (p[1] == '=' && p[2] != NUL) { |
776 | l = (*mb_ptr2len)(p + 2); |
777 | if (p[l + 2] == '=' && p[l + 3] == ']') { |
778 | c = utf_ptr2char(p + 2); |
779 | *pp += l + 4; |
780 | return c; |
781 | } |
782 | } |
783 | return 0; |
784 | } |
785 | |
786 | |
/*
 * Produce the bytes for equivalence class "c".
 * Currently only handles latin1, latin9 and utf-8.
 * NOTE: When changing this function, also change nfa_emit_equi_class()
 *
 * When 'encoding' is utf-8, "latin1" or "iso-8859-15" and "c" is one of the
 * listed base letters or one of its listed variants, every member of that
 * group is emitted with regmbc(), base letter first.  In all other cases
 * only "c" itself is emitted.
 *
 * Each group has two parallel lists that must be kept in sync: the case
 * labels (plain "case" and CASEMBC()) and the emitted characters (regmbc()
 * and REGMBC()).  REGMBC() already contains the terminating semicolon.
 */
static void reg_equi_class(int c)
{
  if (enc_utf8 || STRCMP(p_enc, "latin1" ) == 0
      || STRCMP(p_enc, "iso-8859-15" ) == 0) {
    switch (c) {
    // Do not use '\300' style, it results in a negative number.
    case 'A': case 0xc0: case 0xc1: case 0xc2:
    case 0xc3: case 0xc4: case 0xc5:
      CASEMBC(0x100) CASEMBC(0x102) CASEMBC(0x104) CASEMBC(0x1cd)
      CASEMBC(0x1de) CASEMBC(0x1e0) CASEMBC(0x1ea2)
      regmbc('A'); regmbc(0xc0); regmbc(0xc1);
      regmbc(0xc2); regmbc(0xc3); regmbc(0xc4);
      regmbc(0xc5);
      REGMBC(0x100) REGMBC(0x102) REGMBC(0x104)
      REGMBC(0x1cd) REGMBC(0x1de) REGMBC(0x1e0)
      REGMBC(0x1ea2)
      return;
    case 'B': CASEMBC(0x1e02) CASEMBC(0x1e06)
      regmbc('B'); REGMBC(0x1e02) REGMBC(0x1e06)
      return;
    case 'C': case 0xc7:
      CASEMBC(0x106) CASEMBC(0x108) CASEMBC(0x10a) CASEMBC(0x10c)
      regmbc('C'); regmbc(0xc7);
      REGMBC(0x106) REGMBC(0x108) REGMBC(0x10a)
      REGMBC(0x10c)
      return;
    case 'D': CASEMBC(0x10e) CASEMBC(0x110) CASEMBC(0x1e0a)
      CASEMBC(0x1e0e) CASEMBC(0x1e10)
      regmbc('D'); REGMBC(0x10e) REGMBC(0x110)
      REGMBC(0x1e0a) REGMBC(0x1e0e) REGMBC(0x1e10)
      return;
    case 'E': case 0xc8: case 0xc9: case 0xca: case 0xcb:
      CASEMBC(0x112) CASEMBC(0x114) CASEMBC(0x116) CASEMBC(0x118)
      CASEMBC(0x11a) CASEMBC(0x1eba) CASEMBC(0x1ebc)
      regmbc('E'); regmbc(0xc8); regmbc(0xc9);
      regmbc(0xca); regmbc(0xcb);
      REGMBC(0x112) REGMBC(0x114) REGMBC(0x116)
      REGMBC(0x118) REGMBC(0x11a) REGMBC(0x1eba)
      REGMBC(0x1ebc)
      return;
    case 'F': CASEMBC(0x1e1e)
      regmbc('F'); REGMBC(0x1e1e)
      return;
    case 'G': CASEMBC(0x11c) CASEMBC(0x11e) CASEMBC(0x120)
      CASEMBC(0x122) CASEMBC(0x1e4) CASEMBC(0x1e6) CASEMBC(0x1f4)
      CASEMBC(0x1e20)
      regmbc('G'); REGMBC(0x11c) REGMBC(0x11e)
      REGMBC(0x120) REGMBC(0x122) REGMBC(0x1e4)
      REGMBC(0x1e6) REGMBC(0x1f4) REGMBC(0x1e20)
      return;
    case 'H': CASEMBC(0x124) CASEMBC(0x126) CASEMBC(0x1e22)
      CASEMBC(0x1e26) CASEMBC(0x1e28)
      regmbc('H'); REGMBC(0x124) REGMBC(0x126)
      REGMBC(0x1e22) REGMBC(0x1e26) REGMBC(0x1e28)
      return;
    case 'I': case 0xcc: case 0xcd: case 0xce: case 0xcf:
      CASEMBC(0x128) CASEMBC(0x12a) CASEMBC(0x12c) CASEMBC(0x12e)
      CASEMBC(0x130) CASEMBC(0x1cf) CASEMBC(0x1ec8)
      regmbc('I'); regmbc(0xcc); regmbc(0xcd);
      regmbc(0xce); regmbc(0xcf);
      REGMBC(0x128) REGMBC(0x12a) REGMBC(0x12c)
      REGMBC(0x12e) REGMBC(0x130) REGMBC(0x1cf)
      REGMBC(0x1ec8)
      return;
    case 'J': CASEMBC(0x134)
      regmbc('J'); REGMBC(0x134)
      return;
    case 'K': CASEMBC(0x136) CASEMBC(0x1e8) CASEMBC(0x1e30)
      CASEMBC(0x1e34)
      regmbc('K'); REGMBC(0x136) REGMBC(0x1e8)
      REGMBC(0x1e30) REGMBC(0x1e34)
      return;
    case 'L': CASEMBC(0x139) CASEMBC(0x13b) CASEMBC(0x13d)
      CASEMBC(0x13f) CASEMBC(0x141) CASEMBC(0x1e3a)
      regmbc('L'); REGMBC(0x139) REGMBC(0x13b)
      REGMBC(0x13d) REGMBC(0x13f) REGMBC(0x141)
      REGMBC(0x1e3a)
      return;
    case 'M': CASEMBC(0x1e3e) CASEMBC(0x1e40)
      regmbc('M'); REGMBC(0x1e3e) REGMBC(0x1e40)
      return;
    case 'N': case 0xd1:
      CASEMBC(0x143) CASEMBC(0x145) CASEMBC(0x147) CASEMBC(0x1e44)
      CASEMBC(0x1e48)
      regmbc('N'); regmbc(0xd1);
      REGMBC(0x143) REGMBC(0x145) REGMBC(0x147)
      REGMBC(0x1e44) REGMBC(0x1e48)
      return;
    case 'O': case 0xd2: case 0xd3: case 0xd4: case 0xd5:
    case 0xd6: case 0xd8:
      CASEMBC(0x14c) CASEMBC(0x14e) CASEMBC(0x150) CASEMBC(0x1a0)
      CASEMBC(0x1d1) CASEMBC(0x1ea) CASEMBC(0x1ec) CASEMBC(0x1ece)
      regmbc('O'); regmbc(0xd2); regmbc(0xd3);
      regmbc(0xd4); regmbc(0xd5); regmbc(0xd6);
      regmbc(0xd8);
      REGMBC(0x14c) REGMBC(0x14e) REGMBC(0x150)
      REGMBC(0x1a0) REGMBC(0x1d1) REGMBC(0x1ea)
      REGMBC(0x1ec) REGMBC(0x1ece)
      return;
    case 'P': case 0x1e54: case 0x1e56:
      regmbc('P'); REGMBC(0x1e54) REGMBC(0x1e56)
      return;
    case 'R': CASEMBC(0x154) CASEMBC(0x156) CASEMBC(0x158)
      CASEMBC(0x1e58) CASEMBC(0x1e5e)
      regmbc('R'); REGMBC(0x154) REGMBC(0x156) REGMBC(0x158)
      REGMBC(0x1e58) REGMBC(0x1e5e)
      return;
    case 'S': CASEMBC(0x15a) CASEMBC(0x15c) CASEMBC(0x15e)
      CASEMBC(0x160) CASEMBC(0x1e60)
      regmbc('S'); REGMBC(0x15a) REGMBC(0x15c)
      REGMBC(0x15e) REGMBC(0x160) REGMBC(0x1e60)
      return;
    case 'T': CASEMBC(0x162) CASEMBC(0x164) CASEMBC(0x166)
      CASEMBC(0x1e6a) CASEMBC(0x1e6e)
      regmbc('T'); REGMBC(0x162) REGMBC(0x164)
      REGMBC(0x166) REGMBC(0x1e6a) REGMBC(0x1e6e)
      return;
    case 'U': case 0xd9: case 0xda: case 0xdb: case 0xdc:
      CASEMBC(0x168) CASEMBC(0x16a) CASEMBC(0x16c) CASEMBC(0x16e)
      CASEMBC(0x170) CASEMBC(0x172) CASEMBC(0x1af) CASEMBC(0x1d3)
      CASEMBC(0x1ee6)
      regmbc('U'); regmbc(0xd9); regmbc(0xda);
      regmbc(0xdb); regmbc(0xdc);
      REGMBC(0x168) REGMBC(0x16a) REGMBC(0x16c)
      REGMBC(0x16e) REGMBC(0x170) REGMBC(0x172)
      REGMBC(0x1af) REGMBC(0x1d3) REGMBC(0x1ee6)
      return;
    case 'V': CASEMBC(0x1e7c)
      regmbc('V'); REGMBC(0x1e7c)
      return;
    case 'W': CASEMBC(0x174) CASEMBC(0x1e80) CASEMBC(0x1e82)
      CASEMBC(0x1e84) CASEMBC(0x1e86)
      regmbc('W'); REGMBC(0x174) REGMBC(0x1e80)
      REGMBC(0x1e82) REGMBC(0x1e84) REGMBC(0x1e86)
      return;
    case 'X': CASEMBC(0x1e8a) CASEMBC(0x1e8c)
      regmbc('X'); REGMBC(0x1e8a) REGMBC(0x1e8c)
      return;
    case 'Y': case 0xdd:
      CASEMBC(0x176) CASEMBC(0x178) CASEMBC(0x1e8e) CASEMBC(0x1ef2)
      CASEMBC(0x1ef6) CASEMBC(0x1ef8)
      regmbc('Y'); regmbc(0xdd);
      REGMBC(0x176) REGMBC(0x178) REGMBC(0x1e8e)
      REGMBC(0x1ef2) REGMBC(0x1ef6) REGMBC(0x1ef8)
      return;
    case 'Z': CASEMBC(0x179) CASEMBC(0x17b) CASEMBC(0x17d)
      CASEMBC(0x1b5) CASEMBC(0x1e90) CASEMBC(0x1e94)
      regmbc('Z'); REGMBC(0x179) REGMBC(0x17b)
      REGMBC(0x17d) REGMBC(0x1b5) REGMBC(0x1e90)
      REGMBC(0x1e94)
      return;
    case 'a': case 0xe0: case 0xe1: case 0xe2:
    case 0xe3: case 0xe4: case 0xe5:
      CASEMBC(0x101) CASEMBC(0x103) CASEMBC(0x105) CASEMBC(0x1ce)
      CASEMBC(0x1df) CASEMBC(0x1e1) CASEMBC(0x1ea3)
      regmbc('a'); regmbc(0xe0); regmbc(0xe1);
      regmbc(0xe2); regmbc(0xe3); regmbc(0xe4);
      regmbc(0xe5);
      REGMBC(0x101) REGMBC(0x103) REGMBC(0x105)
      REGMBC(0x1ce) REGMBC(0x1df) REGMBC(0x1e1)
      REGMBC(0x1ea3)
      return;
    case 'b': CASEMBC(0x1e03) CASEMBC(0x1e07)
      regmbc('b'); REGMBC(0x1e03) REGMBC(0x1e07)
      return;
    case 'c': case 0xe7:
      CASEMBC(0x107) CASEMBC(0x109) CASEMBC(0x10b) CASEMBC(0x10d)
      regmbc('c'); regmbc(0xe7);
      REGMBC(0x107) REGMBC(0x109) REGMBC(0x10b)
      REGMBC(0x10d)
      return;
    case 'd': CASEMBC(0x10f) CASEMBC(0x111) CASEMBC(0x1e0b)
      CASEMBC(0x1e0f) CASEMBC(0x1e11)
      regmbc('d'); REGMBC(0x10f) REGMBC(0x111)
      REGMBC(0x1e0b) REGMBC(0x1e0f) REGMBC(0x1e11)
      return;
    case 'e': case 0xe8: case 0xe9: case 0xea: case 0xeb:
      CASEMBC(0x113) CASEMBC(0x115) CASEMBC(0x117) CASEMBC(0x119)
      CASEMBC(0x11b) CASEMBC(0x1ebb) CASEMBC(0x1ebd)
      regmbc('e'); regmbc(0xe8); regmbc(0xe9);
      regmbc(0xea); regmbc(0xeb);
      REGMBC(0x113) REGMBC(0x115) REGMBC(0x117)
      REGMBC(0x119) REGMBC(0x11b) REGMBC(0x1ebb)
      REGMBC(0x1ebd)
      return;
    case 'f': CASEMBC(0x1e1f)
      regmbc('f'); REGMBC(0x1e1f)
      return;
    case 'g': CASEMBC(0x11d) CASEMBC(0x11f) CASEMBC(0x121)
      CASEMBC(0x123) CASEMBC(0x1e5) CASEMBC(0x1e7) CASEMBC(0x1f5)
      CASEMBC(0x1e21)
      regmbc('g'); REGMBC(0x11d) REGMBC(0x11f)
      REGMBC(0x121) REGMBC(0x123) REGMBC(0x1e5)
      REGMBC(0x1e7) REGMBC(0x1f5) REGMBC(0x1e21)
      return;
    case 'h': CASEMBC(0x125) CASEMBC(0x127) CASEMBC(0x1e23)
      CASEMBC(0x1e27) CASEMBC(0x1e29) CASEMBC(0x1e96)
      regmbc('h'); REGMBC(0x125) REGMBC(0x127)
      REGMBC(0x1e23) REGMBC(0x1e27) REGMBC(0x1e29)
      REGMBC(0x1e96)
      return;
    case 'i': case 0xec: case 0xed: case 0xee: case 0xef:
      CASEMBC(0x129) CASEMBC(0x12b) CASEMBC(0x12d) CASEMBC(0x12f)
      CASEMBC(0x1d0) CASEMBC(0x1ec9)
      regmbc('i'); regmbc(0xec); regmbc(0xed);
      regmbc(0xee); regmbc(0xef);
      REGMBC(0x129) REGMBC(0x12b) REGMBC(0x12d)
      REGMBC(0x12f) REGMBC(0x1d0) REGMBC(0x1ec9)
      return;
    case 'j': CASEMBC(0x135) CASEMBC(0x1f0)
      regmbc('j'); REGMBC(0x135) REGMBC(0x1f0)
      return;
    case 'k': CASEMBC(0x137) CASEMBC(0x1e9) CASEMBC(0x1e31)
      CASEMBC(0x1e35)
      regmbc('k'); REGMBC(0x137) REGMBC(0x1e9)
      REGMBC(0x1e31) REGMBC(0x1e35)
      return;
    case 'l': CASEMBC(0x13a) CASEMBC(0x13c) CASEMBC(0x13e)
      CASEMBC(0x140) CASEMBC(0x142) CASEMBC(0x1e3b)
      regmbc('l'); REGMBC(0x13a) REGMBC(0x13c)
      REGMBC(0x13e) REGMBC(0x140) REGMBC(0x142)
      REGMBC(0x1e3b)
      return;
    case 'm': CASEMBC(0x1e3f) CASEMBC(0x1e41)
      regmbc('m'); REGMBC(0x1e3f) REGMBC(0x1e41)
      return;
    case 'n': case 0xf1:
      CASEMBC(0x144) CASEMBC(0x146) CASEMBC(0x148) CASEMBC(0x149)
      CASEMBC(0x1e45) CASEMBC(0x1e49)
      regmbc('n'); regmbc(0xf1);
      REGMBC(0x144) REGMBC(0x146) REGMBC(0x148)
      REGMBC(0x149) REGMBC(0x1e45) REGMBC(0x1e49)
      return;
    case 'o': case 0xf2: case 0xf3: case 0xf4: case 0xf5:
    case 0xf6: case 0xf8:
      CASEMBC(0x14d) CASEMBC(0x14f) CASEMBC(0x151) CASEMBC(0x1a1)
      CASEMBC(0x1d2) CASEMBC(0x1eb) CASEMBC(0x1ed) CASEMBC(0x1ecf)
      regmbc('o'); regmbc(0xf2); regmbc(0xf3);
      regmbc(0xf4); regmbc(0xf5); regmbc(0xf6);
      regmbc(0xf8);
      REGMBC(0x14d) REGMBC(0x14f) REGMBC(0x151)
      REGMBC(0x1a1) REGMBC(0x1d2) REGMBC(0x1eb)
      REGMBC(0x1ed) REGMBC(0x1ecf)
      return;
    case 'p': CASEMBC(0x1e55) CASEMBC(0x1e57)
      regmbc('p'); REGMBC(0x1e55) REGMBC(0x1e57)
      return;
    case 'r': CASEMBC(0x155) CASEMBC(0x157) CASEMBC(0x159)
      CASEMBC(0x1e59) CASEMBC(0x1e5f)
      regmbc('r'); REGMBC(0x155) REGMBC(0x157) REGMBC(0x159)
      REGMBC(0x1e59) REGMBC(0x1e5f)
      return;
    case 's': CASEMBC(0x15b) CASEMBC(0x15d) CASEMBC(0x15f)
      CASEMBC(0x161) CASEMBC(0x1e61)
      regmbc('s'); REGMBC(0x15b) REGMBC(0x15d)
      REGMBC(0x15f) REGMBC(0x161) REGMBC(0x1e61)
      return;
    case 't': CASEMBC(0x163) CASEMBC(0x165) CASEMBC(0x167)
      CASEMBC(0x1e6b) CASEMBC(0x1e6f) CASEMBC(0x1e97)
      regmbc('t'); REGMBC(0x163) REGMBC(0x165) REGMBC(0x167)
      REGMBC(0x1e6b) REGMBC(0x1e6f) REGMBC(0x1e97)
      return;
    case 'u': case 0xf9: case 0xfa: case 0xfb: case 0xfc:
      CASEMBC(0x169) CASEMBC(0x16b) CASEMBC(0x16d) CASEMBC(0x16f)
      CASEMBC(0x171) CASEMBC(0x173) CASEMBC(0x1b0) CASEMBC(0x1d4)
      CASEMBC(0x1ee7)
      regmbc('u'); regmbc(0xf9); regmbc(0xfa);
      regmbc(0xfb); regmbc(0xfc);
      REGMBC(0x169) REGMBC(0x16b) REGMBC(0x16d)
      REGMBC(0x16f) REGMBC(0x171) REGMBC(0x173)
      REGMBC(0x1b0) REGMBC(0x1d4) REGMBC(0x1ee7)
      return;
    case 'v': CASEMBC(0x1e7d)
      regmbc('v'); REGMBC(0x1e7d)
      return;
    case 'w': CASEMBC(0x175) CASEMBC(0x1e81) CASEMBC(0x1e83)
      CASEMBC(0x1e85) CASEMBC(0x1e87) CASEMBC(0x1e98)
      regmbc('w'); REGMBC(0x175) REGMBC(0x1e81)
      REGMBC(0x1e83) REGMBC(0x1e85) REGMBC(0x1e87)
      REGMBC(0x1e98)
      return;
    case 'x': CASEMBC(0x1e8b) CASEMBC(0x1e8d)
      regmbc('x'); REGMBC(0x1e8b) REGMBC(0x1e8d)
      return;
    case 'y': case 0xfd: case 0xff:
      CASEMBC(0x177) CASEMBC(0x1e8f) CASEMBC(0x1e99)
      CASEMBC(0x1ef3) CASEMBC(0x1ef7) CASEMBC(0x1ef9)
      regmbc('y'); regmbc(0xfd); regmbc(0xff);
      REGMBC(0x177) REGMBC(0x1e8f) REGMBC(0x1e99)
      REGMBC(0x1ef3) REGMBC(0x1ef7) REGMBC(0x1ef9)
      return;
    case 'z': CASEMBC(0x17a) CASEMBC(0x17c) CASEMBC(0x17e)
      CASEMBC(0x1b6) CASEMBC(0x1e91) CASEMBC(0x1e95)
      regmbc('z'); REGMBC(0x17a) REGMBC(0x17c)
      REGMBC(0x17e) REGMBC(0x1b6) REGMBC(0x1e91)
      REGMBC(0x1e95)
      return;
    }
  }
  // Not a known equivalence class (or unsupported encoding): emit "c" only.
  regmbc(c);
}
1093 | |
1094 | /* |
1095 | * Check for a collating element "[.a.]". "pp" points to the '['. |
1096 | * Returns a character. Zero means that no item was recognized. Otherwise |
1097 | * "pp" is advanced to after the item. |
1098 | * Currently only single characters are recognized! |
1099 | */ |
1100 | static int get_coll_element(char_u **pp) |
1101 | { |
1102 | int c; |
1103 | int l = 1; |
1104 | char_u *p = *pp; |
1105 | |
1106 | if (p[0] != NUL && p[1] == '.' && p[2] != NUL) { |
1107 | l = utfc_ptr2len(p + 2); |
1108 | if (p[l + 2] == '.' && p[l + 3] == ']') { |
1109 | c = utf_ptr2char(p + 2); |
1110 | *pp += l + 4; |
1111 | return c; |
1112 | } |
1113 | } |
1114 | return 0; |
1115 | } |
1116 | |
1117 | static int reg_cpo_lit; /* 'cpoptions' contains 'l' flag */ |
1118 | |
1119 | static void get_cpo_flags(void) |
1120 | { |
1121 | reg_cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL; |
1122 | } |
1123 | |
1124 | /* |
1125 | * Skip over a "[]" range. |
1126 | * "p" must point to the character after the '['. |
1127 | * The returned pointer is on the matching ']', or the terminating NUL. |
1128 | */ |
1129 | static char_u *skip_anyof(char_u *p) |
1130 | { |
1131 | int l; |
1132 | |
1133 | if (*p == '^') /* Complement of range. */ |
1134 | ++p; |
1135 | if (*p == ']' || *p == '-') |
1136 | ++p; |
1137 | while (*p != NUL && *p != ']') { |
1138 | if (has_mbyte && (l = (*mb_ptr2len)(p)) > 1) { |
1139 | p += l; |
1140 | } else if (*p == '-') { |
1141 | p++; |
1142 | if (*p != ']' && *p != NUL) { |
1143 | MB_PTR_ADV(p); |
1144 | } |
1145 | } else if (*p == '\\' |
1146 | && (vim_strchr(REGEXP_INRANGE, p[1]) != NULL |
1147 | || (!reg_cpo_lit |
1148 | && vim_strchr(REGEXP_ABBR, p[1]) != NULL))) { |
1149 | p += 2; |
1150 | } else if (*p == '[') { |
1151 | if (get_char_class(&p) == CLASS_NONE |
1152 | && get_equi_class(&p) == 0 |
1153 | && get_coll_element(&p) == 0 |
1154 | && *p != NUL) { |
1155 | p++; // It is not a class name and not NUL |
1156 | } |
1157 | } else { |
1158 | p++; |
1159 | } |
1160 | } |
1161 | |
1162 | return p; |
1163 | } |
1164 | |
1165 | /* |
1166 | * Skip past regular expression. |
1167 | * Stop at end of "startp" or where "dirc" is found ('/', '?', etc). |
1168 | * Take care of characters with a backslash in front of it. |
1169 | * Skip strings inside [ and ]. |
1170 | * When "newp" is not NULL and "dirc" is '?', make an allocated copy of the |
1171 | * expression and change "\?" to "?". If "*newp" is not NULL the expression |
1172 | * is changed in-place. |
1173 | */ |
1174 | char_u *skip_regexp(char_u *startp, int dirc, int magic, char_u **newp) |
1175 | { |
1176 | int mymagic; |
1177 | char_u *p = startp; |
1178 | |
1179 | if (magic) |
1180 | mymagic = MAGIC_ON; |
1181 | else |
1182 | mymagic = MAGIC_OFF; |
1183 | get_cpo_flags(); |
1184 | |
1185 | for (; p[0] != NUL; MB_PTR_ADV(p)) { |
1186 | if (p[0] == dirc) { // found end of regexp |
1187 | break; |
1188 | } |
1189 | if ((p[0] == '[' && mymagic >= MAGIC_ON) |
1190 | || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF)) { |
1191 | p = skip_anyof(p + 1); |
1192 | if (p[0] == NUL) |
1193 | break; |
1194 | } else if (p[0] == '\\' && p[1] != NUL) { |
1195 | if (dirc == '?' && newp != NULL && p[1] == '?') { |
1196 | /* change "\?" to "?", make a copy first. */ |
1197 | if (*newp == NULL) { |
1198 | *newp = vim_strsave(startp); |
1199 | p = *newp + (p - startp); |
1200 | } |
1201 | STRMOVE(p, p + 1); |
1202 | } else |
1203 | ++p; /* skip next character */ |
1204 | if (*p == 'v') |
1205 | mymagic = MAGIC_ALL; |
1206 | else if (*p == 'V') |
1207 | mymagic = MAGIC_NONE; |
1208 | } |
1209 | } |
1210 | return p; |
1211 | } |
1212 | |
1213 | /// Return TRUE if the back reference is legal. We must have seen the close |
1214 | /// brace. |
1215 | /// TODO(vim): Should also check that we don't refer to something repeated |
1216 | /// (+*=): what instance of the repetition should we match? |
1217 | static int seen_endbrace(int refnum) |
1218 | { |
1219 | if (!had_endbrace[refnum]) { |
1220 | char_u *p; |
1221 | |
1222 | // Trick: check if "@<=" or "@<!" follows, in which case |
1223 | // the \1 can appear before the referenced match. |
1224 | for (p = regparse; *p != NUL; p++) { |
1225 | if (p[0] == '@' && p[1] == '<' && (p[2] == '!' || p[2] == '=')) { |
1226 | break; |
1227 | } |
1228 | } |
1229 | |
1230 | if (*p == NUL) { |
1231 | EMSG(_("E65: Illegal back reference" )); |
1232 | rc_did_emsg = true; |
1233 | return false; |
1234 | } |
1235 | } |
1236 | return TRUE; |
1237 | } |
1238 | |
1239 | /* |
1240 | * bt_regcomp() - compile a regular expression into internal code for the |
1241 | * traditional back track matcher. |
1242 | * Returns the program in allocated space. Returns NULL for an error. |
1243 | * |
1244 | * We can't allocate space until we know how big the compiled form will be, |
1245 | * but we can't compile it (and thus know how big it is) until we've got a |
1246 | * place to put the code. So we cheat: we compile it twice, once with code |
1247 | * generation turned off and size counting turned on, and once "for real". |
1248 | * This also means that we don't allocate space until we are sure that the |
1249 | * thing really will compile successfully, and we never have to move the |
1250 | * code and thus invalidate pointers into it. (Note that it has to be in |
1251 | * one piece because free() must be able to free it all.) |
1252 | * |
1253 | * Whether upper/lower case is to be ignored is decided when executing the |
1254 | * program, it does not matter here. |
1255 | * |
1256 | * Beware that the optimization-preparation code in here knows about some |
1257 | * of the structure of the compiled regexp. |
1258 | * "re_flags": RE_MAGIC and/or RE_STRING. |
1259 | */ |
1260 | static regprog_T *bt_regcomp(char_u *expr, int re_flags) |
1261 | { |
1262 | char_u *scan; |
1263 | char_u *longest; |
1264 | int len; |
1265 | int flags; |
1266 | |
1267 | if (expr == NULL) |
1268 | EMSG_RET_NULL(_(e_null)); |
1269 | |
1270 | init_class_tab(); |
1271 | |
1272 | /* |
1273 | * First pass: determine size, legality. |
1274 | */ |
1275 | regcomp_start(expr, re_flags); |
1276 | regcode = JUST_CALC_SIZE; |
1277 | regc(REGMAGIC); |
1278 | if (reg(REG_NOPAREN, &flags) == NULL) |
1279 | return NULL; |
1280 | |
1281 | /* Allocate space. */ |
1282 | bt_regprog_T *r = xmalloc(sizeof(bt_regprog_T) + regsize); |
1283 | |
1284 | /* |
1285 | * Second pass: emit code. |
1286 | */ |
1287 | regcomp_start(expr, re_flags); |
1288 | regcode = r->program; |
1289 | regc(REGMAGIC); |
1290 | if (reg(REG_NOPAREN, &flags) == NULL || reg_toolong) { |
1291 | xfree(r); |
1292 | if (reg_toolong) |
1293 | EMSG_RET_NULL(_("E339: Pattern too long" )); |
1294 | return NULL; |
1295 | } |
1296 | |
1297 | /* Dig out information for optimizations. */ |
1298 | r->regstart = NUL; /* Worst-case defaults. */ |
1299 | r->reganch = 0; |
1300 | r->regmust = NULL; |
1301 | r->regmlen = 0; |
1302 | r->regflags = regflags; |
1303 | if (flags & HASNL) |
1304 | r->regflags |= RF_HASNL; |
1305 | if (flags & HASLOOKBH) |
1306 | r->regflags |= RF_LOOKBH; |
1307 | /* Remember whether this pattern has any \z specials in it. */ |
1308 | r->reghasz = re_has_z; |
1309 | scan = r->program + 1; /* First BRANCH. */ |
1310 | if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ |
1311 | scan = OPERAND(scan); |
1312 | |
1313 | /* Starting-point info. */ |
1314 | if (OP(scan) == BOL || OP(scan) == RE_BOF) { |
1315 | r->reganch++; |
1316 | scan = regnext(scan); |
1317 | } |
1318 | |
1319 | if (OP(scan) == EXACTLY) { |
1320 | r->regstart = utf_ptr2char(OPERAND(scan)); |
1321 | } else if (OP(scan) == BOW |
1322 | || OP(scan) == EOW |
1323 | || OP(scan) == NOTHING |
1324 | || OP(scan) == MOPEN + 0 || OP(scan) == NOPEN |
1325 | || OP(scan) == MCLOSE + 0 || OP(scan) == NCLOSE) { |
1326 | char_u *regnext_scan = regnext(scan); |
1327 | if (OP(regnext_scan) == EXACTLY) { |
1328 | r->regstart = utf_ptr2char(OPERAND(regnext_scan)); |
1329 | } |
1330 | } |
1331 | |
1332 | /* |
1333 | * If there's something expensive in the r.e., find the longest |
1334 | * literal string that must appear and make it the regmust. Resolve |
1335 | * ties in favor of later strings, since the regstart check works |
1336 | * with the beginning of the r.e. and avoiding duplication |
1337 | * strengthens checking. Not a strong reason, but sufficient in the |
1338 | * absence of others. |
1339 | */ |
1340 | /* |
1341 | * When the r.e. starts with BOW, it is faster to look for a regmust |
1342 | * first. Used a lot for "#" and "*" commands. (Added by mool). |
1343 | */ |
1344 | if ((flags & SPSTART || OP(scan) == BOW || OP(scan) == EOW) |
1345 | && !(flags & HASNL)) { |
1346 | longest = NULL; |
1347 | len = 0; |
1348 | for (; scan != NULL; scan = regnext(scan)) |
1349 | if (OP(scan) == EXACTLY && STRLEN(OPERAND(scan)) >= (size_t)len) { |
1350 | longest = OPERAND(scan); |
1351 | len = (int)STRLEN(OPERAND(scan)); |
1352 | } |
1353 | r->regmust = longest; |
1354 | r->regmlen = len; |
1355 | } |
1356 | } |
1357 | #ifdef BT_REGEXP_DUMP |
1358 | regdump(expr, r); |
1359 | #endif |
1360 | r->engine = &bt_regengine; |
1361 | return (regprog_T *)r; |
1362 | } |
1363 | |
1364 | /* |
1365 | * Free a compiled regexp program, returned by bt_regcomp(). |
1366 | */ |
1367 | static void bt_regfree(regprog_T *prog) |
1368 | { |
1369 | xfree(prog); |
1370 | } |
1371 | |
1372 | /* |
1373 | * Setup to parse the regexp. Used once to get the length and once to do it. |
1374 | */ |
1375 | static void |
1376 | regcomp_start ( |
1377 | char_u *expr, |
1378 | int re_flags /* see vim_regcomp() */ |
1379 | ) |
1380 | { |
1381 | initchr(expr); |
1382 | if (re_flags & RE_MAGIC) |
1383 | reg_magic = MAGIC_ON; |
1384 | else |
1385 | reg_magic = MAGIC_OFF; |
1386 | reg_string = (re_flags & RE_STRING); |
1387 | reg_strict = (re_flags & RE_STRICT); |
1388 | get_cpo_flags(); |
1389 | |
1390 | num_complex_braces = 0; |
1391 | regnpar = 1; |
1392 | memset(had_endbrace, 0, sizeof(had_endbrace)); |
1393 | regnzpar = 1; |
1394 | re_has_z = 0; |
1395 | regsize = 0L; |
1396 | reg_toolong = FALSE; |
1397 | regflags = 0; |
1398 | had_eol = FALSE; |
1399 | } |
1400 | |
1401 | /* |
1402 | * Check if during the previous call to vim_regcomp the EOL item "$" has been |
1403 | * found. This is messy, but it works fine. |
1404 | */ |
1405 | int vim_regcomp_had_eol(void) |
1406 | { |
1407 | return had_eol; |
1408 | } |
1409 | |
// variables for parsing reginput
// NOTE(review): the code that sets and reads these two flags lies outside
// this part of the file -- confirm their exact meaning there.
static int at_start;  // True when on the first character
static int prev_at_start;  // True when on the second character
1413 | |
1414 | /* |
1415 | * Parse regular expression, i.e. main body or parenthesized thing. |
1416 | * |
1417 | * Caller must absorb opening parenthesis. |
1418 | * |
1419 | * Combining parenthesis handling with the base level of regular expression |
1420 | * is a trifle forced, but the need to tie the tails of the branches to what |
1421 | * follows makes it hard to avoid. |
1422 | */ |
1423 | static char_u * |
1424 | reg ( |
1425 | int paren, /* REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN */ |
1426 | int *flagp |
1427 | ) |
1428 | { |
1429 | char_u *ret; |
1430 | char_u *br; |
1431 | char_u *ender; |
1432 | int parno = 0; |
1433 | int flags; |
1434 | |
1435 | *flagp = HASWIDTH; /* Tentatively. */ |
1436 | |
1437 | if (paren == REG_ZPAREN) { |
1438 | /* Make a ZOPEN node. */ |
1439 | if (regnzpar >= NSUBEXP) |
1440 | EMSG_RET_NULL(_("E50: Too many \\z(" )); |
1441 | parno = regnzpar; |
1442 | regnzpar++; |
1443 | ret = regnode(ZOPEN + parno); |
1444 | } else if (paren == REG_PAREN) { |
1445 | /* Make a MOPEN node. */ |
1446 | if (regnpar >= NSUBEXP) |
1447 | EMSG2_RET_NULL(_("E51: Too many %s(" ), reg_magic == MAGIC_ALL); |
1448 | parno = regnpar; |
1449 | ++regnpar; |
1450 | ret = regnode(MOPEN + parno); |
1451 | } else if (paren == REG_NPAREN) { |
1452 | /* Make a NOPEN node. */ |
1453 | ret = regnode(NOPEN); |
1454 | } else |
1455 | ret = NULL; |
1456 | |
1457 | /* Pick up the branches, linking them together. */ |
1458 | br = regbranch(&flags); |
1459 | if (br == NULL) |
1460 | return NULL; |
1461 | if (ret != NULL) |
1462 | regtail(ret, br); /* [MZ]OPEN -> first. */ |
1463 | else |
1464 | ret = br; |
1465 | /* If one of the branches can be zero-width, the whole thing can. |
1466 | * If one of the branches has * at start or matches a line-break, the |
1467 | * whole thing can. */ |
1468 | if (!(flags & HASWIDTH)) |
1469 | *flagp &= ~HASWIDTH; |
1470 | *flagp |= flags & (SPSTART | HASNL | HASLOOKBH); |
1471 | while (peekchr() == Magic('|')) { |
1472 | skipchr(); |
1473 | br = regbranch(&flags); |
1474 | if (br == NULL || reg_toolong) |
1475 | return NULL; |
1476 | regtail(ret, br); /* BRANCH -> BRANCH. */ |
1477 | if (!(flags & HASWIDTH)) |
1478 | *flagp &= ~HASWIDTH; |
1479 | *flagp |= flags & (SPSTART | HASNL | HASLOOKBH); |
1480 | } |
1481 | |
1482 | /* Make a closing node, and hook it on the end. */ |
1483 | ender = regnode( |
1484 | paren == REG_ZPAREN ? ZCLOSE + parno : |
1485 | paren == REG_PAREN ? MCLOSE + parno : |
1486 | paren == REG_NPAREN ? NCLOSE : END); |
1487 | regtail(ret, ender); |
1488 | |
1489 | /* Hook the tails of the branches to the closing node. */ |
1490 | for (br = ret; br != NULL; br = regnext(br)) |
1491 | regoptail(br, ender); |
1492 | |
1493 | /* Check for proper termination. */ |
1494 | if (paren != REG_NOPAREN && getchr() != Magic(')')) { |
1495 | if (paren == REG_ZPAREN) |
1496 | EMSG_RET_NULL(_("E52: Unmatched \\z(" )); |
1497 | else if (paren == REG_NPAREN) |
1498 | EMSG2_RET_NULL(_(e_unmatchedpp), reg_magic == MAGIC_ALL); |
1499 | else |
1500 | EMSG2_RET_NULL(_(e_unmatchedp), reg_magic == MAGIC_ALL); |
1501 | } else if (paren == REG_NOPAREN && peekchr() != NUL) { |
1502 | if (curchr == Magic(')')) |
1503 | EMSG2_RET_NULL(_(e_unmatchedpar), reg_magic == MAGIC_ALL); |
1504 | else |
1505 | EMSG_RET_NULL(_(e_trailing)); /* "Can't happen". */ |
1506 | /* NOTREACHED */ |
1507 | } |
1508 | /* |
1509 | * Here we set the flag allowing back references to this set of |
1510 | * parentheses. |
1511 | */ |
1512 | if (paren == REG_PAREN) |
1513 | had_endbrace[parno] = TRUE; /* have seen the close paren */ |
1514 | return ret; |
1515 | } |
1516 | |
1517 | /* |
1518 | * Parse one alternative of an | operator. |
1519 | * Implements the & operator. |
1520 | */ |
1521 | static char_u *regbranch(int *flagp) |
1522 | { |
1523 | char_u *ret; |
1524 | char_u *chain = NULL; |
1525 | char_u *latest; |
1526 | int flags; |
1527 | |
1528 | *flagp = WORST | HASNL; /* Tentatively. */ |
1529 | |
1530 | ret = regnode(BRANCH); |
1531 | for (;; ) { |
1532 | latest = regconcat(&flags); |
1533 | if (latest == NULL) |
1534 | return NULL; |
1535 | /* If one of the branches has width, the whole thing has. If one of |
1536 | * the branches anchors at start-of-line, the whole thing does. |
1537 | * If one of the branches uses look-behind, the whole thing does. */ |
1538 | *flagp |= flags & (HASWIDTH | SPSTART | HASLOOKBH); |
1539 | /* If one of the branches doesn't match a line-break, the whole thing |
1540 | * doesn't. */ |
1541 | *flagp &= ~HASNL | (flags & HASNL); |
1542 | if (chain != NULL) |
1543 | regtail(chain, latest); |
1544 | if (peekchr() != Magic('&')) |
1545 | break; |
1546 | skipchr(); |
1547 | regtail(latest, regnode(END)); /* operand ends */ |
1548 | if (reg_toolong) |
1549 | break; |
1550 | reginsert(MATCH, latest); |
1551 | chain = latest; |
1552 | } |
1553 | |
1554 | return ret; |
1555 | } |
1556 | |
1557 | /* |
1558 | * Parse one alternative of an | or & operator. |
1559 | * Implements the concatenation operator. |
1560 | */ |
1561 | static char_u *regconcat(int *flagp) |
1562 | { |
1563 | char_u *first = NULL; |
1564 | char_u *chain = NULL; |
1565 | char_u *latest; |
1566 | int flags; |
1567 | int cont = TRUE; |
1568 | |
1569 | *flagp = WORST; /* Tentatively. */ |
1570 | |
1571 | while (cont) { |
1572 | switch (peekchr()) { |
1573 | case NUL: |
1574 | case Magic('|'): |
1575 | case Magic('&'): |
1576 | case Magic(')'): |
1577 | cont = FALSE; |
1578 | break; |
1579 | case Magic('Z'): |
1580 | regflags |= RF_ICOMBINE; |
1581 | skipchr_keepstart(); |
1582 | break; |
1583 | case Magic('c'): |
1584 | regflags |= RF_ICASE; |
1585 | skipchr_keepstart(); |
1586 | break; |
1587 | case Magic('C'): |
1588 | regflags |= RF_NOICASE; |
1589 | skipchr_keepstart(); |
1590 | break; |
1591 | case Magic('v'): |
1592 | reg_magic = MAGIC_ALL; |
1593 | skipchr_keepstart(); |
1594 | curchr = -1; |
1595 | break; |
1596 | case Magic('m'): |
1597 | reg_magic = MAGIC_ON; |
1598 | skipchr_keepstart(); |
1599 | curchr = -1; |
1600 | break; |
1601 | case Magic('M'): |
1602 | reg_magic = MAGIC_OFF; |
1603 | skipchr_keepstart(); |
1604 | curchr = -1; |
1605 | break; |
1606 | case Magic('V'): |
1607 | reg_magic = MAGIC_NONE; |
1608 | skipchr_keepstart(); |
1609 | curchr = -1; |
1610 | break; |
1611 | default: |
1612 | latest = regpiece(&flags); |
1613 | if (latest == NULL || reg_toolong) |
1614 | return NULL; |
1615 | *flagp |= flags & (HASWIDTH | HASNL | HASLOOKBH); |
1616 | if (chain == NULL) /* First piece. */ |
1617 | *flagp |= flags & SPSTART; |
1618 | else |
1619 | regtail(chain, latest); |
1620 | chain = latest; |
1621 | if (first == NULL) |
1622 | first = latest; |
1623 | break; |
1624 | } |
1625 | } |
1626 | if (first == NULL) /* Loop ran zero times. */ |
1627 | first = regnode(NOTHING); |
1628 | return first; |
1629 | } |
1630 | |
1631 | /* |
1632 | * Parse something followed by possible [*+=]. |
1633 | * |
1634 | * Note that the branching code sequences used for = and the general cases |
1635 | * of * and + are somewhat optimized: they use the same NOTHING node as |
1636 | * both the endmarker for their branch list and the body of the last branch. |
1637 | * It might seem that this node could be dispensed with entirely, but the |
1638 | * endmarker role is not redundant. |
1639 | */ |
1640 | static char_u *regpiece(int *flagp) |
1641 | { |
1642 | char_u *ret; |
1643 | int op; |
1644 | char_u *next; |
1645 | int flags; |
1646 | long minval; |
1647 | long maxval; |
1648 | |
1649 | ret = regatom(&flags); |
1650 | if (ret == NULL) |
1651 | return NULL; |
1652 | |
1653 | op = peekchr(); |
1654 | if (re_multi_type(op) == NOT_MULTI) { |
1655 | *flagp = flags; |
1656 | return ret; |
1657 | } |
1658 | /* default flags */ |
1659 | *flagp = (WORST | SPSTART | (flags & (HASNL | HASLOOKBH))); |
1660 | |
1661 | skipchr(); |
1662 | switch (op) { |
1663 | case Magic('*'): |
1664 | if (flags & SIMPLE) |
1665 | reginsert(STAR, ret); |
1666 | else { |
1667 | /* Emit x* as (x&|), where & means "self". */ |
1668 | reginsert(BRANCH, ret); /* Either x */ |
1669 | regoptail(ret, regnode(BACK)); /* and loop */ |
1670 | regoptail(ret, ret); /* back */ |
1671 | regtail(ret, regnode(BRANCH)); /* or */ |
1672 | regtail(ret, regnode(NOTHING)); /* null. */ |
1673 | } |
1674 | break; |
1675 | |
1676 | case Magic('+'): |
1677 | if (flags & SIMPLE) |
1678 | reginsert(PLUS, ret); |
1679 | else { |
1680 | /* Emit x+ as x(&|), where & means "self". */ |
1681 | next = regnode(BRANCH); /* Either */ |
1682 | regtail(ret, next); |
1683 | regtail(regnode(BACK), ret); /* loop back */ |
1684 | regtail(next, regnode(BRANCH)); /* or */ |
1685 | regtail(ret, regnode(NOTHING)); /* null. */ |
1686 | } |
1687 | *flagp = (WORST | HASWIDTH | (flags & (HASNL | HASLOOKBH))); |
1688 | break; |
1689 | |
1690 | case Magic('@'): |
1691 | { |
1692 | int lop = END; |
1693 | int64_t nr = getdecchrs(); |
1694 | |
1695 | switch (no_Magic(getchr())) { |
1696 | case '=': lop = MATCH; break; /* \@= */ |
1697 | case '!': lop = NOMATCH; break; /* \@! */ |
1698 | case '>': lop = SUBPAT; break; /* \@> */ |
1699 | case '<': switch (no_Magic(getchr())) { |
1700 | case '=': lop = BEHIND; break; /* \@<= */ |
1701 | case '!': lop = NOBEHIND; break; /* \@<! */ |
1702 | } |
1703 | } |
1704 | if (lop == END) |
1705 | EMSG2_RET_NULL(_("E59: invalid character after %s@" ), |
1706 | reg_magic == MAGIC_ALL); |
1707 | /* Look behind must match with behind_pos. */ |
1708 | if (lop == BEHIND || lop == NOBEHIND) { |
1709 | regtail(ret, regnode(BHPOS)); |
1710 | *flagp |= HASLOOKBH; |
1711 | } |
1712 | regtail(ret, regnode(END)); /* operand ends */ |
1713 | if (lop == BEHIND || lop == NOBEHIND) { |
1714 | if (nr < 0) |
1715 | nr = 0; /* no limit is same as zero limit */ |
1716 | reginsert_nr(lop, (uint32_t)nr, ret); |
1717 | } else |
1718 | reginsert(lop, ret); |
1719 | break; |
1720 | } |
1721 | |
1722 | case Magic('?'): |
1723 | case Magic('='): |
1724 | /* Emit x= as (x|) */ |
1725 | reginsert(BRANCH, ret); /* Either x */ |
1726 | regtail(ret, regnode(BRANCH)); /* or */ |
1727 | next = regnode(NOTHING); /* null. */ |
1728 | regtail(ret, next); |
1729 | regoptail(ret, next); |
1730 | break; |
1731 | |
1732 | case Magic('{'): |
1733 | if (!read_limits(&minval, &maxval)) |
1734 | return NULL; |
1735 | if (flags & SIMPLE) { |
1736 | reginsert(BRACE_SIMPLE, ret); |
1737 | reginsert_limits(BRACE_LIMITS, minval, maxval, ret); |
1738 | } else { |
1739 | if (num_complex_braces >= 10) |
1740 | EMSG2_RET_NULL(_("E60: Too many complex %s{...}s" ), |
1741 | reg_magic == MAGIC_ALL); |
1742 | reginsert(BRACE_COMPLEX + num_complex_braces, ret); |
1743 | regoptail(ret, regnode(BACK)); |
1744 | regoptail(ret, ret); |
1745 | reginsert_limits(BRACE_LIMITS, minval, maxval, ret); |
1746 | ++num_complex_braces; |
1747 | } |
1748 | if (minval > 0 && maxval > 0) |
1749 | *flagp = (HASWIDTH | (flags & (HASNL | HASLOOKBH))); |
1750 | break; |
1751 | } |
1752 | if (re_multi_type(peekchr()) != NOT_MULTI) { |
1753 | /* Can't have a multi follow a multi. */ |
1754 | if (peekchr() == Magic('*')) |
1755 | sprintf((char *)IObuff, _("E61: Nested %s*" ), |
1756 | reg_magic >= MAGIC_ON ? "" : "\\" ); |
1757 | else |
1758 | sprintf((char *)IObuff, _("E62: Nested %s%c" ), |
1759 | reg_magic == MAGIC_ALL ? "" : "\\" , no_Magic(peekchr())); |
1760 | EMSG_RET_NULL(IObuff); |
1761 | } |
1762 | |
1763 | return ret; |
1764 | } |
1765 | |
1766 | /* When making changes to classchars also change nfa_classcodes. */ |
1767 | static char_u *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU" ; |
1768 | static int classcodes[] = { |
1769 | ANY, IDENT, SIDENT, KWORD, SKWORD, |
1770 | FNAME, SFNAME, PRINT, SPRINT, |
1771 | WHITE, NWHITE, DIGIT, NDIGIT, |
1772 | HEX, NHEX, OCTAL, NOCTAL, |
1773 | WORD, NWORD, HEAD, NHEAD, |
1774 | ALPHA, NALPHA, LOWER, NLOWER, |
1775 | UPPER, NUPPER |
1776 | }; |
1777 | |
1778 | /* |
1779 | * Parse the lowest level. |
1780 | * |
1781 | * Optimization: gobbles an entire sequence of ordinary characters so that |
1782 | * it can turn them into a single node, which is smaller to store and |
1783 | * faster to run. Don't do this when one_exactly is set. |
1784 | */ |
1785 | static char_u *regatom(int *flagp) |
1786 | { |
1787 | char_u *ret; |
1788 | int flags; |
1789 | int c; |
1790 | char_u *p; |
1791 | int = 0; |
1792 | int save_prev_at_start = prev_at_start; |
1793 | |
1794 | *flagp = WORST; /* Tentatively. */ |
1795 | |
1796 | c = getchr(); |
1797 | switch (c) { |
1798 | case Magic('^'): |
1799 | ret = regnode(BOL); |
1800 | break; |
1801 | |
1802 | case Magic('$'): |
1803 | ret = regnode(EOL); |
1804 | had_eol = TRUE; |
1805 | break; |
1806 | |
1807 | case Magic('<'): |
1808 | ret = regnode(BOW); |
1809 | break; |
1810 | |
1811 | case Magic('>'): |
1812 | ret = regnode(EOW); |
1813 | break; |
1814 | |
1815 | case Magic('_'): |
1816 | c = no_Magic(getchr()); |
1817 | if (c == '^') { /* "\_^" is start-of-line */ |
1818 | ret = regnode(BOL); |
1819 | break; |
1820 | } |
1821 | if (c == '$') { /* "\_$" is end-of-line */ |
1822 | ret = regnode(EOL); |
1823 | had_eol = TRUE; |
1824 | break; |
1825 | } |
1826 | |
1827 | extra = ADD_NL; |
1828 | *flagp |= HASNL; |
1829 | |
1830 | /* "\_[" is character range plus newline */ |
1831 | if (c == '[') |
1832 | goto collection; |
1833 | |
1834 | // "\_x" is character class plus newline |
1835 | FALLTHROUGH; |
1836 | |
1837 | /* |
1838 | * Character classes. |
1839 | */ |
1840 | case Magic('.'): |
1841 | case Magic('i'): |
1842 | case Magic('I'): |
1843 | case Magic('k'): |
1844 | case Magic('K'): |
1845 | case Magic('f'): |
1846 | case Magic('F'): |
1847 | case Magic('p'): |
1848 | case Magic('P'): |
1849 | case Magic('s'): |
1850 | case Magic('S'): |
1851 | case Magic('d'): |
1852 | case Magic('D'): |
1853 | case Magic('x'): |
1854 | case Magic('X'): |
1855 | case Magic('o'): |
1856 | case Magic('O'): |
1857 | case Magic('w'): |
1858 | case Magic('W'): |
1859 | case Magic('h'): |
1860 | case Magic('H'): |
1861 | case Magic('a'): |
1862 | case Magic('A'): |
1863 | case Magic('l'): |
1864 | case Magic('L'): |
1865 | case Magic('u'): |
1866 | case Magic('U'): |
1867 | p = vim_strchr(classchars, no_Magic(c)); |
1868 | if (p == NULL) |
1869 | EMSG_RET_NULL(_("E63: invalid use of \\_" )); |
1870 | /* When '.' is followed by a composing char ignore the dot, so that |
1871 | * the composing char is matched here. */ |
1872 | if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr())) { |
1873 | c = getchr(); |
1874 | goto do_multibyte; |
1875 | } |
1876 | ret = regnode(classcodes[p - classchars] + extra); |
1877 | *flagp |= HASWIDTH | SIMPLE; |
1878 | break; |
1879 | |
1880 | case Magic('n'): |
1881 | if (reg_string) { |
1882 | /* In a string "\n" matches a newline character. */ |
1883 | ret = regnode(EXACTLY); |
1884 | regc(NL); |
1885 | regc(NUL); |
1886 | *flagp |= HASWIDTH | SIMPLE; |
1887 | } else { |
1888 | /* In buffer text "\n" matches the end of a line. */ |
1889 | ret = regnode(NEWL); |
1890 | *flagp |= HASWIDTH | HASNL; |
1891 | } |
1892 | break; |
1893 | |
1894 | case Magic('('): |
1895 | if (one_exactly) |
1896 | EMSG_ONE_RET_NULL; |
1897 | ret = reg(REG_PAREN, &flags); |
1898 | if (ret == NULL) |
1899 | return NULL; |
1900 | *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH); |
1901 | break; |
1902 | |
1903 | case NUL: |
1904 | case Magic('|'): |
1905 | case Magic('&'): |
1906 | case Magic(')'): |
1907 | if (one_exactly) |
1908 | EMSG_ONE_RET_NULL; |
1909 | IEMSG_RET_NULL(_(e_internal)); // Supposed to be caught earlier. |
1910 | // NOTREACHED |
1911 | |
1912 | case Magic('='): |
1913 | case Magic('?'): |
1914 | case Magic('+'): |
1915 | case Magic('@'): |
1916 | case Magic('{'): |
1917 | case Magic('*'): |
1918 | c = no_Magic(c); |
1919 | sprintf((char *)IObuff, _("E64: %s%c follows nothing" ), |
1920 | (c == '*' ? reg_magic >= MAGIC_ON : reg_magic == MAGIC_ALL) |
1921 | ? "" : "\\" , c); |
1922 | EMSG_RET_NULL(IObuff); |
1923 | /* NOTREACHED */ |
1924 | |
1925 | case Magic('~'): /* previous substitute pattern */ |
1926 | if (reg_prev_sub != NULL) { |
1927 | char_u *lp; |
1928 | |
1929 | ret = regnode(EXACTLY); |
1930 | lp = reg_prev_sub; |
1931 | while (*lp != NUL) |
1932 | regc(*lp++); |
1933 | regc(NUL); |
1934 | if (*reg_prev_sub != NUL) { |
1935 | *flagp |= HASWIDTH; |
1936 | if ((lp - reg_prev_sub) == 1) |
1937 | *flagp |= SIMPLE; |
1938 | } |
1939 | } else |
1940 | EMSG_RET_NULL(_(e_nopresub)); |
1941 | break; |
1942 | |
1943 | case Magic('1'): |
1944 | case Magic('2'): |
1945 | case Magic('3'): |
1946 | case Magic('4'): |
1947 | case Magic('5'): |
1948 | case Magic('6'): |
1949 | case Magic('7'): |
1950 | case Magic('8'): |
1951 | case Magic('9'): |
1952 | { |
1953 | int refnum; |
1954 | |
1955 | refnum = c - Magic('0'); |
1956 | if (!seen_endbrace(refnum)) { |
1957 | return NULL; |
1958 | } |
1959 | ret = regnode(BACKREF + refnum); |
1960 | } |
1961 | break; |
1962 | |
1963 | case Magic('z'): |
1964 | { |
1965 | c = no_Magic(getchr()); |
1966 | switch (c) { |
1967 | case '(': if ((reg_do_extmatch & REX_SET) == 0) |
1968 | EMSG_RET_NULL(_(e_z_not_allowed)); |
1969 | if (one_exactly) |
1970 | EMSG_ONE_RET_NULL; |
1971 | ret = reg(REG_ZPAREN, &flags); |
1972 | if (ret == NULL) |
1973 | return NULL; |
1974 | *flagp |= flags & (HASWIDTH|SPSTART|HASNL|HASLOOKBH); |
1975 | re_has_z = REX_SET; |
1976 | break; |
1977 | |
1978 | case '1': |
1979 | case '2': |
1980 | case '3': |
1981 | case '4': |
1982 | case '5': |
1983 | case '6': |
1984 | case '7': |
1985 | case '8': |
1986 | case '9': if ((reg_do_extmatch & REX_USE) == 0) |
1987 | EMSG_RET_NULL(_(e_z1_not_allowed)); |
1988 | ret = regnode(ZREF + c - '0'); |
1989 | re_has_z = REX_USE; |
1990 | break; |
1991 | |
1992 | case 's': ret = regnode(MOPEN + 0); |
1993 | if (!re_mult_next("\\zs" )) { |
1994 | return NULL; |
1995 | } |
1996 | break; |
1997 | |
1998 | case 'e': ret = regnode(MCLOSE + 0); |
1999 | if (!re_mult_next("\\ze" )) { |
2000 | return NULL; |
2001 | } |
2002 | break; |
2003 | |
2004 | default: EMSG_RET_NULL(_("E68: Invalid character after \\z" )); |
2005 | } |
2006 | } |
2007 | break; |
2008 | |
2009 | case Magic('%'): |
2010 | { |
2011 | c = no_Magic(getchr()); |
2012 | switch (c) { |
2013 | /* () without a back reference */ |
2014 | case '(': |
2015 | if (one_exactly) |
2016 | EMSG_ONE_RET_NULL; |
2017 | ret = reg(REG_NPAREN, &flags); |
2018 | if (ret == NULL) |
2019 | return NULL; |
2020 | *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH); |
2021 | break; |
2022 | |
2023 | /* Catch \%^ and \%$ regardless of where they appear in the |
2024 | * pattern -- regardless of whether or not it makes sense. */ |
2025 | case '^': |
2026 | ret = regnode(RE_BOF); |
2027 | break; |
2028 | |
2029 | case '$': |
2030 | ret = regnode(RE_EOF); |
2031 | break; |
2032 | |
2033 | case '#': |
2034 | ret = regnode(CURSOR); |
2035 | break; |
2036 | |
2037 | case 'V': |
2038 | ret = regnode(RE_VISUAL); |
2039 | break; |
2040 | |
2041 | case 'C': |
2042 | ret = regnode(RE_COMPOSING); |
2043 | break; |
2044 | |
2045 | /* \%[abc]: Emit as a list of branches, all ending at the last |
2046 | * branch which matches nothing. */ |
2047 | case '[': |
2048 | if (one_exactly) /* doesn't nest */ |
2049 | EMSG_ONE_RET_NULL; |
2050 | { |
2051 | char_u *lastbranch; |
2052 | char_u *lastnode = NULL; |
2053 | char_u *br; |
2054 | |
2055 | ret = NULL; |
2056 | while ((c = getchr()) != ']') { |
2057 | if (c == NUL) |
2058 | EMSG2_RET_NULL(_(e_missing_sb), |
2059 | reg_magic == MAGIC_ALL); |
2060 | br = regnode(BRANCH); |
2061 | if (ret == NULL) { |
2062 | ret = br; |
2063 | } else { |
2064 | regtail(lastnode, br); |
2065 | if (reg_toolong) { |
2066 | return NULL; |
2067 | } |
2068 | } |
2069 | |
2070 | ungetchr(); |
2071 | one_exactly = TRUE; |
2072 | lastnode = regatom(flagp); |
2073 | one_exactly = FALSE; |
2074 | if (lastnode == NULL) |
2075 | return NULL; |
2076 | } |
2077 | if (ret == NULL) |
2078 | EMSG2_RET_NULL(_(e_empty_sb), |
2079 | reg_magic == MAGIC_ALL); |
2080 | lastbranch = regnode(BRANCH); |
2081 | br = regnode(NOTHING); |
2082 | if (ret != JUST_CALC_SIZE) { |
2083 | regtail(lastnode, br); |
2084 | regtail(lastbranch, br); |
2085 | /* connect all branches to the NOTHING |
2086 | * branch at the end */ |
2087 | for (br = ret; br != lastnode; ) { |
2088 | if (OP(br) == BRANCH) { |
2089 | regtail(br, lastbranch); |
2090 | if (reg_toolong) { |
2091 | return NULL; |
2092 | } |
2093 | br = OPERAND(br); |
2094 | } else |
2095 | br = regnext(br); |
2096 | } |
2097 | } |
2098 | *flagp &= ~(HASWIDTH | SIMPLE); |
2099 | break; |
2100 | } |
2101 | |
2102 | case 'd': /* %d123 decimal */ |
2103 | case 'o': /* %o123 octal */ |
2104 | case 'x': /* %xab hex 2 */ |
2105 | case 'u': /* %uabcd hex 4 */ |
2106 | case 'U': /* %U1234abcd hex 8 */ |
2107 | { |
2108 | int64_t i; |
2109 | |
2110 | switch (c) { |
2111 | case 'd': i = getdecchrs(); break; |
2112 | case 'o': i = getoctchrs(); break; |
2113 | case 'x': i = gethexchrs(2); break; |
2114 | case 'u': i = gethexchrs(4); break; |
2115 | case 'U': i = gethexchrs(8); break; |
2116 | default: i = -1; break; |
2117 | } |
2118 | |
2119 | if (i < 0 || i > INT_MAX) { |
2120 | EMSG2_RET_NULL(_("E678: Invalid character after %s%%[dxouU]" ), |
2121 | reg_magic == MAGIC_ALL); |
2122 | } |
2123 | if (use_multibytecode(i)) { |
2124 | ret = regnode(MULTIBYTECODE); |
2125 | } else { |
2126 | ret = regnode(EXACTLY); |
2127 | } |
2128 | if (i == 0) { |
2129 | regc(0x0a); |
2130 | } else { |
2131 | regmbc(i); |
2132 | } |
2133 | regc(NUL); |
2134 | *flagp |= HASWIDTH; |
2135 | break; |
2136 | } |
2137 | |
2138 | default: |
2139 | if (ascii_isdigit(c) || c == '<' || c == '>' |
2140 | || c == '\'') { |
2141 | uint32_t n = 0; |
2142 | int cmp; |
2143 | |
2144 | cmp = c; |
2145 | if (cmp == '<' || cmp == '>') |
2146 | c = getchr(); |
2147 | while (ascii_isdigit(c)) { |
2148 | n = n * 10 + (uint32_t)(c - '0'); |
2149 | c = getchr(); |
2150 | } |
2151 | if (c == '\'' && n == 0) { |
2152 | /* "\%'m", "\%<'m" and "\%>'m": Mark */ |
2153 | c = getchr(); |
2154 | ret = regnode(RE_MARK); |
2155 | if (ret == JUST_CALC_SIZE) |
2156 | regsize += 2; |
2157 | else { |
2158 | *regcode++ = c; |
2159 | *regcode++ = cmp; |
2160 | } |
2161 | break; |
2162 | } else if (c == 'l' || c == 'c' || c == 'v') { |
2163 | if (c == 'l') { |
2164 | ret = regnode(RE_LNUM); |
2165 | if (save_prev_at_start) { |
2166 | at_start = true; |
2167 | } |
2168 | } else if (c == 'c') { |
2169 | ret = regnode(RE_COL); |
2170 | } else { |
2171 | ret = regnode(RE_VCOL); |
2172 | } |
2173 | if (ret == JUST_CALC_SIZE) { |
2174 | regsize += 5; |
2175 | } else { |
2176 | // put the number and the optional |
2177 | // comparator after the opcode |
2178 | regcode = re_put_uint32(regcode, n); |
2179 | *regcode++ = cmp; |
2180 | } |
2181 | break; |
2182 | } |
2183 | } |
2184 | |
2185 | EMSG2_RET_NULL(_("E71: Invalid character after %s%%" ), |
2186 | reg_magic == MAGIC_ALL); |
2187 | } |
2188 | } |
2189 | break; |
2190 | |
2191 | case Magic('['): |
2192 | collection: |
2193 | { |
2194 | char_u *lp; |
2195 | |
2196 | /* |
2197 | * If there is no matching ']', we assume the '[' is a normal |
2198 | * character. This makes 'incsearch' and ":help [" work. |
2199 | */ |
2200 | lp = skip_anyof(regparse); |
2201 | if (*lp == ']') { /* there is a matching ']' */ |
2202 | int startc = -1; /* > 0 when next '-' is a range */ |
2203 | int endc; |
2204 | |
2205 | /* |
2206 | * In a character class, different parsing rules apply. |
2207 | * Not even \ is special anymore, nothing is. |
2208 | */ |
2209 | if (*regparse == '^') { /* Complement of range. */ |
2210 | ret = regnode(ANYBUT + extra); |
2211 | regparse++; |
2212 | } else |
2213 | ret = regnode(ANYOF + extra); |
2214 | |
2215 | /* At the start ']' and '-' mean the literal character. */ |
2216 | if (*regparse == ']' || *regparse == '-') { |
2217 | startc = *regparse; |
2218 | regc(*regparse++); |
2219 | } |
2220 | |
2221 | while (*regparse != NUL && *regparse != ']') { |
2222 | if (*regparse == '-') { |
2223 | ++regparse; |
2224 | /* The '-' is not used for a range at the end and |
2225 | * after or before a '\n'. */ |
2226 | if (*regparse == ']' || *regparse == NUL |
2227 | || startc == -1 |
2228 | || (regparse[0] == '\\' && regparse[1] == 'n')) { |
2229 | regc('-'); |
2230 | startc = '-'; /* [--x] is a range */ |
2231 | } else { |
2232 | /* Also accept "a-[.z.]" */ |
2233 | endc = 0; |
2234 | if (*regparse == '[') |
2235 | endc = get_coll_element(®parse); |
2236 | if (endc == 0) { |
2237 | if (has_mbyte) { |
2238 | endc = mb_ptr2char_adv((const char_u **)®parse); |
2239 | } else { |
2240 | endc = *regparse++; |
2241 | } |
2242 | } |
2243 | |
2244 | /* Handle \o40, \x20 and \u20AC style sequences */ |
2245 | if (endc == '\\' && !reg_cpo_lit) |
2246 | endc = coll_get_char(); |
2247 | |
2248 | if (startc > endc) { |
2249 | EMSG_RET_NULL(_(e_reverse_range)); |
2250 | } |
2251 | if (has_mbyte && ((*mb_char2len)(startc) > 1 |
2252 | || (*mb_char2len)(endc) > 1)) { |
2253 | // Limit to a range of 256 chars |
2254 | if (endc > startc + 256) { |
2255 | EMSG_RET_NULL(_(e_large_class)); |
2256 | } |
2257 | while (++startc <= endc) { |
2258 | regmbc(startc); |
2259 | } |
2260 | } else { |
2261 | while (++startc <= endc) |
2262 | regc(startc); |
2263 | } |
2264 | startc = -1; |
2265 | } |
2266 | } |
2267 | /* |
2268 | * Only "\]", "\^", "\]" and "\\" are special in Vi. Vim |
2269 | * accepts "\t", "\e", etc., but only when the 'l' flag in |
2270 | * 'cpoptions' is not included. |
2271 | */ |
2272 | else if (*regparse == '\\' |
2273 | && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL |
2274 | || (!reg_cpo_lit |
2275 | && vim_strchr(REGEXP_ABBR, |
2276 | regparse[1]) != NULL))) { |
2277 | regparse++; |
2278 | if (*regparse == 'n') { |
2279 | /* '\n' in range: also match NL */ |
2280 | if (ret != JUST_CALC_SIZE) { |
2281 | /* Using \n inside [^] does not change what |
2282 | * matches. "[^\n]" is the same as ".". */ |
2283 | if (*ret == ANYOF) { |
2284 | *ret = ANYOF + ADD_NL; |
2285 | *flagp |= HASNL; |
2286 | } |
2287 | /* else: must have had a \n already */ |
2288 | } |
2289 | regparse++; |
2290 | startc = -1; |
2291 | } else if (*regparse == 'd' |
2292 | || *regparse == 'o' |
2293 | || *regparse == 'x' |
2294 | || *regparse == 'u' |
2295 | || *regparse == 'U') { |
2296 | startc = coll_get_char(); |
2297 | if (startc == 0) |
2298 | regc(0x0a); |
2299 | else |
2300 | regmbc(startc); |
2301 | } else { |
2302 | startc = backslash_trans(*regparse++); |
2303 | regc(startc); |
2304 | } |
2305 | } else if (*regparse == '[') { |
2306 | int c_class; |
2307 | int cu; |
2308 | |
2309 | c_class = get_char_class(®parse); |
2310 | startc = -1; |
2311 | /* Characters assumed to be 8 bits! */ |
2312 | switch (c_class) { |
2313 | case CLASS_NONE: |
2314 | c_class = get_equi_class(®parse); |
2315 | if (c_class != 0) { |
2316 | /* produce equivalence class */ |
2317 | reg_equi_class(c_class); |
2318 | } else if ((c_class = |
2319 | get_coll_element(®parse)) != 0) { |
2320 | /* produce a collating element */ |
2321 | regmbc(c_class); |
2322 | } else { |
2323 | /* literal '[', allow [[-x] as a range */ |
2324 | startc = *regparse++; |
2325 | regc(startc); |
2326 | } |
2327 | break; |
2328 | case CLASS_ALNUM: |
2329 | for (cu = 1; cu < 128; cu++) { |
2330 | if (isalnum(cu)) { |
2331 | regmbc(cu); |
2332 | } |
2333 | } |
2334 | break; |
2335 | case CLASS_ALPHA: |
2336 | for (cu = 1; cu < 128; cu++) { |
2337 | if (isalpha(cu)) { |
2338 | regmbc(cu); |
2339 | } |
2340 | } |
2341 | break; |
2342 | case CLASS_BLANK: |
2343 | regc(' '); |
2344 | regc('\t'); |
2345 | break; |
2346 | case CLASS_CNTRL: |
2347 | for (cu = 1; cu <= 127; cu++) { |
2348 | if (iscntrl(cu)) { |
2349 | regmbc(cu); |
2350 | } |
2351 | } |
2352 | break; |
2353 | case CLASS_DIGIT: |
2354 | for (cu = 1; cu <= 127; cu++) { |
2355 | if (ascii_isdigit(cu)) { |
2356 | regmbc(cu); |
2357 | } |
2358 | } |
2359 | break; |
2360 | case CLASS_GRAPH: |
2361 | for (cu = 1; cu <= 127; cu++) { |
2362 | if (isgraph(cu)) { |
2363 | regmbc(cu); |
2364 | } |
2365 | } |
2366 | break; |
2367 | case CLASS_LOWER: |
2368 | for (cu = 1; cu <= 255; cu++) { |
2369 | if (mb_islower(cu) && cu != 170 && cu != 186) { |
2370 | regmbc(cu); |
2371 | } |
2372 | } |
2373 | break; |
2374 | case CLASS_PRINT: |
2375 | for (cu = 1; cu <= 255; cu++) { |
2376 | if (vim_isprintc(cu)) { |
2377 | regmbc(cu); |
2378 | } |
2379 | } |
2380 | break; |
2381 | case CLASS_PUNCT: |
2382 | for (cu = 1; cu < 128; cu++) { |
2383 | if (ispunct(cu)) { |
2384 | regmbc(cu); |
2385 | } |
2386 | } |
2387 | break; |
2388 | case CLASS_SPACE: |
2389 | for (cu = 9; cu <= 13; cu++) |
2390 | regc(cu); |
2391 | regc(' '); |
2392 | break; |
2393 | case CLASS_UPPER: |
2394 | for (cu = 1; cu <= 255; cu++) { |
2395 | if (mb_isupper(cu)) { |
2396 | regmbc(cu); |
2397 | } |
2398 | } |
2399 | break; |
2400 | case CLASS_XDIGIT: |
2401 | for (cu = 1; cu <= 255; cu++) { |
2402 | if (ascii_isxdigit(cu)) { |
2403 | regmbc(cu); |
2404 | } |
2405 | } |
2406 | break; |
2407 | case CLASS_TAB: |
2408 | regc('\t'); |
2409 | break; |
2410 | case CLASS_RETURN: |
2411 | regc('\r'); |
2412 | break; |
2413 | case CLASS_BACKSPACE: |
2414 | regc('\b'); |
2415 | break; |
2416 | case CLASS_ESCAPE: |
2417 | regc(ESC); |
2418 | break; |
2419 | } |
2420 | } else { |
2421 | // produce a multibyte character, including any |
2422 | // following composing characters. |
2423 | startc = utf_ptr2char(regparse); |
2424 | int len = utfc_ptr2len(regparse); |
2425 | if (utf_char2len(startc) != len) { |
2426 | // composing chars |
2427 | startc = -1; |
2428 | } |
2429 | while (--len >= 0) { |
2430 | regc(*regparse++); |
2431 | } |
2432 | } |
2433 | } |
2434 | regc(NUL); |
2435 | prevchr_len = 1; /* last char was the ']' */ |
2436 | if (*regparse != ']') |
2437 | EMSG_RET_NULL(_(e_toomsbra)); /* Cannot happen? */ |
2438 | skipchr(); /* let's be friends with the lexer again */ |
2439 | *flagp |= HASWIDTH | SIMPLE; |
2440 | break; |
2441 | } else if (reg_strict) |
2442 | EMSG2_RET_NULL(_(e_missingbracket), reg_magic > MAGIC_OFF); |
2443 | } |
2444 | FALLTHROUGH; |
2445 | |
2446 | default: |
2447 | { |
2448 | int len; |
2449 | |
2450 | /* A multi-byte character is handled as a separate atom if it's |
2451 | * before a multi and when it's a composing char. */ |
2452 | if (use_multibytecode(c)) { |
2453 | do_multibyte: |
2454 | ret = regnode(MULTIBYTECODE); |
2455 | regmbc(c); |
2456 | *flagp |= HASWIDTH | SIMPLE; |
2457 | break; |
2458 | } |
2459 | |
2460 | ret = regnode(EXACTLY); |
2461 | |
2462 | /* |
2463 | * Append characters as long as: |
2464 | * - there is no following multi, we then need the character in |
2465 | * front of it as a single character operand |
2466 | * - not running into a Magic character |
2467 | * - "one_exactly" is not set |
2468 | * But always emit at least one character. Might be a Multi, |
2469 | * e.g., a "[" without matching "]". |
2470 | */ |
2471 | for (len = 0; c != NUL && (len == 0 |
2472 | || (re_multi_type(peekchr()) == NOT_MULTI |
2473 | && !one_exactly |
2474 | && !is_Magic(c))); ++len) { |
2475 | c = no_Magic(c); |
2476 | if (has_mbyte) { |
2477 | regmbc(c); |
2478 | if (enc_utf8) { |
2479 | int l; |
2480 | |
2481 | /* Need to get composing character too. */ |
2482 | for (;; ) { |
2483 | l = utf_ptr2len(regparse); |
2484 | if (!UTF_COMPOSINGLIKE(regparse, regparse + l)) |
2485 | break; |
2486 | regmbc(utf_ptr2char(regparse)); |
2487 | skipchr(); |
2488 | } |
2489 | } |
2490 | } else |
2491 | regc(c); |
2492 | c = getchr(); |
2493 | } |
2494 | ungetchr(); |
2495 | |
2496 | regc(NUL); |
2497 | *flagp |= HASWIDTH; |
2498 | if (len == 1) |
2499 | *flagp |= SIMPLE; |
2500 | } |
2501 | break; |
2502 | } |
2503 | |
2504 | return ret; |
2505 | } |
2506 | |
2507 | /// Used in a place where no * or \+ can follow. |
2508 | static bool re_mult_next(char *what) |
2509 | { |
2510 | if (re_multi_type(peekchr()) == MULTI_MULT) { |
2511 | EMSG2_RET_FAIL(_("E888: (NFA regexp) cannot repeat %s" ), what); |
2512 | } |
2513 | return true; |
2514 | } |
2515 | |
2516 | /* |
2517 | * Return TRUE if MULTIBYTECODE should be used instead of EXACTLY for |
2518 | * character "c". |
2519 | */ |
2520 | static int use_multibytecode(int c) |
2521 | { |
2522 | return has_mbyte && (*mb_char2len)(c) > 1 |
2523 | && (re_multi_type(peekchr()) != NOT_MULTI |
2524 | || (enc_utf8 && utf_iscomposing(c))); |
2525 | } |
2526 | |
2527 | /* |
2528 | * Emit a node. |
2529 | * Return pointer to generated code. |
2530 | */ |
2531 | static char_u *regnode(int op) |
2532 | { |
2533 | char_u *ret; |
2534 | |
2535 | ret = regcode; |
2536 | if (ret == JUST_CALC_SIZE) |
2537 | regsize += 3; |
2538 | else { |
2539 | *regcode++ = op; |
2540 | *regcode++ = NUL; /* Null "next" pointer. */ |
2541 | *regcode++ = NUL; |
2542 | } |
2543 | return ret; |
2544 | } |
2545 | |
2546 | /* |
2547 | * Emit (if appropriate) a byte of code |
2548 | */ |
2549 | static void regc(int b) |
2550 | { |
2551 | if (regcode == JUST_CALC_SIZE) |
2552 | regsize++; |
2553 | else |
2554 | *regcode++ = b; |
2555 | } |
2556 | |
2557 | /* |
2558 | * Emit (if appropriate) a multi-byte character of code |
2559 | */ |
2560 | static void regmbc(int c) |
2561 | { |
2562 | if (regcode == JUST_CALC_SIZE) { |
2563 | regsize += utf_char2len(c); |
2564 | } else { |
2565 | regcode += utf_char2bytes(c, regcode); |
2566 | } |
2567 | } |
2568 | |
2569 | /* |
2570 | * Insert an operator in front of already-emitted operand |
2571 | * |
2572 | * Means relocating the operand. |
2573 | */ |
2574 | static void reginsert(int op, char_u *opnd) |
2575 | { |
2576 | char_u *src; |
2577 | char_u *dst; |
2578 | char_u *place; |
2579 | |
2580 | if (regcode == JUST_CALC_SIZE) { |
2581 | regsize += 3; |
2582 | return; |
2583 | } |
2584 | src = regcode; |
2585 | regcode += 3; |
2586 | dst = regcode; |
2587 | while (src > opnd) |
2588 | *--dst = *--src; |
2589 | |
2590 | place = opnd; /* Op node, where operand used to be. */ |
2591 | *place++ = op; |
2592 | *place++ = NUL; |
2593 | *place = NUL; |
2594 | } |
2595 | |
2596 | /* |
2597 | * Insert an operator in front of already-emitted operand. |
2598 | * Add a number to the operator. |
2599 | */ |
2600 | static void reginsert_nr(int op, long val, char_u *opnd) |
2601 | { |
2602 | char_u *src; |
2603 | char_u *dst; |
2604 | char_u *place; |
2605 | |
2606 | if (regcode == JUST_CALC_SIZE) { |
2607 | regsize += 7; |
2608 | return; |
2609 | } |
2610 | src = regcode; |
2611 | regcode += 7; |
2612 | dst = regcode; |
2613 | while (src > opnd) |
2614 | *--dst = *--src; |
2615 | |
2616 | place = opnd; /* Op node, where operand used to be. */ |
2617 | *place++ = op; |
2618 | *place++ = NUL; |
2619 | *place++ = NUL; |
2620 | assert(val >= 0 && (uintmax_t)val <= UINT32_MAX); |
2621 | re_put_uint32(place, (uint32_t)val); |
2622 | } |
2623 | |
2624 | /* |
2625 | * Insert an operator in front of already-emitted operand. |
2626 | * The operator has the given limit values as operands. Also set next pointer. |
2627 | * |
2628 | * Means relocating the operand. |
2629 | */ |
2630 | static void reginsert_limits(int op, long minval, long maxval, char_u *opnd) |
2631 | { |
2632 | char_u *src; |
2633 | char_u *dst; |
2634 | char_u *place; |
2635 | |
2636 | if (regcode == JUST_CALC_SIZE) { |
2637 | regsize += 11; |
2638 | return; |
2639 | } |
2640 | src = regcode; |
2641 | regcode += 11; |
2642 | dst = regcode; |
2643 | while (src > opnd) |
2644 | *--dst = *--src; |
2645 | |
2646 | place = opnd; /* Op node, where operand used to be. */ |
2647 | *place++ = op; |
2648 | *place++ = NUL; |
2649 | *place++ = NUL; |
2650 | assert(minval >= 0 && (uintmax_t)minval <= UINT32_MAX); |
2651 | place = re_put_uint32(place, (uint32_t)minval); |
2652 | assert(maxval >= 0 && (uintmax_t)maxval <= UINT32_MAX); |
2653 | place = re_put_uint32(place, (uint32_t)maxval); |
2654 | regtail(opnd, place); |
2655 | } |
2656 | |
2657 | /* |
2658 | * Write a four bytes number at "p" and return pointer to the next char. |
2659 | */ |
2660 | static char_u *re_put_uint32(char_u *p, uint32_t val) |
2661 | { |
2662 | *p++ = (char_u) ((val >> 24) & 0377); |
2663 | *p++ = (char_u) ((val >> 16) & 0377); |
2664 | *p++ = (char_u) ((val >> 8) & 0377); |
2665 | *p++ = (char_u) (val & 0377); |
2666 | return p; |
2667 | } |
2668 | |
2669 | /* |
2670 | * Set the next-pointer at the end of a node chain. |
2671 | */ |
2672 | static void regtail(char_u *p, char_u *val) |
2673 | { |
2674 | char_u *scan; |
2675 | char_u *temp; |
2676 | int offset; |
2677 | |
2678 | if (p == JUST_CALC_SIZE) |
2679 | return; |
2680 | |
2681 | /* Find last node. */ |
2682 | scan = p; |
2683 | for (;; ) { |
2684 | temp = regnext(scan); |
2685 | if (temp == NULL) |
2686 | break; |
2687 | scan = temp; |
2688 | } |
2689 | |
2690 | if (OP(scan) == BACK) |
2691 | offset = (int)(scan - val); |
2692 | else |
2693 | offset = (int)(val - scan); |
2694 | /* When the offset uses more than 16 bits it can no longer fit in the two |
2695 | * bytes available. Use a global flag to avoid having to check return |
2696 | * values in too many places. */ |
2697 | if (offset > 0xffff) |
2698 | reg_toolong = TRUE; |
2699 | else { |
2700 | *(scan + 1) = (char_u) (((unsigned)offset >> 8) & 0377); |
2701 | *(scan + 2) = (char_u) (offset & 0377); |
2702 | } |
2703 | } |
2704 | |
2705 | /* |
2706 | * Like regtail, on item after a BRANCH; nop if none. |
2707 | */ |
2708 | static void regoptail(char_u *p, char_u *val) |
2709 | { |
2710 | /* When op is neither BRANCH nor BRACE_COMPLEX0-9, it is "operandless" */ |
2711 | if (p == NULL || p == JUST_CALC_SIZE |
2712 | || (OP(p) != BRANCH |
2713 | && (OP(p) < BRACE_COMPLEX || OP(p) > BRACE_COMPLEX + 9))) |
2714 | return; |
2715 | regtail(OPERAND(p), val); |
2716 | } |
2717 | |
2718 | /* |
2719 | * Functions for getting characters from the regexp input. |
2720 | */ |
2721 | |
2722 | /* |
2723 | * Start parsing at "str". |
2724 | */ |
2725 | static void initchr(char_u *str) |
2726 | { |
2727 | regparse = str; |
2728 | prevchr_len = 0; |
2729 | curchr = prevprevchr = prevchr = nextchr = -1; |
2730 | at_start = TRUE; |
2731 | prev_at_start = FALSE; |
2732 | } |
2733 | |
2734 | /* |
2735 | * Save the current parse state, so that it can be restored and parsing |
2736 | * starts in the same state again. |
2737 | */ |
2738 | static void save_parse_state(parse_state_T *ps) |
2739 | { |
2740 | ps->regparse = regparse; |
2741 | ps->prevchr_len = prevchr_len; |
2742 | ps->curchr = curchr; |
2743 | ps->prevchr = prevchr; |
2744 | ps->prevprevchr = prevprevchr; |
2745 | ps->nextchr = nextchr; |
2746 | ps->at_start = at_start; |
2747 | ps->prev_at_start = prev_at_start; |
2748 | ps->regnpar = regnpar; |
2749 | } |
2750 | |
2751 | /* |
2752 | * Restore a previously saved parse state. |
2753 | */ |
2754 | static void restore_parse_state(parse_state_T *ps) |
2755 | { |
2756 | regparse = ps->regparse; |
2757 | prevchr_len = ps->prevchr_len; |
2758 | curchr = ps->curchr; |
2759 | prevchr = ps->prevchr; |
2760 | prevprevchr = ps->prevprevchr; |
2761 | nextchr = ps->nextchr; |
2762 | at_start = ps->at_start; |
2763 | prev_at_start = ps->prev_at_start; |
2764 | regnpar = ps->regnpar; |
2765 | } |
2766 | |
2767 | |
2768 | /* |
2769 | * Get the next character without advancing. |
2770 | */ |
2771 | static int peekchr(void) |
2772 | { |
2773 | static int after_slash = FALSE; |
2774 | |
2775 | if (curchr != -1) { |
2776 | return curchr; |
2777 | } |
2778 | |
2779 | switch (curchr = regparse[0]) { |
2780 | case '.': |
2781 | case '[': |
2782 | case '~': |
2783 | /* magic when 'magic' is on */ |
2784 | if (reg_magic >= MAGIC_ON) |
2785 | curchr = Magic(curchr); |
2786 | break; |
2787 | case '(': |
2788 | case ')': |
2789 | case '{': |
2790 | case '%': |
2791 | case '+': |
2792 | case '=': |
2793 | case '?': |
2794 | case '@': |
2795 | case '!': |
2796 | case '&': |
2797 | case '|': |
2798 | case '<': |
2799 | case '>': |
2800 | case '#': /* future ext. */ |
2801 | case '"': /* future ext. */ |
2802 | case '\'': /* future ext. */ |
2803 | case ',': /* future ext. */ |
2804 | case '-': /* future ext. */ |
2805 | case ':': /* future ext. */ |
2806 | case ';': /* future ext. */ |
2807 | case '`': /* future ext. */ |
2808 | case '/': /* Can't be used in / command */ |
2809 | /* magic only after "\v" */ |
2810 | if (reg_magic == MAGIC_ALL) |
2811 | curchr = Magic(curchr); |
2812 | break; |
2813 | case '*': |
2814 | /* * is not magic as the very first character, eg "?*ptr", when |
2815 | * after '^', eg "/^*ptr" and when after "\(", "\|", "\&". But |
2816 | * "\(\*" is not magic, thus must be magic if "after_slash" */ |
2817 | if (reg_magic >= MAGIC_ON |
2818 | && !at_start |
2819 | && !(prev_at_start && prevchr == Magic('^')) |
2820 | && (after_slash |
2821 | || (prevchr != Magic('(') |
2822 | && prevchr != Magic('&') |
2823 | && prevchr != Magic('|')))) |
2824 | curchr = Magic('*'); |
2825 | break; |
2826 | case '^': |
2827 | /* '^' is only magic as the very first character and if it's after |
2828 | * "\(", "\|", "\&' or "\n" */ |
2829 | if (reg_magic >= MAGIC_OFF |
2830 | && (at_start |
2831 | || reg_magic == MAGIC_ALL |
2832 | || prevchr == Magic('(') |
2833 | || prevchr == Magic('|') |
2834 | || prevchr == Magic('&') |
2835 | || prevchr == Magic('n') |
2836 | || (no_Magic(prevchr) == '(' |
2837 | && prevprevchr == Magic('%')))) { |
2838 | curchr = Magic('^'); |
2839 | at_start = TRUE; |
2840 | prev_at_start = FALSE; |
2841 | } |
2842 | break; |
2843 | case '$': |
2844 | /* '$' is only magic as the very last char and if it's in front of |
2845 | * either "\|", "\)", "\&", or "\n" */ |
2846 | if (reg_magic >= MAGIC_OFF) { |
2847 | char_u *p = regparse + 1; |
2848 | bool is_magic_all = (reg_magic == MAGIC_ALL); |
2849 | |
2850 | // ignore \c \C \m \M \v \V and \Z after '$' |
2851 | while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C' |
2852 | || p[1] == 'm' || p[1] == 'M' |
2853 | || p[1] == 'v' || p[1] == 'V' |
2854 | || p[1] == 'Z')) { |
2855 | if (p[1] == 'v') { |
2856 | is_magic_all = true; |
2857 | } else if (p[1] == 'm' || p[1] == 'M' || p[1] == 'V') { |
2858 | is_magic_all = false; |
2859 | } |
2860 | p += 2; |
2861 | } |
2862 | if (p[0] == NUL |
2863 | || (p[0] == '\\' |
2864 | && (p[1] == '|' || p[1] == '&' || p[1] == ')' |
2865 | || p[1] == 'n')) |
2866 | || (is_magic_all |
2867 | && (p[0] == '|' || p[0] == '&' || p[0] == ')')) |
2868 | || reg_magic == MAGIC_ALL) { |
2869 | curchr = Magic('$'); |
2870 | } |
2871 | } |
2872 | break; |
2873 | case '\\': |
2874 | { |
2875 | int c = regparse[1]; |
2876 | |
2877 | if (c == NUL) |
2878 | curchr = '\\'; /* trailing '\' */ |
2879 | else if ( |
2880 | c <= '~' && META_flags[c] |
2881 | ) { |
2882 | /* |
2883 | * META contains everything that may be magic sometimes, |
2884 | * except ^ and $ ("\^" and "\$" are only magic after |
2885 | * "\V"). We now fetch the next character and toggle its |
2886 | * magicness. Therefore, \ is so meta-magic that it is |
2887 | * not in META. |
2888 | */ |
2889 | curchr = -1; |
2890 | prev_at_start = at_start; |
2891 | at_start = FALSE; /* be able to say "/\*ptr" */ |
2892 | ++regparse; |
2893 | ++after_slash; |
2894 | peekchr(); |
2895 | --regparse; |
2896 | --after_slash; |
2897 | curchr = toggle_Magic(curchr); |
2898 | } else if (vim_strchr(REGEXP_ABBR, c)) { |
2899 | /* |
2900 | * Handle abbreviations, like "\t" for TAB -- webb |
2901 | */ |
2902 | curchr = backslash_trans(c); |
2903 | } else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^')) |
2904 | curchr = toggle_Magic(c); |
2905 | else { |
2906 | /* |
2907 | * Next character can never be (made) magic? |
2908 | * Then backslashing it won't do anything. |
2909 | */ |
2910 | curchr = utf_ptr2char(regparse + 1); |
2911 | } |
2912 | break; |
2913 | } |
2914 | |
2915 | default: |
2916 | curchr = utf_ptr2char(regparse); |
2917 | } |
2918 | |
2919 | return curchr; |
2920 | } |
2921 | |
2922 | /* |
2923 | * Eat one lexed character. Do this in a way that we can undo it. |
2924 | */ |
2925 | static void skipchr(void) |
2926 | { |
2927 | /* peekchr() eats a backslash, do the same here */ |
2928 | if (*regparse == '\\') |
2929 | prevchr_len = 1; |
2930 | else |
2931 | prevchr_len = 0; |
2932 | if (regparse[prevchr_len] != NUL) { |
2933 | // Exclude composing chars that utfc_ptr2len does include. |
2934 | prevchr_len += utf_ptr2len(regparse + prevchr_len); |
2935 | } |
2936 | regparse += prevchr_len; |
2937 | prev_at_start = at_start; |
2938 | at_start = FALSE; |
2939 | prevprevchr = prevchr; |
2940 | prevchr = curchr; |
2941 | curchr = nextchr; /* use previously unget char, or -1 */ |
2942 | nextchr = -1; |
2943 | } |
2944 | |
2945 | /* |
2946 | * Skip a character while keeping the value of prev_at_start for at_start. |
2947 | * prevchr and prevprevchr are also kept. |
2948 | */ |
2949 | static void skipchr_keepstart(void) |
2950 | { |
2951 | int as = prev_at_start; |
2952 | int pr = prevchr; |
2953 | int prpr = prevprevchr; |
2954 | |
2955 | skipchr(); |
2956 | at_start = as; |
2957 | prevchr = pr; |
2958 | prevprevchr = prpr; |
2959 | } |
2960 | |
// Fetch the next character from the pattern and advance past it.  The lexing
// (which knows about magic and such) is done by peekchr().
static int getchr(void)
{
  const int lexed = peekchr();

  skipchr();
  return lexed;
}
2972 | |
2973 | /* |
2974 | * put character back. Works only once! |
2975 | */ |
2976 | static void ungetchr(void) |
2977 | { |
2978 | nextchr = curchr; |
2979 | curchr = prevchr; |
2980 | prevchr = prevprevchr; |
2981 | at_start = prev_at_start; |
2982 | prev_at_start = FALSE; |
2983 | |
2984 | /* Backup regparse, so that it's at the same position as before the |
2985 | * getchr(). */ |
2986 | regparse -= prevchr_len; |
2987 | } |
2988 | |
2989 | /* |
2990 | * Get and return the value of the hex string at the current position. |
2991 | * Return -1 if there is no valid hex number. |
2992 | * The position is updated: |
2993 | * blahblah\%x20asdf |
2994 | * before-^ ^-after |
2995 | * The parameter controls the maximum number of input characters. This will be |
2996 | * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence. |
2997 | */ |
2998 | static int64_t gethexchrs(int maxinputlen) |
2999 | { |
3000 | int64_t nr = 0; |
3001 | int c; |
3002 | int i; |
3003 | |
3004 | for (i = 0; i < maxinputlen; ++i) { |
3005 | c = regparse[0]; |
3006 | if (!ascii_isxdigit(c)) |
3007 | break; |
3008 | nr <<= 4; |
3009 | nr |= hex2nr(c); |
3010 | ++regparse; |
3011 | } |
3012 | |
3013 | if (i == 0) |
3014 | return -1; |
3015 | return nr; |
3016 | } |
3017 | |
3018 | /* |
3019 | * Get and return the value of the decimal string immediately after the |
3020 | * current position. Return -1 for invalid. Consumes all digits. |
3021 | */ |
3022 | static int64_t getdecchrs(void) |
3023 | { |
3024 | int64_t nr = 0; |
3025 | int c; |
3026 | int i; |
3027 | |
3028 | for (i = 0;; ++i) { |
3029 | c = regparse[0]; |
3030 | if (c < '0' || c > '9') |
3031 | break; |
3032 | nr *= 10; |
3033 | nr += c - '0'; |
3034 | ++regparse; |
3035 | curchr = -1; /* no longer valid */ |
3036 | } |
3037 | |
3038 | if (i == 0) |
3039 | return -1; |
3040 | return nr; |
3041 | } |
3042 | |
3043 | /* |
3044 | * get and return the value of the octal string immediately after the current |
3045 | * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle |
3046 | * numbers > 377 correctly (for example, 400 is treated as 40) and doesn't |
3047 | * treat 8 or 9 as recognised characters. Position is updated: |
3048 | * blahblah\%o210asdf |
3049 | * before-^ ^-after |
3050 | */ |
3051 | static int64_t getoctchrs(void) |
3052 | { |
3053 | int64_t nr = 0; |
3054 | int c; |
3055 | int i; |
3056 | |
3057 | for (i = 0; i < 3 && nr < 040; i++) { // -V536 |
3058 | c = regparse[0]; |
3059 | if (c < '0' || c > '7') |
3060 | break; |
3061 | nr <<= 3; |
3062 | nr |= hex2nr(c); |
3063 | ++regparse; |
3064 | } |
3065 | |
3066 | if (i == 0) |
3067 | return -1; |
3068 | return nr; |
3069 | } |
3070 | |
3071 | /* |
3072 | * Get a number after a backslash that is inside []. |
3073 | * When nothing is recognized return a backslash. |
3074 | */ |
3075 | static int coll_get_char(void) |
3076 | { |
3077 | int64_t nr = -1; |
3078 | |
3079 | switch (*regparse++) { |
3080 | case 'd': nr = getdecchrs(); break; |
3081 | case 'o': nr = getoctchrs(); break; |
3082 | case 'x': nr = gethexchrs(2); break; |
3083 | case 'u': nr = gethexchrs(4); break; |
3084 | case 'U': nr = gethexchrs(8); break; |
3085 | } |
3086 | if (nr < 0 || nr > INT_MAX) { |
3087 | // If getting the number fails be backwards compatible: the character |
3088 | // is a backslash. |
3089 | regparse--; |
3090 | nr = '\\'; |
3091 | } |
3092 | return nr; |
3093 | } |
3094 | |
3095 | /* |
3096 | * read_limits - Read two integers to be taken as a minimum and maximum. |
3097 | * If the first character is '-', then the range is reversed. |
3098 | * Should end with 'end'. If minval is missing, zero is default, if maxval is |
3099 | * missing, a very big number is the default. |
3100 | */ |
3101 | static int read_limits(long *minval, long *maxval) |
3102 | { |
3103 | int reverse = FALSE; |
3104 | char_u *first_char; |
3105 | long tmp; |
3106 | |
3107 | if (*regparse == '-') { |
3108 | // Starts with '-', so reverse the range later. |
3109 | regparse++; |
3110 | reverse = TRUE; |
3111 | } |
3112 | first_char = regparse; |
3113 | *minval = getdigits_long(®parse, false, 0); |
3114 | if (*regparse == ',') { // There is a comma. |
3115 | if (ascii_isdigit(*++regparse)) { |
3116 | *maxval = getdigits_long(®parse, false, MAX_LIMIT); |
3117 | } else { |
3118 | *maxval = MAX_LIMIT; |
3119 | } |
3120 | } else if (ascii_isdigit(*first_char)) { |
3121 | *maxval = *minval; // It was \{n} or \{-n} |
3122 | } else { |
3123 | *maxval = MAX_LIMIT; // It was \{} or \{-} |
3124 | } |
3125 | if (*regparse == '\\') { |
3126 | regparse++; // Allow either \{...} or \{...\} |
3127 | } |
3128 | if (*regparse != '}') { |
3129 | sprintf((char *)IObuff, _("E554: Syntax error in %s{...}" ), |
3130 | reg_magic == MAGIC_ALL ? "" : "\\" ); |
3131 | EMSG_RET_FAIL(IObuff); |
3132 | } |
3133 | |
3134 | /* |
3135 | * Reverse the range if there was a '-', or make sure it is in the right |
3136 | * order otherwise. |
3137 | */ |
3138 | if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval)) { |
3139 | tmp = *minval; |
3140 | *minval = *maxval; |
3141 | *maxval = tmp; |
3142 | } |
3143 | skipchr(); /* let's be friends with the lexer again */ |
3144 | return OK; |
3145 | } |
3146 | |
3147 | /* |
3148 | * vim_regexec and friends |
3149 | */ |
3150 | |
3151 | /* |
3152 | * Global work variables for vim_regexec(). |
3153 | */ |
3154 | |
3155 | /* The current match-position is remembered with these variables: */ |
3156 | static linenr_T reglnum; /* line number, relative to first line */ |
3157 | static char_u *regline; /* start of current line */ |
3158 | static char_u *reginput; /* current input, points into "regline" */ |
3159 | |
3160 | static int need_clear_subexpr; /* subexpressions still need to be |
3161 | * cleared */ |
3162 | static int need_clear_zsubexpr = FALSE; /* extmatch subexpressions |
3163 | * still need to be cleared */ |
3164 | |
3165 | |
// Save a sub-expression before attempting a match: through save_se_multi()
// with "posp" when REG_MULTI, through save_se_one() with "pp" otherwise.
#define save_se(savep, posp, pp) \
  REG_MULTI ? save_se_multi((savep), (posp)) : save_se_one((savep), (pp))

// After a failed match restore the sub-expression saved in "savep": into
// "*posp" when REG_MULTI, into "*pp" otherwise.
#define restore_se(savep, posp, pp) { \
    if (REG_MULTI) \
      *(posp) = (savep)->se_u.pos; \
    else \
      *(pp) = (savep)->se_u.ptr; }
3176 | |
3177 | |
3178 | #ifdef REGEXP_DEBUG |
3179 | int regnarrate = 0; |
3180 | #endif |
3181 | |
3182 | // Sometimes need to save a copy of a line. Since alloc()/free() is very |
3183 | // slow, we keep one allocated piece of memory and only re-allocate it when |
3184 | // it's too small. It's freed in bt_regexec_both() when finished. |
3185 | static char_u *reg_tofree = NULL; |
3186 | static unsigned reg_tofreelen; |
3187 | |
3188 | // Structure used to store the execution state of the regex engine. |
3189 | // Which ones are set depends on whether a single-line or multi-line match is |
3190 | // done: |
3191 | // single-line multi-line |
3192 | // reg_match ®match_T NULL |
3193 | // reg_mmatch NULL ®mmatch_T |
3194 | // reg_startp reg_match->startp <invalid> |
3195 | // reg_endp reg_match->endp <invalid> |
3196 | // reg_startpos <invalid> reg_mmatch->startpos |
3197 | // reg_endpos <invalid> reg_mmatch->endpos |
3198 | // reg_win NULL window in which to search |
3199 | // reg_buf curbuf buffer in which to search |
3200 | // reg_firstlnum <invalid> first line in which to search |
3201 | // reg_maxline 0 last line nr |
3202 | // reg_line_lbr false or true false |
3203 | typedef struct { |
3204 | regmatch_T *reg_match; |
3205 | regmmatch_T *reg_mmatch; |
3206 | char_u **reg_startp; |
3207 | char_u **reg_endp; |
3208 | lpos_T *reg_startpos; |
3209 | lpos_T *reg_endpos; |
3210 | win_T *reg_win; |
3211 | buf_T *reg_buf; |
3212 | linenr_T reg_firstlnum; |
3213 | linenr_T reg_maxline; |
3214 | bool reg_line_lbr; // "\n" in string is line break |
3215 | |
3216 | // Internal copy of 'ignorecase'. It is set at each call to vim_regexec(). |
3217 | // Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern |
3218 | // contains '\c' or '\C' the value is overruled. |
3219 | bool reg_ic; |
3220 | |
3221 | // Similar to rex.reg_ic, but only for 'combining' characters. Set with \Z |
3222 | // flag in the regexp. Defaults to false, always. |
3223 | bool reg_icombine; |
3224 | |
3225 | // Copy of "rmm_maxcol": maximum column to search for a match. Zero when |
3226 | // there is no maximum. |
3227 | colnr_T reg_maxcol; |
3228 | } regexec_T; |
3229 | |
3230 | static regexec_T rex; |
3231 | static bool rex_in_use = false; |
3232 | |
3233 | /* |
3234 | * "regstack" and "backpos" are used by regmatch(). They are kept over calls |
3235 | * to avoid invoking malloc() and free() often. |
3236 | * "regstack" is a stack with regitem_T items, sometimes preceded by regstar_T |
3237 | * or regbehind_T. |
3238 | * "backpos_T" is a table with backpos_T for BACK |
3239 | */ |
3240 | static garray_T regstack = GA_EMPTY_INIT_VALUE; |
3241 | static garray_T backpos = GA_EMPTY_INIT_VALUE; |
3242 | |
3243 | /* |
3244 | * Both for regstack and backpos tables we use the following strategy of |
3245 | * allocation (to reduce malloc/free calls): |
3246 | * - Initial size is fairly small. |
3247 | * - When needed, the tables are grown bigger (8 times at first, double after |
3248 | * that). |
3249 | * - After executing the match we free the memory only if the array has grown. |
3250 | * Thus the memory is kept allocated when it's at the initial size. |
3251 | * This makes it fast while not keeping a lot of memory allocated. |
3252 | * A three times speed increase was observed when using many simple patterns. |
3253 | */ |
3254 | #define REGSTACK_INITIAL 2048 |
3255 | #define BACKPOS_INITIAL 64 |
3256 | |
#if defined(EXITFREE)
// Release the memory this file keeps allocated between calls: the "regstack"
// and "backpos" arrays, "reg_tofree" and "reg_prev_sub".
void free_regexp_stuff(void)
{
  ga_clear(&regstack);
  ga_clear(&backpos);
  xfree(reg_tofree);
  xfree(reg_prev_sub);
}

#endif
3267 | |
3268 | /* |
3269 | * Get pointer to the line "lnum", which is relative to "reg_firstlnum". |
3270 | */ |
3271 | static char_u *reg_getline(linenr_T lnum) |
3272 | { |
3273 | // when looking behind for a match/no-match lnum is negative. But we |
3274 | // can't go before line 1 |
3275 | if (rex.reg_firstlnum + lnum < 1) { |
3276 | return NULL; |
3277 | } |
3278 | if (lnum > rex.reg_maxline) { |
3279 | // Must have matched the "\n" in the last line. |
3280 | return (char_u *)"" ; |
3281 | } |
3282 | return ml_get_buf(rex.reg_buf, rex.reg_firstlnum + lnum, false); |
3283 | } |
3284 | |
3285 | static regsave_T behind_pos; |
3286 | |
3287 | static char_u *reg_startzp[NSUBEXP]; /* Workspace to mark beginning */ |
3288 | static char_u *reg_endzp[NSUBEXP]; /* and end of \z(...\) matches */ |
3289 | static lpos_T reg_startzpos[NSUBEXP]; /* idem, beginning pos */ |
3290 | static lpos_T reg_endzpos[NSUBEXP]; /* idem, end pos */ |
3291 | |
3292 | // TRUE if using multi-line regexp. |
3293 | #define REG_MULTI (rex.reg_match == NULL) |
3294 | |
3295 | /* |
3296 | * Match a regexp against a string. |
3297 | * "rmp->regprog" is a compiled regexp as returned by vim_regcomp(). |
3298 | * Uses curbuf for line count and 'iskeyword'. |
3299 | * If "line_lbr" is true, consider a "\n" in "line" to be a line break. |
3300 | * |
3301 | * Returns 0 for failure, number of lines contained in the match otherwise. |
3302 | */ |
3303 | static int |
3304 | bt_regexec_nl ( |
3305 | regmatch_T *rmp, |
3306 | char_u *line, /* string to match against */ |
3307 | colnr_T col, /* column to start looking for match */ |
3308 | bool line_lbr |
3309 | ) |
3310 | { |
3311 | rex.reg_match = rmp; |
3312 | rex.reg_mmatch = NULL; |
3313 | rex.reg_maxline = 0; |
3314 | rex.reg_line_lbr = line_lbr; |
3315 | rex.reg_buf = curbuf; |
3316 | rex.reg_win = NULL; |
3317 | rex.reg_ic = rmp->rm_ic; |
3318 | rex.reg_icombine = false; |
3319 | rex.reg_maxcol = 0; |
3320 | |
3321 | long r = bt_regexec_both(line, col, NULL, NULL); |
3322 | assert(r <= INT_MAX); |
3323 | return (int)r; |
3324 | } |
3325 | |
3326 | /// Wrapper around strchr which accounts for case-insensitive searches and |
3327 | /// non-ASCII characters. |
3328 | /// |
3329 | /// This function is used a lot for simple searches, keep it fast! |
3330 | /// |
3331 | /// @param s string to search |
3332 | /// @param c character to find in @a s |
3333 | /// |
3334 | /// @return NULL if no match, otherwise pointer to the position in @a s |
3335 | static inline char_u *cstrchr(const char_u *const s, const int c) |
3336 | FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL |
3337 | FUNC_ATTR_ALWAYS_INLINE |
3338 | { |
3339 | if (!rex.reg_ic) { |
3340 | return vim_strchr(s, c); |
3341 | } |
3342 | |
3343 | // Use folded case for UTF-8, slow! For ASCII use libc strpbrk which is |
3344 | // expected to be highly optimized. |
3345 | if (c > 0x80) { |
3346 | const int folded_c = utf_fold(c); |
3347 | for (const char_u *p = s; *p != NUL; p += utfc_ptr2len(p)) { |
3348 | if (utf_fold(utf_ptr2char(p)) == folded_c) { |
3349 | return (char_u *)p; |
3350 | } |
3351 | } |
3352 | return NULL; |
3353 | } |
3354 | |
3355 | int cc; |
3356 | if (ASCII_ISUPPER(c)) { |
3357 | cc = TOLOWER_ASC(c); |
3358 | } else if (ASCII_ISLOWER(c)) { |
3359 | cc = TOUPPER_ASC(c); |
3360 | } else { |
3361 | return vim_strchr(s, c); |
3362 | } |
3363 | |
3364 | char tofind[] = { (char)c, (char)cc, NUL }; |
3365 | return (char_u *)strpbrk((const char *)s, tofind); |
3366 | } |
3367 | |
3368 | /// Matches a regexp against multiple lines. |
3369 | /// "rmp->regprog" is a compiled regexp as returned by vim_regcomp(). |
3370 | /// Uses curbuf for line count and 'iskeyword'. |
3371 | /// |
3372 | /// @param win Window in which to search or NULL |
3373 | /// @param buf Buffer in which to search |
3374 | /// @param lnum Number of line to start looking for match |
3375 | /// @param col Column to start looking for match |
3376 | /// @param tm Timeout limit or NULL |
3377 | /// |
3378 | /// @return zero if there is no match and number of lines contained in the match |
3379 | /// otherwise. |
3380 | static long bt_regexec_multi(regmmatch_T *rmp, win_T *win, buf_T *buf, |
3381 | linenr_T lnum, colnr_T col, |
3382 | proftime_T *tm, int *timed_out) |
3383 | { |
3384 | rex.reg_match = NULL; |
3385 | rex.reg_mmatch = rmp; |
3386 | rex.reg_buf = buf; |
3387 | rex.reg_win = win; |
3388 | rex.reg_firstlnum = lnum; |
3389 | rex.reg_maxline = rex.reg_buf->b_ml.ml_line_count - lnum; |
3390 | rex.reg_line_lbr = false; |
3391 | rex.reg_ic = rmp->rmm_ic; |
3392 | rex.reg_icombine = false; |
3393 | rex.reg_maxcol = rmp->rmm_maxcol; |
3394 | |
3395 | return bt_regexec_both(NULL, col, tm, timed_out); |
3396 | } |
3397 | |
3398 | /// Match a regexp against a string ("line" points to the string) or multiple |
3399 | /// lines ("line" is NULL, use reg_getline()). |
3400 | /// @return 0 for failure, or number of lines contained in the match. |
3401 | static long bt_regexec_both(char_u *line, |
3402 | colnr_T col, // column to start search |
3403 | proftime_T *tm, // timeout limit or NULL |
3404 | int *timed_out) // flag set on timeout or NULL |
3405 | { |
3406 | bt_regprog_T *prog; |
3407 | char_u *s; |
3408 | long retval = 0L; |
3409 | |
3410 | /* Create "regstack" and "backpos" if they are not allocated yet. |
3411 | * We allocate *_INITIAL amount of bytes first and then set the grow size |
3412 | * to much bigger value to avoid many malloc calls in case of deep regular |
3413 | * expressions. */ |
3414 | if (regstack.ga_data == NULL) { |
3415 | /* Use an item size of 1 byte, since we push different things |
3416 | * onto the regstack. */ |
3417 | ga_init(®stack, 1, REGSTACK_INITIAL); |
3418 | ga_grow(®stack, REGSTACK_INITIAL); |
3419 | ga_set_growsize(®stack, REGSTACK_INITIAL * 8); |
3420 | } |
3421 | |
3422 | if (backpos.ga_data == NULL) { |
3423 | ga_init(&backpos, sizeof(backpos_T), BACKPOS_INITIAL); |
3424 | ga_grow(&backpos, BACKPOS_INITIAL); |
3425 | ga_set_growsize(&backpos, BACKPOS_INITIAL * 8); |
3426 | } |
3427 | |
3428 | if (REG_MULTI) { |
3429 | prog = (bt_regprog_T *)rex.reg_mmatch->regprog; |
3430 | line = reg_getline((linenr_T)0); |
3431 | rex.reg_startpos = rex.reg_mmatch->startpos; |
3432 | rex.reg_endpos = rex.reg_mmatch->endpos; |
3433 | } else { |
3434 | prog = (bt_regprog_T *)rex.reg_match->regprog; |
3435 | rex.reg_startp = rex.reg_match->startp; |
3436 | rex.reg_endp = rex.reg_match->endp; |
3437 | } |
3438 | |
3439 | /* Be paranoid... */ |
3440 | if (prog == NULL || line == NULL) { |
3441 | EMSG(_(e_null)); |
3442 | goto theend; |
3443 | } |
3444 | |
3445 | /* Check validity of program. */ |
3446 | if (prog_magic_wrong()) |
3447 | goto theend; |
3448 | |
3449 | // If the start column is past the maximum column: no need to try. |
3450 | if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol) { |
3451 | goto theend; |
3452 | } |
3453 | |
3454 | // If pattern contains "\c" or "\C": overrule value of rex.reg_ic |
3455 | if (prog->regflags & RF_ICASE) { |
3456 | rex.reg_ic = true; |
3457 | } else if (prog->regflags & RF_NOICASE) { |
3458 | rex.reg_ic = false; |
3459 | } |
3460 | |
3461 | // If pattern contains "\Z" overrule value of rex.reg_icombine |
3462 | if (prog->regflags & RF_ICOMBINE) { |
3463 | rex.reg_icombine = true; |
3464 | } |
3465 | |
3466 | /* If there is a "must appear" string, look for it. */ |
3467 | if (prog->regmust != NULL) { |
3468 | int c = utf_ptr2char(prog->regmust); |
3469 | s = line + col; |
3470 | |
3471 | // This is used very often, esp. for ":global". Use two versions of |
3472 | // the loop to avoid overhead of conditions. |
3473 | if (!rex.reg_ic) { |
3474 | while ((s = vim_strchr(s, c)) != NULL) { |
3475 | if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0) { |
3476 | break; // Found it. |
3477 | } |
3478 | MB_PTR_ADV(s); |
3479 | } |
3480 | } else { |
3481 | while ((s = cstrchr(s, c)) != NULL) { |
3482 | if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0) { |
3483 | break; // Found it. |
3484 | } |
3485 | MB_PTR_ADV(s); |
3486 | } |
3487 | } |
3488 | if (s == NULL) { // Not present. |
3489 | goto theend; |
3490 | } |
3491 | } |
3492 | |
3493 | regline = line; |
3494 | reglnum = 0; |
3495 | reg_toolong = FALSE; |
3496 | |
3497 | /* Simplest case: Anchored match need be tried only once. */ |
3498 | if (prog->reganch) { |
3499 | int c = utf_ptr2char(regline + col); |
3500 | if (prog->regstart == NUL |
3501 | || prog->regstart == c |
3502 | || (rex.reg_ic |
3503 | && (utf_fold(prog->regstart) == utf_fold(c) |
3504 | || (c < 255 && prog->regstart < 255 |
3505 | && mb_tolower(prog->regstart) == mb_tolower(c))))) { |
3506 | retval = regtry(prog, col, tm, timed_out); |
3507 | } else { |
3508 | retval = 0; |
3509 | } |
3510 | } else { |
3511 | int tm_count = 0; |
3512 | /* Messy cases: unanchored match. */ |
3513 | while (!got_int) { |
3514 | if (prog->regstart != NUL) { |
3515 | // Skip until the char we know it must start with. |
3516 | s = cstrchr(regline + col, prog->regstart); |
3517 | if (s == NULL) { |
3518 | retval = 0; |
3519 | break; |
3520 | } |
3521 | col = (int)(s - regline); |
3522 | } |
3523 | |
3524 | // Check for maximum column to try. |
3525 | if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol) { |
3526 | retval = 0; |
3527 | break; |
3528 | } |
3529 | |
3530 | retval = regtry(prog, col, tm, timed_out); |
3531 | if (retval > 0) { |
3532 | break; |
3533 | } |
3534 | |
3535 | /* if not currently on the first line, get it again */ |
3536 | if (reglnum != 0) { |
3537 | reglnum = 0; |
3538 | regline = reg_getline((linenr_T)0); |
3539 | } |
3540 | if (regline[col] == NUL) |
3541 | break; |
3542 | if (has_mbyte) |
3543 | col += (*mb_ptr2len)(regline + col); |
3544 | else |
3545 | ++col; |
3546 | /* Check for timeout once in a twenty times to avoid overhead. */ |
3547 | if (tm != NULL && ++tm_count == 20) { |
3548 | tm_count = 0; |
3549 | if (profile_passed_limit(*tm)) { |
3550 | if (timed_out != NULL) { |
3551 | *timed_out = true; |
3552 | } |
3553 | break; |
3554 | } |
3555 | } |
3556 | } |
3557 | } |
3558 | |
3559 | theend: |
3560 | /* Free "reg_tofree" when it's a bit big. |
3561 | * Free regstack and backpos if they are bigger than their initial size. */ |
3562 | if (reg_tofreelen > 400) { |
3563 | XFREE_CLEAR(reg_tofree); |
3564 | } |
3565 | if (regstack.ga_maxlen > REGSTACK_INITIAL) |
3566 | ga_clear(®stack); |
3567 | if (backpos.ga_maxlen > BACKPOS_INITIAL) |
3568 | ga_clear(&backpos); |
3569 | |
3570 | return retval; |
3571 | } |
3572 | |
3573 | |
3574 | /* |
3575 | * Create a new extmatch and mark it as referenced once. |
3576 | */ |
3577 | static reg_extmatch_T *make_extmatch(void) |
3578 | { |
3579 | reg_extmatch_T *em = xcalloc(1, sizeof(reg_extmatch_T)); |
3580 | em->refcnt = 1; |
3581 | return em; |
3582 | } |
3583 | |
3584 | /* |
3585 | * Add a reference to an extmatch. |
3586 | */ |
3587 | reg_extmatch_T *ref_extmatch(reg_extmatch_T *em) |
3588 | { |
3589 | if (em != NULL) |
3590 | em->refcnt++; |
3591 | return em; |
3592 | } |
3593 | |
3594 | /* |
3595 | * Remove a reference to an extmatch. If there are no references left, free |
3596 | * the info. |
3597 | */ |
3598 | void unref_extmatch(reg_extmatch_T *em) |
3599 | { |
3600 | int i; |
3601 | |
3602 | if (em != NULL && --em->refcnt <= 0) { |
3603 | for (i = 0; i < NSUBEXP; ++i) |
3604 | xfree(em->matches[i]); |
3605 | xfree(em); |
3606 | } |
3607 | } |
3608 | |
3609 | /// Try match of "prog" with at regline["col"]. |
3610 | /// @returns 0 for failure, or number of lines contained in the match. |
3611 | static long regtry(bt_regprog_T *prog, |
3612 | colnr_T col, |
3613 | proftime_T *tm, // timeout limit or NULL |
3614 | int *timed_out) // flag set on timeout or NULL |
3615 | { |
3616 | reginput = regline + col; |
3617 | need_clear_subexpr = TRUE; |
3618 | /* Clear the external match subpointers if necessary. */ |
3619 | if (prog->reghasz == REX_SET) |
3620 | need_clear_zsubexpr = TRUE; |
3621 | |
3622 | if (regmatch(prog->program + 1, tm, timed_out) == 0) { |
3623 | return 0; |
3624 | } |
3625 | |
3626 | cleanup_subexpr(); |
3627 | if (REG_MULTI) { |
3628 | if (rex.reg_startpos[0].lnum < 0) { |
3629 | rex.reg_startpos[0].lnum = 0; |
3630 | rex.reg_startpos[0].col = col; |
3631 | } |
3632 | if (rex.reg_endpos[0].lnum < 0) { |
3633 | rex.reg_endpos[0].lnum = reglnum; |
3634 | rex.reg_endpos[0].col = (int)(reginput - regline); |
3635 | } else { |
3636 | // Use line number of "\ze". |
3637 | reglnum = rex.reg_endpos[0].lnum; |
3638 | } |
3639 | } else { |
3640 | if (rex.reg_startp[0] == NULL) { |
3641 | rex.reg_startp[0] = regline + col; |
3642 | } |
3643 | if (rex.reg_endp[0] == NULL) { |
3644 | rex.reg_endp[0] = reginput; |
3645 | } |
3646 | } |
3647 | /* Package any found \z(...\) matches for export. Default is none. */ |
3648 | unref_extmatch(re_extmatch_out); |
3649 | re_extmatch_out = NULL; |
3650 | |
3651 | if (prog->reghasz == REX_SET) { |
3652 | int i; |
3653 | |
3654 | cleanup_zsubexpr(); |
3655 | re_extmatch_out = make_extmatch(); |
3656 | for (i = 0; i < NSUBEXP; i++) { |
3657 | if (REG_MULTI) { |
3658 | /* Only accept single line matches. */ |
3659 | if (reg_startzpos[i].lnum >= 0 |
3660 | && reg_endzpos[i].lnum == reg_startzpos[i].lnum |
3661 | && reg_endzpos[i].col >= reg_startzpos[i].col) { |
3662 | re_extmatch_out->matches[i] = |
3663 | vim_strnsave(reg_getline(reg_startzpos[i].lnum) |
3664 | + reg_startzpos[i].col, |
3665 | reg_endzpos[i].col |
3666 | - reg_startzpos[i].col); |
3667 | } |
3668 | } else { |
3669 | if (reg_startzp[i] != NULL && reg_endzp[i] != NULL) |
3670 | re_extmatch_out->matches[i] = |
3671 | vim_strnsave(reg_startzp[i], |
3672 | (int)(reg_endzp[i] - reg_startzp[i])); |
3673 | } |
3674 | } |
3675 | } |
3676 | return 1 + reglnum; |
3677 | } |
3678 | |
3679 | |
3680 | // Get class of previous character. |
3681 | static int reg_prev_class(void) |
3682 | { |
3683 | if (reginput > regline) { |
3684 | return mb_get_class_tab(reginput - 1 - utf_head_off(regline, reginput - 1), |
3685 | rex.reg_buf->b_chartab); |
3686 | } |
3687 | return -1; |
3688 | } |
3689 | |
3690 | |
3691 | // Return TRUE if the current reginput position matches the Visual area. |
3692 | static int reg_match_visual(void) |
3693 | { |
3694 | pos_T top, bot; |
3695 | linenr_T lnum; |
3696 | colnr_T col; |
3697 | win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win; |
3698 | int mode; |
3699 | colnr_T start, end; |
3700 | colnr_T start2, end2; |
3701 | |
3702 | // Check if the buffer is the current buffer. |
3703 | if (rex.reg_buf != curbuf || VIsual.lnum == 0) { |
3704 | return false; |
3705 | } |
3706 | |
3707 | if (VIsual_active) { |
3708 | if (lt(VIsual, wp->w_cursor)) { |
3709 | top = VIsual; |
3710 | bot = wp->w_cursor; |
3711 | } else { |
3712 | top = wp->w_cursor; |
3713 | bot = VIsual; |
3714 | } |
3715 | mode = VIsual_mode; |
3716 | } else { |
3717 | if (lt(curbuf->b_visual.vi_start, curbuf->b_visual.vi_end)) { |
3718 | top = curbuf->b_visual.vi_start; |
3719 | bot = curbuf->b_visual.vi_end; |
3720 | } else { |
3721 | top = curbuf->b_visual.vi_end; |
3722 | bot = curbuf->b_visual.vi_start; |
3723 | } |
3724 | mode = curbuf->b_visual.vi_mode; |
3725 | } |
3726 | lnum = reglnum + rex.reg_firstlnum; |
3727 | if (lnum < top.lnum || lnum > bot.lnum) { |
3728 | return false; |
3729 | } |
3730 | |
3731 | if (mode == 'v') { |
3732 | col = (colnr_T)(reginput - regline); |
3733 | if ((lnum == top.lnum && col < top.col) |
3734 | || (lnum == bot.lnum && col >= bot.col + (*p_sel != 'e'))) |
3735 | return FALSE; |
3736 | } else if (mode == Ctrl_V) { |
3737 | getvvcol(wp, &top, &start, NULL, &end); |
3738 | getvvcol(wp, &bot, &start2, NULL, &end2); |
3739 | if (start2 < start) |
3740 | start = start2; |
3741 | if (end2 > end) |
3742 | end = end2; |
3743 | if (top.col == MAXCOL || bot.col == MAXCOL) |
3744 | end = MAXCOL; |
3745 | unsigned int cols_u = win_linetabsize(wp, regline, |
3746 | (colnr_T)(reginput - regline)); |
3747 | assert(cols_u <= MAXCOL); |
3748 | colnr_T cols = (colnr_T)cols_u; |
3749 | if (cols < start || cols > end - (*p_sel == 'e')) |
3750 | return FALSE; |
3751 | } |
3752 | return TRUE; |
3753 | } |
3754 | |
#define ADVANCE_REGINPUT() MB_PTR_ADV(reginput)

// Storage for the arguments from BRACE_LIMITS.  They are really local to
// regmatch(), but are kept here to reduce the amount of stack space used
// (it can be called recursively many times).
static long bl_minval;
static long bl_maxval;
3764 | |
3765 | /// Main matching routine |
3766 | /// |
3767 | /// Conceptually the strategy is simple: Check to see whether the current node |
3768 | /// matches, push an item onto the regstack and loop to see whether the rest |
3769 | /// matches, and then act accordingly. In practice we make some effort to |
3770 | /// avoid using the regstack, in particular by going through "ordinary" nodes |
3771 | /// (that don't need to know whether the rest of the match failed) by a nested |
3772 | /// loop. |
3773 | /// |
3774 | /// Returns TRUE when there is a match. Leaves reginput and reglnum just after |
3775 | /// the last matched character. |
3776 | /// Returns FALSE when there is no match. Leaves reginput and reglnum in an |
3777 | /// undefined state! |
3778 | static int regmatch( |
3779 | char_u *scan, // Current node. |
3780 | proftime_T *tm, // timeout limit or NULL |
3781 | int *timed_out // flag set on timeout or NULL |
3782 | ) |
3783 | { |
3784 | char_u *next; /* Next node. */ |
3785 | int op; |
3786 | int c; |
3787 | regitem_T *rp; |
3788 | int no; |
3789 | int status; // one of the RA_ values: |
3790 | int tm_count = 0; |
3791 | #define RA_FAIL 1 // something failed, abort |
3792 | #define RA_CONT 2 // continue in inner loop |
3793 | #define RA_BREAK 3 // break inner loop |
3794 | #define RA_MATCH 4 // successful match |
3795 | #define RA_NOMATCH 5 // didn't match |
3796 | |
3797 | // Make "regstack" and "backpos" empty. They are allocated and freed in |
3798 | // bt_regexec_both() to reduce malloc()/free() calls. |
3799 | regstack.ga_len = 0; |
3800 | backpos.ga_len = 0; |
3801 | |
3802 | /* |
3803 | * Repeat until "regstack" is empty. |
3804 | */ |
3805 | for (;; ) { |
3806 | /* Some patterns may take a long time to match, e.g., "\([a-z]\+\)\+Q". |
3807 | * Allow interrupting them with CTRL-C. */ |
3808 | fast_breakcheck(); |
3809 | |
3810 | #ifdef REGEXP_DEBUG |
3811 | if (scan != NULL && regnarrate) { |
3812 | mch_errmsg((char *)regprop(scan)); |
3813 | mch_errmsg("(\n" ); |
3814 | } |
3815 | #endif |
3816 | |
3817 | /* |
3818 | * Repeat for items that can be matched sequentially, without using the |
3819 | * regstack. |
3820 | */ |
3821 | for (;; ) { |
3822 | if (got_int || scan == NULL) { |
3823 | status = RA_FAIL; |
3824 | break; |
3825 | } |
3826 | // Check for timeout once in a 100 times to avoid overhead. |
3827 | if (tm != NULL && ++tm_count == 100) { |
3828 | tm_count = 0; |
3829 | if (profile_passed_limit(*tm)) { |
3830 | if (timed_out != NULL) { |
3831 | *timed_out = true; |
3832 | } |
3833 | status = RA_FAIL; |
3834 | break; |
3835 | } |
3836 | } |
3837 | status = RA_CONT; |
3838 | |
3839 | #ifdef REGEXP_DEBUG |
3840 | if (regnarrate) { |
3841 | mch_errmsg((char *)regprop(scan)); |
3842 | mch_errmsg("...\n" ); |
3843 | if (re_extmatch_in != NULL) { |
3844 | int i; |
3845 | |
3846 | mch_errmsg(_("External submatches:\n" )); |
3847 | for (i = 0; i < NSUBEXP; i++) { |
3848 | mch_errmsg(" \"" ); |
3849 | if (re_extmatch_in->matches[i] != NULL) |
3850 | mch_errmsg((char *)re_extmatch_in->matches[i]); |
3851 | mch_errmsg("\"\n" ); |
3852 | } |
3853 | } |
3854 | } |
3855 | #endif |
3856 | next = regnext(scan); |
3857 | |
3858 | op = OP(scan); |
3859 | // Check for character class with NL added. |
3860 | if (!rex.reg_line_lbr && WITH_NL(op) && REG_MULTI |
3861 | && *reginput == NUL && reglnum <= rex.reg_maxline) { |
3862 | reg_nextline(); |
3863 | } else if (rex.reg_line_lbr && WITH_NL(op) && *reginput == '\n') { |
3864 | ADVANCE_REGINPUT(); |
3865 | } else { |
3866 | if (WITH_NL(op)) { |
3867 | op -= ADD_NL; |
3868 | } |
3869 | c = utf_ptr2char(reginput); |
3870 | switch (op) { |
3871 | case BOL: |
3872 | if (reginput != regline) |
3873 | status = RA_NOMATCH; |
3874 | break; |
3875 | |
3876 | case EOL: |
3877 | if (c != NUL) |
3878 | status = RA_NOMATCH; |
3879 | break; |
3880 | |
3881 | case RE_BOF: |
3882 | // We're not at the beginning of the file when below the first |
3883 | // line where we started, not at the start of the line or we |
3884 | // didn't start at the first line of the buffer. |
3885 | if (reglnum != 0 || reginput != regline |
3886 | || (REG_MULTI && rex.reg_firstlnum > 1)) { |
3887 | status = RA_NOMATCH; |
3888 | } |
3889 | break; |
3890 | |
3891 | case RE_EOF: |
3892 | if (reglnum != rex.reg_maxline || c != NUL) { |
3893 | status = RA_NOMATCH; |
3894 | } |
3895 | break; |
3896 | |
3897 | case CURSOR: |
3898 | // Check if the buffer is in a window and compare the |
3899 | // rex.reg_win->w_cursor position to the match position. |
3900 | if (rex.reg_win == NULL |
3901 | || (reglnum + rex.reg_firstlnum != rex.reg_win->w_cursor.lnum) |
3902 | || ((colnr_T)(reginput - regline) != rex.reg_win->w_cursor.col)) { |
3903 | status = RA_NOMATCH; |
3904 | } |
3905 | break; |
3906 | |
3907 | case RE_MARK: |
3908 | /* Compare the mark position to the match position. */ |
3909 | { |
3910 | int mark = OPERAND(scan)[0]; |
3911 | int cmp = OPERAND(scan)[1]; |
3912 | pos_T *pos; |
3913 | |
3914 | pos = getmark_buf(rex.reg_buf, mark, false); |
3915 | if (pos == NULL // mark doesn't exist |
3916 | || pos->lnum <= 0 // mark isn't set in reg_buf |
3917 | || (pos->lnum == reglnum + rex.reg_firstlnum |
3918 | ? (pos->col == (colnr_T)(reginput - regline) |
3919 | ? (cmp == '<' || cmp == '>') |
3920 | : (pos->col < (colnr_T)(reginput - regline) |
3921 | ? cmp != '>' |
3922 | : cmp != '<')) |
3923 | : (pos->lnum < reglnum + rex.reg_firstlnum |
3924 | ? cmp != '>' |
3925 | : cmp != '<'))) { |
3926 | status = RA_NOMATCH; |
3927 | } |
3928 | } |
3929 | break; |
3930 | |
3931 | case RE_VISUAL: |
3932 | if (!reg_match_visual()) |
3933 | status = RA_NOMATCH; |
3934 | break; |
3935 | |
3936 | case RE_LNUM: |
3937 | assert(reglnum + rex.reg_firstlnum >= 0 |
3938 | && (uintmax_t)(reglnum + rex.reg_firstlnum) <= UINT32_MAX); |
3939 | if (!REG_MULTI |
3940 | || !re_num_cmp((uint32_t)(reglnum + rex.reg_firstlnum), scan)) { |
3941 | status = RA_NOMATCH; |
3942 | } |
3943 | break; |
3944 | |
3945 | case RE_COL: |
3946 | assert(reginput - regline + 1 >= 0 |
3947 | && (uintmax_t)(reginput - regline + 1) <= UINT32_MAX); |
3948 | if (!re_num_cmp((uint32_t)(reginput - regline + 1), scan)) |
3949 | status = RA_NOMATCH; |
3950 | break; |
3951 | |
3952 | case RE_VCOL: |
3953 | if (!re_num_cmp(win_linetabsize(rex.reg_win == NULL |
3954 | ? curwin : rex.reg_win, |
3955 | regline, |
3956 | (colnr_T)(reginput - regline)) + 1, |
3957 | scan)) { |
3958 | status = RA_NOMATCH; |
3959 | } |
3960 | break; |
3961 | |
3962 | case BOW: /* \<word; reginput points to w */ |
3963 | if (c == NUL) /* Can't match at end of line */ |
3964 | status = RA_NOMATCH; |
3965 | else if (has_mbyte) { |
3966 | int this_class; |
3967 | |
3968 | // Get class of current and previous char (if it exists). |
3969 | this_class = mb_get_class_tab(reginput, rex.reg_buf->b_chartab); |
3970 | if (this_class <= 1) { |
3971 | status = RA_NOMATCH; // Not on a word at all. |
3972 | } else if (reg_prev_class() == this_class) { |
3973 | status = RA_NOMATCH; // Previous char is in same word. |
3974 | } |
3975 | } else { |
3976 | if (!vim_iswordc_buf(c, rex.reg_buf) |
3977 | || (reginput > regline |
3978 | && vim_iswordc_buf(reginput[-1], rex.reg_buf))) { |
3979 | status = RA_NOMATCH; |
3980 | } |
3981 | } |
3982 | break; |
3983 | |
3984 | case EOW: /* word\>; reginput points after d */ |
3985 | if (reginput == regline) /* Can't match at start of line */ |
3986 | status = RA_NOMATCH; |
3987 | else if (has_mbyte) { |
3988 | int this_class, prev_class; |
3989 | |
3990 | // Get class of current and previous char (if it exists). |
3991 | this_class = mb_get_class_tab(reginput, rex.reg_buf->b_chartab); |
3992 | prev_class = reg_prev_class(); |
3993 | if (this_class == prev_class |
3994 | || prev_class == 0 || prev_class == 1) |
3995 | status = RA_NOMATCH; |
3996 | } else { |
3997 | if (!vim_iswordc_buf(reginput[-1], rex.reg_buf) |
3998 | || (reginput[0] != NUL && vim_iswordc_buf(c, rex.reg_buf))) { |
3999 | status = RA_NOMATCH; |
4000 | } |
4001 | } |
4002 | break; /* Matched with EOW */ |
4003 | |
4004 | case ANY: |
4005 | /* ANY does not match new lines. */ |
4006 | if (c == NUL) |
4007 | status = RA_NOMATCH; |
4008 | else |
4009 | ADVANCE_REGINPUT(); |
4010 | break; |
4011 | |
4012 | case IDENT: |
4013 | if (!vim_isIDc(c)) |
4014 | status = RA_NOMATCH; |
4015 | else |
4016 | ADVANCE_REGINPUT(); |
4017 | break; |
4018 | |
4019 | case SIDENT: |
4020 | if (ascii_isdigit(*reginput) || !vim_isIDc(c)) |
4021 | status = RA_NOMATCH; |
4022 | else |
4023 | ADVANCE_REGINPUT(); |
4024 | break; |
4025 | |
4026 | case KWORD: |
4027 | if (!vim_iswordp_buf(reginput, rex.reg_buf)) { |
4028 | status = RA_NOMATCH; |
4029 | } else { |
4030 | ADVANCE_REGINPUT(); |
4031 | } |
4032 | break; |
4033 | |
4034 | case SKWORD: |
4035 | if (ascii_isdigit(*reginput) |
4036 | || !vim_iswordp_buf(reginput, rex.reg_buf)) { |
4037 | status = RA_NOMATCH; |
4038 | } else { |
4039 | ADVANCE_REGINPUT(); |
4040 | } |
4041 | break; |
4042 | |
4043 | case FNAME: |
4044 | if (!vim_isfilec(c)) |
4045 | status = RA_NOMATCH; |
4046 | else |
4047 | ADVANCE_REGINPUT(); |
4048 | break; |
4049 | |
4050 | case SFNAME: |
4051 | if (ascii_isdigit(*reginput) || !vim_isfilec(c)) |
4052 | status = RA_NOMATCH; |
4053 | else |
4054 | ADVANCE_REGINPUT(); |
4055 | break; |
4056 | |
4057 | case PRINT: |
4058 | if (!vim_isprintc(PTR2CHAR(reginput))) |
4059 | status = RA_NOMATCH; |
4060 | else |
4061 | ADVANCE_REGINPUT(); |
4062 | break; |
4063 | |
4064 | case SPRINT: |
4065 | if (ascii_isdigit(*reginput) || !vim_isprintc(PTR2CHAR(reginput))) |
4066 | status = RA_NOMATCH; |
4067 | else |
4068 | ADVANCE_REGINPUT(); |
4069 | break; |
4070 | |
4071 | case WHITE: |
4072 | if (!ascii_iswhite(c)) |
4073 | status = RA_NOMATCH; |
4074 | else |
4075 | ADVANCE_REGINPUT(); |
4076 | break; |
4077 | |
4078 | case NWHITE: |
4079 | if (c == NUL || ascii_iswhite(c)) |
4080 | status = RA_NOMATCH; |
4081 | else |
4082 | ADVANCE_REGINPUT(); |
4083 | break; |
4084 | |
4085 | case DIGIT: |
4086 | if (!ri_digit(c)) |
4087 | status = RA_NOMATCH; |
4088 | else |
4089 | ADVANCE_REGINPUT(); |
4090 | break; |
4091 | |
4092 | case NDIGIT: |
4093 | if (c == NUL || ri_digit(c)) |
4094 | status = RA_NOMATCH; |
4095 | else |
4096 | ADVANCE_REGINPUT(); |
4097 | break; |
4098 | |
4099 | case HEX: |
4100 | if (!ri_hex(c)) |
4101 | status = RA_NOMATCH; |
4102 | else |
4103 | ADVANCE_REGINPUT(); |
4104 | break; |
4105 | |
4106 | case NHEX: |
4107 | if (c == NUL || ri_hex(c)) |
4108 | status = RA_NOMATCH; |
4109 | else |
4110 | ADVANCE_REGINPUT(); |
4111 | break; |
4112 | |
4113 | case OCTAL: |
4114 | if (!ri_octal(c)) |
4115 | status = RA_NOMATCH; |
4116 | else |
4117 | ADVANCE_REGINPUT(); |
4118 | break; |
4119 | |
4120 | case NOCTAL: |
4121 | if (c == NUL || ri_octal(c)) |
4122 | status = RA_NOMATCH; |
4123 | else |
4124 | ADVANCE_REGINPUT(); |
4125 | break; |
4126 | |
4127 | case WORD: |
4128 | if (!ri_word(c)) |
4129 | status = RA_NOMATCH; |
4130 | else |
4131 | ADVANCE_REGINPUT(); |
4132 | break; |
4133 | |
4134 | case NWORD: |
4135 | if (c == NUL || ri_word(c)) |
4136 | status = RA_NOMATCH; |
4137 | else |
4138 | ADVANCE_REGINPUT(); |
4139 | break; |
4140 | |
4141 | case HEAD: |
4142 | if (!ri_head(c)) |
4143 | status = RA_NOMATCH; |
4144 | else |
4145 | ADVANCE_REGINPUT(); |
4146 | break; |
4147 | |
4148 | case NHEAD: |
4149 | if (c == NUL || ri_head(c)) |
4150 | status = RA_NOMATCH; |
4151 | else |
4152 | ADVANCE_REGINPUT(); |
4153 | break; |
4154 | |
4155 | case ALPHA: |
4156 | if (!ri_alpha(c)) |
4157 | status = RA_NOMATCH; |
4158 | else |
4159 | ADVANCE_REGINPUT(); |
4160 | break; |
4161 | |
4162 | case NALPHA: |
4163 | if (c == NUL || ri_alpha(c)) |
4164 | status = RA_NOMATCH; |
4165 | else |
4166 | ADVANCE_REGINPUT(); |
4167 | break; |
4168 | |
4169 | case LOWER: |
4170 | if (!ri_lower(c)) |
4171 | status = RA_NOMATCH; |
4172 | else |
4173 | ADVANCE_REGINPUT(); |
4174 | break; |
4175 | |
4176 | case NLOWER: |
4177 | if (c == NUL || ri_lower(c)) |
4178 | status = RA_NOMATCH; |
4179 | else |
4180 | ADVANCE_REGINPUT(); |
4181 | break; |
4182 | |
4183 | case UPPER: |
4184 | if (!ri_upper(c)) |
4185 | status = RA_NOMATCH; |
4186 | else |
4187 | ADVANCE_REGINPUT(); |
4188 | break; |
4189 | |
4190 | case NUPPER: |
4191 | if (c == NUL || ri_upper(c)) |
4192 | status = RA_NOMATCH; |
4193 | else |
4194 | ADVANCE_REGINPUT(); |
4195 | break; |
4196 | |
4197 | case EXACTLY: |
4198 | { |
4199 | int len; |
4200 | char_u *opnd; |
4201 | |
4202 | opnd = OPERAND(scan); |
4203 | // Inline the first byte, for speed. |
4204 | if (*opnd != *reginput |
4205 | && (!rex.reg_ic |
4206 | || (!enc_utf8 |
4207 | && mb_tolower(*opnd) != mb_tolower(*reginput)))) { |
4208 | status = RA_NOMATCH; |
4209 | } else if (*opnd == NUL) { |
4210 | // match empty string always works; happens when "~" is |
4211 | // empty. |
4212 | } else { |
4213 | if (opnd[1] == NUL && !(enc_utf8 && rex.reg_ic)) { |
4214 | len = 1; // matched a single byte above |
4215 | } else { |
4216 | // Need to match first byte again for multi-byte. |
4217 | len = (int)STRLEN(opnd); |
4218 | if (cstrncmp(opnd, reginput, &len) != 0) { |
4219 | status = RA_NOMATCH; |
4220 | } |
4221 | } |
4222 | // Check for following composing character, unless %C |
4223 | // follows (skips over all composing chars). |
4224 | if (status != RA_NOMATCH && enc_utf8 |
4225 | && UTF_COMPOSINGLIKE(reginput, reginput + len) |
4226 | && !rex.reg_icombine |
4227 | && OP(next) != RE_COMPOSING) { |
4228 | // raaron: This code makes a composing character get |
4229 | // ignored, which is the correct behavior (sometimes) |
4230 | // for voweled Hebrew texts. |
4231 | status = RA_NOMATCH; |
4232 | } |
4233 | if (status != RA_NOMATCH) { |
4234 | reginput += len; |
4235 | } |
4236 | } |
4237 | } |
4238 | break; |
4239 | |
4240 | case ANYOF: |
4241 | case ANYBUT: |
4242 | if (c == NUL) |
4243 | status = RA_NOMATCH; |
4244 | else if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF)) |
4245 | status = RA_NOMATCH; |
4246 | else |
4247 | ADVANCE_REGINPUT(); |
4248 | break; |
4249 | |
4250 | case MULTIBYTECODE: |
4251 | if (has_mbyte) { |
4252 | int i, len; |
4253 | char_u *opnd; |
4254 | int opndc = 0, inpc; |
4255 | |
4256 | opnd = OPERAND(scan); |
4257 | // Safety check (just in case 'encoding' was changed since |
4258 | // compiling the program). |
4259 | if ((len = (*mb_ptr2len)(opnd)) < 2) { |
4260 | status = RA_NOMATCH; |
4261 | break; |
4262 | } |
4263 | if (enc_utf8) { |
4264 | opndc = utf_ptr2char(opnd); |
4265 | } |
4266 | if (enc_utf8 && utf_iscomposing(opndc)) { |
4267 | /* When only a composing char is given match at any |
4268 | * position where that composing char appears. */ |
4269 | status = RA_NOMATCH; |
4270 | for (i = 0; reginput[i] != NUL; i += utf_ptr2len(reginput + i)) { |
4271 | inpc = utf_ptr2char(reginput + i); |
4272 | if (!utf_iscomposing(inpc)) { |
4273 | if (i > 0) { |
4274 | break; |
4275 | } |
4276 | } else if (opndc == inpc) { |
4277 | // Include all following composing chars. |
4278 | len = i + utfc_ptr2len(reginput + i); |
4279 | status = RA_MATCH; |
4280 | break; |
4281 | } |
4282 | } |
4283 | } else |
4284 | for (i = 0; i < len; ++i) |
4285 | if (opnd[i] != reginput[i]) { |
4286 | status = RA_NOMATCH; |
4287 | break; |
4288 | } |
4289 | reginput += len; |
4290 | } else |
4291 | status = RA_NOMATCH; |
4292 | break; |
4293 | |
4294 | case RE_COMPOSING: |
4295 | if (enc_utf8) { |
4296 | // Skip composing characters. |
4297 | while (utf_iscomposing(utf_ptr2char(reginput))) { |
4298 | MB_CPTR_ADV(reginput); |
4299 | } |
4300 | } |
4301 | break; |
4302 | |
4303 | case NOTHING: |
4304 | break; |
4305 | |
4306 | case BACK: |
4307 | { |
4308 | int i; |
4309 | |
4310 | /* |
4311 | * When we run into BACK we need to check if we don't keep |
4312 | * looping without matching any input. The second and later |
4313 | * times a BACK is encountered it fails if the input is still |
4314 | * at the same position as the previous time. |
4315 | * The positions are stored in "backpos" and found by the |
4316 | * current value of "scan", the position in the RE program. |
4317 | */ |
4318 | backpos_T *bp = (backpos_T *)backpos.ga_data; |
4319 | for (i = 0; i < backpos.ga_len; ++i) |
4320 | if (bp[i].bp_scan == scan) |
4321 | break; |
4322 | if (i == backpos.ga_len) { |
4323 | backpos_T *p = GA_APPEND_VIA_PTR(backpos_T, &backpos); |
4324 | p->bp_scan = scan; |
4325 | } else if (reg_save_equal(&bp[i].bp_pos)) |
4326 | /* Still at same position as last time, fail. */ |
4327 | status = RA_NOMATCH; |
4328 | |
4329 | assert(status != RA_FAIL); |
4330 | if (status != RA_NOMATCH) { |
4331 | reg_save(&bp[i].bp_pos, &backpos); |
4332 | } |
4333 | } |
4334 | break; |
4335 | |
4336 | case MOPEN + 0: /* Match start: \zs */ |
4337 | case MOPEN + 1: /* \( */ |
4338 | case MOPEN + 2: |
4339 | case MOPEN + 3: |
4340 | case MOPEN + 4: |
4341 | case MOPEN + 5: |
4342 | case MOPEN + 6: |
4343 | case MOPEN + 7: |
4344 | case MOPEN + 8: |
4345 | case MOPEN + 9: |
4346 | { |
4347 | no = op - MOPEN; |
4348 | cleanup_subexpr(); |
4349 | rp = regstack_push(RS_MOPEN, scan); |
4350 | if (rp == NULL) |
4351 | status = RA_FAIL; |
4352 | else { |
4353 | rp->rs_no = no; |
4354 | save_se(&rp->rs_un.sesave, &rex.reg_startpos[no], |
4355 | &rex.reg_startp[no]); |
4356 | // We simply continue and handle the result when done. |
4357 | } |
4358 | } |
4359 | break; |
4360 | |
4361 | case NOPEN: /* \%( */ |
4362 | case NCLOSE: /* \) after \%( */ |
4363 | if (regstack_push(RS_NOPEN, scan) == NULL) |
4364 | status = RA_FAIL; |
4365 | /* We simply continue and handle the result when done. */ |
4366 | break; |
4367 | |
4368 | case ZOPEN + 1: |
4369 | case ZOPEN + 2: |
4370 | case ZOPEN + 3: |
4371 | case ZOPEN + 4: |
4372 | case ZOPEN + 5: |
4373 | case ZOPEN + 6: |
4374 | case ZOPEN + 7: |
4375 | case ZOPEN + 8: |
4376 | case ZOPEN + 9: |
4377 | { |
4378 | no = op - ZOPEN; |
4379 | cleanup_zsubexpr(); |
4380 | rp = regstack_push(RS_ZOPEN, scan); |
4381 | if (rp == NULL) |
4382 | status = RA_FAIL; |
4383 | else { |
4384 | rp->rs_no = no; |
4385 | save_se(&rp->rs_un.sesave, ®_startzpos[no], |
4386 | ®_startzp[no]); |
4387 | /* We simply continue and handle the result when done. */ |
4388 | } |
4389 | } |
4390 | break; |
4391 | |
4392 | case MCLOSE + 0: /* Match end: \ze */ |
4393 | case MCLOSE + 1: /* \) */ |
4394 | case MCLOSE + 2: |
4395 | case MCLOSE + 3: |
4396 | case MCLOSE + 4: |
4397 | case MCLOSE + 5: |
4398 | case MCLOSE + 6: |
4399 | case MCLOSE + 7: |
4400 | case MCLOSE + 8: |
4401 | case MCLOSE + 9: |
4402 | { |
4403 | no = op - MCLOSE; |
4404 | cleanup_subexpr(); |
4405 | rp = regstack_push(RS_MCLOSE, scan); |
4406 | if (rp == NULL) { |
4407 | status = RA_FAIL; |
4408 | } else { |
4409 | rp->rs_no = no; |
4410 | save_se(&rp->rs_un.sesave, &rex.reg_endpos[no], &rex.reg_endp[no]); |
4411 | // We simply continue and handle the result when done. |
4412 | } |
4413 | } |
4414 | break; |
4415 | |
4416 | case ZCLOSE + 1: /* \) after \z( */ |
4417 | case ZCLOSE + 2: |
4418 | case ZCLOSE + 3: |
4419 | case ZCLOSE + 4: |
4420 | case ZCLOSE + 5: |
4421 | case ZCLOSE + 6: |
4422 | case ZCLOSE + 7: |
4423 | case ZCLOSE + 8: |
4424 | case ZCLOSE + 9: |
4425 | { |
4426 | no = op - ZCLOSE; |
4427 | cleanup_zsubexpr(); |
4428 | rp = regstack_push(RS_ZCLOSE, scan); |
4429 | if (rp == NULL) |
4430 | status = RA_FAIL; |
4431 | else { |
4432 | rp->rs_no = no; |
4433 | save_se(&rp->rs_un.sesave, ®_endzpos[no], |
4434 | ®_endzp[no]); |
4435 | /* We simply continue and handle the result when done. */ |
4436 | } |
4437 | } |
4438 | break; |
4439 | |
4440 | case BACKREF + 1: |
4441 | case BACKREF + 2: |
4442 | case BACKREF + 3: |
4443 | case BACKREF + 4: |
4444 | case BACKREF + 5: |
4445 | case BACKREF + 6: |
4446 | case BACKREF + 7: |
4447 | case BACKREF + 8: |
4448 | case BACKREF + 9: |
4449 | { |
4450 | int len; |
4451 | |
4452 | no = op - BACKREF; |
4453 | cleanup_subexpr(); |
4454 | if (!REG_MULTI) { // Single-line regexp |
4455 | if (rex.reg_startp[no] == NULL || rex.reg_endp[no] == NULL) { |
4456 | // Backref was not set: Match an empty string. |
4457 | len = 0; |
4458 | } else { |
4459 | // Compare current input with back-ref in the same line. |
4460 | len = (int)(rex.reg_endp[no] - rex.reg_startp[no]); |
4461 | if (cstrncmp(rex.reg_startp[no], reginput, &len) != 0) { |
4462 | status = RA_NOMATCH; |
4463 | } |
4464 | } |
4465 | } else { // Multi-line regexp |
4466 | if (rex.reg_startpos[no].lnum < 0 || rex.reg_endpos[no].lnum < 0) { |
4467 | // Backref was not set: Match an empty string. |
4468 | len = 0; |
4469 | } else { |
4470 | if (rex.reg_startpos[no].lnum == reglnum |
4471 | && rex.reg_endpos[no].lnum == reglnum) { |
4472 | // Compare back-ref within the current line. |
4473 | len = rex.reg_endpos[no].col - rex.reg_startpos[no].col; |
4474 | if (cstrncmp(regline + rex.reg_startpos[no].col, |
4475 | reginput, &len) != 0) { |
4476 | status = RA_NOMATCH; |
4477 | } |
4478 | } else { |
4479 | // Messy situation: Need to compare between two lines. |
4480 | int r = match_with_backref(rex.reg_startpos[no].lnum, |
4481 | rex.reg_startpos[no].col, |
4482 | rex.reg_endpos[no].lnum, |
4483 | rex.reg_endpos[no].col, |
4484 | &len); |
4485 | if (r != RA_MATCH) { |
4486 | status = r; |
4487 | } |
4488 | } |
4489 | } |
4490 | } |
4491 | |
4492 | /* Matched the backref, skip over it. */ |
4493 | reginput += len; |
4494 | } |
4495 | break; |
4496 | |
4497 | case ZREF + 1: |
4498 | case ZREF + 2: |
4499 | case ZREF + 3: |
4500 | case ZREF + 4: |
4501 | case ZREF + 5: |
4502 | case ZREF + 6: |
4503 | case ZREF + 7: |
4504 | case ZREF + 8: |
4505 | case ZREF + 9: |
4506 | { |
4507 | int len; |
4508 | |
4509 | cleanup_zsubexpr(); |
4510 | no = op - ZREF; |
4511 | if (re_extmatch_in != NULL |
4512 | && re_extmatch_in->matches[no] != NULL) { |
4513 | len = (int)STRLEN(re_extmatch_in->matches[no]); |
4514 | if (cstrncmp(re_extmatch_in->matches[no], |
4515 | reginput, &len) != 0) |
4516 | status = RA_NOMATCH; |
4517 | else |
4518 | reginput += len; |
4519 | } else { |
4520 | /* Backref was not set: Match an empty string. */ |
4521 | } |
4522 | } |
4523 | break; |
4524 | |
4525 | case BRANCH: |
4526 | { |
4527 | if (OP(next) != BRANCH) /* No choice. */ |
4528 | next = OPERAND(scan); /* Avoid recursion. */ |
4529 | else { |
4530 | rp = regstack_push(RS_BRANCH, scan); |
4531 | if (rp == NULL) |
4532 | status = RA_FAIL; |
4533 | else |
4534 | status = RA_BREAK; /* rest is below */ |
4535 | } |
4536 | } |
4537 | break; |
4538 | |
4539 | case BRACE_LIMITS: |
4540 | { |
4541 | if (OP(next) == BRACE_SIMPLE) { |
4542 | bl_minval = OPERAND_MIN(scan); |
4543 | bl_maxval = OPERAND_MAX(scan); |
4544 | } else if (OP(next) >= BRACE_COMPLEX |
4545 | && OP(next) < BRACE_COMPLEX + 10) { |
4546 | no = OP(next) - BRACE_COMPLEX; |
4547 | brace_min[no] = OPERAND_MIN(scan); |
4548 | brace_max[no] = OPERAND_MAX(scan); |
4549 | brace_count[no] = 0; |
4550 | } else { |
4551 | internal_error("BRACE_LIMITS" ); |
4552 | status = RA_FAIL; |
4553 | } |
4554 | } |
4555 | break; |
4556 | |
4557 | case BRACE_COMPLEX + 0: |
4558 | case BRACE_COMPLEX + 1: |
4559 | case BRACE_COMPLEX + 2: |
4560 | case BRACE_COMPLEX + 3: |
4561 | case BRACE_COMPLEX + 4: |
4562 | case BRACE_COMPLEX + 5: |
4563 | case BRACE_COMPLEX + 6: |
4564 | case BRACE_COMPLEX + 7: |
4565 | case BRACE_COMPLEX + 8: |
4566 | case BRACE_COMPLEX + 9: |
4567 | { |
4568 | no = op - BRACE_COMPLEX; |
4569 | ++brace_count[no]; |
4570 | |
4571 | /* If not matched enough times yet, try one more */ |
4572 | if (brace_count[no] <= (brace_min[no] <= brace_max[no] |
4573 | ? brace_min[no] : brace_max[no])) { |
4574 | rp = regstack_push(RS_BRCPLX_MORE, scan); |
4575 | if (rp == NULL) |
4576 | status = RA_FAIL; |
4577 | else { |
4578 | rp->rs_no = no; |
4579 | reg_save(&rp->rs_un.regsave, &backpos); |
4580 | next = OPERAND(scan); |
4581 | /* We continue and handle the result when done. */ |
4582 | } |
4583 | break; |
4584 | } |
4585 | |
4586 | /* If matched enough times, may try matching some more */ |
4587 | if (brace_min[no] <= brace_max[no]) { |
4588 | /* Range is the normal way around, use longest match */ |
4589 | if (brace_count[no] <= brace_max[no]) { |
4590 | rp = regstack_push(RS_BRCPLX_LONG, scan); |
4591 | if (rp == NULL) |
4592 | status = RA_FAIL; |
4593 | else { |
4594 | rp->rs_no = no; |
4595 | reg_save(&rp->rs_un.regsave, &backpos); |
4596 | next = OPERAND(scan); |
4597 | /* We continue and handle the result when done. */ |
4598 | } |
4599 | } |
4600 | } else { |
4601 | /* Range is backwards, use shortest match first */ |
4602 | if (brace_count[no] <= brace_min[no]) { |
4603 | rp = regstack_push(RS_BRCPLX_SHORT, scan); |
4604 | if (rp == NULL) |
4605 | status = RA_FAIL; |
4606 | else { |
4607 | reg_save(&rp->rs_un.regsave, &backpos); |
4608 | /* We continue and handle the result when done. */ |
4609 | } |
4610 | } |
4611 | } |
4612 | } |
4613 | break; |
4614 | |
4615 | case BRACE_SIMPLE: |
4616 | case STAR: |
4617 | case PLUS: |
4618 | { |
4619 | regstar_T rst; |
4620 | |
4621 | /* |
4622 | * Lookahead to avoid useless match attempts when we know |
4623 | * what character comes next. |
4624 | */ |
4625 | if (OP(next) == EXACTLY) { |
4626 | rst.nextb = *OPERAND(next); |
4627 | if (rex.reg_ic) { |
4628 | if (mb_isupper(rst.nextb)) { |
4629 | rst.nextb_ic = mb_tolower(rst.nextb); |
4630 | } else { |
4631 | rst.nextb_ic = mb_toupper(rst.nextb); |
4632 | } |
4633 | } else { |
4634 | rst.nextb_ic = rst.nextb; |
4635 | } |
4636 | } else { |
4637 | rst.nextb = NUL; |
4638 | rst.nextb_ic = NUL; |
4639 | } |
4640 | if (op != BRACE_SIMPLE) { |
4641 | rst.minval = (op == STAR) ? 0 : 1; |
4642 | rst.maxval = MAX_LIMIT; |
4643 | } else { |
4644 | rst.minval = bl_minval; |
4645 | rst.maxval = bl_maxval; |
4646 | } |
4647 | |
4648 | /* |
4649 | * When maxval > minval, try matching as much as possible, up |
4650 | * to maxval. When maxval < minval, try matching at least the |
4651 | * minimal number (since the range is backwards, that's also |
4652 | * maxval!). |
4653 | */ |
4654 | rst.count = regrepeat(OPERAND(scan), rst.maxval); |
4655 | if (got_int) { |
4656 | status = RA_FAIL; |
4657 | break; |
4658 | } |
4659 | if (rst.minval <= rst.maxval |
4660 | ? rst.count >= rst.minval : rst.count >= rst.maxval) { |
4661 | /* It could match. Prepare for trying to match what |
4662 | * follows. The code is below. Parameters are stored in |
4663 | * a regstar_T on the regstack. */ |
4664 | if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp) { |
4665 | EMSG(_(e_maxmempat)); |
4666 | status = RA_FAIL; |
4667 | } else { |
4668 | ga_grow(®stack, sizeof(regstar_T)); |
4669 | regstack.ga_len += sizeof(regstar_T); |
4670 | rp = regstack_push(rst.minval <= rst.maxval |
4671 | ? RS_STAR_LONG : RS_STAR_SHORT, scan); |
4672 | if (rp == NULL) |
4673 | status = RA_FAIL; |
4674 | else { |
4675 | *(((regstar_T *)rp) - 1) = rst; |
4676 | status = RA_BREAK; /* skip the restore bits */ |
4677 | } |
4678 | } |
4679 | } else |
4680 | status = RA_NOMATCH; |
4681 | |
4682 | } |
4683 | break; |
4684 | |
4685 | case NOMATCH: |
4686 | case MATCH: |
4687 | case SUBPAT: |
4688 | rp = regstack_push(RS_NOMATCH, scan); |
4689 | if (rp == NULL) |
4690 | status = RA_FAIL; |
4691 | else { |
4692 | rp->rs_no = op; |
4693 | reg_save(&rp->rs_un.regsave, &backpos); |
4694 | next = OPERAND(scan); |
4695 | /* We continue and handle the result when done. */ |
4696 | } |
4697 | break; |
4698 | |
4699 | case BEHIND: |
4700 | case NOBEHIND: |
4701 | /* Need a bit of room to store extra positions. */ |
4702 | if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp) { |
4703 | EMSG(_(e_maxmempat)); |
4704 | status = RA_FAIL; |
4705 | } else { |
4706 | ga_grow(®stack, sizeof(regbehind_T)); |
4707 | regstack.ga_len += sizeof(regbehind_T); |
4708 | rp = regstack_push(RS_BEHIND1, scan); |
4709 | if (rp == NULL) |
4710 | status = RA_FAIL; |
4711 | else { |
4712 | /* Need to save the subexpr to be able to restore them |
4713 | * when there is a match but we don't use it. */ |
4714 | save_subexpr(((regbehind_T *)rp) - 1); |
4715 | |
4716 | rp->rs_no = op; |
4717 | reg_save(&rp->rs_un.regsave, &backpos); |
4718 | /* First try if what follows matches. If it does then we |
4719 | * check the behind match by looping. */ |
4720 | } |
4721 | } |
4722 | break; |
4723 | |
4724 | case BHPOS: |
4725 | if (REG_MULTI) { |
4726 | if (behind_pos.rs_u.pos.col != (colnr_T)(reginput - regline) |
4727 | || behind_pos.rs_u.pos.lnum != reglnum) |
4728 | status = RA_NOMATCH; |
4729 | } else if (behind_pos.rs_u.ptr != reginput) |
4730 | status = RA_NOMATCH; |
4731 | break; |
4732 | |
4733 | case NEWL: |
4734 | if ((c != NUL || !REG_MULTI || reglnum > rex.reg_maxline |
4735 | || rex.reg_line_lbr) && (c != '\n' || !rex.reg_line_lbr)) { |
4736 | status = RA_NOMATCH; |
4737 | } else if (rex.reg_line_lbr) { |
4738 | ADVANCE_REGINPUT(); |
4739 | } else { |
4740 | reg_nextline(); |
4741 | } |
4742 | break; |
4743 | |
4744 | case END: |
4745 | status = RA_MATCH; /* Success! */ |
4746 | break; |
4747 | |
4748 | default: |
4749 | EMSG(_(e_re_corr)); |
4750 | #ifdef REGEXP_DEBUG |
4751 | printf("Illegal op code %d\n" , op); |
4752 | #endif |
4753 | status = RA_FAIL; |
4754 | break; |
4755 | } |
4756 | } |
4757 | |
4758 | /* If we can't continue sequentially, break the inner loop. */ |
4759 | if (status != RA_CONT) |
4760 | break; |
4761 | |
4762 | /* Continue in inner loop, advance to next item. */ |
4763 | scan = next; |
4764 | |
4765 | } /* end of inner loop */ |
4766 | |
4767 | /* |
4768 | * If there is something on the regstack execute the code for the state. |
4769 | * If the state is popped then loop and use the older state. |
4770 | */ |
4771 | while (!GA_EMPTY(®stack) && status != RA_FAIL) { |
4772 | rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1; |
4773 | switch (rp->rs_state) { |
4774 | case RS_NOPEN: |
4775 | /* Result is passed on as-is, simply pop the state. */ |
4776 | regstack_pop(&scan); |
4777 | break; |
4778 | |
4779 | case RS_MOPEN: |
4780 | // Pop the state. Restore pointers when there is no match. |
4781 | if (status == RA_NOMATCH) { |
4782 | restore_se(&rp->rs_un.sesave, &rex.reg_startpos[rp->rs_no], |
4783 | &rex.reg_startp[rp->rs_no]); |
4784 | } |
4785 | regstack_pop(&scan); |
4786 | break; |
4787 | |
4788 | case RS_ZOPEN: |
4789 | /* Pop the state. Restore pointers when there is no match. */ |
4790 | if (status == RA_NOMATCH) |
4791 | restore_se(&rp->rs_un.sesave, ®_startzpos[rp->rs_no], |
4792 | ®_startzp[rp->rs_no]); |
4793 | regstack_pop(&scan); |
4794 | break; |
4795 | |
4796 | case RS_MCLOSE: |
4797 | // Pop the state. Restore pointers when there is no match. |
4798 | if (status == RA_NOMATCH) { |
4799 | restore_se(&rp->rs_un.sesave, &rex.reg_endpos[rp->rs_no], |
4800 | &rex.reg_endp[rp->rs_no]); |
4801 | } |
4802 | regstack_pop(&scan); |
4803 | break; |
4804 | |
4805 | case RS_ZCLOSE: |
4806 | /* Pop the state. Restore pointers when there is no match. */ |
4807 | if (status == RA_NOMATCH) |
4808 | restore_se(&rp->rs_un.sesave, ®_endzpos[rp->rs_no], |
4809 | ®_endzp[rp->rs_no]); |
4810 | regstack_pop(&scan); |
4811 | break; |
4812 | |
4813 | case RS_BRANCH: |
4814 | if (status == RA_MATCH) |
4815 | /* this branch matched, use it */ |
4816 | regstack_pop(&scan); |
4817 | else { |
4818 | if (status != RA_BREAK) { |
4819 | /* After a non-matching branch: try next one. */ |
4820 | reg_restore(&rp->rs_un.regsave, &backpos); |
4821 | scan = rp->rs_scan; |
4822 | } |
4823 | if (scan == NULL || OP(scan) != BRANCH) { |
4824 | /* no more branches, didn't find a match */ |
4825 | status = RA_NOMATCH; |
4826 | regstack_pop(&scan); |
4827 | } else { |
4828 | /* Prepare to try a branch. */ |
4829 | rp->rs_scan = regnext(scan); |
4830 | reg_save(&rp->rs_un.regsave, &backpos); |
4831 | scan = OPERAND(scan); |
4832 | } |
4833 | } |
4834 | break; |
4835 | |
4836 | case RS_BRCPLX_MORE: |
4837 | /* Pop the state. Restore pointers when there is no match. */ |
4838 | if (status == RA_NOMATCH) { |
4839 | reg_restore(&rp->rs_un.regsave, &backpos); |
4840 | --brace_count[rp->rs_no]; /* decrement match count */ |
4841 | } |
4842 | regstack_pop(&scan); |
4843 | break; |
4844 | |
4845 | case RS_BRCPLX_LONG: |
4846 | /* Pop the state. Restore pointers when there is no match. */ |
4847 | if (status == RA_NOMATCH) { |
4848 | /* There was no match, but we did find enough matches. */ |
4849 | reg_restore(&rp->rs_un.regsave, &backpos); |
4850 | --brace_count[rp->rs_no]; |
4851 | /* continue with the items after "\{}" */ |
4852 | status = RA_CONT; |
4853 | } |
4854 | regstack_pop(&scan); |
4855 | if (status == RA_CONT) |
4856 | scan = regnext(scan); |
4857 | break; |
4858 | |
4859 | case RS_BRCPLX_SHORT: |
4860 | /* Pop the state. Restore pointers when there is no match. */ |
4861 | if (status == RA_NOMATCH) |
4862 | /* There was no match, try to match one more item. */ |
4863 | reg_restore(&rp->rs_un.regsave, &backpos); |
4864 | regstack_pop(&scan); |
4865 | if (status == RA_NOMATCH) { |
4866 | scan = OPERAND(scan); |
4867 | status = RA_CONT; |
4868 | } |
4869 | break; |
4870 | |
4871 | case RS_NOMATCH: |
4872 | /* Pop the state. If the operand matches for NOMATCH or |
4873 | * doesn't match for MATCH/SUBPAT, we fail. Otherwise backup, |
4874 | * except for SUBPAT, and continue with the next item. */ |
4875 | if (status == (rp->rs_no == NOMATCH ? RA_MATCH : RA_NOMATCH)) |
4876 | status = RA_NOMATCH; |
4877 | else { |
4878 | status = RA_CONT; |
4879 | if (rp->rs_no != SUBPAT) /* zero-width */ |
4880 | reg_restore(&rp->rs_un.regsave, &backpos); |
4881 | } |
4882 | regstack_pop(&scan); |
4883 | if (status == RA_CONT) |
4884 | scan = regnext(scan); |
4885 | break; |
4886 | |
4887 | case RS_BEHIND1: |
4888 | if (status == RA_NOMATCH) { |
4889 | regstack_pop(&scan); |
4890 | regstack.ga_len -= sizeof(regbehind_T); |
4891 | } else { |
4892 | /* The stuff after BEHIND/NOBEHIND matches. Now try if |
4893 | * the behind part does (not) match before the current |
4894 | * position in the input. This must be done at every |
4895 | * position in the input and checking if the match ends at |
4896 | * the current position. */ |
4897 | |
4898 | /* save the position after the found match for next */ |
4899 | reg_save(&(((regbehind_T *)rp) - 1)->save_after, &backpos); |
4900 | |
4901 | /* Start looking for a match with operand at the current |
4902 | * position. Go back one character until we find the |
4903 | * result, hitting the start of the line or the previous |
4904 | * line (for multi-line matching). |
4905 | * Set behind_pos to where the match should end, BHPOS |
4906 | * will match it. Save the current value. */ |
4907 | (((regbehind_T *)rp) - 1)->save_behind = behind_pos; |
4908 | behind_pos = rp->rs_un.regsave; |
4909 | |
4910 | rp->rs_state = RS_BEHIND2; |
4911 | |
4912 | reg_restore(&rp->rs_un.regsave, &backpos); |
4913 | scan = OPERAND(rp->rs_scan) + 4; |
4914 | } |
4915 | break; |
4916 | |
4917 | case RS_BEHIND2: |
4918 | /* |
4919 | * Looping for BEHIND / NOBEHIND match. |
4920 | */ |
4921 | if (status == RA_MATCH && reg_save_equal(&behind_pos)) { |
4922 | /* found a match that ends where "next" started */ |
4923 | behind_pos = (((regbehind_T *)rp) - 1)->save_behind; |
4924 | if (rp->rs_no == BEHIND) |
4925 | reg_restore(&(((regbehind_T *)rp) - 1)->save_after, |
4926 | &backpos); |
4927 | else { |
4928 | /* But we didn't want a match. Need to restore the |
4929 | * subexpr, because what follows matched, so they have |
4930 | * been set. */ |
4931 | status = RA_NOMATCH; |
4932 | restore_subexpr(((regbehind_T *)rp) - 1); |
4933 | } |
4934 | regstack_pop(&scan); |
4935 | regstack.ga_len -= sizeof(regbehind_T); |
4936 | } else { |
4937 | long limit; |
4938 | |
4939 | /* No match or a match that doesn't end where we want it: Go |
4940 | * back one character. May go to previous line once. */ |
4941 | no = OK; |
4942 | limit = OPERAND_MIN(rp->rs_scan); |
4943 | if (REG_MULTI) { |
4944 | if (limit > 0 |
4945 | && ((rp->rs_un.regsave.rs_u.pos.lnum |
4946 | < behind_pos.rs_u.pos.lnum |
4947 | ? (colnr_T)STRLEN(regline) |
4948 | : behind_pos.rs_u.pos.col) |
4949 | - rp->rs_un.regsave.rs_u.pos.col >= limit)) |
4950 | no = FAIL; |
4951 | else if (rp->rs_un.regsave.rs_u.pos.col == 0) { |
4952 | if (rp->rs_un.regsave.rs_u.pos.lnum |
4953 | < behind_pos.rs_u.pos.lnum |
4954 | || reg_getline( |
4955 | --rp->rs_un.regsave.rs_u.pos.lnum) |
4956 | == NULL) |
4957 | no = FAIL; |
4958 | else { |
4959 | reg_restore(&rp->rs_un.regsave, &backpos); |
4960 | rp->rs_un.regsave.rs_u.pos.col = |
4961 | (colnr_T)STRLEN(regline); |
4962 | } |
4963 | } else { |
4964 | const char_u *const line = |
4965 | reg_getline(rp->rs_un.regsave.rs_u.pos.lnum); |
4966 | |
4967 | rp->rs_un.regsave.rs_u.pos.col -= |
4968 | utf_head_off(line, |
4969 | line + rp->rs_un.regsave.rs_u.pos.col - 1) |
4970 | + 1; |
4971 | } |
4972 | } else { |
4973 | if (rp->rs_un.regsave.rs_u.ptr == regline) { |
4974 | no = FAIL; |
4975 | } else { |
4976 | MB_PTR_BACK(regline, rp->rs_un.regsave.rs_u.ptr); |
4977 | if (limit > 0 |
4978 | && (long)(behind_pos.rs_u.ptr |
4979 | - rp->rs_un.regsave.rs_u.ptr) > limit) { |
4980 | no = FAIL; |
4981 | } |
4982 | } |
4983 | } |
4984 | if (no == OK) { |
4985 | /* Advanced, prepare for finding match again. */ |
4986 | reg_restore(&rp->rs_un.regsave, &backpos); |
4987 | scan = OPERAND(rp->rs_scan) + 4; |
4988 | if (status == RA_MATCH) { |
4989 | /* We did match, so subexpr may have been changed, |
4990 | * need to restore them for the next try. */ |
4991 | status = RA_NOMATCH; |
4992 | restore_subexpr(((regbehind_T *)rp) - 1); |
4993 | } |
4994 | } else { |
4995 | /* Can't advance. For NOBEHIND that's a match. */ |
4996 | behind_pos = (((regbehind_T *)rp) - 1)->save_behind; |
4997 | if (rp->rs_no == NOBEHIND) { |
4998 | reg_restore(&(((regbehind_T *)rp) - 1)->save_after, |
4999 | &backpos); |
5000 | status = RA_MATCH; |
5001 | } else { |
5002 | /* We do want a proper match. Need to restore the |
5003 | * subexpr if we had a match, because they may have |
5004 | * been set. */ |
5005 | if (status == RA_MATCH) { |
5006 | status = RA_NOMATCH; |
5007 | restore_subexpr(((regbehind_T *)rp) - 1); |
5008 | } |
5009 | } |
5010 | regstack_pop(&scan); |
5011 | regstack.ga_len -= sizeof(regbehind_T); |
5012 | } |
5013 | } |
5014 | break; |
5015 | |
5016 | case RS_STAR_LONG: |
5017 | case RS_STAR_SHORT: |
5018 | { |
5019 | regstar_T *rst = ((regstar_T *)rp) - 1; |
5020 | |
5021 | if (status == RA_MATCH) { |
5022 | regstack_pop(&scan); |
5023 | regstack.ga_len -= sizeof(regstar_T); |
5024 | break; |
5025 | } |
5026 | |
5027 | /* Tried once already, restore input pointers. */ |
5028 | if (status != RA_BREAK) |
5029 | reg_restore(&rp->rs_un.regsave, &backpos); |
5030 | |
5031 | /* Repeat until we found a position where it could match. */ |
5032 | for (;; ) { |
5033 | if (status != RA_BREAK) { |
5034 | /* Tried first position already, advance. */ |
5035 | if (rp->rs_state == RS_STAR_LONG) { |
5036 | /* Trying for longest match, but couldn't or |
5037 | * didn't match -- back up one char. */ |
5038 | if (--rst->count < rst->minval) |
5039 | break; |
5040 | if (reginput == regline) { |
5041 | // backup to last char of previous line |
5042 | reglnum--; |
5043 | regline = reg_getline(reglnum); |
5044 | // Just in case regrepeat() didn't count right. |
5045 | if (regline == NULL) { |
5046 | break; |
5047 | } |
5048 | reginput = regline + STRLEN(regline); |
5049 | fast_breakcheck(); |
5050 | } else { |
5051 | MB_PTR_BACK(regline, reginput); |
5052 | } |
5053 | } else { |
5054 | /* Range is backwards, use shortest match first. |
5055 | * Careful: maxval and minval are exchanged! |
5056 | * Couldn't or didn't match: try advancing one |
5057 | * char. */ |
5058 | if (rst->count == rst->minval |
5059 | || regrepeat(OPERAND(rp->rs_scan), 1L) == 0) |
5060 | break; |
5061 | ++rst->count; |
5062 | } |
5063 | if (got_int) |
5064 | break; |
5065 | } else |
5066 | status = RA_NOMATCH; |
5067 | |
5068 | /* If it could match, try it. */ |
5069 | if (rst->nextb == NUL || *reginput == rst->nextb |
5070 | || *reginput == rst->nextb_ic) { |
5071 | reg_save(&rp->rs_un.regsave, &backpos); |
5072 | scan = regnext(rp->rs_scan); |
5073 | status = RA_CONT; |
5074 | break; |
5075 | } |
5076 | } |
5077 | if (status != RA_CONT) { |
5078 | /* Failed. */ |
5079 | regstack_pop(&scan); |
5080 | regstack.ga_len -= sizeof(regstar_T); |
5081 | status = RA_NOMATCH; |
5082 | } |
5083 | } |
5084 | break; |
5085 | } |
5086 | |
5087 | /* If we want to continue the inner loop or didn't pop a state |
5088 | * continue matching loop */ |
5089 | if (status == RA_CONT || rp == (regitem_T *) |
5090 | ((char *)regstack.ga_data + regstack.ga_len) - 1) |
5091 | break; |
5092 | } |
5093 | |
5094 | /* May need to continue with the inner loop, starting at "scan". */ |
5095 | if (status == RA_CONT) |
5096 | continue; |
5097 | |
5098 | /* |
5099 | * If the regstack is empty or something failed we are done. |
5100 | */ |
5101 | if (GA_EMPTY(®stack) || status == RA_FAIL) { |
5102 | if (scan == NULL) { |
5103 | /* |
5104 | * We get here only if there's trouble -- normally "case END" is |
5105 | * the terminating point. |
5106 | */ |
5107 | EMSG(_(e_re_corr)); |
5108 | #ifdef REGEXP_DEBUG |
5109 | printf("Premature EOL\n" ); |
5110 | #endif |
5111 | } |
5112 | return status == RA_MATCH; |
5113 | } |
5114 | |
5115 | } /* End of loop until the regstack is empty. */ |
5116 | |
5117 | /* NOTREACHED */ |
5118 | } |
5119 | |
5120 | /* |
5121 | * Push an item onto the regstack. |
5122 | * Returns pointer to new item. Returns NULL when out of memory. |
5123 | */ |
5124 | static regitem_T *regstack_push(regstate_T state, char_u *scan) |
5125 | { |
5126 | regitem_T *rp; |
5127 | |
5128 | if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp) { |
5129 | EMSG(_(e_maxmempat)); |
5130 | return NULL; |
5131 | } |
5132 | ga_grow(®stack, sizeof(regitem_T)); |
5133 | |
5134 | rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len); |
5135 | rp->rs_state = state; |
5136 | rp->rs_scan = scan; |
5137 | |
5138 | regstack.ga_len += sizeof(regitem_T); |
5139 | return rp; |
5140 | } |
5141 | |
5142 | /* |
5143 | * Pop an item from the regstack. |
5144 | */ |
5145 | static void regstack_pop(char_u **scan) |
5146 | { |
5147 | regitem_T *rp; |
5148 | |
5149 | rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1; |
5150 | *scan = rp->rs_scan; |
5151 | |
5152 | regstack.ga_len -= sizeof(regitem_T); |
5153 | } |
5154 | |
5155 | /* |
5156 | * regrepeat - repeatedly match something simple, return how many. |
5157 | * Advances reginput (and reglnum) to just after the matched chars. |
5158 | */ |
5159 | static int |
5160 | regrepeat ( |
5161 | char_u *p, |
5162 | long maxcount /* maximum number of matches allowed */ |
5163 | ) |
5164 | { |
5165 | long count = 0; |
5166 | char_u *scan; |
5167 | char_u *opnd; |
5168 | int mask; |
5169 | int testval = 0; |
5170 | |
5171 | scan = reginput; /* Make local copy of reginput for speed. */ |
5172 | opnd = OPERAND(p); |
5173 | switch (OP(p)) { |
5174 | case ANY: |
5175 | case ANY + ADD_NL: |
5176 | while (count < maxcount) { |
5177 | /* Matching anything means we continue until end-of-line (or |
5178 | * end-of-file for ANY + ADD_NL), only limited by maxcount. */ |
5179 | while (*scan != NUL && count < maxcount) { |
5180 | count++; |
5181 | MB_PTR_ADV(scan); |
5182 | } |
5183 | if (!REG_MULTI || !WITH_NL(OP(p)) || reglnum > rex.reg_maxline |
5184 | || rex.reg_line_lbr || count == maxcount) { |
5185 | break; |
5186 | } |
5187 | count++; // count the line-break |
5188 | reg_nextline(); |
5189 | scan = reginput; |
5190 | if (got_int) |
5191 | break; |
5192 | } |
5193 | break; |
5194 | |
5195 | case IDENT: |
5196 | case IDENT + ADD_NL: |
5197 | testval = 1; |
5198 | FALLTHROUGH; |
5199 | case SIDENT: |
5200 | case SIDENT + ADD_NL: |
5201 | while (count < maxcount) { |
5202 | if (vim_isIDc(PTR2CHAR(scan)) && (testval || !ascii_isdigit(*scan))) { |
5203 | MB_PTR_ADV(scan); |
5204 | } else if (*scan == NUL) { |
5205 | if (!REG_MULTI || !WITH_NL(OP(p)) || reglnum > rex.reg_maxline |
5206 | || rex.reg_line_lbr) { |
5207 | break; |
5208 | } |
5209 | reg_nextline(); |
5210 | scan = reginput; |
5211 | if (got_int) |
5212 | break; |
5213 | } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) { |
5214 | scan++; |
5215 | } else { |
5216 | break; |
5217 | } |
5218 | ++count; |
5219 | } |
5220 | break; |
5221 | |
5222 | case KWORD: |
5223 | case KWORD + ADD_NL: |
5224 | testval = 1; |
5225 | FALLTHROUGH; |
5226 | case SKWORD: |
5227 | case SKWORD + ADD_NL: |
5228 | while (count < maxcount) { |
5229 | if (vim_iswordp_buf(scan, rex.reg_buf) |
5230 | && (testval || !ascii_isdigit(*scan))) { |
5231 | MB_PTR_ADV(scan); |
5232 | } else if (*scan == NUL) { |
5233 | if (!REG_MULTI || !WITH_NL(OP(p)) || reglnum > rex.reg_maxline |
5234 | || rex.reg_line_lbr) { |
5235 | break; |
5236 | } |
5237 | reg_nextline(); |
5238 | scan = reginput; |
5239 | if (got_int) { |
5240 | break; |
5241 | } |
5242 | } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) { |
5243 | scan++; |
5244 | } else { |
5245 | break; |
5246 | } |
5247 | count++; |
5248 | } |
5249 | break; |
5250 | |
5251 | case FNAME: |
5252 | case FNAME + ADD_NL: |
5253 | testval = 1; |
5254 | FALLTHROUGH; |
5255 | case SFNAME: |
5256 | case SFNAME + ADD_NL: |
5257 | while (count < maxcount) { |
5258 | if (vim_isfilec(PTR2CHAR(scan)) && (testval || !ascii_isdigit(*scan))) { |
5259 | MB_PTR_ADV(scan); |
5260 | } else if (*scan == NUL) { |
5261 | if (!REG_MULTI || !WITH_NL(OP(p)) || reglnum > rex.reg_maxline |
5262 | || rex.reg_line_lbr) { |
5263 | break; |
5264 | } |
5265 | reg_nextline(); |
5266 | scan = reginput; |
5267 | if (got_int) { |
5268 | break; |
5269 | } |
5270 | } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) { |
5271 | scan++; |
5272 | } else { |
5273 | break; |
5274 | } |
5275 | count++; |
5276 | } |
5277 | break; |
5278 | |
5279 | case PRINT: |
5280 | case PRINT + ADD_NL: |
5281 | testval = 1; |
5282 | FALLTHROUGH; |
5283 | case SPRINT: |
5284 | case SPRINT + ADD_NL: |
5285 | while (count < maxcount) { |
5286 | if (*scan == NUL) { |
5287 | if (!REG_MULTI || !WITH_NL(OP(p)) || reglnum > rex.reg_maxline |
5288 | || rex.reg_line_lbr) { |
5289 | break; |
5290 | } |
5291 | reg_nextline(); |
5292 | scan = reginput; |
5293 | if (got_int) { |
5294 | break; |
5295 | } |
5296 | } else if (vim_isprintc(PTR2CHAR(scan)) == 1 |
5297 | && (testval || !ascii_isdigit(*scan))) { |
5298 | MB_PTR_ADV(scan); |
5299 | } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) { |
5300 | scan++; |
5301 | } else { |
5302 | break; |
5303 | } |
5304 | count++; |
5305 | } |
5306 | break; |
5307 | |
5308 | case WHITE: |
5309 | case WHITE + ADD_NL: |
5310 | testval = mask = RI_WHITE; |
5311 | do_class: |
5312 | while (count < maxcount) { |
5313 | int l; |
5314 | if (*scan == NUL) { |
5315 | if (!REG_MULTI || !WITH_NL(OP(p)) || reglnum > rex.reg_maxline |
5316 | || rex.reg_line_lbr) { |
5317 | break; |
5318 | } |
5319 | reg_nextline(); |
5320 | scan = reginput; |
5321 | if (got_int) |
5322 | break; |
5323 | } else if (has_mbyte && (l = (*mb_ptr2len)(scan)) > 1) { |
5324 | if (testval != 0) |
5325 | break; |
5326 | scan += l; |
5327 | } else if ((class_tab[*scan] & mask) == testval) { |
5328 | scan++; |
5329 | } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) { |
5330 | scan++; |
5331 | } else { |
5332 | break; |
5333 | } |
5334 | ++count; |
5335 | } |
5336 | break; |
5337 | |
5338 | case NWHITE: |
5339 | case NWHITE + ADD_NL: |
5340 | mask = RI_WHITE; |
5341 | goto do_class; |
5342 | case DIGIT: |
5343 | case DIGIT + ADD_NL: |
5344 | testval = mask = RI_DIGIT; |
5345 | goto do_class; |
5346 | case NDIGIT: |
5347 | case NDIGIT + ADD_NL: |
5348 | mask = RI_DIGIT; |
5349 | goto do_class; |
5350 | case HEX: |
5351 | case HEX + ADD_NL: |
5352 | testval = mask = RI_HEX; |
5353 | goto do_class; |
5354 | case NHEX: |
5355 | case NHEX + ADD_NL: |
5356 | mask = RI_HEX; |
5357 | goto do_class; |
5358 | case OCTAL: |
5359 | case OCTAL + ADD_NL: |
5360 | testval = mask = RI_OCTAL; |
5361 | goto do_class; |
5362 | case NOCTAL: |
5363 | case NOCTAL + ADD_NL: |
5364 | mask = RI_OCTAL; |
5365 | goto do_class; |
5366 | case WORD: |
5367 | case WORD + ADD_NL: |
5368 | testval = mask = RI_WORD; |
5369 | goto do_class; |
5370 | case NWORD: |
5371 | case NWORD + ADD_NL: |
5372 | mask = RI_WORD; |
5373 | goto do_class; |
5374 | case HEAD: |
5375 | case HEAD + ADD_NL: |
5376 | testval = mask = RI_HEAD; |
5377 | goto do_class; |
5378 | case NHEAD: |
5379 | case NHEAD + ADD_NL: |
5380 | mask = RI_HEAD; |
5381 | goto do_class; |
5382 | case ALPHA: |
5383 | case ALPHA + ADD_NL: |
5384 | testval = mask = RI_ALPHA; |
5385 | goto do_class; |
5386 | case NALPHA: |
5387 | case NALPHA + ADD_NL: |
5388 | mask = RI_ALPHA; |
5389 | goto do_class; |
5390 | case LOWER: |
5391 | case LOWER + ADD_NL: |
5392 | testval = mask = RI_LOWER; |
5393 | goto do_class; |
5394 | case NLOWER: |
5395 | case NLOWER + ADD_NL: |
5396 | mask = RI_LOWER; |
5397 | goto do_class; |
5398 | case UPPER: |
5399 | case UPPER + ADD_NL: |
5400 | testval = mask = RI_UPPER; |
5401 | goto do_class; |
5402 | case NUPPER: |
5403 | case NUPPER + ADD_NL: |
5404 | mask = RI_UPPER; |
5405 | goto do_class; |
5406 | |
5407 | case EXACTLY: |
5408 | { |
5409 | int cu, cl; |
5410 | |
5411 | // This doesn't do a multi-byte character, because a MULTIBYTECODE |
5412 | // would have been used for it. It does handle single-byte |
5413 | // characters, such as latin1. |
5414 | if (rex.reg_ic) { |
5415 | cu = mb_toupper(*opnd); |
5416 | cl = mb_tolower(*opnd); |
5417 | while (count < maxcount && (*scan == cu || *scan == cl)) { |
5418 | count++; |
5419 | scan++; |
5420 | } |
5421 | } else { |
5422 | cu = *opnd; |
5423 | while (count < maxcount && *scan == cu) { |
5424 | count++; |
5425 | scan++; |
5426 | } |
5427 | } |
5428 | break; |
5429 | } |
5430 | |
5431 | case MULTIBYTECODE: |
5432 | { |
5433 | int i, len, cf = 0; |
5434 | |
5435 | /* Safety check (just in case 'encoding' was changed since |
5436 | * compiling the program). */ |
5437 | if ((len = (*mb_ptr2len)(opnd)) > 1) { |
5438 | if (rex.reg_ic && enc_utf8) { |
5439 | cf = utf_fold(utf_ptr2char(opnd)); |
5440 | } |
5441 | while (count < maxcount && (*mb_ptr2len)(scan) >= len) { |
5442 | for (i = 0; i < len; ++i) { |
5443 | if (opnd[i] != scan[i]) { |
5444 | break; |
5445 | } |
5446 | } |
5447 | if (i < len && (!rex.reg_ic || !enc_utf8 |
5448 | || utf_fold(utf_ptr2char(scan)) != cf)) { |
5449 | break; |
5450 | } |
5451 | scan += len; |
5452 | ++count; |
5453 | } |
5454 | } |
5455 | } |
5456 | break; |
5457 | |
5458 | case ANYOF: |
5459 | case ANYOF + ADD_NL: |
5460 | testval = 1; |
5461 | FALLTHROUGH; |
5462 | |
5463 | case ANYBUT: |
5464 | case ANYBUT + ADD_NL: |
5465 | while (count < maxcount) { |
5466 | int len; |
5467 | if (*scan == NUL) { |
5468 | if (!REG_MULTI || !WITH_NL(OP(p)) || reglnum > rex.reg_maxline |
5469 | || rex.reg_line_lbr) { |
5470 | break; |
5471 | } |
5472 | reg_nextline(); |
5473 | scan = reginput; |
5474 | if (got_int) { |
5475 | break; |
5476 | } |
5477 | } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) { |
5478 | scan++; |
5479 | } else if ((len = utfc_ptr2len(scan)) > 1) { |
5480 | if ((cstrchr(opnd, utf_ptr2char(scan)) == NULL) == testval) { |
5481 | break; |
5482 | } |
5483 | scan += len; |
5484 | } else { |
5485 | if ((cstrchr(opnd, *scan) == NULL) == testval) |
5486 | break; |
5487 | ++scan; |
5488 | } |
5489 | ++count; |
5490 | } |
5491 | break; |
5492 | |
5493 | case NEWL: |
5494 | while (count < maxcount |
5495 | && ((*scan == NUL && reglnum <= rex.reg_maxline && !rex.reg_line_lbr |
5496 | && REG_MULTI) || (*scan == '\n' && rex.reg_line_lbr))) { |
5497 | count++; |
5498 | if (rex.reg_line_lbr) { |
5499 | ADVANCE_REGINPUT(); |
5500 | } else { |
5501 | reg_nextline(); |
5502 | } |
5503 | scan = reginput; |
5504 | if (got_int) |
5505 | break; |
5506 | } |
5507 | break; |
5508 | |
5509 | default: /* Oh dear. Called inappropriately. */ |
5510 | EMSG(_(e_re_corr)); |
5511 | #ifdef REGEXP_DEBUG |
5512 | printf("Called regrepeat with op code %d\n" , OP(p)); |
5513 | #endif |
5514 | break; |
5515 | } |
5516 | |
5517 | reginput = scan; |
5518 | |
5519 | return (int)count; |
5520 | } |
5521 | |
5522 | /* |
5523 | * regnext - dig the "next" pointer out of a node |
5524 | * Returns NULL when calculating size, when there is no next item and when |
5525 | * there is an error. |
5526 | */ |
5527 | static char_u *regnext(char_u *p) |
5528 | FUNC_ATTR_NONNULL_ALL |
5529 | { |
5530 | int offset; |
5531 | |
5532 | if (p == JUST_CALC_SIZE || reg_toolong) |
5533 | return NULL; |
5534 | |
5535 | offset = NEXT(p); |
5536 | if (offset == 0) |
5537 | return NULL; |
5538 | |
5539 | if (OP(p) == BACK) |
5540 | return p - offset; |
5541 | else |
5542 | return p + offset; |
5543 | } |
5544 | |
5545 | /* |
5546 | * Check the regexp program for its magic number. |
5547 | * Return TRUE if it's wrong. |
5548 | */ |
5549 | static int prog_magic_wrong(void) |
5550 | { |
5551 | regprog_T *prog; |
5552 | |
5553 | prog = REG_MULTI ? rex.reg_mmatch->regprog : rex.reg_match->regprog; |
5554 | if (prog->engine == &nfa_regengine) { |
5555 | // For NFA matcher we don't check the magic |
5556 | return false; |
5557 | } |
5558 | |
5559 | if (UCHARAT(((bt_regprog_T *)prog)->program) != REGMAGIC) { |
5560 | EMSG(_(e_re_corr)); |
5561 | return TRUE; |
5562 | } |
5563 | return FALSE; |
5564 | } |
5565 | |
5566 | /* |
5567 | * Cleanup the subexpressions, if this wasn't done yet. |
5568 | * This construction is used to clear the subexpressions only when they are |
5569 | * used (to increase speed). |
5570 | */ |
5571 | static void cleanup_subexpr(void) |
5572 | { |
5573 | if (need_clear_subexpr) { |
5574 | if (REG_MULTI) { |
5575 | // Use 0xff to set lnum to -1 |
5576 | memset(rex.reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP); |
5577 | memset(rex.reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP); |
5578 | } else { |
5579 | memset(rex.reg_startp, 0, sizeof(char_u *) * NSUBEXP); |
5580 | memset(rex.reg_endp, 0, sizeof(char_u *) * NSUBEXP); |
5581 | } |
5582 | need_clear_subexpr = FALSE; |
5583 | } |
5584 | } |
5585 | |
5586 | static void cleanup_zsubexpr(void) |
5587 | { |
5588 | if (need_clear_zsubexpr) { |
5589 | if (REG_MULTI) { |
5590 | /* Use 0xff to set lnum to -1 */ |
5591 | memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP); |
5592 | memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP); |
5593 | } else { |
5594 | memset(reg_startzp, 0, sizeof(char_u *) * NSUBEXP); |
5595 | memset(reg_endzp, 0, sizeof(char_u *) * NSUBEXP); |
5596 | } |
5597 | need_clear_zsubexpr = FALSE; |
5598 | } |
5599 | } |
5600 | |
5601 | /* |
5602 | * Save the current subexpr to "bp", so that they can be restored |
5603 | * later by restore_subexpr(). |
5604 | */ |
5605 | static void save_subexpr(regbehind_T *bp) |
5606 | { |
5607 | int i; |
5608 | |
5609 | // When "need_clear_subexpr" is set we don't need to save the values, only |
5610 | // remember that this flag needs to be set again when restoring. |
5611 | bp->save_need_clear_subexpr = need_clear_subexpr; |
5612 | if (!need_clear_subexpr) { |
5613 | for (i = 0; i < NSUBEXP; ++i) { |
5614 | if (REG_MULTI) { |
5615 | bp->save_start[i].se_u.pos = rex.reg_startpos[i]; |
5616 | bp->save_end[i].se_u.pos = rex.reg_endpos[i]; |
5617 | } else { |
5618 | bp->save_start[i].se_u.ptr = rex.reg_startp[i]; |
5619 | bp->save_end[i].se_u.ptr = rex.reg_endp[i]; |
5620 | } |
5621 | } |
5622 | } |
5623 | } |
5624 | |
5625 | /* |
5626 | * Restore the subexpr from "bp". |
5627 | */ |
5628 | static void restore_subexpr(regbehind_T *bp) |
5629 | { |
5630 | int i; |
5631 | |
5632 | /* Only need to restore saved values when they are not to be cleared. */ |
5633 | need_clear_subexpr = bp->save_need_clear_subexpr; |
5634 | if (!need_clear_subexpr) { |
5635 | for (i = 0; i < NSUBEXP; ++i) { |
5636 | if (REG_MULTI) { |
5637 | rex.reg_startpos[i] = bp->save_start[i].se_u.pos; |
5638 | rex.reg_endpos[i] = bp->save_end[i].se_u.pos; |
5639 | } else { |
5640 | rex.reg_startp[i] = bp->save_start[i].se_u.ptr; |
5641 | rex.reg_endp[i] = bp->save_end[i].se_u.ptr; |
5642 | } |
5643 | } |
5644 | } |
5645 | } |
5646 | |
5647 | /* |
5648 | * Advance reglnum, regline and reginput to the next line. |
5649 | */ |
5650 | static void reg_nextline(void) |
5651 | { |
5652 | regline = reg_getline(++reglnum); |
5653 | reginput = regline; |
5654 | fast_breakcheck(); |
5655 | } |
5656 | |
5657 | /* |
5658 | * Save the input line and position in a regsave_T. |
5659 | */ |
5660 | static void reg_save(regsave_T *save, garray_T *gap) |
5661 | { |
5662 | if (REG_MULTI) { |
5663 | save->rs_u.pos.col = (colnr_T)(reginput - regline); |
5664 | save->rs_u.pos.lnum = reglnum; |
5665 | } else |
5666 | save->rs_u.ptr = reginput; |
5667 | save->rs_len = gap->ga_len; |
5668 | } |
5669 | |
5670 | /* |
5671 | * Restore the input line and position from a regsave_T. |
5672 | */ |
5673 | static void reg_restore(regsave_T *save, garray_T *gap) |
5674 | { |
5675 | if (REG_MULTI) { |
5676 | if (reglnum != save->rs_u.pos.lnum) { |
5677 | /* only call reg_getline() when the line number changed to save |
5678 | * a bit of time */ |
5679 | reglnum = save->rs_u.pos.lnum; |
5680 | regline = reg_getline(reglnum); |
5681 | } |
5682 | reginput = regline + save->rs_u.pos.col; |
5683 | } else |
5684 | reginput = save->rs_u.ptr; |
5685 | gap->ga_len = save->rs_len; |
5686 | } |
5687 | |
5688 | /* |
5689 | * Return TRUE if current position is equal to saved position. |
5690 | */ |
5691 | static int reg_save_equal(regsave_T *save) |
5692 | { |
5693 | if (REG_MULTI) |
5694 | return reglnum == save->rs_u.pos.lnum |
5695 | && reginput == regline + save->rs_u.pos.col; |
5696 | return reginput == save->rs_u.ptr; |
5697 | } |
5698 | |
5699 | /* |
5700 | * Tentatively set the sub-expression start to the current position (after |
5701 | * calling regmatch() they will have changed). Need to save the existing |
5702 | * values for when there is no match. |
5703 | * Use se_save() to use pointer (save_se_multi()) or position (save_se_one()), |
5704 | * depending on REG_MULTI. |
5705 | */ |
5706 | static void save_se_multi(save_se_T *savep, lpos_T *posp) |
5707 | { |
5708 | savep->se_u.pos = *posp; |
5709 | posp->lnum = reglnum; |
5710 | posp->col = (colnr_T)(reginput - regline); |
5711 | } |
5712 | |
5713 | static void save_se_one(save_se_T *savep, char_u **pp) |
5714 | { |
5715 | savep->se_u.ptr = *pp; |
5716 | *pp = reginput; |
5717 | } |
5718 | |
5719 | /* |
5720 | * Compare a number with the operand of RE_LNUM, RE_COL or RE_VCOL. |
5721 | */ |
5722 | static int re_num_cmp(uint32_t val, char_u *scan) |
5723 | { |
5724 | uint32_t n = (uint32_t)OPERAND_MIN(scan); |
5725 | |
5726 | if (OPERAND_CMP(scan) == '>') |
5727 | return val > n; |
5728 | if (OPERAND_CMP(scan) == '<') |
5729 | return val < n; |
5730 | return val == n; |
5731 | } |
5732 | |
5733 | /* |
5734 | * Check whether a backreference matches. |
5735 | * Returns RA_FAIL, RA_NOMATCH or RA_MATCH. |
5736 | * If "bytelen" is not NULL, it is set to the byte length of the match in the |
5737 | * last line. |
5738 | */ |
5739 | static int match_with_backref(linenr_T start_lnum, colnr_T start_col, linenr_T end_lnum, colnr_T end_col, int *bytelen) |
5740 | { |
5741 | linenr_T clnum = start_lnum; |
5742 | colnr_T ccol = start_col; |
5743 | int len; |
5744 | char_u *p; |
5745 | |
5746 | if (bytelen != NULL) |
5747 | *bytelen = 0; |
5748 | for (;; ) { |
5749 | /* Since getting one line may invalidate the other, need to make copy. |
5750 | * Slow! */ |
5751 | if (regline != reg_tofree) { |
5752 | len = (int)STRLEN(regline); |
5753 | if (reg_tofree == NULL || len >= (int)reg_tofreelen) { |
5754 | len += 50; /* get some extra */ |
5755 | xfree(reg_tofree); |
5756 | reg_tofree = xmalloc(len); |
5757 | reg_tofreelen = len; |
5758 | } |
5759 | STRCPY(reg_tofree, regline); |
5760 | reginput = reg_tofree + (reginput - regline); |
5761 | regline = reg_tofree; |
5762 | } |
5763 | |
5764 | /* Get the line to compare with. */ |
5765 | p = reg_getline(clnum); |
5766 | assert(p); |
5767 | |
5768 | if (clnum == end_lnum) |
5769 | len = end_col - ccol; |
5770 | else |
5771 | len = (int)STRLEN(p + ccol); |
5772 | |
5773 | if (cstrncmp(p + ccol, reginput, &len) != 0) |
5774 | return RA_NOMATCH; /* doesn't match */ |
5775 | if (bytelen != NULL) |
5776 | *bytelen += len; |
5777 | if (clnum == end_lnum) { |
5778 | break; // match and at end! |
5779 | } |
5780 | if (reglnum >= rex.reg_maxline) { |
5781 | return RA_NOMATCH; // text too short |
5782 | } |
5783 | |
5784 | /* Advance to next line. */ |
5785 | reg_nextline(); |
5786 | if (bytelen != NULL) |
5787 | *bytelen = 0; |
5788 | ++clnum; |
5789 | ccol = 0; |
5790 | if (got_int) |
5791 | return RA_FAIL; |
5792 | } |
5793 | |
5794 | /* found a match! Note that regline may now point to a copy of the line, |
5795 | * that should not matter. */ |
5796 | return RA_MATCH; |
5797 | } |
5798 | |
5799 | #ifdef BT_REGEXP_DUMP |
5800 | |
5801 | /* |
5802 | * regdump - dump a regexp onto stdout in vaguely comprehensible form |
5803 | */ |
5804 | static void regdump(char_u *pattern, bt_regprog_T *r) |
5805 | { |
5806 | char_u *s; |
5807 | int op = EXACTLY; /* Arbitrary non-END op. */ |
5808 | char_u *next; |
5809 | char_u *end = NULL; |
5810 | FILE *f; |
5811 | |
5812 | #ifdef BT_REGEXP_LOG |
5813 | f = fopen("bt_regexp_log.log" , "a" ); |
5814 | #else |
5815 | f = stdout; |
5816 | #endif |
5817 | if (f == NULL) |
5818 | return; |
5819 | fprintf(f, "-------------------------------------\n\r\nregcomp(%s):\r\n" , |
5820 | pattern); |
5821 | |
5822 | s = r->program + 1; |
5823 | /* |
5824 | * Loop until we find the END that isn't before a referred next (an END |
5825 | * can also appear in a NOMATCH operand). |
5826 | */ |
5827 | while (op != END || s <= end) { |
5828 | op = OP(s); |
5829 | fprintf(f, "%2d%s" , (int)(s - r->program), regprop(s)); /* Where, what. */ |
5830 | next = regnext(s); |
5831 | if (next == NULL) /* Next ptr. */ |
5832 | fprintf(f, "(0)" ); |
5833 | else |
5834 | fprintf(f, "(%d)" , (int)((s - r->program) + (next - s))); |
5835 | if (end < next) |
5836 | end = next; |
5837 | if (op == BRACE_LIMITS) { |
5838 | /* Two ints */ |
5839 | fprintf(f, " minval %" PRId64 ", maxval %" PRId64, |
5840 | (int64_t)OPERAND_MIN(s), (int64_t)OPERAND_MAX(s)); |
5841 | s += 8; |
5842 | } else if (op == BEHIND || op == NOBEHIND) { |
5843 | /* one int */ |
5844 | fprintf(f, " count %" PRId64, (int64_t)OPERAND_MIN(s)); |
5845 | s += 4; |
5846 | } else if (op == RE_LNUM || op == RE_COL || op == RE_VCOL) { |
5847 | /* one int plus comperator */ |
5848 | fprintf(f, " count %" PRId64, (int64_t)OPERAND_MIN(s)); |
5849 | s += 5; |
5850 | } |
5851 | s += 3; |
5852 | if (op == ANYOF || op == ANYOF + ADD_NL |
5853 | || op == ANYBUT || op == ANYBUT + ADD_NL |
5854 | || op == EXACTLY) { |
5855 | /* Literal string, where present. */ |
5856 | fprintf(f, "\nxxxxxxxxx\n" ); |
5857 | while (*s != NUL) |
5858 | fprintf(f, "%c" , *s++); |
5859 | fprintf(f, "\nxxxxxxxxx\n" ); |
5860 | s++; |
5861 | } |
5862 | fprintf(f, "\r\n" ); |
5863 | } |
5864 | |
5865 | /* Header fields of interest. */ |
5866 | if (r->regstart != NUL) |
5867 | fprintf(f, "start `%s' 0x%x; " , r->regstart < 256 |
5868 | ? (char *)transchar(r->regstart) |
5869 | : "multibyte" , r->regstart); |
5870 | if (r->reganch) |
5871 | fprintf(f, "anchored; " ); |
5872 | if (r->regmust != NULL) |
5873 | fprintf(f, "must have \"%s\"" , r->regmust); |
5874 | fprintf(f, "\r\n" ); |
5875 | |
5876 | #ifdef BT_REGEXP_LOG |
5877 | fclose(f); |
5878 | #endif |
5879 | } |
5880 | #endif /* BT_REGEXP_DUMP */ |
5881 | |
5882 | #ifdef REGEXP_DEBUG |
5883 | /* |
5884 | * regprop - printable representation of opcode |
5885 | */ |
5886 | static char_u *regprop(char_u *op) |
5887 | { |
5888 | char *p; |
5889 | static char buf[50]; |
5890 | |
5891 | STRCPY(buf, ":" ); |
5892 | |
5893 | switch ((int) OP(op)) { |
5894 | case BOL: |
5895 | p = "BOL" ; |
5896 | break; |
5897 | case EOL: |
5898 | p = "EOL" ; |
5899 | break; |
5900 | case RE_BOF: |
5901 | p = "BOF" ; |
5902 | break; |
5903 | case RE_EOF: |
5904 | p = "EOF" ; |
5905 | break; |
5906 | case CURSOR: |
5907 | p = "CURSOR" ; |
5908 | break; |
5909 | case RE_VISUAL: |
5910 | p = "RE_VISUAL" ; |
5911 | break; |
5912 | case RE_LNUM: |
5913 | p = "RE_LNUM" ; |
5914 | break; |
5915 | case RE_MARK: |
5916 | p = "RE_MARK" ; |
5917 | break; |
5918 | case RE_COL: |
5919 | p = "RE_COL" ; |
5920 | break; |
5921 | case RE_VCOL: |
5922 | p = "RE_VCOL" ; |
5923 | break; |
5924 | case BOW: |
5925 | p = "BOW" ; |
5926 | break; |
5927 | case EOW: |
5928 | p = "EOW" ; |
5929 | break; |
5930 | case ANY: |
5931 | p = "ANY" ; |
5932 | break; |
5933 | case ANY + ADD_NL: |
5934 | p = "ANY+NL" ; |
5935 | break; |
5936 | case ANYOF: |
5937 | p = "ANYOF" ; |
5938 | break; |
5939 | case ANYOF + ADD_NL: |
5940 | p = "ANYOF+NL" ; |
5941 | break; |
5942 | case ANYBUT: |
5943 | p = "ANYBUT" ; |
5944 | break; |
5945 | case ANYBUT + ADD_NL: |
5946 | p = "ANYBUT+NL" ; |
5947 | break; |
5948 | case IDENT: |
5949 | p = "IDENT" ; |
5950 | break; |
5951 | case IDENT + ADD_NL: |
5952 | p = "IDENT+NL" ; |
5953 | break; |
5954 | case SIDENT: |
5955 | p = "SIDENT" ; |
5956 | break; |
5957 | case SIDENT + ADD_NL: |
5958 | p = "SIDENT+NL" ; |
5959 | break; |
5960 | case KWORD: |
5961 | p = "KWORD" ; |
5962 | break; |
5963 | case KWORD + ADD_NL: |
5964 | p = "KWORD+NL" ; |
5965 | break; |
5966 | case SKWORD: |
5967 | p = "SKWORD" ; |
5968 | break; |
5969 | case SKWORD + ADD_NL: |
5970 | p = "SKWORD+NL" ; |
5971 | break; |
5972 | case FNAME: |
5973 | p = "FNAME" ; |
5974 | break; |
5975 | case FNAME + ADD_NL: |
5976 | p = "FNAME+NL" ; |
5977 | break; |
5978 | case SFNAME: |
5979 | p = "SFNAME" ; |
5980 | break; |
5981 | case SFNAME + ADD_NL: |
5982 | p = "SFNAME+NL" ; |
5983 | break; |
5984 | case PRINT: |
5985 | p = "PRINT" ; |
5986 | break; |
5987 | case PRINT + ADD_NL: |
5988 | p = "PRINT+NL" ; |
5989 | break; |
5990 | case SPRINT: |
5991 | p = "SPRINT" ; |
5992 | break; |
5993 | case SPRINT + ADD_NL: |
5994 | p = "SPRINT+NL" ; |
5995 | break; |
5996 | case WHITE: |
5997 | p = "WHITE" ; |
5998 | break; |
5999 | case WHITE + ADD_NL: |
6000 | p = "WHITE+NL" ; |
6001 | break; |
6002 | case NWHITE: |
6003 | p = "NWHITE" ; |
6004 | break; |
6005 | case NWHITE + ADD_NL: |
6006 | p = "NWHITE+NL" ; |
6007 | break; |
6008 | case DIGIT: |
6009 | p = "DIGIT" ; |
6010 | break; |
6011 | case DIGIT + ADD_NL: |
6012 | p = "DIGIT+NL" ; |
6013 | break; |
6014 | case NDIGIT: |
6015 | p = "NDIGIT" ; |
6016 | break; |
6017 | case NDIGIT + ADD_NL: |
6018 | p = "NDIGIT+NL" ; |
6019 | break; |
6020 | case HEX: |
6021 | p = "HEX" ; |
6022 | break; |
6023 | case HEX + ADD_NL: |
6024 | p = "HEX+NL" ; |
6025 | break; |
6026 | case NHEX: |
6027 | p = "NHEX" ; |
6028 | break; |
6029 | case NHEX + ADD_NL: |
6030 | p = "NHEX+NL" ; |
6031 | break; |
6032 | case OCTAL: |
6033 | p = "OCTAL" ; |
6034 | break; |
6035 | case OCTAL + ADD_NL: |
6036 | p = "OCTAL+NL" ; |
6037 | break; |
6038 | case NOCTAL: |
6039 | p = "NOCTAL" ; |
6040 | break; |
6041 | case NOCTAL + ADD_NL: |
6042 | p = "NOCTAL+NL" ; |
6043 | break; |
6044 | case WORD: |
6045 | p = "WORD" ; |
6046 | break; |
6047 | case WORD + ADD_NL: |
6048 | p = "WORD+NL" ; |
6049 | break; |
6050 | case NWORD: |
6051 | p = "NWORD" ; |
6052 | break; |
6053 | case NWORD + ADD_NL: |
6054 | p = "NWORD+NL" ; |
6055 | break; |
6056 | case HEAD: |
6057 | p = "HEAD" ; |
6058 | break; |
6059 | case HEAD + ADD_NL: |
6060 | p = "HEAD+NL" ; |
6061 | break; |
6062 | case NHEAD: |
6063 | p = "NHEAD" ; |
6064 | break; |
6065 | case NHEAD + ADD_NL: |
6066 | p = "NHEAD+NL" ; |
6067 | break; |
6068 | case ALPHA: |
6069 | p = "ALPHA" ; |
6070 | break; |
6071 | case ALPHA + ADD_NL: |
6072 | p = "ALPHA+NL" ; |
6073 | break; |
6074 | case NALPHA: |
6075 | p = "NALPHA" ; |
6076 | break; |
6077 | case NALPHA + ADD_NL: |
6078 | p = "NALPHA+NL" ; |
6079 | break; |
6080 | case LOWER: |
6081 | p = "LOWER" ; |
6082 | break; |
6083 | case LOWER + ADD_NL: |
6084 | p = "LOWER+NL" ; |
6085 | break; |
6086 | case NLOWER: |
6087 | p = "NLOWER" ; |
6088 | break; |
6089 | case NLOWER + ADD_NL: |
6090 | p = "NLOWER+NL" ; |
6091 | break; |
6092 | case UPPER: |
6093 | p = "UPPER" ; |
6094 | break; |
6095 | case UPPER + ADD_NL: |
6096 | p = "UPPER+NL" ; |
6097 | break; |
6098 | case NUPPER: |
6099 | p = "NUPPER" ; |
6100 | break; |
6101 | case NUPPER + ADD_NL: |
6102 | p = "NUPPER+NL" ; |
6103 | break; |
6104 | case BRANCH: |
6105 | p = "BRANCH" ; |
6106 | break; |
6107 | case EXACTLY: |
6108 | p = "EXACTLY" ; |
6109 | break; |
6110 | case NOTHING: |
6111 | p = "NOTHING" ; |
6112 | break; |
6113 | case BACK: |
6114 | p = "BACK" ; |
6115 | break; |
6116 | case END: |
6117 | p = "END" ; |
6118 | break; |
6119 | case MOPEN + 0: |
6120 | p = "MATCH START" ; |
6121 | break; |
6122 | case MOPEN + 1: |
6123 | case MOPEN + 2: |
6124 | case MOPEN + 3: |
6125 | case MOPEN + 4: |
6126 | case MOPEN + 5: |
6127 | case MOPEN + 6: |
6128 | case MOPEN + 7: |
6129 | case MOPEN + 8: |
6130 | case MOPEN + 9: |
6131 | sprintf(buf + STRLEN(buf), "MOPEN%d" , OP(op) - MOPEN); |
6132 | p = NULL; |
6133 | break; |
6134 | case MCLOSE + 0: |
6135 | p = "MATCH END" ; |
6136 | break; |
6137 | case MCLOSE + 1: |
6138 | case MCLOSE + 2: |
6139 | case MCLOSE + 3: |
6140 | case MCLOSE + 4: |
6141 | case MCLOSE + 5: |
6142 | case MCLOSE + 6: |
6143 | case MCLOSE + 7: |
6144 | case MCLOSE + 8: |
6145 | case MCLOSE + 9: |
6146 | sprintf(buf + STRLEN(buf), "MCLOSE%d" , OP(op) - MCLOSE); |
6147 | p = NULL; |
6148 | break; |
6149 | case BACKREF + 1: |
6150 | case BACKREF + 2: |
6151 | case BACKREF + 3: |
6152 | case BACKREF + 4: |
6153 | case BACKREF + 5: |
6154 | case BACKREF + 6: |
6155 | case BACKREF + 7: |
6156 | case BACKREF + 8: |
6157 | case BACKREF + 9: |
6158 | sprintf(buf + STRLEN(buf), "BACKREF%d" , OP(op) - BACKREF); |
6159 | p = NULL; |
6160 | break; |
6161 | case NOPEN: |
6162 | p = "NOPEN" ; |
6163 | break; |
6164 | case NCLOSE: |
6165 | p = "NCLOSE" ; |
6166 | break; |
6167 | case ZOPEN + 1: |
6168 | case ZOPEN + 2: |
6169 | case ZOPEN + 3: |
6170 | case ZOPEN + 4: |
6171 | case ZOPEN + 5: |
6172 | case ZOPEN + 6: |
6173 | case ZOPEN + 7: |
6174 | case ZOPEN + 8: |
6175 | case ZOPEN + 9: |
6176 | sprintf(buf + STRLEN(buf), "ZOPEN%d" , OP(op) - ZOPEN); |
6177 | p = NULL; |
6178 | break; |
6179 | case ZCLOSE + 1: |
6180 | case ZCLOSE + 2: |
6181 | case ZCLOSE + 3: |
6182 | case ZCLOSE + 4: |
6183 | case ZCLOSE + 5: |
6184 | case ZCLOSE + 6: |
6185 | case ZCLOSE + 7: |
6186 | case ZCLOSE + 8: |
6187 | case ZCLOSE + 9: |
6188 | sprintf(buf + STRLEN(buf), "ZCLOSE%d" , OP(op) - ZCLOSE); |
6189 | p = NULL; |
6190 | break; |
6191 | case ZREF + 1: |
6192 | case ZREF + 2: |
6193 | case ZREF + 3: |
6194 | case ZREF + 4: |
6195 | case ZREF + 5: |
6196 | case ZREF + 6: |
6197 | case ZREF + 7: |
6198 | case ZREF + 8: |
6199 | case ZREF + 9: |
6200 | sprintf(buf + STRLEN(buf), "ZREF%d" , OP(op) - ZREF); |
6201 | p = NULL; |
6202 | break; |
6203 | case STAR: |
6204 | p = "STAR" ; |
6205 | break; |
6206 | case PLUS: |
6207 | p = "PLUS" ; |
6208 | break; |
6209 | case NOMATCH: |
6210 | p = "NOMATCH" ; |
6211 | break; |
6212 | case MATCH: |
6213 | p = "MATCH" ; |
6214 | break; |
6215 | case BEHIND: |
6216 | p = "BEHIND" ; |
6217 | break; |
6218 | case NOBEHIND: |
6219 | p = "NOBEHIND" ; |
6220 | break; |
6221 | case SUBPAT: |
6222 | p = "SUBPAT" ; |
6223 | break; |
6224 | case BRACE_LIMITS: |
6225 | p = "BRACE_LIMITS" ; |
6226 | break; |
6227 | case BRACE_SIMPLE: |
6228 | p = "BRACE_SIMPLE" ; |
6229 | break; |
6230 | case BRACE_COMPLEX + 0: |
6231 | case BRACE_COMPLEX + 1: |
6232 | case BRACE_COMPLEX + 2: |
6233 | case BRACE_COMPLEX + 3: |
6234 | case BRACE_COMPLEX + 4: |
6235 | case BRACE_COMPLEX + 5: |
6236 | case BRACE_COMPLEX + 6: |
6237 | case BRACE_COMPLEX + 7: |
6238 | case BRACE_COMPLEX + 8: |
6239 | case BRACE_COMPLEX + 9: |
6240 | sprintf(buf + STRLEN(buf), "BRACE_COMPLEX%d" , OP(op) - BRACE_COMPLEX); |
6241 | p = NULL; |
6242 | break; |
6243 | case MULTIBYTECODE: |
6244 | p = "MULTIBYTECODE" ; |
6245 | break; |
6246 | case NEWL: |
6247 | p = "NEWL" ; |
6248 | break; |
6249 | default: |
6250 | sprintf(buf + STRLEN(buf), "corrupt %d" , OP(op)); |
6251 | p = NULL; |
6252 | break; |
6253 | } |
6254 | if (p != NULL) |
6255 | STRCAT(buf, p); |
6256 | return (char_u *)buf; |
6257 | } |
6258 | #endif /* REGEXP_DEBUG */ |
6259 | |
6260 | |
6261 | |
6262 | /* 0xfb20 - 0xfb4f */ |
6263 | static decomp_T decomp_table[0xfb4f-0xfb20+1] = |
6264 | { |
6265 | {0x5e2,0,0}, /* 0xfb20 alt ayin */ |
6266 | {0x5d0,0,0}, /* 0xfb21 alt alef */ |
6267 | {0x5d3,0,0}, /* 0xfb22 alt dalet */ |
6268 | {0x5d4,0,0}, /* 0xfb23 alt he */ |
6269 | {0x5db,0,0}, /* 0xfb24 alt kaf */ |
6270 | {0x5dc,0,0}, /* 0xfb25 alt lamed */ |
6271 | {0x5dd,0,0}, /* 0xfb26 alt mem-sofit */ |
6272 | {0x5e8,0,0}, /* 0xfb27 alt resh */ |
6273 | {0x5ea,0,0}, /* 0xfb28 alt tav */ |
6274 | {'+', 0, 0}, /* 0xfb29 alt plus */ |
6275 | {0x5e9, 0x5c1, 0}, /* 0xfb2a shin+shin-dot */ |
6276 | {0x5e9, 0x5c2, 0}, /* 0xfb2b shin+sin-dot */ |
6277 | {0x5e9, 0x5c1, 0x5bc}, /* 0xfb2c shin+shin-dot+dagesh */ |
6278 | {0x5e9, 0x5c2, 0x5bc}, /* 0xfb2d shin+sin-dot+dagesh */ |
6279 | {0x5d0, 0x5b7, 0}, /* 0xfb2e alef+patah */ |
6280 | {0x5d0, 0x5b8, 0}, /* 0xfb2f alef+qamats */ |
6281 | {0x5d0, 0x5b4, 0}, /* 0xfb30 alef+hiriq */ |
6282 | {0x5d1, 0x5bc, 0}, /* 0xfb31 bet+dagesh */ |
6283 | {0x5d2, 0x5bc, 0}, /* 0xfb32 gimel+dagesh */ |
6284 | {0x5d3, 0x5bc, 0}, /* 0xfb33 dalet+dagesh */ |
6285 | {0x5d4, 0x5bc, 0}, /* 0xfb34 he+dagesh */ |
6286 | {0x5d5, 0x5bc, 0}, /* 0xfb35 vav+dagesh */ |
6287 | {0x5d6, 0x5bc, 0}, /* 0xfb36 zayin+dagesh */ |
6288 | {0xfb37, 0, 0}, /* 0xfb37 -- */ |
6289 | {0x5d8, 0x5bc, 0}, /* 0xfb38 tet+dagesh */ |
6290 | {0x5d9, 0x5bc, 0}, /* 0xfb39 yud+dagesh */ |
6291 | {0x5da, 0x5bc, 0}, /* 0xfb3a kaf sofit+dagesh */ |
6292 | {0x5db, 0x5bc, 0}, /* 0xfb3b kaf+dagesh */ |
6293 | {0x5dc, 0x5bc, 0}, /* 0xfb3c lamed+dagesh */ |
6294 | {0xfb3d, 0, 0}, /* 0xfb3d -- */ |
6295 | {0x5de, 0x5bc, 0}, /* 0xfb3e mem+dagesh */ |
6296 | {0xfb3f, 0, 0}, /* 0xfb3f -- */ |
6297 | {0x5e0, 0x5bc, 0}, /* 0xfb40 nun+dagesh */ |
6298 | {0x5e1, 0x5bc, 0}, /* 0xfb41 samech+dagesh */ |
6299 | {0xfb42, 0, 0}, /* 0xfb42 -- */ |
6300 | {0x5e3, 0x5bc, 0}, /* 0xfb43 pe sofit+dagesh */ |
6301 | {0x5e4, 0x5bc,0}, /* 0xfb44 pe+dagesh */ |
6302 | {0xfb45, 0, 0}, /* 0xfb45 -- */ |
6303 | {0x5e6, 0x5bc, 0}, /* 0xfb46 tsadi+dagesh */ |
6304 | {0x5e7, 0x5bc, 0}, /* 0xfb47 qof+dagesh */ |
6305 | {0x5e8, 0x5bc, 0}, /* 0xfb48 resh+dagesh */ |
6306 | {0x5e9, 0x5bc, 0}, /* 0xfb49 shin+dagesh */ |
6307 | {0x5ea, 0x5bc, 0}, /* 0xfb4a tav+dagesh */ |
6308 | {0x5d5, 0x5b9, 0}, /* 0xfb4b vav+holam */ |
6309 | {0x5d1, 0x5bf, 0}, /* 0xfb4c bet+rafe */ |
6310 | {0x5db, 0x5bf, 0}, /* 0xfb4d kaf+rafe */ |
6311 | {0x5e4, 0x5bf, 0}, /* 0xfb4e pe+rafe */ |
6312 | {0x5d0, 0x5dc, 0} /* 0xfb4f alef-lamed */ |
6313 | }; |
6314 | |
6315 | static void mb_decompose(int c, int *c1, int *c2, int *c3) |
6316 | { |
6317 | decomp_T d; |
6318 | |
6319 | if (c >= 0xfb20 && c <= 0xfb4f) { |
6320 | d = decomp_table[c - 0xfb20]; |
6321 | *c1 = d.a; |
6322 | *c2 = d.b; |
6323 | *c3 = d.c; |
6324 | } else { |
6325 | *c1 = c; |
6326 | *c2 = *c3 = 0; |
6327 | } |
6328 | } |
6329 | |
6330 | // Compare two strings, ignore case if rex.reg_ic set. |
6331 | // Return 0 if strings match, non-zero otherwise. |
6332 | // Correct the length "*n" when composing characters are ignored. |
6333 | static int cstrncmp(char_u *s1, char_u *s2, int *n) |
6334 | { |
6335 | int result; |
6336 | |
6337 | if (!rex.reg_ic) { |
6338 | result = STRNCMP(s1, s2, *n); |
6339 | } else { |
6340 | assert(*n >= 0); |
6341 | result = mb_strnicmp(s1, s2, (size_t)*n); |
6342 | } |
6343 | |
6344 | // if it failed and it's utf8 and we want to combineignore: |
6345 | if (result != 0 && enc_utf8 && rex.reg_icombine) { |
6346 | char_u *str1, *str2; |
6347 | int c1, c2, c11, c12; |
6348 | int junk; |
6349 | |
6350 | /* we have to handle the strcmp ourselves, since it is necessary to |
6351 | * deal with the composing characters by ignoring them: */ |
6352 | str1 = s1; |
6353 | str2 = s2; |
6354 | c1 = c2 = 0; |
6355 | while ((int)(str1 - s1) < *n) { |
6356 | c1 = mb_ptr2char_adv((const char_u **)&str1); |
6357 | c2 = mb_ptr2char_adv((const char_u **)&str2); |
6358 | |
6359 | /* decompose the character if necessary, into 'base' characters |
6360 | * because I don't care about Arabic, I will hard-code the Hebrew |
6361 | * which I *do* care about! So sue me... */ |
6362 | if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2))) { |
6363 | // decomposition necessary? |
6364 | mb_decompose(c1, &c11, &junk, &junk); |
6365 | mb_decompose(c2, &c12, &junk, &junk); |
6366 | c1 = c11; |
6367 | c2 = c12; |
6368 | if (c11 != c12 && (!rex.reg_ic || utf_fold(c11) != utf_fold(c12))) { |
6369 | break; |
6370 | } |
6371 | } |
6372 | } |
6373 | result = c2 - c1; |
6374 | if (result == 0) |
6375 | *n = (int)(str2 - s2); |
6376 | } |
6377 | |
6378 | return result; |
6379 | } |
6380 | |
6381 | /*************************************************************** |
6382 | * regsub stuff * |
6383 | ***************************************************************/ |
6384 | |
6385 | /* This stuff below really confuses cc on an SGI -- webb */ |
6386 | |
6387 | |
6388 | |
6389 | static fptr_T do_upper(int *d, int c) |
6390 | { |
6391 | *d = mb_toupper(c); |
6392 | |
6393 | return (fptr_T)NULL; |
6394 | } |
6395 | |
6396 | static fptr_T do_Upper(int *d, int c) |
6397 | { |
6398 | *d = mb_toupper(c); |
6399 | |
6400 | return (fptr_T)do_Upper; |
6401 | } |
6402 | |
6403 | static fptr_T do_lower(int *d, int c) |
6404 | { |
6405 | *d = mb_tolower(c); |
6406 | |
6407 | return (fptr_T)NULL; |
6408 | } |
6409 | |
6410 | static fptr_T do_Lower(int *d, int c) |
6411 | { |
6412 | *d = mb_tolower(c); |
6413 | |
6414 | return (fptr_T)do_Lower; |
6415 | } |
6416 | |
6417 | /* |
6418 | * regtilde(): Replace tildes in the pattern by the old pattern. |
6419 | * |
6420 | * Short explanation of the tilde: It stands for the previous replacement |
6421 | * pattern. If that previous pattern also contains a ~ we should go back a |
6422 | * step further... But we insert the previous pattern into the current one |
6423 | * and remember that. |
6424 | * This still does not handle the case where "magic" changes. So require the |
6425 | * user to keep his hands off of "magic". |
6426 | * |
6427 | * The tildes are parsed once before the first call to vim_regsub(). |
6428 | */ |
6429 | char_u *regtilde(char_u *source, int magic) |
6430 | { |
6431 | char_u *newsub = source; |
6432 | char_u *tmpsub; |
6433 | char_u *p; |
6434 | int len; |
6435 | int prevlen; |
6436 | |
6437 | for (p = newsub; *p; ++p) { |
6438 | if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic)) { |
6439 | if (reg_prev_sub != NULL) { |
6440 | /* length = len(newsub) - 1 + len(prev_sub) + 1 */ |
6441 | prevlen = (int)STRLEN(reg_prev_sub); |
6442 | tmpsub = xmalloc(STRLEN(newsub) + prevlen); |
6443 | /* copy prefix */ |
6444 | len = (int)(p - newsub); /* not including ~ */ |
6445 | memmove(tmpsub, newsub, (size_t)len); |
6446 | /* interpret tilde */ |
6447 | memmove(tmpsub + len, reg_prev_sub, (size_t)prevlen); |
6448 | /* copy postfix */ |
6449 | if (!magic) |
6450 | ++p; /* back off \ */ |
6451 | STRCPY(tmpsub + len + prevlen, p + 1); |
6452 | |
6453 | if (newsub != source) /* already allocated newsub */ |
6454 | xfree(newsub); |
6455 | newsub = tmpsub; |
6456 | p = newsub + len + prevlen; |
6457 | } else if (magic) |
6458 | STRMOVE(p, p + 1); /* remove '~' */ |
6459 | else |
6460 | STRMOVE(p, p + 2); /* remove '\~' */ |
6461 | --p; |
6462 | } else { |
6463 | if (*p == '\\' && p[1]) /* skip escaped characters */ |
6464 | ++p; |
6465 | if (has_mbyte) |
6466 | p += (*mb_ptr2len)(p) - 1; |
6467 | } |
6468 | } |
6469 | |
6470 | xfree(reg_prev_sub); |
6471 | if (newsub != source) /* newsub was allocated, just keep it */ |
6472 | reg_prev_sub = newsub; |
6473 | else /* no ~ found, need to save newsub */ |
6474 | reg_prev_sub = vim_strsave(newsub); |
6475 | return newsub; |
6476 | } |
6477 | |
// Set while a substitute expression is being evaluated by vim_regsub_both();
// reg_submatch() and reg_submatch_list() return NULL when it is not set.
static int can_f_submatch = FALSE;      /* TRUE when submatch() can be used */

// These pointers are used for reg_submatch().  Needed for when the
// substitution string is an expression that contains a call to substitute()
// and submatch().
// The fields are copies of the corresponding "rex" members, taken by
// vim_regsub_both() before the expression is evaluated.
typedef struct {
  regmatch_T *sm_match;    // copy of rex.reg_match; NULL selects sm_mmatch
  regmmatch_T *sm_mmatch;  // copy of rex.reg_mmatch
  linenr_T sm_firstlnum;   // copy of rex.reg_firstlnum
  linenr_T sm_maxline;     // copy of rex.reg_maxline
  int sm_line_lbr;         // copy of rex.reg_line_lbr
} regsubmatch_T;

static regsubmatch_T rsm;  // can only be used when can_f_submatch is true
6492 | |
6493 | /// Put the submatches in "argv[0]" which is a list passed into call_func() by |
6494 | /// vim_regsub_both(). |
6495 | static int fill_submatch_list(int argc, typval_T *argv, int argcount) |
6496 | { |
6497 | if (argcount == 0) { |
6498 | // called function doesn't take an argument |
6499 | return 0; |
6500 | } |
6501 | |
6502 | // Relies on sl_list to be the first item in staticList10_T. |
6503 | tv_list_init_static10((staticList10_T *)argv->vval.v_list); |
6504 | |
6505 | // There are always 10 list items in staticList10_T. |
6506 | listitem_T *li = tv_list_first(argv->vval.v_list); |
6507 | for (int i = 0; i < 10; i++) { |
6508 | char_u *s = rsm.sm_match->startp[i]; |
6509 | if (s == NULL || rsm.sm_match->endp[i] == NULL) { |
6510 | s = NULL; |
6511 | } else { |
6512 | s = vim_strnsave(s, (int)(rsm.sm_match->endp[i] - s)); |
6513 | } |
6514 | TV_LIST_ITEM_TV(li)->v_type = VAR_STRING; |
6515 | TV_LIST_ITEM_TV(li)->vval.v_string = s; |
6516 | li = TV_LIST_ITEM_NEXT(argv->vval.v_list, li); |
6517 | } |
6518 | return 1; |
6519 | } |
6520 | |
6521 | static void clear_submatch_list(staticList10_T *sl) |
6522 | { |
6523 | TV_LIST_ITER(&sl->sl_list, li, { |
6524 | xfree(TV_LIST_ITEM_TV(li)->vval.v_string); |
6525 | }); |
6526 | } |
6527 | |
6528 | /// vim_regsub() - perform substitutions after a vim_regexec() or |
6529 | /// vim_regexec_multi() match. |
6530 | /// |
6531 | /// If "copy" is TRUE really copy into "dest". |
6532 | /// If "copy" is FALSE nothing is copied, this is just to find out the length |
6533 | /// of the result. |
6534 | /// |
6535 | /// If "backslash" is TRUE, a backslash will be removed later, need to double |
6536 | /// them to keep them, and insert a backslash before a CR to avoid it being |
6537 | /// replaced with a line break later. |
6538 | /// |
6539 | /// Note: The matched text must not change between the call of |
6540 | /// vim_regexec()/vim_regexec_multi() and vim_regsub()! It would make the back |
6541 | /// references invalid! |
6542 | /// |
6543 | /// Returns the size of the replacement, including terminating NUL. |
6544 | int vim_regsub(regmatch_T *rmp, char_u *source, typval_T *expr, char_u *dest, |
6545 | int copy, int magic, int backslash) |
6546 | { |
6547 | regexec_T rex_save; |
6548 | bool rex_in_use_save = rex_in_use; |
6549 | |
6550 | if (rex_in_use) { |
6551 | // Being called recursively, save the state. |
6552 | rex_save = rex; |
6553 | } |
6554 | rex_in_use = true; |
6555 | |
6556 | rex.reg_match = rmp; |
6557 | rex.reg_mmatch = NULL; |
6558 | rex.reg_maxline = 0; |
6559 | rex.reg_buf = curbuf; |
6560 | rex.reg_line_lbr = true; |
6561 | int result = vim_regsub_both(source, expr, dest, copy, magic, backslash); |
6562 | |
6563 | rex_in_use = rex_in_use_save; |
6564 | if (rex_in_use) { |
6565 | rex = rex_save; |
6566 | } |
6567 | |
6568 | return result; |
6569 | } |
6570 | |
6571 | int vim_regsub_multi(regmmatch_T *rmp, linenr_T lnum, char_u *source, char_u *dest, int copy, int magic, int backslash) |
6572 | { |
6573 | regexec_T rex_save; |
6574 | bool rex_in_use_save = rex_in_use; |
6575 | |
6576 | if (rex_in_use) { |
6577 | // Being called recursively, save the state. |
6578 | rex_save = rex; |
6579 | } |
6580 | rex_in_use = true; |
6581 | |
6582 | rex.reg_match = NULL; |
6583 | rex.reg_mmatch = rmp; |
6584 | rex.reg_buf = curbuf; // always works on the current buffer! |
6585 | rex.reg_firstlnum = lnum; |
6586 | rex.reg_maxline = curbuf->b_ml.ml_line_count - lnum; |
6587 | rex.reg_line_lbr = false; |
6588 | int result = vim_regsub_both(source, NULL, dest, copy, magic, backslash); |
6589 | |
6590 | rex_in_use = rex_in_use_save; |
6591 | if (rex_in_use) { |
6592 | rex = rex_save; |
6593 | } |
6594 | |
6595 | return result; |
6596 | } |
6597 | |
6598 | static int vim_regsub_both(char_u *source, typval_T *expr, char_u *dest, |
6599 | int copy, int magic, int backslash) |
6600 | { |
6601 | char_u *src; |
6602 | char_u *dst; |
6603 | char_u *s; |
6604 | int c; |
6605 | int cc; |
6606 | int no = -1; |
6607 | fptr_T func_all = (fptr_T)NULL; |
6608 | fptr_T func_one = (fptr_T)NULL; |
6609 | linenr_T clnum = 0; /* init for GCC */ |
6610 | int len = 0; /* init for GCC */ |
6611 | static char_u *eval_result = NULL; |
6612 | |
6613 | // Be paranoid... |
6614 | if ((source == NULL && expr == NULL) || dest == NULL) { |
6615 | EMSG(_(e_null)); |
6616 | return 0; |
6617 | } |
6618 | if (prog_magic_wrong()) |
6619 | return 0; |
6620 | src = source; |
6621 | dst = dest; |
6622 | |
6623 | // When the substitute part starts with "\=" evaluate it as an expression. |
6624 | if (expr != NULL || (source[0] == '\\' && source[1] == '=')) { |
6625 | // To make sure that the length doesn't change between checking the |
6626 | // length and copying the string, and to speed up things, the |
6627 | // resulting string is saved from the call with "copy" == FALSE to the |
6628 | // call with "copy" == TRUE. |
6629 | if (copy) { |
6630 | if (eval_result != NULL) { |
6631 | STRCPY(dest, eval_result); |
6632 | dst += STRLEN(eval_result); |
6633 | XFREE_CLEAR(eval_result); |
6634 | } |
6635 | } else { |
6636 | int prev_can_f_submatch = can_f_submatch; |
6637 | regsubmatch_T rsm_save; |
6638 | |
6639 | xfree(eval_result); |
6640 | |
6641 | // The expression may contain substitute(), which calls us |
6642 | // recursively. Make sure submatch() gets the text from the first |
6643 | // level. |
6644 | if (can_f_submatch) { |
6645 | rsm_save = rsm; |
6646 | } |
6647 | can_f_submatch = true; |
6648 | rsm.sm_match = rex.reg_match; |
6649 | rsm.sm_mmatch = rex.reg_mmatch; |
6650 | rsm.sm_firstlnum = rex.reg_firstlnum; |
6651 | rsm.sm_maxline = rex.reg_maxline; |
6652 | rsm.sm_line_lbr = rex.reg_line_lbr; |
6653 | |
6654 | if (expr != NULL) { |
6655 | typval_T argv[2]; |
6656 | int dummy; |
6657 | typval_T rettv; |
6658 | staticList10_T matchList = TV_LIST_STATIC10_INIT; |
6659 | |
6660 | rettv.v_type = VAR_STRING; |
6661 | rettv.vval.v_string = NULL; |
6662 | argv[0].v_type = VAR_LIST; |
6663 | argv[0].vval.v_list = &matchList.sl_list; |
6664 | if (expr->v_type == VAR_FUNC) { |
6665 | s = expr->vval.v_string; |
6666 | call_func(s, (int)STRLEN(s), &rettv, 1, argv, |
6667 | fill_submatch_list, 0L, 0L, &dummy, |
6668 | true, NULL, NULL); |
6669 | } else if (expr->v_type == VAR_PARTIAL) { |
6670 | partial_T *partial = expr->vval.v_partial; |
6671 | |
6672 | s = partial_name(partial); |
6673 | call_func(s, (int)STRLEN(s), &rettv, 1, argv, |
6674 | fill_submatch_list, 0L, 0L, &dummy, |
6675 | true, partial, NULL); |
6676 | } |
6677 | if (tv_list_len(&matchList.sl_list) > 0) { |
6678 | // fill_submatch_list() was called. |
6679 | clear_submatch_list(&matchList); |
6680 | } |
6681 | char buf[NUMBUFLEN]; |
6682 | eval_result = (char_u *)tv_get_string_buf_chk(&rettv, buf); |
6683 | if (eval_result != NULL) { |
6684 | eval_result = vim_strsave(eval_result); |
6685 | } |
6686 | tv_clear(&rettv); |
6687 | } else { |
6688 | eval_result = eval_to_string(source + 2, NULL, true); |
6689 | } |
6690 | |
6691 | if (eval_result != NULL) { |
6692 | int had_backslash = FALSE; |
6693 | |
6694 | for (s = eval_result; *s != NUL; MB_PTR_ADV(s)) { |
6695 | // Change NL to CR, so that it becomes a line break, |
6696 | // unless called from vim_regexec_nl(). |
6697 | // Skip over a backslashed character. |
6698 | if (*s == NL && !rsm.sm_line_lbr) { |
6699 | *s = CAR; |
6700 | } else if (*s == '\\' && s[1] != NUL) { |
6701 | s++; |
6702 | /* Change NL to CR here too, so that this works: |
6703 | * :s/abc\\\ndef/\="aaa\\\nbbb"/ on text: |
6704 | * abc\ |
6705 | * def |
6706 | * Not when called from vim_regexec_nl(). |
6707 | */ |
6708 | if (*s == NL && !rsm.sm_line_lbr) { |
6709 | *s = CAR; |
6710 | } |
6711 | had_backslash = true; |
6712 | } |
6713 | } |
6714 | if (had_backslash && backslash) { |
6715 | /* Backslashes will be consumed, need to double them. */ |
6716 | s = vim_strsave_escaped(eval_result, (char_u *)"\\" ); |
6717 | xfree(eval_result); |
6718 | eval_result = s; |
6719 | } |
6720 | |
6721 | dst += STRLEN(eval_result); |
6722 | } |
6723 | |
6724 | can_f_submatch = prev_can_f_submatch; |
6725 | if (can_f_submatch) { |
6726 | rsm = rsm_save; |
6727 | } |
6728 | } |
6729 | } else |
6730 | while ((c = *src++) != NUL) { |
6731 | if (c == '&' && magic) |
6732 | no = 0; |
6733 | else if (c == '\\' && *src != NUL) { |
6734 | if (*src == '&' && !magic) { |
6735 | ++src; |
6736 | no = 0; |
6737 | } else if ('0' <= *src && *src <= '9') { |
6738 | no = *src++ - '0'; |
6739 | } else if (vim_strchr((char_u *)"uUlLeE" , *src)) { |
6740 | switch (*src++) { |
6741 | case 'u': func_one = (fptr_T)do_upper; |
6742 | continue; |
6743 | case 'U': func_all = (fptr_T)do_Upper; |
6744 | continue; |
6745 | case 'l': func_one = (fptr_T)do_lower; |
6746 | continue; |
6747 | case 'L': func_all = (fptr_T)do_Lower; |
6748 | continue; |
6749 | case 'e': |
6750 | case 'E': func_one = func_all = (fptr_T)NULL; |
6751 | continue; |
6752 | } |
6753 | } |
6754 | } |
6755 | if (no < 0) { /* Ordinary character. */ |
6756 | if (c == K_SPECIAL && src[0] != NUL && src[1] != NUL) { |
6757 | /* Copy a special key as-is. */ |
6758 | if (copy) { |
6759 | *dst++ = c; |
6760 | *dst++ = *src++; |
6761 | *dst++ = *src++; |
6762 | } else { |
6763 | dst += 3; |
6764 | src += 2; |
6765 | } |
6766 | continue; |
6767 | } |
6768 | |
6769 | if (c == '\\' && *src != NUL) { |
6770 | /* Check for abbreviations -- webb */ |
6771 | switch (*src) { |
6772 | case 'r': c = CAR; ++src; break; |
6773 | case 'n': c = NL; ++src; break; |
6774 | case 't': c = TAB; ++src; break; |
6775 | /* Oh no! \e already has meaning in subst pat :-( */ |
6776 | /* case 'e': c = ESC; ++src; break; */ |
6777 | case 'b': c = Ctrl_H; ++src; break; |
6778 | |
6779 | /* If "backslash" is TRUE the backslash will be removed |
6780 | * later. Used to insert a literal CR. */ |
6781 | default: if (backslash) { |
6782 | if (copy) |
6783 | *dst = '\\'; |
6784 | ++dst; |
6785 | } |
6786 | c = *src++; |
6787 | } |
6788 | } else { |
6789 | c = utf_ptr2char(src - 1); |
6790 | } |
6791 | // Write to buffer, if copy is set. |
6792 | if (func_one != NULL) { |
6793 | func_one = (fptr_T)(func_one(&cc, c)); |
6794 | } else if (func_all != NULL) { |
6795 | func_all = (fptr_T)(func_all(&cc, c)); |
6796 | } else { |
6797 | // just copy |
6798 | cc = c; |
6799 | } |
6800 | |
6801 | int totlen = utfc_ptr2len(src - 1); |
6802 | |
6803 | if (copy) { |
6804 | utf_char2bytes(cc, dst); |
6805 | } |
6806 | dst += utf_char2len(cc) - 1; |
6807 | int clen = utf_ptr2len(src - 1); |
6808 | |
6809 | // If the character length is shorter than "totlen", there |
6810 | // are composing characters; copy them as-is. |
6811 | if (clen < totlen) { |
6812 | if (copy) { |
6813 | memmove(dst + 1, src - 1 + clen, (size_t)(totlen - clen)); |
6814 | } |
6815 | dst += totlen - clen; |
6816 | } |
6817 | src += totlen - 1; |
6818 | dst++; |
6819 | } else { |
6820 | if (REG_MULTI) { |
6821 | clnum = rex.reg_mmatch->startpos[no].lnum; |
6822 | if (clnum < 0 || rex.reg_mmatch->endpos[no].lnum < 0) { |
6823 | s = NULL; |
6824 | } else { |
6825 | s = reg_getline(clnum) + rex.reg_mmatch->startpos[no].col; |
6826 | if (rex.reg_mmatch->endpos[no].lnum == clnum) { |
6827 | len = rex.reg_mmatch->endpos[no].col |
6828 | - rex.reg_mmatch->startpos[no].col; |
6829 | } else { |
6830 | len = (int)STRLEN(s); |
6831 | } |
6832 | } |
6833 | } else { |
6834 | s = rex.reg_match->startp[no]; |
6835 | if (rex.reg_match->endp[no] == NULL) { |
6836 | s = NULL; |
6837 | } else { |
6838 | len = (int)(rex.reg_match->endp[no] - s); |
6839 | } |
6840 | } |
6841 | if (s != NULL) { |
6842 | for (;; ) { |
6843 | if (len == 0) { |
6844 | if (REG_MULTI) { |
6845 | if (rex.reg_mmatch->endpos[no].lnum == clnum) { |
6846 | break; |
6847 | } |
6848 | if (copy) { |
6849 | *dst = CAR; |
6850 | } |
6851 | dst++; |
6852 | s = reg_getline(++clnum); |
6853 | if (rex.reg_mmatch->endpos[no].lnum == clnum) { |
6854 | len = rex.reg_mmatch->endpos[no].col; |
6855 | } else { |
6856 | len = (int)STRLEN(s); |
6857 | } |
6858 | } else { |
6859 | break; |
6860 | } |
6861 | } else if (*s == NUL) { // we hit NUL. |
6862 | if (copy) { |
6863 | EMSG(_(e_re_damg)); |
6864 | } |
6865 | goto exit; |
6866 | } else { |
6867 | if (backslash && (*s == CAR || *s == '\\')) { |
6868 | /* |
6869 | * Insert a backslash in front of a CR, otherwise |
6870 | * it will be replaced by a line break. |
6871 | * Number of backslashes will be halved later, |
6872 | * double them here. |
6873 | */ |
6874 | if (copy) { |
6875 | dst[0] = '\\'; |
6876 | dst[1] = *s; |
6877 | } |
6878 | dst += 2; |
6879 | } else { |
6880 | c = utf_ptr2char(s); |
6881 | |
6882 | if (func_one != (fptr_T)NULL) |
6883 | /* Turbo C complains without the typecast */ |
6884 | func_one = (fptr_T)(func_one(&cc, c)); |
6885 | else if (func_all != (fptr_T)NULL) |
6886 | /* Turbo C complains without the typecast */ |
6887 | func_all = (fptr_T)(func_all(&cc, c)); |
6888 | else /* just copy */ |
6889 | cc = c; |
6890 | |
6891 | if (has_mbyte) { |
6892 | int l; |
6893 | |
6894 | // Copy composing characters separately, one |
6895 | // at a time. |
6896 | l = utf_ptr2len(s) - 1; |
6897 | |
6898 | s += l; |
6899 | len -= l; |
6900 | if (copy) { |
6901 | utf_char2bytes(cc, dst); |
6902 | } |
6903 | dst += utf_char2len(cc) - 1; |
6904 | } else if (copy) { |
6905 | *dst = cc; |
6906 | } |
6907 | dst++; |
6908 | } |
6909 | |
6910 | ++s; |
6911 | --len; |
6912 | } |
6913 | } |
6914 | } |
6915 | no = -1; |
6916 | } |
6917 | } |
6918 | if (copy) |
6919 | *dst = NUL; |
6920 | |
6921 | exit: |
6922 | return (int)((dst - dest) + 1); |
6923 | } |
6924 | |
6925 | |
6926 | /* |
6927 | * Call reg_getline() with the line numbers from the submatch. If a |
6928 | * substitute() was used the reg_maxline and other values have been |
6929 | * overwritten. |
6930 | */ |
6931 | static char_u *reg_getline_submatch(linenr_T lnum) |
6932 | { |
6933 | char_u *s; |
6934 | linenr_T save_first = rex.reg_firstlnum; |
6935 | linenr_T save_max = rex.reg_maxline; |
6936 | |
6937 | rex.reg_firstlnum = rsm.sm_firstlnum; |
6938 | rex.reg_maxline = rsm.sm_maxline; |
6939 | |
6940 | s = reg_getline(lnum); |
6941 | |
6942 | rex.reg_firstlnum = save_first; |
6943 | rex.reg_maxline = save_max; |
6944 | return s; |
6945 | } |
6946 | |
6947 | /* |
6948 | * Used for the submatch() function: get the string from the n'th submatch in |
6949 | * allocated memory. |
6950 | * Returns NULL when not in a ":s" command and for a non-existing submatch. |
6951 | */ |
6952 | char_u *reg_submatch(int no) |
6953 | { |
6954 | char_u *retval = NULL; |
6955 | char_u *s; |
6956 | int round; |
6957 | linenr_T lnum; |
6958 | |
6959 | if (!can_f_submatch || no < 0) |
6960 | return NULL; |
6961 | |
6962 | if (rsm.sm_match == NULL) { |
6963 | ssize_t len; |
6964 | |
6965 | /* |
6966 | * First round: compute the length and allocate memory. |
6967 | * Second round: copy the text. |
6968 | */ |
6969 | for (round = 1; round <= 2; round++) { |
6970 | lnum = rsm.sm_mmatch->startpos[no].lnum; |
6971 | if (lnum < 0 || rsm.sm_mmatch->endpos[no].lnum < 0) { |
6972 | return NULL; |
6973 | } |
6974 | |
6975 | s = reg_getline_submatch(lnum); |
6976 | if (s == NULL) { // anti-crash check, cannot happen? |
6977 | break; |
6978 | } |
6979 | s += rsm.sm_mmatch->startpos[no].col; |
6980 | if (rsm.sm_mmatch->endpos[no].lnum == lnum) { |
6981 | // Within one line: take form start to end col. |
6982 | len = rsm.sm_mmatch->endpos[no].col - rsm.sm_mmatch->startpos[no].col; |
6983 | if (round == 2) { |
6984 | STRLCPY(retval, s, len + 1); |
6985 | } |
6986 | len++; |
6987 | } else { |
6988 | // Multiple lines: take start line from start col, middle |
6989 | // lines completely and end line up to end col. |
6990 | len = (ssize_t)STRLEN(s); |
6991 | if (round == 2) { |
6992 | STRCPY(retval, s); |
6993 | retval[len] = '\n'; |
6994 | } |
6995 | len++; |
6996 | lnum++; |
6997 | while (lnum < rsm.sm_mmatch->endpos[no].lnum) { |
6998 | s = reg_getline_submatch(lnum++); |
6999 | if (round == 2) |
7000 | STRCPY(retval + len, s); |
7001 | len += STRLEN(s); |
7002 | if (round == 2) |
7003 | retval[len] = '\n'; |
7004 | ++len; |
7005 | } |
7006 | if (round == 2) { |
7007 | STRNCPY(retval + len, reg_getline_submatch(lnum), |
7008 | rsm.sm_mmatch->endpos[no].col); |
7009 | } |
7010 | len += rsm.sm_mmatch->endpos[no].col; |
7011 | if (round == 2) { |
7012 | retval[len] = NUL; // -V595 |
7013 | } |
7014 | len++; |
7015 | } |
7016 | |
7017 | if (retval == NULL) { |
7018 | retval = xmalloc(len); |
7019 | } |
7020 | } |
7021 | } else { |
7022 | s = rsm.sm_match->startp[no]; |
7023 | if (s == NULL || rsm.sm_match->endp[no] == NULL) { |
7024 | retval = NULL; |
7025 | } else { |
7026 | retval = vim_strnsave(s, (int)(rsm.sm_match->endp[no] - s)); |
7027 | } |
7028 | } |
7029 | |
7030 | return retval; |
7031 | } |
7032 | |
7033 | // Used for the submatch() function with the optional non-zero argument: get |
7034 | // the list of strings from the n'th submatch in allocated memory with NULs |
7035 | // represented in NLs. |
7036 | // Returns a list of allocated strings. Returns NULL when not in a ":s" |
7037 | // command, for a non-existing submatch and for any error. |
7038 | list_T *reg_submatch_list(int no) |
7039 | { |
7040 | if (!can_f_submatch || no < 0) { |
7041 | return NULL; |
7042 | } |
7043 | |
7044 | linenr_T slnum; |
7045 | linenr_T elnum; |
7046 | list_T *list; |
7047 | const char *s; |
7048 | |
7049 | if (rsm.sm_match == NULL) { |
7050 | slnum = rsm.sm_mmatch->startpos[no].lnum; |
7051 | elnum = rsm.sm_mmatch->endpos[no].lnum; |
7052 | if (slnum < 0 || elnum < 0) { |
7053 | return NULL; |
7054 | } |
7055 | |
7056 | colnr_T scol = rsm.sm_mmatch->startpos[no].col; |
7057 | colnr_T ecol = rsm.sm_mmatch->endpos[no].col; |
7058 | |
7059 | list = tv_list_alloc(elnum - slnum + 1); |
7060 | |
7061 | s = (const char *)reg_getline_submatch(slnum) + scol; |
7062 | if (slnum == elnum) { |
7063 | tv_list_append_string(list, s, ecol - scol); |
7064 | } else { |
7065 | tv_list_append_string(list, s, -1); |
7066 | for (int i = 1; i < elnum - slnum; i++) { |
7067 | s = (const char *)reg_getline_submatch(slnum + i); |
7068 | tv_list_append_string(list, s, -1); |
7069 | } |
7070 | s = (const char *)reg_getline_submatch(elnum); |
7071 | tv_list_append_string(list, s, ecol); |
7072 | } |
7073 | } else { |
7074 | s = (const char *)rsm.sm_match->startp[no]; |
7075 | if (s == NULL || rsm.sm_match->endp[no] == NULL) { |
7076 | return NULL; |
7077 | } |
7078 | list = tv_list_alloc(1); |
7079 | tv_list_append_string(list, s, (const char *)rsm.sm_match->endp[no] - s); |
7080 | } |
7081 | |
7082 | return list; |
7083 | } |
7084 | |
7085 | static regengine_T bt_regengine = |
7086 | { |
7087 | bt_regcomp, |
7088 | bt_regfree, |
7089 | bt_regexec_nl, |
7090 | bt_regexec_multi, |
7091 | (char_u *)"" |
7092 | }; |
7093 | |
7094 | |
7095 | // XXX Do not allow headers generator to catch definitions from regexp_nfa.c |
7096 | #ifndef DO_NOT_DEFINE_EMPTY_ATTRIBUTES |
7097 | # include "nvim/regexp_nfa.c" |
7098 | #endif |
7099 | |
7100 | static regengine_T nfa_regengine = |
7101 | { |
7102 | nfa_regcomp, |
7103 | nfa_regfree, |
7104 | nfa_regexec_nl, |
7105 | nfa_regexec_multi, |
7106 | (char_u *)"" |
7107 | }; |
7108 | |
/* Which regexp engine to use? Needed for vim_regcomp().
 * Must match with 'regexpengine'.
 * vim_regcomp() sets this from "p_re", or from the digit following a "\%#="
 * prefix in the pattern. */
static int regexp_engine = 0;

#ifdef REGEXP_DEBUG
// Engine names for the debug message in vim_regcomp(), indexed with the
// engine number that follows the "\%#=" prefix.
static char_u regname[][30] = {
  "AUTOMATIC Regexp Engine",
  "BACKTRACKING Regexp Engine",
  "NFA Regexp Engine"
};
#endif
7120 | |
7121 | /* |
7122 | * Compile a regular expression into internal code. |
7123 | * Returns the program in allocated memory. |
7124 | * Use vim_regfree() to free the memory. |
7125 | * Returns NULL for an error. |
7126 | */ |
7127 | regprog_T *vim_regcomp(char_u *expr_arg, int re_flags) |
7128 | { |
7129 | regprog_T *prog = NULL; |
7130 | char_u *expr = expr_arg; |
7131 | int save_called_emsg; |
7132 | |
7133 | regexp_engine = p_re; |
7134 | |
7135 | /* Check for prefix "\%#=", that sets the regexp engine */ |
7136 | if (STRNCMP(expr, "\\%#=" , 4) == 0) { |
7137 | int newengine = expr[4] - '0'; |
7138 | |
7139 | if (newengine == AUTOMATIC_ENGINE |
7140 | || newengine == BACKTRACKING_ENGINE |
7141 | || newengine == NFA_ENGINE) { |
7142 | regexp_engine = expr[4] - '0'; |
7143 | expr += 5; |
7144 | #ifdef REGEXP_DEBUG |
7145 | smsg("New regexp mode selected (%d): %s" , |
7146 | regexp_engine, |
7147 | regname[newengine]); |
7148 | #endif |
7149 | } else { |
7150 | EMSG(_( |
7151 | "E864: \\%#= can only be followed by 0, 1, or 2. The automatic engine will be used " )); |
7152 | regexp_engine = AUTOMATIC_ENGINE; |
7153 | } |
7154 | } |
7155 | bt_regengine.expr = expr; |
7156 | nfa_regengine.expr = expr; |
7157 | // reg_iswordc() uses rex.reg_buf |
7158 | rex.reg_buf = curbuf; |
7159 | |
7160 | // |
7161 | // First try the NFA engine, unless backtracking was requested. |
7162 | // |
7163 | save_called_emsg = called_emsg; |
7164 | called_emsg = false; |
7165 | if (regexp_engine != BACKTRACKING_ENGINE) { |
7166 | prog = nfa_regengine.regcomp(expr, |
7167 | re_flags + (regexp_engine == AUTOMATIC_ENGINE ? RE_AUTO : 0)); |
7168 | } else { |
7169 | prog = bt_regengine.regcomp(expr, re_flags); |
7170 | } |
7171 | |
7172 | // Check for error compiling regexp with initial engine. |
7173 | if (prog == NULL) { |
7174 | #ifdef BT_REGEXP_DEBUG_LOG |
7175 | // Debugging log for NFA. |
7176 | if (regexp_engine != BACKTRACKING_ENGINE) { |
7177 | FILE *f = fopen(BT_REGEXP_DEBUG_LOG_NAME, "a" ); |
7178 | if (f) { |
7179 | fprintf(f, "Syntax error in \"%s\"\n" , expr); |
7180 | fclose(f); |
7181 | } else |
7182 | EMSG2("(NFA) Could not open \"%s\" to write !!!" , |
7183 | BT_REGEXP_DEBUG_LOG_NAME); |
7184 | } |
7185 | #endif |
7186 | // If the NFA engine failed, try the backtracking engine. The NFA engine |
7187 | // also fails for patterns that it can't handle well but are still valid |
7188 | // patterns, thus a retry should work. |
7189 | // But don't try if an error message was given. |
7190 | if (regexp_engine == AUTOMATIC_ENGINE && !called_emsg) { |
7191 | regexp_engine = BACKTRACKING_ENGINE; |
7192 | prog = bt_regengine.regcomp(expr, re_flags); |
7193 | } |
7194 | } |
7195 | called_emsg |= save_called_emsg; |
7196 | |
7197 | if (prog != NULL) { |
7198 | // Store the info needed to call regcomp() again when the engine turns out |
7199 | // to be very slow when executing it. |
7200 | prog->re_engine = regexp_engine; |
7201 | prog->re_flags = re_flags; |
7202 | } |
7203 | |
7204 | return prog; |
7205 | } |
7206 | |
7207 | /* |
7208 | * Free a compiled regexp program, returned by vim_regcomp(). |
7209 | */ |
7210 | void vim_regfree(regprog_T *prog) |
7211 | { |
7212 | if (prog != NULL) |
7213 | prog->engine->regfree(prog); |
7214 | } |
7215 | |
7216 | static void report_re_switch(char_u *pat) |
7217 | { |
7218 | if (p_verbose > 0) { |
7219 | verbose_enter(); |
7220 | MSG_PUTS(_("Switching to backtracking RE engine for pattern: " )); |
7221 | MSG_PUTS(pat); |
7222 | verbose_leave(); |
7223 | } |
7224 | } |
7225 | |
7226 | /// Matches a regexp against a string. |
7227 | /// "rmp->regprog" is a compiled regexp as returned by vim_regcomp(). |
7228 | /// Note: "rmp->regprog" may be freed and changed. |
7229 | /// Uses curbuf for line count and 'iskeyword'. |
7230 | /// When "nl" is true consider a "\n" in "line" to be a line break. |
7231 | /// |
7232 | /// @param rmp |
7233 | /// @param line the string to match against |
7234 | /// @param col the column to start looking for match |
7235 | /// @param nl |
7236 | /// |
7237 | /// @return TRUE if there is a match, FALSE if not. |
7238 | static int vim_regexec_string(regmatch_T *rmp, char_u *line, colnr_T col, |
7239 | bool nl) |
7240 | { |
7241 | regexec_T rex_save; |
7242 | bool rex_in_use_save = rex_in_use; |
7243 | |
7244 | if (rex_in_use) { |
7245 | // Being called recursively, save the state. |
7246 | rex_save = rex; |
7247 | } |
7248 | rex_in_use = true; |
7249 | rex.reg_startp = NULL; |
7250 | rex.reg_endp = NULL; |
7251 | rex.reg_startpos = NULL; |
7252 | rex.reg_endpos = NULL; |
7253 | |
7254 | int result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl); |
7255 | |
7256 | // NFA engine aborted because it's very slow, use backtracking engine instead. |
7257 | if (rmp->regprog->re_engine == AUTOMATIC_ENGINE |
7258 | && result == NFA_TOO_EXPENSIVE) { |
7259 | int save_p_re = p_re; |
7260 | int re_flags = rmp->regprog->re_flags; |
7261 | char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern); |
7262 | |
7263 | p_re = BACKTRACKING_ENGINE; |
7264 | vim_regfree(rmp->regprog); |
7265 | report_re_switch(pat); |
7266 | rmp->regprog = vim_regcomp(pat, re_flags); |
7267 | if (rmp->regprog != NULL) { |
7268 | result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl); |
7269 | } |
7270 | |
7271 | xfree(pat); |
7272 | p_re = save_p_re; |
7273 | } |
7274 | |
7275 | rex_in_use = rex_in_use_save; |
7276 | if (rex_in_use) { |
7277 | rex = rex_save; |
7278 | } |
7279 | |
7280 | return result > 0; |
7281 | } |
7282 | |
7283 | // Note: "*prog" may be freed and changed. |
7284 | // Return TRUE if there is a match, FALSE if not. |
7285 | int vim_regexec_prog(regprog_T **prog, bool ignore_case, char_u *line, |
7286 | colnr_T col) |
7287 | { |
7288 | regmatch_T regmatch = { .regprog = *prog, .rm_ic = ignore_case }; |
7289 | int r = vim_regexec_string(®match, line, col, false); |
7290 | *prog = regmatch.regprog; |
7291 | return r; |
7292 | } |
7293 | |
7294 | // Note: "rmp->regprog" may be freed and changed. |
7295 | // Return TRUE if there is a match, FALSE if not. |
7296 | int vim_regexec(regmatch_T *rmp, char_u *line, colnr_T col) |
7297 | { |
7298 | return vim_regexec_string(rmp, line, col, false); |
7299 | } |
7300 | |
7301 | // Like vim_regexec(), but consider a "\n" in "line" to be a line break. |
7302 | // Note: "rmp->regprog" may be freed and changed. |
7303 | // Return TRUE if there is a match, FALSE if not. |
7304 | int vim_regexec_nl(regmatch_T *rmp, char_u *line, colnr_T col) |
7305 | { |
7306 | return vim_regexec_string(rmp, line, col, true); |
7307 | } |
7308 | |
7309 | /// Match a regexp against multiple lines. |
7310 | /// "rmp->regprog" must be a compiled regexp as returned by vim_regcomp(). |
7311 | /// Note: "rmp->regprog" may be freed and changed, even set to NULL. |
7312 | /// Uses curbuf for line count and 'iskeyword'. |
7313 | /// |
7314 | /// Return zero if there is no match. Return number of lines contained in the |
7315 | /// match otherwise. |
7316 | long vim_regexec_multi( |
7317 | regmmatch_T *rmp, |
7318 | win_T *win, // window in which to search or NULL |
7319 | buf_T *buf, // buffer in which to search |
7320 | linenr_T lnum, // nr of line to start looking for match |
7321 | colnr_T col, // column to start looking for match |
7322 | proftime_T *tm, // timeout limit or NULL |
7323 | int *timed_out // flag is set when timeout limit reached |
7324 | ) |
7325 | { |
7326 | regexec_T rex_save; |
7327 | bool rex_in_use_save = rex_in_use; |
7328 | |
7329 | if (rex_in_use) { |
7330 | // Being called recursively, save the state. |
7331 | rex_save = rex; |
7332 | } |
7333 | rex_in_use = true; |
7334 | |
7335 | int result = rmp->regprog->engine->regexec_multi(rmp, win, buf, lnum, col, |
7336 | tm, timed_out); |
7337 | |
7338 | // NFA engine aborted because it's very slow, use backtracking engine instead. |
7339 | if (rmp->regprog->re_engine == AUTOMATIC_ENGINE |
7340 | && result == NFA_TOO_EXPENSIVE) { |
7341 | int save_p_re = p_re; |
7342 | int re_flags = rmp->regprog->re_flags; |
7343 | char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern); |
7344 | |
7345 | p_re = BACKTRACKING_ENGINE; |
7346 | vim_regfree(rmp->regprog); |
7347 | report_re_switch(pat); |
7348 | // checking for \z misuse was already done when compiling for NFA, |
7349 | // allow all here |
7350 | reg_do_extmatch = REX_ALL; |
7351 | rmp->regprog = vim_regcomp(pat, re_flags); |
7352 | reg_do_extmatch = 0; |
7353 | |
7354 | if (rmp->regprog != NULL) { |
7355 | result = rmp->regprog->engine->regexec_multi(rmp, win, buf, lnum, col, |
7356 | tm, timed_out); |
7357 | } |
7358 | |
7359 | xfree(pat); |
7360 | p_re = save_p_re; |
7361 | } |
7362 | |
7363 | rex_in_use = rex_in_use_save; |
7364 | if (rex_in_use) { |
7365 | rex = rex_save; |
7366 | } |
7367 | |
7368 | return result <= 0 ? 0 : result; |
7369 | } |
7370 | |