1// This is an open source non-commercial project. Dear PVS-Studio, please check
2// it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com
3
4/*
5 * NFA regular expression implementation.
6 *
7 * This file is included in "regexp.c".
8 */
9
10#include <assert.h>
11#include <inttypes.h>
12#include <stdbool.h>
13#include <limits.h>
14
15#include "nvim/ascii.h"
16#include "nvim/garray.h"
17
18/*
19 * Logging of NFA engine.
20 *
21 * The NFA engine can write four log files:
22 * - Error log: Contains NFA engine's fatal errors.
23 * - Dump log: Contains compiled NFA state machine's information.
24 * - Run log: Contains information of matching procedure.
25 * - Debug log: Contains detailed information of matching procedure. Can be
26 * disabled by undefining NFA_REGEXP_DEBUG_LOG.
27 * The first one can also be used without debug mode.
28 * The last three are enabled when compiled as debug mode and individually
29 * disabled by commenting them out.
30 * The log files can get quite big!
31 * Do disable all of this when compiling Vim for debugging, undefine REGEXP_DEBUG in
32 * regexp.c
33 */
34#ifdef REGEXP_DEBUG
35# define NFA_REGEXP_ERROR_LOG "nfa_regexp_error.log"
36# define NFA_REGEXP_DUMP_LOG "nfa_regexp_dump.log"
37# define NFA_REGEXP_RUN_LOG "nfa_regexp_run.log"
38# define NFA_REGEXP_DEBUG_LOG "nfa_regexp_debug.log"
39#endif
40
41/* Added to NFA_ANY - NFA_NUPPER_IC to include a NL. */
42#define NFA_ADD_NL 31
43
44enum {
45 NFA_SPLIT = -1024,
46 NFA_MATCH,
47 NFA_EMPTY, /* matches 0-length */
48
49 NFA_START_COLL, /* [abc] start */
50 NFA_END_COLL, /* [abc] end */
51 NFA_START_NEG_COLL, /* [^abc] start */
52 NFA_END_NEG_COLL, /* [^abc] end (postfix only) */
53 NFA_RANGE, /* range of the two previous items
54 * (postfix only) */
55 NFA_RANGE_MIN, /* low end of a range */
56 NFA_RANGE_MAX, /* high end of a range */
57
58 NFA_CONCAT, // concatenate two previous items (postfix
59 // only)
60 NFA_OR, // \| (postfix only)
61 NFA_STAR, // greedy * (postfix only)
62 NFA_STAR_NONGREEDY, // non-greedy * (postfix only)
63 NFA_QUEST, // greedy \? (postfix only)
64 NFA_QUEST_NONGREEDY, // non-greedy \? (postfix only)
65
66 NFA_BOL, /* ^ Begin line */
67 NFA_EOL, /* $ End line */
68 NFA_BOW, /* \< Begin word */
69 NFA_EOW, /* \> End word */
70 NFA_BOF, /* \%^ Begin file */
71 NFA_EOF, /* \%$ End file */
72 NFA_NEWL,
73 NFA_ZSTART, /* Used for \zs */
74 NFA_ZEND, /* Used for \ze */
75 NFA_NOPEN, /* Start of subexpression marked with \%( */
76 NFA_NCLOSE, /* End of subexpr. marked with \%( ... \) */
77 NFA_START_INVISIBLE,
78 NFA_START_INVISIBLE_FIRST,
79 NFA_START_INVISIBLE_NEG,
80 NFA_START_INVISIBLE_NEG_FIRST,
81 NFA_START_INVISIBLE_BEFORE,
82 NFA_START_INVISIBLE_BEFORE_FIRST,
83 NFA_START_INVISIBLE_BEFORE_NEG,
84 NFA_START_INVISIBLE_BEFORE_NEG_FIRST,
85 NFA_START_PATTERN,
86 NFA_END_INVISIBLE,
87 NFA_END_INVISIBLE_NEG,
88 NFA_END_PATTERN,
89 NFA_COMPOSING, /* Next nodes in NFA are part of the
90 composing multibyte char */
91 NFA_END_COMPOSING, /* End of a composing char in the NFA */
92 NFA_ANY_COMPOSING, // \%C: Any composing characters.
93 NFA_OPT_CHARS, /* \%[abc] */
94
95 /* The following are used only in the postfix form, not in the NFA */
96 NFA_PREV_ATOM_NO_WIDTH, /* Used for \@= */
97 NFA_PREV_ATOM_NO_WIDTH_NEG, /* Used for \@! */
98 NFA_PREV_ATOM_JUST_BEFORE, /* Used for \@<= */
99 NFA_PREV_ATOM_JUST_BEFORE_NEG, /* Used for \@<! */
100 NFA_PREV_ATOM_LIKE_PATTERN, /* Used for \@> */
101
102 NFA_BACKREF1, /* \1 */
103 NFA_BACKREF2, /* \2 */
104 NFA_BACKREF3, /* \3 */
105 NFA_BACKREF4, /* \4 */
106 NFA_BACKREF5, /* \5 */
107 NFA_BACKREF6, /* \6 */
108 NFA_BACKREF7, /* \7 */
109 NFA_BACKREF8, /* \8 */
110 NFA_BACKREF9, /* \9 */
111 NFA_ZREF1, /* \z1 */
112 NFA_ZREF2, /* \z2 */
113 NFA_ZREF3, /* \z3 */
114 NFA_ZREF4, /* \z4 */
115 NFA_ZREF5, /* \z5 */
116 NFA_ZREF6, /* \z6 */
117 NFA_ZREF7, /* \z7 */
118 NFA_ZREF8, /* \z8 */
119 NFA_ZREF9, /* \z9 */
120 NFA_SKIP, /* Skip characters */
121
122 NFA_MOPEN,
123 NFA_MOPEN1,
124 NFA_MOPEN2,
125 NFA_MOPEN3,
126 NFA_MOPEN4,
127 NFA_MOPEN5,
128 NFA_MOPEN6,
129 NFA_MOPEN7,
130 NFA_MOPEN8,
131 NFA_MOPEN9,
132
133 NFA_MCLOSE,
134 NFA_MCLOSE1,
135 NFA_MCLOSE2,
136 NFA_MCLOSE3,
137 NFA_MCLOSE4,
138 NFA_MCLOSE5,
139 NFA_MCLOSE6,
140 NFA_MCLOSE7,
141 NFA_MCLOSE8,
142 NFA_MCLOSE9,
143
144 NFA_ZOPEN,
145 NFA_ZOPEN1,
146 NFA_ZOPEN2,
147 NFA_ZOPEN3,
148 NFA_ZOPEN4,
149 NFA_ZOPEN5,
150 NFA_ZOPEN6,
151 NFA_ZOPEN7,
152 NFA_ZOPEN8,
153 NFA_ZOPEN9,
154
155 NFA_ZCLOSE,
156 NFA_ZCLOSE1,
157 NFA_ZCLOSE2,
158 NFA_ZCLOSE3,
159 NFA_ZCLOSE4,
160 NFA_ZCLOSE5,
161 NFA_ZCLOSE6,
162 NFA_ZCLOSE7,
163 NFA_ZCLOSE8,
164 NFA_ZCLOSE9,
165
166 /* NFA_FIRST_NL */
167 NFA_ANY, /* Match any one character. */
168 NFA_IDENT, /* Match identifier char */
169 NFA_SIDENT, /* Match identifier char but no digit */
170 NFA_KWORD, /* Match keyword char */
171 NFA_SKWORD, /* Match word char but no digit */
172 NFA_FNAME, /* Match file name char */
173 NFA_SFNAME, /* Match file name char but no digit */
174 NFA_PRINT, /* Match printable char */
175 NFA_SPRINT, /* Match printable char but no digit */
176 NFA_WHITE, /* Match whitespace char */
177 NFA_NWHITE, /* Match non-whitespace char */
178 NFA_DIGIT, /* Match digit char */
179 NFA_NDIGIT, /* Match non-digit char */
180 NFA_HEX, /* Match hex char */
181 NFA_NHEX, /* Match non-hex char */
182 NFA_OCTAL, /* Match octal char */
183 NFA_NOCTAL, /* Match non-octal char */
184 NFA_WORD, /* Match word char */
185 NFA_NWORD, /* Match non-word char */
186 NFA_HEAD, /* Match head char */
187 NFA_NHEAD, /* Match non-head char */
188 NFA_ALPHA, /* Match alpha char */
189 NFA_NALPHA, /* Match non-alpha char */
190 NFA_LOWER, /* Match lowercase char */
191 NFA_NLOWER, /* Match non-lowercase char */
192 NFA_UPPER, /* Match uppercase char */
193 NFA_NUPPER, /* Match non-uppercase char */
194 NFA_LOWER_IC, /* Match [a-z] */
195 NFA_NLOWER_IC, /* Match [^a-z] */
196 NFA_UPPER_IC, /* Match [A-Z] */
197 NFA_NUPPER_IC, /* Match [^A-Z] */
198
199 NFA_FIRST_NL = NFA_ANY + NFA_ADD_NL,
200 NFA_LAST_NL = NFA_NUPPER_IC + NFA_ADD_NL,
201
202 NFA_CURSOR, /* Match cursor pos */
203 NFA_LNUM, /* Match line number */
204 NFA_LNUM_GT, /* Match > line number */
205 NFA_LNUM_LT, /* Match < line number */
206 NFA_COL, /* Match cursor column */
207 NFA_COL_GT, /* Match > cursor column */
208 NFA_COL_LT, /* Match < cursor column */
209 NFA_VCOL, /* Match cursor virtual column */
210 NFA_VCOL_GT, /* Match > cursor virtual column */
211 NFA_VCOL_LT, /* Match < cursor virtual column */
212 NFA_MARK, /* Match mark */
213 NFA_MARK_GT, /* Match > mark */
214 NFA_MARK_LT, /* Match < mark */
215 NFA_VISUAL, /* Match Visual area */
216
217 /* Character classes [:alnum:] etc */
218 NFA_CLASS_ALNUM,
219 NFA_CLASS_ALPHA,
220 NFA_CLASS_BLANK,
221 NFA_CLASS_CNTRL,
222 NFA_CLASS_DIGIT,
223 NFA_CLASS_GRAPH,
224 NFA_CLASS_LOWER,
225 NFA_CLASS_PRINT,
226 NFA_CLASS_PUNCT,
227 NFA_CLASS_SPACE,
228 NFA_CLASS_UPPER,
229 NFA_CLASS_XDIGIT,
230 NFA_CLASS_TAB,
231 NFA_CLASS_RETURN,
232 NFA_CLASS_BACKSPACE,
233 NFA_CLASS_ESCAPE
234};
235
236/* Keep in sync with classchars. */
237static int nfa_classcodes[] = {
238 NFA_ANY, NFA_IDENT, NFA_SIDENT, NFA_KWORD,NFA_SKWORD,
239 NFA_FNAME, NFA_SFNAME, NFA_PRINT, NFA_SPRINT,
240 NFA_WHITE, NFA_NWHITE, NFA_DIGIT, NFA_NDIGIT,
241 NFA_HEX, NFA_NHEX, NFA_OCTAL, NFA_NOCTAL,
242 NFA_WORD, NFA_NWORD, NFA_HEAD, NFA_NHEAD,
243 NFA_ALPHA, NFA_NALPHA, NFA_LOWER, NFA_NLOWER,
244 NFA_UPPER, NFA_NUPPER
245};
246
247static char_u e_nul_found[] = N_(
248 "E865: (NFA) Regexp end encountered prematurely");
249static char_u e_misplaced[] = N_("E866: (NFA regexp) Misplaced %c");
250static char_u e_ill_char_class[] = N_(
251 "E877: (NFA regexp) Invalid character class: %" PRId64);
252
253/* Since the out pointers in the list are always
254 * uninitialized, we use the pointers themselves
255 * as storage for the Ptrlists. */
256typedef union Ptrlist Ptrlist;
257union Ptrlist {
258 Ptrlist *next;
259 nfa_state_T *s;
260};
261
262struct Frag {
263 nfa_state_T *start;
264 Ptrlist *out;
265};
266typedef struct Frag Frag_T;
267
268typedef struct {
269 int in_use; /* number of subexpr with useful info */
270
271 /* When REG_MULTI is TRUE list.multi is used, otherwise list.line. */
272 union {
273 struct multipos {
274 linenr_T start_lnum;
275 linenr_T end_lnum;
276 colnr_T start_col;
277 colnr_T end_col;
278 } multi[NSUBEXP];
279 struct linepos {
280 char_u *start;
281 char_u *end;
282 } line[NSUBEXP];
283 } list;
284} regsub_T;
285
286typedef struct {
287 regsub_T norm; /* \( .. \) matches */
288 regsub_T synt; /* \z( .. \) matches */
289} regsubs_T;
290
291/* nfa_pim_T stores a Postponed Invisible Match. */
292typedef struct nfa_pim_S nfa_pim_T;
293struct nfa_pim_S {
294 int result; /* NFA_PIM_*, see below */
295 nfa_state_T *state; /* the invisible match start state */
296 regsubs_T subs; /* submatch info, only party used */
297 union {
298 lpos_T pos;
299 char_u *ptr;
300 } end; /* where the match must end */
301};
302
303/* nfa_thread_T contains execution information of a NFA state */
304typedef struct {
305 nfa_state_T *state;
306 int count;
307 nfa_pim_T pim; /* if pim.result != NFA_PIM_UNUSED: postponed
308 * invisible match */
309 regsubs_T subs; /* submatch info, only party used */
310} nfa_thread_T;
311
312/* nfa_list_T contains the alternative NFA execution states. */
313typedef struct {
314 nfa_thread_T *t; /* allocated array of states */
315 int n; /* nr of states currently in "t" */
316 int len; /* max nr of states in "t" */
317 int id; /* ID of the list */
318 int has_pim; /* TRUE when any state has a PIM */
319} nfa_list_T;
320
321/// re_flags passed to nfa_regcomp().
322static int nfa_re_flags;
323
324/* NFA regexp \ze operator encountered. */
325static int nfa_has_zend;
326
327/* NFA regexp \1 .. \9 encountered. */
328static int nfa_has_backref;
329
330/* NFA regexp has \z( ), set zsubexpr. */
331static int nfa_has_zsubexpr;
332
333/* Number of sub expressions actually being used during execution. 1 if only
334 * the whole match (subexpr 0) is used. */
335static int nfa_nsubexpr;
336
337static int *post_start; /* holds the postfix form of r.e. */
338static int *post_end;
339static int *post_ptr;
340
341static int nstate; /* Number of states in the NFA. Also used when
342 * executing. */
343static int istate; /* Index in the state vector, used in alloc_state() */
344
345/* If not NULL match must end at this position */
346static save_se_T *nfa_endp = NULL;
347
348/* listid is global, so that it increases on recursive calls to
349 * nfa_regmatch(), which means we don't have to clear the lastlist field of
350 * all the states. */
351static int nfa_listid;
352static int nfa_alt_listid;
353
354/* 0 for first call to nfa_regmatch(), 1 for recursive call. */
355static int nfa_ll_index = 0;
356
357#ifdef INCLUDE_GENERATED_DECLARATIONS
358# include "regexp_nfa.c.generated.h"
359#endif
360
361// Helper functions used when doing re2post() ... regatom() parsing
362#define EMIT(c) \
363 do { \
364 if (post_ptr >= post_end) { \
365 realloc_post_list(); \
366 } \
367 *post_ptr++ = c; \
368 } while (0)
369
370/*
371 * Initialize internal variables before NFA compilation.
372 */
373static void
374nfa_regcomp_start (
375 char_u *expr,
376 int re_flags /* see vim_regcomp() */
377)
378{
379 size_t postfix_size;
380 size_t nstate_max;
381
382 nstate = 0;
383 istate = 0;
384 /* A reasonable estimation for maximum size */
385 nstate_max = (STRLEN(expr) + 1) * 25;
386
387 /* Some items blow up in size, such as [A-z]. Add more space for that.
388 * When it is still not enough realloc_post_list() will be used. */
389 nstate_max += 1000;
390
391 /* Size for postfix representation of expr. */
392 postfix_size = sizeof(int) * nstate_max;
393
394 post_start = (int *)xmalloc(postfix_size);
395 post_ptr = post_start;
396 post_end = post_start + nstate_max;
397 nfa_has_zend = FALSE;
398 nfa_has_backref = FALSE;
399
400 /* shared with BT engine */
401 regcomp_start(expr, re_flags);
402}
403
404/*
405 * Figure out if the NFA state list starts with an anchor, must match at start
406 * of the line.
407 */
408static int nfa_get_reganch(nfa_state_T *start, int depth)
409{
410 nfa_state_T *p = start;
411
412 if (depth > 4)
413 return 0;
414
415 while (p != NULL) {
416 switch (p->c) {
417 case NFA_BOL:
418 case NFA_BOF:
419 return 1; /* yes! */
420
421 case NFA_ZSTART:
422 case NFA_ZEND:
423 case NFA_CURSOR:
424 case NFA_VISUAL:
425
426 case NFA_MOPEN:
427 case NFA_MOPEN1:
428 case NFA_MOPEN2:
429 case NFA_MOPEN3:
430 case NFA_MOPEN4:
431 case NFA_MOPEN5:
432 case NFA_MOPEN6:
433 case NFA_MOPEN7:
434 case NFA_MOPEN8:
435 case NFA_MOPEN9:
436 case NFA_NOPEN:
437 case NFA_ZOPEN:
438 case NFA_ZOPEN1:
439 case NFA_ZOPEN2:
440 case NFA_ZOPEN3:
441 case NFA_ZOPEN4:
442 case NFA_ZOPEN5:
443 case NFA_ZOPEN6:
444 case NFA_ZOPEN7:
445 case NFA_ZOPEN8:
446 case NFA_ZOPEN9:
447 p = p->out;
448 break;
449
450 case NFA_SPLIT:
451 return nfa_get_reganch(p->out, depth + 1)
452 && nfa_get_reganch(p->out1, depth + 1);
453
454 default:
455 return 0; /* noooo */
456 }
457 }
458 return 0;
459}
460
461/*
462 * Figure out if the NFA state list starts with a character which must match
463 * at start of the match.
464 */
465static int nfa_get_regstart(nfa_state_T *start, int depth)
466{
467 nfa_state_T *p = start;
468
469 if (depth > 4)
470 return 0;
471
472 while (p != NULL) {
473 switch (p->c) {
474 /* all kinds of zero-width matches */
475 case NFA_BOL:
476 case NFA_BOF:
477 case NFA_BOW:
478 case NFA_EOW:
479 case NFA_ZSTART:
480 case NFA_ZEND:
481 case NFA_CURSOR:
482 case NFA_VISUAL:
483 case NFA_LNUM:
484 case NFA_LNUM_GT:
485 case NFA_LNUM_LT:
486 case NFA_COL:
487 case NFA_COL_GT:
488 case NFA_COL_LT:
489 case NFA_VCOL:
490 case NFA_VCOL_GT:
491 case NFA_VCOL_LT:
492 case NFA_MARK:
493 case NFA_MARK_GT:
494 case NFA_MARK_LT:
495
496 case NFA_MOPEN:
497 case NFA_MOPEN1:
498 case NFA_MOPEN2:
499 case NFA_MOPEN3:
500 case NFA_MOPEN4:
501 case NFA_MOPEN5:
502 case NFA_MOPEN6:
503 case NFA_MOPEN7:
504 case NFA_MOPEN8:
505 case NFA_MOPEN9:
506 case NFA_NOPEN:
507 case NFA_ZOPEN:
508 case NFA_ZOPEN1:
509 case NFA_ZOPEN2:
510 case NFA_ZOPEN3:
511 case NFA_ZOPEN4:
512 case NFA_ZOPEN5:
513 case NFA_ZOPEN6:
514 case NFA_ZOPEN7:
515 case NFA_ZOPEN8:
516 case NFA_ZOPEN9:
517 p = p->out;
518 break;
519
520 case NFA_SPLIT:
521 {
522 int c1 = nfa_get_regstart(p->out, depth + 1);
523 int c2 = nfa_get_regstart(p->out1, depth + 1);
524
525 if (c1 == c2)
526 return c1; /* yes! */
527 return 0;
528 }
529
530 default:
531 if (p->c > 0)
532 return p->c; /* yes! */
533 return 0;
534 }
535 }
536 return 0;
537}
538
539/*
540 * Figure out if the NFA state list contains just literal text and nothing
541 * else. If so return a string in allocated memory with what must match after
542 * regstart. Otherwise return NULL.
543 */
544static char_u *nfa_get_match_text(nfa_state_T *start)
545{
546 nfa_state_T *p = start;
547 int len = 0;
548 char_u *ret;
549 char_u *s;
550
551 if (p->c != NFA_MOPEN)
552 return NULL; /* just in case */
553 p = p->out;
554 while (p->c > 0) {
555 len += MB_CHAR2LEN(p->c);
556 p = p->out;
557 }
558 if (p->c != NFA_MCLOSE || p->out->c != NFA_MATCH)
559 return NULL;
560
561 ret = xmalloc(len);
562 p = start->out->out; /* skip first char, it goes into regstart */
563 s = ret;
564 while (p->c > 0) {
565 s += utf_char2bytes(p->c, s);
566 p = p->out;
567 }
568 *s = NUL;
569
570 return ret;
571}
572
573/*
574 * Allocate more space for post_start. Called when
575 * running above the estimated number of states.
576 */
577static void realloc_post_list(void)
578{
579 size_t new_max = (post_end - post_start) + 1000;
580 int *new_start = xrealloc(post_start, new_max * sizeof(int));
581 post_ptr = new_start + (post_ptr - post_start);
582 post_end = new_start + new_max;
583 post_start = new_start;
584}
585
586/*
587 * Search between "start" and "end" and try to recognize a
588 * character class in expanded form. For example [0-9].
589 * On success, return the id the character class to be emitted.
590 * On failure, return 0 (=FAIL)
591 * Start points to the first char of the range, while end should point
592 * to the closing brace.
593 * Keep in mind that 'ignorecase' applies at execution time, thus [a-z] may
594 * need to be interpreted as [a-zA-Z].
595 */
596static int nfa_recognize_char_class(char_u *start, char_u *end, int extra_newl)
597{
598# define CLASS_not 0x80
599# define CLASS_af 0x40
600# define CLASS_AF 0x20
601# define CLASS_az 0x10
602# define CLASS_AZ 0x08
603# define CLASS_o7 0x04
604# define CLASS_o9 0x02
605# define CLASS_underscore 0x01
606
607 int newl = FALSE;
608 char_u *p;
609 int config = 0;
610
611 if (extra_newl == TRUE)
612 newl = TRUE;
613
614 if (*end != ']')
615 return FAIL;
616 p = start;
617 if (*p == '^') {
618 config |= CLASS_not;
619 p++;
620 }
621
622 while (p < end) {
623 if (p + 2 < end && *(p + 1) == '-') {
624 switch (*p) {
625 case '0':
626 if (*(p + 2) == '9') {
627 config |= CLASS_o9;
628 break;
629 } else if (*(p + 2) == '7') {
630 config |= CLASS_o7;
631 break;
632 }
633 return FAIL;
634 case 'a':
635 if (*(p + 2) == 'z') {
636 config |= CLASS_az;
637 break;
638 } else if (*(p + 2) == 'f') {
639 config |= CLASS_af;
640 break;
641 }
642 return FAIL;
643 case 'A':
644 if (*(p + 2) == 'Z') {
645 config |= CLASS_AZ;
646 break;
647 } else if (*(p + 2) == 'F') {
648 config |= CLASS_AF;
649 break;
650 }
651 return FAIL;
652 default:
653 return FAIL;
654 }
655 p += 3;
656 } else if (p + 1 < end && *p == '\\' && *(p + 1) == 'n') {
657 newl = TRUE;
658 p += 2;
659 } else if (*p == '_') {
660 config |= CLASS_underscore;
661 p++;
662 } else if (*p == '\n') {
663 newl = TRUE;
664 p++;
665 } else
666 return FAIL;
667 } /* while (p < end) */
668
669 if (p != end)
670 return FAIL;
671
672 if (newl == TRUE)
673 extra_newl = NFA_ADD_NL;
674
675 switch (config) {
676 case CLASS_o9:
677 return extra_newl + NFA_DIGIT;
678 case CLASS_not | CLASS_o9:
679 return extra_newl + NFA_NDIGIT;
680 case CLASS_af | CLASS_AF | CLASS_o9:
681 return extra_newl + NFA_HEX;
682 case CLASS_not | CLASS_af | CLASS_AF | CLASS_o9:
683 return extra_newl + NFA_NHEX;
684 case CLASS_o7:
685 return extra_newl + NFA_OCTAL;
686 case CLASS_not | CLASS_o7:
687 return extra_newl + NFA_NOCTAL;
688 case CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
689 return extra_newl + NFA_WORD;
690 case CLASS_not | CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore:
691 return extra_newl + NFA_NWORD;
692 case CLASS_az | CLASS_AZ | CLASS_underscore:
693 return extra_newl + NFA_HEAD;
694 case CLASS_not | CLASS_az | CLASS_AZ | CLASS_underscore:
695 return extra_newl + NFA_NHEAD;
696 case CLASS_az | CLASS_AZ:
697 return extra_newl + NFA_ALPHA;
698 case CLASS_not | CLASS_az | CLASS_AZ:
699 return extra_newl + NFA_NALPHA;
700 case CLASS_az:
701 return extra_newl + NFA_LOWER_IC;
702 case CLASS_not | CLASS_az:
703 return extra_newl + NFA_NLOWER_IC;
704 case CLASS_AZ:
705 return extra_newl + NFA_UPPER_IC;
706 case CLASS_not | CLASS_AZ:
707 return extra_newl + NFA_NUPPER_IC;
708 }
709 return FAIL;
710}
711
712/*
713 * Produce the bytes for equivalence class "c".
714 * Currently only handles latin1, latin9 and utf-8.
715 * Emits bytes in postfix notation: 'a,b,NFA_OR,c,NFA_OR' is
716 * equivalent to 'a OR b OR c'
717 *
718 * NOTE! When changing this function, also update reg_equi_class()
719 */
720static void nfa_emit_equi_class(int c)
721{
722#define EMIT2(c) EMIT(c); EMIT(NFA_CONCAT);
723#define EMITMBC(c) EMIT(c); EMIT(NFA_CONCAT);
724
725 if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
726 || STRCMP(p_enc, "iso-8859-15") == 0) {
727#define A_grave 0xc0
728#define A_acute 0xc1
729#define A_circumflex 0xc2
730#define A_virguilla 0xc3
731#define A_diaeresis 0xc4
732#define A_ring 0xc5
733#define C_cedilla 0xc7
734#define E_grave 0xc8
735#define E_acute 0xc9
736#define E_circumflex 0xca
737#define E_diaeresis 0xcb
738#define I_grave 0xcc
739#define I_acute 0xcd
740#define I_circumflex 0xce
741#define I_diaeresis 0xcf
742#define N_virguilla 0xd1
743#define O_grave 0xd2
744#define O_acute 0xd3
745#define O_circumflex 0xd4
746#define O_virguilla 0xd5
747#define O_diaeresis 0xd6
748#define O_slash 0xd8
749#define U_grave 0xd9
750#define U_acute 0xda
751#define U_circumflex 0xdb
752#define U_diaeresis 0xdc
753#define Y_acute 0xdd
754#define a_grave 0xe0
755#define a_acute 0xe1
756#define a_circumflex 0xe2
757#define a_virguilla 0xe3
758#define a_diaeresis 0xe4
759#define a_ring 0xe5
760#define c_cedilla 0xe7
761#define e_grave 0xe8
762#define e_acute 0xe9
763#define e_circumflex 0xea
764#define e_diaeresis 0xeb
765#define i_grave 0xec
766#define i_acute 0xed
767#define i_circumflex 0xee
768#define i_diaeresis 0xef
769#define n_virguilla 0xf1
770#define o_grave 0xf2
771#define o_acute 0xf3
772#define o_circumflex 0xf4
773#define o_virguilla 0xf5
774#define o_diaeresis 0xf6
775#define o_slash 0xf8
776#define u_grave 0xf9
777#define u_acute 0xfa
778#define u_circumflex 0xfb
779#define u_diaeresis 0xfc
780#define y_acute 0xfd
781#define y_diaeresis 0xff
782 switch (c) {
783 case 'A': case A_grave: case A_acute: case A_circumflex:
784 case A_virguilla: case A_diaeresis: case A_ring:
785 CASEMBC(0x100) CASEMBC(0x102) CASEMBC(0x104)
786 CASEMBC(0x1cd) CASEMBC(0x1de) CASEMBC(0x1e0)
787 CASEMBC(0x1ea2)
788 EMIT2('A'); EMIT2(A_grave); EMIT2(A_acute);
789 EMIT2(A_circumflex); EMIT2(A_virguilla);
790 EMIT2(A_diaeresis); EMIT2(A_ring);
791 EMITMBC(0x100) EMITMBC(0x102) EMITMBC(0x104)
792 EMITMBC(0x1cd) EMITMBC(0x1de) EMITMBC(0x1e0)
793 EMITMBC(0x1ea2)
794 return;
795
796 case 'B': CASEMBC(0x1e02) CASEMBC(0x1e06)
797 EMIT2('B'); EMITMBC(0x1e02) EMITMBC(0x1e06)
798 return;
799
800 case 'C': case C_cedilla: CASEMBC(0x106) CASEMBC(0x108) CASEMBC(0x10a)
801 CASEMBC(0x10c)
802 EMIT2('C'); EMIT2(C_cedilla); EMITMBC(0x106) EMITMBC(0x108)
803 EMITMBC(0x10a) EMITMBC(0x10c)
804 return;
805
806 case 'D': CASEMBC(0x10e) CASEMBC(0x110) CASEMBC(0x1e0a)
807 CASEMBC(0x1e0e) CASEMBC(0x1e10)
808 EMIT2('D'); EMITMBC(0x10e) EMITMBC(0x110) EMITMBC(0x1e0a)
809 EMITMBC(0x1e0e) EMITMBC(0x1e10)
810 return;
811
812 case 'E': case E_grave: case E_acute: case E_circumflex:
813 case E_diaeresis: CASEMBC(0x112) CASEMBC(0x114)
814 CASEMBC(0x116) CASEMBC(0x118) CASEMBC(0x11a)
815 CASEMBC(0x1eba) CASEMBC(0x1ebc)
816 EMIT2('E'); EMIT2(E_grave); EMIT2(E_acute);
817 EMIT2(E_circumflex); EMIT2(E_diaeresis);
818 EMITMBC(0x112) EMITMBC(0x114) EMITMBC(0x116)
819 EMITMBC(0x118) EMITMBC(0x11a) EMITMBC(0x1eba)
820 EMITMBC(0x1ebc)
821 return;
822
823 case 'F': CASEMBC(0x1e1e)
824 EMIT2('F'); EMITMBC(0x1e1e)
825 return;
826
827 case 'G': CASEMBC(0x11c) CASEMBC(0x11e) CASEMBC(0x120)
828 CASEMBC(0x122) CASEMBC(0x1e4) CASEMBC(0x1e6)
829 CASEMBC(0x1f4) CASEMBC(0x1e20)
830 EMIT2('G'); EMITMBC(0x11c) EMITMBC(0x11e) EMITMBC(0x120)
831 EMITMBC(0x122) EMITMBC(0x1e4) EMITMBC(0x1e6)
832 EMITMBC(0x1f4) EMITMBC(0x1e20)
833 return;
834
835 case 'H': CASEMBC(0x124) CASEMBC(0x126) CASEMBC(0x1e22)
836 CASEMBC(0x1e26) CASEMBC(0x1e28)
837 EMIT2('H'); EMITMBC(0x124) EMITMBC(0x126) EMITMBC(0x1e22)
838 EMITMBC(0x1e26) EMITMBC(0x1e28)
839 return;
840
841 case 'I': case I_grave: case I_acute: case I_circumflex:
842 case I_diaeresis: CASEMBC(0x128) CASEMBC(0x12a)
843 CASEMBC(0x12c) CASEMBC(0x12e) CASEMBC(0x130)
844 CASEMBC(0x1cf) CASEMBC(0x1ec8)
845 EMIT2('I'); EMIT2(I_grave); EMIT2(I_acute);
846 EMIT2(I_circumflex); EMIT2(I_diaeresis);
847 EMITMBC(0x128) EMITMBC(0x12a)
848 EMITMBC(0x12c) EMITMBC(0x12e) EMITMBC(0x130)
849 EMITMBC(0x1cf) EMITMBC(0x1ec8)
850 return;
851
852 case 'J': CASEMBC(0x134)
853 EMIT2('J'); EMITMBC(0x134)
854 return;
855
856 case 'K': CASEMBC(0x136) CASEMBC(0x1e8) CASEMBC(0x1e30)
857 CASEMBC(0x1e34)
858 EMIT2('K'); EMITMBC(0x136) EMITMBC(0x1e8) EMITMBC(0x1e30)
859 EMITMBC(0x1e34)
860 return;
861
862 case 'L': CASEMBC(0x139) CASEMBC(0x13b) CASEMBC(0x13d)
863 CASEMBC(0x13f) CASEMBC(0x141) CASEMBC(0x1e3a)
864 EMIT2('L'); EMITMBC(0x139) EMITMBC(0x13b) EMITMBC(0x13d)
865 EMITMBC(0x13f) EMITMBC(0x141) EMITMBC(0x1e3a)
866 return;
867
868 case 'M': CASEMBC(0x1e3e) CASEMBC(0x1e40)
869 EMIT2('M'); EMITMBC(0x1e3e) EMITMBC(0x1e40)
870 return;
871
872 case 'N': case N_virguilla: CASEMBC(0x143) CASEMBC(0x145)
873 CASEMBC(0x147) CASEMBC(0x1e44) CASEMBC(0x1e48)
874 EMIT2('N'); EMIT2(N_virguilla);
875 EMITMBC(0x143) EMITMBC(0x145)
876 EMITMBC(0x147) EMITMBC(0x1e44) EMITMBC(0x1e48)
877 return;
878
879 case 'O': case O_grave: case O_acute: case O_circumflex:
880 case O_virguilla: case O_diaeresis: case O_slash:
881 CASEMBC(0x14c) CASEMBC(0x14e) CASEMBC(0x150)
882 CASEMBC(0x1a0) CASEMBC(0x1d1) CASEMBC(0x1ea)
883 CASEMBC(0x1ec) CASEMBC(0x1ece)
884 EMIT2('O'); EMIT2(O_grave); EMIT2(O_acute);
885 EMIT2(O_circumflex); EMIT2(O_virguilla);
886 EMIT2(O_diaeresis); EMIT2(O_slash);
887 EMITMBC(0x14c) EMITMBC(0x14e) EMITMBC(0x150)
888 EMITMBC(0x1a0) EMITMBC(0x1d1) EMITMBC(0x1ea)
889 EMITMBC(0x1ec) EMITMBC(0x1ece)
890 return;
891
892 case 'P': case 0x1e54: case 0x1e56:
893 EMIT2('P'); EMITMBC(0x1e54) EMITMBC(0x1e56)
894 return;
895
896 case 'R': CASEMBC(0x154) CASEMBC(0x156) CASEMBC(0x158)
897 CASEMBC(0x1e58) CASEMBC(0x1e5e)
898 EMIT2('R'); EMITMBC(0x154) EMITMBC(0x156) EMITMBC(0x158)
899 EMITMBC(0x1e58) EMITMBC(0x1e5e)
900 return;
901
902 case 'S': CASEMBC(0x15a) CASEMBC(0x15c) CASEMBC(0x15e)
903 CASEMBC(0x160) CASEMBC(0x1e60)
904 EMIT2('S'); EMITMBC(0x15a) EMITMBC(0x15c) EMITMBC(0x15e)
905 EMITMBC(0x160) EMITMBC(0x1e60)
906 return;
907
908 case 'T': CASEMBC(0x162) CASEMBC(0x164) CASEMBC(0x166)
909 CASEMBC(0x1e6a) CASEMBC(0x1e6e)
910 EMIT2('T'); EMITMBC(0x162) EMITMBC(0x164) EMITMBC(0x166)
911 EMITMBC(0x1e6a) EMITMBC(0x1e6e)
912 return;
913
914 case 'U': case U_grave: case U_acute: case U_diaeresis:
915 case U_circumflex: CASEMBC(0x168) CASEMBC(0x16a)
916 CASEMBC(0x16c) CASEMBC(0x16e) CASEMBC(0x170)
917 CASEMBC(0x172) CASEMBC(0x1af) CASEMBC(0x1d3)
918 CASEMBC(0x1ee6)
919 EMIT2('U'); EMIT2(U_grave); EMIT2(U_acute);
920 EMIT2(U_diaeresis); EMIT2(U_circumflex);
921 EMITMBC(0x168) EMITMBC(0x16a)
922 EMITMBC(0x16c) EMITMBC(0x16e) EMITMBC(0x170)
923 EMITMBC(0x172) EMITMBC(0x1af) EMITMBC(0x1d3)
924 EMITMBC(0x1ee6)
925 return;
926
927 case 'V': CASEMBC(0x1e7c)
928 EMIT2('V'); EMITMBC(0x1e7c)
929 return;
930
931 case 'W': CASEMBC(0x174) CASEMBC(0x1e80) CASEMBC(0x1e82)
932 CASEMBC(0x1e84) CASEMBC(0x1e86)
933 EMIT2('W'); EMITMBC(0x174) EMITMBC(0x1e80) EMITMBC(0x1e82)
934 EMITMBC(0x1e84) EMITMBC(0x1e86)
935 return;
936
937 case 'X': CASEMBC(0x1e8a) CASEMBC(0x1e8c)
938 EMIT2('X'); EMITMBC(0x1e8a) EMITMBC(0x1e8c)
939 return;
940
941 case 'Y': case Y_acute: CASEMBC(0x176) CASEMBC(0x178)
942 CASEMBC(0x1e8e) CASEMBC(0x1ef2) CASEMBC(0x1ef6)
943 CASEMBC(0x1ef8)
944 EMIT2('Y'); EMIT2(Y_acute);
945 EMITMBC(0x176) EMITMBC(0x178)
946 EMITMBC(0x1e8e) EMITMBC(0x1ef2) EMITMBC(0x1ef6)
947 EMITMBC(0x1ef8)
948 return;
949
950 case 'Z': CASEMBC(0x179) CASEMBC(0x17b) CASEMBC(0x17d)
951 CASEMBC(0x1b5) CASEMBC(0x1e90) CASEMBC(0x1e94)
952 EMIT2('Z'); EMITMBC(0x179) EMITMBC(0x17b) EMITMBC(0x17d)
953 EMITMBC(0x1b5) EMITMBC(0x1e90) EMITMBC(0x1e94)
954 return;
955
956 case 'a': case a_grave: case a_acute: case a_circumflex:
957 case a_virguilla: case a_diaeresis: case a_ring:
958 CASEMBC(0x101) CASEMBC(0x103) CASEMBC(0x105)
959 CASEMBC(0x1ce) CASEMBC(0x1df) CASEMBC(0x1e1)
960 CASEMBC(0x1ea3)
961 EMIT2('a'); EMIT2(a_grave); EMIT2(a_acute);
962 EMIT2(a_circumflex); EMIT2(a_virguilla);
963 EMIT2(a_diaeresis); EMIT2(a_ring);
964 EMITMBC(0x101) EMITMBC(0x103) EMITMBC(0x105)
965 EMITMBC(0x1ce) EMITMBC(0x1df) EMITMBC(0x1e1)
966 EMITMBC(0x1ea3)
967 return;
968
969 case 'b': CASEMBC(0x1e03) CASEMBC(0x1e07)
970 EMIT2('b'); EMITMBC(0x1e03) EMITMBC(0x1e07)
971 return;
972
973 case 'c': case c_cedilla: CASEMBC(0x107) CASEMBC(0x109)
974 CASEMBC(0x10b) CASEMBC(0x10d)
975 EMIT2('c'); EMIT2(c_cedilla);
976 EMITMBC(0x107) EMITMBC(0x109)
977 EMITMBC(0x10b) EMITMBC(0x10d)
978 return;
979
980 case 'd': CASEMBC(0x10f) CASEMBC(0x111) CASEMBC(0x1e0b)
981 CASEMBC(0x1e0f) CASEMBC(0x1e11)
982 EMIT2('d'); EMITMBC(0x10f) EMITMBC(0x111) EMITMBC(0x1e0b)
983 EMITMBC(0x1e0f) EMITMBC(0x1e11)
984 return;
985
986 case 'e': case e_grave: case e_acute: case e_circumflex:
987 case e_diaeresis: CASEMBC(0x113) CASEMBC(0x115)
988 CASEMBC(0x117) CASEMBC(0x119) CASEMBC(0x11b)
989 CASEMBC(0x1ebb) CASEMBC(0x1ebd)
990 EMIT2('e'); EMIT2(e_grave); EMIT2(e_acute);
991 EMIT2(e_circumflex); EMIT2(e_diaeresis);
992 EMITMBC(0x113) EMITMBC(0x115)
993 EMITMBC(0x117) EMITMBC(0x119) EMITMBC(0x11b)
994 EMITMBC(0x1ebb) EMITMBC(0x1ebd)
995 return;
996
997 case 'f': CASEMBC(0x1e1f)
998 EMIT2('f'); EMITMBC(0x1e1f)
999 return;
1000
1001 case 'g': CASEMBC(0x11d) CASEMBC(0x11f) CASEMBC(0x121)
1002 CASEMBC(0x123) CASEMBC(0x1e5) CASEMBC(0x1e7)
1003 CASEMBC(0x1f5) CASEMBC(0x1e21)
1004 EMIT2('g'); EMITMBC(0x11d) EMITMBC(0x11f) EMITMBC(0x121)
1005 EMITMBC(0x123) EMITMBC(0x1e5) EMITMBC(0x1e7)
1006 EMITMBC(0x1f5) EMITMBC(0x1e21)
1007 return;
1008
1009 case 'h': CASEMBC(0x125) CASEMBC(0x127) CASEMBC(0x1e23)
1010 CASEMBC(0x1e27) CASEMBC(0x1e29) CASEMBC(0x1e96)
1011 EMIT2('h'); EMITMBC(0x125) EMITMBC(0x127) EMITMBC(0x1e23)
1012 EMITMBC(0x1e27) EMITMBC(0x1e29) EMITMBC(0x1e96)
1013 return;
1014
1015 case 'i': case i_grave: case i_acute: case i_circumflex:
1016 case i_diaeresis: CASEMBC(0x129) CASEMBC(0x12b)
1017 CASEMBC(0x12d) CASEMBC(0x12f) CASEMBC(0x1d0)
1018 CASEMBC(0x1ec9)
1019 EMIT2('i'); EMIT2(i_grave); EMIT2(i_acute);
1020 EMIT2(i_circumflex); EMIT2(i_diaeresis);
1021 EMITMBC(0x129) EMITMBC(0x12b)
1022 EMITMBC(0x12d) EMITMBC(0x12f) EMITMBC(0x1d0)
1023 EMITMBC(0x1ec9)
1024 return;
1025
1026 case 'j': CASEMBC(0x135) CASEMBC(0x1f0)
1027 EMIT2('j'); EMITMBC(0x135) EMITMBC(0x1f0)
1028 return;
1029
1030 case 'k': CASEMBC(0x137) CASEMBC(0x1e9) CASEMBC(0x1e31)
1031 CASEMBC(0x1e35)
1032 EMIT2('k'); EMITMBC(0x137) EMITMBC(0x1e9) EMITMBC(0x1e31)
1033 EMITMBC(0x1e35)
1034 return;
1035
1036 case 'l': CASEMBC(0x13a) CASEMBC(0x13c) CASEMBC(0x13e)
1037 CASEMBC(0x140) CASEMBC(0x142) CASEMBC(0x1e3b)
1038 EMIT2('l'); EMITMBC(0x13a) EMITMBC(0x13c) EMITMBC(0x13e)
1039 EMITMBC(0x140) EMITMBC(0x142) EMITMBC(0x1e3b)
1040 return;
1041
1042 case 'm': CASEMBC(0x1e3f) CASEMBC(0x1e41)
1043 EMIT2('m'); EMITMBC(0x1e3f) EMITMBC(0x1e41)
1044 return;
1045
1046 case 'n': case n_virguilla: CASEMBC(0x144) CASEMBC(0x146)
1047 CASEMBC(0x148) CASEMBC(0x149) CASEMBC(0x1e45)
1048 CASEMBC(0x1e49)
1049 EMIT2('n'); EMIT2(n_virguilla);
1050 EMITMBC(0x144) EMITMBC(0x146)
1051 EMITMBC(0x148) EMITMBC(0x149) EMITMBC(0x1e45)
1052 EMITMBC(0x1e49)
1053 return;
1054
1055 case 'o': case o_grave: case o_acute: case o_circumflex:
1056 case o_virguilla: case o_diaeresis: case o_slash:
1057 CASEMBC(0x14d) CASEMBC(0x14f) CASEMBC(0x151)
1058 CASEMBC(0x1a1) CASEMBC(0x1d2) CASEMBC(0x1eb)
1059 CASEMBC(0x1ed) CASEMBC(0x1ecf)
1060 EMIT2('o'); EMIT2(o_grave); EMIT2(o_acute);
1061 EMIT2(o_circumflex); EMIT2(o_virguilla);
1062 EMIT2(o_diaeresis); EMIT2(o_slash);
1063 EMITMBC(0x14d) EMITMBC(0x14f) EMITMBC(0x151)
1064 EMITMBC(0x1a1) EMITMBC(0x1d2) EMITMBC(0x1eb)
1065 EMITMBC(0x1ed) EMITMBC(0x1ecf)
1066 return;
1067
1068 case 'p': CASEMBC(0x1e55) CASEMBC(0x1e57)
1069 EMIT2('p'); EMITMBC(0x1e55) EMITMBC(0x1e57)
1070 return;
1071
1072 case 'r': CASEMBC(0x155) CASEMBC(0x157) CASEMBC(0x159)
1073 CASEMBC(0x1e59) CASEMBC(0x1e5f)
1074 EMIT2('r'); EMITMBC(0x155) EMITMBC(0x157) EMITMBC(0x159)
1075 EMITMBC(0x1e59) EMITMBC(0x1e5f)
1076 return;
1077
1078 case 's': CASEMBC(0x15b) CASEMBC(0x15d) CASEMBC(0x15f)
1079 CASEMBC(0x161) CASEMBC(0x1e61)
1080 EMIT2('s'); EMITMBC(0x15b) EMITMBC(0x15d) EMITMBC(0x15f)
1081 EMITMBC(0x161) EMITMBC(0x1e61)
1082 return;
1083
1084 case 't': CASEMBC(0x163) CASEMBC(0x165) CASEMBC(0x167)
1085 CASEMBC(0x1e6b) CASEMBC(0x1e6f) CASEMBC(0x1e97)
1086 EMIT2('t'); EMITMBC(0x163) EMITMBC(0x165) EMITMBC(0x167)
1087 EMITMBC(0x1e6b) EMITMBC(0x1e6f) EMITMBC(0x1e97)
1088 return;
1089
1090 case 'u': case u_grave: case u_acute: case u_circumflex:
1091 case u_diaeresis: CASEMBC(0x169) CASEMBC(0x16b)
1092 CASEMBC(0x16d) CASEMBC(0x16f) CASEMBC(0x171)
1093 CASEMBC(0x173) CASEMBC(0x1b0) CASEMBC(0x1d4)
1094 CASEMBC(0x1ee7)
1095 EMIT2('u'); EMIT2(u_grave); EMIT2(u_acute);
1096 EMIT2(u_circumflex); EMIT2(u_diaeresis);
1097 EMITMBC(0x169) EMITMBC(0x16b)
1098 EMITMBC(0x16d) EMITMBC(0x16f) EMITMBC(0x171)
1099 EMITMBC(0x173) EMITMBC(0x1b0) EMITMBC(0x1d4)
1100 EMITMBC(0x1ee7)
1101 return;
1102
1103 case 'v': CASEMBC(0x1e7d)
1104 EMIT2('v'); EMITMBC(0x1e7d)
1105 return;
1106
1107 case 'w': CASEMBC(0x175) CASEMBC(0x1e81) CASEMBC(0x1e83)
1108 CASEMBC(0x1e85) CASEMBC(0x1e87) CASEMBC(0x1e98)
1109 EMIT2('w'); EMITMBC(0x175) EMITMBC(0x1e81) EMITMBC(0x1e83)
1110 EMITMBC(0x1e85) EMITMBC(0x1e87) EMITMBC(0x1e98)
1111 return;
1112
1113 case 'x': CASEMBC(0x1e8b) CASEMBC(0x1e8d)
1114 EMIT2('x'); EMITMBC(0x1e8b) EMITMBC(0x1e8d)
1115 return;
1116
1117 case 'y': case y_acute: case y_diaeresis: CASEMBC(0x177)
1118 CASEMBC(0x1e8f) CASEMBC(0x1e99) CASEMBC(0x1ef3)
1119 CASEMBC(0x1ef7) CASEMBC(0x1ef9)
1120 EMIT2('y'); EMIT2(y_acute); EMIT2(y_diaeresis);
1121 EMITMBC(0x177)
1122 EMITMBC(0x1e8f) EMITMBC(0x1e99) EMITMBC(0x1ef3)
1123 EMITMBC(0x1ef7) EMITMBC(0x1ef9)
1124 return;
1125
1126 case 'z': CASEMBC(0x17a) CASEMBC(0x17c) CASEMBC(0x17e)
1127 CASEMBC(0x1b6) CASEMBC(0x1e91) CASEMBC(0x1e95)
1128 EMIT2('z'); EMITMBC(0x17a) EMITMBC(0x17c) EMITMBC(0x17e)
1129 EMITMBC(0x1b6) EMITMBC(0x1e91) EMITMBC(0x1e95)
1130 return;
1131
1132 /* default: character itself */
1133 }
1134 }
1135
1136 EMIT2(c);
1137#undef EMIT2
1138#undef EMITMBC
1139}
1140
1141/*
1142 * Code to parse regular expression.
1143 *
1144 * We try to reuse parsing functions in regexp.c to
1145 * minimize surprise and keep the syntax consistent.
1146 */
1147
1148/*
1149 * Parse the lowest level.
1150 *
1151 * An atom can be one of a long list of items. Many atoms match one character
1152 * in the text. It is often an ordinary character or a character class.
1153 * Braces can be used to make a pattern into an atom. The "\z(\)" construct
1154 * is only for syntax highlighting.
1155 *
1156 * atom ::= ordinary-atom
1157 * or \( pattern \)
1158 * or \%( pattern \)
1159 * or \z( pattern \)
1160 */
1161static int nfa_regatom(void)
1162{
1163 int c;
1164 int charclass;
1165 int equiclass;
1166 int collclass;
1167 int got_coll_char;
1168 char_u *p;
1169 char_u *endp;
1170 char_u *old_regparse = regparse;
1171 int extra = 0;
1172 int emit_range;
1173 int negated;
1174 int startc = -1;
1175 int endc = -1;
1176 int oldstartc = -1;
1177 int save_prev_at_start = prev_at_start;
1178
1179 c = getchr();
1180 switch (c) {
1181 case NUL:
1182 EMSG_RET_FAIL(_(e_nul_found));
1183
1184 case Magic('^'):
1185 EMIT(NFA_BOL);
1186 break;
1187
1188 case Magic('$'):
1189 EMIT(NFA_EOL);
1190 had_eol = TRUE;
1191 break;
1192
1193 case Magic('<'):
1194 EMIT(NFA_BOW);
1195 break;
1196
1197 case Magic('>'):
1198 EMIT(NFA_EOW);
1199 break;
1200
1201 case Magic('_'):
1202 c = no_Magic(getchr());
1203 if (c == NUL)
1204 EMSG_RET_FAIL(_(e_nul_found));
1205
1206 if (c == '^') { /* "\_^" is start-of-line */
1207 EMIT(NFA_BOL);
1208 break;
1209 }
1210 if (c == '$') { /* "\_$" is end-of-line */
1211 EMIT(NFA_EOL);
1212 had_eol = TRUE;
1213 break;
1214 }
1215
1216 extra = NFA_ADD_NL;
1217
1218 /* "\_[" is collection plus newline */
1219 if (c == '[')
1220 goto collection;
1221
1222 // "\_x" is character class plus newline
1223 FALLTHROUGH;
1224
1225 /*
1226 * Character classes.
1227 */
1228 case Magic('.'):
1229 case Magic('i'):
1230 case Magic('I'):
1231 case Magic('k'):
1232 case Magic('K'):
1233 case Magic('f'):
1234 case Magic('F'):
1235 case Magic('p'):
1236 case Magic('P'):
1237 case Magic('s'):
1238 case Magic('S'):
1239 case Magic('d'):
1240 case Magic('D'):
1241 case Magic('x'):
1242 case Magic('X'):
1243 case Magic('o'):
1244 case Magic('O'):
1245 case Magic('w'):
1246 case Magic('W'):
1247 case Magic('h'):
1248 case Magic('H'):
1249 case Magic('a'):
1250 case Magic('A'):
1251 case Magic('l'):
1252 case Magic('L'):
1253 case Magic('u'):
1254 case Magic('U'):
1255 p = vim_strchr(classchars, no_Magic(c));
1256 if (p == NULL) {
1257 if (extra == NFA_ADD_NL) {
1258 EMSGN(_(e_ill_char_class), c);
1259 rc_did_emsg = TRUE;
1260 return FAIL;
1261 }
1262 IEMSGN("INTERNAL: Unknown character class char: %" PRId64, c);
1263 return FAIL;
1264 }
1265 // When '.' is followed by a composing char ignore the dot, so that
1266 // the composing char is matched here.
1267 if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr())) {
1268 old_regparse = regparse;
1269 c = getchr();
1270 goto nfa_do_multibyte;
1271 }
1272 EMIT(nfa_classcodes[p - classchars]);
1273 if (extra == NFA_ADD_NL) {
1274 EMIT(NFA_NEWL);
1275 EMIT(NFA_OR);
1276 regflags |= RF_HASNL;
1277 }
1278 break;
1279
1280 case Magic('n'):
1281 if (reg_string) {
1282 // In a string "\n" matches a newline character.
1283 EMIT(NL);
1284 } else {
1285 // In buffer text "\n" matches the end of a line.
1286 EMIT(NFA_NEWL);
1287 regflags |= RF_HASNL;
1288 }
1289 break;
1290
1291 case Magic('('):
1292 if (nfa_reg(REG_PAREN) == FAIL) {
1293 return FAIL; // cascaded error
1294 }
1295 break;
1296
1297 case Magic('|'):
1298 case Magic('&'):
1299 case Magic(')'):
1300 EMSGN(_(e_misplaced), no_Magic(c)); // -V1037
1301 return FAIL;
1302
1303 case Magic('='):
1304 case Magic('?'):
1305 case Magic('+'):
1306 case Magic('@'):
1307 case Magic('*'):
1308 case Magic('{'):
1309 // these should follow an atom, not form an atom
1310 EMSGN(_(e_misplaced), no_Magic(c));
1311 return FAIL;
1312
1313 case Magic('~'):
1314 {
1315 char_u *lp;
1316
1317 // Previous substitute pattern.
1318 // Generated as "\%(pattern\)".
1319 if (reg_prev_sub == NULL) {
1320 EMSG(_(e_nopresub));
1321 return FAIL;
1322 }
1323 for (lp = reg_prev_sub; *lp != NUL; MB_CPTR_ADV(lp)) {
1324 EMIT(PTR2CHAR(lp));
1325 if (lp != reg_prev_sub)
1326 EMIT(NFA_CONCAT);
1327 }
1328 EMIT(NFA_NOPEN);
1329 break;
1330 }
1331
1332 case Magic('1'):
1333 case Magic('2'):
1334 case Magic('3'):
1335 case Magic('4'):
1336 case Magic('5'):
1337 case Magic('6'):
1338 case Magic('7'):
1339 case Magic('8'):
1340 case Magic('9'):
1341 {
1342 int refnum = no_Magic(c) - '1';
1343
1344 if (!seen_endbrace(refnum + 1)) {
1345 return FAIL;
1346 }
1347 EMIT(NFA_BACKREF1 + refnum);
1348 nfa_has_backref = true;
1349 }
1350 break;
1351
1352 case Magic('z'):
1353 c = no_Magic(getchr());
1354 switch (c) {
1355 case 's':
1356 EMIT(NFA_ZSTART);
1357 if (!re_mult_next("\\zs")) {
1358 return false;
1359 }
1360 break;
1361 case 'e':
1362 EMIT(NFA_ZEND);
1363 nfa_has_zend = true;
1364 if (!re_mult_next("\\zs")) {
1365 return false;
1366 }
1367 break;
1368 case '1':
1369 case '2':
1370 case '3':
1371 case '4':
1372 case '5':
1373 case '6':
1374 case '7':
1375 case '8':
1376 case '9':
1377 // \z1...\z9
1378 if ((reg_do_extmatch & REX_USE) == 0) {
1379 EMSG_RET_FAIL(_(e_z1_not_allowed));
1380 }
1381 EMIT(NFA_ZREF1 + (no_Magic(c) - '1'));
1382 /* No need to set nfa_has_backref, the sub-matches don't
1383 * change when \z1 .. \z9 matches or not. */
1384 re_has_z = REX_USE;
1385 break;
1386 case '(':
1387 // \z(
1388 if (reg_do_extmatch != REX_SET) {
1389 EMSG_RET_FAIL(_(e_z_not_allowed));
1390 }
1391 if (nfa_reg(REG_ZPAREN) == FAIL) {
1392 return FAIL; // cascaded error
1393 }
1394 re_has_z = REX_SET;
1395 break;
1396 default:
1397 emsgf(_("E867: (NFA) Unknown operator '\\z%c'"),
1398 no_Magic(c));
1399 return FAIL;
1400 }
1401 break;
1402
1403 case Magic('%'):
1404 c = no_Magic(getchr());
1405 switch (c) {
1406 /* () without a back reference */
1407 case '(':
1408 if (nfa_reg(REG_NPAREN) == FAIL)
1409 return FAIL;
1410 EMIT(NFA_NOPEN);
1411 break;
1412
1413 case 'd': /* %d123 decimal */
1414 case 'o': /* %o123 octal */
1415 case 'x': /* %xab hex 2 */
1416 case 'u': /* %uabcd hex 4 */
1417 case 'U': /* %U1234abcd hex 8 */
1418 {
1419 int64_t nr;
1420
1421 switch (c) {
1422 case 'd': nr = getdecchrs(); break;
1423 case 'o': nr = getoctchrs(); break;
1424 case 'x': nr = gethexchrs(2); break;
1425 case 'u': nr = gethexchrs(4); break;
1426 case 'U': nr = gethexchrs(8); break;
1427 default: nr = -1; break;
1428 }
1429
1430 if (nr < 0 || nr > INT_MAX) {
1431 EMSG2_RET_FAIL(_("E678: Invalid character after %s%%[dxouU]"),
1432 reg_magic == MAGIC_ALL);
1433 }
1434 // A NUL is stored in the text as NL
1435 // TODO(vim): what if a composing character follows?
1436 EMIT(nr == 0 ? 0x0a : nr);
1437 }
1438 break;
1439
1440 /* Catch \%^ and \%$ regardless of where they appear in the
1441 * pattern -- regardless of whether or not it makes sense. */
1442 case '^':
1443 EMIT(NFA_BOF);
1444 break;
1445
1446 case '$':
1447 EMIT(NFA_EOF);
1448 break;
1449
1450 case '#':
1451 EMIT(NFA_CURSOR);
1452 break;
1453
1454 case 'V':
1455 EMIT(NFA_VISUAL);
1456 break;
1457
1458 case 'C':
1459 EMIT(NFA_ANY_COMPOSING);
1460 break;
1461
1462 case '[':
1463 {
1464 int n;
1465
1466 /* \%[abc] */
1467 for (n = 0; (c = peekchr()) != ']'; ++n) {
1468 if (c == NUL)
1469 EMSG2_RET_FAIL(_(e_missing_sb),
1470 reg_magic == MAGIC_ALL);
1471 /* recursive call! */
1472 if (nfa_regatom() == FAIL)
1473 return FAIL;
1474 }
1475 getchr(); /* get the ] */
1476 if (n == 0)
1477 EMSG2_RET_FAIL(_(e_empty_sb),
1478 reg_magic == MAGIC_ALL);
1479 EMIT(NFA_OPT_CHARS);
1480 EMIT(n);
1481
1482 /* Emit as "\%(\%[abc]\)" to be able to handle
1483 * "\%[abc]*" which would cause the empty string to be
1484 * matched an unlimited number of times. NFA_NOPEN is
1485 * added only once at a position, while NFA_SPLIT is
1486 * added multiple times. This is more efficient than
1487 * not allowing NFA_SPLIT multiple times, it is used
1488 * a lot. */
1489 EMIT(NFA_NOPEN);
1490 break;
1491 }
1492
1493 default:
1494 {
1495 int64_t n = 0;
1496 const int cmp = c;
1497
1498 if (c == '<' || c == '>')
1499 c = getchr();
1500 while (ascii_isdigit(c)) {
1501 if (n > (INT32_MAX - (c - '0')) / 10) {
1502 EMSG(_("E951: \\% value too large"));
1503 return FAIL;
1504 }
1505 n = n * 10 + (c - '0');
1506 c = getchr();
1507 }
1508 if (c == 'l' || c == 'c' || c == 'v') {
1509 int32_t limit = INT32_MAX;
1510
1511 if (c == 'l') {
1512 // \%{n}l \%{n}<l \%{n}>l
1513 EMIT(cmp == '<' ? NFA_LNUM_LT :
1514 cmp == '>' ? NFA_LNUM_GT : NFA_LNUM);
1515 if (save_prev_at_start) {
1516 at_start = true;
1517 }
1518 } else if (c == 'c') {
1519 // \%{n}c \%{n}<c \%{n}>c
1520 EMIT(cmp == '<' ? NFA_COL_LT :
1521 cmp == '>' ? NFA_COL_GT : NFA_COL);
1522 } else {
1523 // \%{n}v \%{n}<v \%{n}>v
1524 EMIT(cmp == '<' ? NFA_VCOL_LT :
1525 cmp == '>' ? NFA_VCOL_GT : NFA_VCOL);
1526 limit = INT32_MAX / MB_MAXBYTES;
1527 }
1528 if (n >= limit) {
1529 EMSG(_("E951: \\% value too large"));
1530 return FAIL;
1531 }
1532 EMIT((int)n);
1533 break;
1534 } else if (c == '\'' && n == 0) {
1535 /* \%'m \%<'m \%>'m */
1536 EMIT(cmp == '<' ? NFA_MARK_LT :
1537 cmp == '>' ? NFA_MARK_GT : NFA_MARK);
1538 EMIT(getchr());
1539 break;
1540 }
1541 }
1542 emsgf(_("E867: (NFA) Unknown operator '\\%%%c'"),
1543 no_Magic(c));
1544 return FAIL;
1545 }
1546 break;
1547
1548 case Magic('['):
1549collection:
1550 /*
1551 * [abc] uses NFA_START_COLL - NFA_END_COLL
1552 * [^abc] uses NFA_START_NEG_COLL - NFA_END_NEG_COLL
1553 * Each character is produced as a regular state, using
1554 * NFA_CONCAT to bind them together.
1555 * Besides normal characters there can be:
1556 * - character classes NFA_CLASS_*
1557 * - ranges, two characters followed by NFA_RANGE.
1558 */
1559
1560 p = regparse;
1561 endp = skip_anyof(p);
1562 if (*endp == ']') {
1563 /*
1564 * Try to reverse engineer character classes. For example,
1565 * recognize that [0-9] stands for \d and [A-Za-z_] for \h,
1566 * and perform the necessary substitutions in the NFA.
1567 */
1568 int result = nfa_recognize_char_class(regparse, endp,
1569 extra == NFA_ADD_NL);
1570 if (result != FAIL) {
1571 if (result >= NFA_FIRST_NL && result <= NFA_LAST_NL) {
1572 EMIT(result - NFA_ADD_NL);
1573 EMIT(NFA_NEWL);
1574 EMIT(NFA_OR);
1575 } else
1576 EMIT(result);
1577 regparse = endp;
1578 MB_PTR_ADV(regparse);
1579 return OK;
1580 }
1581 /*
1582 * Failed to recognize a character class. Use the simple
1583 * version that turns [abc] into 'a' OR 'b' OR 'c'
1584 */
1585 startc = endc = oldstartc = -1;
1586 negated = false;
1587 if (*regparse == '^') { // negated range
1588 negated = true;
1589 MB_PTR_ADV(regparse);
1590 EMIT(NFA_START_NEG_COLL);
1591 } else
1592 EMIT(NFA_START_COLL);
1593 if (*regparse == '-') {
1594 startc = '-';
1595 EMIT(startc);
1596 EMIT(NFA_CONCAT);
1597 MB_PTR_ADV(regparse);
1598 }
1599 /* Emit the OR branches for each character in the [] */
1600 emit_range = FALSE;
1601 while (regparse < endp) {
1602 oldstartc = startc;
1603 startc = -1;
1604 got_coll_char = FALSE;
1605 if (*regparse == '[') {
1606 /* Check for [: :], [= =], [. .] */
1607 equiclass = collclass = 0;
1608 charclass = get_char_class(&regparse);
1609 if (charclass == CLASS_NONE) {
1610 equiclass = get_equi_class(&regparse);
1611 if (equiclass == 0)
1612 collclass = get_coll_element(&regparse);
1613 }
1614
1615 /* Character class like [:alpha:] */
1616 if (charclass != CLASS_NONE) {
1617 switch (charclass) {
1618 case CLASS_ALNUM:
1619 EMIT(NFA_CLASS_ALNUM);
1620 break;
1621 case CLASS_ALPHA:
1622 EMIT(NFA_CLASS_ALPHA);
1623 break;
1624 case CLASS_BLANK:
1625 EMIT(NFA_CLASS_BLANK);
1626 break;
1627 case CLASS_CNTRL:
1628 EMIT(NFA_CLASS_CNTRL);
1629 break;
1630 case CLASS_DIGIT:
1631 EMIT(NFA_CLASS_DIGIT);
1632 break;
1633 case CLASS_GRAPH:
1634 EMIT(NFA_CLASS_GRAPH);
1635 break;
1636 case CLASS_LOWER:
1637 EMIT(NFA_CLASS_LOWER);
1638 break;
1639 case CLASS_PRINT:
1640 EMIT(NFA_CLASS_PRINT);
1641 break;
1642 case CLASS_PUNCT:
1643 EMIT(NFA_CLASS_PUNCT);
1644 break;
1645 case CLASS_SPACE:
1646 EMIT(NFA_CLASS_SPACE);
1647 break;
1648 case CLASS_UPPER:
1649 EMIT(NFA_CLASS_UPPER);
1650 break;
1651 case CLASS_XDIGIT:
1652 EMIT(NFA_CLASS_XDIGIT);
1653 break;
1654 case CLASS_TAB:
1655 EMIT(NFA_CLASS_TAB);
1656 break;
1657 case CLASS_RETURN:
1658 EMIT(NFA_CLASS_RETURN);
1659 break;
1660 case CLASS_BACKSPACE:
1661 EMIT(NFA_CLASS_BACKSPACE);
1662 break;
1663 case CLASS_ESCAPE:
1664 EMIT(NFA_CLASS_ESCAPE);
1665 break;
1666 }
1667 EMIT(NFA_CONCAT);
1668 continue;
1669 }
1670 /* Try equivalence class [=a=] and the like */
1671 if (equiclass != 0) {
1672 nfa_emit_equi_class(equiclass);
1673 continue;
1674 }
1675 /* Try collating class like [. .] */
1676 if (collclass != 0) {
1677 startc = collclass; /* allow [.a.]-x as a range */
1678 /* Will emit the proper atom at the end of the
1679 * while loop. */
1680 }
1681 }
1682 /* Try a range like 'a-x' or '\t-z'. Also allows '-' as a
1683 * start character. */
1684 if (*regparse == '-' && oldstartc != -1) {
1685 emit_range = TRUE;
1686 startc = oldstartc;
1687 MB_PTR_ADV(regparse);
1688 continue; // reading the end of the range
1689 }
1690
1691 /* Now handle simple and escaped characters.
1692 * Only "\]", "\^", "\]" and "\\" are special in Vi. Vim
1693 * accepts "\t", "\e", etc., but only when the 'l' flag in
1694 * 'cpoptions' is not included.
1695 */
1696 if (*regparse == '\\'
1697 && regparse + 1 <= endp
1698 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
1699 || (!reg_cpo_lit
1700 && vim_strchr(REGEXP_ABBR, regparse[1])
1701 != NULL)
1702 )
1703 ) {
1704 MB_PTR_ADV(regparse);
1705
1706 if (*regparse == 'n') {
1707 startc = (reg_string || emit_range || regparse[1] == '-')
1708 ? NL : NFA_NEWL;
1709 } else if (*regparse == 'd'
1710 || *regparse == 'o'
1711 || *regparse == 'x'
1712 || *regparse == 'u'
1713 || *regparse == 'U'
1714 ) {
1715 // TODO(RE): This needs more testing
1716 startc = coll_get_char();
1717 got_coll_char = true;
1718 MB_PTR_BACK(old_regparse, regparse);
1719 } else {
1720 /* \r,\t,\e,\b */
1721 startc = backslash_trans(*regparse);
1722 }
1723 }
1724
1725 /* Normal printable char */
1726 if (startc == -1)
1727 startc = PTR2CHAR(regparse);
1728
1729 /* Previous char was '-', so this char is end of range. */
1730 if (emit_range) {
1731 endc = startc;
1732 startc = oldstartc;
1733 if (startc > endc) {
1734 EMSG_RET_FAIL(_(e_reverse_range));
1735 }
1736
1737 if (endc > startc + 2) {
1738 /* Emit a range instead of the sequence of
1739 * individual characters. */
1740 if (startc == 0)
1741 /* \x00 is translated to \x0a, start at \x01. */
1742 EMIT(1);
1743 else
1744 --post_ptr; /* remove NFA_CONCAT */
1745 EMIT(endc);
1746 EMIT(NFA_RANGE);
1747 EMIT(NFA_CONCAT);
1748 } else if (has_mbyte && ((*mb_char2len)(startc) > 1
1749 || (*mb_char2len)(endc) > 1)) {
1750 /* Emit the characters in the range.
1751 * "startc" was already emitted, so skip it.
1752 * */
1753 for (c = startc + 1; c <= endc; c++) {
1754 EMIT(c);
1755 EMIT(NFA_CONCAT);
1756 }
1757 } else {
1758 /* Emit the range. "startc" was already emitted, so
1759 * skip it. */
1760 for (c = startc + 1; c <= endc; c++) {
1761 EMIT(c);
1762 EMIT(NFA_CONCAT);
1763 }
1764 }
1765 emit_range = FALSE;
1766 startc = -1;
1767 } else {
1768 /* This char (startc) is not part of a range. Just
1769 * emit it.
1770 * Normally, simply emit startc. But if we get char
1771 * code=0 from a collating char, then replace it with
1772 * 0x0a.
1773 * This is needed to completely mimic the behaviour of
1774 * the backtracking engine. */
1775 if (startc == NFA_NEWL) {
1776 /* Line break can't be matched as part of the
1777 * collection, add an OR below. But not for negated
1778 * range. */
1779 if (!negated)
1780 extra = NFA_ADD_NL;
1781 } else {
1782 if (got_coll_char == TRUE && startc == 0)
1783 EMIT(0x0a);
1784 else
1785 EMIT(startc);
1786 EMIT(NFA_CONCAT);
1787 }
1788 }
1789
1790 MB_PTR_ADV(regparse);
1791 } // while (p < endp)
1792
1793 MB_PTR_BACK(old_regparse, regparse);
1794 if (*regparse == '-') { // if last, '-' is just a char
1795 EMIT('-');
1796 EMIT(NFA_CONCAT);
1797 }
1798
1799 /* skip the trailing ] */
1800 regparse = endp;
1801 MB_PTR_ADV(regparse);
1802
1803 /* Mark end of the collection. */
1804 if (negated == TRUE)
1805 EMIT(NFA_END_NEG_COLL);
1806 else
1807 EMIT(NFA_END_COLL);
1808
1809 /* \_[] also matches \n but it's not negated */
1810 if (extra == NFA_ADD_NL) {
1811 EMIT(reg_string ? NL : NFA_NEWL);
1812 EMIT(NFA_OR);
1813 }
1814
1815 return OK;
1816 } /* if exists closing ] */
1817
1818 if (reg_strict)
1819 EMSG_RET_FAIL(_(e_missingbracket));
1820 FALLTHROUGH;
1821
1822 default:
1823 {
1824 int plen;
1825
1826nfa_do_multibyte:
1827 // plen is length of current char with composing chars
1828 if (enc_utf8 && ((*mb_char2len)(c)
1829 != (plen = utfc_ptr2len(old_regparse))
1830 || utf_iscomposing(c))) {
1831 int i = 0;
1832
1833 /* A base character plus composing characters, or just one
1834 * or more composing characters.
1835 * This requires creating a separate atom as if enclosing
1836 * the characters in (), where NFA_COMPOSING is the ( and
1837 * NFA_END_COMPOSING is the ). Note that right now we are
1838 * building the postfix form, not the NFA itself;
1839 * a composing char could be: a, b, c, NFA_COMPOSING
1840 * where 'b' and 'c' are chars with codes > 256. */
1841 for (;; ) {
1842 EMIT(c);
1843 if (i > 0)
1844 EMIT(NFA_CONCAT);
1845 if ((i += utf_char2len(c)) >= plen)
1846 break;
1847 c = utf_ptr2char(old_regparse + i);
1848 }
1849 EMIT(NFA_COMPOSING);
1850 regparse = old_regparse + plen;
1851 } else {
1852 c = no_Magic(c);
1853 EMIT(c);
1854 }
1855 return OK;
1856 }
1857 }
1858
1859 return OK;
1860}
1861
1862/*
1863 * Parse something followed by possible [*+=].
1864 *
1865 * A piece is an atom, possibly followed by a multi, an indication of how many
1866 * times the atom can be matched. Example: "a*" matches any sequence of "a"
1867 * characters: "", "a", "aa", etc.
1868 *
1869 * piece ::= atom
1870 * or atom multi
1871 */
1872static int nfa_regpiece(void)
1873{
1874 int i;
1875 int op;
1876 int ret;
1877 long minval, maxval;
1878 int greedy = TRUE; /* Braces are prefixed with '-' ? */
1879 parse_state_T old_state;
1880 parse_state_T new_state;
1881 int64_t c2;
1882 int old_post_pos;
1883 int my_post_start;
1884 int quest;
1885
1886 /* Save the current parse state, so that we can use it if <atom>{m,n} is
1887 * next. */
1888 save_parse_state(&old_state);
1889
1890 /* store current pos in the postfix form, for \{m,n} involving 0s */
1891 my_post_start = (int)(post_ptr - post_start);
1892
1893 ret = nfa_regatom();
1894 if (ret == FAIL)
1895 return FAIL; /* cascaded error */
1896
1897 op = peekchr();
1898 if (re_multi_type(op) == NOT_MULTI)
1899 return OK;
1900
1901 skipchr();
1902 switch (op) {
1903 case Magic('*'):
1904 EMIT(NFA_STAR);
1905 break;
1906
1907 case Magic('+'):
1908 /*
1909 * Trick: Normally, (a*)\+ would match the whole input "aaa". The
1910 * first and only submatch would be "aaa". But the backtracking
1911 * engine interprets the plus as "try matching one more time", and
1912 * a* matches a second time at the end of the input, the empty
1913 * string.
1914 * The submatch will be the empty string.
1915 *
1916 * In order to be consistent with the old engine, we replace
1917 * <atom>+ with <atom><atom>*
1918 */
1919 restore_parse_state(&old_state);
1920 curchr = -1;
1921 if (nfa_regatom() == FAIL)
1922 return FAIL;
1923 EMIT(NFA_STAR);
1924 EMIT(NFA_CONCAT);
1925 skipchr(); /* skip the \+ */
1926 break;
1927
1928 case Magic('@'):
1929 c2 = getdecchrs();
1930 op = no_Magic(getchr());
1931 i = 0;
1932 switch(op) {
1933 case '=':
1934 /* \@= */
1935 i = NFA_PREV_ATOM_NO_WIDTH;
1936 break;
1937 case '!':
1938 /* \@! */
1939 i = NFA_PREV_ATOM_NO_WIDTH_NEG;
1940 break;
1941 case '<':
1942 op = no_Magic(getchr());
1943 if (op == '=')
1944 /* \@<= */
1945 i = NFA_PREV_ATOM_JUST_BEFORE;
1946 else if (op == '!')
1947 /* \@<! */
1948 i = NFA_PREV_ATOM_JUST_BEFORE_NEG;
1949 break;
1950 case '>':
1951 /* \@> */
1952 i = NFA_PREV_ATOM_LIKE_PATTERN;
1953 break;
1954 }
1955 if (i == 0) {
1956 emsgf(_("E869: (NFA) Unknown operator '\\@%c'"), op);
1957 return FAIL;
1958 }
1959 EMIT(i);
1960 if (i == NFA_PREV_ATOM_JUST_BEFORE
1961 || i == NFA_PREV_ATOM_JUST_BEFORE_NEG)
1962 EMIT(c2);
1963 break;
1964
1965 case Magic('?'):
1966 case Magic('='):
1967 EMIT(NFA_QUEST);
1968 break;
1969
1970 case Magic('{'):
1971 /* a{2,5} will expand to 'aaa?a?a?'
1972 * a{-1,3} will expand to 'aa??a??', where ?? is the nongreedy
1973 * version of '?'
1974 * \v(ab){2,3} will expand to '(ab)(ab)(ab)?', where all the
1975 * parenthesis have the same id
1976 */
1977
1978 greedy = TRUE;
1979 c2 = peekchr();
1980 if (c2 == '-' || c2 == Magic('-')) {
1981 skipchr();
1982 greedy = FALSE;
1983 }
1984 if (!read_limits(&minval, &maxval))
1985 EMSG_RET_FAIL(_("E870: (NFA regexp) Error reading repetition limits"));
1986
1987 /* <atom>{0,inf}, <atom>{0,} and <atom>{} are equivalent to
1988 * <atom>* */
1989 if (minval == 0 && maxval == MAX_LIMIT) {
1990 if (greedy)
1991 /* \{}, \{0,} */
1992 EMIT(NFA_STAR);
1993 else
1994 /* \{-}, \{-0,} */
1995 EMIT(NFA_STAR_NONGREEDY);
1996 break;
1997 }
1998
1999 /* Special case: x{0} or x{-0} */
2000 if (maxval == 0) {
2001 /* Ignore result of previous call to nfa_regatom() */
2002 post_ptr = post_start + my_post_start;
2003 /* NFA_EMPTY is 0-length and works everywhere */
2004 EMIT(NFA_EMPTY);
2005 return OK;
2006 }
2007
2008 // The engine is very inefficient (uses too many states) when the maximum
2009 // is much larger than the minimum and when the maximum is large. Bail out
2010 // if we can use the other engine.
2011 if ((nfa_re_flags & RE_AUTO) && (maxval > 500 || maxval > minval + 200)) {
2012 return FAIL;
2013 }
2014
2015 /* Ignore previous call to nfa_regatom() */
2016 post_ptr = post_start + my_post_start;
2017 /* Save parse state after the repeated atom and the \{} */
2018 save_parse_state(&new_state);
2019
2020 quest = (greedy == TRUE ? NFA_QUEST : NFA_QUEST_NONGREEDY);
2021 for (i = 0; i < maxval; i++) {
2022 /* Goto beginning of the repeated atom */
2023 restore_parse_state(&old_state);
2024 old_post_pos = (int)(post_ptr - post_start);
2025 if (nfa_regatom() == FAIL)
2026 return FAIL;
2027 /* after "minval" times, atoms are optional */
2028 if (i + 1 > minval) {
2029 if (maxval == MAX_LIMIT) {
2030 if (greedy)
2031 EMIT(NFA_STAR);
2032 else
2033 EMIT(NFA_STAR_NONGREEDY);
2034 } else
2035 EMIT(quest);
2036 }
2037 if (old_post_pos != my_post_start)
2038 EMIT(NFA_CONCAT);
2039 if (i + 1 > minval && maxval == MAX_LIMIT)
2040 break;
2041 }
2042
2043 /* Go to just after the repeated atom and the \{} */
2044 restore_parse_state(&new_state);
2045 curchr = -1;
2046
2047 break;
2048
2049
2050 default:
2051 break;
2052 } /* end switch */
2053
2054 if (re_multi_type(peekchr()) != NOT_MULTI) {
2055 // Can't have a multi follow a multi.
2056 EMSG_RET_FAIL(_("E871: (NFA regexp) Can't have a multi follow a multi"));
2057 }
2058
2059 return OK;
2060}
2061
2062/*
2063 * Parse one or more pieces, concatenated. It matches a match for the
2064 * first piece, followed by a match for the second piece, etc. Example:
2065 * "f[0-9]b", first matches "f", then a digit and then "b".
2066 *
2067 * concat ::= piece
2068 * or piece piece
2069 * or piece piece piece
2070 * etc.
2071 */
2072static int nfa_regconcat(void)
2073{
2074 int cont = TRUE;
2075 int first = TRUE;
2076
2077 while (cont) {
2078 switch (peekchr()) {
2079 case NUL:
2080 case Magic('|'):
2081 case Magic('&'):
2082 case Magic(')'):
2083 cont = FALSE;
2084 break;
2085
2086 case Magic('Z'):
2087 regflags |= RF_ICOMBINE;
2088 skipchr_keepstart();
2089 break;
2090 case Magic('c'):
2091 regflags |= RF_ICASE;
2092 skipchr_keepstart();
2093 break;
2094 case Magic('C'):
2095 regflags |= RF_NOICASE;
2096 skipchr_keepstart();
2097 break;
2098 case Magic('v'):
2099 reg_magic = MAGIC_ALL;
2100 skipchr_keepstart();
2101 curchr = -1;
2102 break;
2103 case Magic('m'):
2104 reg_magic = MAGIC_ON;
2105 skipchr_keepstart();
2106 curchr = -1;
2107 break;
2108 case Magic('M'):
2109 reg_magic = MAGIC_OFF;
2110 skipchr_keepstart();
2111 curchr = -1;
2112 break;
2113 case Magic('V'):
2114 reg_magic = MAGIC_NONE;
2115 skipchr_keepstart();
2116 curchr = -1;
2117 break;
2118
2119 default:
2120 if (nfa_regpiece() == FAIL)
2121 return FAIL;
2122 if (first == FALSE)
2123 EMIT(NFA_CONCAT);
2124 else
2125 first = FALSE;
2126 break;
2127 }
2128 }
2129
2130 return OK;
2131}
2132
2133/*
2134 * Parse a branch, one or more concats, separated by "\&". It matches the
2135 * last concat, but only if all the preceding concats also match at the same
2136 * position. Examples:
2137 * "foobeep\&..." matches "foo" in "foobeep".
2138 * ".*Peter\&.*Bob" matches in a line containing both "Peter" and "Bob"
2139 *
2140 * branch ::= concat
2141 * or concat \& concat
2142 * or concat \& concat \& concat
2143 * etc.
2144 */
2145static int nfa_regbranch(void)
2146{
2147 int old_post_pos;
2148
2149 old_post_pos = (int)(post_ptr - post_start);
2150
2151 /* First branch, possibly the only one */
2152 if (nfa_regconcat() == FAIL)
2153 return FAIL;
2154
2155 // Try next concats
2156 while (peekchr() == Magic('&')) {
2157 skipchr();
2158 // if concat is empty do emit a node
2159 if (old_post_pos == (int)(post_ptr - post_start)) {
2160 EMIT(NFA_EMPTY);
2161 }
2162 EMIT(NFA_NOPEN);
2163 EMIT(NFA_PREV_ATOM_NO_WIDTH);
2164 old_post_pos = (int)(post_ptr - post_start);
2165 if (nfa_regconcat() == FAIL)
2166 return FAIL;
2167 /* if concat is empty do emit a node */
2168 if (old_post_pos == (int)(post_ptr - post_start))
2169 EMIT(NFA_EMPTY);
2170 EMIT(NFA_CONCAT);
2171 }
2172
2173 /* if a branch is empty, emit one node for it */
2174 if (old_post_pos == (int)(post_ptr - post_start))
2175 EMIT(NFA_EMPTY);
2176
2177 return OK;
2178}
2179
2180/*
2181 * Parse a pattern, one or more branches, separated by "\|". It matches
2182 * anything that matches one of the branches. Example: "foo\|beep" matches
2183 * "foo" and matches "beep". If more than one branch matches, the first one
2184 * is used.
2185 *
2186 * pattern ::= branch
2187 * or branch \| branch
2188 * or branch \| branch \| branch
2189 * etc.
2190 */
2191static int
2192nfa_reg (
2193 int paren /* REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN */
2194)
2195{
2196 int parno = 0;
2197
2198 if (paren == REG_PAREN) {
2199 if (regnpar >= NSUBEXP) /* Too many `(' */
2200 EMSG_RET_FAIL(_("E872: (NFA regexp) Too many '('"));
2201 parno = regnpar++;
2202 } else if (paren == REG_ZPAREN) {
2203 /* Make a ZOPEN node. */
2204 if (regnzpar >= NSUBEXP)
2205 EMSG_RET_FAIL(_("E879: (NFA regexp) Too many \\z("));
2206 parno = regnzpar++;
2207 }
2208
2209 if (nfa_regbranch() == FAIL)
2210 return FAIL; /* cascaded error */
2211
2212 while (peekchr() == Magic('|')) {
2213 skipchr();
2214 if (nfa_regbranch() == FAIL)
2215 return FAIL; /* cascaded error */
2216 EMIT(NFA_OR);
2217 }
2218
2219 /* Check for proper termination. */
2220 if (paren != REG_NOPAREN && getchr() != Magic(')')) {
2221 if (paren == REG_NPAREN)
2222 EMSG2_RET_FAIL(_(e_unmatchedpp), reg_magic == MAGIC_ALL);
2223 else
2224 EMSG2_RET_FAIL(_(e_unmatchedp), reg_magic == MAGIC_ALL);
2225 } else if (paren == REG_NOPAREN && peekchr() != NUL) {
2226 if (peekchr() == Magic(')'))
2227 EMSG2_RET_FAIL(_(e_unmatchedpar), reg_magic == MAGIC_ALL);
2228 else
2229 EMSG_RET_FAIL(_("E873: (NFA regexp) proper termination error"));
2230 }
2231 /*
2232 * Here we set the flag allowing back references to this set of
2233 * parentheses.
2234 */
2235 if (paren == REG_PAREN) {
2236 had_endbrace[parno] = TRUE; /* have seen the close paren */
2237 EMIT(NFA_MOPEN + parno);
2238 } else if (paren == REG_ZPAREN)
2239 EMIT(NFA_ZOPEN + parno);
2240
2241 return OK;
2242}
2243
2244#ifdef REGEXP_DEBUG
2245static char_u code[50];
2246
2247static void nfa_set_code(int c)
2248{
2249 int addnl = FALSE;
2250
2251 if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL) {
2252 addnl = TRUE;
2253 c -= NFA_ADD_NL;
2254 }
2255
2256 STRCPY(code, "");
2257 switch (c) {
2258 case NFA_MATCH: STRCPY(code, "NFA_MATCH "); break;
2259 case NFA_SPLIT: STRCPY(code, "NFA_SPLIT "); break;
2260 case NFA_CONCAT: STRCPY(code, "NFA_CONCAT "); break;
2261 case NFA_NEWL: STRCPY(code, "NFA_NEWL "); break;
2262 case NFA_ZSTART: STRCPY(code, "NFA_ZSTART"); break;
2263 case NFA_ZEND: STRCPY(code, "NFA_ZEND"); break;
2264
2265 case NFA_BACKREF1: STRCPY(code, "NFA_BACKREF1"); break;
2266 case NFA_BACKREF2: STRCPY(code, "NFA_BACKREF2"); break;
2267 case NFA_BACKREF3: STRCPY(code, "NFA_BACKREF3"); break;
2268 case NFA_BACKREF4: STRCPY(code, "NFA_BACKREF4"); break;
2269 case NFA_BACKREF5: STRCPY(code, "NFA_BACKREF5"); break;
2270 case NFA_BACKREF6: STRCPY(code, "NFA_BACKREF6"); break;
2271 case NFA_BACKREF7: STRCPY(code, "NFA_BACKREF7"); break;
2272 case NFA_BACKREF8: STRCPY(code, "NFA_BACKREF8"); break;
2273 case NFA_BACKREF9: STRCPY(code, "NFA_BACKREF9"); break;
2274 case NFA_ZREF1: STRCPY(code, "NFA_ZREF1"); break;
2275 case NFA_ZREF2: STRCPY(code, "NFA_ZREF2"); break;
2276 case NFA_ZREF3: STRCPY(code, "NFA_ZREF3"); break;
2277 case NFA_ZREF4: STRCPY(code, "NFA_ZREF4"); break;
2278 case NFA_ZREF5: STRCPY(code, "NFA_ZREF5"); break;
2279 case NFA_ZREF6: STRCPY(code, "NFA_ZREF6"); break;
2280 case NFA_ZREF7: STRCPY(code, "NFA_ZREF7"); break;
2281 case NFA_ZREF8: STRCPY(code, "NFA_ZREF8"); break;
2282 case NFA_ZREF9: STRCPY(code, "NFA_ZREF9"); break;
2283 case NFA_SKIP: STRCPY(code, "NFA_SKIP"); break;
2284
2285 case NFA_PREV_ATOM_NO_WIDTH:
2286 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break;
2287 case NFA_PREV_ATOM_NO_WIDTH_NEG:
2288 STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break;
2289 case NFA_PREV_ATOM_JUST_BEFORE:
2290 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE"); break;
2291 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
2292 STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE_NEG"); break;
2293 case NFA_PREV_ATOM_LIKE_PATTERN:
2294 STRCPY(code, "NFA_PREV_ATOM_LIKE_PATTERN"); break;
2295
2296 case NFA_NOPEN: STRCPY(code, "NFA_NOPEN"); break;
2297 case NFA_NCLOSE: STRCPY(code, "NFA_NCLOSE"); break;
2298 case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break;
2299 case NFA_START_INVISIBLE_FIRST:
2300 STRCPY(code, "NFA_START_INVISIBLE_FIRST"); break;
2301 case NFA_START_INVISIBLE_NEG:
2302 STRCPY(code, "NFA_START_INVISIBLE_NEG"); break;
2303 case NFA_START_INVISIBLE_NEG_FIRST:
2304 STRCPY(code, "NFA_START_INVISIBLE_NEG_FIRST"); break;
2305 case NFA_START_INVISIBLE_BEFORE:
2306 STRCPY(code, "NFA_START_INVISIBLE_BEFORE"); break;
2307 case NFA_START_INVISIBLE_BEFORE_FIRST:
2308 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_FIRST"); break;
2309 case NFA_START_INVISIBLE_BEFORE_NEG:
2310 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG"); break;
2311 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
2312 STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG_FIRST"); break;
2313 case NFA_START_PATTERN: STRCPY(code, "NFA_START_PATTERN"); break;
2314 case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break;
2315 case NFA_END_INVISIBLE_NEG: STRCPY(code, "NFA_END_INVISIBLE_NEG"); break;
2316 case NFA_END_PATTERN: STRCPY(code, "NFA_END_PATTERN"); break;
2317
2318 case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break;
2319 case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break;
2320 case NFA_OPT_CHARS: STRCPY(code, "NFA_OPT_CHARS"); break;
2321
2322 case NFA_MOPEN:
2323 case NFA_MOPEN1:
2324 case NFA_MOPEN2:
2325 case NFA_MOPEN3:
2326 case NFA_MOPEN4:
2327 case NFA_MOPEN5:
2328 case NFA_MOPEN6:
2329 case NFA_MOPEN7:
2330 case NFA_MOPEN8:
2331 case NFA_MOPEN9:
2332 STRCPY(code, "NFA_MOPEN(x)");
2333 code[10] = c - NFA_MOPEN + '0';
2334 break;
2335 case NFA_MCLOSE:
2336 case NFA_MCLOSE1:
2337 case NFA_MCLOSE2:
2338 case NFA_MCLOSE3:
2339 case NFA_MCLOSE4:
2340 case NFA_MCLOSE5:
2341 case NFA_MCLOSE6:
2342 case NFA_MCLOSE7:
2343 case NFA_MCLOSE8:
2344 case NFA_MCLOSE9:
2345 STRCPY(code, "NFA_MCLOSE(x)");
2346 code[11] = c - NFA_MCLOSE + '0';
2347 break;
2348 case NFA_ZOPEN:
2349 case NFA_ZOPEN1:
2350 case NFA_ZOPEN2:
2351 case NFA_ZOPEN3:
2352 case NFA_ZOPEN4:
2353 case NFA_ZOPEN5:
2354 case NFA_ZOPEN6:
2355 case NFA_ZOPEN7:
2356 case NFA_ZOPEN8:
2357 case NFA_ZOPEN9:
2358 STRCPY(code, "NFA_ZOPEN(x)");
2359 code[10] = c - NFA_ZOPEN + '0';
2360 break;
2361 case NFA_ZCLOSE:
2362 case NFA_ZCLOSE1:
2363 case NFA_ZCLOSE2:
2364 case NFA_ZCLOSE3:
2365 case NFA_ZCLOSE4:
2366 case NFA_ZCLOSE5:
2367 case NFA_ZCLOSE6:
2368 case NFA_ZCLOSE7:
2369 case NFA_ZCLOSE8:
2370 case NFA_ZCLOSE9:
2371 STRCPY(code, "NFA_ZCLOSE(x)");
2372 code[11] = c - NFA_ZCLOSE + '0';
2373 break;
2374 case NFA_EOL: STRCPY(code, "NFA_EOL "); break;
2375 case NFA_BOL: STRCPY(code, "NFA_BOL "); break;
2376 case NFA_EOW: STRCPY(code, "NFA_EOW "); break;
2377 case NFA_BOW: STRCPY(code, "NFA_BOW "); break;
2378 case NFA_EOF: STRCPY(code, "NFA_EOF "); break;
2379 case NFA_BOF: STRCPY(code, "NFA_BOF "); break;
2380 case NFA_LNUM: STRCPY(code, "NFA_LNUM "); break;
2381 case NFA_LNUM_GT: STRCPY(code, "NFA_LNUM_GT "); break;
2382 case NFA_LNUM_LT: STRCPY(code, "NFA_LNUM_LT "); break;
2383 case NFA_COL: STRCPY(code, "NFA_COL "); break;
2384 case NFA_COL_GT: STRCPY(code, "NFA_COL_GT "); break;
2385 case NFA_COL_LT: STRCPY(code, "NFA_COL_LT "); break;
2386 case NFA_VCOL: STRCPY(code, "NFA_VCOL "); break;
2387 case NFA_VCOL_GT: STRCPY(code, "NFA_VCOL_GT "); break;
2388 case NFA_VCOL_LT: STRCPY(code, "NFA_VCOL_LT "); break;
2389 case NFA_MARK: STRCPY(code, "NFA_MARK "); break;
2390 case NFA_MARK_GT: STRCPY(code, "NFA_MARK_GT "); break;
2391 case NFA_MARK_LT: STRCPY(code, "NFA_MARK_LT "); break;
2392 case NFA_CURSOR: STRCPY(code, "NFA_CURSOR "); break;
2393 case NFA_VISUAL: STRCPY(code, "NFA_VISUAL "); break;
2394 case NFA_ANY_COMPOSING: STRCPY(code, "NFA_ANY_COMPOSING "); break;
2395
2396 case NFA_STAR: STRCPY(code, "NFA_STAR "); break;
2397 case NFA_STAR_NONGREEDY: STRCPY(code, "NFA_STAR_NONGREEDY "); break;
2398 case NFA_QUEST: STRCPY(code, "NFA_QUEST"); break;
2399 case NFA_QUEST_NONGREEDY: STRCPY(code, "NFA_QUEST_NON_GREEDY"); break;
2400 case NFA_EMPTY: STRCPY(code, "NFA_EMPTY"); break;
2401 case NFA_OR: STRCPY(code, "NFA_OR"); break;
2402
2403 case NFA_START_COLL: STRCPY(code, "NFA_START_COLL"); break;
2404 case NFA_END_COLL: STRCPY(code, "NFA_END_COLL"); break;
2405 case NFA_START_NEG_COLL: STRCPY(code, "NFA_START_NEG_COLL"); break;
2406 case NFA_END_NEG_COLL: STRCPY(code, "NFA_END_NEG_COLL"); break;
2407 case NFA_RANGE: STRCPY(code, "NFA_RANGE"); break;
2408 case NFA_RANGE_MIN: STRCPY(code, "NFA_RANGE_MIN"); break;
2409 case NFA_RANGE_MAX: STRCPY(code, "NFA_RANGE_MAX"); break;
2410
2411 case NFA_CLASS_ALNUM: STRCPY(code, "NFA_CLASS_ALNUM"); break;
2412 case NFA_CLASS_ALPHA: STRCPY(code, "NFA_CLASS_ALPHA"); break;
2413 case NFA_CLASS_BLANK: STRCPY(code, "NFA_CLASS_BLANK"); break;
2414 case NFA_CLASS_CNTRL: STRCPY(code, "NFA_CLASS_CNTRL"); break;
2415 case NFA_CLASS_DIGIT: STRCPY(code, "NFA_CLASS_DIGIT"); break;
2416 case NFA_CLASS_GRAPH: STRCPY(code, "NFA_CLASS_GRAPH"); break;
2417 case NFA_CLASS_LOWER: STRCPY(code, "NFA_CLASS_LOWER"); break;
2418 case NFA_CLASS_PRINT: STRCPY(code, "NFA_CLASS_PRINT"); break;
2419 case NFA_CLASS_PUNCT: STRCPY(code, "NFA_CLASS_PUNCT"); break;
2420 case NFA_CLASS_SPACE: STRCPY(code, "NFA_CLASS_SPACE"); break;
2421 case NFA_CLASS_UPPER: STRCPY(code, "NFA_CLASS_UPPER"); break;
2422 case NFA_CLASS_XDIGIT: STRCPY(code, "NFA_CLASS_XDIGIT"); break;
2423 case NFA_CLASS_TAB: STRCPY(code, "NFA_CLASS_TAB"); break;
2424 case NFA_CLASS_RETURN: STRCPY(code, "NFA_CLASS_RETURN"); break;
2425 case NFA_CLASS_BACKSPACE: STRCPY(code, "NFA_CLASS_BACKSPACE"); break;
2426 case NFA_CLASS_ESCAPE: STRCPY(code, "NFA_CLASS_ESCAPE"); break;
2427
2428 case NFA_ANY: STRCPY(code, "NFA_ANY"); break;
2429 case NFA_IDENT: STRCPY(code, "NFA_IDENT"); break;
2430 case NFA_SIDENT: STRCPY(code, "NFA_SIDENT"); break;
2431 case NFA_KWORD: STRCPY(code, "NFA_KWORD"); break;
2432 case NFA_SKWORD: STRCPY(code, "NFA_SKWORD"); break;
2433 case NFA_FNAME: STRCPY(code, "NFA_FNAME"); break;
2434 case NFA_SFNAME: STRCPY(code, "NFA_SFNAME"); break;
2435 case NFA_PRINT: STRCPY(code, "NFA_PRINT"); break;
2436 case NFA_SPRINT: STRCPY(code, "NFA_SPRINT"); break;
2437 case NFA_WHITE: STRCPY(code, "NFA_WHITE"); break;
2438 case NFA_NWHITE: STRCPY(code, "NFA_NWHITE"); break;
2439 case NFA_DIGIT: STRCPY(code, "NFA_DIGIT"); break;
2440 case NFA_NDIGIT: STRCPY(code, "NFA_NDIGIT"); break;
2441 case NFA_HEX: STRCPY(code, "NFA_HEX"); break;
2442 case NFA_NHEX: STRCPY(code, "NFA_NHEX"); break;
2443 case NFA_OCTAL: STRCPY(code, "NFA_OCTAL"); break;
2444 case NFA_NOCTAL: STRCPY(code, "NFA_NOCTAL"); break;
2445 case NFA_WORD: STRCPY(code, "NFA_WORD"); break;
2446 case NFA_NWORD: STRCPY(code, "NFA_NWORD"); break;
2447 case NFA_HEAD: STRCPY(code, "NFA_HEAD"); break;
2448 case NFA_NHEAD: STRCPY(code, "NFA_NHEAD"); break;
2449 case NFA_ALPHA: STRCPY(code, "NFA_ALPHA"); break;
2450 case NFA_NALPHA: STRCPY(code, "NFA_NALPHA"); break;
2451 case NFA_LOWER: STRCPY(code, "NFA_LOWER"); break;
2452 case NFA_NLOWER: STRCPY(code, "NFA_NLOWER"); break;
2453 case NFA_UPPER: STRCPY(code, "NFA_UPPER"); break;
2454 case NFA_NUPPER: STRCPY(code, "NFA_NUPPER"); break;
2455 case NFA_LOWER_IC: STRCPY(code, "NFA_LOWER_IC"); break;
2456 case NFA_NLOWER_IC: STRCPY(code, "NFA_NLOWER_IC"); break;
2457 case NFA_UPPER_IC: STRCPY(code, "NFA_UPPER_IC"); break;
2458 case NFA_NUPPER_IC: STRCPY(code, "NFA_NUPPER_IC"); break;
2459
2460 default:
2461 STRCPY(code, "CHAR(x)");
2462 code[5] = c;
2463 }
2464
2465 if (addnl == TRUE)
2466 STRCAT(code, " + NEWLINE ");
2467
2468}
2469
2470static FILE *log_fd;
2471static char_u e_log_open_failed[] = N_(
2472 "Could not open temporary log file for writing, displaying on stderr... ");
2473
2474/*
2475 * Print the postfix notation of the current regexp.
2476 */
2477static void nfa_postfix_dump(char_u *expr, int retval)
2478{
2479 int *p;
2480 FILE *f;
2481
2482 f = fopen(NFA_REGEXP_DUMP_LOG, "a");
2483 if (f != NULL) {
2484 fprintf(f, "\n-------------------------\n");
2485 if (retval == FAIL) {
2486 fprintf(f, ">>> NFA engine failed... \n");
2487 } else if (retval == OK) {
2488 fprintf(f, ">>> NFA engine succeeded !\n");
2489 }
2490 fprintf(f, "Regexp: \"%s\"\nPostfix notation (char): \"", expr);
2491 for (p = post_start; *p && p < post_ptr; p++) {
2492 nfa_set_code(*p);
2493 fprintf(f, "%s, ", code);
2494 }
2495 fprintf(f, "\"\nPostfix notation (int): ");
2496 for (p = post_start; *p && p < post_ptr; p++)
2497 fprintf(f, "%d ", *p);
2498 fprintf(f, "\n\n");
2499 fclose(f);
2500 }
2501}
2502
2503/*
2504 * Print the NFA starting with a root node "state".
2505 */
2506static void nfa_print_state(FILE *debugf, nfa_state_T *state)
2507{
2508 garray_T indent;
2509
2510 ga_init(&indent, 1, 64);
2511 ga_append(&indent, '\0');
2512 nfa_print_state2(debugf, state, &indent);
2513 ga_clear(&indent);
2514}
2515
2516static void nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent)
2517{
2518 char_u *p;
2519
2520 if (state == NULL)
2521 return;
2522
2523 fprintf(debugf, "(%2d)", abs(state->id));
2524
2525 /* Output indent */
2526 p = (char_u *)indent->ga_data;
2527 if (indent->ga_len >= 3) {
2528 int last = indent->ga_len - 3;
2529 char_u save[2];
2530
2531 STRNCPY(save, &p[last], 2);
2532 STRNCPY(&p[last], "+-", 2);
2533 fprintf(debugf, " %s", p);
2534 STRNCPY(&p[last], save, 2);
2535 } else
2536 fprintf(debugf, " %s", p);
2537
2538 nfa_set_code(state->c);
2539 fprintf(debugf, "%s (%d) (id=%d) val=%d\n",
2540 code,
2541 state->c,
2542 abs(state->id),
2543 state->val);
2544 if (state->id < 0)
2545 return;
2546
2547 state->id = abs(state->id) * -1;
2548
2549 /* grow indent for state->out */
2550 indent->ga_len -= 1;
2551 if (state->out1)
2552 ga_concat(indent, (char_u *)"| ");
2553 else
2554 ga_concat(indent, (char_u *)" ");
2555 ga_append(indent, '\0');
2556
2557 nfa_print_state2(debugf, state->out, indent);
2558
2559 /* replace last part of indent for state->out1 */
2560 indent->ga_len -= 3;
2561 ga_concat(indent, (char_u *)" ");
2562 ga_append(indent, '\0');
2563
2564 nfa_print_state2(debugf, state->out1, indent);
2565
2566 /* shrink indent */
2567 indent->ga_len -= 3;
2568 ga_append(indent, '\0');
2569}
2570
2571/*
2572 * Print the NFA state machine.
2573 */
2574static void nfa_dump(nfa_regprog_T *prog)
2575{
2576 FILE *debugf = fopen(NFA_REGEXP_DUMP_LOG, "a");
2577
2578 if (debugf != NULL) {
2579 nfa_print_state(debugf, prog->start);
2580
2581 if (prog->reganch)
2582 fprintf(debugf, "reganch: %d\n", prog->reganch);
2583 if (prog->regstart != NUL)
2584 fprintf(debugf, "regstart: %c (decimal: %d)\n",
2585 prog->regstart, prog->regstart);
2586 if (prog->match_text != NULL)
2587 fprintf(debugf, "match_text: \"%s\"\n", prog->match_text);
2588
2589 fclose(debugf);
2590 }
2591}
2592#endif /* REGEXP_DEBUG */
2593
2594/*
2595 * Parse r.e. @expr and convert it into postfix form.
2596 * Return the postfix string on success, NULL otherwise.
2597 */
2598static int *re2post(void)
2599{
2600 if (nfa_reg(REG_NOPAREN) == FAIL)
2601 return NULL;
2602 EMIT(NFA_MOPEN);
2603 return post_start;
2604}
2605
2606/* NB. Some of the code below is inspired by Russ's. */
2607
2608/*
2609 * Represents an NFA state plus zero or one or two arrows exiting.
2610 * if c == MATCH, no arrows out; matching state.
2611 * If c == SPLIT, unlabeled arrows to out and out1 (if != NULL).
2612 * If c < 256, labeled arrow with character c to out.
2613 */
2614
2615static nfa_state_T *state_ptr; /* points to nfa_prog->state */
2616
2617/*
2618 * Allocate and initialize nfa_state_T.
2619 */
2620static nfa_state_T *alloc_state(int c, nfa_state_T *out, nfa_state_T *out1)
2621{
2622 nfa_state_T *s;
2623
2624 if (istate >= nstate)
2625 return NULL;
2626
2627 s = &state_ptr[istate++];
2628
2629 s->c = c;
2630 s->out = out;
2631 s->out1 = out1;
2632 s->val = 0;
2633
2634 s->id = istate;
2635 s->lastlist[0] = 0;
2636 s->lastlist[1] = 0;
2637
2638 return s;
2639}
2640
2641/*
2642 * A partially built NFA without the matching state filled in.
2643 * Frag_T.start points at the start state.
2644 * Frag_T.out is a list of places that need to be set to the
2645 * next state for this fragment.
2646 */
2647
2648
2649/*
2650 * Initialize a Frag_T struct and return it.
2651 */
2652static Frag_T frag(nfa_state_T *start, Ptrlist *out)
2653{
2654 Frag_T n;
2655
2656 n.start = start;
2657 n.out = out;
2658 return n;
2659}
2660
2661/*
2662 * Create singleton list containing just outp.
2663 */
2664static Ptrlist *list1(nfa_state_T **outp)
2665{
2666 Ptrlist *l;
2667
2668 l = (Ptrlist *)outp;
2669 l->next = NULL;
2670 return l;
2671}
2672
2673/*
2674 * Patch the list of states at out to point to start.
2675 */
2676static void patch(Ptrlist *l, nfa_state_T *s)
2677{
2678 Ptrlist *next;
2679
2680 for (; l; l = next) {
2681 next = l->next;
2682 l->s = s;
2683 }
2684}
2685
2686
2687/*
2688 * Join the two lists l1 and l2, returning the combination.
2689 */
2690static Ptrlist *append(Ptrlist *l1, Ptrlist *l2)
2691{
2692 Ptrlist *oldl1;
2693
2694 oldl1 = l1;
2695 while (l1->next)
2696 l1 = l1->next;
2697 l1->next = l2;
2698 return oldl1;
2699}
2700
2701/*
2702 * Stack used for transforming postfix form into NFA.
2703 */
2704static Frag_T empty;
2705
2706static void st_error(int *postfix, int *end, int *p)
2707{
2708#ifdef NFA_REGEXP_ERROR_LOG
2709 FILE *df;
2710 int *p2;
2711
2712 df = fopen(NFA_REGEXP_ERROR_LOG, "a");
2713 if (df) {
2714 fprintf(df, "Error popping the stack!\n");
2715#ifdef REGEXP_DEBUG
2716 fprintf(df, "Current regexp is \"%s\"\n", nfa_regengine.expr);
2717#endif
2718 fprintf(df, "Postfix form is: ");
2719#ifdef REGEXP_DEBUG
2720 for (p2 = postfix; p2 < end; p2++) {
2721 nfa_set_code(*p2);
2722 fprintf(df, "%s, ", code);
2723 }
2724 nfa_set_code(*p);
2725 fprintf(df, "\nCurrent position is: ");
2726 for (p2 = postfix; p2 <= p; p2++) {
2727 nfa_set_code(*p2);
2728 fprintf(df, "%s, ", code);
2729 }
2730#else
2731 for (p2 = postfix; p2 < end; p2++) {
2732 fprintf(df, "%d, ", *p2);
2733 }
2734 fprintf(df, "\nCurrent position is: ");
2735 for (p2 = postfix; p2 <= p; p2++) {
2736 fprintf(df, "%d, ", *p2);
2737 }
2738#endif
2739 fprintf(df, "\n--------------------------\n");
2740 fclose(df);
2741 }
2742#endif
2743 EMSG(_("E874: (NFA) Could not pop the stack!"));
2744}
2745
2746/*
2747 * Push an item onto the stack.
2748 */
2749static void st_push(Frag_T s, Frag_T **p, Frag_T *stack_end)
2750{
2751 Frag_T *stackp = *p;
2752
2753 if (stackp >= stack_end)
2754 return;
2755 *stackp = s;
2756 *p = *p + 1;
2757}
2758
2759/*
2760 * Pop an item from the stack.
2761 */
2762static Frag_T st_pop(Frag_T **p, Frag_T *stack)
2763{
2764 Frag_T *stackp;
2765
2766 *p = *p - 1;
2767 stackp = *p;
2768 if (stackp < stack)
2769 return empty;
2770 return **p;
2771}
2772
2773/*
2774 * Estimate the maximum byte length of anything matching "state".
2775 * When unknown or unlimited return -1.
2776 */
2777static int nfa_max_width(nfa_state_T *startstate, int depth)
2778{
2779 int l, r;
2780 nfa_state_T *state = startstate;
2781 int len = 0;
2782
2783 /* detect looping in a NFA_SPLIT */
2784 if (depth > 4)
2785 return -1;
2786
2787 while (state != NULL) {
2788 switch (state->c) {
2789 case NFA_END_INVISIBLE:
2790 case NFA_END_INVISIBLE_NEG:
2791 /* the end, return what we have */
2792 return len;
2793
2794 case NFA_SPLIT:
2795 /* two alternatives, use the maximum */
2796 l = nfa_max_width(state->out, depth + 1);
2797 r = nfa_max_width(state->out1, depth + 1);
2798 if (l < 0 || r < 0)
2799 return -1;
2800 return len + (l > r ? l : r);
2801
2802 case NFA_ANY:
2803 case NFA_START_COLL:
2804 case NFA_START_NEG_COLL:
2805 // Matches some character, including composing chars.
2806 len += MB_MAXBYTES;
2807 if (state->c != NFA_ANY) {
2808 // Skip over the characters.
2809 state = state->out1->out;
2810 continue;
2811 }
2812 break;
2813
2814 case NFA_DIGIT:
2815 case NFA_WHITE:
2816 case NFA_HEX:
2817 case NFA_OCTAL:
2818 /* ascii */
2819 ++len;
2820 break;
2821
2822 case NFA_IDENT:
2823 case NFA_SIDENT:
2824 case NFA_KWORD:
2825 case NFA_SKWORD:
2826 case NFA_FNAME:
2827 case NFA_SFNAME:
2828 case NFA_PRINT:
2829 case NFA_SPRINT:
2830 case NFA_NWHITE:
2831 case NFA_NDIGIT:
2832 case NFA_NHEX:
2833 case NFA_NOCTAL:
2834 case NFA_WORD:
2835 case NFA_NWORD:
2836 case NFA_HEAD:
2837 case NFA_NHEAD:
2838 case NFA_ALPHA:
2839 case NFA_NALPHA:
2840 case NFA_LOWER:
2841 case NFA_NLOWER:
2842 case NFA_UPPER:
2843 case NFA_NUPPER:
2844 case NFA_LOWER_IC:
2845 case NFA_NLOWER_IC:
2846 case NFA_UPPER_IC:
2847 case NFA_NUPPER_IC:
2848 case NFA_ANY_COMPOSING:
2849 /* possibly non-ascii */
2850 if (has_mbyte)
2851 len += 3;
2852 else
2853 ++len;
2854 break;
2855
2856 case NFA_START_INVISIBLE:
2857 case NFA_START_INVISIBLE_NEG:
2858 case NFA_START_INVISIBLE_BEFORE:
2859 case NFA_START_INVISIBLE_BEFORE_NEG:
2860 /* zero-width, out1 points to the END state */
2861 state = state->out1->out;
2862 continue;
2863
2864 case NFA_BACKREF1:
2865 case NFA_BACKREF2:
2866 case NFA_BACKREF3:
2867 case NFA_BACKREF4:
2868 case NFA_BACKREF5:
2869 case NFA_BACKREF6:
2870 case NFA_BACKREF7:
2871 case NFA_BACKREF8:
2872 case NFA_BACKREF9:
2873 case NFA_ZREF1:
2874 case NFA_ZREF2:
2875 case NFA_ZREF3:
2876 case NFA_ZREF4:
2877 case NFA_ZREF5:
2878 case NFA_ZREF6:
2879 case NFA_ZREF7:
2880 case NFA_ZREF8:
2881 case NFA_ZREF9:
2882 case NFA_NEWL:
2883 case NFA_SKIP:
2884 /* unknown width */
2885 return -1;
2886
2887 case NFA_BOL:
2888 case NFA_EOL:
2889 case NFA_BOF:
2890 case NFA_EOF:
2891 case NFA_BOW:
2892 case NFA_EOW:
2893 case NFA_MOPEN:
2894 case NFA_MOPEN1:
2895 case NFA_MOPEN2:
2896 case NFA_MOPEN3:
2897 case NFA_MOPEN4:
2898 case NFA_MOPEN5:
2899 case NFA_MOPEN6:
2900 case NFA_MOPEN7:
2901 case NFA_MOPEN8:
2902 case NFA_MOPEN9:
2903 case NFA_ZOPEN:
2904 case NFA_ZOPEN1:
2905 case NFA_ZOPEN2:
2906 case NFA_ZOPEN3:
2907 case NFA_ZOPEN4:
2908 case NFA_ZOPEN5:
2909 case NFA_ZOPEN6:
2910 case NFA_ZOPEN7:
2911 case NFA_ZOPEN8:
2912 case NFA_ZOPEN9:
2913 case NFA_ZCLOSE:
2914 case NFA_ZCLOSE1:
2915 case NFA_ZCLOSE2:
2916 case NFA_ZCLOSE3:
2917 case NFA_ZCLOSE4:
2918 case NFA_ZCLOSE5:
2919 case NFA_ZCLOSE6:
2920 case NFA_ZCLOSE7:
2921 case NFA_ZCLOSE8:
2922 case NFA_ZCLOSE9:
2923 case NFA_MCLOSE:
2924 case NFA_MCLOSE1:
2925 case NFA_MCLOSE2:
2926 case NFA_MCLOSE3:
2927 case NFA_MCLOSE4:
2928 case NFA_MCLOSE5:
2929 case NFA_MCLOSE6:
2930 case NFA_MCLOSE7:
2931 case NFA_MCLOSE8:
2932 case NFA_MCLOSE9:
2933 case NFA_NOPEN:
2934 case NFA_NCLOSE:
2935
2936 case NFA_LNUM_GT:
2937 case NFA_LNUM_LT:
2938 case NFA_COL_GT:
2939 case NFA_COL_LT:
2940 case NFA_VCOL_GT:
2941 case NFA_VCOL_LT:
2942 case NFA_MARK_GT:
2943 case NFA_MARK_LT:
2944 case NFA_VISUAL:
2945 case NFA_LNUM:
2946 case NFA_CURSOR:
2947 case NFA_COL:
2948 case NFA_VCOL:
2949 case NFA_MARK:
2950
2951 case NFA_ZSTART:
2952 case NFA_ZEND:
2953 case NFA_OPT_CHARS:
2954 case NFA_EMPTY:
2955 case NFA_START_PATTERN:
2956 case NFA_END_PATTERN:
2957 case NFA_COMPOSING:
2958 case NFA_END_COMPOSING:
2959 /* zero-width */
2960 break;
2961
2962 default:
2963 if (state->c < 0)
2964 /* don't know what this is */
2965 return -1;
2966 /* normal character */
2967 len += MB_CHAR2LEN(state->c);
2968 break;
2969 }
2970
2971 /* normal way to continue */
2972 state = state->out;
2973 }
2974
2975 /* unrecognized, "cannot happen" */
2976 return -1;
2977}
2978
2979/*
2980 * Convert a postfix form into its equivalent NFA.
2981 * Return the NFA start state on success, NULL otherwise.
2982 */
2983static nfa_state_T *post2nfa(int *postfix, int *end, int nfa_calc_size)
2984{
2985 int *p;
2986 int mopen;
2987 int mclose;
2988 Frag_T *stack = NULL;
2989 Frag_T *stackp = NULL;
2990 Frag_T *stack_end = NULL;
2991 Frag_T e1;
2992 Frag_T e2;
2993 Frag_T e;
2994 nfa_state_T *s;
2995 nfa_state_T *s1;
2996 nfa_state_T *matchstate;
2997 nfa_state_T *ret = NULL;
2998
2999 if (postfix == NULL)
3000 return NULL;
3001
3002#define PUSH(s) st_push((s), &stackp, stack_end)
3003#define POP() st_pop(&stackp, stack); \
3004 if (stackp < stack) { \
3005 st_error(postfix, end, p); \
3006 xfree(stack); \
3007 return NULL; \
3008 }
3009
3010 if (nfa_calc_size == false) {
3011 // Allocate space for the stack. Max states on the stack: "nstate".
3012 stack = xmalloc((nstate + 1) * sizeof(Frag_T));
3013 stackp = stack;
3014 stack_end = stack + (nstate + 1);
3015 }
3016
3017 for (p = postfix; p < end; ++p) {
3018 switch (*p) {
3019 case NFA_CONCAT:
3020 /* Concatenation.
3021 * Pay attention: this operator does not exist in the r.e. itself
3022 * (it is implicit, really). It is added when r.e. is translated
3023 * to postfix form in re2post(). */
3024 if (nfa_calc_size == TRUE) {
3025 /* nstate += 0; */
3026 break;
3027 }
3028 e2 = POP();
3029 e1 = POP();
3030 patch(e1.out, e2.start);
3031 PUSH(frag(e1.start, e2.out));
3032 break;
3033
3034 case NFA_OR:
3035 /* Alternation */
3036 if (nfa_calc_size == TRUE) {
3037 nstate++;
3038 break;
3039 }
3040 e2 = POP();
3041 e1 = POP();
3042 s = alloc_state(NFA_SPLIT, e1.start, e2.start);
3043 if (s == NULL)
3044 goto theend;
3045 PUSH(frag(s, append(e1.out, e2.out)));
3046 break;
3047
3048 case NFA_STAR:
3049 /* Zero or more, prefer more */
3050 if (nfa_calc_size == TRUE) {
3051 nstate++;
3052 break;
3053 }
3054 e = POP();
3055 s = alloc_state(NFA_SPLIT, e.start, NULL);
3056 if (s == NULL)
3057 goto theend;
3058 patch(e.out, s);
3059 PUSH(frag(s, list1(&s->out1)));
3060 break;
3061
3062 case NFA_STAR_NONGREEDY:
3063 /* Zero or more, prefer zero */
3064 if (nfa_calc_size == TRUE) {
3065 nstate++;
3066 break;
3067 }
3068 e = POP();
3069 s = alloc_state(NFA_SPLIT, NULL, e.start);
3070 if (s == NULL)
3071 goto theend;
3072 patch(e.out, s);
3073 PUSH(frag(s, list1(&s->out)));
3074 break;
3075
3076 case NFA_QUEST:
3077 /* one or zero atoms=> greedy match */
3078 if (nfa_calc_size == TRUE) {
3079 nstate++;
3080 break;
3081 }
3082 e = POP();
3083 s = alloc_state(NFA_SPLIT, e.start, NULL);
3084 if (s == NULL)
3085 goto theend;
3086 PUSH(frag(s, append(e.out, list1(&s->out1))));
3087 break;
3088
3089 case NFA_QUEST_NONGREEDY:
3090 /* zero or one atoms => non-greedy match */
3091 if (nfa_calc_size == TRUE) {
3092 nstate++;
3093 break;
3094 }
3095 e = POP();
3096 s = alloc_state(NFA_SPLIT, NULL, e.start);
3097 if (s == NULL)
3098 goto theend;
3099 PUSH(frag(s, append(e.out, list1(&s->out))));
3100 break;
3101
3102 case NFA_END_COLL:
3103 case NFA_END_NEG_COLL:
3104 /* On the stack is the sequence starting with NFA_START_COLL or
3105 * NFA_START_NEG_COLL and all possible characters. Patch it to
3106 * add the output to the start. */
3107 if (nfa_calc_size == TRUE) {
3108 nstate++;
3109 break;
3110 }
3111 e = POP();
3112 s = alloc_state(NFA_END_COLL, NULL, NULL);
3113 if (s == NULL)
3114 goto theend;
3115 patch(e.out, s);
3116 e.start->out1 = s;
3117 PUSH(frag(e.start, list1(&s->out)));
3118 break;
3119
3120 case NFA_RANGE:
3121 /* Before this are two characters, the low and high end of a
3122 * range. Turn them into two states with MIN and MAX. */
3123 if (nfa_calc_size == TRUE) {
3124 /* nstate += 0; */
3125 break;
3126 }
3127 e2 = POP();
3128 e1 = POP();
3129 e2.start->val = e2.start->c;
3130 e2.start->c = NFA_RANGE_MAX;
3131 e1.start->val = e1.start->c;
3132 e1.start->c = NFA_RANGE_MIN;
3133 patch(e1.out, e2.start);
3134 PUSH(frag(e1.start, e2.out));
3135 break;
3136
3137 case NFA_EMPTY:
3138 /* 0-length, used in a repetition with max/min count of 0 */
3139 if (nfa_calc_size == TRUE) {
3140 nstate++;
3141 break;
3142 }
3143 s = alloc_state(NFA_EMPTY, NULL, NULL);
3144 if (s == NULL)
3145 goto theend;
3146 PUSH(frag(s, list1(&s->out)));
3147 break;
3148
3149 case NFA_OPT_CHARS:
3150 {
3151 int n;
3152
3153 /* \%[abc] implemented as:
3154 * NFA_SPLIT
3155 * +-CHAR(a)
3156 * | +-NFA_SPLIT
3157 * | +-CHAR(b)
3158 * | | +-NFA_SPLIT
3159 * | | +-CHAR(c)
3160 * | | | +-next
3161 * | | +- next
3162 * | +- next
3163 * +- next
3164 */
3165 n = *++p; /* get number of characters */
3166 if (nfa_calc_size == TRUE) {
3167 nstate += n;
3168 break;
3169 }
3170 s = NULL; /* avoid compiler warning */
3171 e1.out = NULL; /* stores list with out1's */
3172 s1 = NULL; /* previous NFA_SPLIT to connect to */
3173 while (n-- > 0) {
3174 e = POP(); /* get character */
3175 s = alloc_state(NFA_SPLIT, e.start, NULL);
3176 if (s == NULL)
3177 goto theend;
3178 if (e1.out == NULL)
3179 e1 = e;
3180 patch(e.out, s1);
3181 append(e1.out, list1(&s->out1));
3182 s1 = s;
3183 }
3184 PUSH(frag(s, e1.out));
3185 break;
3186 }
3187
3188 case NFA_PREV_ATOM_NO_WIDTH:
3189 case NFA_PREV_ATOM_NO_WIDTH_NEG:
3190 case NFA_PREV_ATOM_JUST_BEFORE:
3191 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
3192 case NFA_PREV_ATOM_LIKE_PATTERN:
3193 {
3194 int before = (*p == NFA_PREV_ATOM_JUST_BEFORE
3195 || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG);
3196 int pattern = (*p == NFA_PREV_ATOM_LIKE_PATTERN);
3197 int start_state;
3198 int end_state;
3199 int n = 0;
3200 nfa_state_T *zend;
3201 nfa_state_T *skip;
3202
3203 switch (*p) {
3204 case NFA_PREV_ATOM_NO_WIDTH:
3205 start_state = NFA_START_INVISIBLE;
3206 end_state = NFA_END_INVISIBLE;
3207 break;
3208 case NFA_PREV_ATOM_NO_WIDTH_NEG:
3209 start_state = NFA_START_INVISIBLE_NEG;
3210 end_state = NFA_END_INVISIBLE_NEG;
3211 break;
3212 case NFA_PREV_ATOM_JUST_BEFORE:
3213 start_state = NFA_START_INVISIBLE_BEFORE;
3214 end_state = NFA_END_INVISIBLE;
3215 break;
3216 case NFA_PREV_ATOM_JUST_BEFORE_NEG:
3217 start_state = NFA_START_INVISIBLE_BEFORE_NEG;
3218 end_state = NFA_END_INVISIBLE_NEG;
3219 break;
3220 default: /* NFA_PREV_ATOM_LIKE_PATTERN: */
3221 start_state = NFA_START_PATTERN;
3222 end_state = NFA_END_PATTERN;
3223 break;
3224 }
3225
3226 if (before)
3227 n = *++p; /* get the count */
3228
3229 /* The \@= operator: match the preceding atom with zero width.
3230 * The \@! operator: no match for the preceding atom.
3231 * The \@<= operator: match for the preceding atom.
3232 * The \@<! operator: no match for the preceding atom.
3233 * Surrounds the preceding atom with START_INVISIBLE and
3234 * END_INVISIBLE, similarly to MOPEN. */
3235
3236 if (nfa_calc_size == TRUE) {
3237 nstate += pattern ? 4 : 2;
3238 break;
3239 }
3240 e = POP();
3241 s1 = alloc_state(end_state, NULL, NULL);
3242 if (s1 == NULL)
3243 goto theend;
3244
3245 s = alloc_state(start_state, e.start, s1);
3246 if (s == NULL)
3247 goto theend;
3248 if (pattern) {
3249 /* NFA_ZEND -> NFA_END_PATTERN -> NFA_SKIP -> what follows. */
3250 skip = alloc_state(NFA_SKIP, NULL, NULL);
3251 if (skip == NULL) {
3252 goto theend;
3253 }
3254 zend = alloc_state(NFA_ZEND, s1, NULL);
3255 if (zend == NULL) {
3256 goto theend;
3257 }
3258 s1->out= skip;
3259 patch(e.out, zend);
3260 PUSH(frag(s, list1(&skip->out)));
3261 } else {
3262 patch(e.out, s1);
3263 PUSH(frag(s, list1(&s1->out)));
3264 if (before) {
3265 if (n <= 0)
3266 /* See if we can guess the maximum width, it avoids a
3267 * lot of pointless tries. */
3268 n = nfa_max_width(e.start, 0);
3269 s->val = n; /* store the count */
3270 }
3271 }
3272 break;
3273 }
3274
3275 case NFA_COMPOSING: // char with composing char
3276 FALLTHROUGH;
3277
3278 case NFA_MOPEN: /* \( \) Submatch */
3279 case NFA_MOPEN1:
3280 case NFA_MOPEN2:
3281 case NFA_MOPEN3:
3282 case NFA_MOPEN4:
3283 case NFA_MOPEN5:
3284 case NFA_MOPEN6:
3285 case NFA_MOPEN7:
3286 case NFA_MOPEN8:
3287 case NFA_MOPEN9:
3288 case NFA_ZOPEN: /* \z( \) Submatch */
3289 case NFA_ZOPEN1:
3290 case NFA_ZOPEN2:
3291 case NFA_ZOPEN3:
3292 case NFA_ZOPEN4:
3293 case NFA_ZOPEN5:
3294 case NFA_ZOPEN6:
3295 case NFA_ZOPEN7:
3296 case NFA_ZOPEN8:
3297 case NFA_ZOPEN9:
3298 case NFA_NOPEN: /* \%( \) "Invisible Submatch" */
3299 if (nfa_calc_size == TRUE) {
3300 nstate += 2;
3301 break;
3302 }
3303
3304 mopen = *p;
3305 switch (*p) {
3306 case NFA_NOPEN: mclose = NFA_NCLOSE; break;
3307 case NFA_ZOPEN: mclose = NFA_ZCLOSE; break;
3308 case NFA_ZOPEN1: mclose = NFA_ZCLOSE1; break;
3309 case NFA_ZOPEN2: mclose = NFA_ZCLOSE2; break;
3310 case NFA_ZOPEN3: mclose = NFA_ZCLOSE3; break;
3311 case NFA_ZOPEN4: mclose = NFA_ZCLOSE4; break;
3312 case NFA_ZOPEN5: mclose = NFA_ZCLOSE5; break;
3313 case NFA_ZOPEN6: mclose = NFA_ZCLOSE6; break;
3314 case NFA_ZOPEN7: mclose = NFA_ZCLOSE7; break;
3315 case NFA_ZOPEN8: mclose = NFA_ZCLOSE8; break;
3316 case NFA_ZOPEN9: mclose = NFA_ZCLOSE9; break;
3317 case NFA_COMPOSING: mclose = NFA_END_COMPOSING; break;
3318 default:
3319 /* NFA_MOPEN, NFA_MOPEN1 .. NFA_MOPEN9 */
3320 mclose = *p + NSUBEXP;
3321 break;
3322 }
3323
3324 /* Allow "NFA_MOPEN" as a valid postfix representation for
3325 * the empty regexp "". In this case, the NFA will be
3326 * NFA_MOPEN -> NFA_MCLOSE. Note that this also allows
3327 * empty groups of parenthesis, and empty mbyte chars */
3328 if (stackp == stack) {
3329 s = alloc_state(mopen, NULL, NULL);
3330 if (s == NULL)
3331 goto theend;
3332 s1 = alloc_state(mclose, NULL, NULL);
3333 if (s1 == NULL)
3334 goto theend;
3335 patch(list1(&s->out), s1);
3336 PUSH(frag(s, list1(&s1->out)));
3337 break;
3338 }
3339
3340 /* At least one node was emitted before NFA_MOPEN, so
3341 * at least one node will be between NFA_MOPEN and NFA_MCLOSE */
3342 e = POP();
3343 s = alloc_state(mopen, e.start, NULL); /* `(' */
3344 if (s == NULL)
3345 goto theend;
3346
3347 s1 = alloc_state(mclose, NULL, NULL); /* `)' */
3348 if (s1 == NULL)
3349 goto theend;
3350 patch(e.out, s1);
3351
3352 if (mopen == NFA_COMPOSING)
3353 /* COMPOSING->out1 = END_COMPOSING */
3354 patch(list1(&s->out1), s1);
3355
3356 PUSH(frag(s, list1(&s1->out)));
3357 break;
3358
3359 case NFA_BACKREF1:
3360 case NFA_BACKREF2:
3361 case NFA_BACKREF3:
3362 case NFA_BACKREF4:
3363 case NFA_BACKREF5:
3364 case NFA_BACKREF6:
3365 case NFA_BACKREF7:
3366 case NFA_BACKREF8:
3367 case NFA_BACKREF9:
3368 case NFA_ZREF1:
3369 case NFA_ZREF2:
3370 case NFA_ZREF3:
3371 case NFA_ZREF4:
3372 case NFA_ZREF5:
3373 case NFA_ZREF6:
3374 case NFA_ZREF7:
3375 case NFA_ZREF8:
3376 case NFA_ZREF9:
3377 if (nfa_calc_size == TRUE) {
3378 nstate += 2;
3379 break;
3380 }
3381 s = alloc_state(*p, NULL, NULL);
3382 if (s == NULL)
3383 goto theend;
3384 s1 = alloc_state(NFA_SKIP, NULL, NULL);
3385 if (s1 == NULL)
3386 goto theend;
3387 patch(list1(&s->out), s1);
3388 PUSH(frag(s, list1(&s1->out)));
3389 break;
3390
3391 case NFA_LNUM:
3392 case NFA_LNUM_GT:
3393 case NFA_LNUM_LT:
3394 case NFA_VCOL:
3395 case NFA_VCOL_GT:
3396 case NFA_VCOL_LT:
3397 case NFA_COL:
3398 case NFA_COL_GT:
3399 case NFA_COL_LT:
3400 case NFA_MARK:
3401 case NFA_MARK_GT:
3402 case NFA_MARK_LT:
3403 {
3404 int n = *++p; /* lnum, col or mark name */
3405
3406 if (nfa_calc_size == TRUE) {
3407 nstate += 1;
3408 break;
3409 }
3410 s = alloc_state(p[-1], NULL, NULL);
3411 if (s == NULL)
3412 goto theend;
3413 s->val = n;
3414 PUSH(frag(s, list1(&s->out)));
3415 break;
3416 }
3417
3418 case NFA_ZSTART:
3419 case NFA_ZEND:
3420 default:
3421 /* Operands */
3422 if (nfa_calc_size == TRUE) {
3423 nstate++;
3424 break;
3425 }
3426 s = alloc_state(*p, NULL, NULL);
3427 if (s == NULL)
3428 goto theend;
3429 PUSH(frag(s, list1(&s->out)));
3430 break;
3431
3432 } /* switch(*p) */
3433
3434 } /* for(p = postfix; *p; ++p) */
3435
3436 if (nfa_calc_size == TRUE) {
3437 nstate++;
3438 goto theend; /* Return value when counting size is ignored anyway */
3439 }
3440
3441 e = POP();
3442 if (stackp != stack) {
3443 xfree(stack);
3444 EMSG_RET_NULL(_("E875: (NFA regexp) (While converting from postfix to NFA),"
3445 "too many states left on stack"));
3446 }
3447
3448 if (istate >= nstate) {
3449 xfree(stack);
3450 EMSG_RET_NULL(_("E876: (NFA regexp) "
3451 "Not enough space to store the whole NFA "));
3452 }
3453
3454 matchstate = &state_ptr[istate++]; /* the match state */
3455 matchstate->c = NFA_MATCH;
3456 matchstate->out = matchstate->out1 = NULL;
3457 matchstate->id = 0;
3458
3459 patch(e.out, matchstate);
3460 ret = e.start;
3461
3462theend:
3463 xfree(stack);
3464 return ret;
3465
3466#undef POP1
3467#undef PUSH1
3468#undef POP2
3469#undef PUSH2
3470#undef POP
3471#undef PUSH
3472}
3473
3474/*
3475 * After building the NFA program, inspect it to add optimization hints.
3476 */
3477static void nfa_postprocess(nfa_regprog_T *prog)
3478{
3479 int i;
3480 int c;
3481
3482 for (i = 0; i < prog->nstate; ++i) {
3483 c = prog->state[i].c;
3484 if (c == NFA_START_INVISIBLE
3485 || c == NFA_START_INVISIBLE_NEG
3486 || c == NFA_START_INVISIBLE_BEFORE
3487 || c == NFA_START_INVISIBLE_BEFORE_NEG) {
3488 int directly;
3489
3490 /* Do it directly when what follows is possibly the end of the
3491 * match. */
3492 if (match_follows(prog->state[i].out1->out, 0))
3493 directly = TRUE;
3494 else {
3495 int ch_invisible = failure_chance(prog->state[i].out, 0);
3496 int ch_follows = failure_chance(prog->state[i].out1->out, 0);
3497
3498 /* Postpone when the invisible match is expensive or has a
3499 * lower chance of failing. */
3500 if (c == NFA_START_INVISIBLE_BEFORE
3501 || c == NFA_START_INVISIBLE_BEFORE_NEG) {
3502 /* "before" matches are very expensive when
3503 * unbounded, always prefer what follows then,
3504 * unless what follows will always match.
3505 * Otherwise strongly prefer what follows. */
3506 if (prog->state[i].val <= 0 && ch_follows > 0)
3507 directly = FALSE;
3508 else
3509 directly = ch_follows * 10 < ch_invisible;
3510 } else {
3511 /* normal invisible, first do the one with the
3512 * highest failure chance */
3513 directly = ch_follows < ch_invisible;
3514 }
3515 }
3516 if (directly)
3517 /* switch to the _FIRST state */
3518 ++prog->state[i].c;
3519 }
3520 }
3521}
3522
3523/****************************************************************
3524* NFA execution code.
3525****************************************************************/
3526
3527/* Values for done in nfa_pim_T. */
3528#define NFA_PIM_UNUSED 0 /* pim not used */
3529#define NFA_PIM_TODO 1 /* pim not done yet */
3530#define NFA_PIM_MATCH 2 /* pim executed, matches */
3531#define NFA_PIM_NOMATCH 3 /* pim executed, no match */
3532
3533
3534#ifdef REGEXP_DEBUG
3535static void log_subsexpr(regsubs_T *subs)
3536{
3537 log_subexpr(&subs->norm);
3538 if (nfa_has_zsubexpr)
3539 log_subexpr(&subs->synt);
3540}
3541
3542static void log_subexpr(regsub_T *sub)
3543{
3544 int j;
3545
3546 for (j = 0; j < sub->in_use; j++)
3547 if (REG_MULTI)
3548 fprintf(log_fd, "*** group %d, start: c=%d, l=%d, end: c=%d, l=%d\n",
3549 j,
3550 sub->list.multi[j].start_col,
3551 (int)sub->list.multi[j].start_lnum,
3552 sub->list.multi[j].end_col,
3553 (int)sub->list.multi[j].end_lnum);
3554 else {
3555 char *s = (char *)sub->list.line[j].start;
3556 char *e = (char *)sub->list.line[j].end;
3557
3558 fprintf(log_fd, "*** group %d, start: \"%s\", end: \"%s\"\n",
3559 j,
3560 s == NULL ? "NULL" : s,
3561 e == NULL ? "NULL" : e);
3562 }
3563}
3564
3565static char *pim_info(nfa_pim_T *pim)
3566{
3567 static char buf[30];
3568
3569 if (pim == NULL || pim->result == NFA_PIM_UNUSED)
3570 buf[0] = NUL;
3571 else {
3572 sprintf(buf, " PIM col %d", REG_MULTI ? (int)pim->end.pos.col
3573 : (int)(pim->end.ptr - reginput));
3574 }
3575 return buf;
3576}
3577
3578#endif
3579
3580// Used during execution: whether a match has been found.
3581static int nfa_match;
3582static proftime_T *nfa_time_limit;
3583static int *nfa_timed_out;
3584static int nfa_time_count;
3585
3586// Copy postponed invisible match info from "from" to "to".
3587static void copy_pim(nfa_pim_T *to, nfa_pim_T *from)
3588{
3589 to->result = from->result;
3590 to->state = from->state;
3591 copy_sub(&to->subs.norm, &from->subs.norm);
3592 if (nfa_has_zsubexpr)
3593 copy_sub(&to->subs.synt, &from->subs.synt);
3594 to->end = from->end;
3595}
3596
3597static void clear_sub(regsub_T *sub)
3598{
3599 if (REG_MULTI)
3600 /* Use 0xff to set lnum to -1 */
3601 memset(sub->list.multi, 0xff,
3602 sizeof(struct multipos) * nfa_nsubexpr);
3603 else
3604 memset(sub->list.line, 0, sizeof(struct linepos) * nfa_nsubexpr);
3605 sub->in_use = 0;
3606}
3607
3608/*
3609 * Copy the submatches from "from" to "to".
3610 */
3611static void copy_sub(regsub_T *to, regsub_T *from)
3612{
3613 to->in_use = from->in_use;
3614 if (from->in_use > 0) {
3615 /* Copy the match start and end positions. */
3616 if (REG_MULTI)
3617 memmove(&to->list.multi[0],
3618 &from->list.multi[0],
3619 sizeof(struct multipos) * from->in_use);
3620 else
3621 memmove(&to->list.line[0],
3622 &from->list.line[0],
3623 sizeof(struct linepos) * from->in_use);
3624 }
3625}
3626
3627/*
3628 * Like copy_sub() but exclude the main match.
3629 */
3630static void copy_sub_off(regsub_T *to, regsub_T *from)
3631{
3632 if (to->in_use < from->in_use)
3633 to->in_use = from->in_use;
3634 if (from->in_use > 1) {
3635 /* Copy the match start and end positions. */
3636 if (REG_MULTI)
3637 memmove(&to->list.multi[1],
3638 &from->list.multi[1],
3639 sizeof(struct multipos) * (from->in_use - 1));
3640 else
3641 memmove(&to->list.line[1],
3642 &from->list.line[1],
3643 sizeof(struct linepos) * (from->in_use - 1));
3644 }
3645}
3646
3647/*
3648 * Like copy_sub() but only do the end of the main match if \ze is present.
3649 */
3650static void copy_ze_off(regsub_T *to, regsub_T *from)
3651{
3652 if (nfa_has_zend) {
3653 if (REG_MULTI) {
3654 if (from->list.multi[0].end_lnum >= 0){
3655 to->list.multi[0].end_lnum = from->list.multi[0].end_lnum;
3656 to->list.multi[0].end_col = from->list.multi[0].end_col;
3657 }
3658 } else {
3659 if (from->list.line[0].end != NULL)
3660 to->list.line[0].end = from->list.line[0].end;
3661 }
3662 }
3663}
3664
3665// Return TRUE if "sub1" and "sub2" have the same start positions.
3666// When using back-references also check the end position.
3667static int sub_equal(regsub_T *sub1, regsub_T *sub2)
3668{
3669 int i;
3670 int todo;
3671 linenr_T s1;
3672 linenr_T s2;
3673 char_u *sp1;
3674 char_u *sp2;
3675
3676 todo = sub1->in_use > sub2->in_use ? sub1->in_use : sub2->in_use;
3677 if (REG_MULTI) {
3678 for (i = 0; i < todo; ++i) {
3679 if (i < sub1->in_use)
3680 s1 = sub1->list.multi[i].start_lnum;
3681 else
3682 s1 = -1;
3683 if (i < sub2->in_use)
3684 s2 = sub2->list.multi[i].start_lnum;
3685 else
3686 s2 = -1;
3687 if (s1 != s2)
3688 return FALSE;
3689 if (s1 != -1 && sub1->list.multi[i].start_col
3690 != sub2->list.multi[i].start_col)
3691 return FALSE;
3692
3693 if (nfa_has_backref) {
3694 if (i < sub1->in_use) {
3695 s1 = sub1->list.multi[i].end_lnum;
3696 } else {
3697 s1 = -1;
3698 }
3699 if (i < sub2->in_use) {
3700 s2 = sub2->list.multi[i].end_lnum;
3701 } else {
3702 s2 = -1;
3703 }
3704 if (s1 != s2) {
3705 return FALSE;
3706 }
3707 if (s1 != -1
3708 && sub1->list.multi[i].end_col != sub2->list.multi[i].end_col) {
3709 return FALSE;
3710 }
3711 }
3712 }
3713 } else {
3714 for (i = 0; i < todo; ++i) {
3715 if (i < sub1->in_use)
3716 sp1 = sub1->list.line[i].start;
3717 else
3718 sp1 = NULL;
3719 if (i < sub2->in_use)
3720 sp2 = sub2->list.line[i].start;
3721 else
3722 sp2 = NULL;
3723 if (sp1 != sp2)
3724 return FALSE;
3725
3726 if (nfa_has_backref) {
3727 if (i < sub1->in_use) {
3728 sp1 = sub1->list.line[i].end;
3729 } else {
3730 sp1 = NULL;
3731 }
3732 if (i < sub2->in_use) {
3733 sp2 = sub2->list.line[i].end;
3734 } else {
3735 sp2 = NULL;
3736 }
3737 if (sp1 != sp2) {
3738 return FALSE;
3739 }
3740 }
3741 }
3742 }
3743
3744 return TRUE;
3745}
3746
3747#ifdef REGEXP_DEBUG
3748static void report_state(char *action,
3749 regsub_T *sub,
3750 nfa_state_T *state,
3751 int lid,
3752 nfa_pim_T *pim) {
3753 int col;
3754
3755 if (sub->in_use <= 0)
3756 col = -1;
3757 else if (REG_MULTI)
3758 col = sub->list.multi[0].start_col;
3759 else
3760 col = (int)(sub->list.line[0].start - regline);
3761 nfa_set_code(state->c);
3762 fprintf(log_fd, "> %s state %d to list %d. char %d: %s (start col %d)%s\n",
3763 action, abs(state->id), lid, state->c, code, col,
3764 pim_info(pim));
3765}
3766
3767#endif
3768
3769/*
3770 * Return TRUE if the same state is already in list "l" with the same
3771 * positions as "subs".
3772 */
3773static int
3774has_state_with_pos (
3775 nfa_list_T *l, /* runtime state list */
3776 nfa_state_T *state, /* state to update */
3777 regsubs_T *subs, /* pointers to subexpressions */
3778 nfa_pim_T *pim /* postponed match or NULL */
3779)
3780{
3781 nfa_thread_T *thread;
3782 int i;
3783
3784 for (i = 0; i < l->n; ++i) {
3785 thread = &l->t[i];
3786 if (thread->state->id == state->id
3787 && sub_equal(&thread->subs.norm, &subs->norm)
3788 && (!nfa_has_zsubexpr
3789 || sub_equal(&thread->subs.synt, &subs->synt))
3790 && pim_equal(&thread->pim, pim))
3791 return TRUE;
3792 }
3793 return FALSE;
3794}
3795
3796/*
3797 * Return TRUE if "one" and "two" are equal. That includes when both are not
3798 * set.
3799 */
3800static int pim_equal(nfa_pim_T *one, nfa_pim_T *two)
3801{
3802 int one_unused = (one == NULL || one->result == NFA_PIM_UNUSED);
3803 int two_unused = (two == NULL || two->result == NFA_PIM_UNUSED);
3804
3805 if (one_unused)
3806 /* one is unused: equal when two is also unused */
3807 return two_unused;
3808 if (two_unused)
3809 /* one is used and two is not: not equal */
3810 return FALSE;
3811 /* compare the state id */
3812 if (one->state->id != two->state->id)
3813 return FALSE;
3814 /* compare the position */
3815 if (REG_MULTI)
3816 return one->end.pos.lnum == two->end.pos.lnum
3817 && one->end.pos.col == two->end.pos.col;
3818 return one->end.ptr == two->end.ptr;
3819}
3820
3821/*
3822 * Return TRUE if "state" leads to a NFA_MATCH without advancing the input.
3823 */
3824static int match_follows(nfa_state_T *startstate, int depth)
3825{
3826 nfa_state_T *state = startstate;
3827
3828 /* avoid too much recursion */
3829 if (depth > 10)
3830 return FALSE;
3831
3832 while (state != NULL) {
3833 switch (state->c) {
3834 case NFA_MATCH:
3835 case NFA_MCLOSE:
3836 case NFA_END_INVISIBLE:
3837 case NFA_END_INVISIBLE_NEG:
3838 case NFA_END_PATTERN:
3839 return TRUE;
3840
3841 case NFA_SPLIT:
3842 return match_follows(state->out, depth + 1)
3843 || match_follows(state->out1, depth + 1);
3844
3845 case NFA_START_INVISIBLE:
3846 case NFA_START_INVISIBLE_FIRST:
3847 case NFA_START_INVISIBLE_BEFORE:
3848 case NFA_START_INVISIBLE_BEFORE_FIRST:
3849 case NFA_START_INVISIBLE_NEG:
3850 case NFA_START_INVISIBLE_NEG_FIRST:
3851 case NFA_START_INVISIBLE_BEFORE_NEG:
3852 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
3853 case NFA_COMPOSING:
3854 /* skip ahead to next state */
3855 state = state->out1->out;
3856 continue;
3857
3858 case NFA_ANY:
3859 case NFA_ANY_COMPOSING:
3860 case NFA_IDENT:
3861 case NFA_SIDENT:
3862 case NFA_KWORD:
3863 case NFA_SKWORD:
3864 case NFA_FNAME:
3865 case NFA_SFNAME:
3866 case NFA_PRINT:
3867 case NFA_SPRINT:
3868 case NFA_WHITE:
3869 case NFA_NWHITE:
3870 case NFA_DIGIT:
3871 case NFA_NDIGIT:
3872 case NFA_HEX:
3873 case NFA_NHEX:
3874 case NFA_OCTAL:
3875 case NFA_NOCTAL:
3876 case NFA_WORD:
3877 case NFA_NWORD:
3878 case NFA_HEAD:
3879 case NFA_NHEAD:
3880 case NFA_ALPHA:
3881 case NFA_NALPHA:
3882 case NFA_LOWER:
3883 case NFA_NLOWER:
3884 case NFA_UPPER:
3885 case NFA_NUPPER:
3886 case NFA_LOWER_IC:
3887 case NFA_NLOWER_IC:
3888 case NFA_UPPER_IC:
3889 case NFA_NUPPER_IC:
3890 case NFA_START_COLL:
3891 case NFA_START_NEG_COLL:
3892 case NFA_NEWL:
3893 /* state will advance input */
3894 return FALSE;
3895
3896 default:
3897 if (state->c > 0)
3898 /* state will advance input */
3899 return FALSE;
3900
3901 /* Others: zero-width or possibly zero-width, might still find
3902 * a match at the same position, keep looking. */
3903 break;
3904 }
3905 state = state->out;
3906 }
3907 return FALSE;
3908}
3909
3910
3911/*
3912 * Return TRUE if "state" is already in list "l".
3913 */
3914static int
3915state_in_list (
3916 nfa_list_T *l, /* runtime state list */
3917 nfa_state_T *state, /* state to update */
3918 regsubs_T *subs /* pointers to subexpressions */
3919)
3920{
3921 if (state->lastlist[nfa_ll_index] == l->id) {
3922 if (!nfa_has_backref || has_state_with_pos(l, state, subs, NULL))
3923 return TRUE;
3924 }
3925 return FALSE;
3926}
3927
3928// Offset used for "off" by addstate_here().
3929#define ADDSTATE_HERE_OFFSET 10
3930
3931// Add "state" and possibly what follows to state list ".".
3932// Returns "subs_arg", possibly copied into temp_subs.
3933// Returns NULL when recursiveness is too deep.
3934static regsubs_T *addstate(
3935 nfa_list_T *l, // runtime state list
3936 nfa_state_T *state, // state to update
3937 regsubs_T *subs_arg, // pointers to subexpressions
3938 nfa_pim_T *pim, // postponed look-behind match
3939 int off_arg) // byte offset, when -1 go to next line
3940 FUNC_ATTR_NONNULL_ARG(1, 2) FUNC_ATTR_WARN_UNUSED_RESULT
3941{
3942 int subidx;
3943 int off = off_arg;
3944 int add_here = FALSE;
3945 int listindex = 0;
3946 int k;
3947 int found = FALSE;
3948 nfa_thread_T *thread;
3949 struct multipos save_multipos;
3950 int save_in_use;
3951 char_u *save_ptr;
3952 int i;
3953 regsub_T *sub;
3954 regsubs_T *subs = subs_arg;
3955 static regsubs_T temp_subs;
3956#ifdef REGEXP_DEBUG
3957 int did_print = FALSE;
3958#endif
3959 static int depth = 0;
3960
3961 // This function is called recursively. When the depth is too much we run
3962 // out of stack and crash, limit recursiveness here.
3963 if (++depth >= 5000 || subs == NULL) {
3964 depth--;
3965 return NULL;
3966 }
3967
3968 if (off_arg <= -ADDSTATE_HERE_OFFSET) {
3969 add_here = true;
3970 off = 0;
3971 listindex = -(off_arg + ADDSTATE_HERE_OFFSET);
3972 }
3973
3974 switch (state->c) {
3975 case NFA_NCLOSE:
3976 case NFA_MCLOSE:
3977 case NFA_MCLOSE1:
3978 case NFA_MCLOSE2:
3979 case NFA_MCLOSE3:
3980 case NFA_MCLOSE4:
3981 case NFA_MCLOSE5:
3982 case NFA_MCLOSE6:
3983 case NFA_MCLOSE7:
3984 case NFA_MCLOSE8:
3985 case NFA_MCLOSE9:
3986 case NFA_ZCLOSE:
3987 case NFA_ZCLOSE1:
3988 case NFA_ZCLOSE2:
3989 case NFA_ZCLOSE3:
3990 case NFA_ZCLOSE4:
3991 case NFA_ZCLOSE5:
3992 case NFA_ZCLOSE6:
3993 case NFA_ZCLOSE7:
3994 case NFA_ZCLOSE8:
3995 case NFA_ZCLOSE9:
3996 case NFA_MOPEN:
3997 case NFA_ZEND:
3998 case NFA_SPLIT:
3999 case NFA_EMPTY:
4000 /* These nodes are not added themselves but their "out" and/or
4001 * "out1" may be added below. */
4002 break;
4003
4004 case NFA_BOL:
4005 case NFA_BOF:
4006 /* "^" won't match past end-of-line, don't bother trying.
4007 * Except when at the end of the line, or when we are going to the
4008 * next line for a look-behind match. */
4009 if (reginput > regline
4010 && *reginput != NUL
4011 && (nfa_endp == NULL
4012 || !REG_MULTI
4013 || reglnum == nfa_endp->se_u.pos.lnum))
4014 goto skip_add;
4015 FALLTHROUGH;
4016
4017 case NFA_MOPEN1:
4018 case NFA_MOPEN2:
4019 case NFA_MOPEN3:
4020 case NFA_MOPEN4:
4021 case NFA_MOPEN5:
4022 case NFA_MOPEN6:
4023 case NFA_MOPEN7:
4024 case NFA_MOPEN8:
4025 case NFA_MOPEN9:
4026 case NFA_ZOPEN:
4027 case NFA_ZOPEN1:
4028 case NFA_ZOPEN2:
4029 case NFA_ZOPEN3:
4030 case NFA_ZOPEN4:
4031 case NFA_ZOPEN5:
4032 case NFA_ZOPEN6:
4033 case NFA_ZOPEN7:
4034 case NFA_ZOPEN8:
4035 case NFA_ZOPEN9:
4036 case NFA_NOPEN:
4037 case NFA_ZSTART:
4038 /* These nodes need to be added so that we can bail out when it
4039 * was added to this list before at the same position to avoid an
4040 * endless loop for "\(\)*" */
4041
4042 default:
4043 if (state->lastlist[nfa_ll_index] == l->id && state->c != NFA_SKIP) {
4044 /* This state is already in the list, don't add it again,
4045 * unless it is an MOPEN that is used for a backreference or
4046 * when there is a PIM. For NFA_MATCH check the position,
4047 * lower position is preferred. */
4048 if (!nfa_has_backref && pim == NULL && !l->has_pim
4049 && state->c != NFA_MATCH) {
4050
4051 /* When called from addstate_here() do insert before
4052 * existing states. */
4053 if (add_here) {
4054 for (k = 0; k < l->n && k < listindex; ++k) {
4055 if (l->t[k].state->id == state->id) {
4056 found = TRUE;
4057 break;
4058 }
4059 }
4060 }
4061
4062 if (!add_here || found) {
4063skip_add:
4064#ifdef REGEXP_DEBUG
4065 nfa_set_code(state->c);
4066 fprintf(log_fd, "> Not adding state %d to list %d. char %d: %s pim: %s has_pim: %d found: %d\n",
4067 abs(state->id), l->id, state->c, code,
4068 pim == NULL ? "NULL" : "yes", l->has_pim, found);
4069#endif
4070 depth--;
4071 return subs;
4072 }
4073 }
4074
4075 /* Do not add the state again when it exists with the same
4076 * positions. */
4077 if (has_state_with_pos(l, state, subs, pim))
4078 goto skip_add;
4079 }
4080
4081 // When there are backreferences or PIMs the number of states may
4082 // be (a lot) bigger than anticipated.
4083 if (l->n == l->len) {
4084 const int newlen = l->len * 3 / 2 + 50;
4085 const size_t newsize = newlen * sizeof(nfa_thread_T);
4086
4087 if ((long)(newsize >> 10) >= p_mmp) {
4088 EMSG(_(e_maxmempat));
4089 depth--;
4090 return NULL;
4091 }
4092 if (subs != &temp_subs) {
4093 /* "subs" may point into the current array, need to make a
4094 * copy before it becomes invalid. */
4095 copy_sub(&temp_subs.norm, &subs->norm);
4096 if (nfa_has_zsubexpr)
4097 copy_sub(&temp_subs.synt, &subs->synt);
4098 subs = &temp_subs;
4099 }
4100
4101 nfa_thread_T *const newt = xrealloc(l->t, newsize);
4102 l->t = newt;
4103 l->len = newlen;
4104 }
4105
4106 /* add the state to the list */
4107 state->lastlist[nfa_ll_index] = l->id;
4108 thread = &l->t[l->n++];
4109 thread->state = state;
4110 if (pim == NULL)
4111 thread->pim.result = NFA_PIM_UNUSED;
4112 else {
4113 copy_pim(&thread->pim, pim);
4114 l->has_pim = TRUE;
4115 }
4116 copy_sub(&thread->subs.norm, &subs->norm);
4117 if (nfa_has_zsubexpr)
4118 copy_sub(&thread->subs.synt, &subs->synt);
4119#ifdef REGEXP_DEBUG
4120 report_state("Adding", &thread->subs.norm, state, l->id, pim);
4121 did_print = TRUE;
4122#endif
4123 }
4124
4125#ifdef REGEXP_DEBUG
4126 if (!did_print)
4127 report_state("Processing", &subs->norm, state, l->id, pim);
4128#endif
4129 switch (state->c) {
4130 case NFA_MATCH:
4131 break;
4132
4133 case NFA_SPLIT:
4134 /* order matters here */
4135 subs = addstate(l, state->out, subs, pim, off_arg);
4136 subs = addstate(l, state->out1, subs, pim, off_arg);
4137 break;
4138
4139 case NFA_EMPTY:
4140 case NFA_NOPEN:
4141 case NFA_NCLOSE:
4142 subs = addstate(l, state->out, subs, pim, off_arg);
4143 break;
4144
4145 case NFA_MOPEN:
4146 case NFA_MOPEN1:
4147 case NFA_MOPEN2:
4148 case NFA_MOPEN3:
4149 case NFA_MOPEN4:
4150 case NFA_MOPEN5:
4151 case NFA_MOPEN6:
4152 case NFA_MOPEN7:
4153 case NFA_MOPEN8:
4154 case NFA_MOPEN9:
4155 case NFA_ZOPEN:
4156 case NFA_ZOPEN1:
4157 case NFA_ZOPEN2:
4158 case NFA_ZOPEN3:
4159 case NFA_ZOPEN4:
4160 case NFA_ZOPEN5:
4161 case NFA_ZOPEN6:
4162 case NFA_ZOPEN7:
4163 case NFA_ZOPEN8:
4164 case NFA_ZOPEN9:
4165 case NFA_ZSTART:
4166 if (state->c == NFA_ZSTART) {
4167 subidx = 0;
4168 sub = &subs->norm;
4169 } else if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9) { // -V560
4170 subidx = state->c - NFA_ZOPEN;
4171 sub = &subs->synt;
4172 } else {
4173 subidx = state->c - NFA_MOPEN;
4174 sub = &subs->norm;
4175 }
4176
4177 /* avoid compiler warnings */
4178 save_ptr = NULL;
4179 memset(&save_multipos, 0, sizeof(save_multipos));
4180
4181 /* Set the position (with "off" added) in the subexpression. Save
4182 * and restore it when it was in use. Otherwise fill any gap. */
4183 if (REG_MULTI) {
4184 if (subidx < sub->in_use) {
4185 save_multipos = sub->list.multi[subidx];
4186 save_in_use = -1;
4187 } else {
4188 save_in_use = sub->in_use;
4189 for (i = sub->in_use; i < subidx; ++i) {
4190 sub->list.multi[i].start_lnum = -1;
4191 sub->list.multi[i].end_lnum = -1;
4192 }
4193 sub->in_use = subidx + 1;
4194 }
4195 if (off == -1) {
4196 sub->list.multi[subidx].start_lnum = reglnum + 1;
4197 sub->list.multi[subidx].start_col = 0;
4198 } else {
4199
4200 sub->list.multi[subidx].start_lnum = reglnum;
4201 sub->list.multi[subidx].start_col =
4202 (colnr_T)(reginput - regline + off);
4203 }
4204 sub->list.multi[subidx].end_lnum = -1;
4205 } else {
4206 if (subidx < sub->in_use) {
4207 save_ptr = sub->list.line[subidx].start;
4208 save_in_use = -1;
4209 } else {
4210 save_in_use = sub->in_use;
4211 for (i = sub->in_use; i < subidx; ++i) {
4212 sub->list.line[i].start = NULL;
4213 sub->list.line[i].end = NULL;
4214 }
4215 sub->in_use = subidx + 1;
4216 }
4217 sub->list.line[subidx].start = reginput + off;
4218 }
4219
4220 subs = addstate(l, state->out, subs, pim, off_arg);
4221 if (subs == NULL) {
4222 break;
4223 }
4224 // "subs" may have changed, need to set "sub" again.
4225 if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9) { // -V560
4226 sub = &subs->synt;
4227 } else {
4228 sub = &subs->norm;
4229 }
4230
4231 if (save_in_use == -1) {
4232 if (REG_MULTI) {
4233 sub->list.multi[subidx] = save_multipos;
4234 }
4235 else
4236 sub->list.line[subidx].start = save_ptr;
4237 } else
4238 sub->in_use = save_in_use;
4239 break;
4240
4241 case NFA_MCLOSE:
4242 if (nfa_has_zend && (REG_MULTI
4243 ? subs->norm.list.multi[0].end_lnum >= 0
4244 : subs->norm.list.line[0].end != NULL)) {
4245 // Do not overwrite the position set by \ze.
4246 subs = addstate(l, state->out, subs, pim, off_arg);
4247 break;
4248 }
4249 FALLTHROUGH;
4250 case NFA_MCLOSE1:
4251 case NFA_MCLOSE2:
4252 case NFA_MCLOSE3:
4253 case NFA_MCLOSE4:
4254 case NFA_MCLOSE5:
4255 case NFA_MCLOSE6:
4256 case NFA_MCLOSE7:
4257 case NFA_MCLOSE8:
4258 case NFA_MCLOSE9:
4259 case NFA_ZCLOSE:
4260 case NFA_ZCLOSE1:
4261 case NFA_ZCLOSE2:
4262 case NFA_ZCLOSE3:
4263 case NFA_ZCLOSE4:
4264 case NFA_ZCLOSE5:
4265 case NFA_ZCLOSE6:
4266 case NFA_ZCLOSE7:
4267 case NFA_ZCLOSE8:
4268 case NFA_ZCLOSE9:
4269 case NFA_ZEND:
4270 if (state->c == NFA_ZEND) {
4271 subidx = 0;
4272 sub = &subs->norm;
4273 } else if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9) { // -V560
4274 subidx = state->c - NFA_ZCLOSE;
4275 sub = &subs->synt;
4276 } else {
4277 subidx = state->c - NFA_MCLOSE;
4278 sub = &subs->norm;
4279 }
4280
4281 /* We don't fill in gaps here, there must have been an MOPEN that
4282 * has done that. */
4283 save_in_use = sub->in_use;
4284 if (sub->in_use <= subidx)
4285 sub->in_use = subidx + 1;
4286 if (REG_MULTI) {
4287 save_multipos = sub->list.multi[subidx];
4288 if (off == -1) {
4289 sub->list.multi[subidx].end_lnum = reglnum + 1;
4290 sub->list.multi[subidx].end_col = 0;
4291 } else {
4292 sub->list.multi[subidx].end_lnum = reglnum;
4293 sub->list.multi[subidx].end_col =
4294 (colnr_T)(reginput - regline + off);
4295 }
4296 /* avoid compiler warnings */
4297 save_ptr = NULL;
4298 } else {
4299 save_ptr = sub->list.line[subidx].end;
4300 sub->list.line[subidx].end = reginput + off;
4301 // avoid compiler warnings
4302 memset(&save_multipos, 0, sizeof(save_multipos));
4303 }
4304
4305 subs = addstate(l, state->out, subs, pim, off_arg);
4306 if (subs == NULL) {
4307 break;
4308 }
4309 // "subs" may have changed, need to set "sub" again.
4310 if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9) { // -V560
4311 sub = &subs->synt;
4312 } else {
4313 sub = &subs->norm;
4314 }
4315
4316 if (REG_MULTI) {
4317 sub->list.multi[subidx] = save_multipos;
4318 }
4319 else
4320 sub->list.line[subidx].end = save_ptr;
4321 sub->in_use = save_in_use;
4322 break;
4323 }
4324 depth--;
4325 return subs;
4326}
4327
4328/*
4329 * Like addstate(), but the new state(s) are put at position "*ip".
4330 * Used for zero-width matches, next state to use is the added one.
4331 * This makes sure the order of states to be tried does not change, which
4332 * matters for alternatives.
4333 */
4334static regsubs_T *addstate_here(
4335 nfa_list_T *l, // runtime state list
4336 nfa_state_T *state, // state to update
4337 regsubs_T *subs, // pointers to subexpressions
4338 nfa_pim_T *pim, // postponed look-behind match
4339 int *ip
4340)
4341 FUNC_ATTR_NONNULL_ARG(1, 2, 5) FUNC_ATTR_WARN_UNUSED_RESULT
4342{
4343 int tlen = l->n;
4344 int count;
4345 int listidx = *ip;
4346
4347 /* First add the state(s) at the end, so that we know how many there are.
4348 * Pass the listidx as offset (avoids adding another argument to
4349 * addstate(). */
4350 regsubs_T *r = addstate(l, state, subs, pim, -listidx - ADDSTATE_HERE_OFFSET);
4351 if (r == NULL) {
4352 return NULL;
4353 }
4354
4355 // when "*ip" was at the end of the list, nothing to do
4356 if (listidx + 1 == tlen) {
4357 return r;
4358 }
4359
4360 // re-order to put the new state at the current position
4361 count = l->n - tlen;
4362 if (count == 0) {
4363 return r; // no state got added
4364 }
4365 if (count == 1) {
4366 // overwrite the current state
4367 l->t[listidx] = l->t[l->n - 1];
4368 } else if (count > 1) {
4369 if (l->n + count - 1 >= l->len) {
4370 /* not enough space to move the new states, reallocate the list
4371 * and move the states to the right position */
4372 const int newlen = l->len * 3 / 2 + 50;
4373 const size_t newsize = newlen * sizeof(nfa_thread_T);
4374
4375 if ((long)(newsize >> 10) >= p_mmp) {
4376 EMSG(_(e_maxmempat));
4377 return NULL;
4378 }
4379 nfa_thread_T *const newl = xmalloc(newsize);
4380 l->len = newlen;
4381 memmove(&(newl[0]),
4382 &(l->t[0]),
4383 sizeof(nfa_thread_T) * listidx);
4384 memmove(&(newl[listidx]),
4385 &(l->t[l->n - count]),
4386 sizeof(nfa_thread_T) * count);
4387 memmove(&(newl[listidx + count]),
4388 &(l->t[listidx + 1]),
4389 sizeof(nfa_thread_T) * (l->n - count - listidx - 1));
4390 xfree(l->t);
4391 l->t = newl;
4392 } else {
4393 /* make space for new states, then move them from the
4394 * end to the current position */
4395 memmove(&(l->t[listidx + count]),
4396 &(l->t[listidx + 1]),
4397 sizeof(nfa_thread_T) * (l->n - listidx - 1));
4398 memmove(&(l->t[listidx]),
4399 &(l->t[l->n - 1]),
4400 sizeof(nfa_thread_T) * count);
4401 }
4402 }
4403 --l->n;
4404 *ip = listidx - 1;
4405
4406 return r;
4407}
4408
4409/*
4410 * Check character class "class" against current character c.
4411 */
4412static int check_char_class(int class, int c)
4413{
4414 switch (class) {
4415 case NFA_CLASS_ALNUM:
4416 if (c >= 1 && c < 128 && isalnum(c)) {
4417 return OK;
4418 }
4419 break;
4420 case NFA_CLASS_ALPHA:
4421 if (c >= 1 && c < 128 && isalpha(c)) {
4422 return OK;
4423 }
4424 break;
4425 case NFA_CLASS_BLANK:
4426 if (c == ' ' || c == '\t')
4427 return OK;
4428 break;
4429 case NFA_CLASS_CNTRL:
4430 if (c >= 1 && c <= 127 && iscntrl(c)) {
4431 return OK;
4432 }
4433 break;
4434 case NFA_CLASS_DIGIT:
4435 if (ascii_isdigit(c))
4436 return OK;
4437 break;
4438 case NFA_CLASS_GRAPH:
4439 if (c >= 1 && c <= 127 && isgraph(c)) {
4440 return OK;
4441 }
4442 break;
4443 case NFA_CLASS_LOWER:
4444 if (mb_islower(c) && c != 170 && c != 186) {
4445 return OK;
4446 }
4447 break;
4448 case NFA_CLASS_PRINT:
4449 if (vim_isprintc(c))
4450 return OK;
4451 break;
4452 case NFA_CLASS_PUNCT:
4453 if (c >= 1 && c < 128 && ispunct(c)) {
4454 return OK;
4455 }
4456 break;
4457 case NFA_CLASS_SPACE:
4458 if ((c >= 9 && c <= 13) || (c == ' '))
4459 return OK;
4460 break;
4461 case NFA_CLASS_UPPER:
4462 if (mb_isupper(c)) {
4463 return OK;
4464 }
4465 break;
4466 case NFA_CLASS_XDIGIT:
4467 if (ascii_isxdigit(c))
4468 return OK;
4469 break;
4470 case NFA_CLASS_TAB:
4471 if (c == '\t')
4472 return OK;
4473 break;
4474 case NFA_CLASS_RETURN:
4475 if (c == '\r')
4476 return OK;
4477 break;
4478 case NFA_CLASS_BACKSPACE:
4479 if (c == '\b')
4480 return OK;
4481 break;
4482 case NFA_CLASS_ESCAPE:
4483 if (c == ESC) {
4484 return OK;
4485 }
4486 break;
4487
4488 default:
4489 // should not be here :P
4490 IEMSGN(_(e_ill_char_class), class);
4491 return FAIL;
4492 }
4493 return FAIL;
4494}
4495
4496/*
4497 * Check for a match with subexpression "subidx".
4498 * Return TRUE if it matches.
4499 */
4500static int
4501match_backref (
4502 regsub_T *sub, /* pointers to subexpressions */
4503 int subidx,
4504 int *bytelen /* out: length of match in bytes */
4505)
4506{
4507 int len;
4508
4509 if (sub->in_use <= subidx) {
4510retempty:
4511 /* backref was not set, match an empty string */
4512 *bytelen = 0;
4513 return TRUE;
4514 }
4515
4516 if (REG_MULTI) {
4517 if (sub->list.multi[subidx].start_lnum < 0
4518 || sub->list.multi[subidx].end_lnum < 0)
4519 goto retempty;
4520 if (sub->list.multi[subidx].start_lnum == reglnum
4521 && sub->list.multi[subidx].end_lnum == reglnum) {
4522 len = sub->list.multi[subidx].end_col
4523 - sub->list.multi[subidx].start_col;
4524 if (cstrncmp(regline + sub->list.multi[subidx].start_col,
4525 reginput, &len) == 0) {
4526 *bytelen = len;
4527 return TRUE;
4528 }
4529 } else {
4530 if (match_with_backref(
4531 sub->list.multi[subidx].start_lnum,
4532 sub->list.multi[subidx].start_col,
4533 sub->list.multi[subidx].end_lnum,
4534 sub->list.multi[subidx].end_col,
4535 bytelen) == RA_MATCH)
4536 return TRUE;
4537 }
4538 } else {
4539 if (sub->list.line[subidx].start == NULL
4540 || sub->list.line[subidx].end == NULL)
4541 goto retempty;
4542 len = (int)(sub->list.line[subidx].end - sub->list.line[subidx].start);
4543 if (cstrncmp(sub->list.line[subidx].start, reginput, &len) == 0) {
4544 *bytelen = len;
4545 return TRUE;
4546 }
4547 }
4548 return FALSE;
4549}
4550
4551
4552
4553/*
4554 * Check for a match with \z subexpression "subidx".
4555 * Return TRUE if it matches.
4556 */
4557static int
4558match_zref (
4559 int subidx,
4560 int *bytelen /* out: length of match in bytes */
4561)
4562{
4563 int len;
4564
4565 cleanup_zsubexpr();
4566 if (re_extmatch_in == NULL || re_extmatch_in->matches[subidx] == NULL) {
4567 /* backref was not set, match an empty string */
4568 *bytelen = 0;
4569 return TRUE;
4570 }
4571
4572 len = (int)STRLEN(re_extmatch_in->matches[subidx]);
4573 if (cstrncmp(re_extmatch_in->matches[subidx], reginput, &len) == 0) {
4574 *bytelen = len;
4575 return TRUE;
4576 }
4577 return FALSE;
4578}
4579
4580/*
4581 * Save list IDs for all NFA states of "prog" into "list".
4582 * Also reset the IDs to zero.
4583 * Only used for the recursive value lastlist[1].
4584 */
4585static void nfa_save_listids(nfa_regprog_T *prog, int *list)
4586{
4587 int i;
4588 nfa_state_T *p;
4589
4590 /* Order in the list is reverse, it's a bit faster that way. */
4591 p = &prog->state[0];
4592 for (i = prog->nstate; --i >= 0; ) {
4593 list[i] = p->lastlist[1];
4594 p->lastlist[1] = 0;
4595 ++p;
4596 }
4597}
4598
4599/*
4600 * Restore list IDs from "list" to all NFA states.
4601 */
4602static void nfa_restore_listids(nfa_regprog_T *prog, int *list)
4603{
4604 int i;
4605 nfa_state_T *p;
4606
4607 p = &prog->state[0];
4608 for (i = prog->nstate; --i >= 0; ) {
4609 p->lastlist[1] = list[i];
4610 ++p;
4611 }
4612}
4613
4614static bool nfa_re_num_cmp(uintmax_t val, int op, uintmax_t pos)
4615{
4616 if (op == 1) return pos > val;
4617 if (op == 2) return pos < val;
4618 return val == pos;
4619}
4620
4621
4622/*
4623 * Recursively call nfa_regmatch()
4624 * "pim" is NULL or contains info about a Postponed Invisible Match (start
4625 * position).
4626 */
4627static int recursive_regmatch(
4628 nfa_state_T *state, nfa_pim_T *pim, nfa_regprog_T *prog,
4629 regsubs_T *submatch, regsubs_T *m, int **listids, int *listids_len)
4630{
4631 int save_reginput_col = (int)(reginput - regline);
4632 int save_reglnum = reglnum;
4633 int save_nfa_match = nfa_match;
4634 int save_nfa_listid = nfa_listid;
4635 save_se_T *save_nfa_endp = nfa_endp;
4636 save_se_T endpos;
4637 save_se_T *endposp = NULL;
4638 int result;
4639 int need_restore = FALSE;
4640
4641 if (pim != NULL) {
4642 /* start at the position where the postponed match was */
4643 if (REG_MULTI)
4644 reginput = regline + pim->end.pos.col;
4645 else
4646 reginput = pim->end.ptr;
4647 }
4648
4649 if (state->c == NFA_START_INVISIBLE_BEFORE
4650 || state->c == NFA_START_INVISIBLE_BEFORE_FIRST
4651 || state->c == NFA_START_INVISIBLE_BEFORE_NEG
4652 || state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST) {
4653 /* The recursive match must end at the current position. When "pim" is
4654 * not NULL it specifies the current position. */
4655 endposp = &endpos;
4656 if (REG_MULTI) {
4657 if (pim == NULL) {
4658 endpos.se_u.pos.col = (int)(reginput - regline);
4659 endpos.se_u.pos.lnum = reglnum;
4660 } else
4661 endpos.se_u.pos = pim->end.pos;
4662 } else {
4663 if (pim == NULL)
4664 endpos.se_u.ptr = reginput;
4665 else
4666 endpos.se_u.ptr = pim->end.ptr;
4667 }
4668
4669 /* Go back the specified number of bytes, or as far as the
4670 * start of the previous line, to try matching "\@<=" or
4671 * not matching "\@<!". This is very inefficient, limit the number of
4672 * bytes if possible. */
4673 if (state->val <= 0) {
4674 if (REG_MULTI) {
4675 regline = reg_getline(--reglnum);
4676 if (regline == NULL)
4677 /* can't go before the first line */
4678 regline = reg_getline(++reglnum);
4679 }
4680 reginput = regline;
4681 } else {
4682 if (REG_MULTI && (int)(reginput - regline) < state->val) {
4683 /* Not enough bytes in this line, go to end of
4684 * previous line. */
4685 regline = reg_getline(--reglnum);
4686 if (regline == NULL) {
4687 /* can't go before the first line */
4688 regline = reg_getline(++reglnum);
4689 reginput = regline;
4690 } else
4691 reginput = regline + STRLEN(regline);
4692 }
4693 if ((int)(reginput - regline) >= state->val) {
4694 reginput -= state->val;
4695 reginput -= utf_head_off(regline, reginput);
4696 } else {
4697 reginput = regline;
4698 }
4699 }
4700 }
4701
4702#ifdef REGEXP_DEBUG
4703 if (log_fd != stderr)
4704 fclose(log_fd);
4705 log_fd = NULL;
4706#endif
4707 /* Have to clear the lastlist field of the NFA nodes, so that
4708 * nfa_regmatch() and addstate() can run properly after recursion. */
4709 if (nfa_ll_index == 1) {
4710 /* Already calling nfa_regmatch() recursively. Save the lastlist[1]
4711 * values and clear them. */
4712 if (*listids == NULL || *listids_len < nstate) {
4713 xfree(*listids);
4714 *listids = xmalloc(sizeof(**listids) * nstate);
4715 *listids_len = nstate;
4716 }
4717 nfa_save_listids(prog, *listids);
4718 need_restore = TRUE;
4719 /* any value of nfa_listid will do */
4720 } else {
4721 /* First recursive nfa_regmatch() call, switch to the second lastlist
4722 * entry. Make sure nfa_listid is different from a previous recursive
4723 * call, because some states may still have this ID. */
4724 ++nfa_ll_index;
4725 if (nfa_listid <= nfa_alt_listid)
4726 nfa_listid = nfa_alt_listid;
4727 }
4728
4729 /* Call nfa_regmatch() to check if the current concat matches at this
4730 * position. The concat ends with the node NFA_END_INVISIBLE */
4731 nfa_endp = endposp;
4732 result = nfa_regmatch(prog, state->out, submatch, m);
4733
4734 if (need_restore)
4735 nfa_restore_listids(prog, *listids);
4736 else {
4737 --nfa_ll_index;
4738 nfa_alt_listid = nfa_listid;
4739 }
4740
4741 /* restore position in input text */
4742 reglnum = save_reglnum;
4743 if (REG_MULTI)
4744 regline = reg_getline(reglnum);
4745 reginput = regline + save_reginput_col;
4746 if (result != NFA_TOO_EXPENSIVE) {
4747 nfa_match = save_nfa_match;
4748 nfa_listid = save_nfa_listid;
4749 }
4750 nfa_endp = save_nfa_endp;
4751
4752#ifdef REGEXP_DEBUG
4753 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
4754 if (log_fd != NULL) {
4755 fprintf(log_fd, "****************************\n");
4756 fprintf(log_fd, "FINISHED RUNNING nfa_regmatch() recursively\n");
4757 fprintf(log_fd, "MATCH = %s\n", !result ? "FALSE" : "OK");
4758 fprintf(log_fd, "****************************\n");
4759 } else {
4760 EMSG(_(e_log_open_failed));
4761 log_fd = stderr;
4762 }
4763#endif
4764
4765 return result;
4766}
4767
4768
4769/*
4770 * Estimate the chance of a match with "state" failing.
4771 * empty match: 0
4772 * NFA_ANY: 1
4773 * specific character: 99
4774 */
4775static int failure_chance(nfa_state_T *state, int depth)
4776{
4777 int c = state->c;
4778 int l, r;
4779
4780 /* detect looping */
4781 if (depth > 4)
4782 return 1;
4783
4784 switch (c) {
4785 case NFA_SPLIT:
4786 if (state->out->c == NFA_SPLIT || state->out1->c == NFA_SPLIT)
4787 /* avoid recursive stuff */
4788 return 1;
4789 /* two alternatives, use the lowest failure chance */
4790 l = failure_chance(state->out, depth + 1);
4791 r = failure_chance(state->out1, depth + 1);
4792 return l < r ? l : r;
4793
4794 case NFA_ANY:
4795 /* matches anything, unlikely to fail */
4796 return 1;
4797
4798 case NFA_MATCH:
4799 case NFA_MCLOSE:
4800 case NFA_ANY_COMPOSING:
4801 /* empty match works always */
4802 return 0;
4803
4804 case NFA_START_INVISIBLE:
4805 case NFA_START_INVISIBLE_FIRST:
4806 case NFA_START_INVISIBLE_NEG:
4807 case NFA_START_INVISIBLE_NEG_FIRST:
4808 case NFA_START_INVISIBLE_BEFORE:
4809 case NFA_START_INVISIBLE_BEFORE_FIRST:
4810 case NFA_START_INVISIBLE_BEFORE_NEG:
4811 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
4812 case NFA_START_PATTERN:
4813 /* recursive regmatch is expensive, use low failure chance */
4814 return 5;
4815
4816 case NFA_BOL:
4817 case NFA_EOL:
4818 case NFA_BOF:
4819 case NFA_EOF:
4820 case NFA_NEWL:
4821 return 99;
4822
4823 case NFA_BOW:
4824 case NFA_EOW:
4825 return 90;
4826
4827 case NFA_MOPEN:
4828 case NFA_MOPEN1:
4829 case NFA_MOPEN2:
4830 case NFA_MOPEN3:
4831 case NFA_MOPEN4:
4832 case NFA_MOPEN5:
4833 case NFA_MOPEN6:
4834 case NFA_MOPEN7:
4835 case NFA_MOPEN8:
4836 case NFA_MOPEN9:
4837 case NFA_ZOPEN:
4838 case NFA_ZOPEN1:
4839 case NFA_ZOPEN2:
4840 case NFA_ZOPEN3:
4841 case NFA_ZOPEN4:
4842 case NFA_ZOPEN5:
4843 case NFA_ZOPEN6:
4844 case NFA_ZOPEN7:
4845 case NFA_ZOPEN8:
4846 case NFA_ZOPEN9:
4847 case NFA_ZCLOSE:
4848 case NFA_ZCLOSE1:
4849 case NFA_ZCLOSE2:
4850 case NFA_ZCLOSE3:
4851 case NFA_ZCLOSE4:
4852 case NFA_ZCLOSE5:
4853 case NFA_ZCLOSE6:
4854 case NFA_ZCLOSE7:
4855 case NFA_ZCLOSE8:
4856 case NFA_ZCLOSE9:
4857 case NFA_NOPEN:
4858 case NFA_MCLOSE1:
4859 case NFA_MCLOSE2:
4860 case NFA_MCLOSE3:
4861 case NFA_MCLOSE4:
4862 case NFA_MCLOSE5:
4863 case NFA_MCLOSE6:
4864 case NFA_MCLOSE7:
4865 case NFA_MCLOSE8:
4866 case NFA_MCLOSE9:
4867 case NFA_NCLOSE:
4868 return failure_chance(state->out, depth + 1);
4869
4870 case NFA_BACKREF1:
4871 case NFA_BACKREF2:
4872 case NFA_BACKREF3:
4873 case NFA_BACKREF4:
4874 case NFA_BACKREF5:
4875 case NFA_BACKREF6:
4876 case NFA_BACKREF7:
4877 case NFA_BACKREF8:
4878 case NFA_BACKREF9:
4879 case NFA_ZREF1:
4880 case NFA_ZREF2:
4881 case NFA_ZREF3:
4882 case NFA_ZREF4:
4883 case NFA_ZREF5:
4884 case NFA_ZREF6:
4885 case NFA_ZREF7:
4886 case NFA_ZREF8:
4887 case NFA_ZREF9:
4888 /* backreferences don't match in many places */
4889 return 94;
4890
4891 case NFA_LNUM_GT:
4892 case NFA_LNUM_LT:
4893 case NFA_COL_GT:
4894 case NFA_COL_LT:
4895 case NFA_VCOL_GT:
4896 case NFA_VCOL_LT:
4897 case NFA_MARK_GT:
4898 case NFA_MARK_LT:
4899 case NFA_VISUAL:
4900 /* before/after positions don't match very often */
4901 return 85;
4902
4903 case NFA_LNUM:
4904 return 90;
4905
4906 case NFA_CURSOR:
4907 case NFA_COL:
4908 case NFA_VCOL:
4909 case NFA_MARK:
4910 /* specific positions rarely match */
4911 return 98;
4912
4913 case NFA_COMPOSING:
4914 return 95;
4915
4916 default:
4917 if (c > 0)
4918 /* character match fails often */
4919 return 95;
4920 }
4921
4922 /* something else, includes character classes */
4923 return 50;
4924}
4925
4926/*
4927 * Skip until the char "c" we know a match must start with.
4928 */
4929static int skip_to_start(int c, colnr_T *colp)
4930{
4931 const char_u *const s = cstrchr(regline + *colp, c);
4932 if (s == NULL) {
4933 return FAIL;
4934 }
4935 *colp = (int)(s - regline);
4936 return OK;
4937}
4938
4939/*
4940 * Check for a match with match_text.
4941 * Called after skip_to_start() has found regstart.
4942 * Returns zero for no match, 1 for a match.
4943 */
4944static long find_match_text(colnr_T startcol, int regstart, char_u *match_text)
4945{
4946#define PTR2LEN(x) enc_utf8 ? utf_ptr2len(x) : MB_PTR2LEN(x)
4947
4948 colnr_T col = startcol;
4949 int regstart_len = PTR2LEN(regline + startcol);
4950
4951 for (;;) {
4952 bool match = true;
4953 char_u *s1 = match_text;
4954 char_u *s2 = regline + col + regstart_len; // skip regstart
4955 while (*s1) {
4956 int c1_len = PTR2LEN(s1);
4957 int c1 = PTR2CHAR(s1);
4958 int c2_len = PTR2LEN(s2);
4959 int c2 = PTR2CHAR(s2);
4960
4961 if ((c1 != c2 && (!rex.reg_ic || mb_tolower(c1) != mb_tolower(c2)))
4962 || c1_len != c2_len) {
4963 match = false;
4964 break;
4965 }
4966 s1 += c1_len;
4967 s2 += c2_len;
4968 }
4969 if (match
4970 // check that no composing char follows
4971 && !(enc_utf8 && utf_iscomposing(PTR2CHAR(s2)))) {
4972 cleanup_subexpr();
4973 if (REG_MULTI) {
4974 rex.reg_startpos[0].lnum = reglnum;
4975 rex.reg_startpos[0].col = col;
4976 rex.reg_endpos[0].lnum = reglnum;
4977 rex.reg_endpos[0].col = s2 - regline;
4978 } else {
4979 rex.reg_startp[0] = regline + col;
4980 rex.reg_endp[0] = s2;
4981 }
4982 return 1L;
4983 }
4984
4985 // Try finding regstart after the current match.
4986 col += regstart_len; // skip regstart
4987 if (skip_to_start(regstart, &col) == FAIL) {
4988 break;
4989 }
4990 }
4991 return 0L;
4992
4993#undef PTR2LEN
4994}
4995
4996static int nfa_did_time_out(void)
4997{
4998 if (nfa_time_limit != NULL && profile_passed_limit(*nfa_time_limit)) {
4999 if (nfa_timed_out != NULL) {
5000 *nfa_timed_out = true;
5001 }
5002 return true;
5003 }
5004 return false;
5005}
5006
5007/// Main matching routine.
5008///
5009/// Run NFA to determine whether it matches reginput.
5010///
5011/// When "nfa_endp" is not NULL it is a required end-of-match position.
5012///
5013/// Return TRUE if there is a match, FALSE if there is no match,
5014/// NFA_TOO_EXPENSIVE if we end up with too many states.
5015/// When there is a match "submatch" contains the positions.
5016///
5017/// Note: Caller must ensure that: start != NULL.
5018static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start,
5019 regsubs_T *submatch, regsubs_T *m)
5020{
5021 int result = false;
5022 int flag = 0;
5023 bool go_to_nextline = false;
5024 nfa_thread_T *t;
5025 nfa_list_T list[2];
5026 int listidx;
5027 nfa_list_T *thislist;
5028 nfa_list_T *nextlist;
5029 int *listids = NULL;
5030 int listids_len = 0;
5031 nfa_state_T *add_state;
5032 bool add_here;
5033 int add_count;
5034 int add_off = 0;
5035 int toplevel = start->c == NFA_MOPEN;
5036 regsubs_T *r;
5037#ifdef NFA_REGEXP_DEBUG_LOG
5038 FILE *debug = fopen(NFA_REGEXP_DEBUG_LOG, "a");
5039
5040 if (debug == NULL) {
5041 EMSG2("(NFA) COULD NOT OPEN %s!", NFA_REGEXP_DEBUG_LOG);
5042 return false;
5043 }
5044#endif
5045 // Some patterns may take a long time to match, especially when using
5046 // recursive_regmatch(). Allow interrupting them with CTRL-C.
5047 fast_breakcheck();
5048 if (got_int) {
5049#ifdef NFA_REGEXP_DEBUG_LOG
5050 fclose(debug);
5051#endif
5052 return false;
5053 }
5054 if (nfa_did_time_out()) {
5055#ifdef NFA_REGEXP_DEBUG_LOG
5056 fclose(debug);
5057#endif
5058 return false;
5059 }
5060
5061 nfa_match = false;
5062
5063 // Allocate memory for the lists of nodes.
5064 size_t size = (nstate + 1) * sizeof(nfa_thread_T);
5065 list[0].t = xmalloc(size);
5066 list[0].len = nstate + 1;
5067 list[1].t = xmalloc(size);
5068 list[1].len = nstate + 1;
5069
5070#ifdef REGEXP_DEBUG
5071 log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
5072 if (log_fd != NULL) {
5073 fprintf(log_fd, "**********************************\n");
5074 nfa_set_code(start->c);
5075 fprintf(log_fd, " RUNNING nfa_regmatch() starting with state %d, code %s\n",
5076 abs(start->id), code);
5077 fprintf(log_fd, "**********************************\n");
5078 } else {
5079 EMSG(_(e_log_open_failed));
5080 log_fd = stderr;
5081 }
5082#endif
5083
5084 thislist = &list[0];
5085 thislist->n = 0;
5086 thislist->has_pim = FALSE;
5087 nextlist = &list[1];
5088 nextlist->n = 0;
5089 nextlist->has_pim = FALSE;
5090#ifdef REGEXP_DEBUG
5091 fprintf(log_fd, "(---) STARTSTATE first\n");
5092#endif
5093 thislist->id = nfa_listid + 1;
5094
5095 /* Inline optimized code for addstate(thislist, start, m, 0) if we know
5096 * it's the first MOPEN. */
5097 if (toplevel) {
5098 if (REG_MULTI) {
5099 m->norm.list.multi[0].start_lnum = reglnum;
5100 m->norm.list.multi[0].start_col = (colnr_T)(reginput - regline);
5101 } else
5102 m->norm.list.line[0].start = reginput;
5103 m->norm.in_use = 1;
5104 r = addstate(thislist, start->out, m, NULL, 0);
5105 } else {
5106 r = addstate(thislist, start, m, NULL, 0);
5107 }
5108 if (r == NULL) {
5109 nfa_match = NFA_TOO_EXPENSIVE;
5110 goto theend;
5111 }
5112
5113#define ADD_STATE_IF_MATCH(state) \
5114 if (result) { \
5115 add_state = state->out; \
5116 add_off = clen; \
5117 }
5118
5119 /*
5120 * Run for each character.
5121 */
5122 for (;; ) {
5123 int curc = utf_ptr2char(reginput);
5124 int clen = utfc_ptr2len(reginput);
5125 if (curc == NUL) {
5126 clen = 0;
5127 go_to_nextline = false;
5128 }
5129
5130 /* swap lists */
5131 thislist = &list[flag];
5132 nextlist = &list[flag ^= 1];
5133 nextlist->n = 0; // clear nextlist
5134 nextlist->has_pim = false;
5135 nfa_listid++;
5136 if (prog->re_engine == AUTOMATIC_ENGINE
5137 && (nfa_listid >= NFA_MAX_STATES)) {
5138 // Too many states, retry with old engine.
5139 nfa_match = NFA_TOO_EXPENSIVE;
5140 goto theend;
5141 }
5142
5143 thislist->id = nfa_listid;
5144 nextlist->id = nfa_listid + 1;
5145
5146#ifdef REGEXP_DEBUG
5147 fprintf(log_fd, "------------------------------------------\n");
5148 fprintf(log_fd, ">>> Reginput is \"%s\"\n", reginput);
5149 fprintf(log_fd,
5150 ">>> Advanced one character... Current char is %c (code %d) \n",
5151 curc,
5152 (int)curc);
5153 fprintf(log_fd, ">>> Thislist has %d states available: ", thislist->n);
5154 {
5155 int i;
5156
5157 for (i = 0; i < thislist->n; i++)
5158 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
5159 }
5160 fprintf(log_fd, "\n");
5161#endif
5162
5163#ifdef NFA_REGEXP_DEBUG_LOG
5164 fprintf(debug, "\n-------------------\n");
5165#endif
5166 /*
5167 * If the state lists are empty we can stop.
5168 */
5169 if (thislist->n == 0)
5170 break;
5171
5172 // compute nextlist
5173 for (listidx = 0; listidx < thislist->n; listidx++) {
5174 // If the list gets very long there probably is something wrong.
5175 // At least allow interrupting with CTRL-C.
5176 fast_breakcheck();
5177 if (got_int) {
5178 break;
5179 }
5180 if (nfa_time_limit != NULL && ++nfa_time_count == 20) {
5181 nfa_time_count = 0;
5182 if (nfa_did_time_out()) {
5183 break;
5184 }
5185 }
5186 t = &thislist->t[listidx];
5187
5188#ifdef NFA_REGEXP_DEBUG_LOG
5189 nfa_set_code(t->state->c);
5190 fprintf(debug, "%s, ", code);
5191#endif
5192#ifdef REGEXP_DEBUG
5193 {
5194 int col;
5195
5196 if (t->subs.norm.in_use <= 0) {
5197 col = -1;
5198 } else if (REG_MULTI) {
5199 col = t->subs.norm.list.multi[0].start_col;
5200 } else {
5201 col = (int)(t->subs.norm.list.line[0].start - regline);
5202 }
5203 nfa_set_code(t->state->c);
5204 fprintf(log_fd, "(%d) char %d %s (start col %d)%s... \n",
5205 abs(t->state->id), (int)t->state->c, code, col,
5206 pim_info(&t->pim));
5207 }
5208#endif
5209
5210 /*
5211 * Handle the possible codes of the current state.
5212 * The most important is NFA_MATCH.
5213 */
5214 add_state = NULL;
5215 add_here = false;
5216 add_count = 0;
5217 switch (t->state->c) {
5218 case NFA_MATCH:
5219 {
5220 // If the match ends before a composing characters and
5221 // rex.reg_icombine is not set, that is not really a match.
5222 if (enc_utf8 && !rex.reg_icombine && utf_iscomposing(curc)) {
5223 break;
5224 }
5225 nfa_match = true;
5226 copy_sub(&submatch->norm, &t->subs.norm);
5227 if (nfa_has_zsubexpr)
5228 copy_sub(&submatch->synt, &t->subs.synt);
5229#ifdef REGEXP_DEBUG
5230 log_subsexpr(&t->subs);
5231#endif
5232 /* Found the left-most longest match, do not look at any other
5233 * states at this position. When the list of states is going
5234 * to be empty quit without advancing, so that "reginput" is
5235 * correct. */
5236 if (nextlist->n == 0)
5237 clen = 0;
5238 goto nextchar;
5239 }
5240
5241 case NFA_END_INVISIBLE:
5242 case NFA_END_INVISIBLE_NEG:
5243 case NFA_END_PATTERN:
5244 /*
5245 * This is only encountered after a NFA_START_INVISIBLE or
5246 * NFA_START_INVISIBLE_BEFORE node.
5247 * They surround a zero-width group, used with "\@=", "\&",
5248 * "\@!", "\@<=" and "\@<!".
5249 * If we got here, it means that the current "invisible" group
5250 * finished successfully, so return control to the parent
5251 * nfa_regmatch(). For a look-behind match only when it ends
5252 * in the position in "nfa_endp".
5253 * Submatches are stored in *m, and used in the parent call.
5254 */
5255#ifdef REGEXP_DEBUG
5256 if (nfa_endp != NULL) {
5257 if (REG_MULTI)
5258 fprintf(
5259 log_fd,
5260 "Current lnum: %d, endp lnum: %d; current col: %d, endp col: %d\n",
5261 (int)reglnum,
5262 (int)nfa_endp->se_u.pos.lnum,
5263 (int)(reginput - regline),
5264 nfa_endp->se_u.pos.col);
5265 else
5266 fprintf(log_fd, "Current col: %d, endp col: %d\n",
5267 (int)(reginput - regline),
5268 (int)(nfa_endp->se_u.ptr - reginput));
5269 }
5270#endif
5271 /* If "nfa_endp" is set it's only a match if it ends at
5272 * "nfa_endp" */
5273 if (nfa_endp != NULL && (REG_MULTI
5274 ? (reglnum != nfa_endp->se_u.pos.lnum
5275 || (int)(reginput - regline)
5276 != nfa_endp->se_u.pos.col)
5277 : reginput != nfa_endp->se_u.ptr))
5278 break;
5279
5280 /* do not set submatches for \@! */
5281 if (t->state->c != NFA_END_INVISIBLE_NEG) {
5282 copy_sub(&m->norm, &t->subs.norm);
5283 if (nfa_has_zsubexpr)
5284 copy_sub(&m->synt, &t->subs.synt);
5285 }
5286#ifdef REGEXP_DEBUG
5287 fprintf(log_fd, "Match found:\n");
5288 log_subsexpr(m);
5289#endif
5290 nfa_match = true;
5291 // See comment above at "goto nextchar".
5292 if (nextlist->n == 0) {
5293 clen = 0;
5294 }
5295 goto nextchar;
5296
5297 case NFA_START_INVISIBLE:
5298 case NFA_START_INVISIBLE_FIRST:
5299 case NFA_START_INVISIBLE_NEG:
5300 case NFA_START_INVISIBLE_NEG_FIRST:
5301 case NFA_START_INVISIBLE_BEFORE:
5302 case NFA_START_INVISIBLE_BEFORE_FIRST:
5303 case NFA_START_INVISIBLE_BEFORE_NEG:
5304 case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
5305 {
5306#ifdef REGEXP_DEBUG
5307 fprintf(log_fd, "Failure chance invisible: %d, what follows: %d\n",
5308 failure_chance(t->state->out, 0),
5309 failure_chance(t->state->out1->out, 0));
5310#endif
5311 // Do it directly if there already is a PIM or when
5312 // nfa_postprocess() detected it will work better.
5313 if (t->pim.result != NFA_PIM_UNUSED
5314 || t->state->c == NFA_START_INVISIBLE_FIRST
5315 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST
5316 || t->state->c == NFA_START_INVISIBLE_BEFORE_FIRST
5317 || t->state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST) {
5318 int in_use = m->norm.in_use;
5319
5320 // Copy submatch info for the recursive call, opposite
5321 // of what happens on success below.
5322 copy_sub_off(&m->norm, &t->subs.norm);
5323 if (nfa_has_zsubexpr)
5324 copy_sub_off(&m->synt, &t->subs.synt);
5325
5326 // First try matching the invisible match, then what
5327 // follows.
5328 result = recursive_regmatch(t->state, NULL, prog, submatch, m,
5329 &listids, &listids_len);
5330 if (result == NFA_TOO_EXPENSIVE) {
5331 nfa_match = result;
5332 goto theend;
5333 }
5334
5335 // for \@! and \@<! it is a match when the result is
5336 // FALSE
5337 if (result != (t->state->c == NFA_START_INVISIBLE_NEG
5338 || t->state->c == NFA_START_INVISIBLE_NEG_FIRST
5339 || t->state->c
5340 == NFA_START_INVISIBLE_BEFORE_NEG
5341 || t->state->c
5342 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)) {
5343 // Copy submatch info from the recursive call
5344 copy_sub_off(&t->subs.norm, &m->norm);
5345 if (nfa_has_zsubexpr)
5346 copy_sub_off(&t->subs.synt, &m->synt);
5347 // If the pattern has \ze and it matched in the
5348 // sub pattern, use it.
5349 copy_ze_off(&t->subs.norm, &m->norm);
5350
5351 // t->state->out1 is the corresponding
5352 // END_INVISIBLE node; Add its out to the current
5353 // list (zero-width match).
5354 add_here = true;
5355 add_state = t->state->out1->out;
5356 }
5357 m->norm.in_use = in_use;
5358 } else {
5359 nfa_pim_T pim;
5360
5361 // First try matching what follows. Only if a match
5362 // is found verify the invisible match matches. Add a
5363 // nfa_pim_T to the following states, it contains info
5364 // about the invisible match.
5365 pim.state = t->state;
5366 pim.result = NFA_PIM_TODO;
5367 pim.subs.norm.in_use = 0;
5368 pim.subs.synt.in_use = 0;
5369 if (REG_MULTI) {
5370 pim.end.pos.col = (int)(reginput - regline);
5371 pim.end.pos.lnum = reglnum;
5372 } else
5373 pim.end.ptr = reginput;
5374
5375 // t->state->out1 is the corresponding END_INVISIBLE
5376 // node; Add its out to the current list (zero-width
5377 // match).
5378 if (addstate_here(thislist, t->state->out1->out, &t->subs,
5379 &pim, &listidx) == NULL) {
5380 nfa_match = NFA_TOO_EXPENSIVE;
5381 goto theend;
5382 }
5383 }
5384 }
5385 break;
5386
5387 case NFA_START_PATTERN:
5388 {
5389 nfa_state_T *skip = NULL;
5390#ifdef REGEXP_DEBUG
5391 int skip_lid = 0;
5392#endif
5393
5394 // There is no point in trying to match the pattern if the
5395 // output state is not going to be added to the list.
5396 if (state_in_list(nextlist, t->state->out1->out, &t->subs)) {
5397 skip = t->state->out1->out;
5398#ifdef REGEXP_DEBUG
5399 skip_lid = nextlist->id;
5400#endif
5401 } else if (state_in_list(nextlist,
5402 t->state->out1->out->out, &t->subs)) {
5403 skip = t->state->out1->out->out;
5404#ifdef REGEXP_DEBUG
5405 skip_lid = nextlist->id;
5406#endif
5407 } else if (state_in_list(thislist,
5408 t->state->out1->out->out, &t->subs)) {
5409 skip = t->state->out1->out->out;
5410#ifdef REGEXP_DEBUG
5411 skip_lid = thislist->id;
5412#endif
5413 }
5414 if (skip != NULL) {
5415#ifdef REGEXP_DEBUG
5416 nfa_set_code(skip->c);
5417 fprintf(
5418 log_fd,
5419 "> Not trying to match pattern, output state %d is already in list %d. char %d: %s\n",
5420 abs(skip->id), skip_lid, skip->c, code);
5421#endif
5422 break;
5423 }
5424 // Copy submatch info to the recursive call, opposite of what
5425 // happens afterwards.
5426 copy_sub_off(&m->norm, &t->subs.norm);
5427 if (nfa_has_zsubexpr) {
5428 copy_sub_off(&m->synt, &t->subs.synt);
5429 }
5430
5431 // First try matching the pattern.
5432 result = recursive_regmatch(t->state, NULL, prog, submatch, m,
5433 &listids, &listids_len);
5434 if (result == NFA_TOO_EXPENSIVE) {
5435 nfa_match = result;
5436 goto theend;
5437 }
5438 if (result) {
5439 int bytelen;
5440
5441#ifdef REGEXP_DEBUG
5442 fprintf(log_fd, "NFA_START_PATTERN matches:\n");
5443 log_subsexpr(m);
5444#endif
5445 // Copy submatch info from the recursive call
5446 copy_sub_off(&t->subs.norm, &m->norm);
5447 if (nfa_has_zsubexpr) {
5448 copy_sub_off(&t->subs.synt, &m->synt);
5449 }
5450 // Now we need to skip over the matched text and then
5451 // continue with what follows.
5452 if (REG_MULTI) {
5453 // TODO(RE): multi-line match
5454 bytelen = m->norm.list.multi[0].end_col
5455 - (int)(reginput - regline);
5456 } else {
5457 bytelen = (int)(m->norm.list.line[0].end - reginput);
5458 }
5459
5460#ifdef REGEXP_DEBUG
5461 fprintf(log_fd, "NFA_START_PATTERN length: %d\n", bytelen);
5462#endif
5463 if (bytelen == 0) {
5464 // empty match, output of corresponding
5465 // NFA_END_PATTERN/NFA_SKIP to be used at current
5466 // position
5467 add_here = true;
5468 add_state = t->state->out1->out->out;
5469 } else if (bytelen <= clen) {
5470 // match current character, output of corresponding
5471 // NFA_END_PATTERN to be used at next position.
5472 add_state = t->state->out1->out->out;
5473 add_off = clen;
5474 } else {
5475 // skip over the matched characters, set character
5476 // count in NFA_SKIP
5477 add_state = t->state->out1->out;
5478 add_off = bytelen;
5479 add_count = bytelen - clen;
5480 }
5481 }
5482 break;
5483 }
5484
5485 case NFA_BOL:
5486 if (reginput == regline) {
5487 add_here = true;
5488 add_state = t->state->out;
5489 }
5490 break;
5491
5492 case NFA_EOL:
5493 if (curc == NUL) {
5494 add_here = true;
5495 add_state = t->state->out;
5496 }
5497 break;
5498
5499 case NFA_BOW:
5500 result = true;
5501
5502 if (curc == NUL) {
5503 result = false;
5504 } else if (has_mbyte) {
5505 int this_class;
5506
5507 // Get class of current and previous char (if it exists).
5508 this_class = mb_get_class_tab(reginput, rex.reg_buf->b_chartab);
5509 if (this_class <= 1) {
5510 result = false;
5511 } else if (reg_prev_class() == this_class) {
5512 result = false;
5513 }
5514 } else if (!vim_iswordc_buf(curc, rex.reg_buf)
5515 || (reginput > regline
5516 && vim_iswordc_buf(reginput[-1], rex.reg_buf))) {
5517 result = false;
5518 }
5519 if (result) {
5520 add_here = true;
5521 add_state = t->state->out;
5522 }
5523 break;
5524
5525 case NFA_EOW:
5526 result = true;
5527 if (reginput == regline) {
5528 result = false;
5529 } else if (has_mbyte) {
5530 int this_class, prev_class;
5531
5532 // Get class of current and previous char (if it exists).
5533 this_class = mb_get_class_tab(reginput, rex.reg_buf->b_chartab);
5534 prev_class = reg_prev_class();
5535 if (this_class == prev_class
5536 || prev_class == 0 || prev_class == 1) {
5537 result = false;
5538 }
5539 } else if (!vim_iswordc_buf(reginput[-1], rex.reg_buf)
5540 || (reginput[0] != NUL
5541 && vim_iswordc_buf(curc, rex.reg_buf))) {
5542 result = false;
5543 }
5544 if (result) {
5545 add_here = true;
5546 add_state = t->state->out;
5547 }
5548 break;
5549
5550 case NFA_BOF:
5551 if (reglnum == 0 && reginput == regline
5552 && (!REG_MULTI || rex.reg_firstlnum == 1)) {
5553 add_here = true;
5554 add_state = t->state->out;
5555 }
5556 break;
5557
5558 case NFA_EOF:
5559 if (reglnum == rex.reg_maxline && curc == NUL) {
5560 add_here = true;
5561 add_state = t->state->out;
5562 }
5563 break;
5564
5565 case NFA_COMPOSING:
5566 {
5567 int mc = curc;
5568 int len = 0;
5569 nfa_state_T *end;
5570 nfa_state_T *sta;
5571 int cchars[MAX_MCO];
5572 int ccount = 0;
5573 int j;
5574
5575 sta = t->state->out;
5576 len = 0;
5577 if (utf_iscomposing(sta->c)) {
5578 // Only match composing character(s), ignore base
5579 // character. Used for ".{composing}" and "{composing}"
5580 // (no preceding character).
5581 len += mb_char2len(mc);
5582 }
5583 if (rex.reg_icombine && len == 0) {
5584 // If \Z was present, then ignore composing characters.
5585 // When ignoring the base character this always matches.
5586 if (sta->c != curc) {
5587 result = FAIL;
5588 } else {
5589 result = OK;
5590 }
5591 while (sta->c != NFA_END_COMPOSING) {
5592 sta = sta->out;
5593 }
5594 } else if (len > 0 || mc == sta->c) {
5595 // Check base character matches first, unless ignored.
5596 if (len == 0) {
5597 len += mb_char2len(mc);
5598 sta = sta->out;
5599 }
5600
5601 // We don't care about the order of composing characters.
5602 // Get them into cchars[] first.
5603 while (len < clen) {
5604 mc = utf_ptr2char(reginput + len);
5605 cchars[ccount++] = mc;
5606 len += mb_char2len(mc);
5607 if (ccount == MAX_MCO)
5608 break;
5609 }
5610
5611 // Check that each composing char in the pattern matches a
5612 // composing char in the text. We do not check if all
5613 // composing chars are matched.
5614 result = OK;
5615 while (sta->c != NFA_END_COMPOSING) {
5616 for (j = 0; j < ccount; ++j)
5617 if (cchars[j] == sta->c)
5618 break;
5619 if (j == ccount) {
5620 result = FAIL;
5621 break;
5622 }
5623 sta = sta->out;
5624 }
5625 } else
5626 result = FAIL;
5627
5628 end = t->state->out1; // NFA_END_COMPOSING
5629 ADD_STATE_IF_MATCH(end);
5630 break;
5631 }
5632
5633 case NFA_NEWL:
5634 if (curc == NUL && !rex.reg_line_lbr && REG_MULTI
5635 && reglnum <= rex.reg_maxline) {
5636 go_to_nextline = true;
5637 // Pass -1 for the offset, which means taking the position
5638 // at the start of the next line.
5639 add_state = t->state->out;
5640 add_off = -1;
5641 } else if (curc == '\n' && rex.reg_line_lbr) {
5642 // match \n as if it is an ordinary character
5643 add_state = t->state->out;
5644 add_off = 1;
5645 }
5646 break;
5647
5648 case NFA_START_COLL:
5649 case NFA_START_NEG_COLL:
5650 {
5651 // What follows is a list of characters, until NFA_END_COLL.
5652 // One of them must match or none of them must match.
5653 nfa_state_T *state;
5654 int result_if_matched;
5655 int c1, c2;
5656
5657 // Never match EOL. If it's part of the collection it is added
5658 // as a separate state with an OR.
5659 if (curc == NUL) {
5660 break;
5661 }
5662
5663 state = t->state->out;
5664 result_if_matched = (t->state->c == NFA_START_COLL);
5665 for (;; ) {
5666 if (state->c == NFA_END_COLL) {
5667 result = !result_if_matched;
5668 break;
5669 }
5670 if (state->c == NFA_RANGE_MIN) {
5671 c1 = state->val;
5672 state = state->out; // advance to NFA_RANGE_MAX
5673 c2 = state->val;
5674#ifdef REGEXP_DEBUG
5675 fprintf(log_fd, "NFA_RANGE_MIN curc=%d c1=%d c2=%d\n",
5676 curc, c1, c2);
5677#endif
5678 if (curc >= c1 && curc <= c2) {
5679 result = result_if_matched;
5680 break;
5681 }
5682 if (rex.reg_ic) {
5683 int curc_low = mb_tolower(curc);
5684 int done = false;
5685
5686 for (; c1 <= c2; c1++) {
5687 if (mb_tolower(c1) == curc_low) {
5688 result = result_if_matched;
5689 done = TRUE;
5690 break;
5691 }
5692 }
5693 if (done) {
5694 break;
5695 }
5696 }
5697 } else if (state->c < 0 ? check_char_class(state->c, curc)
5698 : (curc == state->c
5699 || (rex.reg_ic && mb_tolower(curc)
5700 == mb_tolower(state->c)))) {
5701 result = result_if_matched;
5702 break;
5703 }
5704 state = state->out;
5705 }
5706 if (result) {
5707 // next state is in out of the NFA_END_COLL, out1 of
5708 // START points to the END state
5709 add_state = t->state->out1->out;
5710 add_off = clen;
5711 }
5712 break;
5713 }
5714
5715 case NFA_ANY:
5716 // Any char except '\0', (end of input) does not match.
5717 if (curc > 0) {
5718 add_state = t->state->out;
5719 add_off = clen;
5720 }
5721 break;
5722
5723 case NFA_ANY_COMPOSING:
5724 // On a composing character skip over it. Otherwise do
5725 // nothing. Always matches.
5726 if (enc_utf8 && utf_iscomposing(curc)) {
5727 add_off = clen;
5728 } else {
5729 add_here = true;
5730 add_off = 0;
5731 }
5732 add_state = t->state->out;
5733 break;
5734
5735 // Character classes like \a for alpha, \d for digit etc.
5736 case NFA_IDENT: // \i
5737 result = vim_isIDc(curc);
5738 ADD_STATE_IF_MATCH(t->state);
5739 break;
5740
5741 case NFA_SIDENT: // \I
5742 result = !ascii_isdigit(curc) && vim_isIDc(curc);
5743 ADD_STATE_IF_MATCH(t->state);
5744 break;
5745
5746 case NFA_KWORD: // \k
5747 result = vim_iswordp_buf(reginput, rex.reg_buf);
5748 ADD_STATE_IF_MATCH(t->state);
5749 break;
5750
5751 case NFA_SKWORD: // \K
5752 result = !ascii_isdigit(curc)
5753 && vim_iswordp_buf(reginput, rex.reg_buf);
5754 ADD_STATE_IF_MATCH(t->state);
5755 break;
5756
5757 case NFA_FNAME: // \f
5758 result = vim_isfilec(curc);
5759 ADD_STATE_IF_MATCH(t->state);
5760 break;
5761
5762 case NFA_SFNAME: // \F
5763 result = !ascii_isdigit(curc) && vim_isfilec(curc);
5764 ADD_STATE_IF_MATCH(t->state);
5765 break;
5766
5767 case NFA_PRINT: // \p
5768 result = vim_isprintc(PTR2CHAR(reginput));
5769 ADD_STATE_IF_MATCH(t->state);
5770 break;
5771
5772 case NFA_SPRINT: // \P
5773 result = !ascii_isdigit(curc) && vim_isprintc(PTR2CHAR(reginput));
5774 ADD_STATE_IF_MATCH(t->state);
5775 break;
5776
5777 case NFA_WHITE: // \s
5778 result = ascii_iswhite(curc);
5779 ADD_STATE_IF_MATCH(t->state);
5780 break;
5781
5782 case NFA_NWHITE: // \S
5783 result = curc != NUL && !ascii_iswhite(curc);
5784 ADD_STATE_IF_MATCH(t->state);
5785 break;
5786
5787 case NFA_DIGIT: // \d
5788 result = ri_digit(curc);
5789 ADD_STATE_IF_MATCH(t->state);
5790 break;
5791
5792 case NFA_NDIGIT: // \D
5793 result = curc != NUL && !ri_digit(curc);
5794 ADD_STATE_IF_MATCH(t->state);
5795 break;
5796
5797 case NFA_HEX: // \x
5798 result = ri_hex(curc);
5799 ADD_STATE_IF_MATCH(t->state);
5800 break;
5801
5802 case NFA_NHEX: // \X
5803 result = curc != NUL && !ri_hex(curc);
5804 ADD_STATE_IF_MATCH(t->state);
5805 break;
5806
5807 case NFA_OCTAL: // \o
5808 result = ri_octal(curc);
5809 ADD_STATE_IF_MATCH(t->state);
5810 break;
5811
5812 case NFA_NOCTAL: // \O
5813 result = curc != NUL && !ri_octal(curc);
5814 ADD_STATE_IF_MATCH(t->state);
5815 break;
5816
5817 case NFA_WORD: // \w
5818 result = ri_word(curc);
5819 ADD_STATE_IF_MATCH(t->state);
5820 break;
5821
5822 case NFA_NWORD: // \W
5823 result = curc != NUL && !ri_word(curc);
5824 ADD_STATE_IF_MATCH(t->state);
5825 break;
5826
5827 case NFA_HEAD: // \h
5828 result = ri_head(curc);
5829 ADD_STATE_IF_MATCH(t->state);
5830 break;
5831
5832 case NFA_NHEAD: // \H
5833 result = curc != NUL && !ri_head(curc);
5834 ADD_STATE_IF_MATCH(t->state);
5835 break;
5836
5837 case NFA_ALPHA: // \a
5838 result = ri_alpha(curc);
5839 ADD_STATE_IF_MATCH(t->state);
5840 break;
5841
5842 case NFA_NALPHA: // \A
5843 result = curc != NUL && !ri_alpha(curc);
5844 ADD_STATE_IF_MATCH(t->state);
5845 break;
5846
5847 case NFA_LOWER: // \l
5848 result = ri_lower(curc);
5849 ADD_STATE_IF_MATCH(t->state);
5850 break;
5851
5852 case NFA_NLOWER: // \L
5853 result = curc != NUL && !ri_lower(curc);
5854 ADD_STATE_IF_MATCH(t->state);
5855 break;
5856
5857 case NFA_UPPER: // \u
5858 result = ri_upper(curc);
5859 ADD_STATE_IF_MATCH(t->state);
5860 break;
5861
5862 case NFA_NUPPER: // \U
5863 result = curc != NUL && !ri_upper(curc);
5864 ADD_STATE_IF_MATCH(t->state);
5865 break;
5866
5867 case NFA_LOWER_IC: // [a-z]
5868 result = ri_lower(curc) || (rex.reg_ic && ri_upper(curc));
5869 ADD_STATE_IF_MATCH(t->state);
5870 break;
5871
5872 case NFA_NLOWER_IC: // [^a-z]
5873 result = curc != NUL
5874 && !(ri_lower(curc) || (rex.reg_ic && ri_upper(curc)));
5875 ADD_STATE_IF_MATCH(t->state);
5876 break;
5877
5878 case NFA_UPPER_IC: // [A-Z]
5879 result = ri_upper(curc) || (rex.reg_ic && ri_lower(curc));
5880 ADD_STATE_IF_MATCH(t->state);
5881 break;
5882
5883 case NFA_NUPPER_IC: // [^A-Z]
5884 result = curc != NUL
5885 && !(ri_upper(curc) || (rex.reg_ic && ri_lower(curc)));
5886 ADD_STATE_IF_MATCH(t->state);
5887 break;
5888
5889 case NFA_BACKREF1:
5890 case NFA_BACKREF2:
5891 case NFA_BACKREF3:
5892 case NFA_BACKREF4:
5893 case NFA_BACKREF5:
5894 case NFA_BACKREF6:
5895 case NFA_BACKREF7:
5896 case NFA_BACKREF8:
5897 case NFA_BACKREF9:
5898 case NFA_ZREF1:
5899 case NFA_ZREF2:
5900 case NFA_ZREF3:
5901 case NFA_ZREF4:
5902 case NFA_ZREF5:
5903 case NFA_ZREF6:
5904 case NFA_ZREF7:
5905 case NFA_ZREF8:
5906 case NFA_ZREF9:
5907 // \1 .. \9 \z1 .. \z9
5908 {
5909 int subidx;
5910 int bytelen;
5911
5912 if (t->state->c <= NFA_BACKREF9) {
5913 subidx = t->state->c - NFA_BACKREF1 + 1;
5914 result = match_backref(&t->subs.norm, subidx, &bytelen);
5915 } else {
5916 subidx = t->state->c - NFA_ZREF1 + 1;
5917 result = match_zref(subidx, &bytelen);
5918 }
5919
5920 if (result) {
5921 if (bytelen == 0) {
5922 // empty match always works, output of NFA_SKIP to be
5923 // used next
5924 add_here = true;
5925 add_state = t->state->out->out;
5926 } else if (bytelen <= clen) {
5927 // match current character, jump ahead to out of
5928 // NFA_SKIP
5929 add_state = t->state->out->out;
5930 add_off = clen;
5931 } else {
5932 // skip over the matched characters, set character
5933 // count in NFA_SKIP
5934 add_state = t->state->out;
5935 add_off = bytelen;
5936 add_count = bytelen - clen;
5937 }
5938 }
5939 break;
5940 }
5941 case NFA_SKIP:
5942 // character of previous matching \1 .. \9 or \@>
5943 if (t->count - clen <= 0) {
5944 // end of match, go to what follows
5945 add_state = t->state->out;
5946 add_off = clen;
5947 } else {
5948 // add state again with decremented count
5949 add_state = t->state;
5950 add_off = 0;
5951 add_count = t->count - clen;
5952 }
5953 break;
5954
5955 case NFA_LNUM:
5956 case NFA_LNUM_GT:
5957 case NFA_LNUM_LT:
5958 assert(t->state->val >= 0
5959 && !((rex.reg_firstlnum > 0
5960 && reglnum > LONG_MAX - rex.reg_firstlnum)
5961 || (rex.reg_firstlnum < 0
5962 && reglnum < LONG_MIN + rex.reg_firstlnum))
5963 && reglnum + rex.reg_firstlnum >= 0);
5964 result = (REG_MULTI
5965 && nfa_re_num_cmp((uintmax_t)t->state->val,
5966 t->state->c - NFA_LNUM,
5967 (uintmax_t)(reglnum + rex.reg_firstlnum)));
5968 if (result) {
5969 add_here = true;
5970 add_state = t->state->out;
5971 }
5972 break;
5973
5974 case NFA_COL:
5975 case NFA_COL_GT:
5976 case NFA_COL_LT:
5977 assert(t->state->val >= 0
5978 && reginput >= regline
5979 && (uintmax_t)(reginput - regline) <= UINTMAX_MAX - 1);
5980 result = nfa_re_num_cmp((uintmax_t)t->state->val,
5981 t->state->c - NFA_COL,
5982 (uintmax_t)(reginput - regline + 1));
5983 if (result) {
5984 add_here = true;
5985 add_state = t->state->out;
5986 }
5987 break;
5988
5989 case NFA_VCOL:
5990 case NFA_VCOL_GT:
5991 case NFA_VCOL_LT:
5992 {
5993 int op = t->state->c - NFA_VCOL;
5994 colnr_T col = (colnr_T)(reginput - regline);
5995
5996 // Bail out quickly when there can't be a match, avoid the overhead of
5997 // win_linetabsize() on long lines.
5998 if (op != 1 && col > t->state->val * (has_mbyte ? MB_MAXBYTES : 1)) {
5999 break;
6000 }
6001
6002 result = false;
6003 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
6004 if (op == 1 && col - 1 > t->state->val && col > 100) {
6005 long ts = wp->w_buffer->b_p_ts;
6006
6007 // Guess that a character won't use more columns than 'tabstop',
6008 // with a minimum of 4.
6009 if (ts < 4) {
6010 ts = 4;
6011 }
6012 result = col > t->state->val * ts;
6013 }
6014 if (!result) {
6015 uintmax_t lts = win_linetabsize(wp, regline, col);
6016 assert(t->state->val >= 0);
6017 result = nfa_re_num_cmp((uintmax_t)t->state->val, op, lts + 1);
6018 }
6019 if (result) {
6020 add_here = true;
6021 add_state = t->state->out;
6022 }
6023 }
6024 break;
6025
6026 case NFA_MARK:
6027 case NFA_MARK_GT:
6028 case NFA_MARK_LT:
6029 {
6030 pos_T *pos = getmark_buf(rex.reg_buf, t->state->val, false);
6031
6032 // Compare the mark position to the match position.
6033 result = (pos != NULL // mark doesn't exist
6034 && pos->lnum > 0 // mark isn't set in reg_buf
6035 && (pos->lnum == reglnum + rex.reg_firstlnum
6036 ? (pos->col == (colnr_T)(reginput - regline)
6037 ? t->state->c == NFA_MARK
6038 : (pos->col < (colnr_T)(reginput - regline)
6039 ? t->state->c == NFA_MARK_GT
6040 : t->state->c == NFA_MARK_LT))
6041 : (pos->lnum < reglnum + rex.reg_firstlnum
6042 ? t->state->c == NFA_MARK_GT
6043 : t->state->c == NFA_MARK_LT)));
6044 if (result) {
6045 add_here = true;
6046 add_state = t->state->out;
6047 }
6048 break;
6049 }
6050
6051 case NFA_CURSOR:
6052 result = (rex.reg_win != NULL
6053 && (reglnum + rex.reg_firstlnum == rex.reg_win->w_cursor.lnum)
6054 && ((colnr_T)(reginput - regline)
6055 == rex.reg_win->w_cursor.col));
6056 if (result) {
6057 add_here = true;
6058 add_state = t->state->out;
6059 }
6060 break;
6061
6062 case NFA_VISUAL:
6063 result = reg_match_visual();
6064 if (result) {
6065 add_here = true;
6066 add_state = t->state->out;
6067 }
6068 break;
6069
6070 case NFA_MOPEN1:
6071 case NFA_MOPEN2:
6072 case NFA_MOPEN3:
6073 case NFA_MOPEN4:
6074 case NFA_MOPEN5:
6075 case NFA_MOPEN6:
6076 case NFA_MOPEN7:
6077 case NFA_MOPEN8:
6078 case NFA_MOPEN9:
6079 case NFA_ZOPEN:
6080 case NFA_ZOPEN1:
6081 case NFA_ZOPEN2:
6082 case NFA_ZOPEN3:
6083 case NFA_ZOPEN4:
6084 case NFA_ZOPEN5:
6085 case NFA_ZOPEN6:
6086 case NFA_ZOPEN7:
6087 case NFA_ZOPEN8:
6088 case NFA_ZOPEN9:
6089 case NFA_NOPEN:
6090 case NFA_ZSTART:
6091 // These states are only added to be able to bail out when
6092 // they are added again, nothing is to be done.
6093 break;
6094
6095 default: // regular character
6096 {
6097 int c = t->state->c;
6098
6099#ifdef REGEXP_DEBUG
6100 if (c < 0) {
6101 IEMSGN("INTERNAL: Negative state char: %" PRId64, c);
6102 }
6103#endif
6104 result = (c == curc);
6105
6106 if (!result && rex.reg_ic) {
6107 result = mb_tolower(c) == mb_tolower(curc);
6108 }
6109
6110 // If rex.reg_icombine is not set only skip over the character
6111 // itself. When it is set skip over composing characters.
6112 if (result && enc_utf8 && !rex.reg_icombine) {
6113 clen = utf_ptr2len(reginput);
6114 }
6115
6116 ADD_STATE_IF_MATCH(t->state);
6117 break;
6118 }
6119 } // switch (t->state->c)
6120
6121 if (add_state != NULL) {
6122 nfa_pim_T *pim;
6123 nfa_pim_T pim_copy;
6124
6125 if (t->pim.result == NFA_PIM_UNUSED)
6126 pim = NULL;
6127 else
6128 pim = &t->pim;
6129
6130 // Handle the postponed invisible match if the match might end
6131 // without advancing and before the end of the line.
6132 if (pim != NULL && (clen == 0 || match_follows(add_state, 0))) {
6133 if (pim->result == NFA_PIM_TODO) {
6134#ifdef REGEXP_DEBUG
6135 fprintf(log_fd, "\n");
6136 fprintf(log_fd, "==================================\n");
6137 fprintf(log_fd, "Postponed recursive nfa_regmatch()\n");
6138 fprintf(log_fd, "\n");
6139#endif
6140 result = recursive_regmatch(pim->state, pim, prog, submatch, m,
6141 &listids, &listids_len);
6142 pim->result = result ? NFA_PIM_MATCH : NFA_PIM_NOMATCH;
6143 // for \@! and \@<! it is a match when the result is
6144 // FALSE
6145 if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
6146 || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
6147 || pim->state->c
6148 == NFA_START_INVISIBLE_BEFORE_NEG
6149 || pim->state->c
6150 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)) {
6151 // Copy submatch info from the recursive call
6152 copy_sub_off(&pim->subs.norm, &m->norm);
6153 if (nfa_has_zsubexpr)
6154 copy_sub_off(&pim->subs.synt, &m->synt);
6155 }
6156 } else {
6157 result = (pim->result == NFA_PIM_MATCH);
6158#ifdef REGEXP_DEBUG
6159 fprintf(log_fd, "\n");
6160 fprintf(
6161 log_fd,
6162 "Using previous recursive nfa_regmatch() result, result == %d\n",
6163 pim->result);
6164 fprintf(log_fd, "MATCH = %s\n", result ? "OK" : "FALSE");
6165 fprintf(log_fd, "\n");
6166#endif
6167 }
6168
6169 // for \@! and \@<! it is a match when result is FALSE
6170 if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
6171 || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
6172 || pim->state->c
6173 == NFA_START_INVISIBLE_BEFORE_NEG
6174 || pim->state->c
6175 == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)) {
6176 // Copy submatch info from the recursive call
6177 copy_sub_off(&t->subs.norm, &pim->subs.norm);
6178 if (nfa_has_zsubexpr)
6179 copy_sub_off(&t->subs.synt, &pim->subs.synt);
6180 } else {
6181 // look-behind match failed, don't add the state
6182 continue;
6183 }
6184
6185 // Postponed invisible match was handled, don't add it to
6186 // following states.
6187 pim = NULL;
6188 }
6189
6190 // If "pim" points into l->t it will become invalid when
6191 // adding the state causes the list to be reallocated. Make a
6192 // local copy to avoid that.
6193 if (pim == &t->pim) {
6194 copy_pim(&pim_copy, pim);
6195 pim = &pim_copy;
6196 }
6197
6198 if (add_here) {
6199 r = addstate_here(thislist, add_state, &t->subs, pim, &listidx);
6200 } else {
6201 r = addstate(nextlist, add_state, &t->subs, pim, add_off);
6202 if (add_count > 0) {
6203 nextlist->t[nextlist->n - 1].count = add_count;
6204 }
6205 }
6206 if (r == NULL) {
6207 nfa_match = NFA_TOO_EXPENSIVE;
6208 goto theend;
6209 }
6210 }
6211 } // for (thislist = thislist; thislist->state; thislist++)
6212
6213 // Look for the start of a match in the current position by adding the
6214 // start state to the list of states.
6215 // The first found match is the leftmost one, thus the order of states
6216 // matters!
6217 // Do not add the start state in recursive calls of nfa_regmatch(),
6218 // because recursive calls should only start in the first position.
6219 // Unless "nfa_endp" is not NULL, then we match the end position.
6220 // Also don't start a match past the first line.
6221 if (!nfa_match
6222 && ((toplevel
6223 && reglnum == 0
6224 && clen != 0
6225 && (rex.reg_maxcol == 0
6226 || (colnr_T)(reginput - regline) < rex.reg_maxcol))
6227 || (nfa_endp != NULL
6228 && (REG_MULTI
6229 ? (reglnum < nfa_endp->se_u.pos.lnum
6230 || (reglnum == nfa_endp->se_u.pos.lnum
6231 && (int)(reginput - regline)
6232 < nfa_endp->se_u.pos.col))
6233 : reginput < nfa_endp->se_u.ptr)))) {
6234#ifdef REGEXP_DEBUG
6235 fprintf(log_fd, "(---) STARTSTATE\n");
6236#endif
6237 // Inline optimized code for addstate() if we know the state is
6238 // the first MOPEN.
6239 if (toplevel) {
6240 int add = TRUE;
6241 int c;
6242
6243 if (prog->regstart != NUL && clen != 0) {
6244 if (nextlist->n == 0) {
6245 colnr_T col = (colnr_T)(reginput - regline) + clen;
6246
6247 // Nextlist is empty, we can skip ahead to the
6248 // character that must appear at the start.
6249 if (skip_to_start(prog->regstart, &col) == FAIL) {
6250 break;
6251 }
6252#ifdef REGEXP_DEBUG
6253 fprintf(log_fd, " Skipping ahead %d bytes to regstart\n",
6254 col - ((colnr_T)(reginput - regline) + clen));
6255#endif
6256 reginput = regline + col - clen;
6257 } else {
6258 // Checking if the required start character matches is
6259 // cheaper than adding a state that won't match.
6260 c = PTR2CHAR(reginput + clen);
6261 if (c != prog->regstart && (!rex.reg_ic || mb_tolower(c)
6262 != mb_tolower(prog->regstart))) {
6263#ifdef REGEXP_DEBUG
6264 fprintf(log_fd,
6265 " Skipping start state, regstart does not match\n");
6266#endif
6267 add = FALSE;
6268 }
6269 }
6270 }
6271
6272 if (add) {
6273 if (REG_MULTI)
6274 m->norm.list.multi[0].start_col =
6275 (colnr_T)(reginput - regline) + clen;
6276 else
6277 m->norm.list.line[0].start = reginput + clen;
6278 if (addstate(nextlist, start->out, m, NULL, clen) == NULL) {
6279 nfa_match = NFA_TOO_EXPENSIVE;
6280 goto theend;
6281 }
6282 }
6283 } else {
6284 if (addstate(nextlist, start, m, NULL, clen) == NULL) {
6285 nfa_match = NFA_TOO_EXPENSIVE;
6286 goto theend;
6287 }
6288 }
6289 }
6290
6291#ifdef REGEXP_DEBUG
6292 fprintf(log_fd, ">>> Thislist had %d states available: ", thislist->n);
6293 {
6294 int i;
6295
6296 for (i = 0; i < thislist->n; i++)
6297 fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
6298 }
6299 fprintf(log_fd, "\n");
6300#endif
6301
6302nextchar:
6303 // Advance to the next character, or advance to the next line, or
6304 // finish.
6305 if (clen != 0) {
6306 reginput += clen;
6307 } else if (go_to_nextline || (nfa_endp != NULL && REG_MULTI
6308 && reglnum < nfa_endp->se_u.pos.lnum)) {
6309 reg_nextline();
6310 } else {
6311 break;
6312 }
6313
6314 // Allow interrupting with CTRL-C.
6315 line_breakcheck();
6316 if (got_int) {
6317 break;
6318 }
6319 // Check for timeout once every twenty times to avoid overhead.
6320 if (nfa_time_limit != NULL && ++nfa_time_count == 20) {
6321 nfa_time_count = 0;
6322 if (nfa_did_time_out()) {
6323 break;
6324 }
6325 }
6326 }
6327
6328#ifdef REGEXP_DEBUG
6329 if (log_fd != stderr)
6330 fclose(log_fd);
6331 log_fd = NULL;
6332#endif
6333
6334theend:
6335 // Free memory
6336 xfree(list[0].t);
6337 xfree(list[1].t);
6338 xfree(listids);
6339#undef ADD_STATE_IF_MATCH
6340#ifdef NFA_REGEXP_DEBUG_LOG
6341 fclose(debug);
6342#endif
6343
6344 return nfa_match;
6345}
6346
6347// Try match of "prog" with at regline["col"].
6348// Returns <= 0 for failure, number of lines contained in the match otherwise.
6349static long nfa_regtry(nfa_regprog_T *prog,
6350 colnr_T col,
6351 proftime_T *tm, // timeout limit or NULL
6352 int *timed_out) // flag set on timeout or NULL
6353{
6354 int i;
6355 regsubs_T subs, m;
6356 nfa_state_T *start = prog->start;
6357#ifdef REGEXP_DEBUG
6358 FILE *f;
6359#endif
6360
6361 reginput = regline + col;
6362 nfa_time_limit = tm;
6363 nfa_timed_out = timed_out;
6364 nfa_time_count = 0;
6365
6366#ifdef REGEXP_DEBUG
6367 f = fopen(NFA_REGEXP_RUN_LOG, "a");
6368 if (f != NULL) {
6369 fprintf(f,
6370 "\n\n\t=======================================================\n");
6371#ifdef REGEXP_DEBUG
6372 fprintf(f, "\tRegexp is \"%s\"\n", nfa_regengine.expr);
6373#endif
6374 fprintf(f, "\tInput text is \"%s\" \n", reginput);
6375 fprintf(f, "\t=======================================================\n\n");
6376 nfa_print_state(f, start);
6377 fprintf(f, "\n\n");
6378 fclose(f);
6379 } else {
6380 EMSG("Could not open temporary log file for writing");
6381 }
6382#endif
6383
6384 clear_sub(&subs.norm);
6385 clear_sub(&m.norm);
6386 clear_sub(&subs.synt);
6387 clear_sub(&m.synt);
6388
6389 int result = nfa_regmatch(prog, start, &subs, &m);
6390 if (!result) {
6391 return 0;
6392 } else if (result == NFA_TOO_EXPENSIVE) {
6393 return result;
6394 }
6395
6396 cleanup_subexpr();
6397 if (REG_MULTI) {
6398 for (i = 0; i < subs.norm.in_use; i++) {
6399 rex.reg_startpos[i].lnum = subs.norm.list.multi[i].start_lnum;
6400 rex.reg_startpos[i].col = subs.norm.list.multi[i].start_col;
6401
6402 rex.reg_endpos[i].lnum = subs.norm.list.multi[i].end_lnum;
6403 rex.reg_endpos[i].col = subs.norm.list.multi[i].end_col;
6404 }
6405
6406 if (rex.reg_startpos[0].lnum < 0) {
6407 rex.reg_startpos[0].lnum = 0;
6408 rex.reg_startpos[0].col = col;
6409 }
6410 if (rex.reg_endpos[0].lnum < 0) {
6411 // pattern has a \ze but it didn't match, use current end
6412 rex.reg_endpos[0].lnum = reglnum;
6413 rex.reg_endpos[0].col = (int)(reginput - regline);
6414 } else {
6415 // Use line number of "\ze".
6416 reglnum = rex.reg_endpos[0].lnum;
6417 }
6418 } else {
6419 for (i = 0; i < subs.norm.in_use; i++) {
6420 rex.reg_startp[i] = subs.norm.list.line[i].start;
6421 rex.reg_endp[i] = subs.norm.list.line[i].end;
6422 }
6423
6424 if (rex.reg_startp[0] == NULL) {
6425 rex.reg_startp[0] = regline + col;
6426 }
6427 if (rex.reg_endp[0] == NULL) {
6428 rex.reg_endp[0] = reginput;
6429 }
6430 }
6431
6432 /* Package any found \z(...\) matches for export. Default is none. */
6433 unref_extmatch(re_extmatch_out);
6434 re_extmatch_out = NULL;
6435
6436 if (prog->reghasz == REX_SET) {
6437 cleanup_zsubexpr();
6438 re_extmatch_out = make_extmatch();
6439 // Loop over \z1, \z2, etc. There is no \z0.
6440 for (i = 1; i < subs.synt.in_use; i++) {
6441 if (REG_MULTI) {
6442 struct multipos *mpos = &subs.synt.list.multi[i];
6443
6444 // Only accept single line matches that are valid.
6445 if (mpos->start_lnum >= 0
6446 && mpos->start_lnum == mpos->end_lnum
6447 && mpos->end_col >= mpos->start_col) {
6448 re_extmatch_out->matches[i] =
6449 vim_strnsave(reg_getline(mpos->start_lnum) + mpos->start_col,
6450 mpos->end_col - mpos->start_col);
6451 }
6452 } else {
6453 struct linepos *lpos = &subs.synt.list.line[i];
6454
6455 if (lpos->start != NULL && lpos->end != NULL)
6456 re_extmatch_out->matches[i] =
6457 vim_strnsave(lpos->start,
6458 (int)(lpos->end - lpos->start));
6459 }
6460 }
6461 }
6462
6463 return 1 + reglnum;
6464}
6465
6466/// Match a regexp against a string ("line" points to the string) or multiple
6467/// lines ("line" is NULL, use reg_getline()).
6468///
6469/// @param line String in which to search or NULL
6470/// @param startcol Column to start looking for match
6471/// @param tm Timeout limit or NULL
6472/// @param timed_out Flag set on timeout or NULL
6473///
6474/// @return <= 0 if there is no match and number of lines contained in the
6475/// match otherwise.
6476static long nfa_regexec_both(char_u *line, colnr_T startcol,
6477 proftime_T *tm, int *timed_out)
6478{
6479 nfa_regprog_T *prog;
6480 long retval = 0L;
6481 int i;
6482 colnr_T col = startcol;
6483
6484 if (REG_MULTI) {
6485 prog = (nfa_regprog_T *)rex.reg_mmatch->regprog;
6486 line = reg_getline((linenr_T)0); // relative to the cursor
6487 rex.reg_startpos = rex.reg_mmatch->startpos;
6488 rex.reg_endpos = rex.reg_mmatch->endpos;
6489 } else {
6490 prog = (nfa_regprog_T *)rex.reg_match->regprog;
6491 rex.reg_startp = rex.reg_match->startp;
6492 rex.reg_endp = rex.reg_match->endp;
6493 }
6494
6495 /* Be paranoid... */
6496 if (prog == NULL || line == NULL) {
6497 EMSG(_(e_null));
6498 goto theend;
6499 }
6500
6501 // If pattern contains "\c" or "\C": overrule value of rex.reg_ic
6502 if (prog->regflags & RF_ICASE) {
6503 rex.reg_ic = true;
6504 } else if (prog->regflags & RF_NOICASE) {
6505 rex.reg_ic = false;
6506 }
6507
6508 // If pattern contains "\Z" overrule value of rex.reg_icombine
6509 if (prog->regflags & RF_ICOMBINE) {
6510 rex.reg_icombine = true;
6511 }
6512
6513 regline = line;
6514 reglnum = 0; /* relative to line */
6515
6516 nfa_has_zend = prog->has_zend;
6517 nfa_has_backref = prog->has_backref;
6518 nfa_nsubexpr = prog->nsubexp;
6519 nfa_listid = 1;
6520 nfa_alt_listid = 2;
6521 nfa_regengine.expr = prog->pattern;
6522
6523 if (prog->reganch && col > 0)
6524 return 0L;
6525
6526 need_clear_subexpr = TRUE;
6527 /* Clear the external match subpointers if necessary. */
6528 if (prog->reghasz == REX_SET) {
6529 nfa_has_zsubexpr = TRUE;
6530 need_clear_zsubexpr = TRUE;
6531 } else
6532 nfa_has_zsubexpr = FALSE;
6533
6534 if (prog->regstart != NUL) {
6535 /* Skip ahead until a character we know the match must start with.
6536 * When there is none there is no match. */
6537 if (skip_to_start(prog->regstart, &col) == FAIL)
6538 return 0L;
6539
6540 // If match_text is set it contains the full text that must match.
6541 // Nothing else to try. Doesn't handle combining chars well.
6542 if (prog->match_text != NULL && !rex.reg_icombine) {
6543 return find_match_text(col, prog->regstart, prog->match_text);
6544 }
6545 }
6546
6547 // If the start column is past the maximum column: no need to try.
6548 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol) {
6549 goto theend;
6550 }
6551
6552 nstate = prog->nstate;
6553 for (i = 0; i < nstate; ++i) {
6554 prog->state[i].id = i;
6555 prog->state[i].lastlist[0] = 0;
6556 prog->state[i].lastlist[1] = 0;
6557 }
6558
6559 retval = nfa_regtry(prog, col, tm, timed_out);
6560
6561 nfa_regengine.expr = NULL;
6562
6563theend:
6564 return retval;
6565}
6566
6567/*
6568 * Compile a regular expression into internal code for the NFA matcher.
6569 * Returns the program in allocated space. Returns NULL for an error.
6570 */
6571static regprog_T *nfa_regcomp(char_u *expr, int re_flags)
6572{
6573 nfa_regprog_T *prog = NULL;
6574 int *postfix;
6575
6576 if (expr == NULL)
6577 return NULL;
6578
6579 nfa_regengine.expr = expr;
6580 nfa_re_flags = re_flags;
6581
6582 init_class_tab();
6583
6584 nfa_regcomp_start(expr, re_flags);
6585
6586 // Build postfix form of the regexp. Needed to build the NFA
6587 // (and count its size).
6588 postfix = re2post();
6589 if (postfix == NULL) {
6590 goto fail; // Cascaded (syntax?) error
6591 }
6592
6593 /*
6594 * In order to build the NFA, we parse the input regexp twice:
6595 * 1. first pass to count size (so we can allocate space)
6596 * 2. second to emit code
6597 */
6598#ifdef REGEXP_DEBUG
6599 {
6600 FILE *f = fopen(NFA_REGEXP_RUN_LOG, "a");
6601
6602 if (f != NULL) {
6603 fprintf(f,
6604 "\n*****************************\n\n\n\n\t"
6605 "Compiling regexp \"%s\"... hold on !\n",
6606 expr);
6607 fclose(f);
6608 }
6609 }
6610#endif
6611
6612 /*
6613 * PASS 1
6614 * Count number of NFA states in "nstate". Do not build the NFA.
6615 */
6616 post2nfa(postfix, post_ptr, TRUE);
6617
6618 /* allocate the regprog with space for the compiled regexp */
6619 size_t prog_size = sizeof(nfa_regprog_T) + sizeof(nfa_state_T) * (nstate - 1);
6620 prog = xmalloc(prog_size);
6621 state_ptr = prog->state;
6622
6623 /*
6624 * PASS 2
6625 * Build the NFA
6626 */
6627 prog->start = post2nfa(postfix, post_ptr, FALSE);
6628 if (prog->start == NULL)
6629 goto fail;
6630
6631 prog->regflags = regflags;
6632 prog->engine = &nfa_regengine;
6633 prog->nstate = nstate;
6634 prog->has_zend = nfa_has_zend;
6635 prog->has_backref = nfa_has_backref;
6636 prog->nsubexp = regnpar;
6637
6638 nfa_postprocess(prog);
6639
6640 prog->reganch = nfa_get_reganch(prog->start, 0);
6641 prog->regstart = nfa_get_regstart(prog->start, 0);
6642 prog->match_text = nfa_get_match_text(prog->start);
6643
6644#ifdef REGEXP_DEBUG
6645 nfa_postfix_dump(expr, OK);
6646 nfa_dump(prog);
6647#endif
6648 /* Remember whether this pattern has any \z specials in it. */
6649 prog->reghasz = re_has_z;
6650 prog->pattern = vim_strsave(expr);
6651 nfa_regengine.expr = NULL;
6652
6653out:
6654 xfree(post_start);
6655 post_start = post_ptr = post_end = NULL;
6656 state_ptr = NULL;
6657 return (regprog_T *)prog;
6658
6659fail:
6660 XFREE_CLEAR(prog);
6661#ifdef REGEXP_DEBUG
6662 nfa_postfix_dump(expr, FAIL);
6663#endif
6664 nfa_regengine.expr = NULL;
6665 goto out;
6666}
6667
6668/*
6669 * Free a compiled regexp program, returned by nfa_regcomp().
6670 */
6671static void nfa_regfree(regprog_T *prog)
6672{
6673 if (prog != NULL) {
6674 xfree(((nfa_regprog_T *)prog)->match_text);
6675 xfree(((nfa_regprog_T *)prog)->pattern);
6676 xfree(prog);
6677 }
6678}
6679
6680/*
6681 * Match a regexp against a string.
6682 * "rmp->regprog" is a compiled regexp as returned by nfa_regcomp().
6683 * Uses curbuf for line count and 'iskeyword'.
6684 * If "line_lbr" is true, consider a "\n" in "line" to be a line break.
6685 *
6686 * Returns <= 0 for failure, number of lines contained in the match otherwise.
6687 */
6688static int
6689nfa_regexec_nl (
6690 regmatch_T *rmp,
6691 char_u *line, /* string to match against */
6692 colnr_T col, /* column to start looking for match */
6693 bool line_lbr
6694)
6695{
6696 rex.reg_match = rmp;
6697 rex.reg_mmatch = NULL;
6698 rex.reg_maxline = 0;
6699 rex.reg_line_lbr = line_lbr;
6700 rex.reg_buf = curbuf;
6701 rex.reg_win = NULL;
6702 rex.reg_ic = rmp->rm_ic;
6703 rex.reg_icombine = false;
6704 rex.reg_maxcol = 0;
6705 return nfa_regexec_both(line, col, NULL, NULL);
6706}
6707
6708/// Matches a regexp against multiple lines.
6709/// "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
6710/// Uses curbuf for line count and 'iskeyword'.
6711///
6712/// @param win Window in which to search or NULL
6713/// @param buf Buffer in which to search
6714/// @param lnum Number of line to start looking for match
6715/// @param col Column to start looking for match
6716/// @param tm Timeout limit or NULL
6717/// @param timed_out Flag set on timeout or NULL
6718///
6719/// @return <= 0 if there is no match and number of lines contained in the match
6720/// otherwise.
6721///
6722/// @note The body is the same as bt_regexec() except for nfa_regexec_both()
6723///
6724/// @warning
6725/// Match may actually be in another line. e.g.:
6726/// when r.e. is \nc, cursor is at 'a' and the text buffer looks like
6727///
6728/// @par
6729///
6730/// +-------------------------+
6731/// |a |
6732/// |b |
6733/// |c |
6734/// | |
6735/// +-------------------------+
6736///
6737/// @par
6738/// then nfa_regexec_multi() returns 3. while the original vim_regexec_multi()
6739/// returns 0 and a second call at line 2 will return 2.
6740///
6741/// @par
6742/// FIXME if this behavior is not compatible.
6743static long nfa_regexec_multi(regmmatch_T *rmp, win_T *win, buf_T *buf,
6744 linenr_T lnum, colnr_T col,
6745 proftime_T *tm, int *timed_out)
6746{
6747 rex.reg_match = NULL;
6748 rex.reg_mmatch = rmp;
6749 rex.reg_buf = buf;
6750 rex.reg_win = win;
6751 rex.reg_firstlnum = lnum;
6752 rex.reg_maxline = rex.reg_buf->b_ml.ml_line_count - lnum;
6753 rex.reg_line_lbr = false;
6754 rex.reg_ic = rmp->rmm_ic;
6755 rex.reg_icombine = false;
6756 rex.reg_maxcol = rmp->rmm_maxcol;
6757
6758 return nfa_regexec_both(NULL, col, tm, timed_out);
6759}
6760