regexp_nfa.c source code [neovim/src/nvim/regexp_nfa.c]

1	// This is an open source non-commercial project. Dear PVS-Studio, please check
2	// it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com
3
4	/*
5	* NFA regular expression implementation.
6	*
7	* This file is included in "regexp.c".
8	*/
9
10	#include <assert.h>
11	#include <inttypes.h>
12	#include <stdbool.h>
13	#include <limits.h>
14
15	#include "nvim/ascii.h"
16	#include "nvim/garray.h"
17
18	/*
19	* Logging of NFA engine.
20	*
21	* The NFA engine can write four log files:
22	* - Error log: Contains NFA engine's fatal errors.
23	* - Dump log: Contains compiled NFA state machine's information.
24	* - Run log: Contains information of matching procedure.
25	* - Debug log: Contains detailed information of matching procedure. Can be
26	* disabled by undefining NFA_REGEXP_DEBUG_LOG.
27	* The first one can also be used without debug mode.
28	* The last three are enabled when compiled as debug mode and individually
29	* disabled by commenting them out.
30	* The log files can get quite big!
31	* Do disable all of this when compiling Vim for debugging, undefine REGEXP_DEBUG in
32	* regexp.c
33	*/
34	#ifdef REGEXP_DEBUG
35	# define NFA_REGEXP_ERROR_LOG "nfa_regexp_error.log"
36	# define NFA_REGEXP_DUMP_LOG "nfa_regexp_dump.log"
37	# define NFA_REGEXP_RUN_LOG "nfa_regexp_run.log"
38	# define NFA_REGEXP_DEBUG_LOG "nfa_regexp_debug.log"
39	#endif
40
/* Added to NFA_ANY - NFA_NUPPER_IC to include a NL. */
#define NFA_ADD_NL 31

/*
 * Codes used for the items of the postfix form and for the "c" field of the
 * NFA states.  Numbering starts at -1024 and the list is short enough that
 * every code stays negative; code in this file relies on that when it treats
 * a state with "c > 0" as a literal character.
 */
enum {
  NFA_SPLIT = -1024,
  NFA_MATCH,
  NFA_EMPTY,                        /* matches 0-length */

  NFA_START_COLL,                   /* [abc] start */
  NFA_END_COLL,                     /* [abc] end */
  NFA_START_NEG_COLL,               /* [^abc] start */
  NFA_END_NEG_COLL,                 /* [^abc] end (postfix only) */
  NFA_RANGE,                        /* range of the two previous items
                                     * (postfix only) */
  NFA_RANGE_MIN,                    /* low end of a range */
  NFA_RANGE_MAX,                    /* high end of a range */

  NFA_CONCAT,                       /* concatenate two previous items (postfix
                                     * only) */
  NFA_OR,                           /* \| (postfix only) */
  NFA_STAR,                         /* greedy * (postfix only) */
  NFA_STAR_NONGREEDY,               /* non-greedy * (postfix only) */
  NFA_QUEST,                        /* greedy \? (postfix only) */
  NFA_QUEST_NONGREEDY,              /* non-greedy \? (postfix only) */

  NFA_BOL,                          /* ^    Begin line */
  NFA_EOL,                          /* $    End line */
  NFA_BOW,                          /* \<   Begin word */
  NFA_EOW,                          /* \>   End word */
  NFA_BOF,                          /* \%^  Begin file */
  NFA_EOF,                          /* \%$  End file */
  NFA_NEWL,
  NFA_ZSTART,                       /* Used for \zs */
  NFA_ZEND,                         /* Used for \ze */
  NFA_NOPEN,                        /* Start of subexpression marked with \%( */
  NFA_NCLOSE,                       /* End of subexpr. marked with \%( ... \) */
  NFA_START_INVISIBLE,
  NFA_START_INVISIBLE_FIRST,
  NFA_START_INVISIBLE_NEG,
  NFA_START_INVISIBLE_NEG_FIRST,
  NFA_START_INVISIBLE_BEFORE,
  NFA_START_INVISIBLE_BEFORE_FIRST,
  NFA_START_INVISIBLE_BEFORE_NEG,
  NFA_START_INVISIBLE_BEFORE_NEG_FIRST,
  NFA_START_PATTERN,
  NFA_END_INVISIBLE,
  NFA_END_INVISIBLE_NEG,
  NFA_END_PATTERN,
  NFA_COMPOSING,                    /* Next nodes in NFA are part of the
                                     * composing multibyte char */
  NFA_END_COMPOSING,                /* End of a composing char in the NFA */
  NFA_ANY_COMPOSING,                /* \%C: Any composing characters. */
  NFA_OPT_CHARS,                    /* \%[abc] */

  /* The following are used only in the postfix form, not in the NFA */
  NFA_PREV_ATOM_NO_WIDTH,           /* Used for \@= */
  NFA_PREV_ATOM_NO_WIDTH_NEG,       /* Used for \@! */
  NFA_PREV_ATOM_JUST_BEFORE,        /* Used for \@<= */
  NFA_PREV_ATOM_JUST_BEFORE_NEG,    /* Used for \@<! */
  NFA_PREV_ATOM_LIKE_PATTERN,       /* Used for \@> */

  NFA_BACKREF1,                     /* \1 */
  NFA_BACKREF2,                     /* \2 */
  NFA_BACKREF3,                     /* \3 */
  NFA_BACKREF4,                     /* \4 */
  NFA_BACKREF5,                     /* \5 */
  NFA_BACKREF6,                     /* \6 */
  NFA_BACKREF7,                     /* \7 */
  NFA_BACKREF8,                     /* \8 */
  NFA_BACKREF9,                     /* \9 */
  NFA_ZREF1,                        /* \z1 */
  NFA_ZREF2,                        /* \z2 */
  NFA_ZREF3,                        /* \z3 */
  NFA_ZREF4,                        /* \z4 */
  NFA_ZREF5,                        /* \z5 */
  NFA_ZREF6,                        /* \z6 */
  NFA_ZREF7,                        /* \z7 */
  NFA_ZREF8,                        /* \z8 */
  NFA_ZREF9,                        /* \z9 */
  NFA_SKIP,                         /* Skip characters */

  NFA_MOPEN,
  NFA_MOPEN1,
  NFA_MOPEN2,
  NFA_MOPEN3,
  NFA_MOPEN4,
  NFA_MOPEN5,
  NFA_MOPEN6,
  NFA_MOPEN7,
  NFA_MOPEN8,
  NFA_MOPEN9,

  NFA_MCLOSE,
  NFA_MCLOSE1,
  NFA_MCLOSE2,
  NFA_MCLOSE3,
  NFA_MCLOSE4,
  NFA_MCLOSE5,
  NFA_MCLOSE6,
  NFA_MCLOSE7,
  NFA_MCLOSE8,
  NFA_MCLOSE9,

  NFA_ZOPEN,
  NFA_ZOPEN1,
  NFA_ZOPEN2,
  NFA_ZOPEN3,
  NFA_ZOPEN4,
  NFA_ZOPEN5,
  NFA_ZOPEN6,
  NFA_ZOPEN7,
  NFA_ZOPEN8,
  NFA_ZOPEN9,

  NFA_ZCLOSE,
  NFA_ZCLOSE1,
  NFA_ZCLOSE2,
  NFA_ZCLOSE3,
  NFA_ZCLOSE4,
  NFA_ZCLOSE5,
  NFA_ZCLOSE6,
  NFA_ZCLOSE7,
  NFA_ZCLOSE8,
  NFA_ZCLOSE9,

  /* NFA_FIRST_NL */
  NFA_ANY,                          /* Match any one character. */
  NFA_IDENT,                        /* Match identifier char */
  NFA_SIDENT,                       /* Match identifier char but no digit */
  NFA_KWORD,                        /* Match keyword char */
  NFA_SKWORD,                       /* Match word char but no digit */
  NFA_FNAME,                        /* Match file name char */
  NFA_SFNAME,                       /* Match file name char but no digit */
  NFA_PRINT,                        /* Match printable char */
  NFA_SPRINT,                       /* Match printable char but no digit */
  NFA_WHITE,                        /* Match whitespace char */
  NFA_NWHITE,                       /* Match non-whitespace char */
  NFA_DIGIT,                        /* Match digit char */
  NFA_NDIGIT,                       /* Match non-digit char */
  NFA_HEX,                          /* Match hex char */
  NFA_NHEX,                         /* Match non-hex char */
  NFA_OCTAL,                        /* Match octal char */
  NFA_NOCTAL,                       /* Match non-octal char */
  NFA_WORD,                         /* Match word char */
  NFA_NWORD,                        /* Match non-word char */
  NFA_HEAD,                         /* Match head char */
  NFA_NHEAD,                        /* Match non-head char */
  NFA_ALPHA,                        /* Match alpha char */
  NFA_NALPHA,                       /* Match non-alpha char */
  NFA_LOWER,                        /* Match lowercase char */
  NFA_NLOWER,                       /* Match non-lowercase char */
  NFA_UPPER,                        /* Match uppercase char */
  NFA_NUPPER,                       /* Match non-uppercase char */
  NFA_LOWER_IC,                     /* Match [a-z] */
  NFA_NLOWER_IC,                    /* Match [^a-z] */
  NFA_UPPER_IC,                     /* Match [A-Z] */
  NFA_NUPPER_IC,                    /* Match [^A-Z] */

  /* The range NFA_ANY .. NFA_NUPPER_IC shifted by NFA_ADD_NL: the same
   * classes, but also matching a NL. */
  NFA_FIRST_NL = NFA_ANY + NFA_ADD_NL,
  NFA_LAST_NL = NFA_NUPPER_IC + NFA_ADD_NL,

  NFA_CURSOR,                       /* Match cursor pos */
  NFA_LNUM,                         /* Match line number */
  NFA_LNUM_GT,                      /* Match > line number */
  NFA_LNUM_LT,                      /* Match < line number */
  NFA_COL,                          /* Match cursor column */
  NFA_COL_GT,                       /* Match > cursor column */
  NFA_COL_LT,                       /* Match < cursor column */
  NFA_VCOL,                         /* Match cursor virtual column */
  NFA_VCOL_GT,                      /* Match > cursor virtual column */
  NFA_VCOL_LT,                      /* Match < cursor virtual column */
  NFA_MARK,                         /* Match mark */
  NFA_MARK_GT,                      /* Match > mark */
  NFA_MARK_LT,                      /* Match < mark */
  NFA_VISUAL,                       /* Match Visual area */

  /* Character classes [:alnum:] etc */
  NFA_CLASS_ALNUM,
  NFA_CLASS_ALPHA,
  NFA_CLASS_BLANK,
  NFA_CLASS_CNTRL,
  NFA_CLASS_DIGIT,
  NFA_CLASS_GRAPH,
  NFA_CLASS_LOWER,
  NFA_CLASS_PRINT,
  NFA_CLASS_PUNCT,
  NFA_CLASS_SPACE,
  NFA_CLASS_UPPER,
  NFA_CLASS_XDIGIT,
  NFA_CLASS_TAB,
  NFA_CLASS_RETURN,
  NFA_CLASS_BACKSPACE,
  NFA_CLASS_ESCAPE
};
235
236	/ Keep in sync with classchars. /
237	static int nfa_classcodes[] = {
238	NFA_ANY, NFA_IDENT, NFA_SIDENT, NFA_KWORD,NFA_SKWORD,
239	NFA_FNAME, NFA_SFNAME, NFA_PRINT, NFA_SPRINT,
240	NFA_WHITE, NFA_NWHITE, NFA_DIGIT, NFA_NDIGIT,
241	NFA_HEX, NFA_NHEX, NFA_OCTAL, NFA_NOCTAL,
242	NFA_WORD, NFA_NWORD, NFA_HEAD, NFA_NHEAD,
243	NFA_ALPHA, NFA_NALPHA, NFA_LOWER, NFA_NLOWER,
244	NFA_UPPER, NFA_NUPPER
245	};
246
247	static char_u e_nul_found[] = N_(
248	"E865: (NFA) Regexp end encountered prematurely");
249	static char_u e_misplaced[] = N_("E866: (NFA regexp) Misplaced %c");
250	static char_u e_ill_char_class[] = N_(
251	"E877: (NFA regexp) Invalid character class: %" PRId64);
252
253	/ Since the out pointers in the list are always*
254	* uninitialized, we use the pointers themselves
255	* as storage for the Ptrlists. */
256	typedef union Ptrlist Ptrlist;
257	union Ptrlist {
258	Ptrlist *next;
259	nfa_state_T *s;
260	};
261
262	struct Frag {
263	nfa_state_T *start;
264	Ptrlist *out;
265	};
266	typedef struct Frag Frag_T;
267
268	typedef struct {
269	int in_use; / number of subexpr with useful info /
270
271	/ When REG_MULTI is TRUE list.multi is used, otherwise list.line. /
272	union {
273	struct multipos {
274	linenr_T start_lnum;
275	linenr_T end_lnum;
276	colnr_T start_col;
277	colnr_T end_col;
278	} multi[NSUBEXP];
279	struct linepos {
280	char_u *start;
281	char_u *end;
282	} line[NSUBEXP];
283	} list;
284	} regsub_T;
285
286	typedef struct {
287	regsub_T norm; / $ .. $ matches /
288	regsub_T synt; / \z( .. \) matches /
289	} regsubs_T;
290
291	/ nfa_pim_T stores a Postponed Invisible Match. /
292	typedef struct nfa_pim_S nfa_pim_T;
293	struct nfa_pim_S {
294	int result; / NFA_PIM_, see below /*
295	nfa_state_T state; /* the invisible match start state /
296	regsubs_T subs; / submatch info, only party used /
297	union {
298	lpos_T pos;
299	char_u *ptr;
300	} end; / where the match must end /
301	};
302
303	/ nfa_thread_T contains execution information of a NFA state /
304	typedef struct {
305	nfa_state_T *state;
306	int count;
307	nfa_pim_T pim; / if pim.result != NFA_PIM_UNUSED: postponed*
308	* invisible match */
309	regsubs_T subs; / submatch info, only party used /
310	} nfa_thread_T;
311
312	/ nfa_list_T contains the alternative NFA execution states. /
313	typedef struct {
314	nfa_thread_T t; /* allocated array of states /
315	int n; / nr of states currently in "t" /
316	int len; / max nr of states in "t" /
317	int id; / ID of the list /
318	int has_pim; / TRUE when any state has a PIM /
319	} nfa_list_T;
320
321	/// re_flags passed to nfa_regcomp().
322	static int nfa_re_flags;
323
324	/ NFA regexp \ze operator encountered. /
325	static int nfa_has_zend;
326
327	/ NFA regexp \1 .. \9 encountered. /
328	static int nfa_has_backref;
329
330	/ NFA regexp has \z( ), set zsubexpr. /
331	static int nfa_has_zsubexpr;
332
333	/ Number of sub expressions actually being used during execution. 1 if only*
334	* the whole match (subexpr 0) is used. */
335	static int nfa_nsubexpr;
336
337	static int post_start; /* holds the postfix form of r.e. /
338	static int *post_end;
339	static int *post_ptr;
340
341	static int nstate; / Number of states in the NFA. Also used when*
342	* executing. */
343	static int istate; / Index in the state vector, used in alloc_state() /
344
345	/ If not NULL match must end at this position /
346	static save_se_T *nfa_endp = NULL;
347
348	/ listid is global, so that it increases on recursive calls to*
349	* nfa_regmatch(), which means we don't have to clear the lastlist field of
350	* all the states. */
351	static int nfa_listid;
352	static int nfa_alt_listid;
353
354	/ 0 for first call to nfa_regmatch(), 1 for recursive call. /
355	static int nfa_ll_index = `0`;
356
357	#ifdef INCLUDE_GENERATED_DECLARATIONS
358	# include "regexp_nfa.c.generated.h"
359	#endif
360
// Helper used when doing re2post() ... regatom() parsing: append item "c" to
// the postfix list, growing the list first when it is full.
// The argument is parenthesized so that any expression can be passed safely.
#define EMIT(c) \
  do { \
    if (post_ptr >= post_end) { \
      realloc_post_list(); \
    } \
    *post_ptr++ = (c); \
  } while (0)
369
370	/*
371	* Initialize internal variables before NFA compilation.
372	*/
373	static void
374	nfa_regcomp_start (
375	char_u *expr,
376	int re_flags / see vim_regcomp() /
377	)
378	{
379	size_t postfix_size;
380	size_t nstate_max;
381
382	nstate = `0`;
383	istate = `0`;
384	/ A reasonable estimation for maximum size /
385	nstate_max = (STRLEN(expr) + `1`) * `25`;
386
387	/ Some items blow up in size, such as [A-z]. Add more space for that.*
388	* When it is still not enough realloc_post_list() will be used. */
389	nstate_max += `1000`;
390
391	/ Size for postfix representation of expr. /
392	postfix_size = sizeof(int) * nstate_max;
393
394	post_start = (int *)xmalloc(postfix_size);
395	post_ptr = post_start;
396	post_end = post_start + nstate_max;
397	nfa_has_zend = FALSE;
398	nfa_has_backref = FALSE;
399
400	/ shared with BT engine /
401	regcomp_start(expr, re_flags);
402	}
403
404	/*
405	* Figure out if the NFA state list starts with an anchor, must match at start
406	* of the line.
407	*/
408	static int nfa_get_reganch(nfa_state_T start, int* depth)
409	{
410	nfa_state_T *p = start;
411
412	if (depth > `4`)
413	return `0`;
414
415	while (p != NULL) {
416	switch (p->c) {
417	case NFA_BOL:
418	case NFA_BOF:
419	return `1`; / yes! /
420
421	case NFA_ZSTART:
422	case NFA_ZEND:
423	case NFA_CURSOR:
424	case NFA_VISUAL:
425
426	case NFA_MOPEN:
427	case NFA_MOPEN1:
428	case NFA_MOPEN2:
429	case NFA_MOPEN3:
430	case NFA_MOPEN4:
431	case NFA_MOPEN5:
432	case NFA_MOPEN6:
433	case NFA_MOPEN7:
434	case NFA_MOPEN8:
435	case NFA_MOPEN9:
436	case NFA_NOPEN:
437	case NFA_ZOPEN:
438	case NFA_ZOPEN1:
439	case NFA_ZOPEN2:
440	case NFA_ZOPEN3:
441	case NFA_ZOPEN4:
442	case NFA_ZOPEN5:
443	case NFA_ZOPEN6:
444	case NFA_ZOPEN7:
445	case NFA_ZOPEN8:
446	case NFA_ZOPEN9:
447	p = p->out;
448	break;
449
450	case NFA_SPLIT:
451	return nfa_get_reganch(p->out, depth + `1`)
452	&& nfa_get_reganch(p->out1, depth + `1`);
453
454	default:
455	return `0`; / noooo /
456	}
457	}
458	return `0`;
459	}
460
461	/*
462	* Figure out if the NFA state list starts with a character which must match
463	* at start of the match.
464	*/
465	static int nfa_get_regstart(nfa_state_T start, int* depth)
466	{
467	nfa_state_T *p = start;
468
469	if (depth > `4`)
470	return `0`;
471
472	while (p != NULL) {
473	switch (p->c) {
474	/ all kinds of zero-width matches /
475	case NFA_BOL:
476	case NFA_BOF:
477	case NFA_BOW:
478	case NFA_EOW:
479	case NFA_ZSTART:
480	case NFA_ZEND:
481	case NFA_CURSOR:
482	case NFA_VISUAL:
483	case NFA_LNUM:
484	case NFA_LNUM_GT:
485	case NFA_LNUM_LT:
486	case NFA_COL:
487	case NFA_COL_GT:
488	case NFA_COL_LT:
489	case NFA_VCOL:
490	case NFA_VCOL_GT:
491	case NFA_VCOL_LT:
492	case NFA_MARK:
493	case NFA_MARK_GT:
494	case NFA_MARK_LT:
495
496	case NFA_MOPEN:
497	case NFA_MOPEN1:
498	case NFA_MOPEN2:
499	case NFA_MOPEN3:
500	case NFA_MOPEN4:
501	case NFA_MOPEN5:
502	case NFA_MOPEN6:
503	case NFA_MOPEN7:
504	case NFA_MOPEN8:
505	case NFA_MOPEN9:
506	case NFA_NOPEN:
507	case NFA_ZOPEN:
508	case NFA_ZOPEN1:
509	case NFA_ZOPEN2:
510	case NFA_ZOPEN3:
511	case NFA_ZOPEN4:
512	case NFA_ZOPEN5:
513	case NFA_ZOPEN6:
514	case NFA_ZOPEN7:
515	case NFA_ZOPEN8:
516	case NFA_ZOPEN9:
517	p = p->out;
518	break;
519
520	case NFA_SPLIT:
521	{
522	int c1 = nfa_get_regstart(p->out, depth + `1`);
523	int c2 = nfa_get_regstart(p->out1, depth + `1`);
524
525	if (c1 == c2)
526	return c1; / yes! /
527	return `0`;
528	}
529
530	default:
531	if (p->c > `0`)
532	return p->c; / yes! /
533	return `0`;
534	}
535	}
536	return `0`;
537	}
538
539	/*
540	* Figure out if the NFA state list contains just literal text and nothing
541	* else. If so return a string in allocated memory with what must match after
542	* regstart. Otherwise return NULL.
543	*/
544	static char_u nfa_get_match_text(nfa_state_T start)
545	{
546	nfa_state_T *p = start;
547	int len = `0`;
548	char_u *ret;
549	char_u *s;
550
551	if (p->c != NFA_MOPEN)
552	return NULL; / just in case /
553	p = p->out;
554	while (p->c > `0`) {
555	len += MB_CHAR2LEN(p->c);
556	p = p->out;
557	}
558	if (p->c != NFA_MCLOSE \|\| p->out->c != NFA_MATCH)
559	return NULL;
560
561	ret = xmalloc(len);
562	p = start->out->out; / skip first char, it goes into regstart /
563	s = ret;
564	while (p->c > `0`) {
565	s += utf_char2bytes(p->c, s);
566	p = p->out;
567	}
568	*s = NUL;
569
570	return ret;
571	}
572
573	/*
574	* Allocate more space for post_start. Called when
575	* running above the estimated number of states.
576	*/
577	static void realloc_post_list(void)
578	{
579	size_t new_max = (post_end - post_start) + `1000`;
580	int new_start = xrealloc(post_start, new_max sizeof(int));
581	post_ptr = new_start + (post_ptr - post_start);
582	post_end = new_start + new_max;
583	post_start = new_start;
584	}
585
586	/*
587	* Search between "start" and "end" and try to recognize a
588	* character class in expanded form. For example [0-9].
589	* On success, return the id the character class to be emitted.
590	* On failure, return 0 (=FAIL)
591	* Start points to the first char of the range, while end should point
592	* to the closing brace.
593	* Keep in mind that 'ignorecase' applies at execution time, thus [a-z] may
594	* need to be interpreted as [a-zA-Z].
595	*/
596	static int nfa_recognize_char_class(char_u start, char_u end, int extra_newl)
597	{
598	# define CLASS_not 0x80
599	# define CLASS_af 0x40
600	# define CLASS_AF 0x20
601	# define CLASS_az 0x10
602	# define CLASS_AZ 0x08
603	# define CLASS_o7 0x04
604	# define CLASS_o9 0x02
605	# define CLASS_underscore 0x01
606
607	int newl = FALSE;
608	char_u *p;
609	int config = `0`;
610
611	if (extra_newl == TRUE)
612	newl = TRUE;
613
614	if (*end != `']'`)
615	return FAIL;
616	p = start;
617	if (*p == `'^'`) {
618	config \|= CLASS_not;
619	p++;
620	}
621
622	while (p < end) {
623	if (p + `2` < end && *(p + `1`) == `'-'`) {
624	switch (*p) {
625	case `'0'`:
626	if (*(p + `2`) == `'9'`) {
627	config \|= CLASS_o9;
628	break;
629	} else if (*(p + `2`) == `'7'`) {
630	config \|= CLASS_o7;
631	break;
632	}
633	return FAIL;
634	case `'a'`:
635	if (*(p + `2`) == `'z'`) {
636	config \|= CLASS_az;
637	break;
638	} else if (*(p + `2`) == `'f'`) {
639	config \|= CLASS_af;
640	break;
641	}
642	return FAIL;
643	case `'A'`:
644	if (*(p + `2`) == `'Z'`) {
645	config \|= CLASS_AZ;
646	break;
647	} else if (*(p + `2`) == `'F'`) {
648	config \|= CLASS_AF;
649	break;
650	}
651	return FAIL;
652	default:
653	return FAIL;
654	}
655	p += `3`;
656	} else if (p + `1` < end && p == `'\\'` && (p + `1`) == `'n'`) {
657	newl = TRUE;
658	p += `2`;
659	} else if (*p == `'_'`) {
660	config \|= CLASS_underscore;
661	p++;
662	} else if (*p == `'\n'`) {
663	newl = TRUE;
664	p++;
665	} else
666	return FAIL;
667	} / while (p < end) /
668
669	if (p != end)
670	return FAIL;
671
672	if (newl == TRUE)
673	extra_newl = NFA_ADD_NL;
674
675	switch (config) {
676	case CLASS_o9:
677	return extra_newl + NFA_DIGIT;
678	case CLASS_not \| CLASS_o9:
679	return extra_newl + NFA_NDIGIT;
680	case CLASS_af \| CLASS_AF \| CLASS_o9:
681	return extra_newl + NFA_HEX;
682	case CLASS_not \| CLASS_af \| CLASS_AF \| CLASS_o9:
683	return extra_newl + NFA_NHEX;
684	case CLASS_o7:
685	return extra_newl + NFA_OCTAL;
686	case CLASS_not \| CLASS_o7:
687	return extra_newl + NFA_NOCTAL;
688	case CLASS_az \| CLASS_AZ \| CLASS_o9 \| CLASS_underscore:
689	return extra_newl + NFA_WORD;
690	case CLASS_not \| CLASS_az \| CLASS_AZ \| CLASS_o9 \| CLASS_underscore:
691	return extra_newl + NFA_NWORD;
692	case CLASS_az \| CLASS_AZ \| CLASS_underscore:
693	return extra_newl + NFA_HEAD;
694	case CLASS_not \| CLASS_az \| CLASS_AZ \| CLASS_underscore:
695	return extra_newl + NFA_NHEAD;
696	case CLASS_az \| CLASS_AZ:
697	return extra_newl + NFA_ALPHA;
698	case CLASS_not \| CLASS_az \| CLASS_AZ:
699	return extra_newl + NFA_NALPHA;
700	case CLASS_az:
701	return extra_newl + NFA_LOWER_IC;
702	case CLASS_not \| CLASS_az:
703	return extra_newl + NFA_NLOWER_IC;
704	case CLASS_AZ:
705	return extra_newl + NFA_UPPER_IC;
706	case CLASS_not \| CLASS_AZ:
707	return extra_newl + NFA_NUPPER_IC;
708	}
709	return FAIL;
710	}
711
712	/*
713	* Produce the bytes for equivalence class "c".
714	* Currently only handles latin1, latin9 and utf-8.
715	* Emits bytes in postfix notation: 'a,b,NFA_OR,c,NFA_OR' is
716	* equivalent to 'a OR b OR c'
717	*
718	* NOTE! When changing this function, also update reg_equi_class()
719	*/
720	static void nfa_emit_equi_class(int c)
721	{
722	#define EMIT2(c) EMIT(c); EMIT(NFA_CONCAT);
723	#define EMITMBC(c) EMIT(c); EMIT(NFA_CONCAT);
724
725	if (enc_utf8 \|\| STRCMP(p_enc, "latin1") == `0`
726	\|\| STRCMP(p_enc, "iso-8859-15") == `0`) {
727	#define A_grave 0xc0
728	#define A_acute 0xc1
729	#define A_circumflex 0xc2
730	#define A_virguilla 0xc3
731	#define A_diaeresis 0xc4
732	#define A_ring 0xc5
733	#define C_cedilla 0xc7
734	#define E_grave 0xc8
735	#define E_acute 0xc9
736	#define E_circumflex 0xca
737	#define E_diaeresis 0xcb
738	#define I_grave 0xcc
739	#define I_acute 0xcd
740	#define I_circumflex 0xce
741	#define I_diaeresis 0xcf
742	#define N_virguilla 0xd1
743	#define O_grave 0xd2
744	#define O_acute 0xd3
745	#define O_circumflex 0xd4
746	#define O_virguilla 0xd5
747	#define O_diaeresis 0xd6
748	#define O_slash 0xd8
749	#define U_grave 0xd9
750	#define U_acute 0xda
751	#define U_circumflex 0xdb
752	#define U_diaeresis 0xdc
753	#define Y_acute 0xdd
754	#define a_grave 0xe0
755	#define a_acute 0xe1
756	#define a_circumflex 0xe2
757	#define a_virguilla 0xe3
758	#define a_diaeresis 0xe4
759	#define a_ring 0xe5
760	#define c_cedilla 0xe7
761	#define e_grave 0xe8
762	#define e_acute 0xe9
763	#define e_circumflex 0xea
764	#define e_diaeresis 0xeb
765	#define i_grave 0xec
766	#define i_acute 0xed
767	#define i_circumflex 0xee
768	#define i_diaeresis 0xef
769	#define n_virguilla 0xf1
770	#define o_grave 0xf2
771	#define o_acute 0xf3
772	#define o_circumflex 0xf4
773	#define o_virguilla 0xf5
774	#define o_diaeresis 0xf6
775	#define o_slash 0xf8
776	#define u_grave 0xf9
777	#define u_acute 0xfa
778	#define u_circumflex 0xfb
779	#define u_diaeresis 0xfc
780	#define y_acute 0xfd
781	#define y_diaeresis 0xff
782	switch (c) {
783	case `'A'`: case A_grave: case A_acute: case A_circumflex:
784	case A_virguilla: case A_diaeresis: case A_ring:
785	CASEMBC(`0x100`) CASEMBC(`0x102`) CASEMBC(`0x104`)
786	CASEMBC(`0x1cd`) CASEMBC(`0x1de`) CASEMBC(`0x1e0`)
787	CASEMBC(`0x1ea2`)
788	EMIT2(`'A'`); EMIT2(A_grave); EMIT2(A_acute);
789	EMIT2(A_circumflex); EMIT2(A_virguilla);
790	EMIT2(A_diaeresis); EMIT2(A_ring);
791	EMITMBC(`0x100`) EMITMBC(`0x102`) EMITMBC(`0x104`)
792	EMITMBC(`0x1cd`) EMITMBC(`0x1de`) EMITMBC(`0x1e0`)
793	EMITMBC(`0x1ea2`)
794	return;
795
796	case `'B'`: CASEMBC(`0x1e02`) CASEMBC(`0x1e06`)
797	EMIT2(`'B'`); EMITMBC(`0x1e02`) EMITMBC(`0x1e06`)
798	return;
799
800	case `'C'`: case C_cedilla: CASEMBC(`0x106`) CASEMBC(`0x108`) CASEMBC(`0x10a`)
801	CASEMBC(`0x10c`)
802	EMIT2(`'C'`); EMIT2(C_cedilla); EMITMBC(`0x106`) EMITMBC(`0x108`)
803	EMITMBC(`0x10a`) EMITMBC(`0x10c`)
804	return;
805
806	case `'D'`: CASEMBC(`0x10e`) CASEMBC(`0x110`) CASEMBC(`0x1e0a`)
807	CASEMBC(`0x1e0e`) CASEMBC(`0x1e10`)
808	EMIT2(`'D'`); EMITMBC(`0x10e`) EMITMBC(`0x110`) EMITMBC(`0x1e0a`)
809	EMITMBC(`0x1e0e`) EMITMBC(`0x1e10`)
810	return;
811
812	case `'E'`: case E_grave: case E_acute: case E_circumflex:
813	case E_diaeresis: CASEMBC(`0x112`) CASEMBC(`0x114`)
814	CASEMBC(`0x116`) CASEMBC(`0x118`) CASEMBC(`0x11a`)
815	CASEMBC(`0x1eba`) CASEMBC(`0x1ebc`)
816	EMIT2(`'E'`); EMIT2(E_grave); EMIT2(E_acute);
817	EMIT2(E_circumflex); EMIT2(E_diaeresis);
818	EMITMBC(`0x112`) EMITMBC(`0x114`) EMITMBC(`0x116`)
819	EMITMBC(`0x118`) EMITMBC(`0x11a`) EMITMBC(`0x1eba`)
820	EMITMBC(`0x1ebc`)
821	return;
822
823	case `'F'`: CASEMBC(`0x1e1e`)
824	EMIT2(`'F'`); EMITMBC(`0x1e1e`)
825	return;
826
827	case `'G'`: CASEMBC(`0x11c`) CASEMBC(`0x11e`) CASEMBC(`0x120`)
828	CASEMBC(`0x122`) CASEMBC(`0x1e4`) CASEMBC(`0x1e6`)
829	CASEMBC(`0x1f4`) CASEMBC(`0x1e20`)
830	EMIT2(`'G'`); EMITMBC(`0x11c`) EMITMBC(`0x11e`) EMITMBC(`0x120`)
831	EMITMBC(`0x122`) EMITMBC(`0x1e4`) EMITMBC(`0x1e6`)
832	EMITMBC(`0x1f4`) EMITMBC(`0x1e20`)
833	return;
834
835	case `'H'`: CASEMBC(`0x124`) CASEMBC(`0x126`) CASEMBC(`0x1e22`)
836	CASEMBC(`0x1e26`) CASEMBC(`0x1e28`)
837	EMIT2(`'H'`); EMITMBC(`0x124`) EMITMBC(`0x126`) EMITMBC(`0x1e22`)
838	EMITMBC(`0x1e26`) EMITMBC(`0x1e28`)
839	return;
840
841	case `'I'`: case I_grave: case I_acute: case I_circumflex:
842	case I_diaeresis: CASEMBC(`0x128`) CASEMBC(`0x12a`)
843	CASEMBC(`0x12c`) CASEMBC(`0x12e`) CASEMBC(`0x130`)
844	CASEMBC(`0x1cf`) CASEMBC(`0x1ec8`)
845	EMIT2(`'I'`); EMIT2(I_grave); EMIT2(I_acute);
846	EMIT2(I_circumflex); EMIT2(I_diaeresis);
847	EMITMBC(`0x128`) EMITMBC(`0x12a`)
848	EMITMBC(`0x12c`) EMITMBC(`0x12e`) EMITMBC(`0x130`)
849	EMITMBC(`0x1cf`) EMITMBC(`0x1ec8`)
850	return;
851
852	case `'J'`: CASEMBC(`0x134`)
853	EMIT2(`'J'`); EMITMBC(`0x134`)
854	return;
855
856	case `'K'`: CASEMBC(`0x136`) CASEMBC(`0x1e8`) CASEMBC(`0x1e30`)
857	CASEMBC(`0x1e34`)
858	EMIT2(`'K'`); EMITMBC(`0x136`) EMITMBC(`0x1e8`) EMITMBC(`0x1e30`)
859	EMITMBC(`0x1e34`)
860	return;
861
862	case `'L'`: CASEMBC(`0x139`) CASEMBC(`0x13b`) CASEMBC(`0x13d`)
863	CASEMBC(`0x13f`) CASEMBC(`0x141`) CASEMBC(`0x1e3a`)
864	EMIT2(`'L'`); EMITMBC(`0x139`) EMITMBC(`0x13b`) EMITMBC(`0x13d`)
865	EMITMBC(`0x13f`) EMITMBC(`0x141`) EMITMBC(`0x1e3a`)
866	return;
867
868	case `'M'`: CASEMBC(`0x1e3e`) CASEMBC(`0x1e40`)
869	EMIT2(`'M'`); EMITMBC(`0x1e3e`) EMITMBC(`0x1e40`)
870	return;
871
872	case `'N'`: case N_virguilla: CASEMBC(`0x143`) CASEMBC(`0x145`)
873	CASEMBC(`0x147`) CASEMBC(`0x1e44`) CASEMBC(`0x1e48`)
874	EMIT2(`'N'`); EMIT2(N_virguilla);
875	EMITMBC(`0x143`) EMITMBC(`0x145`)
876	EMITMBC(`0x147`) EMITMBC(`0x1e44`) EMITMBC(`0x1e48`)
877	return;
878
879	case `'O'`: case O_grave: case O_acute: case O_circumflex:
880	case O_virguilla: case O_diaeresis: case O_slash:
881	CASEMBC(`0x14c`) CASEMBC(`0x14e`) CASEMBC(`0x150`)
882	CASEMBC(`0x1a0`) CASEMBC(`0x1d1`) CASEMBC(`0x1ea`)
883	CASEMBC(`0x1ec`) CASEMBC(`0x1ece`)
884	EMIT2(`'O'`); EMIT2(O_grave); EMIT2(O_acute);
885	EMIT2(O_circumflex); EMIT2(O_virguilla);
886	EMIT2(O_diaeresis); EMIT2(O_slash);
887	EMITMBC(`0x14c`) EMITMBC(`0x14e`) EMITMBC(`0x150`)
888	EMITMBC(`0x1a0`) EMITMBC(`0x1d1`) EMITMBC(`0x1ea`)
889	EMITMBC(`0x1ec`) EMITMBC(`0x1ece`)
890	return;
891
892	case `'P'`: case `0x1e54`: case `0x1e56`:
893	EMIT2(`'P'`); EMITMBC(`0x1e54`) EMITMBC(`0x1e56`)
894	return;
895
896	case `'R'`: CASEMBC(`0x154`) CASEMBC(`0x156`) CASEMBC(`0x158`)
897	CASEMBC(`0x1e58`) CASEMBC(`0x1e5e`)
898	EMIT2(`'R'`); EMITMBC(`0x154`) EMITMBC(`0x156`) EMITMBC(`0x158`)
899	EMITMBC(`0x1e58`) EMITMBC(`0x1e5e`)
900	return;
901
902	case `'S'`: CASEMBC(`0x15a`) CASEMBC(`0x15c`) CASEMBC(`0x15e`)
903	CASEMBC(`0x160`) CASEMBC(`0x1e60`)
904	EMIT2(`'S'`); EMITMBC(`0x15a`) EMITMBC(`0x15c`) EMITMBC(`0x15e`)
905	EMITMBC(`0x160`) EMITMBC(`0x1e60`)
906	return;
907
908	case `'T'`: CASEMBC(`0x162`) CASEMBC(`0x164`) CASEMBC(`0x166`)
909	CASEMBC(`0x1e6a`) CASEMBC(`0x1e6e`)
910	EMIT2(`'T'`); EMITMBC(`0x162`) EMITMBC(`0x164`) EMITMBC(`0x166`)
911	EMITMBC(`0x1e6a`) EMITMBC(`0x1e6e`)
912	return;
913
914	case `'U'`: case U_grave: case U_acute: case U_diaeresis:
915	case U_circumflex: CASEMBC(`0x168`) CASEMBC(`0x16a`)
916	CASEMBC(`0x16c`) CASEMBC(`0x16e`) CASEMBC(`0x170`)
917	CASEMBC(`0x172`) CASEMBC(`0x1af`) CASEMBC(`0x1d3`)
918	CASEMBC(`0x1ee6`)
919	EMIT2(`'U'`); EMIT2(U_grave); EMIT2(U_acute);
920	EMIT2(U_diaeresis); EMIT2(U_circumflex);
921	EMITMBC(`0x168`) EMITMBC(`0x16a`)
922	EMITMBC(`0x16c`) EMITMBC(`0x16e`) EMITMBC(`0x170`)
923	EMITMBC(`0x172`) EMITMBC(`0x1af`) EMITMBC(`0x1d3`)
924	EMITMBC(`0x1ee6`)
925	return;
926
927	case `'V'`: CASEMBC(`0x1e7c`)
928	EMIT2(`'V'`); EMITMBC(`0x1e7c`)
929	return;
930
931	case `'W'`: CASEMBC(`0x174`) CASEMBC(`0x1e80`) CASEMBC(`0x1e82`)
932	CASEMBC(`0x1e84`) CASEMBC(`0x1e86`)
933	EMIT2(`'W'`); EMITMBC(`0x174`) EMITMBC(`0x1e80`) EMITMBC(`0x1e82`)
934	EMITMBC(`0x1e84`) EMITMBC(`0x1e86`)
935	return;
936
937	case `'X'`: CASEMBC(`0x1e8a`) CASEMBC(`0x1e8c`)
938	EMIT2(`'X'`); EMITMBC(`0x1e8a`) EMITMBC(`0x1e8c`)
939	return;
940
941	case `'Y'`: case Y_acute: CASEMBC(`0x176`) CASEMBC(`0x178`)
942	CASEMBC(`0x1e8e`) CASEMBC(`0x1ef2`) CASEMBC(`0x1ef6`)
943	CASEMBC(`0x1ef8`)
944	EMIT2(`'Y'`); EMIT2(Y_acute);
945	EMITMBC(`0x176`) EMITMBC(`0x178`)
946	EMITMBC(`0x1e8e`) EMITMBC(`0x1ef2`) EMITMBC(`0x1ef6`)
947	EMITMBC(`0x1ef8`)
948	return;
949
950	case `'Z'`: CASEMBC(`0x179`) CASEMBC(`0x17b`) CASEMBC(`0x17d`)
951	CASEMBC(`0x1b5`) CASEMBC(`0x1e90`) CASEMBC(`0x1e94`)
952	EMIT2(`'Z'`); EMITMBC(`0x179`) EMITMBC(`0x17b`) EMITMBC(`0x17d`)
953	EMITMBC(`0x1b5`) EMITMBC(`0x1e90`) EMITMBC(`0x1e94`)
954	return;
955
956	case `'a'`: case a_grave: case a_acute: case a_circumflex:
957	case a_virguilla: case a_diaeresis: case a_ring:
958	CASEMBC(`0x101`) CASEMBC(`0x103`) CASEMBC(`0x105`)
959	CASEMBC(`0x1ce`) CASEMBC(`0x1df`) CASEMBC(`0x1e1`)
960	CASEMBC(`0x1ea3`)
961	EMIT2(`'a'`); EMIT2(a_grave); EMIT2(a_acute);
962	EMIT2(a_circumflex); EMIT2(a_virguilla);
963	EMIT2(a_diaeresis); EMIT2(a_ring);
964	EMITMBC(`0x101`) EMITMBC(`0x103`) EMITMBC(`0x105`)
965	EMITMBC(`0x1ce`) EMITMBC(`0x1df`) EMITMBC(`0x1e1`)
966	EMITMBC(`0x1ea3`)
967	return;
968
969	case `'b'`: CASEMBC(`0x1e03`) CASEMBC(`0x1e07`)
970	EMIT2(`'b'`); EMITMBC(`0x1e03`) EMITMBC(`0x1e07`)
971	return;
972
973	case `'c'`: case c_cedilla: CASEMBC(`0x107`) CASEMBC(`0x109`)
974	CASEMBC(`0x10b`) CASEMBC(`0x10d`)
975	EMIT2(`'c'`); EMIT2(c_cedilla);
976	EMITMBC(`0x107`) EMITMBC(`0x109`)
977	EMITMBC(`0x10b`) EMITMBC(`0x10d`)
978	return;
979
980	case `'d'`: CASEMBC(`0x10f`) CASEMBC(`0x111`) CASEMBC(`0x1e0b`)
981	CASEMBC(`0x1e0f`) CASEMBC(`0x1e11`)
982	EMIT2(`'d'`); EMITMBC(`0x10f`) EMITMBC(`0x111`) EMITMBC(`0x1e0b`)
983	EMITMBC(`0x1e0f`) EMITMBC(`0x1e11`)
984	return;
985
986	case `'e'`: case e_grave: case e_acute: case e_circumflex:
987	case e_diaeresis: CASEMBC(`0x113`) CASEMBC(`0x115`)
988	CASEMBC(`0x117`) CASEMBC(`0x119`) CASEMBC(`0x11b`)
989	CASEMBC(`0x1ebb`) CASEMBC(`0x1ebd`)
990	EMIT2(`'e'`); EMIT2(e_grave); EMIT2(e_acute);
991	EMIT2(e_circumflex); EMIT2(e_diaeresis);
992	EMITMBC(`0x113`) EMITMBC(`0x115`)
993	EMITMBC(`0x117`) EMITMBC(`0x119`) EMITMBC(`0x11b`)
994	EMITMBC(`0x1ebb`) EMITMBC(`0x1ebd`)
995	return;
996
997	case `'f'`: CASEMBC(`0x1e1f`)
998	EMIT2(`'f'`); EMITMBC(`0x1e1f`)
999	return;
1000
1001	case `'g'`: CASEMBC(`0x11d`) CASEMBC(`0x11f`) CASEMBC(`0x121`)
1002	CASEMBC(`0x123`) CASEMBC(`0x1e5`) CASEMBC(`0x1e7`)
1003	CASEMBC(`0x1f5`) CASEMBC(`0x1e21`)
1004	EMIT2(`'g'`); EMITMBC(`0x11d`) EMITMBC(`0x11f`) EMITMBC(`0x121`)
1005	EMITMBC(`0x123`) EMITMBC(`0x1e5`) EMITMBC(`0x1e7`)
1006	EMITMBC(`0x1f5`) EMITMBC(`0x1e21`)
1007	return;
1008
1009	case `'h'`: CASEMBC(`0x125`) CASEMBC(`0x127`) CASEMBC(`0x1e23`)
1010	CASEMBC(`0x1e27`) CASEMBC(`0x1e29`) CASEMBC(`0x1e96`)
1011	EMIT2(`'h'`); EMITMBC(`0x125`) EMITMBC(`0x127`) EMITMBC(`0x1e23`)
1012	EMITMBC(`0x1e27`) EMITMBC(`0x1e29`) EMITMBC(`0x1e96`)
1013	return;
1014
1015	case `'i'`: case i_grave: case i_acute: case i_circumflex:
1016	case i_diaeresis: CASEMBC(`0x129`) CASEMBC(`0x12b`)
1017	CASEMBC(`0x12d`) CASEMBC(`0x12f`) CASEMBC(`0x1d0`)
1018	CASEMBC(`0x1ec9`)
1019	EMIT2(`'i'`); EMIT2(i_grave); EMIT2(i_acute);
1020	EMIT2(i_circumflex); EMIT2(i_diaeresis);
1021	EMITMBC(`0x129`) EMITMBC(`0x12b`)
1022	EMITMBC(`0x12d`) EMITMBC(`0x12f`) EMITMBC(`0x1d0`)
1023	EMITMBC(`0x1ec9`)
1024	return;
1025
1026	case `'j'`: CASEMBC(`0x135`) CASEMBC(`0x1f0`)
1027	EMIT2(`'j'`); EMITMBC(`0x135`) EMITMBC(`0x1f0`)
1028	return;
1029
1030	case `'k'`: CASEMBC(`0x137`) CASEMBC(`0x1e9`) CASEMBC(`0x1e31`)
1031	CASEMBC(`0x1e35`)
1032	EMIT2(`'k'`); EMITMBC(`0x137`) EMITMBC(`0x1e9`) EMITMBC(`0x1e31`)
1033	EMITMBC(`0x1e35`)
1034	return;
1035
1036	case `'l'`: CASEMBC(`0x13a`) CASEMBC(`0x13c`) CASEMBC(`0x13e`)
1037	CASEMBC(`0x140`) CASEMBC(`0x142`) CASEMBC(`0x1e3b`)
1038	EMIT2(`'l'`); EMITMBC(`0x13a`) EMITMBC(`0x13c`) EMITMBC(`0x13e`)
1039	EMITMBC(`0x140`) EMITMBC(`0x142`) EMITMBC(`0x1e3b`)
1040	return;
1041
1042	case `'m'`: CASEMBC(`0x1e3f`) CASEMBC(`0x1e41`)
1043	EMIT2(`'m'`); EMITMBC(`0x1e3f`) EMITMBC(`0x1e41`)
1044	return;
1045
1046	case `'n'`: case n_virguilla: CASEMBC(`0x144`) CASEMBC(`0x146`)
1047	CASEMBC(`0x148`) CASEMBC(`0x149`) CASEMBC(`0x1e45`)
1048	CASEMBC(`0x1e49`)
1049	EMIT2(`'n'`); EMIT2(n_virguilla);
1050	EMITMBC(`0x144`) EMITMBC(`0x146`)
1051	EMITMBC(`0x148`) EMITMBC(`0x149`) EMITMBC(`0x1e45`)
1052	EMITMBC(`0x1e49`)
1053	return;
1054
1055	case `'o'`: case o_grave: case o_acute: case o_circumflex:
1056	case o_virguilla: case o_diaeresis: case o_slash:
1057	CASEMBC(`0x14d`) CASEMBC(`0x14f`) CASEMBC(`0x151`)
1058	CASEMBC(`0x1a1`) CASEMBC(`0x1d2`) CASEMBC(`0x1eb`)
1059	CASEMBC(`0x1ed`) CASEMBC(`0x1ecf`)
1060	EMIT2(`'o'`); EMIT2(o_grave); EMIT2(o_acute);
1061	EMIT2(o_circumflex); EMIT2(o_virguilla);
1062	EMIT2(o_diaeresis); EMIT2(o_slash);
1063	EMITMBC(`0x14d`) EMITMBC(`0x14f`) EMITMBC(`0x151`)
1064	EMITMBC(`0x1a1`) EMITMBC(`0x1d2`) EMITMBC(`0x1eb`)
1065	EMITMBC(`0x1ed`) EMITMBC(`0x1ecf`)
1066	return;
1067
1068	case `'p'`: CASEMBC(`0x1e55`) CASEMBC(`0x1e57`)
1069	EMIT2(`'p'`); EMITMBC(`0x1e55`) EMITMBC(`0x1e57`)
1070	return;
1071
1072	case `'r'`: CASEMBC(`0x155`) CASEMBC(`0x157`) CASEMBC(`0x159`)
1073	CASEMBC(`0x1e59`) CASEMBC(`0x1e5f`)
1074	EMIT2(`'r'`); EMITMBC(`0x155`) EMITMBC(`0x157`) EMITMBC(`0x159`)
1075	EMITMBC(`0x1e59`) EMITMBC(`0x1e5f`)
1076	return;
1077
1078	case `'s'`: CASEMBC(`0x15b`) CASEMBC(`0x15d`) CASEMBC(`0x15f`)
1079	CASEMBC(`0x161`) CASEMBC(`0x1e61`)
1080	EMIT2(`'s'`); EMITMBC(`0x15b`) EMITMBC(`0x15d`) EMITMBC(`0x15f`)
1081	EMITMBC(`0x161`) EMITMBC(`0x1e61`)
1082	return;
1083
1084	case `'t'`: CASEMBC(`0x163`) CASEMBC(`0x165`) CASEMBC(`0x167`)
1085	CASEMBC(`0x1e6b`) CASEMBC(`0x1e6f`) CASEMBC(`0x1e97`)
1086	EMIT2(`'t'`); EMITMBC(`0x163`) EMITMBC(`0x165`) EMITMBC(`0x167`)
1087	EMITMBC(`0x1e6b`) EMITMBC(`0x1e6f`) EMITMBC(`0x1e97`)
1088	return;
1089
1090	case `'u'`: case u_grave: case u_acute: case u_circumflex:
1091	case u_diaeresis: CASEMBC(`0x169`) CASEMBC(`0x16b`)
1092	CASEMBC(`0x16d`) CASEMBC(`0x16f`) CASEMBC(`0x171`)
1093	CASEMBC(`0x173`) CASEMBC(`0x1b0`) CASEMBC(`0x1d4`)
1094	CASEMBC(`0x1ee7`)
1095	EMIT2(`'u'`); EMIT2(u_grave); EMIT2(u_acute);
1096	EMIT2(u_circumflex); EMIT2(u_diaeresis);
1097	EMITMBC(`0x169`) EMITMBC(`0x16b`)
1098	EMITMBC(`0x16d`) EMITMBC(`0x16f`) EMITMBC(`0x171`)
1099	EMITMBC(`0x173`) EMITMBC(`0x1b0`) EMITMBC(`0x1d4`)
1100	EMITMBC(`0x1ee7`)
1101	return;
1102
1103	case `'v'`: CASEMBC(`0x1e7d`)
1104	EMIT2(`'v'`); EMITMBC(`0x1e7d`)
1105	return;
1106
1107	case `'w'`: CASEMBC(`0x175`) CASEMBC(`0x1e81`) CASEMBC(`0x1e83`)
1108	CASEMBC(`0x1e85`) CASEMBC(`0x1e87`) CASEMBC(`0x1e98`)
1109	EMIT2(`'w'`); EMITMBC(`0x175`) EMITMBC(`0x1e81`) EMITMBC(`0x1e83`)
1110	EMITMBC(`0x1e85`) EMITMBC(`0x1e87`) EMITMBC(`0x1e98`)
1111	return;
1112
1113	case `'x'`: CASEMBC(`0x1e8b`) CASEMBC(`0x1e8d`)
1114	EMIT2(`'x'`); EMITMBC(`0x1e8b`) EMITMBC(`0x1e8d`)
1115	return;
1116
1117	case `'y'`: case y_acute: case y_diaeresis: CASEMBC(`0x177`)
1118	CASEMBC(`0x1e8f`) CASEMBC(`0x1e99`) CASEMBC(`0x1ef3`)
1119	CASEMBC(`0x1ef7`) CASEMBC(`0x1ef9`)
1120	EMIT2(`'y'`); EMIT2(y_acute); EMIT2(y_diaeresis);
1121	EMITMBC(`0x177`)
1122	EMITMBC(`0x1e8f`) EMITMBC(`0x1e99`) EMITMBC(`0x1ef3`)
1123	EMITMBC(`0x1ef7`) EMITMBC(`0x1ef9`)
1124	return;
1125
1126	case `'z'`: CASEMBC(`0x17a`) CASEMBC(`0x17c`) CASEMBC(`0x17e`)
1127	CASEMBC(`0x1b6`) CASEMBC(`0x1e91`) CASEMBC(`0x1e95`)
1128	EMIT2(`'z'`); EMITMBC(`0x17a`) EMITMBC(`0x17c`) EMITMBC(`0x17e`)
1129	EMITMBC(`0x1b6`) EMITMBC(`0x1e91`) EMITMBC(`0x1e95`)
1130	return;
1131
1132	/ default: character itself /
1133	}
1134	}
1135
1136	EMIT2(c);
1137	#undef EMIT2
1138	#undef EMITMBC
1139	}
1140
1141	/*
1142	* Code to parse regular expression.
1143	*
1144	* We try to reuse parsing functions in regexp.c to
1145	* minimize surprise and keep the syntax consistent.
1146	*/
1147
1148	/*
1149	* Parse the lowest level.
1150	*
1151	* An atom can be one of a long list of items. Many atoms match one character
1152	* in the text. It is often an ordinary character or a character class.
1153	* Braces can be used to make a pattern into an atom. The "\z(\)" construct
1154	* is only for syntax highlighting.
1155	*
1156	* atom ::= ordinary-atom
1157	* or $ pattern $
1158	* or \%( pattern \)
1159	* or \z( pattern \)
1160	*/
1161	static int nfa_regatom(void)
1162	{
1163	int c;
1164	int charclass;
1165	int equiclass;
1166	int collclass;
1167	int got_coll_char;
1168	char_u *p;
1169	char_u *endp;
1170	char_u *old_regparse = regparse;
1171	int extra = `0`;
1172	int emit_range;
1173	int negated;
1174	int startc = -`1`;
1175	int endc = -`1`;
1176	int oldstartc = -`1`;
1177	int save_prev_at_start = prev_at_start;
1178
1179	c = getchr();
1180	switch (c) {
1181	case NUL:
1182	EMSG_RET_FAIL(_(e_nul_found));
1183
1184	case Magic(`'^'`):
1185	EMIT(NFA_BOL);
1186	break;
1187
1188	case Magic(`'$'`):
1189	EMIT(NFA_EOL);
1190	had_eol = TRUE;
1191	break;
1192
1193	case Magic(`'<'`):
1194	EMIT(NFA_BOW);
1195	break;
1196
1197	case Magic(`'>'`):
1198	EMIT(NFA_EOW);
1199	break;
1200
1201	case Magic(`'_'`):
1202	c = no_Magic(getchr());
1203	if (c == NUL)
1204	EMSG_RET_FAIL(_(e_nul_found));
1205
1206	if (c == `'^'`) { / "\_^" is start-of-line /
1207	EMIT(NFA_BOL);
1208	break;
1209	}
1210	if (c == `'$'`) { / "\_$" is end-of-line /
1211	EMIT(NFA_EOL);
1212	had_eol = TRUE;
1213	break;
1214	}
1215
1216	extra = NFA_ADD_NL;
1217
1218	/ "\_[" is collection plus newline /
1219	if (c == `'['`)
1220	goto collection;
1221
1222	// "\_x" is character class plus newline
1223	FALLTHROUGH;
1224
1225	/*
1226	* Character classes.
1227	*/
1228	case Magic(`'.'`):
1229	case Magic(`'i'`):
1230	case Magic(`'I'`):
1231	case Magic(`'k'`):
1232	case Magic(`'K'`):
1233	case Magic(`'f'`):
1234	case Magic(`'F'`):
1235	case Magic(`'p'`):
1236	case Magic(`'P'`):
1237	case Magic(`'s'`):
1238	case Magic(`'S'`):
1239	case Magic(`'d'`):
1240	case Magic(`'D'`):
1241	case Magic(`'x'`):
1242	case Magic(`'X'`):
1243	case Magic(`'o'`):
1244	case Magic(`'O'`):
1245	case Magic(`'w'`):
1246	case Magic(`'W'`):
1247	case Magic(`'h'`):
1248	case Magic(`'H'`):
1249	case Magic(`'a'`):
1250	case Magic(`'A'`):
1251	case Magic(`'l'`):
1252	case Magic(`'L'`):
1253	case Magic(`'u'`):
1254	case Magic(`'U'`):
1255	p = vim_strchr(classchars, no_Magic(c));
1256	if (p == NULL) {
1257	if (extra == NFA_ADD_NL) {
1258	EMSGN(_(e_ill_char_class), c);
1259	rc_did_emsg = TRUE;
1260	return FAIL;
1261	}
1262	IEMSGN("INTERNAL: Unknown character class char: %" PRId64, c);
1263	return FAIL;
1264	}
1265	// When '.' is followed by a composing char ignore the dot, so that
1266	// the composing char is matched here.
1267	if (enc_utf8 && c == Magic(`'.'`) && utf_iscomposing(peekchr())) {
1268	old_regparse = regparse;
1269	c = getchr();
1270	goto nfa_do_multibyte;
1271	}
1272	EMIT(nfa_classcodes[p - classchars]);
1273	if (extra == NFA_ADD_NL) {
1274	EMIT(NFA_NEWL);
1275	EMIT(NFA_OR);
1276	regflags \|= RF_HASNL;
1277	}
1278	break;
1279
1280	case Magic(`'n'`):
1281	if (reg_string) {
1282	// In a string "\n" matches a newline character.
1283	EMIT(NL);
1284	} else {
1285	// In buffer text "\n" matches the end of a line.
1286	EMIT(NFA_NEWL);
1287	regflags \|= RF_HASNL;
1288	}
1289	break;
1290
1291	case Magic(`'('`):
1292	if (nfa_reg(REG_PAREN) == FAIL) {
1293	return FAIL; // cascaded error
1294	}
1295	break;
1296
1297	case Magic(`'\|'`):
1298	case Magic(`'&'`):
1299	case Magic(`')'`):
1300	EMSGN(_(e_misplaced), no_Magic(c)); // -V1037
1301	return FAIL;
1302
1303	case Magic(`'='`):
1304	case Magic(`'?'`):
1305	case Magic(`'+'`):
1306	case Magic(`'@'`):
1307	case Magic(`'*'`):
1308	case Magic(`'{'`):
1309	// these should follow an atom, not form an atom
1310	EMSGN(_(e_misplaced), no_Magic(c));
1311	return FAIL;
1312
1313	case Magic(`'~'`):
1314	{
1315	char_u *lp;
1316
1317	// Previous substitute pattern.
1318	// Generated as "\%(pattern\)".
1319	if (reg_prev_sub == NULL) {
1320	EMSG(_(e_nopresub));
1321	return FAIL;
1322	}
1323	for (lp = reg_prev_sub; *lp != NUL; MB_CPTR_ADV(lp)) {
1324	EMIT(PTR2CHAR(lp));
1325	if (lp != reg_prev_sub)
1326	EMIT(NFA_CONCAT);
1327	}
1328	EMIT(NFA_NOPEN);
1329	break;
1330	}
1331
1332	case Magic(`'1'`):
1333	case Magic(`'2'`):
1334	case Magic(`'3'`):
1335	case Magic(`'4'`):
1336	case Magic(`'5'`):
1337	case Magic(`'6'`):
1338	case Magic(`'7'`):
1339	case Magic(`'8'`):
1340	case Magic(`'9'`):
1341	{
1342	int refnum = no_Magic(c) - `'1'`;
1343
1344	if (!seen_endbrace(refnum + `1`)) {
1345	return FAIL;
1346	}
1347	EMIT(NFA_BACKREF1 + refnum);
1348	nfa_has_backref = true;
1349	}
1350	break;
1351
1352	case Magic(`'z'`):
1353	c = no_Magic(getchr());
1354	switch (c) {
1355	case `'s'`:
1356	EMIT(NFA_ZSTART);
1357	if (!re_mult_next("\\zs")) {
1358	return false;
1359	}
1360	break;
1361	case `'e'`:
1362	EMIT(NFA_ZEND);
1363	nfa_has_zend = true;
1364	if (!re_mult_next("\\zs")) {
1365	return false;
1366	}
1367	break;
1368	case `'1'`:
1369	case `'2'`:
1370	case `'3'`:
1371	case `'4'`:
1372	case `'5'`:
1373	case `'6'`:
1374	case `'7'`:
1375	case `'8'`:
1376	case `'9'`:
1377	// \z1...\z9
1378	if ((reg_do_extmatch & REX_USE) == `0`) {
1379	EMSG_RET_FAIL(_(e_z1_not_allowed));
1380	}
1381	EMIT(NFA_ZREF1 + (no_Magic(c) - `'1'`));
1382	/ No need to set nfa_has_backref, the sub-matches don't*
1383	* change when \z1 .. \z9 matches or not. */
1384	re_has_z = REX_USE;
1385	break;
1386	case `'('`:
1387	// \z(
1388	if (reg_do_extmatch != REX_SET) {
1389	EMSG_RET_FAIL(_(e_z_not_allowed));
1390	}
1391	if (nfa_reg(REG_ZPAREN) == FAIL) {
1392	return FAIL; // cascaded error
1393	}
1394	re_has_z = REX_SET;
1395	break;
1396	default:
1397	emsgf(_("E867: (NFA) Unknown operator '\\z%c'"),
1398	no_Magic(c));
1399	return FAIL;
1400	}
1401	break;
1402
1403	case Magic(`'%'`):
1404	c = no_Magic(getchr());
1405	switch (c) {
1406	/ () without a back reference /
1407	case `'('`:
1408	if (nfa_reg(REG_NPAREN) == FAIL)
1409	return FAIL;
1410	EMIT(NFA_NOPEN);
1411	break;
1412
1413	case `'d'`: / %d123 decimal /
1414	case `'o'`: / %o123 octal /
1415	case `'x'`: / %xab hex 2 /
1416	case `'u'`: / %uabcd hex 4 /
1417	case `'U'`: / %U1234abcd hex 8 /
1418	{
1419	int64_t nr;
1420
1421	switch (c) {
1422	case `'d'`: nr = getdecchrs(); break;
1423	case `'o'`: nr = getoctchrs(); break;
1424	case `'x'`: nr = gethexchrs(`2`); break;
1425	case `'u'`: nr = gethexchrs(`4`); break;
1426	case `'U'`: nr = gethexchrs(`8`); break;
1427	default: nr = -`1`; break;
1428	}
1429
1430	if (nr < `0` \|\| nr > INT_MAX) {
1431	EMSG2_RET_FAIL(_("E678: Invalid character after %s%%[dxouU]"),
1432	reg_magic == MAGIC_ALL);
1433	}
1434	// A NUL is stored in the text as NL
1435	// TODO(vim): what if a composing character follows?
1436	EMIT(nr == `0` ? `0x0a` : nr);
1437	}
1438	break;
1439
1440	/ Catch \%^ and \%$ regardless of where they appear in the*
1441	* pattern -- regardless of whether or not it makes sense. */
1442	case `'^'`:
1443	EMIT(NFA_BOF);
1444	break;
1445
1446	case `'$'`:
1447	EMIT(NFA_EOF);
1448	break;
1449
1450	case `'#'`:
1451	EMIT(NFA_CURSOR);
1452	break;
1453
1454	case `'V'`:
1455	EMIT(NFA_VISUAL);
1456	break;
1457
1458	case `'C'`:
1459	EMIT(NFA_ANY_COMPOSING);
1460	break;
1461
1462	case `'['`:
1463	{
1464	int n;
1465
1466	/ \%[abc] /
1467	for (n = `0`; (c = peekchr()) != `']'`; ++n) {
1468	if (c == NUL)
1469	EMSG2_RET_FAIL(_(e_missing_sb),
1470	reg_magic == MAGIC_ALL);
1471	/ recursive call! /
1472	if (nfa_regatom() == FAIL)
1473	return FAIL;
1474	}
1475	getchr(); / get the ] /
1476	if (n == `0`)
1477	EMSG2_RET_FAIL(_(e_empty_sb),
1478	reg_magic == MAGIC_ALL);
1479	EMIT(NFA_OPT_CHARS);
1480	EMIT(n);
1481
1482	/ Emit as "\%(\%[abc]\)" to be able to handle*
1483	* "\%[abc]*" which would cause the empty string to be
1484	* matched an unlimited number of times. NFA_NOPEN is
1485	* added only once at a position, while NFA_SPLIT is
1486	* added multiple times. This is more efficient than
1487	* not allowing NFA_SPLIT multiple times, it is used
1488	* a lot. */
1489	EMIT(NFA_NOPEN);
1490	break;
1491	}
1492
1493	default:
1494	{
1495	int64_t n = `0`;
1496	const int cmp = c;
1497
1498	if (c == `'<'` \|\| c == `'>'`)
1499	c = getchr();
1500	while (ascii_isdigit(c)) {
1501	if (n > (INT32_MAX - (c - `'0'`)) / `10`) {
1502	EMSG(_("E951: \\% value too large"));
1503	return FAIL;
1504	}
1505	n = n * `10` + (c - `'0'`);
1506	c = getchr();
1507	}
1508	if (c == `'l'` \|\| c == `'c'` \|\| c == `'v'`) {
1509	int32_t limit = INT32_MAX;
1510
1511	if (c == `'l'`) {
1512	// \%{n}l \%{n}<l \%{n}>l
1513	EMIT(cmp == `'<'` ? NFA_LNUM_LT :
1514	cmp == `'>'` ? NFA_LNUM_GT : NFA_LNUM);
1515	if (save_prev_at_start) {
1516	at_start = true;
1517	}
1518	} else if (c == `'c'`) {
1519	// \%{n}c \%{n}<c \%{n}>c
1520	EMIT(cmp == `'<'` ? NFA_COL_LT :
1521	cmp == `'>'` ? NFA_COL_GT : NFA_COL);
1522	} else {
1523	// \%{n}v \%{n}<v \%{n}>v
1524	EMIT(cmp == `'<'` ? NFA_VCOL_LT :
1525	cmp == `'>'` ? NFA_VCOL_GT : NFA_VCOL);
1526	limit = INT32_MAX / MB_MAXBYTES;
1527	}
1528	if (n >= limit) {
1529	EMSG(_("E951: \\% value too large"));
1530	return FAIL;
1531	}
1532	EMIT((int)n);
1533	break;
1534	} else if (c == `'\''` && n == `0`) {
1535	/ \%'m \%<'m \%>'m /
1536	EMIT(cmp == `'<'` ? NFA_MARK_LT :
1537	cmp == `'>'` ? NFA_MARK_GT : NFA_MARK);
1538	EMIT(getchr());
1539	break;
1540	}
1541	}
1542	emsgf(_("E867: (NFA) Unknown operator '\\%%%c'"),
1543	no_Magic(c));
1544	return FAIL;
1545	}
1546	break;
1547
1548	case Magic(`'['`):
1549	collection:
1550	/*
1551	* [abc] uses NFA_START_COLL - NFA_END_COLL
1552	* [^abc] uses NFA_START_NEG_COLL - NFA_END_NEG_COLL
1553	* Each character is produced as a regular state, using
1554	* NFA_CONCAT to bind them together.
1555	* Besides normal characters there can be:
1556	* - character classes NFA_CLASS_*
1557	* - ranges, two characters followed by NFA_RANGE.
1558	*/
1559
1560	p = regparse;
1561	endp = skip_anyof(p);
1562	if (*endp == `']'`) {
1563	/*
1564	* Try to reverse engineer character classes. For example,
1565	* recognize that [0-9] stands for \d and [A-Za-z_] for \h,
1566	* and perform the necessary substitutions in the NFA.
1567	*/
1568	int result = nfa_recognize_char_class(regparse, endp,
1569	extra == NFA_ADD_NL);
1570	if (result != FAIL) {
1571	if (result >= NFA_FIRST_NL && result <= NFA_LAST_NL) {
1572	EMIT(result - NFA_ADD_NL);
1573	EMIT(NFA_NEWL);
1574	EMIT(NFA_OR);
1575	} else
1576	EMIT(result);
1577	regparse = endp;
1578	MB_PTR_ADV(regparse);
1579	return OK;
1580	}
1581	/*
1582	* Failed to recognize a character class. Use the simple
1583	* version that turns [abc] into 'a' OR 'b' OR 'c'
1584	*/
1585	startc = endc = oldstartc = -`1`;
1586	negated = false;
1587	if (regparse == `'^'`) { // negated range*
1588	negated = true;
1589	MB_PTR_ADV(regparse);
1590	EMIT(NFA_START_NEG_COLL);
1591	} else
1592	EMIT(NFA_START_COLL);
1593	if (*regparse == `'-'`) {
1594	startc = `'-'`;
1595	EMIT(startc);
1596	EMIT(NFA_CONCAT);
1597	MB_PTR_ADV(regparse);
1598	}
1599	/ Emit the OR branches for each character in the [] /
1600	emit_range = FALSE;
1601	while (regparse < endp) {
1602	oldstartc = startc;
1603	startc = -`1`;
1604	got_coll_char = FALSE;
1605	if (*regparse == `'['`) {
1606	/ Check for [: :], [= =], [. .] /
1607	equiclass = collclass = `0`;
1608	charclass = get_char_class(&regparse);
1609	if (charclass == CLASS_NONE) {
1610	equiclass = get_equi_class(&regparse);
1611	if (equiclass == `0`)
1612	collclass = get_coll_element(&regparse);
1613	}
1614
1615	/ Character class like [:alpha:] /
1616	if (charclass != CLASS_NONE) {
1617	switch (charclass) {
1618	case CLASS_ALNUM:
1619	EMIT(NFA_CLASS_ALNUM);
1620	break;
1621	case CLASS_ALPHA:
1622	EMIT(NFA_CLASS_ALPHA);
1623	break;
1624	case CLASS_BLANK:
1625	EMIT(NFA_CLASS_BLANK);
1626	break;
1627	case CLASS_CNTRL:
1628	EMIT(NFA_CLASS_CNTRL);
1629	break;
1630	case CLASS_DIGIT:
1631	EMIT(NFA_CLASS_DIGIT);
1632	break;
1633	case CLASS_GRAPH:
1634	EMIT(NFA_CLASS_GRAPH);
1635	break;
1636	case CLASS_LOWER:
1637	EMIT(NFA_CLASS_LOWER);
1638	break;
1639	case CLASS_PRINT:
1640	EMIT(NFA_CLASS_PRINT);
1641	break;
1642	case CLASS_PUNCT:
1643	EMIT(NFA_CLASS_PUNCT);
1644	break;
1645	case CLASS_SPACE:
1646	EMIT(NFA_CLASS_SPACE);
1647	break;
1648	case CLASS_UPPER:
1649	EMIT(NFA_CLASS_UPPER);
1650	break;
1651	case CLASS_XDIGIT:
1652	EMIT(NFA_CLASS_XDIGIT);
1653	break;
1654	case CLASS_TAB:
1655	EMIT(NFA_CLASS_TAB);
1656	break;
1657	case CLASS_RETURN:
1658	EMIT(NFA_CLASS_RETURN);
1659	break;
1660	case CLASS_BACKSPACE:
1661	EMIT(NFA_CLASS_BACKSPACE);
1662	break;
1663	case CLASS_ESCAPE:
1664	EMIT(NFA_CLASS_ESCAPE);
1665	break;
1666	}
1667	EMIT(NFA_CONCAT);
1668	continue;
1669	}
1670	/ Try equivalence class [=a=] and the like /
1671	if (equiclass != `0`) {
1672	nfa_emit_equi_class(equiclass);
1673	continue;
1674	}
1675	/ Try collating class like [. .] /
1676	if (collclass != `0`) {
1677	startc = collclass; / allow [.a.]-x as a range /
1678	/ Will emit the proper atom at the end of the*
1679	* while loop. */
1680	}
1681	}
1682	/ Try a range like 'a-x' or '\t-z'. Also allows '-' as a*
1683	* start character. */
1684	if (*regparse == `'-'` && oldstartc != -`1`) {
1685	emit_range = TRUE;
1686	startc = oldstartc;
1687	MB_PTR_ADV(regparse);
1688	continue; // reading the end of the range
1689	}
1690
1691	/ Now handle simple and escaped characters.*
1692	* Only "\]", "\^", "\]" and "\\" are special in Vi. Vim
1693	* accepts "\t", "\e", etc., but only when the 'l' flag in
1694	* 'cpoptions' is not included.
1695	*/
1696	if (*regparse == `'\\'`
1697	&& regparse + `1` <= endp
1698	&& (vim_strchr(REGEXP_INRANGE, regparse[`1`]) != NULL
1699	\|\| (!reg_cpo_lit
1700	&& vim_strchr(REGEXP_ABBR, regparse[`1`])
1701	!= NULL)
1702	)
1703	) {
1704	MB_PTR_ADV(regparse);
1705
1706	if (*regparse == `'n'`) {
1707	startc = (reg_string \|\| emit_range \|\| regparse[`1`] == `'-'`)
1708	? NL : NFA_NEWL;
1709	} else if (*regparse == `'d'`
1710	\|\| *regparse == `'o'`
1711	\|\| *regparse == `'x'`
1712	\|\| *regparse == `'u'`
1713	\|\| *regparse == `'U'`
1714	) {
1715	// TODO(RE): This needs more testing
1716	startc = coll_get_char();
1717	got_coll_char = true;
1718	MB_PTR_BACK(old_regparse, regparse);
1719	} else {
1720	/ \r,\t,\e,\b /
1721	startc = backslash_trans(*regparse);
1722	}
1723	}
1724
1725	/ Normal printable char /
1726	if (startc == -`1`)
1727	startc = PTR2CHAR(regparse);
1728
1729	/ Previous char was '-', so this char is end of range. /
1730	if (emit_range) {
1731	endc = startc;
1732	startc = oldstartc;
1733	if (startc > endc) {
1734	EMSG_RET_FAIL(_(e_reverse_range));
1735	}
1736
1737	if (endc > startc + `2`) {
1738	/ Emit a range instead of the sequence of*
1739	* individual characters. */
1740	if (startc == `0`)
1741	/ \x00 is translated to \x0a, start at \x01. /
1742	EMIT(`1`);
1743	else
1744	--post_ptr; / remove NFA_CONCAT /
1745	EMIT(endc);
1746	EMIT(NFA_RANGE);
1747	EMIT(NFA_CONCAT);
1748	} else if (has_mbyte && ((*mb_char2len)(startc) > `1`
1749	\|\| (*mb_char2len)(endc) > `1`)) {
1750	/ Emit the characters in the range.*
1751	* "startc" was already emitted, so skip it.
1752	* */
1753	for (c = startc + `1`; c <= endc; c++) {
1754	EMIT(c);
1755	EMIT(NFA_CONCAT);
1756	}
1757	} else {
1758	/ Emit the range. "startc" was already emitted, so*
1759	* skip it. */
1760	for (c = startc + `1`; c <= endc; c++) {
1761	EMIT(c);
1762	EMIT(NFA_CONCAT);
1763	}
1764	}
1765	emit_range = FALSE;
1766	startc = -`1`;
1767	} else {
1768	/ This char (startc) is not part of a range. Just*
1769	* emit it.
1770	* Normally, simply emit startc. But if we get char
1771	* code=0 from a collating char, then replace it with
1772	* 0x0a.
1773	* This is needed to completely mimic the behaviour of
1774	* the backtracking engine. */
1775	if (startc == NFA_NEWL) {
1776	/ Line break can't be matched as part of the*
1777	* collection, add an OR below. But not for negated
1778	* range. */
1779	if (!negated)
1780	extra = NFA_ADD_NL;
1781	} else {
1782	if (got_coll_char == TRUE && startc == `0`)
1783	EMIT(`0x0a`);
1784	else
1785	EMIT(startc);
1786	EMIT(NFA_CONCAT);
1787	}
1788	}
1789
1790	MB_PTR_ADV(regparse);
1791	} // while (p < endp)
1792
1793	MB_PTR_BACK(old_regparse, regparse);
1794	if (regparse == `'-'`) { // if last, '-' is just a char*
1795	EMIT(`'-'`);
1796	EMIT(NFA_CONCAT);
1797	}
1798
1799	/ skip the trailing ] /
1800	regparse = endp;
1801	MB_PTR_ADV(regparse);
1802
1803	/ Mark end of the collection. /
1804	if (negated == TRUE)
1805	EMIT(NFA_END_NEG_COLL);
1806	else
1807	EMIT(NFA_END_COLL);
1808
1809	/ \_[] also matches \n but it's not negated /
1810	if (extra == NFA_ADD_NL) {
1811	EMIT(reg_string ? NL : NFA_NEWL);
1812	EMIT(NFA_OR);
1813	}
1814
1815	return OK;
1816	} / if exists closing ] /
1817
1818	if (reg_strict)
1819	EMSG_RET_FAIL(_(e_missingbracket));
1820	FALLTHROUGH;
1821
1822	default:
1823	{
1824	int plen;
1825
1826	nfa_do_multibyte:
1827	// plen is length of current char with composing chars
1828	if (enc_utf8 && ((*mb_char2len)(c)
1829	!= (plen = utfc_ptr2len(old_regparse))
1830	\|\| utf_iscomposing(c))) {
1831	int i = `0`;
1832
1833	/ A base character plus composing characters, or just one*
1834	* or more composing characters.
1835	* This requires creating a separate atom as if enclosing
1836	* the characters in (), where NFA_COMPOSING is the ( and
1837	* NFA_END_COMPOSING is the ). Note that right now we are
1838	* building the postfix form, not the NFA itself;
1839	* a composing char could be: a, b, c, NFA_COMPOSING
1840	* where 'b' and 'c' are chars with codes > 256. */
1841	for (;; ) {
1842	EMIT(c);
1843	if (i > `0`)
1844	EMIT(NFA_CONCAT);
1845	if ((i += utf_char2len(c)) >= plen)
1846	break;
1847	c = utf_ptr2char(old_regparse + i);
1848	}
1849	EMIT(NFA_COMPOSING);
1850	regparse = old_regparse + plen;
1851	} else {
1852	c = no_Magic(c);
1853	EMIT(c);
1854	}
1855	return OK;
1856	}
1857	}
1858
1859	return OK;
1860	}
1861
1862	/*
1863	* Parse something followed by possible [*+=].
1864	*
1865	* A piece is an atom, possibly followed by a multi, an indication of how many
1866	* times the atom can be matched. Example: "a*" matches any sequence of "a"
1867	* characters: "", "a", "aa", etc.
1868	*
1869	* piece ::= atom
1870	* or atom multi
1871	*/
1872	static int nfa_regpiece(void)
1873	{
1874	int i;
1875	int op;
1876	int ret;
1877	long minval, maxval;
1878	int greedy = TRUE; / Braces are prefixed with '-' ? /
1879	parse_state_T old_state;
1880	parse_state_T new_state;
1881	int64_t c2;
1882	int old_post_pos;
1883	int my_post_start;
1884	int quest;
1885
1886	/ Save the current parse state, so that we can use it if <atom>{m,n} is*
1887	* next. */
1888	save_parse_state(&old_state);
1889
1890	/ store current pos in the postfix form, for \{m,n} involving 0s /
1891	my_post_start = (int)(post_ptr - post_start);
1892
1893	ret = nfa_regatom();
1894	if (ret == FAIL)
1895	return FAIL; / cascaded error /
1896
1897	op = peekchr();
1898	if (re_multi_type(op) == NOT_MULTI)
1899	return OK;
1900
1901	skipchr();
1902	switch (op) {
1903	case Magic(`'*'`):
1904	EMIT(NFA_STAR);
1905	break;
1906
1907	case Magic(`'+'`):
1908	/*
1909	* Trick: Normally, (a*)\+ would match the whole input "aaa". The
1910	* first and only submatch would be "aaa". But the backtracking
1911	* engine interprets the plus as "try matching one more time", and
1912	* a* matches a second time at the end of the input, the empty
1913	* string.
1914	* The submatch will be the empty string.
1915	*
1916	* In order to be consistent with the old engine, we replace
1917	* <atom>+ with <atom><atom>*
1918	*/
1919	restore_parse_state(&old_state);
1920	curchr = -`1`;
1921	if (nfa_regatom() == FAIL)
1922	return FAIL;
1923	EMIT(NFA_STAR);
1924	EMIT(NFA_CONCAT);
1925	skipchr(); / skip the \+ /
1926	break;
1927
1928	case Magic(`'@'`):
1929	c2 = getdecchrs();
1930	op = no_Magic(getchr());
1931	i = `0`;
1932	switch(op) {
1933	case `'='`:
1934	/ \@= /
1935	i = NFA_PREV_ATOM_NO_WIDTH;
1936	break;
1937	case `'!'`:
1938	/ \@! /
1939	i = NFA_PREV_ATOM_NO_WIDTH_NEG;
1940	break;
1941	case `'<'`:
1942	op = no_Magic(getchr());
1943	if (op == `'='`)
1944	/ \@<= /
1945	i = NFA_PREV_ATOM_JUST_BEFORE;
1946	else if (op == `'!'`)
1947	/ \@<! /
1948	i = NFA_PREV_ATOM_JUST_BEFORE_NEG;
1949	break;
1950	case `'>'`:
1951	/ \@> /
1952	i = NFA_PREV_ATOM_LIKE_PATTERN;
1953	break;
1954	}
1955	if (i == `0`) {
1956	emsgf(_("E869: (NFA) Unknown operator '\\@%c'"), op);
1957	return FAIL;
1958	}
1959	EMIT(i);
1960	if (i == NFA_PREV_ATOM_JUST_BEFORE
1961	\|\| i == NFA_PREV_ATOM_JUST_BEFORE_NEG)
1962	EMIT(c2);
1963	break;
1964
1965	case Magic(`'?'`):
1966	case Magic(`'='`):
1967	EMIT(NFA_QUEST);
1968	break;
1969
1970	case Magic(`'{'`):
1971	/ a{2,5} will expand to 'aaa?a?a?'*
1972	* a{-1,3} will expand to 'aa??a??', where ?? is the nongreedy
1973	* version of '?'
1974	* \v(ab){2,3} will expand to '(ab)(ab)(ab)?', where all the
1975	* parenthesis have the same id
1976	*/
1977
1978	greedy = TRUE;
1979	c2 = peekchr();
1980	if (c2 == `'-'` \|\| c2 == Magic(`'-'`)) {
1981	skipchr();
1982	greedy = FALSE;
1983	}
1984	if (!read_limits(&minval, &maxval))
1985	EMSG_RET_FAIL(_("E870: (NFA regexp) Error reading repetition limits"));
1986
1987	/ <atom>{0,inf}, <atom>{0,} and <atom>{} are equivalent to*
1988	* <atom>* */
1989	if (minval == `0` && maxval == MAX_LIMIT) {
1990	if (greedy)
1991	/ \{}, \{0,} /
1992	EMIT(NFA_STAR);
1993	else
1994	/ \{-}, \{-0,} /
1995	EMIT(NFA_STAR_NONGREEDY);
1996	break;
1997	}
1998
1999	/ Special case: x{0} or x{-0} /
2000	if (maxval == `0`) {
2001	/ Ignore result of previous call to nfa_regatom() /
2002	post_ptr = post_start + my_post_start;
2003	/ NFA_EMPTY is 0-length and works everywhere /
2004	EMIT(NFA_EMPTY);
2005	return OK;
2006	}
2007
2008	// The engine is very inefficient (uses too many states) when the maximum
2009	// is much larger than the minimum and when the maximum is large. Bail out
2010	// if we can use the other engine.
2011	if ((nfa_re_flags & RE_AUTO) && (maxval > `500` \|\| maxval > minval + `200`)) {
2012	return FAIL;
2013	}
2014
2015	/ Ignore previous call to nfa_regatom() /
2016	post_ptr = post_start + my_post_start;
2017	/ Save parse state after the repeated atom and the \{} /
2018	save_parse_state(&new_state);
2019
2020	quest = (greedy == TRUE ? NFA_QUEST : NFA_QUEST_NONGREEDY);
2021	for (i = `0`; i < maxval; i++) {
2022	/ Goto beginning of the repeated atom /
2023	restore_parse_state(&old_state);
2024	old_post_pos = (int)(post_ptr - post_start);
2025	if (nfa_regatom() == FAIL)
2026	return FAIL;
2027	/ after "minval" times, atoms are optional /
2028	if (i + `1` > minval) {
2029	if (maxval == MAX_LIMIT) {
2030	if (greedy)
2031	EMIT(NFA_STAR);
2032	else
2033	EMIT(NFA_STAR_NONGREEDY);
2034	} else
2035	EMIT(quest);
2036	}
2037	if (old_post_pos != my_post_start)
2038	EMIT(NFA_CONCAT);
2039	if (i + `1` > minval && maxval == MAX_LIMIT)
2040	break;
2041	}
2042
2043	/ Go to just after the repeated atom and the \{} /
2044	restore_parse_state(&new_state);
2045	curchr = -`1`;
2046
2047	break;
2048
2049
2050	default:
2051	break;
2052	} / end switch /
2053
2054	if (re_multi_type(peekchr()) != NOT_MULTI) {
2055	// Can't have a multi follow a multi.
2056	EMSG_RET_FAIL(_("E871: (NFA regexp) Can't have a multi follow a multi"));
2057	}
2058
2059	return OK;
2060	}
2061
2062	/*
2063	* Parse one or more pieces, concatenated. It matches a match for the
2064	* first piece, followed by a match for the second piece, etc. Example:
2065	* "f[0-9]b", first matches "f", then a digit and then "b".
2066	*
2067	* concat ::= piece
2068	* or piece piece
2069	* or piece piece piece
2070	* etc.
2071	*/
2072	static int nfa_regconcat(void)
2073	{
2074	int cont = TRUE;
2075	int first = TRUE;
2076
2077	while (cont) {
2078	switch (peekchr()) {
2079	case NUL:
2080	case Magic(`'\|'`):
2081	case Magic(`'&'`):
2082	case Magic(`')'`):
2083	cont = FALSE;
2084	break;
2085
2086	case Magic(`'Z'`):
2087	regflags \|= RF_ICOMBINE;
2088	skipchr_keepstart();
2089	break;
2090	case Magic(`'c'`):
2091	regflags \|= RF_ICASE;
2092	skipchr_keepstart();
2093	break;
2094	case Magic(`'C'`):
2095	regflags \|= RF_NOICASE;
2096	skipchr_keepstart();
2097	break;
2098	case Magic(`'v'`):
2099	reg_magic = MAGIC_ALL;
2100	skipchr_keepstart();
2101	curchr = -`1`;
2102	break;
2103	case Magic(`'m'`):
2104	reg_magic = MAGIC_ON;
2105	skipchr_keepstart();
2106	curchr = -`1`;
2107	break;
2108	case Magic(`'M'`):
2109	reg_magic = MAGIC_OFF;
2110	skipchr_keepstart();
2111	curchr = -`1`;
2112	break;
2113	case Magic(`'V'`):
2114	reg_magic = MAGIC_NONE;
2115	skipchr_keepstart();
2116	curchr = -`1`;
2117	break;
2118
2119	default:
2120	if (nfa_regpiece() == FAIL)
2121	return FAIL;
2122	if (first == FALSE)
2123	EMIT(NFA_CONCAT);
2124	else
2125	first = FALSE;
2126	break;
2127	}
2128	}
2129
2130	return OK;
2131	}
2132
2133	/*
2134	* Parse a branch, one or more concats, separated by "\&". It matches the
2135	* last concat, but only if all the preceding concats also match at the same
2136	* position. Examples:
2137	* "foobeep\&..." matches "foo" in "foobeep".
2138	* ".Peter\&.Bob" matches in a line containing both "Peter" and "Bob"
2139	*
2140	* branch ::= concat
2141	* or concat \& concat
2142	* or concat \& concat \& concat
2143	* etc.
2144	*/
2145	static int nfa_regbranch(void)
2146	{
2147	int old_post_pos;
2148
2149	old_post_pos = (int)(post_ptr - post_start);
2150
2151	/ First branch, possibly the only one /
2152	if (nfa_regconcat() == FAIL)
2153	return FAIL;
2154
2155	// Try next concats
2156	while (peekchr() == Magic(`'&'`)) {
2157	skipchr();
2158	// if concat is empty do emit a node
2159	if (old_post_pos == (int)(post_ptr - post_start)) {
2160	EMIT(NFA_EMPTY);
2161	}
2162	EMIT(NFA_NOPEN);
2163	EMIT(NFA_PREV_ATOM_NO_WIDTH);
2164	old_post_pos = (int)(post_ptr - post_start);
2165	if (nfa_regconcat() == FAIL)
2166	return FAIL;
2167	/ if concat is empty do emit a node /
2168	if (old_post_pos == (int)(post_ptr - post_start))
2169	EMIT(NFA_EMPTY);
2170	EMIT(NFA_CONCAT);
2171	}
2172
2173	/ if a branch is empty, emit one node for it /
2174	if (old_post_pos == (int)(post_ptr - post_start))
2175	EMIT(NFA_EMPTY);
2176
2177	return OK;
2178	}
2179
2180	/*
2181	* Parse a pattern, one or more branches, separated by "\\|". It matches
2182	* anything that matches one of the branches. Example: "foo\\|beep" matches
2183	* "foo" and matches "beep". If more than one branch matches, the first one
2184	* is used.
2185	*
2186	* pattern ::= branch
2187	* or branch \\| branch
2188	* or branch \\| branch \\| branch
2189	* etc.
2190	*/
2191	static int
2192	nfa_reg (
2193	int paren / REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN /
2194	)
2195	{
2196	int parno = `0`;
2197
2198	if (paren == REG_PAREN) {
2199	if (regnpar >= NSUBEXP) / Too many `(' /
2200	EMSG_RET_FAIL(_("E872: (NFA regexp) Too many '('"));
2201	parno = regnpar++;
2202	} else if (paren == REG_ZPAREN) {
2203	/ Make a ZOPEN node. /
2204	if (regnzpar >= NSUBEXP)
2205	EMSG_RET_FAIL(_("E879: (NFA regexp) Too many \\z("));
2206	parno = regnzpar++;
2207	}
2208
2209	if (nfa_regbranch() == FAIL)
2210	return FAIL; / cascaded error /
2211
2212	while (peekchr() == Magic(`'\|'`)) {
2213	skipchr();
2214	if (nfa_regbranch() == FAIL)
2215	return FAIL; / cascaded error /
2216	EMIT(NFA_OR);
2217	}
2218
2219	/ Check for proper termination. /
2220	if (paren != REG_NOPAREN && getchr() != Magic(`')'`)) {
2221	if (paren == REG_NPAREN)
2222	EMSG2_RET_FAIL(_(e_unmatchedpp), reg_magic == MAGIC_ALL);
2223	else
2224	EMSG2_RET_FAIL(_(e_unmatchedp), reg_magic == MAGIC_ALL);
2225	} else if (paren == REG_NOPAREN && peekchr() != NUL) {
2226	if (peekchr() == Magic(`')'`))
2227	EMSG2_RET_FAIL(_(e_unmatchedpar), reg_magic == MAGIC_ALL);
2228	else
2229	EMSG_RET_FAIL(_("E873: (NFA regexp) proper termination error"));
2230	}
2231	/*
2232	* Here we set the flag allowing back references to this set of
2233	* parentheses.
2234	*/
2235	if (paren == REG_PAREN) {
2236	had_endbrace[parno] = TRUE; / have seen the close paren /
2237	EMIT(NFA_MOPEN + parno);
2238	} else if (paren == REG_ZPAREN)
2239	EMIT(NFA_ZOPEN + parno);
2240
2241	return OK;
2242	}
2243
2244	#ifdef REGEXP_DEBUG
2245	static char_u code[`50`];
2246
2247	static void nfa_set_code(int c)
2248	{
2249	int addnl = FALSE;
2250
2251	if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL) {
2252	addnl = TRUE;
2253	c -= NFA_ADD_NL;
2254	}
2255
2256	STRCPY(code, "");
2257	switch (c) {
2258	case NFA_MATCH: STRCPY(code, "NFA_MATCH "); break;
2259	case NFA_SPLIT: STRCPY(code, "NFA_SPLIT "); break;
2260	case NFA_CONCAT: STRCPY(code, "NFA_CONCAT "); break;
2261	case NFA_NEWL: STRCPY(code, "NFA_NEWL "); break;
2262	case NFA_ZSTART: STRCPY(code, "NFA_ZSTART"); break;
2263	case NFA_ZEND: STRCPY(code, "NFA_ZEND"); break;
2264
2265	case NFA_BACKREF1: STRCPY(code, "NFA_BACKREF1"); break;
2266	case NFA_BACKREF2: STRCPY(code, "NFA_BACKREF2"); break;
2267	case NFA_BACKREF3: STRCPY(code, "NFA_BACKREF3"); break;
2268	case NFA_BACKREF4: STRCPY(code, "NFA_BACKREF4"); break;
2269	case NFA_BACKREF5: STRCPY(code, "NFA_BACKREF5"); break;
2270	case NFA_BACKREF6: STRCPY(code, "NFA_BACKREF6"); break;
2271	case NFA_BACKREF7: STRCPY(code, "NFA_BACKREF7"); break;
2272	case NFA_BACKREF8: STRCPY(code, "NFA_BACKREF8"); break;
2273	case NFA_BACKREF9: STRCPY(code, "NFA_BACKREF9"); break;
2274	case NFA_ZREF1: STRCPY(code, "NFA_ZREF1"); break;
2275	case NFA_ZREF2: STRCPY(code, "NFA_ZREF2"); break;
2276	case NFA_ZREF3: STRCPY(code, "NFA_ZREF3"); break;
2277	case NFA_ZREF4: STRCPY(code, "NFA_ZREF4"); break;
2278	case NFA_ZREF5: STRCPY(code, "NFA_ZREF5"); break;
2279	case NFA_ZREF6: STRCPY(code, "NFA_ZREF6"); break;
2280	case NFA_ZREF7: STRCPY(code, "NFA_ZREF7"); break;
2281	case NFA_ZREF8: STRCPY(code, "NFA_ZREF8"); break;
2282	case NFA_ZREF9: STRCPY(code, "NFA_ZREF9"); break;
2283	case NFA_SKIP: STRCPY(code, "NFA_SKIP"); break;
2284
2285	case NFA_PREV_ATOM_NO_WIDTH:
2286	STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break;
2287	case NFA_PREV_ATOM_NO_WIDTH_NEG:
2288	STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break;
2289	case NFA_PREV_ATOM_JUST_BEFORE:
2290	STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE"); break;
2291	case NFA_PREV_ATOM_JUST_BEFORE_NEG:
2292	STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE_NEG"); break;
2293	case NFA_PREV_ATOM_LIKE_PATTERN:
2294	STRCPY(code, "NFA_PREV_ATOM_LIKE_PATTERN"); break;
2295
2296	case NFA_NOPEN: STRCPY(code, "NFA_NOPEN"); break;
2297	case NFA_NCLOSE: STRCPY(code, "NFA_NCLOSE"); break;
2298	case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break;
2299	case NFA_START_INVISIBLE_FIRST:
2300	STRCPY(code, "NFA_START_INVISIBLE_FIRST"); break;
2301	case NFA_START_INVISIBLE_NEG:
2302	STRCPY(code, "NFA_START_INVISIBLE_NEG"); break;
2303	case NFA_START_INVISIBLE_NEG_FIRST:
2304	STRCPY(code, "NFA_START_INVISIBLE_NEG_FIRST"); break;
2305	case NFA_START_INVISIBLE_BEFORE:
2306	STRCPY(code, "NFA_START_INVISIBLE_BEFORE"); break;
2307	case NFA_START_INVISIBLE_BEFORE_FIRST:
2308	STRCPY(code, "NFA_START_INVISIBLE_BEFORE_FIRST"); break;
2309	case NFA_START_INVISIBLE_BEFORE_NEG:
2310	STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG"); break;
2311	case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
2312	STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG_FIRST"); break;
2313	case NFA_START_PATTERN: STRCPY(code, "NFA_START_PATTERN"); break;
2314	case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break;
2315	case NFA_END_INVISIBLE_NEG: STRCPY(code, "NFA_END_INVISIBLE_NEG"); break;
2316	case NFA_END_PATTERN: STRCPY(code, "NFA_END_PATTERN"); break;
2317
2318	case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break;
2319	case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break;
2320	case NFA_OPT_CHARS: STRCPY(code, "NFA_OPT_CHARS"); break;
2321
2322	case NFA_MOPEN:
2323	case NFA_MOPEN1:
2324	case NFA_MOPEN2:
2325	case NFA_MOPEN3:
2326	case NFA_MOPEN4:
2327	case NFA_MOPEN5:
2328	case NFA_MOPEN6:
2329	case NFA_MOPEN7:
2330	case NFA_MOPEN8:
2331	case NFA_MOPEN9:
2332	STRCPY(code, "NFA_MOPEN(x)");
2333	code[`10`] = c - NFA_MOPEN + `'0'`;
2334	break;
2335	case NFA_MCLOSE:
2336	case NFA_MCLOSE1:
2337	case NFA_MCLOSE2:
2338	case NFA_MCLOSE3:
2339	case NFA_MCLOSE4:
2340	case NFA_MCLOSE5:
2341	case NFA_MCLOSE6:
2342	case NFA_MCLOSE7:
2343	case NFA_MCLOSE8:
2344	case NFA_MCLOSE9:
2345	STRCPY(code, "NFA_MCLOSE(x)");
2346	code[`11`] = c - NFA_MCLOSE + `'0'`;
2347	break;
2348	case NFA_ZOPEN:
2349	case NFA_ZOPEN1:
2350	case NFA_ZOPEN2:
2351	case NFA_ZOPEN3:
2352	case NFA_ZOPEN4:
2353	case NFA_ZOPEN5:
2354	case NFA_ZOPEN6:
2355	case NFA_ZOPEN7:
2356	case NFA_ZOPEN8:
2357	case NFA_ZOPEN9:
2358	STRCPY(code, "NFA_ZOPEN(x)");
2359	code[`10`] = c - NFA_ZOPEN + `'0'`;
2360	break;
2361	case NFA_ZCLOSE:
2362	case NFA_ZCLOSE1:
2363	case NFA_ZCLOSE2:
2364	case NFA_ZCLOSE3:
2365	case NFA_ZCLOSE4:
2366	case NFA_ZCLOSE5:
2367	case NFA_ZCLOSE6:
2368	case NFA_ZCLOSE7:
2369	case NFA_ZCLOSE8:
2370	case NFA_ZCLOSE9:
2371	STRCPY(code, "NFA_ZCLOSE(x)");
2372	code[`11`] = c - NFA_ZCLOSE + `'0'`;
2373	break;
2374	case NFA_EOL: STRCPY(code, "NFA_EOL "); break;
2375	case NFA_BOL: STRCPY(code, "NFA_BOL "); break;
2376	case NFA_EOW: STRCPY(code, "NFA_EOW "); break;
2377	case NFA_BOW: STRCPY(code, "NFA_BOW "); break;
2378	case NFA_EOF: STRCPY(code, "NFA_EOF "); break;
2379	case NFA_BOF: STRCPY(code, "NFA_BOF "); break;
2380	case NFA_LNUM: STRCPY(code, "NFA_LNUM "); break;
2381	case NFA_LNUM_GT: STRCPY(code, "NFA_LNUM_GT "); break;
2382	case NFA_LNUM_LT: STRCPY(code, "NFA_LNUM_LT "); break;
2383	case NFA_COL: STRCPY(code, "NFA_COL "); break;
2384	case NFA_COL_GT: STRCPY(code, "NFA_COL_GT "); break;
2385	case NFA_COL_LT: STRCPY(code, "NFA_COL_LT "); break;
2386	case NFA_VCOL: STRCPY(code, "NFA_VCOL "); break;
2387	case NFA_VCOL_GT: STRCPY(code, "NFA_VCOL_GT "); break;
2388	case NFA_VCOL_LT: STRCPY(code, "NFA_VCOL_LT "); break;
2389	case NFA_MARK: STRCPY(code, "NFA_MARK "); break;
2390	case NFA_MARK_GT: STRCPY(code, "NFA_MARK_GT "); break;
2391	case NFA_MARK_LT: STRCPY(code, "NFA_MARK_LT "); break;
2392	case NFA_CURSOR: STRCPY(code, "NFA_CURSOR "); break;
2393	case NFA_VISUAL: STRCPY(code, "NFA_VISUAL "); break;
2394	case NFA_ANY_COMPOSING: STRCPY(code, "NFA_ANY_COMPOSING "); break;
2395
2396	case NFA_STAR: STRCPY(code, "NFA_STAR "); break;
2397	case NFA_STAR_NONGREEDY: STRCPY(code, "NFA_STAR_NONGREEDY "); break;
2398	case NFA_QUEST: STRCPY(code, "NFA_QUEST"); break;
2399	case NFA_QUEST_NONGREEDY: STRCPY(code, "NFA_QUEST_NON_GREEDY"); break;
2400	case NFA_EMPTY: STRCPY(code, "NFA_EMPTY"); break;
2401	case NFA_OR: STRCPY(code, "NFA_OR"); break;
2402
2403	case NFA_START_COLL: STRCPY(code, "NFA_START_COLL"); break;
2404	case NFA_END_COLL: STRCPY(code, "NFA_END_COLL"); break;
2405	case NFA_START_NEG_COLL: STRCPY(code, "NFA_START_NEG_COLL"); break;
2406	case NFA_END_NEG_COLL: STRCPY(code, "NFA_END_NEG_COLL"); break;
2407	case NFA_RANGE: STRCPY(code, "NFA_RANGE"); break;
2408	case NFA_RANGE_MIN: STRCPY(code, "NFA_RANGE_MIN"); break;
2409	case NFA_RANGE_MAX: STRCPY(code, "NFA_RANGE_MAX"); break;
2410
2411	case NFA_CLASS_ALNUM: STRCPY(code, "NFA_CLASS_ALNUM"); break;
2412	case NFA_CLASS_ALPHA: STRCPY(code, "NFA_CLASS_ALPHA"); break;
2413	case NFA_CLASS_BLANK: STRCPY(code, "NFA_CLASS_BLANK"); break;
2414	case NFA_CLASS_CNTRL: STRCPY(code, "NFA_CLASS_CNTRL"); break;
2415	case NFA_CLASS_DIGIT: STRCPY(code, "NFA_CLASS_DIGIT"); break;
2416	case NFA_CLASS_GRAPH: STRCPY(code, "NFA_CLASS_GRAPH"); break;
2417	case NFA_CLASS_LOWER: STRCPY(code, "NFA_CLASS_LOWER"); break;
2418	case NFA_CLASS_PRINT: STRCPY(code, "NFA_CLASS_PRINT"); break;
2419	case NFA_CLASS_PUNCT: STRCPY(code, "NFA_CLASS_PUNCT"); break;
2420	case NFA_CLASS_SPACE: STRCPY(code, "NFA_CLASS_SPACE"); break;
2421	case NFA_CLASS_UPPER: STRCPY(code, "NFA_CLASS_UPPER"); break;
2422	case NFA_CLASS_XDIGIT: STRCPY(code, "NFA_CLASS_XDIGIT"); break;
2423	case NFA_CLASS_TAB: STRCPY(code, "NFA_CLASS_TAB"); break;
2424	case NFA_CLASS_RETURN: STRCPY(code, "NFA_CLASS_RETURN"); break;
2425	case NFA_CLASS_BACKSPACE: STRCPY(code, "NFA_CLASS_BACKSPACE"); break;
2426	case NFA_CLASS_ESCAPE: STRCPY(code, "NFA_CLASS_ESCAPE"); break;
2427
2428	case NFA_ANY: STRCPY(code, "NFA_ANY"); break;
2429	case NFA_IDENT: STRCPY(code, "NFA_IDENT"); break;
2430	case NFA_SIDENT: STRCPY(code, "NFA_SIDENT"); break;
2431	case NFA_KWORD: STRCPY(code, "NFA_KWORD"); break;
2432	case NFA_SKWORD: STRCPY(code, "NFA_SKWORD"); break;
2433	case NFA_FNAME: STRCPY(code, "NFA_FNAME"); break;
2434	case NFA_SFNAME: STRCPY(code, "NFA_SFNAME"); break;
2435	case NFA_PRINT: STRCPY(code, "NFA_PRINT"); break;
2436	case NFA_SPRINT: STRCPY(code, "NFA_SPRINT"); break;
2437	case NFA_WHITE: STRCPY(code, "NFA_WHITE"); break;
2438	case NFA_NWHITE: STRCPY(code, "NFA_NWHITE"); break;
2439	case NFA_DIGIT: STRCPY(code, "NFA_DIGIT"); break;
2440	case NFA_NDIGIT: STRCPY(code, "NFA_NDIGIT"); break;
2441	case NFA_HEX: STRCPY(code, "NFA_HEX"); break;
2442	case NFA_NHEX: STRCPY(code, "NFA_NHEX"); break;
2443	case NFA_OCTAL: STRCPY(code, "NFA_OCTAL"); break;
2444	case NFA_NOCTAL: STRCPY(code, "NFA_NOCTAL"); break;
2445	case NFA_WORD: STRCPY(code, "NFA_WORD"); break;
2446	case NFA_NWORD: STRCPY(code, "NFA_NWORD"); break;
2447	case NFA_HEAD: STRCPY(code, "NFA_HEAD"); break;
2448	case NFA_NHEAD: STRCPY(code, "NFA_NHEAD"); break;
2449	case NFA_ALPHA: STRCPY(code, "NFA_ALPHA"); break;
2450	case NFA_NALPHA: STRCPY(code, "NFA_NALPHA"); break;
2451	case NFA_LOWER: STRCPY(code, "NFA_LOWER"); break;
2452	case NFA_NLOWER: STRCPY(code, "NFA_NLOWER"); break;
2453	case NFA_UPPER: STRCPY(code, "NFA_UPPER"); break;
2454	case NFA_NUPPER: STRCPY(code, "NFA_NUPPER"); break;
2455	case NFA_LOWER_IC: STRCPY(code, "NFA_LOWER_IC"); break;
2456	case NFA_NLOWER_IC: STRCPY(code, "NFA_NLOWER_IC"); break;
2457	case NFA_UPPER_IC: STRCPY(code, "NFA_UPPER_IC"); break;
2458	case NFA_NUPPER_IC: STRCPY(code, "NFA_NUPPER_IC"); break;
2459
2460	default:
2461	STRCPY(code, "CHAR(x)");
2462	code[`5`] = c;
2463	}
2464
2465	if (addnl == TRUE)
2466	STRCAT(code, " + NEWLINE ");
2467
2468	}
2469
2470	static FILE *log_fd;
2471	static char_u e_log_open_failed[] = N_(
2472	"Could not open temporary log file for writing, displaying on stderr... ");
2473
2474	/*
2475	* Print the postfix notation of the current regexp.
2476	*/
2477	static void nfa_postfix_dump(char_u expr, int* retval)
2478	{
2479	int *p;
2480	FILE *f;
2481
2482	f = fopen(NFA_REGEXP_DUMP_LOG, "a");
2483	if (f != NULL) {
2484	fprintf(f, "\n-------------------------\n");
2485	if (retval == FAIL) {
2486	fprintf(f, ">>> NFA engine failed... \n");
2487	} else if (retval == OK) {
2488	fprintf(f, ">>> NFA engine succeeded !\n");
2489	}
2490	fprintf(f, "Regexp: \"%s\"\nPostfix notation (char): \"", expr);
2491	for (p = post_start; *p && p < post_ptr; p++) {
2492	nfa_set_code(*p);
2493	fprintf(f, "%s, ", code);
2494	}
2495	fprintf(f, "\"\nPostfix notation (int): ");
2496	for (p = post_start; *p && p < post_ptr; p++)
2497	fprintf(f, "%d ", *p);
2498	fprintf(f, "\n\n");
2499	fclose(f);
2500	}
2501	}
2502
2503	/*
2504	* Print the NFA starting with a root node "state".
2505	*/
2506	static void nfa_print_state(FILE debugf, nfa_state_T state)
2507	{
2508	garray_T indent;
2509
2510	ga_init(&indent, `1`, `64`);
2511	ga_append(&indent, `'\0'`);
2512	nfa_print_state2(debugf, state, &indent);
2513	ga_clear(&indent);
2514	}
2515
2516	static void nfa_print_state2(FILE debugf, nfa_state_T state, garray_T *indent)
2517	{
2518	char_u *p;
2519
2520	if (state == NULL)
2521	return;
2522
2523	fprintf(debugf, "(%2d)", abs(state->id));
2524
2525	/ Output indent /
2526	p = (char_u *)indent->ga_data;
2527	if (indent->ga_len >= `3`) {
2528	int last = indent->ga_len - `3`;
2529	char_u save[`2`];
2530
2531	STRNCPY(save, &p[last], `2`);
2532	STRNCPY(&p[last], "+-", `2`);
2533	fprintf(debugf, " %s", p);
2534	STRNCPY(&p[last], save, `2`);
2535	} else
2536	fprintf(debugf, " %s", p);
2537
2538	nfa_set_code(state->c);
2539	fprintf(debugf, "%s (%d) (id=%d) val=%d\n",
2540	code,
2541	state->c,
2542	abs(state->id),
2543	state->val);
2544	if (state->id < `0`)
2545	return;
2546
2547	state->id = abs(state->id) * -`1`;
2548
2549	/ grow indent for state->out /
2550	indent->ga_len -= `1`;
2551	if (state->out1)
2552	ga_concat(indent, (char_u *)"\| ");
2553	else
2554	ga_concat(indent, (char_u *)" ");
2555	ga_append(indent, `'\0'`);
2556
2557	nfa_print_state2(debugf, state->out, indent);
2558
2559	/ replace last part of indent for state->out1 /
2560	indent->ga_len -= `3`;
2561	ga_concat(indent, (char_u *)" ");
2562	ga_append(indent, `'\0'`);
2563
2564	nfa_print_state2(debugf, state->out1, indent);
2565
2566	/ shrink indent /
2567	indent->ga_len -= `3`;
2568	ga_append(indent, `'\0'`);
2569	}
2570
2571	/*
2572	* Print the NFA state machine.
2573	*/
2574	static void nfa_dump(nfa_regprog_T *prog)
2575	{
2576	FILE *debugf = fopen(NFA_REGEXP_DUMP_LOG, "a");
2577
2578	if (debugf != NULL) {
2579	nfa_print_state(debugf, prog->start);
2580
2581	if (prog->reganch)
2582	fprintf(debugf, "reganch: %d\n", prog->reganch);
2583	if (prog->regstart != NUL)
2584	fprintf(debugf, "regstart: %c (decimal: %d)\n",
2585	prog->regstart, prog->regstart);
2586	if (prog->match_text != NULL)
2587	fprintf(debugf, "match_text: \"%s\"\n", prog->match_text);
2588
2589	fclose(debugf);
2590	}
2591	}
2592	#endif /* REGEXP_DEBUG */
2593
2594	/*
2595	* Parse r.e. @expr and convert it into postfix form.
2596	* Return the postfix string on success, NULL otherwise.
2597	*/
2598	static int re2post(void*)
2599	{
2600	if (nfa_reg(REG_NOPAREN) == FAIL)
2601	return NULL;
2602	EMIT(NFA_MOPEN);
2603	return post_start;
2604	}
2605
/* NB. Some of the code below is inspired by Russ's. */
2607
/*
 * Represents an NFA state plus zero or one or two arrows exiting.
 * If c == MATCH, no arrows out; matching state.
 * If c == SPLIT, unlabeled arrows to out and out1 (if != NULL).
 * If c < 256, labeled arrow with character c to out.
 */
2614
2615	static nfa_state_T state_ptr; /* points to nfa_prog->state /
2616
2617	/*
2618	* Allocate and initialize nfa_state_T.
2619	*/
2620	static nfa_state_T alloc_state(int* c, nfa_state_T out, nfa_state_T out1)
2621	{
2622	nfa_state_T *s;
2623
2624	if (istate >= nstate)
2625	return NULL;
2626
2627	s = &state_ptr[istate++];
2628
2629	s->c = c;
2630	s->out = out;
2631	s->out1 = out1;
2632	s->val = `0`;
2633
2634	s->id = istate;
2635	s->lastlist[`0`] = `0`;
2636	s->lastlist[`1`] = `0`;
2637
2638	return s;
2639	}
2640
/*
 * A partially built NFA without the matching state filled in.
 * Frag_T.start points at the start state.
 * Frag_T.out is a list of places that need to be set to the
 * next state for this fragment.
 */
2647
2648
2649	/*
2650	* Initialize a Frag_T struct and return it.
2651	*/
2652	static Frag_T frag(nfa_state_T start, Ptrlist out)
2653	{
2654	Frag_T n;
2655
2656	n.start = start;
2657	n.out = out;
2658	return n;
2659	}
2660
2661	/*
2662	* Create singleton list containing just outp.
2663	*/
2664	static Ptrlist list1(nfa_state_T *outp)
2665	{
2666	Ptrlist *l;
2667
2668	l = (Ptrlist *)outp;
2669	l->next = NULL;
2670	return l;
2671	}
2672
2673	/*
2674	* Patch the list of states at out to point to start.
2675	*/
2676	static void patch(Ptrlist l, nfa_state_T s)
2677	{
2678	Ptrlist *next;
2679
2680	for (; l; l = next) {
2681	next = l->next;
2682	l->s = s;
2683	}
2684	}
2685
2686
2687	/*
2688	* Join the two lists l1 and l2, returning the combination.
2689	*/
2690	static Ptrlist append(Ptrlist l1, Ptrlist *l2)
2691	{
2692	Ptrlist *oldl1;
2693
2694	oldl1 = l1;
2695	while (l1->next)
2696	l1 = l1->next;
2697	l1->next = l2;
2698	return oldl1;
2699	}
2700
2701	/*
2702	* Stack used for transforming postfix form into NFA.
2703	*/
2704	static Frag_T empty;
2705
/*
 * Report that popping the fragment stack failed while converting the postfix
 * form to an NFA.
 *
 * When NFA_REGEXP_ERROR_LOG is defined the postfix items from "postfix" up to
 * (not including) "end" are appended to that log file, followed by the items
 * up to and including the current position "p".  With REGEXP_DEBUG the items
 * are written by name, otherwise as numbers.  E874 is always given.
 */
static void st_error(int *postfix, int *end, int *p)
{
#ifdef NFA_REGEXP_ERROR_LOG
  FILE *df;
  int *p2;

  df = fopen(NFA_REGEXP_ERROR_LOG, "a");
  if (df) {
    fprintf(df, "Error popping the stack!\n");
# ifdef REGEXP_DEBUG
    fprintf(df, "Current regexp is \"%s\"\n", nfa_regengine.expr);
# endif
    fprintf(df, "Postfix form is: ");
# ifdef REGEXP_DEBUG
    for (p2 = postfix; p2 < end; p2++) {
      nfa_set_code(*p2);
      fprintf(df, "%s, ", code);
    }
    nfa_set_code(*p);
    fprintf(df, "\nCurrent position is: ");
    for (p2 = postfix; p2 <= p; p2++) {
      nfa_set_code(*p2);
      fprintf(df, "%s, ", code);
    }
# else
    for (p2 = postfix; p2 < end; p2++) {
      fprintf(df, "%d, ", *p2);
    }
    fprintf(df, "\nCurrent position is: ");
    for (p2 = postfix; p2 <= p; p2++) {
      fprintf(df, "%d, ", *p2);
    }
# endif
    fprintf(df, "\n--------------------------\n");
    fclose(df);
  }
#endif
  EMSG(_("E874: (NFA) Could not pop the stack!"));
}
2745
2746	/*
2747	* Push an item onto the stack.
2748	*/
2749	static void st_push(Frag_T s, Frag_T *p, Frag_T stack_end)
2750	{
2751	Frag_T stackp = p;
2752
2753	if (stackp >= stack_end)
2754	return;
2755	*stackp = s;
2756	p = p + `1`;
2757	}
2758
2759	/*
2760	* Pop an item from the stack.
2761	*/
2762	static Frag_T st_pop(Frag_T *p, Frag_T stack)
2763	{
2764	Frag_T *stackp;
2765
2766	p = p - `1`;
2767	stackp = *p;
2768	if (stackp < stack)
2769	return empty;
2770	return **p;
2771	}
2772
2773	/*
2774	* Estimate the maximum byte length of anything matching "state".
2775	* When unknown or unlimited return -1.
2776	*/
2777	static int nfa_max_width(nfa_state_T startstate, int* depth)
2778	{
2779	int l, r;
2780	nfa_state_T *state = startstate;
2781	int len = `0`;
2782
2783	/ detect looping in a NFA_SPLIT /
2784	if (depth > `4`)
2785	return -`1`;
2786
2787	while (state != NULL) {
2788	switch (state->c) {
2789	case NFA_END_INVISIBLE:
2790	case NFA_END_INVISIBLE_NEG:
2791	/ the end, return what we have /
2792	return len;
2793
2794	case NFA_SPLIT:
2795	/ two alternatives, use the maximum /
2796	l = nfa_max_width(state->out, depth + `1`);
2797	r = nfa_max_width(state->out1, depth + `1`);
2798	if (l < `0` \|\| r < `0`)
2799	return -`1`;
2800	return len + (l > r ? l : r);
2801
2802	case NFA_ANY:
2803	case NFA_START_COLL:
2804	case NFA_START_NEG_COLL:
2805	// Matches some character, including composing chars.
2806	len += MB_MAXBYTES;
2807	if (state->c != NFA_ANY) {
2808	// Skip over the characters.
2809	state = state->out1->out;
2810	continue;
2811	}
2812	break;
2813
2814	case NFA_DIGIT:
2815	case NFA_WHITE:
2816	case NFA_HEX:
2817	case NFA_OCTAL:
2818	/ ascii /
2819	++len;
2820	break;
2821
2822	case NFA_IDENT:
2823	case NFA_SIDENT:
2824	case NFA_KWORD:
2825	case NFA_SKWORD:
2826	case NFA_FNAME:
2827	case NFA_SFNAME:
2828	case NFA_PRINT:
2829	case NFA_SPRINT:
2830	case NFA_NWHITE:
2831	case NFA_NDIGIT:
2832	case NFA_NHEX:
2833	case NFA_NOCTAL:
2834	case NFA_WORD:
2835	case NFA_NWORD:
2836	case NFA_HEAD:
2837	case NFA_NHEAD:
2838	case NFA_ALPHA:
2839	case NFA_NALPHA:
2840	case NFA_LOWER:
2841	case NFA_NLOWER:
2842	case NFA_UPPER:
2843	case NFA_NUPPER:
2844	case NFA_LOWER_IC:
2845	case NFA_NLOWER_IC:
2846	case NFA_UPPER_IC:
2847	case NFA_NUPPER_IC:
2848	case NFA_ANY_COMPOSING:
2849	/ possibly non-ascii /
2850	if (has_mbyte)
2851	len += `3`;
2852	else
2853	++len;
2854	break;
2855
2856	case NFA_START_INVISIBLE:
2857	case NFA_START_INVISIBLE_NEG:
2858	case NFA_START_INVISIBLE_BEFORE:
2859	case NFA_START_INVISIBLE_BEFORE_NEG:
2860	/ zero-width, out1 points to the END state /
2861	state = state->out1->out;
2862	continue;
2863
2864	case NFA_BACKREF1:
2865	case NFA_BACKREF2:
2866	case NFA_BACKREF3:
2867	case NFA_BACKREF4:
2868	case NFA_BACKREF5:
2869	case NFA_BACKREF6:
2870	case NFA_BACKREF7:
2871	case NFA_BACKREF8:
2872	case NFA_BACKREF9:
2873	case NFA_ZREF1:
2874	case NFA_ZREF2:
2875	case NFA_ZREF3:
2876	case NFA_ZREF4:
2877	case NFA_ZREF5:
2878	case NFA_ZREF6:
2879	case NFA_ZREF7:
2880	case NFA_ZREF8:
2881	case NFA_ZREF9:
2882	case NFA_NEWL:
2883	case NFA_SKIP:
2884	/ unknown width /
2885	return -`1`;
2886
2887	case NFA_BOL:
2888	case NFA_EOL:
2889	case NFA_BOF:
2890	case NFA_EOF:
2891	case NFA_BOW:
2892	case NFA_EOW:
2893	case NFA_MOPEN:
2894	case NFA_MOPEN1:
2895	case NFA_MOPEN2:
2896	case NFA_MOPEN3:
2897	case NFA_MOPEN4:
2898	case NFA_MOPEN5:
2899	case NFA_MOPEN6:
2900	case NFA_MOPEN7:
2901	case NFA_MOPEN8:
2902	case NFA_MOPEN9:
2903	case NFA_ZOPEN:
2904	case NFA_ZOPEN1:
2905	case NFA_ZOPEN2:
2906	case NFA_ZOPEN3:
2907	case NFA_ZOPEN4:
2908	case NFA_ZOPEN5:
2909	case NFA_ZOPEN6:
2910	case NFA_ZOPEN7:
2911	case NFA_ZOPEN8:
2912	case NFA_ZOPEN9:
2913	case NFA_ZCLOSE:
2914	case NFA_ZCLOSE1:
2915	case NFA_ZCLOSE2:
2916	case NFA_ZCLOSE3:
2917	case NFA_ZCLOSE4:
2918	case NFA_ZCLOSE5:
2919	case NFA_ZCLOSE6:
2920	case NFA_ZCLOSE7:
2921	case NFA_ZCLOSE8:
2922	case NFA_ZCLOSE9:
2923	case NFA_MCLOSE:
2924	case NFA_MCLOSE1:
2925	case NFA_MCLOSE2:
2926	case NFA_MCLOSE3:
2927	case NFA_MCLOSE4:
2928	case NFA_MCLOSE5:
2929	case NFA_MCLOSE6:
2930	case NFA_MCLOSE7:
2931	case NFA_MCLOSE8:
2932	case NFA_MCLOSE9:
2933	case NFA_NOPEN:
2934	case NFA_NCLOSE:
2935
2936	case NFA_LNUM_GT:
2937	case NFA_LNUM_LT:
2938	case NFA_COL_GT:
2939	case NFA_COL_LT:
2940	case NFA_VCOL_GT:
2941	case NFA_VCOL_LT:
2942	case NFA_MARK_GT:
2943	case NFA_MARK_LT:
2944	case NFA_VISUAL:
2945	case NFA_LNUM:
2946	case NFA_CURSOR:
2947	case NFA_COL:
2948	case NFA_VCOL:
2949	case NFA_MARK:
2950
2951	case NFA_ZSTART:
2952	case NFA_ZEND:
2953	case NFA_OPT_CHARS:
2954	case NFA_EMPTY:
2955	case NFA_START_PATTERN:
2956	case NFA_END_PATTERN:
2957	case NFA_COMPOSING:
2958	case NFA_END_COMPOSING:
2959	/ zero-width /
2960	break;
2961
2962	default:
2963	if (state->c < `0`)
2964	/ don't know what this is /
2965	return -`1`;
2966	/ normal character /
2967	len += MB_CHAR2LEN(state->c);
2968	break;
2969	}
2970
2971	/ normal way to continue /
2972	state = state->out;
2973	}
2974
2975	/ unrecognized, "cannot happen" /
2976	return -`1`;
2977	}
2978
2979	/*
2980	* Convert a postfix form into its equivalent NFA.
2981	* Return the NFA start state on success, NULL otherwise.
2982	*/
2983	static nfa_state_T post2nfa(int* postfix, int* end, int* nfa_calc_size)
2984	{
2985	int *p;
2986	int mopen;
2987	int mclose;
2988	Frag_T *stack = NULL;
2989	Frag_T *stackp = NULL;
2990	Frag_T *stack_end = NULL;
2991	Frag_T e1;
2992	Frag_T e2;
2993	Frag_T e;
2994	nfa_state_T *s;
2995	nfa_state_T *s1;
2996	nfa_state_T *matchstate;
2997	nfa_state_T *ret = NULL;
2998
2999	if (postfix == NULL)
3000	return NULL;
3001
3002	#define PUSH(s) st_push((s), &stackp, stack_end)
3003	#define POP() st_pop(&stackp, stack); \
3004	if (stackp < stack) { \
3005	st_error(postfix, end, p); \
3006	xfree(stack); \
3007	return NULL; \
3008	}
3009
3010	if (nfa_calc_size == false) {
3011	// Allocate space for the stack. Max states on the stack: "nstate".
3012	stack = xmalloc((nstate + `1`) * sizeof(Frag_T));
3013	stackp = stack;
3014	stack_end = stack + (nstate + `1`);
3015	}
3016
3017	for (p = postfix; p < end; ++p) {
3018	switch (*p) {
3019	case NFA_CONCAT:
3020	/ Concatenation.*
3021	* Pay attention: this operator does not exist in the r.e. itself
3022	* (it is implicit, really). It is added when r.e. is translated
3023	* to postfix form in re2post(). */
3024	if (nfa_calc_size == TRUE) {
3025	/ nstate += 0; /
3026	break;
3027	}
3028	e2 = POP();
3029	e1 = POP();
3030	patch(e1.out, e2.start);
3031	PUSH(frag(e1.start, e2.out));
3032	break;
3033
3034	case NFA_OR:
3035	/ Alternation /
3036	if (nfa_calc_size == TRUE) {
3037	nstate++;
3038	break;
3039	}
3040	e2 = POP();
3041	e1 = POP();
3042	s = alloc_state(NFA_SPLIT, e1.start, e2.start);
3043	if (s == NULL)
3044	goto theend;
3045	PUSH(frag(s, append(e1.out, e2.out)));
3046	break;
3047
3048	case NFA_STAR:
3049	/ Zero or more, prefer more /
3050	if (nfa_calc_size == TRUE) {
3051	nstate++;
3052	break;
3053	}
3054	e = POP();
3055	s = alloc_state(NFA_SPLIT, e.start, NULL);
3056	if (s == NULL)
3057	goto theend;
3058	patch(e.out, s);
3059	PUSH(frag(s, list1(&s->out1)));
3060	break;
3061
3062	case NFA_STAR_NONGREEDY:
3063	/ Zero or more, prefer zero /
3064	if (nfa_calc_size == TRUE) {
3065	nstate++;
3066	break;
3067	}
3068	e = POP();
3069	s = alloc_state(NFA_SPLIT, NULL, e.start);
3070	if (s == NULL)
3071	goto theend;
3072	patch(e.out, s);
3073	PUSH(frag(s, list1(&s->out)));
3074	break;
3075
3076	case NFA_QUEST:
3077	/ one or zero atoms=> greedy match /
3078	if (nfa_calc_size == TRUE) {
3079	nstate++;
3080	break;
3081	}
3082	e = POP();
3083	s = alloc_state(NFA_SPLIT, e.start, NULL);
3084	if (s == NULL)
3085	goto theend;
3086	PUSH(frag(s, append(e.out, list1(&s->out1))));
3087	break;
3088
3089	case NFA_QUEST_NONGREEDY:
3090	/ zero or one atoms => non-greedy match /
3091	if (nfa_calc_size == TRUE) {
3092	nstate++;
3093	break;
3094	}
3095	e = POP();
3096	s = alloc_state(NFA_SPLIT, NULL, e.start);
3097	if (s == NULL)
3098	goto theend;
3099	PUSH(frag(s, append(e.out, list1(&s->out))));
3100	break;
3101
3102	case NFA_END_COLL:
3103	case NFA_END_NEG_COLL:
3104	/ On the stack is the sequence starting with NFA_START_COLL or*
3105	* NFA_START_NEG_COLL and all possible characters. Patch it to
3106	* add the output to the start. */
3107	if (nfa_calc_size == TRUE) {
3108	nstate++;
3109	break;
3110	}
3111	e = POP();
3112	s = alloc_state(NFA_END_COLL, NULL, NULL);
3113	if (s == NULL)
3114	goto theend;
3115	patch(e.out, s);
3116	e.start->out1 = s;
3117	PUSH(frag(e.start, list1(&s->out)));
3118	break;
3119
3120	case NFA_RANGE:
3121	/ Before this are two characters, the low and high end of a*
3122	* range. Turn them into two states with MIN and MAX. */
3123	if (nfa_calc_size == TRUE) {
3124	/ nstate += 0; /
3125	break;
3126	}
3127	e2 = POP();
3128	e1 = POP();
3129	e2.start->val = e2.start->c;
3130	e2.start->c = NFA_RANGE_MAX;
3131	e1.start->val = e1.start->c;
3132	e1.start->c = NFA_RANGE_MIN;
3133	patch(e1.out, e2.start);
3134	PUSH(frag(e1.start, e2.out));
3135	break;
3136
3137	case NFA_EMPTY:
3138	/ 0-length, used in a repetition with max/min count of 0 /
3139	if (nfa_calc_size == TRUE) {
3140	nstate++;
3141	break;
3142	}
3143	s = alloc_state(NFA_EMPTY, NULL, NULL);
3144	if (s == NULL)
3145	goto theend;
3146	PUSH(frag(s, list1(&s->out)));
3147	break;
3148
3149	case NFA_OPT_CHARS:
3150	{
3151	int n;
3152
3153	/ \%[abc] implemented as:*
3154	* NFA_SPLIT
3155	* +-CHAR(a)
3156	* \| +-NFA_SPLIT
3157	* \| +-CHAR(b)
3158	* \| \| +-NFA_SPLIT
3159	* \| \| +-CHAR(c)
3160	* \| \| \| +-next
3161	* \| \| +- next
3162	* \| +- next
3163	* +- next
3164	*/
3165	n = ++p; /* get number of characters /
3166	if (nfa_calc_size == TRUE) {
3167	nstate += n;
3168	break;
3169	}
3170	s = NULL; / avoid compiler warning /
3171	e1.out = NULL; / stores list with out1's /
3172	s1 = NULL; / previous NFA_SPLIT to connect to /
3173	while (n-- > `0`) {
3174	e = POP(); / get character /
3175	s = alloc_state(NFA_SPLIT, e.start, NULL);
3176	if (s == NULL)
3177	goto theend;
3178	if (e1.out == NULL)
3179	e1 = e;
3180	patch(e.out, s1);
3181	append(e1.out, list1(&s->out1));
3182	s1 = s;
3183	}
3184	PUSH(frag(s, e1.out));
3185	break;
3186	}
3187
3188	case NFA_PREV_ATOM_NO_WIDTH:
3189	case NFA_PREV_ATOM_NO_WIDTH_NEG:
3190	case NFA_PREV_ATOM_JUST_BEFORE:
3191	case NFA_PREV_ATOM_JUST_BEFORE_NEG:
3192	case NFA_PREV_ATOM_LIKE_PATTERN:
3193	{
3194	int before = (*p == NFA_PREV_ATOM_JUST_BEFORE
3195	\|\| *p == NFA_PREV_ATOM_JUST_BEFORE_NEG);
3196	int pattern = (*p == NFA_PREV_ATOM_LIKE_PATTERN);
3197	int start_state;
3198	int end_state;
3199	int n = `0`;
3200	nfa_state_T *zend;
3201	nfa_state_T *skip;
3202
3203	switch (*p) {
3204	case NFA_PREV_ATOM_NO_WIDTH:
3205	start_state = NFA_START_INVISIBLE;
3206	end_state = NFA_END_INVISIBLE;
3207	break;
3208	case NFA_PREV_ATOM_NO_WIDTH_NEG:
3209	start_state = NFA_START_INVISIBLE_NEG;
3210	end_state = NFA_END_INVISIBLE_NEG;
3211	break;
3212	case NFA_PREV_ATOM_JUST_BEFORE:
3213	start_state = NFA_START_INVISIBLE_BEFORE;
3214	end_state = NFA_END_INVISIBLE;
3215	break;
3216	case NFA_PREV_ATOM_JUST_BEFORE_NEG:
3217	start_state = NFA_START_INVISIBLE_BEFORE_NEG;
3218	end_state = NFA_END_INVISIBLE_NEG;
3219	break;
3220	default: / NFA_PREV_ATOM_LIKE_PATTERN: /
3221	start_state = NFA_START_PATTERN;
3222	end_state = NFA_END_PATTERN;
3223	break;
3224	}
3225
3226	if (before)
3227	n = ++p; /* get the count /
3228
3229	/ The \@= operator: match the preceding atom with zero width.*
3230	* The \@! operator: no match for the preceding atom.
3231	* The \@<= operator: match for the preceding atom.
3232	* The \@<! operator: no match for the preceding atom.
3233	* Surrounds the preceding atom with START_INVISIBLE and
3234	* END_INVISIBLE, similarly to MOPEN. */
3235
3236	if (nfa_calc_size == TRUE) {
3237	nstate += pattern ? `4` : `2`;
3238	break;
3239	}
3240	e = POP();
3241	s1 = alloc_state(end_state, NULL, NULL);
3242	if (s1 == NULL)
3243	goto theend;
3244
3245	s = alloc_state(start_state, e.start, s1);
3246	if (s == NULL)
3247	goto theend;
3248	if (pattern) {
3249	/ NFA_ZEND -> NFA_END_PATTERN -> NFA_SKIP -> what follows. /
3250	skip = alloc_state(NFA_SKIP, NULL, NULL);
3251	if (skip == NULL) {
3252	goto theend;
3253	}
3254	zend = alloc_state(NFA_ZEND, s1, NULL);
3255	if (zend == NULL) {
3256	goto theend;
3257	}
3258	s1->out= skip;
3259	patch(e.out, zend);
3260	PUSH(frag(s, list1(&skip->out)));
3261	} else {
3262	patch(e.out, s1);
3263	PUSH(frag(s, list1(&s1->out)));
3264	if (before) {
3265	if (n <= `0`)
3266	/ See if we can guess the maximum width, it avoids a*
3267	* lot of pointless tries. */
3268	n = nfa_max_width(e.start, `0`);
3269	s->val = n; / store the count /
3270	}
3271	}
3272	break;
3273	}
3274
3275	case NFA_COMPOSING: // char with composing char
3276	FALLTHROUGH;
3277
3278	case NFA_MOPEN: / Submatch /
3279	case NFA_MOPEN1:
3280	case NFA_MOPEN2:
3281	case NFA_MOPEN3:
3282	case NFA_MOPEN4:
3283	case NFA_MOPEN5:
3284	case NFA_MOPEN6:
3285	case NFA_MOPEN7:
3286	case NFA_MOPEN8:
3287	case NFA_MOPEN9:
3288	case NFA_ZOPEN: / \z( \) Submatch /
3289	case NFA_ZOPEN1:
3290	case NFA_ZOPEN2:
3291	case NFA_ZOPEN3:
3292	case NFA_ZOPEN4:
3293	case NFA_ZOPEN5:
3294	case NFA_ZOPEN6:
3295	case NFA_ZOPEN7:
3296	case NFA_ZOPEN8:
3297	case NFA_ZOPEN9:
3298	case NFA_NOPEN: / \%( \) "Invisible Submatch" /
3299	if (nfa_calc_size == TRUE) {
3300	nstate += `2`;
3301	break;
3302	}
3303
3304	mopen = *p;
3305	switch (*p) {
3306	case NFA_NOPEN: mclose = NFA_NCLOSE; break;
3307	case NFA_ZOPEN: mclose = NFA_ZCLOSE; break;
3308	case NFA_ZOPEN1: mclose = NFA_ZCLOSE1; break;
3309	case NFA_ZOPEN2: mclose = NFA_ZCLOSE2; break;
3310	case NFA_ZOPEN3: mclose = NFA_ZCLOSE3; break;
3311	case NFA_ZOPEN4: mclose = NFA_ZCLOSE4; break;
3312	case NFA_ZOPEN5: mclose = NFA_ZCLOSE5; break;
3313	case NFA_ZOPEN6: mclose = NFA_ZCLOSE6; break;
3314	case NFA_ZOPEN7: mclose = NFA_ZCLOSE7; break;
3315	case NFA_ZOPEN8: mclose = NFA_ZCLOSE8; break;
3316	case NFA_ZOPEN9: mclose = NFA_ZCLOSE9; break;
3317	case NFA_COMPOSING: mclose = NFA_END_COMPOSING; break;
3318	default:
3319	/ NFA_MOPEN, NFA_MOPEN1 .. NFA_MOPEN9 /
3320	mclose = *p + NSUBEXP;
3321	break;
3322	}
3323
3324	/ Allow "NFA_MOPEN" as a valid postfix representation for*
3325	* the empty regexp "". In this case, the NFA will be
3326	* NFA_MOPEN -> NFA_MCLOSE. Note that this also allows
3327	* empty groups of parenthesis, and empty mbyte chars */
3328	if (stackp == stack) {
3329	s = alloc_state(mopen, NULL, NULL);
3330	if (s == NULL)
3331	goto theend;
3332	s1 = alloc_state(mclose, NULL, NULL);
3333	if (s1 == NULL)
3334	goto theend;
3335	patch(list1(&s->out), s1);
3336	PUSH(frag(s, list1(&s1->out)));
3337	break;
3338	}
3339
3340	/ At least one node was emitted before NFA_MOPEN, so*
3341	* at least one node will be between NFA_MOPEN and NFA_MCLOSE */
3342	e = POP();
3343	s = alloc_state(mopen, e.start, NULL); / `(' /
3344	if (s == NULL)
3345	goto theend;
3346
3347	s1 = alloc_state(mclose, NULL, NULL); / `)' /
3348	if (s1 == NULL)
3349	goto theend;
3350	patch(e.out, s1);
3351
3352	if (mopen == NFA_COMPOSING)
3353	/ COMPOSING->out1 = END_COMPOSING /
3354	patch(list1(&s->out1), s1);
3355
3356	PUSH(frag(s, list1(&s1->out)));
3357	break;
3358
3359	case NFA_BACKREF1:
3360	case NFA_BACKREF2:
3361	case NFA_BACKREF3:
3362	case NFA_BACKREF4:
3363	case NFA_BACKREF5:
3364	case NFA_BACKREF6:
3365	case NFA_BACKREF7:
3366	case NFA_BACKREF8:
3367	case NFA_BACKREF9:
3368	case NFA_ZREF1:
3369	case NFA_ZREF2:
3370	case NFA_ZREF3:
3371	case NFA_ZREF4:
3372	case NFA_ZREF5:
3373	case NFA_ZREF6:
3374	case NFA_ZREF7:
3375	case NFA_ZREF8:
3376	case NFA_ZREF9:
3377	if (nfa_calc_size == TRUE) {
3378	nstate += `2`;
3379	break;
3380	}
3381	s = alloc_state(*p, NULL, NULL);
3382	if (s == NULL)
3383	goto theend;
3384	s1 = alloc_state(NFA_SKIP, NULL, NULL);
3385	if (s1 == NULL)
3386	goto theend;
3387	patch(list1(&s->out), s1);
3388	PUSH(frag(s, list1(&s1->out)));
3389	break;
3390
3391	case NFA_LNUM:
3392	case NFA_LNUM_GT:
3393	case NFA_LNUM_LT:
3394	case NFA_VCOL:
3395	case NFA_VCOL_GT:
3396	case NFA_VCOL_LT:
3397	case NFA_COL:
3398	case NFA_COL_GT:
3399	case NFA_COL_LT:
3400	case NFA_MARK:
3401	case NFA_MARK_GT:
3402	case NFA_MARK_LT:
3403	{
3404	int n = ++p; /* lnum, col or mark name /
3405
3406	if (nfa_calc_size == TRUE) {
3407	nstate += `1`;
3408	break;
3409	}
3410	s = alloc_state(p[-`1`], NULL, NULL);
3411	if (s == NULL)
3412	goto theend;
3413	s->val = n;
3414	PUSH(frag(s, list1(&s->out)));
3415	break;
3416	}
3417
3418	case NFA_ZSTART:
3419	case NFA_ZEND:
3420	default:
3421	/ Operands /
3422	if (nfa_calc_size == TRUE) {
3423	nstate++;
3424	break;
3425	}
3426	s = alloc_state(*p, NULL, NULL);
3427	if (s == NULL)
3428	goto theend;
3429	PUSH(frag(s, list1(&s->out)));
3430	break;
3431
3432	} / switch(p) /*
3433
3434	} / for(p = postfix; p; ++p) /*
3435
3436	if (nfa_calc_size == TRUE) {
3437	nstate++;
3438	goto theend; / Return value when counting size is ignored anyway /
3439	}
3440
3441	e = POP();
3442	if (stackp != stack) {
3443	xfree(stack);
3444	EMSG_RET_NULL(_("E875: (NFA regexp) (While converting from postfix to NFA),"
3445	"too many states left on stack"));
3446	}
3447
3448	if (istate >= nstate) {
3449	xfree(stack);
3450	EMSG_RET_NULL(_("E876: (NFA regexp) "
3451	"Not enough space to store the whole NFA "));
3452	}
3453
3454	matchstate = &state_ptr[istate++]; / the match state /
3455	matchstate->c = NFA_MATCH;
3456	matchstate->out = matchstate->out1 = NULL;
3457	matchstate->id = `0`;
3458
3459	patch(e.out, matchstate);
3460	ret = e.start;
3461
3462	theend:
3463	xfree(stack);
3464	return ret;
3465
3466	#undef POP1
3467	#undef PUSH1
3468	#undef POP2
3469	#undef PUSH2
3470	#undef POP
3471	#undef PUSH
3472	}
3473
3474	/*
3475	* After building the NFA program, inspect it to add optimization hints.
3476	*/
3477	static void nfa_postprocess(nfa_regprog_T *prog)
3478	{
3479	int i;
3480	int c;
3481
3482	for (i = `0`; i < prog->nstate; ++i) {
3483	c = prog->state[i].c;
3484	if (c == NFA_START_INVISIBLE
3485	\|\| c == NFA_START_INVISIBLE_NEG
3486	\|\| c == NFA_START_INVISIBLE_BEFORE
3487	\|\| c == NFA_START_INVISIBLE_BEFORE_NEG) {
3488	int directly;
3489
3490	/ Do it directly when what follows is possibly the end of the*
3491	* match. */
3492	if (match_follows(prog->state[i].out1->out, `0`))
3493	directly = TRUE;
3494	else {
3495	int ch_invisible = failure_chance(prog->state[i].out, `0`);
3496	int ch_follows = failure_chance(prog->state[i].out1->out, `0`);
3497
3498	/ Postpone when the invisible match is expensive or has a*
3499	* lower chance of failing. */
3500	if (c == NFA_START_INVISIBLE_BEFORE
3501	\|\| c == NFA_START_INVISIBLE_BEFORE_NEG) {
3502	/ "before" matches are very expensive when*
3503	* unbounded, always prefer what follows then,
3504	* unless what follows will always match.
3505	* Otherwise strongly prefer what follows. */
3506	if (prog->state[i].val <= `0` && ch_follows > `0`)
3507	directly = FALSE;
3508	else
3509	directly = ch_follows * `10` < ch_invisible;
3510	} else {
3511	/ normal invisible, first do the one with the*
3512	* highest failure chance */
3513	directly = ch_follows < ch_invisible;
3514	}
3515	}
3516	if (directly)
3517	/ switch to the _FIRST state /
3518	++prog->state[i].c;
3519	}
3520	}
3521	}
3522
3523	/****************************************************************
3524	* NFA execution code.
3525	****************************************************************/
3526
// Values for "result" in nfa_pim_T.
#define NFA_PIM_UNUSED   0      /* pim not used */
#define NFA_PIM_TODO     1      /* pim not done yet */
#define NFA_PIM_MATCH    2      /* pim executed, matches */
#define NFA_PIM_NOMATCH  3      /* pim executed, no match */
3532
3533
3534	#ifdef REGEXP_DEBUG
3535	static void log_subsexpr(regsubs_T *subs)
3536	{
3537	log_subexpr(&subs->norm);
3538	if (nfa_has_zsubexpr)
3539	log_subexpr(&subs->synt);
3540	}
3541
3542	static void log_subexpr(regsub_T *sub)
3543	{
3544	int j;
3545
3546	for (j = `0`; j < sub->in_use; j++)
3547	if (REG_MULTI)
3548	fprintf(log_fd, "*** group %d, start: c=%d, l=%d, end: c=%d, l=%d\n",
3549	j,
3550	sub->list.multi[j].start_col,
3551	(int)sub->list.multi[j].start_lnum,
3552	sub->list.multi[j].end_col,
3553	(int)sub->list.multi[j].end_lnum);
3554	else {
3555	char s = (char* *)sub->list.line[j].start;
3556	char e = (char* *)sub->list.line[j].end;
3557
3558	fprintf(log_fd, "*** group %d, start: \"%s\", end: \"%s\"\n",
3559	j,
3560	s == NULL ? "NULL" : s,
3561	e == NULL ? "NULL" : e);
3562	}
3563	}
3564
3565	static char pim_info(nfa_pim_T pim)
3566	{
3567	static char buf[`30`];
3568
3569	if (pim == NULL \|\| pim->result == NFA_PIM_UNUSED)
3570	buf[`0`] = NUL;
3571	else {
3572	sprintf(buf, " PIM col %d", REG_MULTI ? (int)pim->end.pos.col
3573	: (int)(pim->end.ptr - reginput));
3574	}
3575	return buf;
3576	}
3577
3578	#endif
3579
// Used during execution: whether a match has been found.
static int nfa_match;
// NOTE(review): the three variables below are not used in the visible part of
// this file; judging by their names they hold the time limit for matching, a
// flag reporting that the limit was hit and a counter for throttling the time
// check -- confirm against nfa_regmatch().
static proftime_T  *nfa_time_limit;
static int         *nfa_timed_out;
static int         nfa_time_count;
3585
3586	// Copy postponed invisible match info from "from" to "to".
3587	static void copy_pim(nfa_pim_T to, nfa_pim_T from)
3588	{
3589	to->result = from->result;
3590	to->state = from->state;
3591	copy_sub(&to->subs.norm, &from->subs.norm);
3592	if (nfa_has_zsubexpr)
3593	copy_sub(&to->subs.synt, &from->subs.synt);
3594	to->end = from->end;
3595	}
3596
3597	static void clear_sub(regsub_T *sub)
3598	{
3599	if (REG_MULTI)
3600	/ Use 0xff to set lnum to -1 /
3601	memset(sub->list.multi, `0xff`,
3602	sizeof(struct multipos) * nfa_nsubexpr);
3603	else
3604	memset(sub->list.line, `0`, sizeof(struct linepos) * nfa_nsubexpr);
3605	sub->in_use = `0`;
3606	}
3607
3608	/*
3609	* Copy the submatches from "from" to "to".
3610	*/
3611	static void copy_sub(regsub_T to, regsub_T from)
3612	{
3613	to->in_use = from->in_use;
3614	if (from->in_use > `0`) {
3615	/ Copy the match start and end positions. /
3616	if (REG_MULTI)
3617	memmove(&to->list.multi[`0`],
3618	&from->list.multi[`0`],
3619	sizeof(struct multipos) * from->in_use);
3620	else
3621	memmove(&to->list.line[`0`],
3622	&from->list.line[`0`],
3623	sizeof(struct linepos) * from->in_use);
3624	}
3625	}
3626
3627	/*
3628	* Like copy_sub() but exclude the main match.
3629	*/
3630	static void copy_sub_off(regsub_T to, regsub_T from)
3631	{
3632	if (to->in_use < from->in_use)
3633	to->in_use = from->in_use;
3634	if (from->in_use > `1`) {
3635	/ Copy the match start and end positions. /
3636	if (REG_MULTI)
3637	memmove(&to->list.multi[`1`],
3638	&from->list.multi[`1`],
3639	sizeof(struct multipos) * (from->in_use - `1`));
3640	else
3641	memmove(&to->list.line[`1`],
3642	&from->list.line[`1`],
3643	sizeof(struct linepos) * (from->in_use - `1`));
3644	}
3645	}
3646
3647	/*
3648	* Like copy_sub() but only do the end of the main match if \ze is present.
3649	*/
3650	static void copy_ze_off(regsub_T to, regsub_T from)
3651	{
3652	if (nfa_has_zend) {
3653	if (REG_MULTI) {
3654	if (from->list.multi[`0`].end_lnum >= `0`){
3655	to->list.multi[`0`].end_lnum = from->list.multi[`0`].end_lnum;
3656	to->list.multi[`0`].end_col = from->list.multi[`0`].end_col;
3657	}
3658	} else {
3659	if (from->list.line[`0`].end != NULL)
3660	to->list.line[`0`].end = from->list.line[`0`].end;
3661	}
3662	}
3663	}
3664
3665	// Return TRUE if "sub1" and "sub2" have the same start positions.
3666	// When using back-references also check the end position.
3667	static int sub_equal(regsub_T sub1, regsub_T sub2)
3668	{
3669	int i;
3670	int todo;
3671	linenr_T s1;
3672	linenr_T s2;
3673	char_u *sp1;
3674	char_u *sp2;
3675
3676	todo = sub1->in_use > sub2->in_use ? sub1->in_use : sub2->in_use;
3677	if (REG_MULTI) {
3678	for (i = `0`; i < todo; ++i) {
3679	if (i < sub1->in_use)
3680	s1 = sub1->list.multi[i].start_lnum;
3681	else
3682	s1 = -`1`;
3683	if (i < sub2->in_use)
3684	s2 = sub2->list.multi[i].start_lnum;
3685	else
3686	s2 = -`1`;
3687	if (s1 != s2)
3688	return FALSE;
3689	if (s1 != -`1` && sub1->list.multi[i].start_col
3690	!= sub2->list.multi[i].start_col)
3691	return FALSE;
3692
3693	if (nfa_has_backref) {
3694	if (i < sub1->in_use) {
3695	s1 = sub1->list.multi[i].end_lnum;
3696	} else {
3697	s1 = -`1`;
3698	}
3699	if (i < sub2->in_use) {
3700	s2 = sub2->list.multi[i].end_lnum;
3701	} else {
3702	s2 = -`1`;
3703	}
3704	if (s1 != s2) {
3705	return FALSE;
3706	}
3707	if (s1 != -`1`
3708	&& sub1->list.multi[i].end_col != sub2->list.multi[i].end_col) {
3709	return FALSE;
3710	}
3711	}
3712	}
3713	} else {
3714	for (i = `0`; i < todo; ++i) {
3715	if (i < sub1->in_use)
3716	sp1 = sub1->list.line[i].start;
3717	else
3718	sp1 = NULL;
3719	if (i < sub2->in_use)
3720	sp2 = sub2->list.line[i].start;
3721	else
3722	sp2 = NULL;
3723	if (sp1 != sp2)
3724	return FALSE;
3725
3726	if (nfa_has_backref) {
3727	if (i < sub1->in_use) {
3728	sp1 = sub1->list.line[i].end;
3729	} else {
3730	sp1 = NULL;
3731	}
3732	if (i < sub2->in_use) {
3733	sp2 = sub2->list.line[i].end;
3734	} else {
3735	sp2 = NULL;
3736	}
3737	if (sp1 != sp2) {
3738	return FALSE;
3739	}
3740	}
3741	}
3742	}
3743
3744	return TRUE;
3745	}
3746
3747	#ifdef REGEXP_DEBUG
3748	static void report_state(char *action,
3749	regsub_T *sub,
3750	nfa_state_T *state,
3751	int lid,
3752	nfa_pim_T *pim) {
3753	int col;
3754
3755	if (sub->in_use <= `0`)
3756	col = -`1`;
3757	else if (REG_MULTI)
3758	col = sub->list.multi[`0`].start_col;
3759	else
3760	col = (int)(sub->list.line[`0`].start - regline);
3761	nfa_set_code(state->c);
3762	fprintf(log_fd, "> %s state %d to list %d. char %d: %s (start col %d)%s\n",
3763	action, abs(state->id), lid, state->c, code, col,
3764	pim_info(pim));
3765	}
3766
3767	#endif
3768
3769	/*
3770	* Return TRUE if the same state is already in list "l" with the same
3771	* positions as "subs".
3772	*/
3773	static int
3774	has_state_with_pos (
3775	nfa_list_T l, /* runtime state list /
3776	nfa_state_T state, /* state to update /
3777	regsubs_T subs, /* pointers to subexpressions /
3778	nfa_pim_T pim /* postponed match or NULL /
3779	)
3780	{
3781	nfa_thread_T *thread;
3782	int i;
3783
3784	for (i = `0`; i < l->n; ++i) {
3785	thread = &l->t[i];
3786	if (thread->state->id == state->id
3787	&& sub_equal(&thread->subs.norm, &subs->norm)
3788	&& (!nfa_has_zsubexpr
3789	\|\| sub_equal(&thread->subs.synt, &subs->synt))
3790	&& pim_equal(&thread->pim, pim))
3791	return TRUE;
3792	}
3793	return FALSE;
3794	}
3795
3796	/*
3797	* Return TRUE if "one" and "two" are equal. That includes when both are not
3798	* set.
3799	*/
3800	static int pim_equal(nfa_pim_T one, nfa_pim_T two)
3801	{
3802	int one_unused = (one == NULL \|\| one->result == NFA_PIM_UNUSED);
3803	int two_unused = (two == NULL \|\| two->result == NFA_PIM_UNUSED);
3804
3805	if (one_unused)
3806	/ one is unused: equal when two is also unused /
3807	return two_unused;
3808	if (two_unused)
3809	/ one is used and two is not: not equal /
3810	return FALSE;
3811	/ compare the state id /
3812	if (one->state->id != two->state->id)
3813	return FALSE;
3814	/ compare the position /
3815	if (REG_MULTI)
3816	return one->end.pos.lnum == two->end.pos.lnum
3817	&& one->end.pos.col == two->end.pos.col;
3818	return one->end.ptr == two->end.ptr;
3819	}
3820
3821	/*
3822	* Return TRUE if "state" leads to a NFA_MATCH without advancing the input.
3823	*/
3824	static int match_follows(nfa_state_T startstate, int* depth)
3825	{
3826	nfa_state_T *state = startstate;
3827
3828	/ avoid too much recursion /
3829	if (depth > `10`)
3830	return FALSE;
3831
3832	while (state != NULL) {
3833	switch (state->c) {
3834	case NFA_MATCH:
3835	case NFA_MCLOSE:
3836	case NFA_END_INVISIBLE:
3837	case NFA_END_INVISIBLE_NEG:
3838	case NFA_END_PATTERN:
3839	return TRUE;
3840
3841	case NFA_SPLIT:
3842	return match_follows(state->out, depth + `1`)
3843	\|\| match_follows(state->out1, depth + `1`);
3844
3845	case NFA_START_INVISIBLE:
3846	case NFA_START_INVISIBLE_FIRST:
3847	case NFA_START_INVISIBLE_BEFORE:
3848	case NFA_START_INVISIBLE_BEFORE_FIRST:
3849	case NFA_START_INVISIBLE_NEG:
3850	case NFA_START_INVISIBLE_NEG_FIRST:
3851	case NFA_START_INVISIBLE_BEFORE_NEG:
3852	case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
3853	case NFA_COMPOSING:
3854	/ skip ahead to next state /
3855	state = state->out1->out;
3856	continue;
3857
3858	case NFA_ANY:
3859	case NFA_ANY_COMPOSING:
3860	case NFA_IDENT:
3861	case NFA_SIDENT:
3862	case NFA_KWORD:
3863	case NFA_SKWORD:
3864	case NFA_FNAME:
3865	case NFA_SFNAME:
3866	case NFA_PRINT:
3867	case NFA_SPRINT:
3868	case NFA_WHITE:
3869	case NFA_NWHITE:
3870	case NFA_DIGIT:
3871	case NFA_NDIGIT:
3872	case NFA_HEX:
3873	case NFA_NHEX:
3874	case NFA_OCTAL:
3875	case NFA_NOCTAL:
3876	case NFA_WORD:
3877	case NFA_NWORD:
3878	case NFA_HEAD:
3879	case NFA_NHEAD:
3880	case NFA_ALPHA:
3881	case NFA_NALPHA:
3882	case NFA_LOWER:
3883	case NFA_NLOWER:
3884	case NFA_UPPER:
3885	case NFA_NUPPER:
3886	case NFA_LOWER_IC:
3887	case NFA_NLOWER_IC:
3888	case NFA_UPPER_IC:
3889	case NFA_NUPPER_IC:
3890	case NFA_START_COLL:
3891	case NFA_START_NEG_COLL:
3892	case NFA_NEWL:
3893	/ state will advance input /
3894	return FALSE;
3895
3896	default:
3897	if (state->c > `0`)
3898	/ state will advance input /
3899	return FALSE;
3900
3901	/ Others: zero-width or possibly zero-width, might still find*
3902	* a match at the same position, keep looking. */
3903	break;
3904	}
3905	state = state->out;
3906	}
3907	return FALSE;
3908	}
3909
3910
3911	/*
3912	* Return TRUE if "state" is already in list "l".
3913	*/
3914	static int
3915	state_in_list (
3916	nfa_list_T l, /* runtime state list /
3917	nfa_state_T state, /* state to update /
3918	regsubs_T subs /* pointers to subexpressions /
3919	)
3920	{
3921	if (state->lastlist[nfa_ll_index] == l->id) {
3922	if (!nfa_has_backref \|\| has_state_with_pos(l, state, subs, NULL))
3923	return TRUE;
3924	}
3925	return FALSE;
3926	}
3927
3928	// Offset used for "off" by addstate_here().
3929	#define ADDSTATE_HERE_OFFSET 10
3930
3931	// Add "state" and possibly what follows to state list ".".
3932	// Returns "subs_arg", possibly copied into temp_subs.
3933	// Returns NULL when recursiveness is too deep.
3934	static regsubs_T *addstate(
3935	nfa_list_T l, // runtime state list*
3936	nfa_state_T state, // state to update*
3937	regsubs_T subs_arg, // pointers to subexpressions*
3938	nfa_pim_T pim, // postponed look-behind match*
3939	int off_arg) // byte offset, when -1 go to next line
3940	FUNC_ATTR_NONNULL_ARG(`1`, `2`) FUNC_ATTR_WARN_UNUSED_RESULT
3941	{
3942	int subidx;
3943	int off = off_arg;
3944	int add_here = FALSE;
3945	int listindex = `0`;
3946	int k;
3947	int found = FALSE;
3948	nfa_thread_T *thread;
3949	struct multipos save_multipos;
3950	int save_in_use;
3951	char_u *save_ptr;
3952	int i;
3953	regsub_T *sub;
3954	regsubs_T *subs = subs_arg;
3955	static regsubs_T temp_subs;
3956	#ifdef REGEXP_DEBUG
3957	int did_print = FALSE;
3958	#endif
3959	static int depth = `0`;
3960
3961	// This function is called recursively. When the depth is too much we run
3962	// out of stack and crash, limit recursiveness here.
3963	if (++depth >= `5000` \|\| subs == NULL) {
3964	depth--;
3965	return NULL;
3966	}
3967
3968	if (off_arg <= -ADDSTATE_HERE_OFFSET) {
3969	add_here = true;
3970	off = `0`;
3971	listindex = -(off_arg + ADDSTATE_HERE_OFFSET);
3972	}
3973
3974	switch (state->c) {
3975	case NFA_NCLOSE:
3976	case NFA_MCLOSE:
3977	case NFA_MCLOSE1:
3978	case NFA_MCLOSE2:
3979	case NFA_MCLOSE3:
3980	case NFA_MCLOSE4:
3981	case NFA_MCLOSE5:
3982	case NFA_MCLOSE6:
3983	case NFA_MCLOSE7:
3984	case NFA_MCLOSE8:
3985	case NFA_MCLOSE9:
3986	case NFA_ZCLOSE:
3987	case NFA_ZCLOSE1:
3988	case NFA_ZCLOSE2:
3989	case NFA_ZCLOSE3:
3990	case NFA_ZCLOSE4:
3991	case NFA_ZCLOSE5:
3992	case NFA_ZCLOSE6:
3993	case NFA_ZCLOSE7:
3994	case NFA_ZCLOSE8:
3995	case NFA_ZCLOSE9:
3996	case NFA_MOPEN:
3997	case NFA_ZEND:
3998	case NFA_SPLIT:
3999	case NFA_EMPTY:
4000	/ These nodes are not added themselves but their "out" and/or*
4001	* "out1" may be added below. */
4002	break;
4003
4004	case NFA_BOL:
4005	case NFA_BOF:
4006	/ "^" won't match past end-of-line, don't bother trying.*
4007	* Except when at the end of the line, or when we are going to the
4008	* next line for a look-behind match. */
4009	if (reginput > regline
4010	&& *reginput != NUL
4011	&& (nfa_endp == NULL
4012	\|\| !REG_MULTI
4013	\|\| reglnum == nfa_endp->se_u.pos.lnum))
4014	goto skip_add;
4015	FALLTHROUGH;
4016
4017	case NFA_MOPEN1:
4018	case NFA_MOPEN2:
4019	case NFA_MOPEN3:
4020	case NFA_MOPEN4:
4021	case NFA_MOPEN5:
4022	case NFA_MOPEN6:
4023	case NFA_MOPEN7:
4024	case NFA_MOPEN8:
4025	case NFA_MOPEN9:
4026	case NFA_ZOPEN:
4027	case NFA_ZOPEN1:
4028	case NFA_ZOPEN2:
4029	case NFA_ZOPEN3:
4030	case NFA_ZOPEN4:
4031	case NFA_ZOPEN5:
4032	case NFA_ZOPEN6:
4033	case NFA_ZOPEN7:
4034	case NFA_ZOPEN8:
4035	case NFA_ZOPEN9:
4036	case NFA_NOPEN:
4037	case NFA_ZSTART:
4038	/ These nodes need to be added so that we can bail out when it*
4039	* was added to this list before at the same position to avoid an
4040	* endless loop for "" /
4041
4042	default:
4043	if (state->lastlist[nfa_ll_index] == l->id && state->c != NFA_SKIP) {
4044	/ This state is already in the list, don't add it again,*
4045	* unless it is an MOPEN that is used for a backreference or
4046	* when there is a PIM. For NFA_MATCH check the position,
4047	* lower position is preferred. */
4048	if (!nfa_has_backref && pim == NULL && !l->has_pim
4049	&& state->c != NFA_MATCH) {
4050
4051	/ When called from addstate_here() do insert before*
4052	* existing states. */
4053	if (add_here) {
4054	for (k = `0`; k < l->n && k < listindex; ++k) {
4055	if (l->t[k].state->id == state->id) {
4056	found = TRUE;
4057	break;
4058	}
4059	}
4060	}
4061
4062	if (!add_here \|\| found) {
4063	skip_add:
4064	#ifdef REGEXP_DEBUG
4065	nfa_set_code(state->c);
4066	fprintf(log_fd, "> Not adding state %d to list %d. char %d: %s pim: %s has_pim: %d found: %d\n",
4067	abs(state->id), l->id, state->c, code,
4068	pim == NULL ? "NULL" : "yes", l->has_pim, found);
4069	#endif
4070	depth--;
4071	return subs;
4072	}
4073	}
4074
4075	/ Do not add the state again when it exists with the same*
4076	* positions. */
4077	if (has_state_with_pos(l, state, subs, pim))
4078	goto skip_add;
4079	}
4080
4081	// When there are backreferences or PIMs the number of states may
4082	// be (a lot) bigger than anticipated.
4083	if (l->n == l->len) {
4084	const int newlen = l->len * `3` / `2` + `50`;
4085	const size_t newsize = newlen * sizeof(nfa_thread_T);
4086
4087	if ((long)(newsize >> `10`) >= p_mmp) {
4088	EMSG(_(e_maxmempat));
4089	depth--;
4090	return NULL;
4091	}
4092	if (subs != &temp_subs) {
4093	/ "subs" may point into the current array, need to make a*
4094	* copy before it becomes invalid. */
4095	copy_sub(&temp_subs.norm, &subs->norm);
4096	if (nfa_has_zsubexpr)
4097	copy_sub(&temp_subs.synt, &subs->synt);
4098	subs = &temp_subs;
4099	}
4100
4101	nfa_thread_T *const newt = xrealloc(l->t, newsize);
4102	l->t = newt;
4103	l->len = newlen;
4104	}
4105
4106	/ add the state to the list /
4107	state->lastlist[nfa_ll_index] = l->id;
4108	thread = &l->t[l->n++];
4109	thread->state = state;
4110	if (pim == NULL)
4111	thread->pim.result = NFA_PIM_UNUSED;
4112	else {
4113	copy_pim(&thread->pim, pim);
4114	l->has_pim = TRUE;
4115	}
4116	copy_sub(&thread->subs.norm, &subs->norm);
4117	if (nfa_has_zsubexpr)
4118	copy_sub(&thread->subs.synt, &subs->synt);
4119	#ifdef REGEXP_DEBUG
4120	report_state("Adding", &thread->subs.norm, state, l->id, pim);
4121	did_print = TRUE;
4122	#endif
4123	}
4124
4125	#ifdef REGEXP_DEBUG
4126	if (!did_print)
4127	report_state("Processing", &subs->norm, state, l->id, pim);
4128	#endif
4129	switch (state->c) {
4130	case NFA_MATCH:
4131	break;
4132
4133	case NFA_SPLIT:
4134	/ order matters here /
4135	subs = addstate(l, state->out, subs, pim, off_arg);
4136	subs = addstate(l, state->out1, subs, pim, off_arg);
4137	break;
4138
4139	case NFA_EMPTY:
4140	case NFA_NOPEN:
4141	case NFA_NCLOSE:
4142	subs = addstate(l, state->out, subs, pim, off_arg);
4143	break;
4144
4145	case NFA_MOPEN:
4146	case NFA_MOPEN1:
4147	case NFA_MOPEN2:
4148	case NFA_MOPEN3:
4149	case NFA_MOPEN4:
4150	case NFA_MOPEN5:
4151	case NFA_MOPEN6:
4152	case NFA_MOPEN7:
4153	case NFA_MOPEN8:
4154	case NFA_MOPEN9:
4155	case NFA_ZOPEN:
4156	case NFA_ZOPEN1:
4157	case NFA_ZOPEN2:
4158	case NFA_ZOPEN3:
4159	case NFA_ZOPEN4:
4160	case NFA_ZOPEN5:
4161	case NFA_ZOPEN6:
4162	case NFA_ZOPEN7:
4163	case NFA_ZOPEN8:
4164	case NFA_ZOPEN9:
4165	case NFA_ZSTART:
4166	if (state->c == NFA_ZSTART) {
4167	subidx = `0`;
4168	sub = &subs->norm;
4169	} else if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9) { // -V560
4170	subidx = state->c - NFA_ZOPEN;
4171	sub = &subs->synt;
4172	} else {
4173	subidx = state->c - NFA_MOPEN;
4174	sub = &subs->norm;
4175	}
4176
4177	/ avoid compiler warnings /
4178	save_ptr = NULL;
4179	memset(&save_multipos, `0`, sizeof(save_multipos));
4180
4181	/ Set the position (with "off" added) in the subexpression. Save*
4182	* and restore it when it was in use. Otherwise fill any gap. */
4183	if (REG_MULTI) {
4184	if (subidx < sub->in_use) {
4185	save_multipos = sub->list.multi[subidx];
4186	save_in_use = -`1`;
4187	} else {
4188	save_in_use = sub->in_use;
4189	for (i = sub->in_use; i < subidx; ++i) {
4190	sub->list.multi[i].start_lnum = -`1`;
4191	sub->list.multi[i].end_lnum = -`1`;
4192	}
4193	sub->in_use = subidx + `1`;
4194	}
4195	if (off == -`1`) {
4196	sub->list.multi[subidx].start_lnum = reglnum + `1`;
4197	sub->list.multi[subidx].start_col = `0`;
4198	} else {
4199
4200	sub->list.multi[subidx].start_lnum = reglnum;
4201	sub->list.multi[subidx].start_col =
4202	(colnr_T)(reginput - regline + off);
4203	}
4204	sub->list.multi[subidx].end_lnum = -`1`;
4205	} else {
4206	if (subidx < sub->in_use) {
4207	save_ptr = sub->list.line[subidx].start;
4208	save_in_use = -`1`;
4209	} else {
4210	save_in_use = sub->in_use;
4211	for (i = sub->in_use; i < subidx; ++i) {
4212	sub->list.line[i].start = NULL;
4213	sub->list.line[i].end = NULL;
4214	}
4215	sub->in_use = subidx + `1`;
4216	}
4217	sub->list.line[subidx].start = reginput + off;
4218	}
4219
4220	subs = addstate(l, state->out, subs, pim, off_arg);
4221	if (subs == NULL) {
4222	break;
4223	}
4224	// "subs" may have changed, need to set "sub" again.
4225	if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9) { // -V560
4226	sub = &subs->synt;
4227	} else {
4228	sub = &subs->norm;
4229	}
4230
4231	if (save_in_use == -`1`) {
4232	if (REG_MULTI) {
4233	sub->list.multi[subidx] = save_multipos;
4234	}
4235	else
4236	sub->list.line[subidx].start = save_ptr;
4237	} else
4238	sub->in_use = save_in_use;
4239	break;
4240
4241	case NFA_MCLOSE:
4242	if (nfa_has_zend && (REG_MULTI
4243	? subs->norm.list.multi[`0`].end_lnum >= `0`
4244	: subs->norm.list.line[`0`].end != NULL)) {
4245	// Do not overwrite the position set by \ze.
4246	subs = addstate(l, state->out, subs, pim, off_arg);
4247	break;
4248	}
4249	FALLTHROUGH;
4250	case NFA_MCLOSE1:
4251	case NFA_MCLOSE2:
4252	case NFA_MCLOSE3:
4253	case NFA_MCLOSE4:
4254	case NFA_MCLOSE5:
4255	case NFA_MCLOSE6:
4256	case NFA_MCLOSE7:
4257	case NFA_MCLOSE8:
4258	case NFA_MCLOSE9:
4259	case NFA_ZCLOSE:
4260	case NFA_ZCLOSE1:
4261	case NFA_ZCLOSE2:
4262	case NFA_ZCLOSE3:
4263	case NFA_ZCLOSE4:
4264	case NFA_ZCLOSE5:
4265	case NFA_ZCLOSE6:
4266	case NFA_ZCLOSE7:
4267	case NFA_ZCLOSE8:
4268	case NFA_ZCLOSE9:
4269	case NFA_ZEND:
4270	if (state->c == NFA_ZEND) {
4271	subidx = `0`;
4272	sub = &subs->norm;
4273	} else if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9) { // -V560
4274	subidx = state->c - NFA_ZCLOSE;
4275	sub = &subs->synt;
4276	} else {
4277	subidx = state->c - NFA_MCLOSE;
4278	sub = &subs->norm;
4279	}
4280
4281	/ We don't fill in gaps here, there must have been an MOPEN that*
4282	* has done that. */
4283	save_in_use = sub->in_use;
4284	if (sub->in_use <= subidx)
4285	sub->in_use = subidx + `1`;
4286	if (REG_MULTI) {
4287	save_multipos = sub->list.multi[subidx];
4288	if (off == -`1`) {
4289	sub->list.multi[subidx].end_lnum = reglnum + `1`;
4290	sub->list.multi[subidx].end_col = `0`;
4291	} else {
4292	sub->list.multi[subidx].end_lnum = reglnum;
4293	sub->list.multi[subidx].end_col =
4294	(colnr_T)(reginput - regline + off);
4295	}
4296	/ avoid compiler warnings /
4297	save_ptr = NULL;
4298	} else {
4299	save_ptr = sub->list.line[subidx].end;
4300	sub->list.line[subidx].end = reginput + off;
4301	// avoid compiler warnings
4302	memset(&save_multipos, `0`, sizeof(save_multipos));
4303	}
4304
4305	subs = addstate(l, state->out, subs, pim, off_arg);
4306	if (subs == NULL) {
4307	break;
4308	}
4309	// "subs" may have changed, need to set "sub" again.
4310	if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9) { // -V560
4311	sub = &subs->synt;
4312	} else {
4313	sub = &subs->norm;
4314	}
4315
4316	if (REG_MULTI) {
4317	sub->list.multi[subidx] = save_multipos;
4318	}
4319	else
4320	sub->list.line[subidx].end = save_ptr;
4321	sub->in_use = save_in_use;
4322	break;
4323	}
4324	depth--;
4325	return subs;
4326	}
4327
4328	/*
4329	* Like addstate(), but the new state(s) are put at position "*ip".
4330	* Used for zero-width matches, next state to use is the added one.
4331	* This makes sure the order of states to be tried does not change, which
4332	* matters for alternatives.
4333	*/
4334	static regsubs_T *addstate_here(
4335	nfa_list_T l, // runtime state list*
4336	nfa_state_T state, // state to update*
4337	regsubs_T subs, // pointers to subexpressions*
4338	nfa_pim_T pim, // postponed look-behind match*
4339	int *ip
4340	)
4341	FUNC_ATTR_NONNULL_ARG(`1`, `2`, `5`) FUNC_ATTR_WARN_UNUSED_RESULT
4342	{
4343	int tlen = l->n;
4344	int count;
4345	int listidx = *ip;
4346
4347	/ First add the state(s) at the end, so that we know how many there are.*
4348	* Pass the listidx as offset (avoids adding another argument to
4349	* addstate(). */
4350	regsubs_T *r = addstate(l, state, subs, pim, -listidx - ADDSTATE_HERE_OFFSET);
4351	if (r == NULL) {
4352	return NULL;
4353	}
4354
4355	// when "ip" was at the end of the list, nothing to do*
4356	if (listidx + `1` == tlen) {
4357	return r;
4358	}
4359
4360	// re-order to put the new state at the current position
4361	count = l->n - tlen;
4362	if (count == `0`) {
4363	return r; // no state got added
4364	}
4365	if (count == `1`) {
4366	// overwrite the current state
4367	l->t[listidx] = l->t[l->n - `1`];
4368	} else if (count > `1`) {
4369	if (l->n + count - `1` >= l->len) {
4370	/ not enough space to move the new states, reallocate the list*
4371	* and move the states to the right position */
4372	const int newlen = l->len * `3` / `2` + `50`;
4373	const size_t newsize = newlen * sizeof(nfa_thread_T);
4374
4375	if ((long)(newsize >> `10`) >= p_mmp) {
4376	EMSG(_(e_maxmempat));
4377	return NULL;
4378	}
4379	nfa_thread_T *const newl = xmalloc(newsize);
4380	l->len = newlen;
4381	memmove(&(newl[`0`]),
4382	&(l->t[`0`]),
4383	sizeof(nfa_thread_T) * listidx);
4384	memmove(&(newl[listidx]),
4385	&(l->t[l->n - count]),
4386	sizeof(nfa_thread_T) * count);
4387	memmove(&(newl[listidx + count]),
4388	&(l->t[listidx + `1`]),
4389	sizeof(nfa_thread_T) * (l->n - count - listidx - `1`));
4390	xfree(l->t);
4391	l->t = newl;
4392	} else {
4393	/ make space for new states, then move them from the*
4394	* end to the current position */
4395	memmove(&(l->t[listidx + count]),
4396	&(l->t[listidx + `1`]),
4397	sizeof(nfa_thread_T) * (l->n - listidx - `1`));
4398	memmove(&(l->t[listidx]),
4399	&(l->t[l->n - `1`]),
4400	sizeof(nfa_thread_T) * count);
4401	}
4402	}
4403	--l->n;
4404	*ip = listidx - `1`;
4405
4406	return r;
4407	}
4408
4409	/*
4410	* Check character class "class" against current character c.
4411	*/
4412	static int check_char_class(int class, int c)
4413	{
4414	switch (class) {
4415	case NFA_CLASS_ALNUM:
4416	if (c >= `1` && c < `128` && isalnum(c)) {
4417	return OK;
4418	}
4419	break;
4420	case NFA_CLASS_ALPHA:
4421	if (c >= `1` && c < `128` && isalpha(c)) {
4422	return OK;
4423	}
4424	break;
4425	case NFA_CLASS_BLANK:
4426	if (c == `' '` \|\| c == `'\t'`)
4427	return OK;
4428	break;
4429	case NFA_CLASS_CNTRL:
4430	if (c >= `1` && c <= `127` && iscntrl(c)) {
4431	return OK;
4432	}
4433	break;
4434	case NFA_CLASS_DIGIT:
4435	if (ascii_isdigit(c))
4436	return OK;
4437	break;
4438	case NFA_CLASS_GRAPH:
4439	if (c >= `1` && c <= `127` && isgraph(c)) {
4440	return OK;
4441	}
4442	break;
4443	case NFA_CLASS_LOWER:
4444	if (mb_islower(c) && c != `170` && c != `186`) {
4445	return OK;
4446	}
4447	break;
4448	case NFA_CLASS_PRINT:
4449	if (vim_isprintc(c))
4450	return OK;
4451	break;
4452	case NFA_CLASS_PUNCT:
4453	if (c >= `1` && c < `128` && ispunct(c)) {
4454	return OK;
4455	}
4456	break;
4457	case NFA_CLASS_SPACE:
4458	if ((c >= `9` && c <= `13`) \|\| (c == `' '`))
4459	return OK;
4460	break;
4461	case NFA_CLASS_UPPER:
4462	if (mb_isupper(c)) {
4463	return OK;
4464	}
4465	break;
4466	case NFA_CLASS_XDIGIT:
4467	if (ascii_isxdigit(c))
4468	return OK;
4469	break;
4470	case NFA_CLASS_TAB:
4471	if (c == `'\t'`)
4472	return OK;
4473	break;
4474	case NFA_CLASS_RETURN:
4475	if (c == `'\r'`)
4476	return OK;
4477	break;
4478	case NFA_CLASS_BACKSPACE:
4479	if (c == `'\b'`)
4480	return OK;
4481	break;
4482	case NFA_CLASS_ESCAPE:
4483	if (c == ESC) {
4484	return OK;
4485	}
4486	break;
4487
4488	default:
4489	// should not be here :P
4490	IEMSGN(_(e_ill_char_class), class);
4491	return FAIL;
4492	}
4493	return FAIL;
4494	}
4495
4496	/*
4497	* Check for a match with subexpression "subidx".
4498	* Return TRUE if it matches.
4499	*/
4500	static int
4501	match_backref (
4502	regsub_T sub, /* pointers to subexpressions /
4503	int subidx,
4504	int bytelen /* out: length of match in bytes /
4505	)
4506	{
4507	int len;
4508
4509	if (sub->in_use <= subidx) {
4510	retempty:
4511	/ backref was not set, match an empty string /
4512	*bytelen = `0`;
4513	return TRUE;
4514	}
4515
4516	if (REG_MULTI) {
4517	if (sub->list.multi[subidx].start_lnum < `0`
4518	\|\| sub->list.multi[subidx].end_lnum < `0`)
4519	goto retempty;
4520	if (sub->list.multi[subidx].start_lnum == reglnum
4521	&& sub->list.multi[subidx].end_lnum == reglnum) {
4522	len = sub->list.multi[subidx].end_col
4523	- sub->list.multi[subidx].start_col;
4524	if (cstrncmp(regline + sub->list.multi[subidx].start_col,
4525	reginput, &len) == `0`) {
4526	*bytelen = len;
4527	return TRUE;
4528	}
4529	} else {
4530	if (match_with_backref(
4531	sub->list.multi[subidx].start_lnum,
4532	sub->list.multi[subidx].start_col,
4533	sub->list.multi[subidx].end_lnum,
4534	sub->list.multi[subidx].end_col,
4535	bytelen) == RA_MATCH)
4536	return TRUE;
4537	}
4538	} else {
4539	if (sub->list.line[subidx].start == NULL
4540	\|\| sub->list.line[subidx].end == NULL)
4541	goto retempty;
4542	len = (int)(sub->list.line[subidx].end - sub->list.line[subidx].start);
4543	if (cstrncmp(sub->list.line[subidx].start, reginput, &len) == `0`) {
4544	*bytelen = len;
4545	return TRUE;
4546	}
4547	}
4548	return FALSE;
4549	}
4550
4551
4552
4553	/*
4554	* Check for a match with \z subexpression "subidx".
4555	* Return TRUE if it matches.
4556	*/
4557	static int
4558	match_zref (
4559	int subidx,
4560	int bytelen /* out: length of match in bytes /
4561	)
4562	{
4563	int len;
4564
4565	cleanup_zsubexpr();
4566	if (re_extmatch_in == NULL \|\| re_extmatch_in->matches[subidx] == NULL) {
4567	/ backref was not set, match an empty string /
4568	*bytelen = `0`;
4569	return TRUE;
4570	}
4571
4572	len = (int)STRLEN(re_extmatch_in->matches[subidx]);
4573	if (cstrncmp(re_extmatch_in->matches[subidx], reginput, &len) == `0`) {
4574	*bytelen = len;
4575	return TRUE;
4576	}
4577	return FALSE;
4578	}
4579
4580	/*
4581	* Save list IDs for all NFA states of "prog" into "list".
4582	* Also reset the IDs to zero.
4583	* Only used for the recursive value lastlist[1].
4584	*/
4585	static void nfa_save_listids(nfa_regprog_T prog, int* *list)
4586	{
4587	int i;
4588	nfa_state_T *p;
4589
4590	/ Order in the list is reverse, it's a bit faster that way. /
4591	p = &prog->state[`0`];
4592	for (i = prog->nstate; --i >= `0`; ) {
4593	list[i] = p->lastlist[`1`];
4594	p->lastlist[`1`] = `0`;
4595	++p;
4596	}
4597	}
4598
4599	/*
4600	* Restore list IDs from "list" to all NFA states.
4601	*/
4602	static void nfa_restore_listids(nfa_regprog_T prog, int* *list)
4603	{
4604	int i;
4605	nfa_state_T *p;
4606
4607	p = &prog->state[`0`];
4608	for (i = prog->nstate; --i >= `0`; ) {
4609	p->lastlist[`1`] = list[i];
4610	++p;
4611	}
4612	}
4613
/// Compare "pos" against "val" with the relation selected by "op".
///
/// @param val  reference value
/// @param op   1: test "pos > val"; 2: test "pos < val";
///             any other value: test "val == pos"
/// @param pos  value being tested
///
/// @return true when the selected relation holds.
static bool nfa_re_num_cmp(uintmax_t val, int op, uintmax_t pos)
{
  switch (op) {
  case 1:
    return pos > val;
  case 2:
    return pos < val;
  default:
    return val == pos;
  }
}
4620
4621
4622	/*
4623	* Recursively call nfa_regmatch()
4624	* "pim" is NULL or contains info about a Postponed Invisible Match (start
4625	* position).
4626	*/
4627	static int recursive_regmatch(
4628	nfa_state_T state, nfa_pim_T pim, nfa_regprog_T *prog,
4629	regsubs_T submatch, regsubs_T m, int *listids, int* *listids_len)
4630	{
4631	int save_reginput_col = (int)(reginput - regline);
4632	int save_reglnum = reglnum;
4633	int save_nfa_match = nfa_match;
4634	int save_nfa_listid = nfa_listid;
4635	save_se_T *save_nfa_endp = nfa_endp;
4636	save_se_T endpos;
4637	save_se_T *endposp = NULL;
4638	int result;
4639	int need_restore = FALSE;
4640
4641	if (pim != NULL) {
4642	/ start at the position where the postponed match was /
4643	if (REG_MULTI)
4644	reginput = regline + pim->end.pos.col;
4645	else
4646	reginput = pim->end.ptr;
4647	}
4648
4649	if (state->c == NFA_START_INVISIBLE_BEFORE
4650	\|\| state->c == NFA_START_INVISIBLE_BEFORE_FIRST
4651	\|\| state->c == NFA_START_INVISIBLE_BEFORE_NEG
4652	\|\| state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST) {
4653	/ The recursive match must end at the current position. When "pim" is*
4654	* not NULL it specifies the current position. */
4655	endposp = &endpos;
4656	if (REG_MULTI) {
4657	if (pim == NULL) {
4658	endpos.se_u.pos.col = (int)(reginput - regline);
4659	endpos.se_u.pos.lnum = reglnum;
4660	} else
4661	endpos.se_u.pos = pim->end.pos;
4662	} else {
4663	if (pim == NULL)
4664	endpos.se_u.ptr = reginput;
4665	else
4666	endpos.se_u.ptr = pim->end.ptr;
4667	}
4668
4669	/ Go back the specified number of bytes, or as far as the*
4670	* start of the previous line, to try matching "\@<=" or
4671	* not matching "\@<!". This is very inefficient, limit the number of
4672	* bytes if possible. */
4673	if (state->val <= `0`) {
4674	if (REG_MULTI) {
4675	regline = reg_getline(--reglnum);
4676	if (regline == NULL)
4677	/ can't go before the first line /
4678	regline = reg_getline(++reglnum);
4679	}
4680	reginput = regline;
4681	} else {
4682	if (REG_MULTI && (int)(reginput - regline) < state->val) {
4683	/ Not enough bytes in this line, go to end of*
4684	* previous line. */
4685	regline = reg_getline(--reglnum);
4686	if (regline == NULL) {
4687	/ can't go before the first line /
4688	regline = reg_getline(++reglnum);
4689	reginput = regline;
4690	} else
4691	reginput = regline + STRLEN(regline);
4692	}
4693	if ((int)(reginput - regline) >= state->val) {
4694	reginput -= state->val;
4695	reginput -= utf_head_off(regline, reginput);
4696	} else {
4697	reginput = regline;
4698	}
4699	}
4700	}
4701
4702	#ifdef REGEXP_DEBUG
4703	if (log_fd != stderr)
4704	fclose(log_fd);
4705	log_fd = NULL;
4706	#endif
4707	/ Have to clear the lastlist field of the NFA nodes, so that*
4708	* nfa_regmatch() and addstate() can run properly after recursion. */
4709	if (nfa_ll_index == `1`) {
4710	/ Already calling nfa_regmatch() recursively. Save the lastlist[1]*
4711	* values and clear them. */
4712	if (listids == NULL \|\| listids_len < nstate) {
4713	xfree(*listids);
4714	listids = xmalloc(sizeof(listids) nstate);
4715	*listids_len = nstate;
4716	}
4717	nfa_save_listids(prog, *listids);
4718	need_restore = TRUE;
4719	/ any value of nfa_listid will do /
4720	} else {
4721	/ First recursive nfa_regmatch() call, switch to the second lastlist*
4722	* entry. Make sure nfa_listid is different from a previous recursive
4723	* call, because some states may still have this ID. */
4724	++nfa_ll_index;
4725	if (nfa_listid <= nfa_alt_listid)
4726	nfa_listid = nfa_alt_listid;
4727	}
4728
4729	/ Call nfa_regmatch() to check if the current concat matches at this*
4730	* position. The concat ends with the node NFA_END_INVISIBLE */
4731	nfa_endp = endposp;
4732	result = nfa_regmatch(prog, state->out, submatch, m);
4733
4734	if (need_restore)
4735	nfa_restore_listids(prog, *listids);
4736	else {
4737	--nfa_ll_index;
4738	nfa_alt_listid = nfa_listid;
4739	}
4740
4741	/ restore position in input text /
4742	reglnum = save_reglnum;
4743	if (REG_MULTI)
4744	regline = reg_getline(reglnum);
4745	reginput = regline + save_reginput_col;
4746	if (result != NFA_TOO_EXPENSIVE) {
4747	nfa_match = save_nfa_match;
4748	nfa_listid = save_nfa_listid;
4749	}
4750	nfa_endp = save_nfa_endp;
4751
4752	#ifdef REGEXP_DEBUG
4753	log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
4754	if (log_fd != NULL) {
4755	fprintf(log_fd, "****************************\n");
4756	fprintf(log_fd, "FINISHED RUNNING nfa_regmatch() recursively\n");
4757	fprintf(log_fd, "MATCH = %s\n", !result ? "FALSE" : "OK");
4758	fprintf(log_fd, "****************************\n");
4759	} else {
4760	EMSG(_(e_log_open_failed));
4761	log_fd = stderr;
4762	}
4763	#endif
4764
4765	return result;
4766	}
4767
4768
4769	/*
4770	* Estimate the chance of a match with "state" failing.
4771	* empty match: 0
4772	* NFA_ANY: 1
4773	* specific character: 99
4774	*/
4775	static int failure_chance(nfa_state_T state, int* depth)
4776	{
4777	int c = state->c;
4778	int l, r;
4779
4780	/ detect looping /
4781	if (depth > `4`)
4782	return `1`;
4783
4784	switch (c) {
4785	case NFA_SPLIT:
4786	if (state->out->c == NFA_SPLIT \|\| state->out1->c == NFA_SPLIT)
4787	/ avoid recursive stuff /
4788	return `1`;
4789	/ two alternatives, use the lowest failure chance /
4790	l = failure_chance(state->out, depth + `1`);
4791	r = failure_chance(state->out1, depth + `1`);
4792	return l < r ? l : r;
4793
4794	case NFA_ANY:
4795	/ matches anything, unlikely to fail /
4796	return `1`;
4797
4798	case NFA_MATCH:
4799	case NFA_MCLOSE:
4800	case NFA_ANY_COMPOSING:
4801	/ empty match works always /
4802	return `0`;
4803
4804	case NFA_START_INVISIBLE:
4805	case NFA_START_INVISIBLE_FIRST:
4806	case NFA_START_INVISIBLE_NEG:
4807	case NFA_START_INVISIBLE_NEG_FIRST:
4808	case NFA_START_INVISIBLE_BEFORE:
4809	case NFA_START_INVISIBLE_BEFORE_FIRST:
4810	case NFA_START_INVISIBLE_BEFORE_NEG:
4811	case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
4812	case NFA_START_PATTERN:
4813	/ recursive regmatch is expensive, use low failure chance /
4814	return `5`;
4815
4816	case NFA_BOL:
4817	case NFA_EOL:
4818	case NFA_BOF:
4819	case NFA_EOF:
4820	case NFA_NEWL:
4821	return `99`;
4822
4823	case NFA_BOW:
4824	case NFA_EOW:
4825	return `90`;
4826
4827	case NFA_MOPEN:
4828	case NFA_MOPEN1:
4829	case NFA_MOPEN2:
4830	case NFA_MOPEN3:
4831	case NFA_MOPEN4:
4832	case NFA_MOPEN5:
4833	case NFA_MOPEN6:
4834	case NFA_MOPEN7:
4835	case NFA_MOPEN8:
4836	case NFA_MOPEN9:
4837	case NFA_ZOPEN:
4838	case NFA_ZOPEN1:
4839	case NFA_ZOPEN2:
4840	case NFA_ZOPEN3:
4841	case NFA_ZOPEN4:
4842	case NFA_ZOPEN5:
4843	case NFA_ZOPEN6:
4844	case NFA_ZOPEN7:
4845	case NFA_ZOPEN8:
4846	case NFA_ZOPEN9:
4847	case NFA_ZCLOSE:
4848	case NFA_ZCLOSE1:
4849	case NFA_ZCLOSE2:
4850	case NFA_ZCLOSE3:
4851	case NFA_ZCLOSE4:
4852	case NFA_ZCLOSE5:
4853	case NFA_ZCLOSE6:
4854	case NFA_ZCLOSE7:
4855	case NFA_ZCLOSE8:
4856	case NFA_ZCLOSE9:
4857	case NFA_NOPEN:
4858	case NFA_MCLOSE1:
4859	case NFA_MCLOSE2:
4860	case NFA_MCLOSE3:
4861	case NFA_MCLOSE4:
4862	case NFA_MCLOSE5:
4863	case NFA_MCLOSE6:
4864	case NFA_MCLOSE7:
4865	case NFA_MCLOSE8:
4866	case NFA_MCLOSE9:
4867	case NFA_NCLOSE:
4868	return failure_chance(state->out, depth + `1`);
4869
4870	case NFA_BACKREF1:
4871	case NFA_BACKREF2:
4872	case NFA_BACKREF3:
4873	case NFA_BACKREF4:
4874	case NFA_BACKREF5:
4875	case NFA_BACKREF6:
4876	case NFA_BACKREF7:
4877	case NFA_BACKREF8:
4878	case NFA_BACKREF9:
4879	case NFA_ZREF1:
4880	case NFA_ZREF2:
4881	case NFA_ZREF3:
4882	case NFA_ZREF4:
4883	case NFA_ZREF5:
4884	case NFA_ZREF6:
4885	case NFA_ZREF7:
4886	case NFA_ZREF8:
4887	case NFA_ZREF9:
4888	/ backreferences don't match in many places /
4889	return `94`;
4890
4891	case NFA_LNUM_GT:
4892	case NFA_LNUM_LT:
4893	case NFA_COL_GT:
4894	case NFA_COL_LT:
4895	case NFA_VCOL_GT:
4896	case NFA_VCOL_LT:
4897	case NFA_MARK_GT:
4898	case NFA_MARK_LT:
4899	case NFA_VISUAL:
4900	/ before/after positions don't match very often /
4901	return `85`;
4902
4903	case NFA_LNUM:
4904	return `90`;
4905
4906	case NFA_CURSOR:
4907	case NFA_COL:
4908	case NFA_VCOL:
4909	case NFA_MARK:
4910	/ specific positions rarely match /
4911	return `98`;
4912
4913	case NFA_COMPOSING:
4914	return `95`;
4915
4916	default:
4917	if (c > `0`)
4918	/ character match fails often /
4919	return `95`;
4920	}
4921
4922	/ something else, includes character classes /
4923	return `50`;
4924	}
4925
4926	/*
4927	* Skip until the char "c" we know a match must start with.
4928	*/
4929	static int skip_to_start(int c, colnr_T *colp)
4930	{
4931	const char_u *const s = cstrchr(regline + *colp, c);
4932	if (s == NULL) {
4933	return FAIL;
4934	}
4935	colp = (int*)(s - regline);
4936	return OK;
4937	}
4938
4939	/*
4940	* Check for a match with match_text.
4941	* Called after skip_to_start() has found regstart.
4942	* Returns zero for no match, 1 for a match.
4943	*/
4944	static long find_match_text(colnr_T startcol, int regstart, char_u *match_text)
4945	{
4946	#define PTR2LEN(x) enc_utf8 ? utf_ptr2len(x) : MB_PTR2LEN(x)
4947
4948	colnr_T col = startcol;
4949	int regstart_len = PTR2LEN(regline + startcol);
4950
4951	for (;;) {
4952	bool match = true;
4953	char_u *s1 = match_text;
4954	char_u s2 = regline + col + regstart_len; // skip regstart*
4955	while (*s1) {
4956	int c1_len = PTR2LEN(s1);
4957	int c1 = PTR2CHAR(s1);
4958	int c2_len = PTR2LEN(s2);
4959	int c2 = PTR2CHAR(s2);
4960
4961	if ((c1 != c2 && (!rex.reg_ic \|\| mb_tolower(c1) != mb_tolower(c2)))
4962	\|\| c1_len != c2_len) {
4963	match = false;
4964	break;
4965	}
4966	s1 += c1_len;
4967	s2 += c2_len;
4968	}
4969	if (match
4970	// check that no composing char follows
4971	&& !(enc_utf8 && utf_iscomposing(PTR2CHAR(s2)))) {
4972	cleanup_subexpr();
4973	if (REG_MULTI) {
4974	rex.reg_startpos[`0`].lnum = reglnum;
4975	rex.reg_startpos[`0`].col = col;
4976	rex.reg_endpos[`0`].lnum = reglnum;
4977	rex.reg_endpos[`0`].col = s2 - regline;
4978	} else {
4979	rex.reg_startp[`0`] = regline + col;
4980	rex.reg_endp[`0`] = s2;
4981	}
4982	return `1L`;
4983	}
4984
4985	// Try finding regstart after the current match.
4986	col += regstart_len; // skip regstart
4987	if (skip_to_start(regstart, &col) == FAIL) {
4988	break;
4989	}
4990	}
4991	return `0L`;
4992
4993	#undef PTR2LEN
4994	}
4995
4996	static int nfa_did_time_out(void)
4997	{
4998	if (nfa_time_limit != NULL && profile_passed_limit(*nfa_time_limit)) {
4999	if (nfa_timed_out != NULL) {
5000	*nfa_timed_out = true;
5001	}
5002	return true;
5003	}
5004	return false;
5005	}
5006
5007	/// Main matching routine.
5008	///
5009	/// Run NFA to determine whether it matches reginput.
5010	///
5011	/// When "nfa_endp" is not NULL it is a required end-of-match position.
5012	///
5013	/// Return TRUE if there is a match, FALSE if there is no match,
5014	/// NFA_TOO_EXPENSIVE if we end up with too many states.
5015	/// When there is a match "submatch" contains the positions.
5016	///
5017	/// Note: Caller must ensure that: start != NULL.
5018	static int nfa_regmatch(nfa_regprog_T prog, nfa_state_T start,
5019	regsubs_T submatch, regsubs_T m)
5020	{
5021	int result = false;
5022	int flag = `0`;
5023	bool go_to_nextline = false;
5024	nfa_thread_T *t;
5025	nfa_list_T list[`2`];
5026	int listidx;
5027	nfa_list_T *thislist;
5028	nfa_list_T *nextlist;
5029	int *listids = NULL;
5030	int listids_len = `0`;
5031	nfa_state_T *add_state;
5032	bool add_here;
5033	int add_count;
5034	int add_off = `0`;
5035	int toplevel = start->c == NFA_MOPEN;
5036	regsubs_T *r;
5037	#ifdef NFA_REGEXP_DEBUG_LOG
5038	FILE *debug = fopen(NFA_REGEXP_DEBUG_LOG, "a");
5039
5040	if (debug == NULL) {
5041	EMSG2("(NFA) COULD NOT OPEN %s!", NFA_REGEXP_DEBUG_LOG);
5042	return false;
5043	}
5044	#endif
5045	// Some patterns may take a long time to match, especially when using
5046	// recursive_regmatch(). Allow interrupting them with CTRL-C.
5047	fast_breakcheck();
5048	if (got_int) {
5049	#ifdef NFA_REGEXP_DEBUG_LOG
5050	fclose(debug);
5051	#endif
5052	return false;
5053	}
5054	if (nfa_did_time_out()) {
5055	#ifdef NFA_REGEXP_DEBUG_LOG
5056	fclose(debug);
5057	#endif
5058	return false;
5059	}
5060
5061	nfa_match = false;
5062
5063	// Allocate memory for the lists of nodes.
5064	size_t size = (nstate + `1`) * sizeof(nfa_thread_T);
5065	list[`0`].t = xmalloc(size);
5066	list[`0`].len = nstate + `1`;
5067	list[`1`].t = xmalloc(size);
5068	list[`1`].len = nstate + `1`;
5069
5070	#ifdef REGEXP_DEBUG
5071	log_fd = fopen(NFA_REGEXP_RUN_LOG, "a");
5072	if (log_fd != NULL) {
5073	fprintf(log_fd, "**********************************\n");
5074	nfa_set_code(start->c);
5075	fprintf(log_fd, " RUNNING nfa_regmatch() starting with state %d, code %s\n",
5076	abs(start->id), code);
5077	fprintf(log_fd, "**********************************\n");
5078	} else {
5079	EMSG(_(e_log_open_failed));
5080	log_fd = stderr;
5081	}
5082	#endif
5083
5084	thislist = &list[`0`];
5085	thislist->n = `0`;
5086	thislist->has_pim = FALSE;
5087	nextlist = &list[`1`];
5088	nextlist->n = `0`;
5089	nextlist->has_pim = FALSE;
5090	#ifdef REGEXP_DEBUG
5091	fprintf(log_fd, "(---) STARTSTATE first\n");
5092	#endif
5093	thislist->id = nfa_listid + `1`;
5094
5095	/ Inline optimized code for addstate(thislist, start, m, 0) if we know*
5096	* it's the first MOPEN. */
5097	if (toplevel) {
5098	if (REG_MULTI) {
5099	m->norm.list.multi[`0`].start_lnum = reglnum;
5100	m->norm.list.multi[`0`].start_col = (colnr_T)(reginput - regline);
5101	} else
5102	m->norm.list.line[`0`].start = reginput;
5103	m->norm.in_use = `1`;
5104	r = addstate(thislist, start->out, m, NULL, `0`);
5105	} else {
5106	r = addstate(thislist, start, m, NULL, `0`);
5107	}
5108	if (r == NULL) {
5109	nfa_match = NFA_TOO_EXPENSIVE;
5110	goto theend;
5111	}
5112
5113	#define ADD_STATE_IF_MATCH(state) \
5114	if (result) { \
5115	add_state = state->out; \
5116	add_off = clen; \
5117	}
5118
5119	/*
5120	* Run for each character.
5121	*/
5122	for (;; ) {
5123	int curc = utf_ptr2char(reginput);
5124	int clen = utfc_ptr2len(reginput);
5125	if (curc == NUL) {
5126	clen = `0`;
5127	go_to_nextline = false;
5128	}
5129
5130	/ swap lists /
5131	thislist = &list[flag];
5132	nextlist = &list[flag ^= `1`];
5133	nextlist->n = `0`; // clear nextlist
5134	nextlist->has_pim = false;
5135	nfa_listid++;
5136	if (prog->re_engine == AUTOMATIC_ENGINE
5137	&& (nfa_listid >= NFA_MAX_STATES)) {
5138	// Too many states, retry with old engine.
5139	nfa_match = NFA_TOO_EXPENSIVE;
5140	goto theend;
5141	}
5142
5143	thislist->id = nfa_listid;
5144	nextlist->id = nfa_listid + `1`;
5145
5146	#ifdef REGEXP_DEBUG
5147	fprintf(log_fd, "------------------------------------------\n");
5148	fprintf(log_fd, ">>> Reginput is \"%s\"\n", reginput);
5149	fprintf(log_fd,
5150	">>> Advanced one character... Current char is %c (code %d) \n",
5151	curc,
5152	(int)curc);
5153	fprintf(log_fd, ">>> Thislist has %d states available: ", thislist->n);
5154	{
5155	int i;
5156
5157	for (i = `0`; i < thislist->n; i++)
5158	fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
5159	}
5160	fprintf(log_fd, "\n");
5161	#endif
5162
5163	#ifdef NFA_REGEXP_DEBUG_LOG
5164	fprintf(debug, "\n-------------------\n");
5165	#endif
5166	/*
5167	* If the state lists are empty we can stop.
5168	*/
5169	if (thislist->n == `0`)
5170	break;
5171
5172	// compute nextlist
5173	for (listidx = `0`; listidx < thislist->n; listidx++) {
5174	// If the list gets very long there probably is something wrong.
5175	// At least allow interrupting with CTRL-C.
5176	fast_breakcheck();
5177	if (got_int) {
5178	break;
5179	}
5180	if (nfa_time_limit != NULL && ++nfa_time_count == `20`) {
5181	nfa_time_count = `0`;
5182	if (nfa_did_time_out()) {
5183	break;
5184	}
5185	}
5186	t = &thislist->t[listidx];
5187
5188	#ifdef NFA_REGEXP_DEBUG_LOG
5189	nfa_set_code(t->state->c);
5190	fprintf(debug, "%s, ", code);
5191	#endif
5192	#ifdef REGEXP_DEBUG
5193	{
5194	int col;
5195
5196	if (t->subs.norm.in_use <= `0`) {
5197	col = -`1`;
5198	} else if (REG_MULTI) {
5199	col = t->subs.norm.list.multi[`0`].start_col;
5200	} else {
5201	col = (int)(t->subs.norm.list.line[`0`].start - regline);
5202	}
5203	nfa_set_code(t->state->c);
5204	fprintf(log_fd, "(%d) char %d %s (start col %d)%s... \n",
5205	abs(t->state->id), (int)t->state->c, code, col,
5206	pim_info(&t->pim));
5207	}
5208	#endif
5209
5210	/*
5211	* Handle the possible codes of the current state.
5212	* The most important is NFA_MATCH.
5213	*/
5214	add_state = NULL;
5215	add_here = false;
5216	add_count = `0`;
5217	switch (t->state->c) {
5218	case NFA_MATCH:
5219	{
5220	// If the match ends before a composing characters and
5221	// rex.reg_icombine is not set, that is not really a match.
5222	if (enc_utf8 && !rex.reg_icombine && utf_iscomposing(curc)) {
5223	break;
5224	}
5225	nfa_match = true;
5226	copy_sub(&submatch->norm, &t->subs.norm);
5227	if (nfa_has_zsubexpr)
5228	copy_sub(&submatch->synt, &t->subs.synt);
5229	#ifdef REGEXP_DEBUG
5230	log_subsexpr(&t->subs);
5231	#endif
5232	/ Found the left-most longest match, do not look at any other*
5233	* states at this position. When the list of states is going
5234	* to be empty quit without advancing, so that "reginput" is
5235	* correct. */
5236	if (nextlist->n == `0`)
5237	clen = `0`;
5238	goto nextchar;
5239	}
5240
5241	case NFA_END_INVISIBLE:
5242	case NFA_END_INVISIBLE_NEG:
5243	case NFA_END_PATTERN:
5244	/*
5245	* This is only encountered after a NFA_START_INVISIBLE or
5246	* NFA_START_INVISIBLE_BEFORE node.
5247	* They surround a zero-width group, used with "\@=", "\&",
5248	* "\@!", "\@<=" and "\@<!".
5249	* If we got here, it means that the current "invisible" group
5250	* finished successfully, so return control to the parent
5251	* nfa_regmatch(). For a look-behind match only when it ends
5252	* in the position in "nfa_endp".
5253	* Submatches are stored in *m, and used in the parent call.
5254	*/
5255	#ifdef REGEXP_DEBUG
5256	if (nfa_endp != NULL) {
5257	if (REG_MULTI)
5258	fprintf(
5259	log_fd,
5260	"Current lnum: %d, endp lnum: %d; current col: %d, endp col: %d\n",
5261	(int)reglnum,
5262	(int)nfa_endp->se_u.pos.lnum,
5263	(int)(reginput - regline),
5264	nfa_endp->se_u.pos.col);
5265	else
5266	fprintf(log_fd, "Current col: %d, endp col: %d\n",
5267	(int)(reginput - regline),
5268	(int)(nfa_endp->se_u.ptr - reginput));
5269	}
5270	#endif
5271	/ If "nfa_endp" is set it's only a match if it ends at*
5272	* "nfa_endp" */
5273	if (nfa_endp != NULL && (REG_MULTI
5274	? (reglnum != nfa_endp->se_u.pos.lnum
5275	\|\| (int)(reginput - regline)
5276	!= nfa_endp->se_u.pos.col)
5277	: reginput != nfa_endp->se_u.ptr))
5278	break;
5279
5280	/ do not set submatches for \@! /
5281	if (t->state->c != NFA_END_INVISIBLE_NEG) {
5282	copy_sub(&m->norm, &t->subs.norm);
5283	if (nfa_has_zsubexpr)
5284	copy_sub(&m->synt, &t->subs.synt);
5285	}
5286	#ifdef REGEXP_DEBUG
5287	fprintf(log_fd, "Match found:\n");
5288	log_subsexpr(m);
5289	#endif
5290	nfa_match = true;
5291	// See comment above at "goto nextchar".
5292	if (nextlist->n == `0`) {
5293	clen = `0`;
5294	}
5295	goto nextchar;
5296
5297	case NFA_START_INVISIBLE:
5298	case NFA_START_INVISIBLE_FIRST:
5299	case NFA_START_INVISIBLE_NEG:
5300	case NFA_START_INVISIBLE_NEG_FIRST:
5301	case NFA_START_INVISIBLE_BEFORE:
5302	case NFA_START_INVISIBLE_BEFORE_FIRST:
5303	case NFA_START_INVISIBLE_BEFORE_NEG:
5304	case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
5305	{
5306	#ifdef REGEXP_DEBUG
5307	fprintf(log_fd, "Failure chance invisible: %d, what follows: %d\n",
5308	failure_chance(t->state->out, `0`),
5309	failure_chance(t->state->out1->out, `0`));
5310	#endif
5311	// Do it directly if there already is a PIM or when
5312	// nfa_postprocess() detected it will work better.
5313	if (t->pim.result != NFA_PIM_UNUSED
5314	\|\| t->state->c == NFA_START_INVISIBLE_FIRST
5315	\|\| t->state->c == NFA_START_INVISIBLE_NEG_FIRST
5316	\|\| t->state->c == NFA_START_INVISIBLE_BEFORE_FIRST
5317	\|\| t->state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST) {
5318	int in_use = m->norm.in_use;
5319
5320	// Copy submatch info for the recursive call, opposite
5321	// of what happens on success below.
5322	copy_sub_off(&m->norm, &t->subs.norm);
5323	if (nfa_has_zsubexpr)
5324	copy_sub_off(&m->synt, &t->subs.synt);
5325
5326	// First try matching the invisible match, then what
5327	// follows.
5328	result = recursive_regmatch(t->state, NULL, prog, submatch, m,
5329	&listids, &listids_len);
5330	if (result == NFA_TOO_EXPENSIVE) {
5331	nfa_match = result;
5332	goto theend;
5333	}
5334
5335	// for \@! and \@<! it is a match when the result is
5336	// FALSE
5337	if (result != (t->state->c == NFA_START_INVISIBLE_NEG
5338	\|\| t->state->c == NFA_START_INVISIBLE_NEG_FIRST
5339	\|\| t->state->c
5340	== NFA_START_INVISIBLE_BEFORE_NEG
5341	\|\| t->state->c
5342	== NFA_START_INVISIBLE_BEFORE_NEG_FIRST)) {
5343	// Copy submatch info from the recursive call
5344	copy_sub_off(&t->subs.norm, &m->norm);
5345	if (nfa_has_zsubexpr)
5346	copy_sub_off(&t->subs.synt, &m->synt);
5347	// If the pattern has \ze and it matched in the
5348	// sub pattern, use it.
5349	copy_ze_off(&t->subs.norm, &m->norm);
5350
5351	// t->state->out1 is the corresponding
5352	// END_INVISIBLE node; Add its out to the current
5353	// list (zero-width match).
5354	add_here = true;
5355	add_state = t->state->out1->out;
5356	}
5357	m->norm.in_use = in_use;
5358	} else {
5359	nfa_pim_T pim;
5360
5361	// First try matching what follows. Only if a match
5362	// is found verify the invisible match matches. Add a
5363	// nfa_pim_T to the following states, it contains info
5364	// about the invisible match.
5365	pim.state = t->state;
5366	pim.result = NFA_PIM_TODO;
5367	pim.subs.norm.in_use = `0`;
5368	pim.subs.synt.in_use = `0`;
5369	if (REG_MULTI) {
5370	pim.end.pos.col = (int)(reginput - regline);
5371	pim.end.pos.lnum = reglnum;
5372	} else
5373	pim.end.ptr = reginput;
5374
5375	// t->state->out1 is the corresponding END_INVISIBLE
5376	// node; Add its out to the current list (zero-width
5377	// match).
5378	if (addstate_here(thislist, t->state->out1->out, &t->subs,
5379	&pim, &listidx) == NULL) {
5380	nfa_match = NFA_TOO_EXPENSIVE;
5381	goto theend;
5382	}
5383	}
5384	}
5385	break;
5386
5387	case NFA_START_PATTERN:
5388	{
5389	nfa_state_T *skip = NULL;
5390	#ifdef REGEXP_DEBUG
5391	int skip_lid = `0`;
5392	#endif
5393
5394	// There is no point in trying to match the pattern if the
5395	// output state is not going to be added to the list.
5396	if (state_in_list(nextlist, t->state->out1->out, &t->subs)) {
5397	skip = t->state->out1->out;
5398	#ifdef REGEXP_DEBUG
5399	skip_lid = nextlist->id;
5400	#endif
5401	} else if (state_in_list(nextlist,
5402	t->state->out1->out->out, &t->subs)) {
5403	skip = t->state->out1->out->out;
5404	#ifdef REGEXP_DEBUG
5405	skip_lid = nextlist->id;
5406	#endif
5407	} else if (state_in_list(thislist,
5408	t->state->out1->out->out, &t->subs)) {
5409	skip = t->state->out1->out->out;
5410	#ifdef REGEXP_DEBUG
5411	skip_lid = thislist->id;
5412	#endif
5413	}
5414	if (skip != NULL) {
5415	#ifdef REGEXP_DEBUG
5416	nfa_set_code(skip->c);
5417	fprintf(
5418	log_fd,
5419	"> Not trying to match pattern, output state %d is already in list %d. char %d: %s\n",
5420	abs(skip->id), skip_lid, skip->c, code);
5421	#endif
5422	break;
5423	}
5424	// Copy submatch info to the recursive call, opposite of what
5425	// happens afterwards.
5426	copy_sub_off(&m->norm, &t->subs.norm);
5427	if (nfa_has_zsubexpr) {
5428	copy_sub_off(&m->synt, &t->subs.synt);
5429	}
5430
5431	// First try matching the pattern.
5432	result = recursive_regmatch(t->state, NULL, prog, submatch, m,
5433	&listids, &listids_len);
5434	if (result == NFA_TOO_EXPENSIVE) {
5435	nfa_match = result;
5436	goto theend;
5437	}
5438	if (result) {
5439	int bytelen;
5440
5441	#ifdef REGEXP_DEBUG
5442	fprintf(log_fd, "NFA_START_PATTERN matches:\n");
5443	log_subsexpr(m);
5444	#endif
5445	// Copy submatch info from the recursive call
5446	copy_sub_off(&t->subs.norm, &m->norm);
5447	if (nfa_has_zsubexpr) {
5448	copy_sub_off(&t->subs.synt, &m->synt);
5449	}
5450	// Now we need to skip over the matched text and then
5451	// continue with what follows.
5452	if (REG_MULTI) {
5453	// TODO(RE): multi-line match
5454	bytelen = m->norm.list.multi[`0`].end_col
5455	- (int)(reginput - regline);
5456	} else {
5457	bytelen = (int)(m->norm.list.line[`0`].end - reginput);
5458	}
5459
5460	#ifdef REGEXP_DEBUG
5461	fprintf(log_fd, "NFA_START_PATTERN length: %d\n", bytelen);
5462	#endif
5463	if (bytelen == `0`) {
5464	// empty match, output of corresponding
5465	// NFA_END_PATTERN/NFA_SKIP to be used at current
5466	// position
5467	add_here = true;
5468	add_state = t->state->out1->out->out;
5469	} else if (bytelen <= clen) {
5470	// match current character, output of corresponding
5471	// NFA_END_PATTERN to be used at next position.
5472	add_state = t->state->out1->out->out;
5473	add_off = clen;
5474	} else {
5475	// skip over the matched characters, set character
5476	// count in NFA_SKIP
5477	add_state = t->state->out1->out;
5478	add_off = bytelen;
5479	add_count = bytelen - clen;
5480	}
5481	}
5482	break;
5483	}
5484
5485	case NFA_BOL:
5486	if (reginput == regline) {
5487	add_here = true;
5488	add_state = t->state->out;
5489	}
5490	break;
5491
5492	case NFA_EOL:
5493	if (curc == NUL) {
5494	add_here = true;
5495	add_state = t->state->out;
5496	}
5497	break;
5498
5499	case NFA_BOW:
5500	result = true;
5501
5502	if (curc == NUL) {
5503	result = false;
5504	} else if (has_mbyte) {
5505	int this_class;
5506
5507	// Get class of current and previous char (if it exists).
5508	this_class = mb_get_class_tab(reginput, rex.reg_buf->b_chartab);
5509	if (this_class <= `1`) {
5510	result = false;
5511	} else if (reg_prev_class() == this_class) {
5512	result = false;
5513	}
5514	} else if (!vim_iswordc_buf(curc, rex.reg_buf)
5515	\|\| (reginput > regline
5516	&& vim_iswordc_buf(reginput[-`1`], rex.reg_buf))) {
5517	result = false;
5518	}
5519	if (result) {
5520	add_here = true;
5521	add_state = t->state->out;
5522	}
5523	break;
5524
5525	case NFA_EOW:
5526	result = true;
5527	if (reginput == regline) {
5528	result = false;
5529	} else if (has_mbyte) {
5530	int this_class, prev_class;
5531
5532	// Get class of current and previous char (if it exists).
5533	this_class = mb_get_class_tab(reginput, rex.reg_buf->b_chartab);
5534	prev_class = reg_prev_class();
5535	if (this_class == prev_class
5536	\|\| prev_class == `0` \|\| prev_class == `1`) {
5537	result = false;
5538	}
5539	} else if (!vim_iswordc_buf(reginput[-`1`], rex.reg_buf)
5540	\|\| (reginput[`0`] != NUL
5541	&& vim_iswordc_buf(curc, rex.reg_buf))) {
5542	result = false;
5543	}
5544	if (result) {
5545	add_here = true;
5546	add_state = t->state->out;
5547	}
5548	break;
5549
5550	case NFA_BOF:
5551	if (reglnum == `0` && reginput == regline
5552	&& (!REG_MULTI \|\| rex.reg_firstlnum == `1`)) {
5553	add_here = true;
5554	add_state = t->state->out;
5555	}
5556	break;
5557
5558	case NFA_EOF:
5559	if (reglnum == rex.reg_maxline && curc == NUL) {
5560	add_here = true;
5561	add_state = t->state->out;
5562	}
5563	break;
5564
5565	case NFA_COMPOSING:
5566	{
5567	int mc = curc;
5568	int len = `0`;
5569	nfa_state_T *end;
5570	nfa_state_T *sta;
5571	int cchars[MAX_MCO];
5572	int ccount = `0`;
5573	int j;
5574
5575	sta = t->state->out;
5576	len = `0`;
5577	if (utf_iscomposing(sta->c)) {
5578	// Only match composing character(s), ignore base
5579	// character. Used for ".{composing}" and "{composing}"
5580	// (no preceding character).
5581	len += mb_char2len(mc);
5582	}
5583	if (rex.reg_icombine && len == `0`) {
5584	// If \Z was present, then ignore composing characters.
5585	// When ignoring the base character this always matches.
5586	if (sta->c != curc) {
5587	result = FAIL;
5588	} else {
5589	result = OK;
5590	}
5591	while (sta->c != NFA_END_COMPOSING) {
5592	sta = sta->out;
5593	}
5594	} else if (len > `0` \|\| mc == sta->c) {
5595	// Check base character matches first, unless ignored.
5596	if (len == `0`) {
5597	len += mb_char2len(mc);
5598	sta = sta->out;
5599	}
5600
5601	// We don't care about the order of composing characters.
5602	// Get them into cchars[] first.
5603	while (len < clen) {
5604	mc = utf_ptr2char(reginput + len);
5605	cchars[ccount++] = mc;
5606	len += mb_char2len(mc);
5607	if (ccount == MAX_MCO)
5608	break;
5609	}
5610
5611	// Check that each composing char in the pattern matches a
5612	// composing char in the text. We do not check if all
5613	// composing chars are matched.
5614	result = OK;
5615	while (sta->c != NFA_END_COMPOSING) {
5616	for (j = `0`; j < ccount; ++j)
5617	if (cchars[j] == sta->c)
5618	break;
5619	if (j == ccount) {
5620	result = FAIL;
5621	break;
5622	}
5623	sta = sta->out;
5624	}
5625	} else
5626	result = FAIL;
5627
5628	end = t->state->out1; // NFA_END_COMPOSING
5629	ADD_STATE_IF_MATCH(end);
5630	break;
5631	}
5632
5633	case NFA_NEWL:
5634	if (curc == NUL && !rex.reg_line_lbr && REG_MULTI
5635	&& reglnum <= rex.reg_maxline) {
5636	go_to_nextline = true;
5637	// Pass -1 for the offset, which means taking the position
5638	// at the start of the next line.
5639	add_state = t->state->out;
5640	add_off = -`1`;
5641	} else if (curc == `'\n'` && rex.reg_line_lbr) {
5642	// match \n as if it is an ordinary character
5643	add_state = t->state->out;
5644	add_off = `1`;
5645	}
5646	break;
5647
5648	case NFA_START_COLL:
5649	case NFA_START_NEG_COLL:
5650	{
5651	// What follows is a list of characters, until NFA_END_COLL.
5652	// One of them must match or none of them must match.
5653	nfa_state_T *state;
5654	int result_if_matched;
5655	int c1, c2;
5656
5657	// Never match EOL. If it's part of the collection it is added
5658	// as a separate state with an OR.
5659	if (curc == NUL) {
5660	break;
5661	}
5662
5663	state = t->state->out;
5664	result_if_matched = (t->state->c == NFA_START_COLL);
5665	for (;; ) {
5666	if (state->c == NFA_END_COLL) {
5667	result = !result_if_matched;
5668	break;
5669	}
5670	if (state->c == NFA_RANGE_MIN) {
5671	c1 = state->val;
5672	state = state->out; // advance to NFA_RANGE_MAX
5673	c2 = state->val;
5674	#ifdef REGEXP_DEBUG
5675	fprintf(log_fd, "NFA_RANGE_MIN curc=%d c1=%d c2=%d\n",
5676	curc, c1, c2);
5677	#endif
5678	if (curc >= c1 && curc <= c2) {
5679	result = result_if_matched;
5680	break;
5681	}
5682	if (rex.reg_ic) {
5683	int curc_low = mb_tolower(curc);
5684	int done = false;
5685
5686	for (; c1 <= c2; c1++) {
5687	if (mb_tolower(c1) == curc_low) {
5688	result = result_if_matched;
5689	done = TRUE;
5690	break;
5691	}
5692	}
5693	if (done) {
5694	break;
5695	}
5696	}
5697	} else if (state->c < `0` ? check_char_class(state->c, curc)
5698	: (curc == state->c
5699	\|\| (rex.reg_ic && mb_tolower(curc)
5700	== mb_tolower(state->c)))) {
5701	result = result_if_matched;
5702	break;
5703	}
5704	state = state->out;
5705	}
5706	if (result) {
5707	// next state is in out of the NFA_END_COLL, out1 of
5708	// START points to the END state
5709	add_state = t->state->out1->out;
5710	add_off = clen;
5711	}
5712	break;
5713	}
5714
5715	case NFA_ANY:
5716	// Any char except '\0', (end of input) does not match.
5717	if (curc > `0`) {
5718	add_state = t->state->out;
5719	add_off = clen;
5720	}
5721	break;
5722
5723	case NFA_ANY_COMPOSING:
5724	// On a composing character skip over it. Otherwise do
5725	// nothing. Always matches.
5726	if (enc_utf8 && utf_iscomposing(curc)) {
5727	add_off = clen;
5728	} else {
5729	add_here = true;
5730	add_off = `0`;
5731	}
5732	add_state = t->state->out;
5733	break;
5734
5735	// Character classes like \a for alpha, \d for digit etc.
5736	case NFA_IDENT: // \i
5737	result = vim_isIDc(curc);
5738	ADD_STATE_IF_MATCH(t->state);
5739	break;
5740
5741	case NFA_SIDENT: // \I
5742	result = !ascii_isdigit(curc) && vim_isIDc(curc);
5743	ADD_STATE_IF_MATCH(t->state);
5744	break;
5745
5746	case NFA_KWORD: // \k
5747	result = vim_iswordp_buf(reginput, rex.reg_buf);
5748	ADD_STATE_IF_MATCH(t->state);
5749	break;
5750
5751	case NFA_SKWORD: // \K
5752	result = !ascii_isdigit(curc)
5753	&& vim_iswordp_buf(reginput, rex.reg_buf);
5754	ADD_STATE_IF_MATCH(t->state);
5755	break;
5756
5757	case NFA_FNAME: // \f
5758	result = vim_isfilec(curc);
5759	ADD_STATE_IF_MATCH(t->state);
5760	break;
5761
5762	case NFA_SFNAME: // \F
5763	result = !ascii_isdigit(curc) && vim_isfilec(curc);
5764	ADD_STATE_IF_MATCH(t->state);
5765	break;
5766
5767	case NFA_PRINT: // \p
5768	result = vim_isprintc(PTR2CHAR(reginput));
5769	ADD_STATE_IF_MATCH(t->state);
5770	break;
5771
5772	case NFA_SPRINT: // \P
5773	result = !ascii_isdigit(curc) && vim_isprintc(PTR2CHAR(reginput));
5774	ADD_STATE_IF_MATCH(t->state);
5775	break;
5776
5777	case NFA_WHITE: // \s
5778	result = ascii_iswhite(curc);
5779	ADD_STATE_IF_MATCH(t->state);
5780	break;
5781
5782	case NFA_NWHITE: // \S
5783	result = curc != NUL && !ascii_iswhite(curc);
5784	ADD_STATE_IF_MATCH(t->state);
5785	break;
5786
5787	case NFA_DIGIT: // \d
5788	result = ri_digit(curc);
5789	ADD_STATE_IF_MATCH(t->state);
5790	break;
5791
5792	case NFA_NDIGIT: // \D
5793	result = curc != NUL && !ri_digit(curc);
5794	ADD_STATE_IF_MATCH(t->state);
5795	break;
5796
5797	case NFA_HEX: // \x
5798	result = ri_hex(curc);
5799	ADD_STATE_IF_MATCH(t->state);
5800	break;
5801
5802	case NFA_NHEX: // \X
5803	result = curc != NUL && !ri_hex(curc);
5804	ADD_STATE_IF_MATCH(t->state);
5805	break;
5806
5807	case NFA_OCTAL: // \o
5808	result = ri_octal(curc);
5809	ADD_STATE_IF_MATCH(t->state);
5810	break;
5811
5812	case NFA_NOCTAL: // \O
5813	result = curc != NUL && !ri_octal(curc);
5814	ADD_STATE_IF_MATCH(t->state);
5815	break;
5816
5817	case NFA_WORD: // \w
5818	result = ri_word(curc);
5819	ADD_STATE_IF_MATCH(t->state);
5820	break;
5821
5822	case NFA_NWORD: // \W
5823	result = curc != NUL && !ri_word(curc);
5824	ADD_STATE_IF_MATCH(t->state);
5825	break;
5826
5827	case NFA_HEAD: // \h
5828	result = ri_head(curc);
5829	ADD_STATE_IF_MATCH(t->state);
5830	break;
5831
5832	case NFA_NHEAD: // \H
5833	result = curc != NUL && !ri_head(curc);
5834	ADD_STATE_IF_MATCH(t->state);
5835	break;
5836
5837	case NFA_ALPHA: // \a
5838	result = ri_alpha(curc);
5839	ADD_STATE_IF_MATCH(t->state);
5840	break;
5841
5842	case NFA_NALPHA: // \A
5843	result = curc != NUL && !ri_alpha(curc);
5844	ADD_STATE_IF_MATCH(t->state);
5845	break;
5846
5847	case NFA_LOWER: // \l
5848	result = ri_lower(curc);
5849	ADD_STATE_IF_MATCH(t->state);
5850	break;
5851
5852	case NFA_NLOWER: // \L
5853	result = curc != NUL && !ri_lower(curc);
5854	ADD_STATE_IF_MATCH(t->state);
5855	break;
5856
5857	case NFA_UPPER: // \u
5858	result = ri_upper(curc);
5859	ADD_STATE_IF_MATCH(t->state);
5860	break;
5861
5862	case NFA_NUPPER: // \U
5863	result = curc != NUL && !ri_upper(curc);
5864	ADD_STATE_IF_MATCH(t->state);
5865	break;
5866
5867	case NFA_LOWER_IC: // [a-z]
5868	result = ri_lower(curc) \|\| (rex.reg_ic && ri_upper(curc));
5869	ADD_STATE_IF_MATCH(t->state);
5870	break;
5871
5872	case NFA_NLOWER_IC: // [^a-z]
5873	result = curc != NUL
5874	&& !(ri_lower(curc) \|\| (rex.reg_ic && ri_upper(curc)));
5875	ADD_STATE_IF_MATCH(t->state);
5876	break;
5877
5878	case NFA_UPPER_IC: // [A-Z]
5879	result = ri_upper(curc) \|\| (rex.reg_ic && ri_lower(curc));
5880	ADD_STATE_IF_MATCH(t->state);
5881	break;
5882
5883	case NFA_NUPPER_IC: // [^A-Z]
5884	result = curc != NUL
5885	&& !(ri_upper(curc) \|\| (rex.reg_ic && ri_lower(curc)));
5886	ADD_STATE_IF_MATCH(t->state);
5887	break;
5888
5889	case NFA_BACKREF1:
5890	case NFA_BACKREF2:
5891	case NFA_BACKREF3:
5892	case NFA_BACKREF4:
5893	case NFA_BACKREF5:
5894	case NFA_BACKREF6:
5895	case NFA_BACKREF7:
5896	case NFA_BACKREF8:
5897	case NFA_BACKREF9:
5898	case NFA_ZREF1:
5899	case NFA_ZREF2:
5900	case NFA_ZREF3:
5901	case NFA_ZREF4:
5902	case NFA_ZREF5:
5903	case NFA_ZREF6:
5904	case NFA_ZREF7:
5905	case NFA_ZREF8:
5906	case NFA_ZREF9:
5907	// \1 .. \9 \z1 .. \z9
5908	{
5909	int subidx;
5910	int bytelen;
5911
5912	if (t->state->c <= NFA_BACKREF9) {
5913	subidx = t->state->c - NFA_BACKREF1 + `1`;
5914	result = match_backref(&t->subs.norm, subidx, &bytelen);
5915	} else {
5916	subidx = t->state->c - NFA_ZREF1 + `1`;
5917	result = match_zref(subidx, &bytelen);
5918	}
5919
5920	if (result) {
5921	if (bytelen == `0`) {
5922	// empty match always works, output of NFA_SKIP to be
5923	// used next
5924	add_here = true;
5925	add_state = t->state->out->out;
5926	} else if (bytelen <= clen) {
5927	// match current character, jump ahead to out of
5928	// NFA_SKIP
5929	add_state = t->state->out->out;
5930	add_off = clen;
5931	} else {
5932	// skip over the matched characters, set character
5933	// count in NFA_SKIP
5934	add_state = t->state->out;
5935	add_off = bytelen;
5936	add_count = bytelen - clen;
5937	}
5938	}
5939	break;
5940	}
5941	case NFA_SKIP:
5942	// character of previous matching \1 .. \9 or \@>
5943	if (t->count - clen <= `0`) {
5944	// end of match, go to what follows
5945	add_state = t->state->out;
5946	add_off = clen;
5947	} else {
5948	// add state again with decremented count
5949	add_state = t->state;
5950	add_off = `0`;
5951	add_count = t->count - clen;
5952	}
5953	break;
5954
5955	case NFA_LNUM:
5956	case NFA_LNUM_GT:
5957	case NFA_LNUM_LT:
5958	assert(t->state->val >= `0`
5959	&& !((rex.reg_firstlnum > `0`
5960	&& reglnum > LONG_MAX - rex.reg_firstlnum)
5961	\|\| (rex.reg_firstlnum < `0`
5962	&& reglnum < LONG_MIN + rex.reg_firstlnum))
5963	&& reglnum + rex.reg_firstlnum >= `0`);
5964	result = (REG_MULTI
5965	&& nfa_re_num_cmp((uintmax_t)t->state->val,
5966	t->state->c - NFA_LNUM,
5967	(uintmax_t)(reglnum + rex.reg_firstlnum)));
5968	if (result) {
5969	add_here = true;
5970	add_state = t->state->out;
5971	}
5972	break;
5973
5974	case NFA_COL:
5975	case NFA_COL_GT:
5976	case NFA_COL_LT:
5977	assert(t->state->val >= `0`
5978	&& reginput >= regline
5979	&& (uintmax_t)(reginput - regline) <= UINTMAX_MAX - `1`);
5980	result = nfa_re_num_cmp((uintmax_t)t->state->val,
5981	t->state->c - NFA_COL,
5982	(uintmax_t)(reginput - regline + `1`));
5983	if (result) {
5984	add_here = true;
5985	add_state = t->state->out;
5986	}
5987	break;
5988
5989	case NFA_VCOL:
5990	case NFA_VCOL_GT:
5991	case NFA_VCOL_LT:
5992	{
5993	int op = t->state->c - NFA_VCOL;
5994	colnr_T col = (colnr_T)(reginput - regline);
5995
5996	// Bail out quickly when there can't be a match, avoid the overhead of
5997	// win_linetabsize() on long lines.
5998	if (op != `1` && col > t->state->val * (has_mbyte ? MB_MAXBYTES : `1`)) {
5999	break;
6000	}
6001
6002	result = false;
6003	win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
6004	if (op == `1` && col - `1` > t->state->val && col > `100`) {
6005	long ts = wp->w_buffer->b_p_ts;
6006
6007	// Guess that a character won't use more columns than 'tabstop',
6008	// with a minimum of 4.
6009	if (ts < `4`) {
6010	ts = `4`;
6011	}
6012	result = col > t->state->val * ts;
6013	}
6014	if (!result) {
6015	uintmax_t lts = win_linetabsize(wp, regline, col);
6016	assert(t->state->val >= `0`);
6017	result = nfa_re_num_cmp((uintmax_t)t->state->val, op, lts + `1`);
6018	}
6019	if (result) {
6020	add_here = true;
6021	add_state = t->state->out;
6022	}
6023	}
6024	break;
6025
6026	case NFA_MARK:
6027	case NFA_MARK_GT:
6028	case NFA_MARK_LT:
6029	{
6030	pos_T *pos = getmark_buf(rex.reg_buf, t->state->val, false);
6031
6032	// Compare the mark position to the match position.
6033	result = (pos != NULL // mark doesn't exist
6034	&& pos->lnum > `0` // mark isn't set in reg_buf
6035	&& (pos->lnum == reglnum + rex.reg_firstlnum
6036	? (pos->col == (colnr_T)(reginput - regline)
6037	? t->state->c == NFA_MARK
6038	: (pos->col < (colnr_T)(reginput - regline)
6039	? t->state->c == NFA_MARK_GT
6040	: t->state->c == NFA_MARK_LT))
6041	: (pos->lnum < reglnum + rex.reg_firstlnum
6042	? t->state->c == NFA_MARK_GT
6043	: t->state->c == NFA_MARK_LT)));
6044	if (result) {
6045	add_here = true;
6046	add_state = t->state->out;
6047	}
6048	break;
6049	}
6050
6051	case NFA_CURSOR:
6052	result = (rex.reg_win != NULL
6053	&& (reglnum + rex.reg_firstlnum == rex.reg_win->w_cursor.lnum)
6054	&& ((colnr_T)(reginput - regline)
6055	== rex.reg_win->w_cursor.col));
6056	if (result) {
6057	add_here = true;
6058	add_state = t->state->out;
6059	}
6060	break;
6061
6062	case NFA_VISUAL:
6063	result = reg_match_visual();
6064	if (result) {
6065	add_here = true;
6066	add_state = t->state->out;
6067	}
6068	break;
6069
6070	case NFA_MOPEN1:
6071	case NFA_MOPEN2:
6072	case NFA_MOPEN3:
6073	case NFA_MOPEN4:
6074	case NFA_MOPEN5:
6075	case NFA_MOPEN6:
6076	case NFA_MOPEN7:
6077	case NFA_MOPEN8:
6078	case NFA_MOPEN9:
6079	case NFA_ZOPEN:
6080	case NFA_ZOPEN1:
6081	case NFA_ZOPEN2:
6082	case NFA_ZOPEN3:
6083	case NFA_ZOPEN4:
6084	case NFA_ZOPEN5:
6085	case NFA_ZOPEN6:
6086	case NFA_ZOPEN7:
6087	case NFA_ZOPEN8:
6088	case NFA_ZOPEN9:
6089	case NFA_NOPEN:
6090	case NFA_ZSTART:
6091	// These states are only added to be able to bail out when
6092	// they are added again, nothing is to be done.
6093	break;
6094
6095	default: // regular character
6096	{
6097	int c = t->state->c;
6098
6099	#ifdef REGEXP_DEBUG
6100	if (c < `0`) {
6101	IEMSGN("INTERNAL: Negative state char: %" PRId64, c);
6102	}
6103	#endif
6104	result = (c == curc);
6105
6106	if (!result && rex.reg_ic) {
6107	result = mb_tolower(c) == mb_tolower(curc);
6108	}
6109
6110	// If rex.reg_icombine is not set only skip over the character
6111	// itself. When it is set skip over composing characters.
6112	if (result && enc_utf8 && !rex.reg_icombine) {
6113	clen = utf_ptr2len(reginput);
6114	}
6115
6116	ADD_STATE_IF_MATCH(t->state);
6117	break;
6118	}
6119	} // switch (t->state->c)
6120
6121	if (add_state != NULL) {
6122	nfa_pim_T *pim;
6123	nfa_pim_T pim_copy;
6124
6125	if (t->pim.result == NFA_PIM_UNUSED)
6126	pim = NULL;
6127	else
6128	pim = &t->pim;
6129
6130	// Handle the postponed invisible match if the match might end
6131	// without advancing and before the end of the line.
6132	if (pim != NULL && (clen == `0` \|\| match_follows(add_state, `0`))) {
6133	if (pim->result == NFA_PIM_TODO) {
6134	#ifdef REGEXP_DEBUG
6135	fprintf(log_fd, "\n");
6136	fprintf(log_fd, "==================================\n");
6137	fprintf(log_fd, "Postponed recursive nfa_regmatch()\n");
6138	fprintf(log_fd, "\n");
6139	#endif
6140	result = recursive_regmatch(pim->state, pim, prog, submatch, m,
6141	&listids, &listids_len);
6142	pim->result = result ? NFA_PIM_MATCH : NFA_PIM_NOMATCH;
6143	// for \@! and \@<! it is a match when the result is
6144	// FALSE
6145	if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
6146	\|\| pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
6147	\|\| pim->state->c
6148	== NFA_START_INVISIBLE_BEFORE_NEG
6149	\|\| pim->state->c
6150	== NFA_START_INVISIBLE_BEFORE_NEG_FIRST)) {
6151	// Copy submatch info from the recursive call
6152	copy_sub_off(&pim->subs.norm, &m->norm);
6153	if (nfa_has_zsubexpr)
6154	copy_sub_off(&pim->subs.synt, &m->synt);
6155	}
6156	} else {
6157	result = (pim->result == NFA_PIM_MATCH);
6158	#ifdef REGEXP_DEBUG
6159	fprintf(log_fd, "\n");
6160	fprintf(
6161	log_fd,
6162	"Using previous recursive nfa_regmatch() result, result == %d\n",
6163	pim->result);
6164	fprintf(log_fd, "MATCH = %s\n", result ? "OK" : "FALSE");
6165	fprintf(log_fd, "\n");
6166	#endif
6167	}
6168
6169	// for \@! and \@<! it is a match when result is FALSE
6170	if (result != (pim->state->c == NFA_START_INVISIBLE_NEG
6171	\|\| pim->state->c == NFA_START_INVISIBLE_NEG_FIRST
6172	\|\| pim->state->c
6173	== NFA_START_INVISIBLE_BEFORE_NEG
6174	\|\| pim->state->c
6175	== NFA_START_INVISIBLE_BEFORE_NEG_FIRST)) {
6176	// Copy submatch info from the recursive call
6177	copy_sub_off(&t->subs.norm, &pim->subs.norm);
6178	if (nfa_has_zsubexpr)
6179	copy_sub_off(&t->subs.synt, &pim->subs.synt);
6180	} else {
6181	// look-behind match failed, don't add the state
6182	continue;
6183	}
6184
6185	// Postponed invisible match was handled, don't add it to
6186	// following states.
6187	pim = NULL;
6188	}
6189
6190	// If "pim" points into l->t it will become invalid when
6191	// adding the state causes the list to be reallocated. Make a
6192	// local copy to avoid that.
6193	if (pim == &t->pim) {
6194	copy_pim(&pim_copy, pim);
6195	pim = &pim_copy;
6196	}
6197
6198	if (add_here) {
6199	r = addstate_here(thislist, add_state, &t->subs, pim, &listidx);
6200	} else {
6201	r = addstate(nextlist, add_state, &t->subs, pim, add_off);
6202	if (add_count > `0`) {
6203	nextlist->t[nextlist->n - `1`].count = add_count;
6204	}
6205	}
6206	if (r == NULL) {
6207	nfa_match = NFA_TOO_EXPENSIVE;
6208	goto theend;
6209	}
6210	}
6211	} // for (thislist = thislist; thislist->state; thislist++)
6212
6213	// Look for the start of a match in the current position by adding the
6214	// start state to the list of states.
6215	// The first found match is the leftmost one, thus the order of states
6216	// matters!
6217	// Do not add the start state in recursive calls of nfa_regmatch(),
6218	// because recursive calls should only start in the first position.
6219	// Unless "nfa_endp" is not NULL, then we match the end position.
6220	// Also don't start a match past the first line.
6221	if (!nfa_match
6222	&& ((toplevel
6223	&& reglnum == `0`
6224	&& clen != `0`
6225	&& (rex.reg_maxcol == `0`
6226	\|\| (colnr_T)(reginput - regline) < rex.reg_maxcol))
6227	\|\| (nfa_endp != NULL
6228	&& (REG_MULTI
6229	? (reglnum < nfa_endp->se_u.pos.lnum
6230	\|\| (reglnum == nfa_endp->se_u.pos.lnum
6231	&& (int)(reginput - regline)
6232	< nfa_endp->se_u.pos.col))
6233	: reginput < nfa_endp->se_u.ptr)))) {
6234	#ifdef REGEXP_DEBUG
6235	fprintf(log_fd, "(---) STARTSTATE\n");
6236	#endif
6237	// Inline optimized code for addstate() if we know the state is
6238	// the first MOPEN.
6239	if (toplevel) {
6240	int add = TRUE;
6241	int c;
6242
6243	if (prog->regstart != NUL && clen != `0`) {
6244	if (nextlist->n == `0`) {
6245	colnr_T col = (colnr_T)(reginput - regline) + clen;
6246
6247	// Nextlist is empty, we can skip ahead to the
6248	// character that must appear at the start.
6249	if (skip_to_start(prog->regstart, &col) == FAIL) {
6250	break;
6251	}
6252	#ifdef REGEXP_DEBUG
6253	fprintf(log_fd, " Skipping ahead %d bytes to regstart\n",
6254	col - ((colnr_T)(reginput - regline) + clen));
6255	#endif
6256	reginput = regline + col - clen;
6257	} else {
6258	// Checking if the required start character matches is
6259	// cheaper than adding a state that won't match.
6260	c = PTR2CHAR(reginput + clen);
6261	if (c != prog->regstart && (!rex.reg_ic \|\| mb_tolower(c)
6262	!= mb_tolower(prog->regstart))) {
6263	#ifdef REGEXP_DEBUG
6264	fprintf(log_fd,
6265	" Skipping start state, regstart does not match\n");
6266	#endif
6267	add = FALSE;
6268	}
6269	}
6270	}
6271
6272	if (add) {
6273	if (REG_MULTI)
6274	m->norm.list.multi[`0`].start_col =
6275	(colnr_T)(reginput - regline) + clen;
6276	else
6277	m->norm.list.line[`0`].start = reginput + clen;
6278	if (addstate(nextlist, start->out, m, NULL, clen) == NULL) {
6279	nfa_match = NFA_TOO_EXPENSIVE;
6280	goto theend;
6281	}
6282	}
6283	} else {
6284	if (addstate(nextlist, start, m, NULL, clen) == NULL) {
6285	nfa_match = NFA_TOO_EXPENSIVE;
6286	goto theend;
6287	}
6288	}
6289	}
6290
6291	#ifdef REGEXP_DEBUG
6292	fprintf(log_fd, ">>> Thislist had %d states available: ", thislist->n);
6293	{
6294	int i;
6295
6296	for (i = `0`; i < thislist->n; i++)
6297	fprintf(log_fd, "%d ", abs(thislist->t[i].state->id));
6298	}
6299	fprintf(log_fd, "\n");
6300	#endif
6301
6302	nextchar:
6303	// Advance to the next character, or advance to the next line, or
6304	// finish.
6305	if (clen != `0`) {
6306	reginput += clen;
6307	} else if (go_to_nextline \|\| (nfa_endp != NULL && REG_MULTI
6308	&& reglnum < nfa_endp->se_u.pos.lnum)) {
6309	reg_nextline();
6310	} else {
6311	break;
6312	}
6313
6314	// Allow interrupting with CTRL-C.
6315	line_breakcheck();
6316	if (got_int) {
6317	break;
6318	}
6319	// Check for timeout once every twenty times to avoid overhead.
6320	if (nfa_time_limit != NULL && ++nfa_time_count == `20`) {
6321	nfa_time_count = `0`;
6322	if (nfa_did_time_out()) {
6323	break;
6324	}
6325	}
6326	}
6327
6328	#ifdef REGEXP_DEBUG
6329	if (log_fd != stderr)
6330	fclose(log_fd);
6331	log_fd = NULL;
6332	#endif
6333
6334	theend:
6335	// Free memory
6336	xfree(list[`0`].t);
6337	xfree(list[`1`].t);
6338	xfree(listids);
6339	#undef ADD_STATE_IF_MATCH
6340	#ifdef NFA_REGEXP_DEBUG_LOG
6341	fclose(debug);
6342	#endif
6343
6344	return nfa_match;
6345	}
6346
6347	// Try match of "prog" with at regline["col"].
6348	// Returns <= 0 for failure, number of lines contained in the match otherwise.
6349	static long nfa_regtry(nfa_regprog_T *prog,
6350	colnr_T col,
6351	proftime_T tm, // timeout limit or NULL*
6352	int timed_out) // flag set on timeout or NULL*
6353	{
6354	int i;
6355	regsubs_T subs, m;
6356	nfa_state_T *start = prog->start;
6357	#ifdef REGEXP_DEBUG
6358	FILE *f;
6359	#endif
6360
6361	reginput = regline + col;
6362	nfa_time_limit = tm;
6363	nfa_timed_out = timed_out;
6364	nfa_time_count = `0`;
6365
6366	#ifdef REGEXP_DEBUG
6367	f = fopen(NFA_REGEXP_RUN_LOG, "a");
6368	if (f != NULL) {
6369	fprintf(f,
6370	"\n\n\t=======================================================\n");
6371	#ifdef REGEXP_DEBUG
6372	fprintf(f, "\tRegexp is \"%s\"\n", nfa_regengine.expr);
6373	#endif
6374	fprintf(f, "\tInput text is \"%s\" \n", reginput);
6375	fprintf(f, "\t=======================================================\n\n");
6376	nfa_print_state(f, start);
6377	fprintf(f, "\n\n");
6378	fclose(f);
6379	} else {
6380	EMSG("Could not open temporary log file for writing");
6381	}
6382	#endif
6383
6384	clear_sub(&subs.norm);
6385	clear_sub(&m.norm);
6386	clear_sub(&subs.synt);
6387	clear_sub(&m.synt);
6388
6389	int result = nfa_regmatch(prog, start, &subs, &m);
6390	if (!result) {
6391	return `0`;
6392	} else if (result == NFA_TOO_EXPENSIVE) {
6393	return result;
6394	}
6395
6396	cleanup_subexpr();
6397	if (REG_MULTI) {
6398	for (i = `0`; i < subs.norm.in_use; i++) {
6399	rex.reg_startpos[i].lnum = subs.norm.list.multi[i].start_lnum;
6400	rex.reg_startpos[i].col = subs.norm.list.multi[i].start_col;
6401
6402	rex.reg_endpos[i].lnum = subs.norm.list.multi[i].end_lnum;
6403	rex.reg_endpos[i].col = subs.norm.list.multi[i].end_col;
6404	}
6405
6406	if (rex.reg_startpos[`0`].lnum < `0`) {
6407	rex.reg_startpos[`0`].lnum = `0`;
6408	rex.reg_startpos[`0`].col = col;
6409	}
6410	if (rex.reg_endpos[`0`].lnum < `0`) {
6411	// pattern has a \ze but it didn't match, use current end
6412	rex.reg_endpos[`0`].lnum = reglnum;
6413	rex.reg_endpos[`0`].col = (int)(reginput - regline);
6414	} else {
6415	// Use line number of "\ze".
6416	reglnum = rex.reg_endpos[`0`].lnum;
6417	}
6418	} else {
6419	for (i = `0`; i < subs.norm.in_use; i++) {
6420	rex.reg_startp[i] = subs.norm.list.line[i].start;
6421	rex.reg_endp[i] = subs.norm.list.line[i].end;
6422	}
6423
6424	if (rex.reg_startp[`0`] == NULL) {
6425	rex.reg_startp[`0`] = regline + col;
6426	}
6427	if (rex.reg_endp[`0`] == NULL) {
6428	rex.reg_endp[`0`] = reginput;
6429	}
6430	}
6431
6432	/ Package any found \z(...\) matches for export. Default is none. /
6433	unref_extmatch(re_extmatch_out);
6434	re_extmatch_out = NULL;
6435
6436	if (prog->reghasz == REX_SET) {
6437	cleanup_zsubexpr();
6438	re_extmatch_out = make_extmatch();
6439	// Loop over \z1, \z2, etc. There is no \z0.
6440	for (i = `1`; i < subs.synt.in_use; i++) {
6441	if (REG_MULTI) {
6442	struct multipos *mpos = &subs.synt.list.multi[i];
6443
6444	// Only accept single line matches that are valid.
6445	if (mpos->start_lnum >= `0`
6446	&& mpos->start_lnum == mpos->end_lnum
6447	&& mpos->end_col >= mpos->start_col) {
6448	re_extmatch_out->matches[i] =
6449	vim_strnsave(reg_getline(mpos->start_lnum) + mpos->start_col,
6450	mpos->end_col - mpos->start_col);
6451	}
6452	} else {
6453	struct linepos *lpos = &subs.synt.list.line[i];
6454
6455	if (lpos->start != NULL && lpos->end != NULL)
6456	re_extmatch_out->matches[i] =
6457	vim_strnsave(lpos->start,
6458	(int)(lpos->end - lpos->start));
6459	}
6460	}
6461	}
6462
6463	return `1` + reglnum;
6464	}
6465
6466	/// Match a regexp against a string ("line" points to the string) or multiple
6467	/// lines ("line" is NULL, use reg_getline()).
6468	///
6469	/// @param line String in which to search or NULL
6470	/// @param startcol Column to start looking for match
6471	/// @param tm Timeout limit or NULL
6472	/// @param timed_out Flag set on timeout or NULL
6473	///
6474	/// @return <= 0 if there is no match and number of lines contained in the
6475	/// match otherwise.
6476	static long nfa_regexec_both(char_u *line, colnr_T startcol,
6477	proftime_T tm, int* *timed_out)
6478	{
6479	nfa_regprog_T *prog;
6480	long retval = `0L`;
6481	int i;
6482	colnr_T col = startcol;
6483
6484	if (REG_MULTI) {
6485	prog = (nfa_regprog_T *)rex.reg_mmatch->regprog;
6486	line = reg_getline((linenr_T)`0`); // relative to the cursor
6487	rex.reg_startpos = rex.reg_mmatch->startpos;
6488	rex.reg_endpos = rex.reg_mmatch->endpos;
6489	} else {
6490	prog = (nfa_regprog_T *)rex.reg_match->regprog;
6491	rex.reg_startp = rex.reg_match->startp;
6492	rex.reg_endp = rex.reg_match->endp;
6493	}
6494
6495	/ Be paranoid... /
6496	if (prog == NULL \|\| line == NULL) {
6497	EMSG(_(e_null));
6498	goto theend;
6499	}
6500
6501	// If pattern contains "\c" or "\C": overrule value of rex.reg_ic
6502	if (prog->regflags & RF_ICASE) {
6503	rex.reg_ic = true;
6504	} else if (prog->regflags & RF_NOICASE) {
6505	rex.reg_ic = false;
6506	}
6507
6508	// If pattern contains "\Z" overrule value of rex.reg_icombine
6509	if (prog->regflags & RF_ICOMBINE) {
6510	rex.reg_icombine = true;
6511	}
6512
6513	regline = line;
6514	reglnum = `0`; / relative to line /
6515
6516	nfa_has_zend = prog->has_zend;
6517	nfa_has_backref = prog->has_backref;
6518	nfa_nsubexpr = prog->nsubexp;
6519	nfa_listid = `1`;
6520	nfa_alt_listid = `2`;
6521	nfa_regengine.expr = prog->pattern;
6522
6523	if (prog->reganch && col > `0`)
6524	return `0L`;
6525
6526	need_clear_subexpr = TRUE;
6527	/ Clear the external match subpointers if necessary. /
6528	if (prog->reghasz == REX_SET) {
6529	nfa_has_zsubexpr = TRUE;
6530	need_clear_zsubexpr = TRUE;
6531	} else
6532	nfa_has_zsubexpr = FALSE;
6533
6534	if (prog->regstart != NUL) {
6535	/ Skip ahead until a character we know the match must start with.*
6536	* When there is none there is no match. */
6537	if (skip_to_start(prog->regstart, &col) == FAIL)
6538	return `0L`;
6539
6540	// If match_text is set it contains the full text that must match.
6541	// Nothing else to try. Doesn't handle combining chars well.
6542	if (prog->match_text != NULL && !rex.reg_icombine) {
6543	return find_match_text(col, prog->regstart, prog->match_text);
6544	}
6545	}
6546
6547	// If the start column is past the maximum column: no need to try.
6548	if (rex.reg_maxcol > `0` && col >= rex.reg_maxcol) {
6549	goto theend;
6550	}
6551
6552	nstate = prog->nstate;
6553	for (i = `0`; i < nstate; ++i) {
6554	prog->state[i].id = i;
6555	prog->state[i].lastlist[`0`] = `0`;
6556	prog->state[i].lastlist[`1`] = `0`;
6557	}
6558
6559	retval = nfa_regtry(prog, col, tm, timed_out);
6560
6561	nfa_regengine.expr = NULL;
6562
6563	theend:
6564	return retval;
6565	}
6566
6567	/*
6568	* Compile a regular expression into internal code for the NFA matcher.
6569	* Returns the program in allocated space. Returns NULL for an error.
6570	*/
6571	static regprog_T nfa_regcomp(char_u expr, int re_flags)
6572	{
6573	nfa_regprog_T *prog = NULL;
6574	int *postfix;
6575
6576	if (expr == NULL)
6577	return NULL;
6578
6579	nfa_regengine.expr = expr;
6580	nfa_re_flags = re_flags;
6581
6582	init_class_tab();
6583
6584	nfa_regcomp_start(expr, re_flags);
6585
6586	// Build postfix form of the regexp. Needed to build the NFA
6587	// (and count its size).
6588	postfix = re2post();
6589	if (postfix == NULL) {
6590	goto fail; // Cascaded (syntax?) error
6591	}
6592
6593	/*
6594	* In order to build the NFA, we parse the input regexp twice:
6595	* 1. first pass to count size (so we can allocate space)
6596	* 2. second to emit code
6597	*/
6598	#ifdef REGEXP_DEBUG
6599	{
6600	FILE *f = fopen(NFA_REGEXP_RUN_LOG, "a");
6601
6602	if (f != NULL) {
6603	fprintf(f,
6604	"\n*****************************\n\n\n\n\t"
6605	"Compiling regexp \"%s\"... hold on !\n",
6606	expr);
6607	fclose(f);
6608	}
6609	}
6610	#endif
6611
6612	/*
6613	* PASS 1
6614	* Count number of NFA states in "nstate". Do not build the NFA.
6615	*/
6616	post2nfa(postfix, post_ptr, TRUE);
6617
6618	/ allocate the regprog with space for the compiled regexp /
6619	size_t prog_size = sizeof(nfa_regprog_T) + sizeof(nfa_state_T) * (nstate - `1`);
6620	prog = xmalloc(prog_size);
6621	state_ptr = prog->state;
6622
6623	/*
6624	* PASS 2
6625	* Build the NFA
6626	*/
6627	prog->start = post2nfa(postfix, post_ptr, FALSE);
6628	if (prog->start == NULL)
6629	goto fail;
6630
6631	prog->regflags = regflags;
6632	prog->engine = &nfa_regengine;
6633	prog->nstate = nstate;
6634	prog->has_zend = nfa_has_zend;
6635	prog->has_backref = nfa_has_backref;
6636	prog->nsubexp = regnpar;
6637
6638	nfa_postprocess(prog);
6639
6640	prog->reganch = nfa_get_reganch(prog->start, `0`);
6641	prog->regstart = nfa_get_regstart(prog->start, `0`);
6642	prog->match_text = nfa_get_match_text(prog->start);
6643
6644	#ifdef REGEXP_DEBUG
6645	nfa_postfix_dump(expr, OK);
6646	nfa_dump(prog);
6647	#endif
6648	/ Remember whether this pattern has any \z specials in it. /
6649	prog->reghasz = re_has_z;
6650	prog->pattern = vim_strsave(expr);
6651	nfa_regengine.expr = NULL;
6652
6653	out:
6654	xfree(post_start);
6655	post_start = post_ptr = post_end = NULL;
6656	state_ptr = NULL;
6657	return (regprog_T *)prog;
6658
6659	fail:
6660	XFREE_CLEAR(prog);
6661	#ifdef REGEXP_DEBUG
6662	nfa_postfix_dump(expr, FAIL);
6663	#endif
6664	nfa_regengine.expr = NULL;
6665	goto out;
6666	}
6667
6668	/*
6669	* Free a compiled regexp program, returned by nfa_regcomp().
6670	*/
6671	static void nfa_regfree(regprog_T *prog)
6672	{
6673	if (prog != NULL) {
6674	xfree(((nfa_regprog_T *)prog)->match_text);
6675	xfree(((nfa_regprog_T *)prog)->pattern);
6676	xfree(prog);
6677	}
6678	}
6679
6680	/*
6681	* Match a regexp against a string.
6682	* "rmp->regprog" is a compiled regexp as returned by nfa_regcomp().
6683	* Uses curbuf for line count and 'iskeyword'.
6684	* If "line_lbr" is true, consider a "\n" in "line" to be a line break.
6685	*
6686	* Returns <= 0 for failure, number of lines contained in the match otherwise.
6687	*/
6688	static int
6689	nfa_regexec_nl (
6690	regmatch_T *rmp,
6691	char_u line, /* string to match against /
6692	colnr_T col, / column to start looking for match /
6693	bool line_lbr
6694	)
6695	{
6696	rex.reg_match = rmp;
6697	rex.reg_mmatch = NULL;
6698	rex.reg_maxline = `0`;
6699	rex.reg_line_lbr = line_lbr;
6700	rex.reg_buf = curbuf;
6701	rex.reg_win = NULL;
6702	rex.reg_ic = rmp->rm_ic;
6703	rex.reg_icombine = false;
6704	rex.reg_maxcol = `0`;
6705	return nfa_regexec_both(line, col, NULL, NULL);
6706	}
6707
6708	/// Matches a regexp against multiple lines.
6709	/// "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
6710	/// Uses curbuf for line count and 'iskeyword'.
6711	///
6712	/// @param win Window in which to search or NULL
6713	/// @param buf Buffer in which to search
6714	/// @param lnum Number of line to start looking for match
6715	/// @param col Column to start looking for match
6716	/// @param tm Timeout limit or NULL
6717	/// @param timed_out Flag set on timeout or NULL
6718	///
6719	/// @return <= 0 if there is no match and number of lines contained in the match
6720	/// otherwise.
6721	///
6722	/// @note The body is the same as bt_regexec() except for nfa_regexec_both()
6723	///
6724	/// @warning
6725	/// Match may actually be in another line. e.g.:
6726	/// when r.e. is \nc, cursor is at 'a' and the text buffer looks like
6727	///
6728	/// @par
6729	///
6730	/// +-------------------------+
6731	/// \|a \|
6732	/// \|b \|
6733	/// \|c \|
6734	/// \| \|
6735	/// +-------------------------+
6736	///
6737	/// @par
6738	/// then nfa_regexec_multi() returns 3. while the original vim_regexec_multi()
6739	/// returns 0 and a second call at line 2 will return 2.
6740	///
6741	/// @par
6742	/// FIXME if this behavior is not compatible.
6743	static long nfa_regexec_multi(regmmatch_T rmp, win_T win, buf_T *buf,
6744	linenr_T lnum, colnr_T col,
6745	proftime_T tm, int* *timed_out)
6746	{
6747	rex.reg_match = NULL;
6748	rex.reg_mmatch = rmp;
6749	rex.reg_buf = buf;
6750	rex.reg_win = win;
6751	rex.reg_firstlnum = lnum;
6752	rex.reg_maxline = rex.reg_buf->b_ml.ml_line_count - lnum;
6753	rex.reg_line_lbr = false;
6754	rex.reg_ic = rmp->rmm_ic;
6755	rex.reg_icombine = false;
6756	rex.reg_maxcol = rmp->rmm_maxcol;
6757
6758	return nfa_regexec_both(NULL, col, tm, timed_out);
6759	}
6760

Browse the source code of neovim/src/nvim/regexp_nfa.c