1 | // This is an open source non-commercial project. Dear PVS-Studio, please check |
2 | // it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com |
3 | |
4 | /* |
5 | * NFA regular expression implementation. |
6 | * |
7 | * This file is included in "regexp.c". |
8 | */ |
9 | |
10 | #include <assert.h> |
11 | #include <inttypes.h> |
12 | #include <stdbool.h> |
13 | #include <limits.h> |
14 | |
15 | #include "nvim/ascii.h" |
16 | #include "nvim/garray.h" |
17 | |
18 | /* |
19 | * Logging of NFA engine. |
20 | * |
21 | * The NFA engine can write four log files: |
22 | * - Error log: Contains NFA engine's fatal errors. |
23 | * - Dump log: Contains compiled NFA state machine's information. |
24 | * - Run log: Contains information of matching procedure. |
25 | * - Debug log: Contains detailed information of matching procedure. Can be |
26 | * disabled by undefining NFA_REGEXP_DEBUG_LOG. |
27 | * The first one can also be used without debug mode. |
28 | * The last three are enabled when compiled as debug mode and individually |
29 | * disabled by commenting them out. |
30 | * The log files can get quite big! |
 * To disable all of this when compiling Vim for debugging, undefine
 * REGEXP_DEBUG in regexp.c.
33 | */ |
// File names of the NFA engine logs.  They are only defined when REGEXP_DEBUG
// is defined.
#ifdef REGEXP_DEBUG
// Error log: fatal errors of the NFA engine.
# define NFA_REGEXP_ERROR_LOG "nfa_regexp_error.log"
// Dump log: information about the compiled NFA state machine.
# define NFA_REGEXP_DUMP_LOG "nfa_regexp_dump.log"
// Run log: information about the matching procedure.
# define NFA_REGEXP_RUN_LOG "nfa_regexp_run.log"
// Debug log: detailed information about the matching procedure.
# define NFA_REGEXP_DEBUG_LOG "nfa_regexp_debug.log"
#endif
40 | |
// Added to a class code in the range NFA_ANY .. NFA_NUPPER_IC to get the
// variant of that class that also includes a NL.  The value equals the number
// of codes from NFA_ANY up to and including NFA_NUPPER_IC, see NFA_FIRST_NL
// and NFA_LAST_NL in the enum below.
#define NFA_ADD_NL 31
43 | |
// Codes for the items of the postfix form of a pattern and for the states of
// the NFA.  Numbering starts at -1024 and there are fewer than 1024 entries,
// so every code is negative.  A positive state value is treated as a literal
// character (see nfa_get_regstart() and nfa_get_match_text()).
//
// The order of the enumerators determines their values; do not reorder.
enum {
  NFA_SPLIT = -1024,  // both "out" and "out1" are followed, see
                      // nfa_get_reganch()
  NFA_MATCH,
  NFA_EMPTY,          // matches 0-length

  NFA_START_COLL,      // [abc] start
  NFA_END_COLL,        // [abc] end
  NFA_START_NEG_COLL,  // [^abc] start
  NFA_END_NEG_COLL,    // [^abc] end (postfix only)
  NFA_RANGE,           // range of the two previous items
                       // (postfix only)
  NFA_RANGE_MIN,       // low end of a range
  NFA_RANGE_MAX,       // high end of a range

  NFA_CONCAT,           // concatenate two previous items (postfix
                        // only)
  NFA_OR,               // \| (postfix only)
  NFA_STAR,             // greedy * (postfix only)
  NFA_STAR_NONGREEDY,   // non-greedy * (postfix only)
  NFA_QUEST,            // greedy \? (postfix only)
  NFA_QUEST_NONGREEDY,  // non-greedy \? (postfix only)

  NFA_BOL,     // ^ Begin line
  NFA_EOL,     // $ End line
  NFA_BOW,     // \< Begin word
  NFA_EOW,     // \> End word
  NFA_BOF,     // \%^ Begin file
  NFA_EOF,     // \%$ End file
  NFA_NEWL,
  NFA_ZSTART,  // Used for \zs
  NFA_ZEND,    // Used for \ze
  NFA_NOPEN,   // Start of subexpression marked with \%(
  NFA_NCLOSE,  // End of subexpr. marked with \%( ... \)
  NFA_START_INVISIBLE,
  NFA_START_INVISIBLE_FIRST,
  NFA_START_INVISIBLE_NEG,
  NFA_START_INVISIBLE_NEG_FIRST,
  NFA_START_INVISIBLE_BEFORE,
  NFA_START_INVISIBLE_BEFORE_FIRST,
  NFA_START_INVISIBLE_BEFORE_NEG,
  NFA_START_INVISIBLE_BEFORE_NEG_FIRST,
  NFA_START_PATTERN,
  NFA_END_INVISIBLE,
  NFA_END_INVISIBLE_NEG,
  NFA_END_PATTERN,
  NFA_COMPOSING,      // Next nodes in NFA are part of the
                      // composing multibyte char
  NFA_END_COMPOSING,  // End of a composing char in the NFA
  NFA_ANY_COMPOSING,  // \%C: Any composing characters.
  NFA_OPT_CHARS,      // \%[abc]

  // The following are used only in the postfix form, not in the NFA
  NFA_PREV_ATOM_NO_WIDTH,         // Used for \@=
  NFA_PREV_ATOM_NO_WIDTH_NEG,     // Used for \@!
  NFA_PREV_ATOM_JUST_BEFORE,      // Used for \@<=
  NFA_PREV_ATOM_JUST_BEFORE_NEG,  // Used for \@<!
  NFA_PREV_ATOM_LIKE_PATTERN,     // Used for \@>

  NFA_BACKREF1,  // \1
  NFA_BACKREF2,  // \2
  NFA_BACKREF3,  // \3
  NFA_BACKREF4,  // \4
  NFA_BACKREF5,  // \5
  NFA_BACKREF6,  // \6
  NFA_BACKREF7,  // \7
  NFA_BACKREF8,  // \8
  NFA_BACKREF9,  // \9
  NFA_ZREF1,     // \z1
  NFA_ZREF2,     // \z2
  NFA_ZREF3,     // \z3
  NFA_ZREF4,     // \z4
  NFA_ZREF5,     // \z5
  NFA_ZREF6,     // \z6
  NFA_ZREF7,     // \z7
  NFA_ZREF8,     // \z8
  NFA_ZREF9,     // \z9
  NFA_SKIP,      // Skip characters

  // NFA_MOPEN .. NFA_MOPEN9 are consecutive.  nfa_get_match_text() expects
  // the NFA to begin with NFA_MOPEN and to end with NFA_MCLOSE, NFA_MATCH.
  NFA_MOPEN,
  NFA_MOPEN1,
  NFA_MOPEN2,
  NFA_MOPEN3,
  NFA_MOPEN4,
  NFA_MOPEN5,
  NFA_MOPEN6,
  NFA_MOPEN7,
  NFA_MOPEN8,
  NFA_MOPEN9,

  NFA_MCLOSE,
  NFA_MCLOSE1,
  NFA_MCLOSE2,
  NFA_MCLOSE3,
  NFA_MCLOSE4,
  NFA_MCLOSE5,
  NFA_MCLOSE6,
  NFA_MCLOSE7,
  NFA_MCLOSE8,
  NFA_MCLOSE9,

  // NOTE(review): presumably the open/close codes for \z( ... \) groups;
  // confirm against the code that emits them.
  NFA_ZOPEN,
  NFA_ZOPEN1,
  NFA_ZOPEN2,
  NFA_ZOPEN3,
  NFA_ZOPEN4,
  NFA_ZOPEN5,
  NFA_ZOPEN6,
  NFA_ZOPEN7,
  NFA_ZOPEN8,
  NFA_ZOPEN9,

  NFA_ZCLOSE,
  NFA_ZCLOSE1,
  NFA_ZCLOSE2,
  NFA_ZCLOSE3,
  NFA_ZCLOSE4,
  NFA_ZCLOSE5,
  NFA_ZCLOSE6,
  NFA_ZCLOSE7,
  NFA_ZCLOSE8,
  NFA_ZCLOSE9,

  // Character classes.  There are exactly NFA_ADD_NL (31) codes from NFA_ANY
  // up to and including NFA_NUPPER_IC.
  /* NFA_FIRST_NL */
  NFA_ANY,        // Match any one character.
  NFA_IDENT,      // Match identifier char
  NFA_SIDENT,     // Match identifier char but no digit
  NFA_KWORD,      // Match keyword char
  NFA_SKWORD,     // Match word char but no digit
  NFA_FNAME,      // Match file name char
  NFA_SFNAME,     // Match file name char but no digit
  NFA_PRINT,      // Match printable char
  NFA_SPRINT,     // Match printable char but no digit
  NFA_WHITE,      // Match whitespace char
  NFA_NWHITE,     // Match non-whitespace char
  NFA_DIGIT,      // Match digit char
  NFA_NDIGIT,     // Match non-digit char
  NFA_HEX,        // Match hex char
  NFA_NHEX,       // Match non-hex char
  NFA_OCTAL,      // Match octal char
  NFA_NOCTAL,     // Match non-octal char
  NFA_WORD,       // Match word char
  NFA_NWORD,      // Match non-word char
  NFA_HEAD,       // Match head char
  NFA_NHEAD,      // Match non-head char
  NFA_ALPHA,      // Match alpha char
  NFA_NALPHA,     // Match non-alpha char
  NFA_LOWER,      // Match lowercase char
  NFA_NLOWER,     // Match non-lowercase char
  NFA_UPPER,      // Match uppercase char
  NFA_NUPPER,     // Match non-uppercase char
  NFA_LOWER_IC,   // Match [a-z]
  NFA_NLOWER_IC,  // Match [^a-z]
  NFA_UPPER_IC,   // Match [A-Z]
  NFA_NUPPER_IC,  // Match [^A-Z]

  // The class codes above shifted by NFA_ADD_NL.  Because NFA_ADD_NL equals
  // the number of class codes, this range starts directly after
  // NFA_NUPPER_IC, and the enumerators below continue after NFA_LAST_NL.
  NFA_FIRST_NL = NFA_ANY + NFA_ADD_NL,
  NFA_LAST_NL = NFA_NUPPER_IC + NFA_ADD_NL,

  NFA_CURSOR,   // Match cursor pos
  NFA_LNUM,     // Match line number
  NFA_LNUM_GT,  // Match > line number
  NFA_LNUM_LT,  // Match < line number
  NFA_COL,      // Match cursor column
  NFA_COL_GT,   // Match > cursor column
  NFA_COL_LT,   // Match < cursor column
  NFA_VCOL,     // Match cursor virtual column
  NFA_VCOL_GT,  // Match > cursor virtual column
  NFA_VCOL_LT,  // Match < cursor virtual column
  NFA_MARK,     // Match mark
  NFA_MARK_GT,  // Match > mark
  NFA_MARK_LT,  // Match < mark
  NFA_VISUAL,   // Match Visual area

  // Character classes [:alnum:] etc
  NFA_CLASS_ALNUM,
  NFA_CLASS_ALPHA,
  NFA_CLASS_BLANK,
  NFA_CLASS_CNTRL,
  NFA_CLASS_DIGIT,
  NFA_CLASS_GRAPH,
  NFA_CLASS_LOWER,
  NFA_CLASS_PRINT,
  NFA_CLASS_PUNCT,
  NFA_CLASS_SPACE,
  NFA_CLASS_UPPER,
  NFA_CLASS_XDIGIT,
  NFA_CLASS_TAB,
  NFA_CLASS_RETURN,
  NFA_CLASS_BACKSPACE,
  NFA_CLASS_ESCAPE
};
235 | |
// Class codes NFA_ANY .. NFA_NUPPER, in enum order.
// Keep in sync with classchars: the entries correspond by position.
static int nfa_classcodes[] = {
  NFA_ANY, NFA_IDENT, NFA_SIDENT, NFA_KWORD,NFA_SKWORD,
  NFA_FNAME, NFA_SFNAME, NFA_PRINT, NFA_SPRINT,
  NFA_WHITE, NFA_NWHITE, NFA_DIGIT, NFA_NDIGIT,
  NFA_HEX, NFA_NHEX, NFA_OCTAL, NFA_NOCTAL,
  NFA_WORD, NFA_NWORD, NFA_HEAD, NFA_NHEAD,
  NFA_ALPHA, NFA_NALPHA, NFA_LOWER, NFA_NLOWER,
  NFA_UPPER, NFA_NUPPER
};
246 | |
// Error messages of the NFA engine.

// E865: no format arguments.
static char_u e_nul_found[] = N_(
    "E865: (NFA) Regexp end encountered prematurely" );
// E866: takes one "%c" argument, the misplaced character.
static char_u e_misplaced[] = N_("E866: (NFA regexp) Misplaced %c" );
// E877: takes one argument formatted with PRId64, so pass a 64-bit signed
// integer.
static char_u e_ill_char_class[] = N_(
    "E877: (NFA regexp) Invalid character class: %" PRId64);
252 | |
// List of dangling "out" pointers.
//
// Since the out pointers in the list are always
// uninitialized, we use the pointers themselves
// as storage for the Ptrlists: the same memory is either the link to the
// next list entry ("next") or, once filled in, a pointer to a state ("s").
typedef union Ptrlist Ptrlist;
union Ptrlist {
  Ptrlist *next;
  nfa_state_T *s;
};
261 | |
// A fragment of the NFA: a start state plus a Ptrlist of "out" pointers.
// NOTE(review): presumably the listed pointers are the ones still to be
// connected to a following state; confirm against the code that builds the
// NFA from the postfix form.
struct Frag {
  nfa_state_T *start;
  Ptrlist *out;
};
typedef struct Frag Frag_T;
267 | |
// Start and end positions of up to NSUBEXP subexpressions.
typedef struct {
  int in_use;  // number of subexpr with useful info

  // When REG_MULTI is TRUE list.multi is used, otherwise list.line.
  union {
    // Positions as line number and column.
    struct multipos {
      linenr_T start_lnum;
      linenr_T end_lnum;
      colnr_T start_col;
      colnr_T end_col;
    } multi[NSUBEXP];
    // Positions as pointers into the text.
    struct linepos {
      char_u *start;
      char_u *end;
    } line[NSUBEXP];
  } list;
} regsub_T;
285 | |
// The two sets of submatches that are tracked together.
typedef struct {
  regsub_T norm;  // \( .. \) matches
  regsub_T synt;  // \z( .. \) matches
} regsubs_T;
290 | |
// nfa_pim_T stores a Postponed Invisible Match.
typedef struct nfa_pim_S nfa_pim_T;
struct nfa_pim_S {
  int result;          // NFA_PIM_*, see below
  nfa_state_T *state;  // the invisible match start state
  regsubs_T subs;      // submatch info, only partly used
  // NOTE(review): presumably "pos" is used for multi-line matching and "ptr"
  // otherwise, like the union in regsub_T; confirm.
  union {
    lpos_T pos;
    char_u *ptr;
  } end;               // where the match must end
};
302 | |
// nfa_thread_T contains execution information of a NFA state
typedef struct {
  nfa_state_T *state;
  int count;
  nfa_pim_T pim;   // if pim.result != NFA_PIM_UNUSED: postponed
                   // invisible match
  regsubs_T subs;  // submatch info, only partly used
} nfa_thread_T;
311 | |
// nfa_list_T contains the alternative NFA execution states.
typedef struct {
  nfa_thread_T *t;  // allocated array of states
  int n;            // nr of states currently in "t"
  int len;          // max nr of states in "t"
  int id;           // ID of the list
  int has_pim;      // TRUE when any state has a PIM
} nfa_list_T;
320 | |
// File-scope state shared by the NFA compiler and matcher.

/// re_flags passed to nfa_regcomp().
static int nfa_re_flags;

// NFA regexp \ze operator encountered.
static int nfa_has_zend;

// NFA regexp \1 .. \9 encountered.
static int nfa_has_backref;

// NFA regexp has \z( ), set zsubexpr.
static int nfa_has_zsubexpr;

// Number of sub expressions actually being used during execution. 1 if only
// the whole match (subexpr 0) is used.
static int nfa_nsubexpr;

// Buffer for the postfix form.  It is allocated in nfa_regcomp_start() and
// grown by realloc_post_list().
static int *post_start;  // holds the postfix form of r.e.
static int *post_end;    // one past the last allocated item
static int *post_ptr;    // where EMIT() stores the next item

static int nstate;  // Number of states in the NFA. Also used when
                    // executing.
static int istate;  // Index in the state vector, used in alloc_state()

// If not NULL match must end at this position
static save_se_T *nfa_endp = NULL;

// listid is global, so that it increases on recursive calls to
// nfa_regmatch(), which means we don't have to clear the lastlist field of
// all the states.
static int nfa_listid;
static int nfa_alt_listid;

// 0 for first call to nfa_regmatch(), 1 for recursive call.
static int nfa_ll_index = 0;
356 | |
357 | #ifdef INCLUDE_GENERATED_DECLARATIONS |
358 | # include "regexp_nfa.c.generated.h" |
359 | #endif |
360 | |
// Helper used when doing re2post() ... regatom() parsing.
//
// Stores "c" at "post_ptr" and advances "post_ptr".  When the postfix buffer
// is full (no room left before "post_end") it is grown with
// realloc_post_list() first.
#define EMIT(c) \
  do { \
    if (post_end <= post_ptr) { \
      realloc_post_list(); \
    } \
    *post_ptr++ = (c); \
  } while (0)
369 | |
370 | /* |
371 | * Initialize internal variables before NFA compilation. |
372 | */ |
373 | static void |
374 | nfa_regcomp_start ( |
375 | char_u *expr, |
376 | int re_flags /* see vim_regcomp() */ |
377 | ) |
378 | { |
379 | size_t postfix_size; |
380 | size_t nstate_max; |
381 | |
382 | nstate = 0; |
383 | istate = 0; |
384 | /* A reasonable estimation for maximum size */ |
385 | nstate_max = (STRLEN(expr) + 1) * 25; |
386 | |
387 | /* Some items blow up in size, such as [A-z]. Add more space for that. |
388 | * When it is still not enough realloc_post_list() will be used. */ |
389 | nstate_max += 1000; |
390 | |
391 | /* Size for postfix representation of expr. */ |
392 | postfix_size = sizeof(int) * nstate_max; |
393 | |
394 | post_start = (int *)xmalloc(postfix_size); |
395 | post_ptr = post_start; |
396 | post_end = post_start + nstate_max; |
397 | nfa_has_zend = FALSE; |
398 | nfa_has_backref = FALSE; |
399 | |
400 | /* shared with BT engine */ |
401 | regcomp_start(expr, re_flags); |
402 | } |
403 | |
404 | /* |
405 | * Figure out if the NFA state list starts with an anchor, must match at start |
406 | * of the line. |
407 | */ |
408 | static int nfa_get_reganch(nfa_state_T *start, int depth) |
409 | { |
410 | nfa_state_T *p = start; |
411 | |
412 | if (depth > 4) |
413 | return 0; |
414 | |
415 | while (p != NULL) { |
416 | switch (p->c) { |
417 | case NFA_BOL: |
418 | case NFA_BOF: |
419 | return 1; /* yes! */ |
420 | |
421 | case NFA_ZSTART: |
422 | case NFA_ZEND: |
423 | case NFA_CURSOR: |
424 | case NFA_VISUAL: |
425 | |
426 | case NFA_MOPEN: |
427 | case NFA_MOPEN1: |
428 | case NFA_MOPEN2: |
429 | case NFA_MOPEN3: |
430 | case NFA_MOPEN4: |
431 | case NFA_MOPEN5: |
432 | case NFA_MOPEN6: |
433 | case NFA_MOPEN7: |
434 | case NFA_MOPEN8: |
435 | case NFA_MOPEN9: |
436 | case NFA_NOPEN: |
437 | case NFA_ZOPEN: |
438 | case NFA_ZOPEN1: |
439 | case NFA_ZOPEN2: |
440 | case NFA_ZOPEN3: |
441 | case NFA_ZOPEN4: |
442 | case NFA_ZOPEN5: |
443 | case NFA_ZOPEN6: |
444 | case NFA_ZOPEN7: |
445 | case NFA_ZOPEN8: |
446 | case NFA_ZOPEN9: |
447 | p = p->out; |
448 | break; |
449 | |
450 | case NFA_SPLIT: |
451 | return nfa_get_reganch(p->out, depth + 1) |
452 | && nfa_get_reganch(p->out1, depth + 1); |
453 | |
454 | default: |
455 | return 0; /* noooo */ |
456 | } |
457 | } |
458 | return 0; |
459 | } |
460 | |
461 | /* |
462 | * Figure out if the NFA state list starts with a character which must match |
463 | * at start of the match. |
464 | */ |
465 | static int nfa_get_regstart(nfa_state_T *start, int depth) |
466 | { |
467 | nfa_state_T *p = start; |
468 | |
469 | if (depth > 4) |
470 | return 0; |
471 | |
472 | while (p != NULL) { |
473 | switch (p->c) { |
474 | /* all kinds of zero-width matches */ |
475 | case NFA_BOL: |
476 | case NFA_BOF: |
477 | case NFA_BOW: |
478 | case NFA_EOW: |
479 | case NFA_ZSTART: |
480 | case NFA_ZEND: |
481 | case NFA_CURSOR: |
482 | case NFA_VISUAL: |
483 | case NFA_LNUM: |
484 | case NFA_LNUM_GT: |
485 | case NFA_LNUM_LT: |
486 | case NFA_COL: |
487 | case NFA_COL_GT: |
488 | case NFA_COL_LT: |
489 | case NFA_VCOL: |
490 | case NFA_VCOL_GT: |
491 | case NFA_VCOL_LT: |
492 | case NFA_MARK: |
493 | case NFA_MARK_GT: |
494 | case NFA_MARK_LT: |
495 | |
496 | case NFA_MOPEN: |
497 | case NFA_MOPEN1: |
498 | case NFA_MOPEN2: |
499 | case NFA_MOPEN3: |
500 | case NFA_MOPEN4: |
501 | case NFA_MOPEN5: |
502 | case NFA_MOPEN6: |
503 | case NFA_MOPEN7: |
504 | case NFA_MOPEN8: |
505 | case NFA_MOPEN9: |
506 | case NFA_NOPEN: |
507 | case NFA_ZOPEN: |
508 | case NFA_ZOPEN1: |
509 | case NFA_ZOPEN2: |
510 | case NFA_ZOPEN3: |
511 | case NFA_ZOPEN4: |
512 | case NFA_ZOPEN5: |
513 | case NFA_ZOPEN6: |
514 | case NFA_ZOPEN7: |
515 | case NFA_ZOPEN8: |
516 | case NFA_ZOPEN9: |
517 | p = p->out; |
518 | break; |
519 | |
520 | case NFA_SPLIT: |
521 | { |
522 | int c1 = nfa_get_regstart(p->out, depth + 1); |
523 | int c2 = nfa_get_regstart(p->out1, depth + 1); |
524 | |
525 | if (c1 == c2) |
526 | return c1; /* yes! */ |
527 | return 0; |
528 | } |
529 | |
530 | default: |
531 | if (p->c > 0) |
532 | return p->c; /* yes! */ |
533 | return 0; |
534 | } |
535 | } |
536 | return 0; |
537 | } |
538 | |
539 | /* |
540 | * Figure out if the NFA state list contains just literal text and nothing |
541 | * else. If so return a string in allocated memory with what must match after |
542 | * regstart. Otherwise return NULL. |
543 | */ |
544 | static char_u *nfa_get_match_text(nfa_state_T *start) |
545 | { |
546 | nfa_state_T *p = start; |
547 | int len = 0; |
548 | char_u *ret; |
549 | char_u *s; |
550 | |
551 | if (p->c != NFA_MOPEN) |
552 | return NULL; /* just in case */ |
553 | p = p->out; |
554 | while (p->c > 0) { |
555 | len += MB_CHAR2LEN(p->c); |
556 | p = p->out; |
557 | } |
558 | if (p->c != NFA_MCLOSE || p->out->c != NFA_MATCH) |
559 | return NULL; |
560 | |
561 | ret = xmalloc(len); |
562 | p = start->out->out; /* skip first char, it goes into regstart */ |
563 | s = ret; |
564 | while (p->c > 0) { |
565 | s += utf_char2bytes(p->c, s); |
566 | p = p->out; |
567 | } |
568 | *s = NUL; |
569 | |
570 | return ret; |
571 | } |
572 | |
573 | /* |
574 | * Allocate more space for post_start. Called when |
575 | * running above the estimated number of states. |
576 | */ |
577 | static void realloc_post_list(void) |
578 | { |
579 | size_t new_max = (post_end - post_start) + 1000; |
580 | int *new_start = xrealloc(post_start, new_max * sizeof(int)); |
581 | post_ptr = new_start + (post_ptr - post_start); |
582 | post_end = new_start + new_max; |
583 | post_start = new_start; |
584 | } |
585 | |
586 | /* |
587 | * Search between "start" and "end" and try to recognize a |
588 | * character class in expanded form. For example [0-9]. |
589 | * On success, return the id the character class to be emitted. |
590 | * On failure, return 0 (=FAIL) |
591 | * Start points to the first char of the range, while end should point |
592 | * to the closing brace. |
593 | * Keep in mind that 'ignorecase' applies at execution time, thus [a-z] may |
594 | * need to be interpreted as [a-zA-Z]. |
595 | */ |
596 | static int nfa_recognize_char_class(char_u *start, char_u *end, int ) |
597 | { |
598 | # define CLASS_not 0x80 |
599 | # define CLASS_af 0x40 |
600 | # define CLASS_AF 0x20 |
601 | # define CLASS_az 0x10 |
602 | # define CLASS_AZ 0x08 |
603 | # define CLASS_o7 0x04 |
604 | # define CLASS_o9 0x02 |
605 | # define CLASS_underscore 0x01 |
606 | |
607 | int newl = FALSE; |
608 | char_u *p; |
609 | int config = 0; |
610 | |
611 | if (extra_newl == TRUE) |
612 | newl = TRUE; |
613 | |
614 | if (*end != ']') |
615 | return FAIL; |
616 | p = start; |
617 | if (*p == '^') { |
618 | config |= CLASS_not; |
619 | p++; |
620 | } |
621 | |
622 | while (p < end) { |
623 | if (p + 2 < end && *(p + 1) == '-') { |
624 | switch (*p) { |
625 | case '0': |
626 | if (*(p + 2) == '9') { |
627 | config |= CLASS_o9; |
628 | break; |
629 | } else if (*(p + 2) == '7') { |
630 | config |= CLASS_o7; |
631 | break; |
632 | } |
633 | return FAIL; |
634 | case 'a': |
635 | if (*(p + 2) == 'z') { |
636 | config |= CLASS_az; |
637 | break; |
638 | } else if (*(p + 2) == 'f') { |
639 | config |= CLASS_af; |
640 | break; |
641 | } |
642 | return FAIL; |
643 | case 'A': |
644 | if (*(p + 2) == 'Z') { |
645 | config |= CLASS_AZ; |
646 | break; |
647 | } else if (*(p + 2) == 'F') { |
648 | config |= CLASS_AF; |
649 | break; |
650 | } |
651 | return FAIL; |
652 | default: |
653 | return FAIL; |
654 | } |
655 | p += 3; |
656 | } else if (p + 1 < end && *p == '\\' && *(p + 1) == 'n') { |
657 | newl = TRUE; |
658 | p += 2; |
659 | } else if (*p == '_') { |
660 | config |= CLASS_underscore; |
661 | p++; |
662 | } else if (*p == '\n') { |
663 | newl = TRUE; |
664 | p++; |
665 | } else |
666 | return FAIL; |
667 | } /* while (p < end) */ |
668 | |
669 | if (p != end) |
670 | return FAIL; |
671 | |
672 | if (newl == TRUE) |
673 | extra_newl = NFA_ADD_NL; |
674 | |
675 | switch (config) { |
676 | case CLASS_o9: |
677 | return extra_newl + NFA_DIGIT; |
678 | case CLASS_not | CLASS_o9: |
679 | return extra_newl + NFA_NDIGIT; |
680 | case CLASS_af | CLASS_AF | CLASS_o9: |
681 | return extra_newl + NFA_HEX; |
682 | case CLASS_not | CLASS_af | CLASS_AF | CLASS_o9: |
683 | return extra_newl + NFA_NHEX; |
684 | case CLASS_o7: |
685 | return extra_newl + NFA_OCTAL; |
686 | case CLASS_not | CLASS_o7: |
687 | return extra_newl + NFA_NOCTAL; |
688 | case CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore: |
689 | return extra_newl + NFA_WORD; |
690 | case CLASS_not | CLASS_az | CLASS_AZ | CLASS_o9 | CLASS_underscore: |
691 | return extra_newl + NFA_NWORD; |
692 | case CLASS_az | CLASS_AZ | CLASS_underscore: |
693 | return extra_newl + NFA_HEAD; |
694 | case CLASS_not | CLASS_az | CLASS_AZ | CLASS_underscore: |
695 | return extra_newl + NFA_NHEAD; |
696 | case CLASS_az | CLASS_AZ: |
697 | return extra_newl + NFA_ALPHA; |
698 | case CLASS_not | CLASS_az | CLASS_AZ: |
699 | return extra_newl + NFA_NALPHA; |
700 | case CLASS_az: |
701 | return extra_newl + NFA_LOWER_IC; |
702 | case CLASS_not | CLASS_az: |
703 | return extra_newl + NFA_NLOWER_IC; |
704 | case CLASS_AZ: |
705 | return extra_newl + NFA_UPPER_IC; |
706 | case CLASS_not | CLASS_AZ: |
707 | return extra_newl + NFA_NUPPER_IC; |
708 | } |
709 | return FAIL; |
710 | } |
711 | |
712 | /* |
713 | * Produce the bytes for equivalence class "c". |
714 | * Currently only handles latin1, latin9 and utf-8. |
715 | * Emits bytes in postfix notation: 'a,b,NFA_OR,c,NFA_OR' is |
716 | * equivalent to 'a OR b OR c' |
717 | * |
718 | * NOTE! When changing this function, also update reg_equi_class() |
719 | */ |
720 | static void nfa_emit_equi_class(int c) |
721 | { |
722 | #define EMIT2(c) EMIT(c); EMIT(NFA_CONCAT); |
723 | #define EMITMBC(c) EMIT(c); EMIT(NFA_CONCAT); |
724 | |
725 | if (enc_utf8 || STRCMP(p_enc, "latin1" ) == 0 |
726 | || STRCMP(p_enc, "iso-8859-15" ) == 0) { |
727 | #define A_grave 0xc0 |
728 | #define A_acute 0xc1 |
729 | #define A_circumflex 0xc2 |
730 | #define A_virguilla 0xc3 |
731 | #define A_diaeresis 0xc4 |
732 | #define A_ring 0xc5 |
733 | #define C_cedilla 0xc7 |
734 | #define E_grave 0xc8 |
735 | #define E_acute 0xc9 |
736 | #define E_circumflex 0xca |
737 | #define E_diaeresis 0xcb |
738 | #define I_grave 0xcc |
739 | #define I_acute 0xcd |
740 | #define I_circumflex 0xce |
741 | #define I_diaeresis 0xcf |
742 | #define N_virguilla 0xd1 |
743 | #define O_grave 0xd2 |
744 | #define O_acute 0xd3 |
745 | #define O_circumflex 0xd4 |
746 | #define O_virguilla 0xd5 |
747 | #define O_diaeresis 0xd6 |
748 | #define O_slash 0xd8 |
749 | #define U_grave 0xd9 |
750 | #define U_acute 0xda |
751 | #define U_circumflex 0xdb |
752 | #define U_diaeresis 0xdc |
753 | #define Y_acute 0xdd |
754 | #define a_grave 0xe0 |
755 | #define a_acute 0xe1 |
756 | #define a_circumflex 0xe2 |
757 | #define a_virguilla 0xe3 |
758 | #define a_diaeresis 0xe4 |
759 | #define a_ring 0xe5 |
760 | #define c_cedilla 0xe7 |
761 | #define e_grave 0xe8 |
762 | #define e_acute 0xe9 |
763 | #define e_circumflex 0xea |
764 | #define e_diaeresis 0xeb |
765 | #define i_grave 0xec |
766 | #define i_acute 0xed |
767 | #define i_circumflex 0xee |
768 | #define i_diaeresis 0xef |
769 | #define n_virguilla 0xf1 |
770 | #define o_grave 0xf2 |
771 | #define o_acute 0xf3 |
772 | #define o_circumflex 0xf4 |
773 | #define o_virguilla 0xf5 |
774 | #define o_diaeresis 0xf6 |
775 | #define o_slash 0xf8 |
776 | #define u_grave 0xf9 |
777 | #define u_acute 0xfa |
778 | #define u_circumflex 0xfb |
779 | #define u_diaeresis 0xfc |
780 | #define y_acute 0xfd |
781 | #define y_diaeresis 0xff |
782 | switch (c) { |
783 | case 'A': case A_grave: case A_acute: case A_circumflex: |
784 | case A_virguilla: case A_diaeresis: case A_ring: |
785 | CASEMBC(0x100) CASEMBC(0x102) CASEMBC(0x104) |
786 | CASEMBC(0x1cd) CASEMBC(0x1de) CASEMBC(0x1e0) |
787 | CASEMBC(0x1ea2) |
788 | EMIT2('A'); EMIT2(A_grave); EMIT2(A_acute); |
789 | EMIT2(A_circumflex); EMIT2(A_virguilla); |
790 | EMIT2(A_diaeresis); EMIT2(A_ring); |
791 | EMITMBC(0x100) EMITMBC(0x102) EMITMBC(0x104) |
792 | EMITMBC(0x1cd) EMITMBC(0x1de) EMITMBC(0x1e0) |
793 | EMITMBC(0x1ea2) |
794 | return; |
795 | |
796 | case 'B': CASEMBC(0x1e02) CASEMBC(0x1e06) |
797 | EMIT2('B'); EMITMBC(0x1e02) EMITMBC(0x1e06) |
798 | return; |
799 | |
800 | case 'C': case C_cedilla: CASEMBC(0x106) CASEMBC(0x108) CASEMBC(0x10a) |
801 | CASEMBC(0x10c) |
802 | EMIT2('C'); EMIT2(C_cedilla); EMITMBC(0x106) EMITMBC(0x108) |
803 | EMITMBC(0x10a) EMITMBC(0x10c) |
804 | return; |
805 | |
806 | case 'D': CASEMBC(0x10e) CASEMBC(0x110) CASEMBC(0x1e0a) |
807 | CASEMBC(0x1e0e) CASEMBC(0x1e10) |
808 | EMIT2('D'); EMITMBC(0x10e) EMITMBC(0x110) EMITMBC(0x1e0a) |
809 | EMITMBC(0x1e0e) EMITMBC(0x1e10) |
810 | return; |
811 | |
812 | case 'E': case E_grave: case E_acute: case E_circumflex: |
813 | case E_diaeresis: CASEMBC(0x112) CASEMBC(0x114) |
814 | CASEMBC(0x116) CASEMBC(0x118) CASEMBC(0x11a) |
815 | CASEMBC(0x1eba) CASEMBC(0x1ebc) |
816 | EMIT2('E'); EMIT2(E_grave); EMIT2(E_acute); |
817 | EMIT2(E_circumflex); EMIT2(E_diaeresis); |
818 | EMITMBC(0x112) EMITMBC(0x114) EMITMBC(0x116) |
819 | EMITMBC(0x118) EMITMBC(0x11a) EMITMBC(0x1eba) |
820 | EMITMBC(0x1ebc) |
821 | return; |
822 | |
823 | case 'F': CASEMBC(0x1e1e) |
824 | EMIT2('F'); EMITMBC(0x1e1e) |
825 | return; |
826 | |
827 | case 'G': CASEMBC(0x11c) CASEMBC(0x11e) CASEMBC(0x120) |
828 | CASEMBC(0x122) CASEMBC(0x1e4) CASEMBC(0x1e6) |
829 | CASEMBC(0x1f4) CASEMBC(0x1e20) |
830 | EMIT2('G'); EMITMBC(0x11c) EMITMBC(0x11e) EMITMBC(0x120) |
831 | EMITMBC(0x122) EMITMBC(0x1e4) EMITMBC(0x1e6) |
832 | EMITMBC(0x1f4) EMITMBC(0x1e20) |
833 | return; |
834 | |
835 | case 'H': CASEMBC(0x124) CASEMBC(0x126) CASEMBC(0x1e22) |
836 | CASEMBC(0x1e26) CASEMBC(0x1e28) |
837 | EMIT2('H'); EMITMBC(0x124) EMITMBC(0x126) EMITMBC(0x1e22) |
838 | EMITMBC(0x1e26) EMITMBC(0x1e28) |
839 | return; |
840 | |
841 | case 'I': case I_grave: case I_acute: case I_circumflex: |
842 | case I_diaeresis: CASEMBC(0x128) CASEMBC(0x12a) |
843 | CASEMBC(0x12c) CASEMBC(0x12e) CASEMBC(0x130) |
844 | CASEMBC(0x1cf) CASEMBC(0x1ec8) |
845 | EMIT2('I'); EMIT2(I_grave); EMIT2(I_acute); |
846 | EMIT2(I_circumflex); EMIT2(I_diaeresis); |
847 | EMITMBC(0x128) EMITMBC(0x12a) |
848 | EMITMBC(0x12c) EMITMBC(0x12e) EMITMBC(0x130) |
849 | EMITMBC(0x1cf) EMITMBC(0x1ec8) |
850 | return; |
851 | |
852 | case 'J': CASEMBC(0x134) |
853 | EMIT2('J'); EMITMBC(0x134) |
854 | return; |
855 | |
856 | case 'K': CASEMBC(0x136) CASEMBC(0x1e8) CASEMBC(0x1e30) |
857 | CASEMBC(0x1e34) |
858 | EMIT2('K'); EMITMBC(0x136) EMITMBC(0x1e8) EMITMBC(0x1e30) |
859 | EMITMBC(0x1e34) |
860 | return; |
861 | |
862 | case 'L': CASEMBC(0x139) CASEMBC(0x13b) CASEMBC(0x13d) |
863 | CASEMBC(0x13f) CASEMBC(0x141) CASEMBC(0x1e3a) |
864 | EMIT2('L'); EMITMBC(0x139) EMITMBC(0x13b) EMITMBC(0x13d) |
865 | EMITMBC(0x13f) EMITMBC(0x141) EMITMBC(0x1e3a) |
866 | return; |
867 | |
868 | case 'M': CASEMBC(0x1e3e) CASEMBC(0x1e40) |
869 | EMIT2('M'); EMITMBC(0x1e3e) EMITMBC(0x1e40) |
870 | return; |
871 | |
872 | case 'N': case N_virguilla: CASEMBC(0x143) CASEMBC(0x145) |
873 | CASEMBC(0x147) CASEMBC(0x1e44) CASEMBC(0x1e48) |
874 | EMIT2('N'); EMIT2(N_virguilla); |
875 | EMITMBC(0x143) EMITMBC(0x145) |
876 | EMITMBC(0x147) EMITMBC(0x1e44) EMITMBC(0x1e48) |
877 | return; |
878 | |
879 | case 'O': case O_grave: case O_acute: case O_circumflex: |
880 | case O_virguilla: case O_diaeresis: case O_slash: |
881 | CASEMBC(0x14c) CASEMBC(0x14e) CASEMBC(0x150) |
882 | CASEMBC(0x1a0) CASEMBC(0x1d1) CASEMBC(0x1ea) |
883 | CASEMBC(0x1ec) CASEMBC(0x1ece) |
884 | EMIT2('O'); EMIT2(O_grave); EMIT2(O_acute); |
885 | EMIT2(O_circumflex); EMIT2(O_virguilla); |
886 | EMIT2(O_diaeresis); EMIT2(O_slash); |
887 | EMITMBC(0x14c) EMITMBC(0x14e) EMITMBC(0x150) |
888 | EMITMBC(0x1a0) EMITMBC(0x1d1) EMITMBC(0x1ea) |
889 | EMITMBC(0x1ec) EMITMBC(0x1ece) |
890 | return; |
891 | |
892 | case 'P': case 0x1e54: case 0x1e56: |
893 | EMIT2('P'); EMITMBC(0x1e54) EMITMBC(0x1e56) |
894 | return; |
895 | |
896 | case 'R': CASEMBC(0x154) CASEMBC(0x156) CASEMBC(0x158) |
897 | CASEMBC(0x1e58) CASEMBC(0x1e5e) |
898 | EMIT2('R'); EMITMBC(0x154) EMITMBC(0x156) EMITMBC(0x158) |
899 | EMITMBC(0x1e58) EMITMBC(0x1e5e) |
900 | return; |
901 | |
902 | case 'S': CASEMBC(0x15a) CASEMBC(0x15c) CASEMBC(0x15e) |
903 | CASEMBC(0x160) CASEMBC(0x1e60) |
904 | EMIT2('S'); EMITMBC(0x15a) EMITMBC(0x15c) EMITMBC(0x15e) |
905 | EMITMBC(0x160) EMITMBC(0x1e60) |
906 | return; |
907 | |
908 | case 'T': CASEMBC(0x162) CASEMBC(0x164) CASEMBC(0x166) |
909 | CASEMBC(0x1e6a) CASEMBC(0x1e6e) |
910 | EMIT2('T'); EMITMBC(0x162) EMITMBC(0x164) EMITMBC(0x166) |
911 | EMITMBC(0x1e6a) EMITMBC(0x1e6e) |
912 | return; |
913 | |
914 | case 'U': case U_grave: case U_acute: case U_diaeresis: |
915 | case U_circumflex: CASEMBC(0x168) CASEMBC(0x16a) |
916 | CASEMBC(0x16c) CASEMBC(0x16e) CASEMBC(0x170) |
917 | CASEMBC(0x172) CASEMBC(0x1af) CASEMBC(0x1d3) |
918 | CASEMBC(0x1ee6) |
919 | EMIT2('U'); EMIT2(U_grave); EMIT2(U_acute); |
920 | EMIT2(U_diaeresis); EMIT2(U_circumflex); |
921 | EMITMBC(0x168) EMITMBC(0x16a) |
922 | EMITMBC(0x16c) EMITMBC(0x16e) EMITMBC(0x170) |
923 | EMITMBC(0x172) EMITMBC(0x1af) EMITMBC(0x1d3) |
924 | EMITMBC(0x1ee6) |
925 | return; |
926 | |
927 | case 'V': CASEMBC(0x1e7c) |
928 | EMIT2('V'); EMITMBC(0x1e7c) |
929 | return; |
930 | |
931 | case 'W': CASEMBC(0x174) CASEMBC(0x1e80) CASEMBC(0x1e82) |
932 | CASEMBC(0x1e84) CASEMBC(0x1e86) |
933 | EMIT2('W'); EMITMBC(0x174) EMITMBC(0x1e80) EMITMBC(0x1e82) |
934 | EMITMBC(0x1e84) EMITMBC(0x1e86) |
935 | return; |
936 | |
937 | case 'X': CASEMBC(0x1e8a) CASEMBC(0x1e8c) |
938 | EMIT2('X'); EMITMBC(0x1e8a) EMITMBC(0x1e8c) |
939 | return; |
940 | |
941 | case 'Y': case Y_acute: CASEMBC(0x176) CASEMBC(0x178) |
942 | CASEMBC(0x1e8e) CASEMBC(0x1ef2) CASEMBC(0x1ef6) |
943 | CASEMBC(0x1ef8) |
944 | EMIT2('Y'); EMIT2(Y_acute); |
945 | EMITMBC(0x176) EMITMBC(0x178) |
946 | EMITMBC(0x1e8e) EMITMBC(0x1ef2) EMITMBC(0x1ef6) |
947 | EMITMBC(0x1ef8) |
948 | return; |
949 | |
950 | case 'Z': CASEMBC(0x179) CASEMBC(0x17b) CASEMBC(0x17d) |
951 | CASEMBC(0x1b5) CASEMBC(0x1e90) CASEMBC(0x1e94) |
952 | EMIT2('Z'); EMITMBC(0x179) EMITMBC(0x17b) EMITMBC(0x17d) |
953 | EMITMBC(0x1b5) EMITMBC(0x1e90) EMITMBC(0x1e94) |
954 | return; |
955 | |
956 | case 'a': case a_grave: case a_acute: case a_circumflex: |
957 | case a_virguilla: case a_diaeresis: case a_ring: |
958 | CASEMBC(0x101) CASEMBC(0x103) CASEMBC(0x105) |
959 | CASEMBC(0x1ce) CASEMBC(0x1df) CASEMBC(0x1e1) |
960 | CASEMBC(0x1ea3) |
961 | EMIT2('a'); EMIT2(a_grave); EMIT2(a_acute); |
962 | EMIT2(a_circumflex); EMIT2(a_virguilla); |
963 | EMIT2(a_diaeresis); EMIT2(a_ring); |
964 | EMITMBC(0x101) EMITMBC(0x103) EMITMBC(0x105) |
965 | EMITMBC(0x1ce) EMITMBC(0x1df) EMITMBC(0x1e1) |
966 | EMITMBC(0x1ea3) |
967 | return; |
968 | |
969 | case 'b': CASEMBC(0x1e03) CASEMBC(0x1e07) |
970 | EMIT2('b'); EMITMBC(0x1e03) EMITMBC(0x1e07) |
971 | return; |
972 | |
973 | case 'c': case c_cedilla: CASEMBC(0x107) CASEMBC(0x109) |
974 | CASEMBC(0x10b) CASEMBC(0x10d) |
975 | EMIT2('c'); EMIT2(c_cedilla); |
976 | EMITMBC(0x107) EMITMBC(0x109) |
977 | EMITMBC(0x10b) EMITMBC(0x10d) |
978 | return; |
979 | |
980 | case 'd': CASEMBC(0x10f) CASEMBC(0x111) CASEMBC(0x1e0b) |
981 | CASEMBC(0x1e0f) CASEMBC(0x1e11) |
982 | EMIT2('d'); EMITMBC(0x10f) EMITMBC(0x111) EMITMBC(0x1e0b) |
983 | EMITMBC(0x1e0f) EMITMBC(0x1e11) |
984 | return; |
985 | |
986 | case 'e': case e_grave: case e_acute: case e_circumflex: |
987 | case e_diaeresis: CASEMBC(0x113) CASEMBC(0x115) |
988 | CASEMBC(0x117) CASEMBC(0x119) CASEMBC(0x11b) |
989 | CASEMBC(0x1ebb) CASEMBC(0x1ebd) |
990 | EMIT2('e'); EMIT2(e_grave); EMIT2(e_acute); |
991 | EMIT2(e_circumflex); EMIT2(e_diaeresis); |
992 | EMITMBC(0x113) EMITMBC(0x115) |
993 | EMITMBC(0x117) EMITMBC(0x119) EMITMBC(0x11b) |
994 | EMITMBC(0x1ebb) EMITMBC(0x1ebd) |
995 | return; |
996 | |
997 | case 'f': CASEMBC(0x1e1f) |
998 | EMIT2('f'); EMITMBC(0x1e1f) |
999 | return; |
1000 | |
1001 | case 'g': CASEMBC(0x11d) CASEMBC(0x11f) CASEMBC(0x121) |
1002 | CASEMBC(0x123) CASEMBC(0x1e5) CASEMBC(0x1e7) |
1003 | CASEMBC(0x1f5) CASEMBC(0x1e21) |
1004 | EMIT2('g'); EMITMBC(0x11d) EMITMBC(0x11f) EMITMBC(0x121) |
1005 | EMITMBC(0x123) EMITMBC(0x1e5) EMITMBC(0x1e7) |
1006 | EMITMBC(0x1f5) EMITMBC(0x1e21) |
1007 | return; |
1008 | |
1009 | case 'h': CASEMBC(0x125) CASEMBC(0x127) CASEMBC(0x1e23) |
1010 | CASEMBC(0x1e27) CASEMBC(0x1e29) CASEMBC(0x1e96) |
1011 | EMIT2('h'); EMITMBC(0x125) EMITMBC(0x127) EMITMBC(0x1e23) |
1012 | EMITMBC(0x1e27) EMITMBC(0x1e29) EMITMBC(0x1e96) |
1013 | return; |
1014 | |
1015 | case 'i': case i_grave: case i_acute: case i_circumflex: |
1016 | case i_diaeresis: CASEMBC(0x129) CASEMBC(0x12b) |
1017 | CASEMBC(0x12d) CASEMBC(0x12f) CASEMBC(0x1d0) |
1018 | CASEMBC(0x1ec9) |
1019 | EMIT2('i'); EMIT2(i_grave); EMIT2(i_acute); |
1020 | EMIT2(i_circumflex); EMIT2(i_diaeresis); |
1021 | EMITMBC(0x129) EMITMBC(0x12b) |
1022 | EMITMBC(0x12d) EMITMBC(0x12f) EMITMBC(0x1d0) |
1023 | EMITMBC(0x1ec9) |
1024 | return; |
1025 | |
1026 | case 'j': CASEMBC(0x135) CASEMBC(0x1f0) |
1027 | EMIT2('j'); EMITMBC(0x135) EMITMBC(0x1f0) |
1028 | return; |
1029 | |
1030 | case 'k': CASEMBC(0x137) CASEMBC(0x1e9) CASEMBC(0x1e31) |
1031 | CASEMBC(0x1e35) |
1032 | EMIT2('k'); EMITMBC(0x137) EMITMBC(0x1e9) EMITMBC(0x1e31) |
1033 | EMITMBC(0x1e35) |
1034 | return; |
1035 | |
1036 | case 'l': CASEMBC(0x13a) CASEMBC(0x13c) CASEMBC(0x13e) |
1037 | CASEMBC(0x140) CASEMBC(0x142) CASEMBC(0x1e3b) |
1038 | EMIT2('l'); EMITMBC(0x13a) EMITMBC(0x13c) EMITMBC(0x13e) |
1039 | EMITMBC(0x140) EMITMBC(0x142) EMITMBC(0x1e3b) |
1040 | return; |
1041 | |
1042 | case 'm': CASEMBC(0x1e3f) CASEMBC(0x1e41) |
1043 | EMIT2('m'); EMITMBC(0x1e3f) EMITMBC(0x1e41) |
1044 | return; |
1045 | |
1046 | case 'n': case n_virguilla: CASEMBC(0x144) CASEMBC(0x146) |
1047 | CASEMBC(0x148) CASEMBC(0x149) CASEMBC(0x1e45) |
1048 | CASEMBC(0x1e49) |
1049 | EMIT2('n'); EMIT2(n_virguilla); |
1050 | EMITMBC(0x144) EMITMBC(0x146) |
1051 | EMITMBC(0x148) EMITMBC(0x149) EMITMBC(0x1e45) |
1052 | EMITMBC(0x1e49) |
1053 | return; |
1054 | |
1055 | case 'o': case o_grave: case o_acute: case o_circumflex: |
1056 | case o_virguilla: case o_diaeresis: case o_slash: |
1057 | CASEMBC(0x14d) CASEMBC(0x14f) CASEMBC(0x151) |
1058 | CASEMBC(0x1a1) CASEMBC(0x1d2) CASEMBC(0x1eb) |
1059 | CASEMBC(0x1ed) CASEMBC(0x1ecf) |
1060 | EMIT2('o'); EMIT2(o_grave); EMIT2(o_acute); |
1061 | EMIT2(o_circumflex); EMIT2(o_virguilla); |
1062 | EMIT2(o_diaeresis); EMIT2(o_slash); |
1063 | EMITMBC(0x14d) EMITMBC(0x14f) EMITMBC(0x151) |
1064 | EMITMBC(0x1a1) EMITMBC(0x1d2) EMITMBC(0x1eb) |
1065 | EMITMBC(0x1ed) EMITMBC(0x1ecf) |
1066 | return; |
1067 | |
1068 | case 'p': CASEMBC(0x1e55) CASEMBC(0x1e57) |
1069 | EMIT2('p'); EMITMBC(0x1e55) EMITMBC(0x1e57) |
1070 | return; |
1071 | |
1072 | case 'r': CASEMBC(0x155) CASEMBC(0x157) CASEMBC(0x159) |
1073 | CASEMBC(0x1e59) CASEMBC(0x1e5f) |
1074 | EMIT2('r'); EMITMBC(0x155) EMITMBC(0x157) EMITMBC(0x159) |
1075 | EMITMBC(0x1e59) EMITMBC(0x1e5f) |
1076 | return; |
1077 | |
1078 | case 's': CASEMBC(0x15b) CASEMBC(0x15d) CASEMBC(0x15f) |
1079 | CASEMBC(0x161) CASEMBC(0x1e61) |
1080 | EMIT2('s'); EMITMBC(0x15b) EMITMBC(0x15d) EMITMBC(0x15f) |
1081 | EMITMBC(0x161) EMITMBC(0x1e61) |
1082 | return; |
1083 | |
1084 | case 't': CASEMBC(0x163) CASEMBC(0x165) CASEMBC(0x167) |
1085 | CASEMBC(0x1e6b) CASEMBC(0x1e6f) CASEMBC(0x1e97) |
1086 | EMIT2('t'); EMITMBC(0x163) EMITMBC(0x165) EMITMBC(0x167) |
1087 | EMITMBC(0x1e6b) EMITMBC(0x1e6f) EMITMBC(0x1e97) |
1088 | return; |
1089 | |
1090 | case 'u': case u_grave: case u_acute: case u_circumflex: |
1091 | case u_diaeresis: CASEMBC(0x169) CASEMBC(0x16b) |
1092 | CASEMBC(0x16d) CASEMBC(0x16f) CASEMBC(0x171) |
1093 | CASEMBC(0x173) CASEMBC(0x1b0) CASEMBC(0x1d4) |
1094 | CASEMBC(0x1ee7) |
1095 | EMIT2('u'); EMIT2(u_grave); EMIT2(u_acute); |
1096 | EMIT2(u_circumflex); EMIT2(u_diaeresis); |
1097 | EMITMBC(0x169) EMITMBC(0x16b) |
1098 | EMITMBC(0x16d) EMITMBC(0x16f) EMITMBC(0x171) |
1099 | EMITMBC(0x173) EMITMBC(0x1b0) EMITMBC(0x1d4) |
1100 | EMITMBC(0x1ee7) |
1101 | return; |
1102 | |
1103 | case 'v': CASEMBC(0x1e7d) |
1104 | EMIT2('v'); EMITMBC(0x1e7d) |
1105 | return; |
1106 | |
1107 | case 'w': CASEMBC(0x175) CASEMBC(0x1e81) CASEMBC(0x1e83) |
1108 | CASEMBC(0x1e85) CASEMBC(0x1e87) CASEMBC(0x1e98) |
1109 | EMIT2('w'); EMITMBC(0x175) EMITMBC(0x1e81) EMITMBC(0x1e83) |
1110 | EMITMBC(0x1e85) EMITMBC(0x1e87) EMITMBC(0x1e98) |
1111 | return; |
1112 | |
1113 | case 'x': CASEMBC(0x1e8b) CASEMBC(0x1e8d) |
1114 | EMIT2('x'); EMITMBC(0x1e8b) EMITMBC(0x1e8d) |
1115 | return; |
1116 | |
1117 | case 'y': case y_acute: case y_diaeresis: CASEMBC(0x177) |
1118 | CASEMBC(0x1e8f) CASEMBC(0x1e99) CASEMBC(0x1ef3) |
1119 | CASEMBC(0x1ef7) CASEMBC(0x1ef9) |
1120 | EMIT2('y'); EMIT2(y_acute); EMIT2(y_diaeresis); |
1121 | EMITMBC(0x177) |
1122 | EMITMBC(0x1e8f) EMITMBC(0x1e99) EMITMBC(0x1ef3) |
1123 | EMITMBC(0x1ef7) EMITMBC(0x1ef9) |
1124 | return; |
1125 | |
1126 | case 'z': CASEMBC(0x17a) CASEMBC(0x17c) CASEMBC(0x17e) |
1127 | CASEMBC(0x1b6) CASEMBC(0x1e91) CASEMBC(0x1e95) |
1128 | EMIT2('z'); EMITMBC(0x17a) EMITMBC(0x17c) EMITMBC(0x17e) |
1129 | EMITMBC(0x1b6) EMITMBC(0x1e91) EMITMBC(0x1e95) |
1130 | return; |
1131 | |
1132 | /* default: character itself */ |
1133 | } |
1134 | } |
1135 | |
1136 | EMIT2(c); |
1137 | #undef EMIT2 |
1138 | #undef EMITMBC |
1139 | } |
1140 | |
1141 | /* |
1142 | * Code to parse regular expression. |
1143 | * |
1144 | * We try to reuse parsing functions in regexp.c to |
1145 | * minimize surprise and keep the syntax consistent. |
1146 | */ |
1147 | |
1148 | /* |
1149 | * Parse the lowest level. |
1150 | * |
1151 | * An atom can be one of a long list of items. Many atoms match one character |
1152 | * in the text. It is often an ordinary character or a character class. |
1153 | * Braces can be used to make a pattern into an atom. The "\z(\)" construct |
1154 | * is only for syntax highlighting. |
1155 | * |
1156 | * atom ::= ordinary-atom |
1157 | * or \( pattern \) |
1158 | * or \%( pattern \) |
1159 | * or \z( pattern \) |
1160 | */ |
1161 | static int nfa_regatom(void) |
1162 | { |
1163 | int c; |
1164 | int charclass; |
1165 | int equiclass; |
1166 | int collclass; |
1167 | int got_coll_char; |
1168 | char_u *p; |
1169 | char_u *endp; |
1170 | char_u *old_regparse = regparse; |
1171 | int = 0; |
1172 | int emit_range; |
1173 | int negated; |
1174 | int startc = -1; |
1175 | int endc = -1; |
1176 | int oldstartc = -1; |
1177 | int save_prev_at_start = prev_at_start; |
1178 | |
1179 | c = getchr(); |
1180 | switch (c) { |
1181 | case NUL: |
1182 | EMSG_RET_FAIL(_(e_nul_found)); |
1183 | |
1184 | case Magic('^'): |
1185 | EMIT(NFA_BOL); |
1186 | break; |
1187 | |
1188 | case Magic('$'): |
1189 | EMIT(NFA_EOL); |
1190 | had_eol = TRUE; |
1191 | break; |
1192 | |
1193 | case Magic('<'): |
1194 | EMIT(NFA_BOW); |
1195 | break; |
1196 | |
1197 | case Magic('>'): |
1198 | EMIT(NFA_EOW); |
1199 | break; |
1200 | |
1201 | case Magic('_'): |
1202 | c = no_Magic(getchr()); |
1203 | if (c == NUL) |
1204 | EMSG_RET_FAIL(_(e_nul_found)); |
1205 | |
1206 | if (c == '^') { /* "\_^" is start-of-line */ |
1207 | EMIT(NFA_BOL); |
1208 | break; |
1209 | } |
1210 | if (c == '$') { /* "\_$" is end-of-line */ |
1211 | EMIT(NFA_EOL); |
1212 | had_eol = TRUE; |
1213 | break; |
1214 | } |
1215 | |
1216 | extra = NFA_ADD_NL; |
1217 | |
1218 | /* "\_[" is collection plus newline */ |
1219 | if (c == '[') |
1220 | goto collection; |
1221 | |
1222 | // "\_x" is character class plus newline |
1223 | FALLTHROUGH; |
1224 | |
1225 | /* |
1226 | * Character classes. |
1227 | */ |
1228 | case Magic('.'): |
1229 | case Magic('i'): |
1230 | case Magic('I'): |
1231 | case Magic('k'): |
1232 | case Magic('K'): |
1233 | case Magic('f'): |
1234 | case Magic('F'): |
1235 | case Magic('p'): |
1236 | case Magic('P'): |
1237 | case Magic('s'): |
1238 | case Magic('S'): |
1239 | case Magic('d'): |
1240 | case Magic('D'): |
1241 | case Magic('x'): |
1242 | case Magic('X'): |
1243 | case Magic('o'): |
1244 | case Magic('O'): |
1245 | case Magic('w'): |
1246 | case Magic('W'): |
1247 | case Magic('h'): |
1248 | case Magic('H'): |
1249 | case Magic('a'): |
1250 | case Magic('A'): |
1251 | case Magic('l'): |
1252 | case Magic('L'): |
1253 | case Magic('u'): |
1254 | case Magic('U'): |
1255 | p = vim_strchr(classchars, no_Magic(c)); |
1256 | if (p == NULL) { |
1257 | if (extra == NFA_ADD_NL) { |
1258 | EMSGN(_(e_ill_char_class), c); |
1259 | rc_did_emsg = TRUE; |
1260 | return FAIL; |
1261 | } |
1262 | IEMSGN("INTERNAL: Unknown character class char: %" PRId64, c); |
1263 | return FAIL; |
1264 | } |
1265 | // When '.' is followed by a composing char ignore the dot, so that |
1266 | // the composing char is matched here. |
1267 | if (enc_utf8 && c == Magic('.') && utf_iscomposing(peekchr())) { |
1268 | old_regparse = regparse; |
1269 | c = getchr(); |
1270 | goto nfa_do_multibyte; |
1271 | } |
1272 | EMIT(nfa_classcodes[p - classchars]); |
1273 | if (extra == NFA_ADD_NL) { |
1274 | EMIT(NFA_NEWL); |
1275 | EMIT(NFA_OR); |
1276 | regflags |= RF_HASNL; |
1277 | } |
1278 | break; |
1279 | |
1280 | case Magic('n'): |
1281 | if (reg_string) { |
1282 | // In a string "\n" matches a newline character. |
1283 | EMIT(NL); |
1284 | } else { |
1285 | // In buffer text "\n" matches the end of a line. |
1286 | EMIT(NFA_NEWL); |
1287 | regflags |= RF_HASNL; |
1288 | } |
1289 | break; |
1290 | |
1291 | case Magic('('): |
1292 | if (nfa_reg(REG_PAREN) == FAIL) { |
1293 | return FAIL; // cascaded error |
1294 | } |
1295 | break; |
1296 | |
1297 | case Magic('|'): |
1298 | case Magic('&'): |
1299 | case Magic(')'): |
1300 | EMSGN(_(e_misplaced), no_Magic(c)); // -V1037 |
1301 | return FAIL; |
1302 | |
1303 | case Magic('='): |
1304 | case Magic('?'): |
1305 | case Magic('+'): |
1306 | case Magic('@'): |
1307 | case Magic('*'): |
1308 | case Magic('{'): |
1309 | // these should follow an atom, not form an atom |
1310 | EMSGN(_(e_misplaced), no_Magic(c)); |
1311 | return FAIL; |
1312 | |
1313 | case Magic('~'): |
1314 | { |
1315 | char_u *lp; |
1316 | |
1317 | // Previous substitute pattern. |
1318 | // Generated as "\%(pattern\)". |
1319 | if (reg_prev_sub == NULL) { |
1320 | EMSG(_(e_nopresub)); |
1321 | return FAIL; |
1322 | } |
1323 | for (lp = reg_prev_sub; *lp != NUL; MB_CPTR_ADV(lp)) { |
1324 | EMIT(PTR2CHAR(lp)); |
1325 | if (lp != reg_prev_sub) |
1326 | EMIT(NFA_CONCAT); |
1327 | } |
1328 | EMIT(NFA_NOPEN); |
1329 | break; |
1330 | } |
1331 | |
1332 | case Magic('1'): |
1333 | case Magic('2'): |
1334 | case Magic('3'): |
1335 | case Magic('4'): |
1336 | case Magic('5'): |
1337 | case Magic('6'): |
1338 | case Magic('7'): |
1339 | case Magic('8'): |
1340 | case Magic('9'): |
1341 | { |
1342 | int refnum = no_Magic(c) - '1'; |
1343 | |
1344 | if (!seen_endbrace(refnum + 1)) { |
1345 | return FAIL; |
1346 | } |
1347 | EMIT(NFA_BACKREF1 + refnum); |
1348 | nfa_has_backref = true; |
1349 | } |
1350 | break; |
1351 | |
1352 | case Magic('z'): |
1353 | c = no_Magic(getchr()); |
1354 | switch (c) { |
1355 | case 's': |
1356 | EMIT(NFA_ZSTART); |
1357 | if (!re_mult_next("\\zs" )) { |
1358 | return false; |
1359 | } |
1360 | break; |
1361 | case 'e': |
1362 | EMIT(NFA_ZEND); |
1363 | nfa_has_zend = true; |
1364 | if (!re_mult_next("\\zs" )) { |
1365 | return false; |
1366 | } |
1367 | break; |
1368 | case '1': |
1369 | case '2': |
1370 | case '3': |
1371 | case '4': |
1372 | case '5': |
1373 | case '6': |
1374 | case '7': |
1375 | case '8': |
1376 | case '9': |
1377 | // \z1...\z9 |
1378 | if ((reg_do_extmatch & REX_USE) == 0) { |
1379 | EMSG_RET_FAIL(_(e_z1_not_allowed)); |
1380 | } |
1381 | EMIT(NFA_ZREF1 + (no_Magic(c) - '1')); |
1382 | /* No need to set nfa_has_backref, the sub-matches don't |
1383 | * change when \z1 .. \z9 matches or not. */ |
1384 | re_has_z = REX_USE; |
1385 | break; |
1386 | case '(': |
1387 | // \z( |
1388 | if (reg_do_extmatch != REX_SET) { |
1389 | EMSG_RET_FAIL(_(e_z_not_allowed)); |
1390 | } |
1391 | if (nfa_reg(REG_ZPAREN) == FAIL) { |
1392 | return FAIL; // cascaded error |
1393 | } |
1394 | re_has_z = REX_SET; |
1395 | break; |
1396 | default: |
1397 | emsgf(_("E867: (NFA) Unknown operator '\\z%c'" ), |
1398 | no_Magic(c)); |
1399 | return FAIL; |
1400 | } |
1401 | break; |
1402 | |
1403 | case Magic('%'): |
1404 | c = no_Magic(getchr()); |
1405 | switch (c) { |
1406 | /* () without a back reference */ |
1407 | case '(': |
1408 | if (nfa_reg(REG_NPAREN) == FAIL) |
1409 | return FAIL; |
1410 | EMIT(NFA_NOPEN); |
1411 | break; |
1412 | |
1413 | case 'd': /* %d123 decimal */ |
1414 | case 'o': /* %o123 octal */ |
1415 | case 'x': /* %xab hex 2 */ |
1416 | case 'u': /* %uabcd hex 4 */ |
1417 | case 'U': /* %U1234abcd hex 8 */ |
1418 | { |
1419 | int64_t nr; |
1420 | |
1421 | switch (c) { |
1422 | case 'd': nr = getdecchrs(); break; |
1423 | case 'o': nr = getoctchrs(); break; |
1424 | case 'x': nr = gethexchrs(2); break; |
1425 | case 'u': nr = gethexchrs(4); break; |
1426 | case 'U': nr = gethexchrs(8); break; |
1427 | default: nr = -1; break; |
1428 | } |
1429 | |
1430 | if (nr < 0 || nr > INT_MAX) { |
1431 | EMSG2_RET_FAIL(_("E678: Invalid character after %s%%[dxouU]" ), |
1432 | reg_magic == MAGIC_ALL); |
1433 | } |
1434 | // A NUL is stored in the text as NL |
1435 | // TODO(vim): what if a composing character follows? |
1436 | EMIT(nr == 0 ? 0x0a : nr); |
1437 | } |
1438 | break; |
1439 | |
1440 | /* Catch \%^ and \%$ regardless of where they appear in the |
1441 | * pattern -- regardless of whether or not it makes sense. */ |
1442 | case '^': |
1443 | EMIT(NFA_BOF); |
1444 | break; |
1445 | |
1446 | case '$': |
1447 | EMIT(NFA_EOF); |
1448 | break; |
1449 | |
1450 | case '#': |
1451 | EMIT(NFA_CURSOR); |
1452 | break; |
1453 | |
1454 | case 'V': |
1455 | EMIT(NFA_VISUAL); |
1456 | break; |
1457 | |
1458 | case 'C': |
1459 | EMIT(NFA_ANY_COMPOSING); |
1460 | break; |
1461 | |
1462 | case '[': |
1463 | { |
1464 | int n; |
1465 | |
1466 | /* \%[abc] */ |
1467 | for (n = 0; (c = peekchr()) != ']'; ++n) { |
1468 | if (c == NUL) |
1469 | EMSG2_RET_FAIL(_(e_missing_sb), |
1470 | reg_magic == MAGIC_ALL); |
1471 | /* recursive call! */ |
1472 | if (nfa_regatom() == FAIL) |
1473 | return FAIL; |
1474 | } |
1475 | getchr(); /* get the ] */ |
1476 | if (n == 0) |
1477 | EMSG2_RET_FAIL(_(e_empty_sb), |
1478 | reg_magic == MAGIC_ALL); |
1479 | EMIT(NFA_OPT_CHARS); |
1480 | EMIT(n); |
1481 | |
1482 | /* Emit as "\%(\%[abc]\)" to be able to handle |
1483 | * "\%[abc]*" which would cause the empty string to be |
1484 | * matched an unlimited number of times. NFA_NOPEN is |
1485 | * added only once at a position, while NFA_SPLIT is |
1486 | * added multiple times. This is more efficient than |
1487 | * not allowing NFA_SPLIT multiple times, it is used |
1488 | * a lot. */ |
1489 | EMIT(NFA_NOPEN); |
1490 | break; |
1491 | } |
1492 | |
1493 | default: |
1494 | { |
1495 | int64_t n = 0; |
1496 | const int cmp = c; |
1497 | |
1498 | if (c == '<' || c == '>') |
1499 | c = getchr(); |
1500 | while (ascii_isdigit(c)) { |
1501 | if (n > (INT32_MAX - (c - '0')) / 10) { |
1502 | EMSG(_("E951: \\% value too large" )); |
1503 | return FAIL; |
1504 | } |
1505 | n = n * 10 + (c - '0'); |
1506 | c = getchr(); |
1507 | } |
1508 | if (c == 'l' || c == 'c' || c == 'v') { |
1509 | int32_t limit = INT32_MAX; |
1510 | |
1511 | if (c == 'l') { |
1512 | // \%{n}l \%{n}<l \%{n}>l |
1513 | EMIT(cmp == '<' ? NFA_LNUM_LT : |
1514 | cmp == '>' ? NFA_LNUM_GT : NFA_LNUM); |
1515 | if (save_prev_at_start) { |
1516 | at_start = true; |
1517 | } |
1518 | } else if (c == 'c') { |
1519 | // \%{n}c \%{n}<c \%{n}>c |
1520 | EMIT(cmp == '<' ? NFA_COL_LT : |
1521 | cmp == '>' ? NFA_COL_GT : NFA_COL); |
1522 | } else { |
1523 | // \%{n}v \%{n}<v \%{n}>v |
1524 | EMIT(cmp == '<' ? NFA_VCOL_LT : |
1525 | cmp == '>' ? NFA_VCOL_GT : NFA_VCOL); |
1526 | limit = INT32_MAX / MB_MAXBYTES; |
1527 | } |
1528 | if (n >= limit) { |
1529 | EMSG(_("E951: \\% value too large" )); |
1530 | return FAIL; |
1531 | } |
1532 | EMIT((int)n); |
1533 | break; |
1534 | } else if (c == '\'' && n == 0) { |
1535 | /* \%'m \%<'m \%>'m */ |
1536 | EMIT(cmp == '<' ? NFA_MARK_LT : |
1537 | cmp == '>' ? NFA_MARK_GT : NFA_MARK); |
1538 | EMIT(getchr()); |
1539 | break; |
1540 | } |
1541 | } |
1542 | emsgf(_("E867: (NFA) Unknown operator '\\%%%c'" ), |
1543 | no_Magic(c)); |
1544 | return FAIL; |
1545 | } |
1546 | break; |
1547 | |
1548 | case Magic('['): |
1549 | collection: |
1550 | /* |
1551 | * [abc] uses NFA_START_COLL - NFA_END_COLL |
1552 | * [^abc] uses NFA_START_NEG_COLL - NFA_END_NEG_COLL |
1553 | * Each character is produced as a regular state, using |
1554 | * NFA_CONCAT to bind them together. |
1555 | * Besides normal characters there can be: |
1556 | * - character classes NFA_CLASS_* |
1557 | * - ranges, two characters followed by NFA_RANGE. |
1558 | */ |
1559 | |
1560 | p = regparse; |
1561 | endp = skip_anyof(p); |
1562 | if (*endp == ']') { |
1563 | /* |
1564 | * Try to reverse engineer character classes. For example, |
1565 | * recognize that [0-9] stands for \d and [A-Za-z_] for \h, |
1566 | * and perform the necessary substitutions in the NFA. |
1567 | */ |
1568 | int result = nfa_recognize_char_class(regparse, endp, |
1569 | extra == NFA_ADD_NL); |
1570 | if (result != FAIL) { |
1571 | if (result >= NFA_FIRST_NL && result <= NFA_LAST_NL) { |
1572 | EMIT(result - NFA_ADD_NL); |
1573 | EMIT(NFA_NEWL); |
1574 | EMIT(NFA_OR); |
1575 | } else |
1576 | EMIT(result); |
1577 | regparse = endp; |
1578 | MB_PTR_ADV(regparse); |
1579 | return OK; |
1580 | } |
1581 | /* |
1582 | * Failed to recognize a character class. Use the simple |
1583 | * version that turns [abc] into 'a' OR 'b' OR 'c' |
1584 | */ |
1585 | startc = endc = oldstartc = -1; |
1586 | negated = false; |
1587 | if (*regparse == '^') { // negated range |
1588 | negated = true; |
1589 | MB_PTR_ADV(regparse); |
1590 | EMIT(NFA_START_NEG_COLL); |
1591 | } else |
1592 | EMIT(NFA_START_COLL); |
1593 | if (*regparse == '-') { |
1594 | startc = '-'; |
1595 | EMIT(startc); |
1596 | EMIT(NFA_CONCAT); |
1597 | MB_PTR_ADV(regparse); |
1598 | } |
1599 | /* Emit the OR branches for each character in the [] */ |
1600 | emit_range = FALSE; |
1601 | while (regparse < endp) { |
1602 | oldstartc = startc; |
1603 | startc = -1; |
1604 | got_coll_char = FALSE; |
1605 | if (*regparse == '[') { |
1606 | /* Check for [: :], [= =], [. .] */ |
1607 | equiclass = collclass = 0; |
1608 | charclass = get_char_class(®parse); |
1609 | if (charclass == CLASS_NONE) { |
1610 | equiclass = get_equi_class(®parse); |
1611 | if (equiclass == 0) |
1612 | collclass = get_coll_element(®parse); |
1613 | } |
1614 | |
1615 | /* Character class like [:alpha:] */ |
1616 | if (charclass != CLASS_NONE) { |
1617 | switch (charclass) { |
1618 | case CLASS_ALNUM: |
1619 | EMIT(NFA_CLASS_ALNUM); |
1620 | break; |
1621 | case CLASS_ALPHA: |
1622 | EMIT(NFA_CLASS_ALPHA); |
1623 | break; |
1624 | case CLASS_BLANK: |
1625 | EMIT(NFA_CLASS_BLANK); |
1626 | break; |
1627 | case CLASS_CNTRL: |
1628 | EMIT(NFA_CLASS_CNTRL); |
1629 | break; |
1630 | case CLASS_DIGIT: |
1631 | EMIT(NFA_CLASS_DIGIT); |
1632 | break; |
1633 | case CLASS_GRAPH: |
1634 | EMIT(NFA_CLASS_GRAPH); |
1635 | break; |
1636 | case CLASS_LOWER: |
1637 | EMIT(NFA_CLASS_LOWER); |
1638 | break; |
1639 | case CLASS_PRINT: |
1640 | EMIT(NFA_CLASS_PRINT); |
1641 | break; |
1642 | case CLASS_PUNCT: |
1643 | EMIT(NFA_CLASS_PUNCT); |
1644 | break; |
1645 | case CLASS_SPACE: |
1646 | EMIT(NFA_CLASS_SPACE); |
1647 | break; |
1648 | case CLASS_UPPER: |
1649 | EMIT(NFA_CLASS_UPPER); |
1650 | break; |
1651 | case CLASS_XDIGIT: |
1652 | EMIT(NFA_CLASS_XDIGIT); |
1653 | break; |
1654 | case CLASS_TAB: |
1655 | EMIT(NFA_CLASS_TAB); |
1656 | break; |
1657 | case CLASS_RETURN: |
1658 | EMIT(NFA_CLASS_RETURN); |
1659 | break; |
1660 | case CLASS_BACKSPACE: |
1661 | EMIT(NFA_CLASS_BACKSPACE); |
1662 | break; |
1663 | case CLASS_ESCAPE: |
1664 | EMIT(NFA_CLASS_ESCAPE); |
1665 | break; |
1666 | } |
1667 | EMIT(NFA_CONCAT); |
1668 | continue; |
1669 | } |
1670 | /* Try equivalence class [=a=] and the like */ |
1671 | if (equiclass != 0) { |
1672 | nfa_emit_equi_class(equiclass); |
1673 | continue; |
1674 | } |
1675 | /* Try collating class like [. .] */ |
1676 | if (collclass != 0) { |
1677 | startc = collclass; /* allow [.a.]-x as a range */ |
1678 | /* Will emit the proper atom at the end of the |
1679 | * while loop. */ |
1680 | } |
1681 | } |
1682 | /* Try a range like 'a-x' or '\t-z'. Also allows '-' as a |
1683 | * start character. */ |
1684 | if (*regparse == '-' && oldstartc != -1) { |
1685 | emit_range = TRUE; |
1686 | startc = oldstartc; |
1687 | MB_PTR_ADV(regparse); |
1688 | continue; // reading the end of the range |
1689 | } |
1690 | |
1691 | /* Now handle simple and escaped characters. |
1692 | * Only "\]", "\^", "\]" and "\\" are special in Vi. Vim |
1693 | * accepts "\t", "\e", etc., but only when the 'l' flag in |
1694 | * 'cpoptions' is not included. |
1695 | */ |
1696 | if (*regparse == '\\' |
1697 | && regparse + 1 <= endp |
1698 | && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL |
1699 | || (!reg_cpo_lit |
1700 | && vim_strchr(REGEXP_ABBR, regparse[1]) |
1701 | != NULL) |
1702 | ) |
1703 | ) { |
1704 | MB_PTR_ADV(regparse); |
1705 | |
1706 | if (*regparse == 'n') { |
1707 | startc = (reg_string || emit_range || regparse[1] == '-') |
1708 | ? NL : NFA_NEWL; |
1709 | } else if (*regparse == 'd' |
1710 | || *regparse == 'o' |
1711 | || *regparse == 'x' |
1712 | || *regparse == 'u' |
1713 | || *regparse == 'U' |
1714 | ) { |
1715 | // TODO(RE): This needs more testing |
1716 | startc = coll_get_char(); |
1717 | got_coll_char = true; |
1718 | MB_PTR_BACK(old_regparse, regparse); |
1719 | } else { |
1720 | /* \r,\t,\e,\b */ |
1721 | startc = backslash_trans(*regparse); |
1722 | } |
1723 | } |
1724 | |
1725 | /* Normal printable char */ |
1726 | if (startc == -1) |
1727 | startc = PTR2CHAR(regparse); |
1728 | |
1729 | /* Previous char was '-', so this char is end of range. */ |
1730 | if (emit_range) { |
1731 | endc = startc; |
1732 | startc = oldstartc; |
1733 | if (startc > endc) { |
1734 | EMSG_RET_FAIL(_(e_reverse_range)); |
1735 | } |
1736 | |
1737 | if (endc > startc + 2) { |
1738 | /* Emit a range instead of the sequence of |
1739 | * individual characters. */ |
1740 | if (startc == 0) |
1741 | /* \x00 is translated to \x0a, start at \x01. */ |
1742 | EMIT(1); |
1743 | else |
1744 | --post_ptr; /* remove NFA_CONCAT */ |
1745 | EMIT(endc); |
1746 | EMIT(NFA_RANGE); |
1747 | EMIT(NFA_CONCAT); |
1748 | } else if (has_mbyte && ((*mb_char2len)(startc) > 1 |
1749 | || (*mb_char2len)(endc) > 1)) { |
1750 | /* Emit the characters in the range. |
1751 | * "startc" was already emitted, so skip it. |
1752 | * */ |
1753 | for (c = startc + 1; c <= endc; c++) { |
1754 | EMIT(c); |
1755 | EMIT(NFA_CONCAT); |
1756 | } |
1757 | } else { |
1758 | /* Emit the range. "startc" was already emitted, so |
1759 | * skip it. */ |
1760 | for (c = startc + 1; c <= endc; c++) { |
1761 | EMIT(c); |
1762 | EMIT(NFA_CONCAT); |
1763 | } |
1764 | } |
1765 | emit_range = FALSE; |
1766 | startc = -1; |
1767 | } else { |
1768 | /* This char (startc) is not part of a range. Just |
1769 | * emit it. |
1770 | * Normally, simply emit startc. But if we get char |
1771 | * code=0 from a collating char, then replace it with |
1772 | * 0x0a. |
1773 | * This is needed to completely mimic the behaviour of |
1774 | * the backtracking engine. */ |
1775 | if (startc == NFA_NEWL) { |
1776 | /* Line break can't be matched as part of the |
1777 | * collection, add an OR below. But not for negated |
1778 | * range. */ |
1779 | if (!negated) |
1780 | extra = NFA_ADD_NL; |
1781 | } else { |
1782 | if (got_coll_char == TRUE && startc == 0) |
1783 | EMIT(0x0a); |
1784 | else |
1785 | EMIT(startc); |
1786 | EMIT(NFA_CONCAT); |
1787 | } |
1788 | } |
1789 | |
1790 | MB_PTR_ADV(regparse); |
1791 | } // while (p < endp) |
1792 | |
1793 | MB_PTR_BACK(old_regparse, regparse); |
1794 | if (*regparse == '-') { // if last, '-' is just a char |
1795 | EMIT('-'); |
1796 | EMIT(NFA_CONCAT); |
1797 | } |
1798 | |
1799 | /* skip the trailing ] */ |
1800 | regparse = endp; |
1801 | MB_PTR_ADV(regparse); |
1802 | |
1803 | /* Mark end of the collection. */ |
1804 | if (negated == TRUE) |
1805 | EMIT(NFA_END_NEG_COLL); |
1806 | else |
1807 | EMIT(NFA_END_COLL); |
1808 | |
1809 | /* \_[] also matches \n but it's not negated */ |
1810 | if (extra == NFA_ADD_NL) { |
1811 | EMIT(reg_string ? NL : NFA_NEWL); |
1812 | EMIT(NFA_OR); |
1813 | } |
1814 | |
1815 | return OK; |
1816 | } /* if exists closing ] */ |
1817 | |
1818 | if (reg_strict) |
1819 | EMSG_RET_FAIL(_(e_missingbracket)); |
1820 | FALLTHROUGH; |
1821 | |
1822 | default: |
1823 | { |
1824 | int plen; |
1825 | |
1826 | nfa_do_multibyte: |
1827 | // plen is length of current char with composing chars |
1828 | if (enc_utf8 && ((*mb_char2len)(c) |
1829 | != (plen = utfc_ptr2len(old_regparse)) |
1830 | || utf_iscomposing(c))) { |
1831 | int i = 0; |
1832 | |
1833 | /* A base character plus composing characters, or just one |
1834 | * or more composing characters. |
1835 | * This requires creating a separate atom as if enclosing |
1836 | * the characters in (), where NFA_COMPOSING is the ( and |
1837 | * NFA_END_COMPOSING is the ). Note that right now we are |
1838 | * building the postfix form, not the NFA itself; |
1839 | * a composing char could be: a, b, c, NFA_COMPOSING |
1840 | * where 'b' and 'c' are chars with codes > 256. */ |
1841 | for (;; ) { |
1842 | EMIT(c); |
1843 | if (i > 0) |
1844 | EMIT(NFA_CONCAT); |
1845 | if ((i += utf_char2len(c)) >= plen) |
1846 | break; |
1847 | c = utf_ptr2char(old_regparse + i); |
1848 | } |
1849 | EMIT(NFA_COMPOSING); |
1850 | regparse = old_regparse + plen; |
1851 | } else { |
1852 | c = no_Magic(c); |
1853 | EMIT(c); |
1854 | } |
1855 | return OK; |
1856 | } |
1857 | } |
1858 | |
1859 | return OK; |
1860 | } |
1861 | |
1862 | /* |
1863 | * Parse something followed by possible [*+=]. |
1864 | * |
1865 | * A piece is an atom, possibly followed by a multi, an indication of how many |
1866 | * times the atom can be matched. Example: "a*" matches any sequence of "a" |
1867 | * characters: "", "a", "aa", etc. |
1868 | * |
1869 | * piece ::= atom |
1870 | * or atom multi |
1871 | */ |
1872 | static int nfa_regpiece(void) |
1873 | { |
1874 | int i; |
1875 | int op; |
1876 | int ret; |
1877 | long minval, maxval; |
1878 | int greedy = TRUE; /* Braces are prefixed with '-' ? */ |
1879 | parse_state_T old_state; |
1880 | parse_state_T new_state; |
1881 | int64_t c2; |
1882 | int old_post_pos; |
1883 | int my_post_start; |
1884 | int quest; |
1885 | |
1886 | /* Save the current parse state, so that we can use it if <atom>{m,n} is |
1887 | * next. */ |
1888 | save_parse_state(&old_state); |
1889 | |
1890 | /* store current pos in the postfix form, for \{m,n} involving 0s */ |
1891 | my_post_start = (int)(post_ptr - post_start); |
1892 | |
1893 | ret = nfa_regatom(); |
1894 | if (ret == FAIL) |
1895 | return FAIL; /* cascaded error */ |
1896 | |
1897 | op = peekchr(); |
1898 | if (re_multi_type(op) == NOT_MULTI) |
1899 | return OK; |
1900 | |
1901 | skipchr(); |
1902 | switch (op) { |
1903 | case Magic('*'): |
1904 | EMIT(NFA_STAR); |
1905 | break; |
1906 | |
1907 | case Magic('+'): |
1908 | /* |
1909 | * Trick: Normally, (a*)\+ would match the whole input "aaa". The |
1910 | * first and only submatch would be "aaa". But the backtracking |
1911 | * engine interprets the plus as "try matching one more time", and |
1912 | * a* matches a second time at the end of the input, the empty |
1913 | * string. |
1914 | * The submatch will be the empty string. |
1915 | * |
1916 | * In order to be consistent with the old engine, we replace |
1917 | * <atom>+ with <atom><atom>* |
1918 | */ |
1919 | restore_parse_state(&old_state); |
1920 | curchr = -1; |
1921 | if (nfa_regatom() == FAIL) |
1922 | return FAIL; |
1923 | EMIT(NFA_STAR); |
1924 | EMIT(NFA_CONCAT); |
1925 | skipchr(); /* skip the \+ */ |
1926 | break; |
1927 | |
1928 | case Magic('@'): |
1929 | c2 = getdecchrs(); |
1930 | op = no_Magic(getchr()); |
1931 | i = 0; |
1932 | switch(op) { |
1933 | case '=': |
1934 | /* \@= */ |
1935 | i = NFA_PREV_ATOM_NO_WIDTH; |
1936 | break; |
1937 | case '!': |
1938 | /* \@! */ |
1939 | i = NFA_PREV_ATOM_NO_WIDTH_NEG; |
1940 | break; |
1941 | case '<': |
1942 | op = no_Magic(getchr()); |
1943 | if (op == '=') |
1944 | /* \@<= */ |
1945 | i = NFA_PREV_ATOM_JUST_BEFORE; |
1946 | else if (op == '!') |
1947 | /* \@<! */ |
1948 | i = NFA_PREV_ATOM_JUST_BEFORE_NEG; |
1949 | break; |
1950 | case '>': |
1951 | /* \@> */ |
1952 | i = NFA_PREV_ATOM_LIKE_PATTERN; |
1953 | break; |
1954 | } |
1955 | if (i == 0) { |
1956 | emsgf(_("E869: (NFA) Unknown operator '\\@%c'" ), op); |
1957 | return FAIL; |
1958 | } |
1959 | EMIT(i); |
1960 | if (i == NFA_PREV_ATOM_JUST_BEFORE |
1961 | || i == NFA_PREV_ATOM_JUST_BEFORE_NEG) |
1962 | EMIT(c2); |
1963 | break; |
1964 | |
1965 | case Magic('?'): |
1966 | case Magic('='): |
1967 | EMIT(NFA_QUEST); |
1968 | break; |
1969 | |
1970 | case Magic('{'): |
1971 | /* a{2,5} will expand to 'aaa?a?a?' |
1972 | * a{-1,3} will expand to 'aa??a??', where ?? is the nongreedy |
1973 | * version of '?' |
1974 | * \v(ab){2,3} will expand to '(ab)(ab)(ab)?', where all the |
1975 | * parenthesis have the same id |
1976 | */ |
1977 | |
1978 | greedy = TRUE; |
1979 | c2 = peekchr(); |
1980 | if (c2 == '-' || c2 == Magic('-')) { |
1981 | skipchr(); |
1982 | greedy = FALSE; |
1983 | } |
1984 | if (!read_limits(&minval, &maxval)) |
1985 | EMSG_RET_FAIL(_("E870: (NFA regexp) Error reading repetition limits" )); |
1986 | |
1987 | /* <atom>{0,inf}, <atom>{0,} and <atom>{} are equivalent to |
1988 | * <atom>* */ |
1989 | if (minval == 0 && maxval == MAX_LIMIT) { |
1990 | if (greedy) |
1991 | /* \{}, \{0,} */ |
1992 | EMIT(NFA_STAR); |
1993 | else |
1994 | /* \{-}, \{-0,} */ |
1995 | EMIT(NFA_STAR_NONGREEDY); |
1996 | break; |
1997 | } |
1998 | |
1999 | /* Special case: x{0} or x{-0} */ |
2000 | if (maxval == 0) { |
2001 | /* Ignore result of previous call to nfa_regatom() */ |
2002 | post_ptr = post_start + my_post_start; |
2003 | /* NFA_EMPTY is 0-length and works everywhere */ |
2004 | EMIT(NFA_EMPTY); |
2005 | return OK; |
2006 | } |
2007 | |
2008 | // The engine is very inefficient (uses too many states) when the maximum |
2009 | // is much larger than the minimum and when the maximum is large. Bail out |
2010 | // if we can use the other engine. |
2011 | if ((nfa_re_flags & RE_AUTO) && (maxval > 500 || maxval > minval + 200)) { |
2012 | return FAIL; |
2013 | } |
2014 | |
2015 | /* Ignore previous call to nfa_regatom() */ |
2016 | post_ptr = post_start + my_post_start; |
2017 | /* Save parse state after the repeated atom and the \{} */ |
2018 | save_parse_state(&new_state); |
2019 | |
2020 | quest = (greedy == TRUE ? NFA_QUEST : NFA_QUEST_NONGREEDY); |
2021 | for (i = 0; i < maxval; i++) { |
2022 | /* Goto beginning of the repeated atom */ |
2023 | restore_parse_state(&old_state); |
2024 | old_post_pos = (int)(post_ptr - post_start); |
2025 | if (nfa_regatom() == FAIL) |
2026 | return FAIL; |
2027 | /* after "minval" times, atoms are optional */ |
2028 | if (i + 1 > minval) { |
2029 | if (maxval == MAX_LIMIT) { |
2030 | if (greedy) |
2031 | EMIT(NFA_STAR); |
2032 | else |
2033 | EMIT(NFA_STAR_NONGREEDY); |
2034 | } else |
2035 | EMIT(quest); |
2036 | } |
2037 | if (old_post_pos != my_post_start) |
2038 | EMIT(NFA_CONCAT); |
2039 | if (i + 1 > minval && maxval == MAX_LIMIT) |
2040 | break; |
2041 | } |
2042 | |
2043 | /* Go to just after the repeated atom and the \{} */ |
2044 | restore_parse_state(&new_state); |
2045 | curchr = -1; |
2046 | |
2047 | break; |
2048 | |
2049 | |
2050 | default: |
2051 | break; |
2052 | } /* end switch */ |
2053 | |
2054 | if (re_multi_type(peekchr()) != NOT_MULTI) { |
2055 | // Can't have a multi follow a multi. |
2056 | EMSG_RET_FAIL(_("E871: (NFA regexp) Can't have a multi follow a multi" )); |
2057 | } |
2058 | |
2059 | return OK; |
2060 | } |
2061 | |
2062 | /* |
2063 | * Parse one or more pieces, concatenated. It matches a match for the |
2064 | * first piece, followed by a match for the second piece, etc. Example: |
2065 | * "f[0-9]b", first matches "f", then a digit and then "b". |
2066 | * |
2067 | * concat ::= piece |
2068 | * or piece piece |
2069 | * or piece piece piece |
2070 | * etc. |
2071 | */ |
2072 | static int nfa_regconcat(void) |
2073 | { |
2074 | int cont = TRUE; |
2075 | int first = TRUE; |
2076 | |
2077 | while (cont) { |
2078 | switch (peekchr()) { |
2079 | case NUL: |
2080 | case Magic('|'): |
2081 | case Magic('&'): |
2082 | case Magic(')'): |
2083 | cont = FALSE; |
2084 | break; |
2085 | |
2086 | case Magic('Z'): |
2087 | regflags |= RF_ICOMBINE; |
2088 | skipchr_keepstart(); |
2089 | break; |
2090 | case Magic('c'): |
2091 | regflags |= RF_ICASE; |
2092 | skipchr_keepstart(); |
2093 | break; |
2094 | case Magic('C'): |
2095 | regflags |= RF_NOICASE; |
2096 | skipchr_keepstart(); |
2097 | break; |
2098 | case Magic('v'): |
2099 | reg_magic = MAGIC_ALL; |
2100 | skipchr_keepstart(); |
2101 | curchr = -1; |
2102 | break; |
2103 | case Magic('m'): |
2104 | reg_magic = MAGIC_ON; |
2105 | skipchr_keepstart(); |
2106 | curchr = -1; |
2107 | break; |
2108 | case Magic('M'): |
2109 | reg_magic = MAGIC_OFF; |
2110 | skipchr_keepstart(); |
2111 | curchr = -1; |
2112 | break; |
2113 | case Magic('V'): |
2114 | reg_magic = MAGIC_NONE; |
2115 | skipchr_keepstart(); |
2116 | curchr = -1; |
2117 | break; |
2118 | |
2119 | default: |
2120 | if (nfa_regpiece() == FAIL) |
2121 | return FAIL; |
2122 | if (first == FALSE) |
2123 | EMIT(NFA_CONCAT); |
2124 | else |
2125 | first = FALSE; |
2126 | break; |
2127 | } |
2128 | } |
2129 | |
2130 | return OK; |
2131 | } |
2132 | |
2133 | /* |
2134 | * Parse a branch, one or more concats, separated by "\&". It matches the |
2135 | * last concat, but only if all the preceding concats also match at the same |
2136 | * position. Examples: |
2137 | * "foobeep\&..." matches "foo" in "foobeep". |
2138 | * ".*Peter\&.*Bob" matches in a line containing both "Peter" and "Bob" |
2139 | * |
2140 | * branch ::= concat |
2141 | * or concat \& concat |
2142 | * or concat \& concat \& concat |
2143 | * etc. |
2144 | */ |
2145 | static int nfa_regbranch(void) |
2146 | { |
2147 | int old_post_pos; |
2148 | |
2149 | old_post_pos = (int)(post_ptr - post_start); |
2150 | |
2151 | /* First branch, possibly the only one */ |
2152 | if (nfa_regconcat() == FAIL) |
2153 | return FAIL; |
2154 | |
2155 | // Try next concats |
2156 | while (peekchr() == Magic('&')) { |
2157 | skipchr(); |
2158 | // if concat is empty do emit a node |
2159 | if (old_post_pos == (int)(post_ptr - post_start)) { |
2160 | EMIT(NFA_EMPTY); |
2161 | } |
2162 | EMIT(NFA_NOPEN); |
2163 | EMIT(NFA_PREV_ATOM_NO_WIDTH); |
2164 | old_post_pos = (int)(post_ptr - post_start); |
2165 | if (nfa_regconcat() == FAIL) |
2166 | return FAIL; |
2167 | /* if concat is empty do emit a node */ |
2168 | if (old_post_pos == (int)(post_ptr - post_start)) |
2169 | EMIT(NFA_EMPTY); |
2170 | EMIT(NFA_CONCAT); |
2171 | } |
2172 | |
2173 | /* if a branch is empty, emit one node for it */ |
2174 | if (old_post_pos == (int)(post_ptr - post_start)) |
2175 | EMIT(NFA_EMPTY); |
2176 | |
2177 | return OK; |
2178 | } |
2179 | |
2180 | /* |
2181 | * Parse a pattern, one or more branches, separated by "\|". It matches |
2182 | * anything that matches one of the branches. Example: "foo\|beep" matches |
2183 | * "foo" and matches "beep". If more than one branch matches, the first one |
2184 | * is used. |
2185 | * |
2186 | * pattern ::= branch |
2187 | * or branch \| branch |
2188 | * or branch \| branch \| branch |
2189 | * etc. |
2190 | */ |
2191 | static int |
2192 | nfa_reg ( |
2193 | int paren /* REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN */ |
2194 | ) |
2195 | { |
2196 | int parno = 0; |
2197 | |
2198 | if (paren == REG_PAREN) { |
2199 | if (regnpar >= NSUBEXP) /* Too many `(' */ |
2200 | EMSG_RET_FAIL(_("E872: (NFA regexp) Too many '('" )); |
2201 | parno = regnpar++; |
2202 | } else if (paren == REG_ZPAREN) { |
2203 | /* Make a ZOPEN node. */ |
2204 | if (regnzpar >= NSUBEXP) |
2205 | EMSG_RET_FAIL(_("E879: (NFA regexp) Too many \\z(" )); |
2206 | parno = regnzpar++; |
2207 | } |
2208 | |
2209 | if (nfa_regbranch() == FAIL) |
2210 | return FAIL; /* cascaded error */ |
2211 | |
2212 | while (peekchr() == Magic('|')) { |
2213 | skipchr(); |
2214 | if (nfa_regbranch() == FAIL) |
2215 | return FAIL; /* cascaded error */ |
2216 | EMIT(NFA_OR); |
2217 | } |
2218 | |
2219 | /* Check for proper termination. */ |
2220 | if (paren != REG_NOPAREN && getchr() != Magic(')')) { |
2221 | if (paren == REG_NPAREN) |
2222 | EMSG2_RET_FAIL(_(e_unmatchedpp), reg_magic == MAGIC_ALL); |
2223 | else |
2224 | EMSG2_RET_FAIL(_(e_unmatchedp), reg_magic == MAGIC_ALL); |
2225 | } else if (paren == REG_NOPAREN && peekchr() != NUL) { |
2226 | if (peekchr() == Magic(')')) |
2227 | EMSG2_RET_FAIL(_(e_unmatchedpar), reg_magic == MAGIC_ALL); |
2228 | else |
2229 | EMSG_RET_FAIL(_("E873: (NFA regexp) proper termination error" )); |
2230 | } |
2231 | /* |
2232 | * Here we set the flag allowing back references to this set of |
2233 | * parentheses. |
2234 | */ |
2235 | if (paren == REG_PAREN) { |
2236 | had_endbrace[parno] = TRUE; /* have seen the close paren */ |
2237 | EMIT(NFA_MOPEN + parno); |
2238 | } else if (paren == REG_ZPAREN) |
2239 | EMIT(NFA_ZOPEN + parno); |
2240 | |
2241 | return OK; |
2242 | } |
2243 | |
2244 | #ifdef REGEXP_DEBUG |
// Scratch buffer that receives the printable name written by nfa_set_code().
static char_u code[50];

/*
 * Store a printable name for postfix item / state code "c" in "code".
 *
 * Values in the range NFA_FIRST_NL .. NFA_LAST_NL are first reduced by
 * NFA_ADD_NL and get " + NEWLINE " appended to the name of the reduced value.
 * Values without a dedicated name are written as "CHAR(x)", where "x" is
 * replaced by the value itself.
 */
static void nfa_set_code(int c)
{
  int addnl = FALSE;

  // Strip the "also matches a newline" offset; remembered for the suffix.
  if (c >= NFA_FIRST_NL && c <= NFA_LAST_NL) {
    addnl = TRUE;
    c -= NFA_ADD_NL;
  }

  STRCPY(code, "");
  switch (c) {
  case NFA_MATCH:     STRCPY(code, "NFA_MATCH "); break;
  case NFA_SPLIT:     STRCPY(code, "NFA_SPLIT "); break;
  case NFA_CONCAT:    STRCPY(code, "NFA_CONCAT "); break;
  case NFA_NEWL:      STRCPY(code, "NFA_NEWL "); break;
  case NFA_ZSTART:    STRCPY(code, "NFA_ZSTART"); break;
  case NFA_ZEND:      STRCPY(code, "NFA_ZEND"); break;

  case NFA_BACKREF1:  STRCPY(code, "NFA_BACKREF1"); break;
  case NFA_BACKREF2:  STRCPY(code, "NFA_BACKREF2"); break;
  case NFA_BACKREF3:  STRCPY(code, "NFA_BACKREF3"); break;
  case NFA_BACKREF4:  STRCPY(code, "NFA_BACKREF4"); break;
  case NFA_BACKREF5:  STRCPY(code, "NFA_BACKREF5"); break;
  case NFA_BACKREF6:  STRCPY(code, "NFA_BACKREF6"); break;
  case NFA_BACKREF7:  STRCPY(code, "NFA_BACKREF7"); break;
  case NFA_BACKREF8:  STRCPY(code, "NFA_BACKREF8"); break;
  case NFA_BACKREF9:  STRCPY(code, "NFA_BACKREF9"); break;
  case NFA_ZREF1:     STRCPY(code, "NFA_ZREF1"); break;
  case NFA_ZREF2:     STRCPY(code, "NFA_ZREF2"); break;
  case NFA_ZREF3:     STRCPY(code, "NFA_ZREF3"); break;
  case NFA_ZREF4:     STRCPY(code, "NFA_ZREF4"); break;
  case NFA_ZREF5:     STRCPY(code, "NFA_ZREF5"); break;
  case NFA_ZREF6:     STRCPY(code, "NFA_ZREF6"); break;
  case NFA_ZREF7:     STRCPY(code, "NFA_ZREF7"); break;
  case NFA_ZREF8:     STRCPY(code, "NFA_ZREF8"); break;
  case NFA_ZREF9:     STRCPY(code, "NFA_ZREF9"); break;
  case NFA_SKIP:      STRCPY(code, "NFA_SKIP"); break;

  case NFA_PREV_ATOM_NO_WIDTH:
    STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH"); break;
  case NFA_PREV_ATOM_NO_WIDTH_NEG:
    STRCPY(code, "NFA_PREV_ATOM_NO_WIDTH_NEG"); break;
  case NFA_PREV_ATOM_JUST_BEFORE:
    STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE"); break;
  case NFA_PREV_ATOM_JUST_BEFORE_NEG:
    STRCPY(code, "NFA_PREV_ATOM_JUST_BEFORE_NEG"); break;
  case NFA_PREV_ATOM_LIKE_PATTERN:
    STRCPY(code, "NFA_PREV_ATOM_LIKE_PATTERN"); break;

  case NFA_NOPEN:             STRCPY(code, "NFA_NOPEN"); break;
  case NFA_NCLOSE:            STRCPY(code, "NFA_NCLOSE"); break;
  case NFA_START_INVISIBLE:   STRCPY(code, "NFA_START_INVISIBLE"); break;
  case NFA_START_INVISIBLE_FIRST:
    STRCPY(code, "NFA_START_INVISIBLE_FIRST"); break;
  case NFA_START_INVISIBLE_NEG:
    STRCPY(code, "NFA_START_INVISIBLE_NEG"); break;
  case NFA_START_INVISIBLE_NEG_FIRST:
    STRCPY(code, "NFA_START_INVISIBLE_NEG_FIRST"); break;
  case NFA_START_INVISIBLE_BEFORE:
    STRCPY(code, "NFA_START_INVISIBLE_BEFORE"); break;
  case NFA_START_INVISIBLE_BEFORE_FIRST:
    STRCPY(code, "NFA_START_INVISIBLE_BEFORE_FIRST"); break;
  case NFA_START_INVISIBLE_BEFORE_NEG:
    STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG"); break;
  case NFA_START_INVISIBLE_BEFORE_NEG_FIRST:
    STRCPY(code, "NFA_START_INVISIBLE_BEFORE_NEG_FIRST"); break;
  case NFA_START_PATTERN:     STRCPY(code, "NFA_START_PATTERN"); break;
  case NFA_END_INVISIBLE:     STRCPY(code, "NFA_END_INVISIBLE"); break;
  case NFA_END_INVISIBLE_NEG: STRCPY(code, "NFA_END_INVISIBLE_NEG"); break;
  case NFA_END_PATTERN:       STRCPY(code, "NFA_END_PATTERN"); break;

  case NFA_COMPOSING:         STRCPY(code, "NFA_COMPOSING"); break;
  case NFA_END_COMPOSING:     STRCPY(code, "NFA_END_COMPOSING"); break;
  case NFA_OPT_CHARS:         STRCPY(code, "NFA_OPT_CHARS"); break;

  // For the four group families below the 'x' placeholder in the name is
  // overwritten with the group digit.
  case NFA_MOPEN:
  case NFA_MOPEN1:
  case NFA_MOPEN2:
  case NFA_MOPEN3:
  case NFA_MOPEN4:
  case NFA_MOPEN5:
  case NFA_MOPEN6:
  case NFA_MOPEN7:
  case NFA_MOPEN8:
  case NFA_MOPEN9:
    STRCPY(code, "NFA_MOPEN(x)");
    code[10] = c - NFA_MOPEN + '0';
    break;
  case NFA_MCLOSE:
  case NFA_MCLOSE1:
  case NFA_MCLOSE2:
  case NFA_MCLOSE3:
  case NFA_MCLOSE4:
  case NFA_MCLOSE5:
  case NFA_MCLOSE6:
  case NFA_MCLOSE7:
  case NFA_MCLOSE8:
  case NFA_MCLOSE9:
    STRCPY(code, "NFA_MCLOSE(x)");
    code[11] = c - NFA_MCLOSE + '0';
    break;
  case NFA_ZOPEN:
  case NFA_ZOPEN1:
  case NFA_ZOPEN2:
  case NFA_ZOPEN3:
  case NFA_ZOPEN4:
  case NFA_ZOPEN5:
  case NFA_ZOPEN6:
  case NFA_ZOPEN7:
  case NFA_ZOPEN8:
  case NFA_ZOPEN9:
    STRCPY(code, "NFA_ZOPEN(x)");
    code[10] = c - NFA_ZOPEN + '0';
    break;
  case NFA_ZCLOSE:
  case NFA_ZCLOSE1:
  case NFA_ZCLOSE2:
  case NFA_ZCLOSE3:
  case NFA_ZCLOSE4:
  case NFA_ZCLOSE5:
  case NFA_ZCLOSE6:
  case NFA_ZCLOSE7:
  case NFA_ZCLOSE8:
  case NFA_ZCLOSE9:
    STRCPY(code, "NFA_ZCLOSE(x)");
    code[11] = c - NFA_ZCLOSE + '0';
    break;
  case NFA_EOL:           STRCPY(code, "NFA_EOL "); break;
  case NFA_BOL:           STRCPY(code, "NFA_BOL "); break;
  case NFA_EOW:           STRCPY(code, "NFA_EOW "); break;
  case NFA_BOW:           STRCPY(code, "NFA_BOW "); break;
  case NFA_EOF:           STRCPY(code, "NFA_EOF "); break;
  case NFA_BOF:           STRCPY(code, "NFA_BOF "); break;
  case NFA_LNUM:          STRCPY(code, "NFA_LNUM "); break;
  case NFA_LNUM_GT:       STRCPY(code, "NFA_LNUM_GT "); break;
  case NFA_LNUM_LT:       STRCPY(code, "NFA_LNUM_LT "); break;
  case NFA_COL:           STRCPY(code, "NFA_COL "); break;
  case NFA_COL_GT:        STRCPY(code, "NFA_COL_GT "); break;
  case NFA_COL_LT:        STRCPY(code, "NFA_COL_LT "); break;
  case NFA_VCOL:          STRCPY(code, "NFA_VCOL "); break;
  case NFA_VCOL_GT:       STRCPY(code, "NFA_VCOL_GT "); break;
  case NFA_VCOL_LT:       STRCPY(code, "NFA_VCOL_LT "); break;
  case NFA_MARK:          STRCPY(code, "NFA_MARK "); break;
  case NFA_MARK_GT:       STRCPY(code, "NFA_MARK_GT "); break;
  case NFA_MARK_LT:       STRCPY(code, "NFA_MARK_LT "); break;
  case NFA_CURSOR:        STRCPY(code, "NFA_CURSOR "); break;
  case NFA_VISUAL:        STRCPY(code, "NFA_VISUAL "); break;
  case NFA_ANY_COMPOSING: STRCPY(code, "NFA_ANY_COMPOSING "); break;

  case NFA_STAR:            STRCPY(code, "NFA_STAR "); break;
  case NFA_STAR_NONGREEDY:  STRCPY(code, "NFA_STAR_NONGREEDY "); break;
  case NFA_QUEST:           STRCPY(code, "NFA_QUEST"); break;
  case NFA_QUEST_NONGREEDY: STRCPY(code, "NFA_QUEST_NON_GREEDY"); break;
  case NFA_EMPTY:           STRCPY(code, "NFA_EMPTY"); break;
  case NFA_OR:              STRCPY(code, "NFA_OR"); break;

  case NFA_START_COLL:      STRCPY(code, "NFA_START_COLL"); break;
  case NFA_END_COLL:        STRCPY(code, "NFA_END_COLL"); break;
  case NFA_START_NEG_COLL:  STRCPY(code, "NFA_START_NEG_COLL"); break;
  case NFA_END_NEG_COLL:    STRCPY(code, "NFA_END_NEG_COLL"); break;
  case NFA_RANGE:           STRCPY(code, "NFA_RANGE"); break;
  case NFA_RANGE_MIN:       STRCPY(code, "NFA_RANGE_MIN"); break;
  case NFA_RANGE_MAX:       STRCPY(code, "NFA_RANGE_MAX"); break;

  case NFA_CLASS_ALNUM:     STRCPY(code, "NFA_CLASS_ALNUM"); break;
  case NFA_CLASS_ALPHA:     STRCPY(code, "NFA_CLASS_ALPHA"); break;
  case NFA_CLASS_BLANK:     STRCPY(code, "NFA_CLASS_BLANK"); break;
  case NFA_CLASS_CNTRL:     STRCPY(code, "NFA_CLASS_CNTRL"); break;
  case NFA_CLASS_DIGIT:     STRCPY(code, "NFA_CLASS_DIGIT"); break;
  case NFA_CLASS_GRAPH:     STRCPY(code, "NFA_CLASS_GRAPH"); break;
  case NFA_CLASS_LOWER:     STRCPY(code, "NFA_CLASS_LOWER"); break;
  case NFA_CLASS_PRINT:     STRCPY(code, "NFA_CLASS_PRINT"); break;
  case NFA_CLASS_PUNCT:     STRCPY(code, "NFA_CLASS_PUNCT"); break;
  case NFA_CLASS_SPACE:     STRCPY(code, "NFA_CLASS_SPACE"); break;
  case NFA_CLASS_UPPER:     STRCPY(code, "NFA_CLASS_UPPER"); break;
  case NFA_CLASS_XDIGIT:    STRCPY(code, "NFA_CLASS_XDIGIT"); break;
  case NFA_CLASS_TAB:       STRCPY(code, "NFA_CLASS_TAB"); break;
  case NFA_CLASS_RETURN:    STRCPY(code, "NFA_CLASS_RETURN"); break;
  case NFA_CLASS_BACKSPACE: STRCPY(code, "NFA_CLASS_BACKSPACE"); break;
  case NFA_CLASS_ESCAPE:    STRCPY(code, "NFA_CLASS_ESCAPE"); break;

  case NFA_ANY:       STRCPY(code, "NFA_ANY"); break;
  case NFA_IDENT:     STRCPY(code, "NFA_IDENT"); break;
  case NFA_SIDENT:    STRCPY(code, "NFA_SIDENT"); break;
  case NFA_KWORD:     STRCPY(code, "NFA_KWORD"); break;
  case NFA_SKWORD:    STRCPY(code, "NFA_SKWORD"); break;
  case NFA_FNAME:     STRCPY(code, "NFA_FNAME"); break;
  case NFA_SFNAME:    STRCPY(code, "NFA_SFNAME"); break;
  case NFA_PRINT:     STRCPY(code, "NFA_PRINT"); break;
  case NFA_SPRINT:    STRCPY(code, "NFA_SPRINT"); break;
  case NFA_WHITE:     STRCPY(code, "NFA_WHITE"); break;
  case NFA_NWHITE:    STRCPY(code, "NFA_NWHITE"); break;
  case NFA_DIGIT:     STRCPY(code, "NFA_DIGIT"); break;
  case NFA_NDIGIT:    STRCPY(code, "NFA_NDIGIT"); break;
  case NFA_HEX:       STRCPY(code, "NFA_HEX"); break;
  case NFA_NHEX:      STRCPY(code, "NFA_NHEX"); break;
  case NFA_OCTAL:     STRCPY(code, "NFA_OCTAL"); break;
  case NFA_NOCTAL:    STRCPY(code, "NFA_NOCTAL"); break;
  case NFA_WORD:      STRCPY(code, "NFA_WORD"); break;
  case NFA_NWORD:     STRCPY(code, "NFA_NWORD"); break;
  case NFA_HEAD:      STRCPY(code, "NFA_HEAD"); break;
  case NFA_NHEAD:     STRCPY(code, "NFA_NHEAD"); break;
  case NFA_ALPHA:     STRCPY(code, "NFA_ALPHA"); break;
  case NFA_NALPHA:    STRCPY(code, "NFA_NALPHA"); break;
  case NFA_LOWER:     STRCPY(code, "NFA_LOWER"); break;
  case NFA_NLOWER:    STRCPY(code, "NFA_NLOWER"); break;
  case NFA_UPPER:     STRCPY(code, "NFA_UPPER"); break;
  case NFA_NUPPER:    STRCPY(code, "NFA_NUPPER"); break;
  case NFA_LOWER_IC:  STRCPY(code, "NFA_LOWER_IC"); break;
  case NFA_NLOWER_IC: STRCPY(code, "NFA_NLOWER_IC"); break;
  case NFA_UPPER_IC:  STRCPY(code, "NFA_UPPER_IC"); break;
  case NFA_NUPPER_IC: STRCPY(code, "NFA_NUPPER_IC"); break;

  default:
    // No dedicated name: show the value itself in place of the 'x'.  The int
    // is stored into a single char_u element of "code".
    STRCPY(code, "CHAR(x)");
    code[5] = c;
  }

  if (addnl == TRUE)
    STRCAT(code, " + NEWLINE ");

}
2469 | |
2470 | static FILE *log_fd; |
2471 | static char_u e_log_open_failed[] = N_( |
2472 | "Could not open temporary log file for writing, displaying on stderr... " ); |
2473 | |
2474 | /* |
2475 | * Print the postfix notation of the current regexp. |
2476 | */ |
2477 | static void nfa_postfix_dump(char_u *expr, int retval) |
2478 | { |
2479 | int *p; |
2480 | FILE *f; |
2481 | |
2482 | f = fopen(NFA_REGEXP_DUMP_LOG, "a" ); |
2483 | if (f != NULL) { |
2484 | fprintf(f, "\n-------------------------\n" ); |
2485 | if (retval == FAIL) { |
2486 | fprintf(f, ">>> NFA engine failed... \n" ); |
2487 | } else if (retval == OK) { |
2488 | fprintf(f, ">>> NFA engine succeeded !\n" ); |
2489 | } |
2490 | fprintf(f, "Regexp: \"%s\"\nPostfix notation (char): \"" , expr); |
2491 | for (p = post_start; *p && p < post_ptr; p++) { |
2492 | nfa_set_code(*p); |
2493 | fprintf(f, "%s, " , code); |
2494 | } |
2495 | fprintf(f, "\"\nPostfix notation (int): " ); |
2496 | for (p = post_start; *p && p < post_ptr; p++) |
2497 | fprintf(f, "%d " , *p); |
2498 | fprintf(f, "\n\n" ); |
2499 | fclose(f); |
2500 | } |
2501 | } |
2502 | |
2503 | /* |
2504 | * Print the NFA starting with a root node "state". |
2505 | */ |
2506 | static void nfa_print_state(FILE *debugf, nfa_state_T *state) |
2507 | { |
2508 | garray_T indent; |
2509 | |
2510 | ga_init(&indent, 1, 64); |
2511 | ga_append(&indent, '\0'); |
2512 | nfa_print_state2(debugf, state, &indent); |
2513 | ga_clear(&indent); |
2514 | } |
2515 | |
2516 | static void nfa_print_state2(FILE *debugf, nfa_state_T *state, garray_T *indent) |
2517 | { |
2518 | char_u *p; |
2519 | |
2520 | if (state == NULL) |
2521 | return; |
2522 | |
2523 | fprintf(debugf, "(%2d)" , abs(state->id)); |
2524 | |
2525 | /* Output indent */ |
2526 | p = (char_u *)indent->ga_data; |
2527 | if (indent->ga_len >= 3) { |
2528 | int last = indent->ga_len - 3; |
2529 | char_u save[2]; |
2530 | |
2531 | STRNCPY(save, &p[last], 2); |
2532 | STRNCPY(&p[last], "+-" , 2); |
2533 | fprintf(debugf, " %s" , p); |
2534 | STRNCPY(&p[last], save, 2); |
2535 | } else |
2536 | fprintf(debugf, " %s" , p); |
2537 | |
2538 | nfa_set_code(state->c); |
2539 | fprintf(debugf, "%s (%d) (id=%d) val=%d\n" , |
2540 | code, |
2541 | state->c, |
2542 | abs(state->id), |
2543 | state->val); |
2544 | if (state->id < 0) |
2545 | return; |
2546 | |
2547 | state->id = abs(state->id) * -1; |
2548 | |
2549 | /* grow indent for state->out */ |
2550 | indent->ga_len -= 1; |
2551 | if (state->out1) |
2552 | ga_concat(indent, (char_u *)"| " ); |
2553 | else |
2554 | ga_concat(indent, (char_u *)" " ); |
2555 | ga_append(indent, '\0'); |
2556 | |
2557 | nfa_print_state2(debugf, state->out, indent); |
2558 | |
2559 | /* replace last part of indent for state->out1 */ |
2560 | indent->ga_len -= 3; |
2561 | ga_concat(indent, (char_u *)" " ); |
2562 | ga_append(indent, '\0'); |
2563 | |
2564 | nfa_print_state2(debugf, state->out1, indent); |
2565 | |
2566 | /* shrink indent */ |
2567 | indent->ga_len -= 3; |
2568 | ga_append(indent, '\0'); |
2569 | } |
2570 | |
2571 | /* |
2572 | * Print the NFA state machine. |
2573 | */ |
2574 | static void nfa_dump(nfa_regprog_T *prog) |
2575 | { |
2576 | FILE *debugf = fopen(NFA_REGEXP_DUMP_LOG, "a" ); |
2577 | |
2578 | if (debugf != NULL) { |
2579 | nfa_print_state(debugf, prog->start); |
2580 | |
2581 | if (prog->reganch) |
2582 | fprintf(debugf, "reganch: %d\n" , prog->reganch); |
2583 | if (prog->regstart != NUL) |
2584 | fprintf(debugf, "regstart: %c (decimal: %d)\n" , |
2585 | prog->regstart, prog->regstart); |
2586 | if (prog->match_text != NULL) |
2587 | fprintf(debugf, "match_text: \"%s\"\n" , prog->match_text); |
2588 | |
2589 | fclose(debugf); |
2590 | } |
2591 | } |
2592 | #endif /* REGEXP_DEBUG */ |
2593 | |
2594 | /* |
2595 | * Parse r.e. @expr and convert it into postfix form. |
2596 | * Return the postfix string on success, NULL otherwise. |
2597 | */ |
2598 | static int *re2post(void) |
2599 | { |
2600 | if (nfa_reg(REG_NOPAREN) == FAIL) |
2601 | return NULL; |
2602 | EMIT(NFA_MOPEN); |
2603 | return post_start; |
2604 | } |
2605 | |
2606 | /* NB. Some of the code below is inspired by Russ's. */ |
2607 | |
2608 | /* |
2609 | * Represents an NFA state plus zero or one or two arrows exiting. |
2610 | * if c == MATCH, no arrows out; matching state. |
2611 | * If c == SPLIT, unlabeled arrows to out and out1 (if != NULL). |
2612 | * If c < 256, labeled arrow with character c to out. |
2613 | */ |
2614 | |
2615 | static nfa_state_T *state_ptr; /* points to nfa_prog->state */ |
2616 | |
2617 | /* |
2618 | * Allocate and initialize nfa_state_T. |
2619 | */ |
2620 | static nfa_state_T *alloc_state(int c, nfa_state_T *out, nfa_state_T *out1) |
2621 | { |
2622 | nfa_state_T *s; |
2623 | |
2624 | if (istate >= nstate) |
2625 | return NULL; |
2626 | |
2627 | s = &state_ptr[istate++]; |
2628 | |
2629 | s->c = c; |
2630 | s->out = out; |
2631 | s->out1 = out1; |
2632 | s->val = 0; |
2633 | |
2634 | s->id = istate; |
2635 | s->lastlist[0] = 0; |
2636 | s->lastlist[1] = 0; |
2637 | |
2638 | return s; |
2639 | } |
2640 | |
2641 | /* |
2642 | * A partially built NFA without the matching state filled in. |
2643 | * Frag_T.start points at the start state. |
2644 | * Frag_T.out is a list of places that need to be set to the |
2645 | * next state for this fragment. |
2646 | */ |
2647 | |
2648 | |
2649 | /* |
2650 | * Initialize a Frag_T struct and return it. |
2651 | */ |
2652 | static Frag_T frag(nfa_state_T *start, Ptrlist *out) |
2653 | { |
2654 | Frag_T n; |
2655 | |
2656 | n.start = start; |
2657 | n.out = out; |
2658 | return n; |
2659 | } |
2660 | |
2661 | /* |
2662 | * Create singleton list containing just outp. |
2663 | */ |
2664 | static Ptrlist *list1(nfa_state_T **outp) |
2665 | { |
2666 | Ptrlist *l; |
2667 | |
2668 | l = (Ptrlist *)outp; |
2669 | l->next = NULL; |
2670 | return l; |
2671 | } |
2672 | |
2673 | /* |
2674 | * Patch the list of states at out to point to start. |
2675 | */ |
2676 | static void patch(Ptrlist *l, nfa_state_T *s) |
2677 | { |
2678 | Ptrlist *next; |
2679 | |
2680 | for (; l; l = next) { |
2681 | next = l->next; |
2682 | l->s = s; |
2683 | } |
2684 | } |
2685 | |
2686 | |
2687 | /* |
2688 | * Join the two lists l1 and l2, returning the combination. |
2689 | */ |
2690 | static Ptrlist *append(Ptrlist *l1, Ptrlist *l2) |
2691 | { |
2692 | Ptrlist *oldl1; |
2693 | |
2694 | oldl1 = l1; |
2695 | while (l1->next) |
2696 | l1 = l1->next; |
2697 | l1->next = l2; |
2698 | return oldl1; |
2699 | } |
2700 | |
2701 | /* |
2702 | * Stack used for transforming postfix form into NFA. |
2703 | */ |
2704 | static Frag_T empty; |
2705 | |
/*
 * Report that popping the fragment stack failed.
 *
 * When NFA_REGEXP_ERROR_LOG is defined, the postfix form "postfix" .. "end"
 * and the part of it up to and including the current position "p" are
 * appended to that file: by name when REGEXP_DEBUG is defined, as integers
 * otherwise.  Error E874 is given in all cases.
 */
static void st_error(int *postfix, int *end, int *p)
{
#ifdef NFA_REGEXP_ERROR_LOG
  FILE *errlog = fopen(NFA_REGEXP_ERROR_LOG, "a");

  if (errlog != NULL) {
    int *item;

    fprintf(errlog, "Error popping the stack!\n");
# ifdef REGEXP_DEBUG
    fprintf(errlog, "Current regexp is \"%s\"\n", nfa_regengine.expr);
# endif
    fprintf(errlog, "Postfix form is: ");
# ifdef REGEXP_DEBUG
    // Symbolic names.
    item = postfix;
    while (item < end) {
      nfa_set_code(*item);
      fprintf(errlog, "%s, ", code);
      item++;
    }
    nfa_set_code(*p);
    fprintf(errlog, "\nCurrent position is: ");
    item = postfix;
    while (item <= p) {
      nfa_set_code(*item);
      fprintf(errlog, "%s, ", code);
      item++;
    }
# else
    // Raw integer values.
    item = postfix;
    while (item < end) {
      fprintf(errlog, "%d, ", *item);
      item++;
    }
    fprintf(errlog, "\nCurrent position is: ");
    item = postfix;
    while (item <= p) {
      fprintf(errlog, "%d, ", *item);
      item++;
    }
# endif
    fprintf(errlog, "\n--------------------------\n");
    fclose(errlog);
  }
#endif
  EMSG(_("E874: (NFA) Could not pop the stack!"));
}
2745 | |
2746 | /* |
2747 | * Push an item onto the stack. |
2748 | */ |
2749 | static void st_push(Frag_T s, Frag_T **p, Frag_T *stack_end) |
2750 | { |
2751 | Frag_T *stackp = *p; |
2752 | |
2753 | if (stackp >= stack_end) |
2754 | return; |
2755 | *stackp = s; |
2756 | *p = *p + 1; |
2757 | } |
2758 | |
2759 | /* |
2760 | * Pop an item from the stack. |
2761 | */ |
2762 | static Frag_T st_pop(Frag_T **p, Frag_T *stack) |
2763 | { |
2764 | Frag_T *stackp; |
2765 | |
2766 | *p = *p - 1; |
2767 | stackp = *p; |
2768 | if (stackp < stack) |
2769 | return empty; |
2770 | return **p; |
2771 | } |
2772 | |
/*
 * Estimate the maximum byte length of anything matching the states starting
 * at "startstate".
 * When unknown or unlimited return -1.
 *
 * The states are followed through "out", adding up a width per state, until
 * an NFA_END_INVISIBLE or NFA_END_INVISIBLE_NEG state is found.  For an
 * NFA_SPLIT both alternatives are measured recursively and the larger one is
 * used.
 *
 * "depth" is the recursion depth; the function calls itself with depth + 1
 * for every NFA_SPLIT and gives up with -1 once "depth" exceeds 4.
 */
static int nfa_max_width(nfa_state_T *startstate, int depth)
{
  int l, r;
  nfa_state_T *state = startstate;
  int len = 0;

  /* detect looping in a NFA_SPLIT */
  if (depth > 4)
    return -1;

  while (state != NULL) {
    switch (state->c) {
    case NFA_END_INVISIBLE:
    case NFA_END_INVISIBLE_NEG:
      /* the end, return what we have */
      return len;

    case NFA_SPLIT:
      /* two alternatives, use the maximum */
      l = nfa_max_width(state->out, depth + 1);
      r = nfa_max_width(state->out1, depth + 1);
      /* either alternative unknown makes the whole result unknown */
      if (l < 0 || r < 0)
        return -1;
      return len + (l > r ? l : r);

    case NFA_ANY:
    case NFA_START_COLL:
    case NFA_START_NEG_COLL:
      // Matches some character, including composing chars.
      len += MB_MAXBYTES;
      if (state->c != NFA_ANY) {
        // Skip over the characters.
        // Continues after the state that "out1" points to, instead of
        // following "out" through the items of the collection.
        state = state->out1->out;
        continue;
      }
      break;

    case NFA_DIGIT:
    case NFA_WHITE:
    case NFA_HEX:
    case NFA_OCTAL:
      /* ascii */
      ++len;
      break;

    case NFA_IDENT:
    case NFA_SIDENT:
    case NFA_KWORD:
    case NFA_SKWORD:
    case NFA_FNAME:
    case NFA_SFNAME:
    case NFA_PRINT:
    case NFA_SPRINT:
    case NFA_NWHITE:
    case NFA_NDIGIT:
    case NFA_NHEX:
    case NFA_NOCTAL:
    case NFA_WORD:
    case NFA_NWORD:
    case NFA_HEAD:
    case NFA_NHEAD:
    case NFA_ALPHA:
    case NFA_NALPHA:
    case NFA_LOWER:
    case NFA_NLOWER:
    case NFA_UPPER:
    case NFA_NUPPER:
    case NFA_LOWER_IC:
    case NFA_NLOWER_IC:
    case NFA_UPPER_IC:
    case NFA_NUPPER_IC:
    case NFA_ANY_COMPOSING:
      /* possibly non-ascii */
      /* counted as 3 bytes when "has_mbyte" is set, 1 byte otherwise */
      if (has_mbyte)
        len += 3;
      else
        ++len;
      break;

    case NFA_START_INVISIBLE:
    case NFA_START_INVISIBLE_NEG:
    case NFA_START_INVISIBLE_BEFORE:
    case NFA_START_INVISIBLE_BEFORE_NEG:
      /* zero-width, out1 points to the END state */
      state = state->out1->out;
      continue;

    case NFA_BACKREF1:
    case NFA_BACKREF2:
    case NFA_BACKREF3:
    case NFA_BACKREF4:
    case NFA_BACKREF5:
    case NFA_BACKREF6:
    case NFA_BACKREF7:
    case NFA_BACKREF8:
    case NFA_BACKREF9:
    case NFA_ZREF1:
    case NFA_ZREF2:
    case NFA_ZREF3:
    case NFA_ZREF4:
    case NFA_ZREF5:
    case NFA_ZREF6:
    case NFA_ZREF7:
    case NFA_ZREF8:
    case NFA_ZREF9:
    case NFA_NEWL:
    case NFA_SKIP:
      /* unknown width */
      return -1;

    case NFA_BOL:
    case NFA_EOL:
    case NFA_BOF:
    case NFA_EOF:
    case NFA_BOW:
    case NFA_EOW:
    case NFA_MOPEN:
    case NFA_MOPEN1:
    case NFA_MOPEN2:
    case NFA_MOPEN3:
    case NFA_MOPEN4:
    case NFA_MOPEN5:
    case NFA_MOPEN6:
    case NFA_MOPEN7:
    case NFA_MOPEN8:
    case NFA_MOPEN9:
    case NFA_ZOPEN:
    case NFA_ZOPEN1:
    case NFA_ZOPEN2:
    case NFA_ZOPEN3:
    case NFA_ZOPEN4:
    case NFA_ZOPEN5:
    case NFA_ZOPEN6:
    case NFA_ZOPEN7:
    case NFA_ZOPEN8:
    case NFA_ZOPEN9:
    case NFA_ZCLOSE:
    case NFA_ZCLOSE1:
    case NFA_ZCLOSE2:
    case NFA_ZCLOSE3:
    case NFA_ZCLOSE4:
    case NFA_ZCLOSE5:
    case NFA_ZCLOSE6:
    case NFA_ZCLOSE7:
    case NFA_ZCLOSE8:
    case NFA_ZCLOSE9:
    case NFA_MCLOSE:
    case NFA_MCLOSE1:
    case NFA_MCLOSE2:
    case NFA_MCLOSE3:
    case NFA_MCLOSE4:
    case NFA_MCLOSE5:
    case NFA_MCLOSE6:
    case NFA_MCLOSE7:
    case NFA_MCLOSE8:
    case NFA_MCLOSE9:
    case NFA_NOPEN:
    case NFA_NCLOSE:

    case NFA_LNUM_GT:
    case NFA_LNUM_LT:
    case NFA_COL_GT:
    case NFA_COL_LT:
    case NFA_VCOL_GT:
    case NFA_VCOL_LT:
    case NFA_MARK_GT:
    case NFA_MARK_LT:
    case NFA_VISUAL:
    case NFA_LNUM:
    case NFA_CURSOR:
    case NFA_COL:
    case NFA_VCOL:
    case NFA_MARK:

    case NFA_ZSTART:
    case NFA_ZEND:
    case NFA_OPT_CHARS:
    case NFA_EMPTY:
    case NFA_START_PATTERN:
    case NFA_END_PATTERN:
    case NFA_COMPOSING:
    case NFA_END_COMPOSING:
      /* zero-width */
      break;

    default:
      if (state->c < 0)
        /* don't know what this is */
        return -1;
      /* normal character */
      len += MB_CHAR2LEN(state->c);
      break;
    }

    /* normal way to continue */
    state = state->out;
  }

  /* unrecognized, "cannot happen" */
  /* reached when the chain ends in NULL without an END_INVISIBLE state */
  return -1;
}
2978 | |
/*
 * Convert a postfix form into its equivalent NFA.
 *
 * "postfix" points to the first postfix item and "end" just past the last
 * one.  The function works in two modes, selected by "nfa_calc_size":
 * - When it is TRUE no states are built.  Each postfix item only adds the
 *   number of states it will need to the file-level "nstate" counter, plus
 *   one at the end for the match state.  The return value is NULL then and
 *   is meant to be ignored.
 * - Otherwise a fragment stack with room for "nstate + 1" entries is
 *   allocated, every postfix item is turned into NFA states that are linked
 *   together through that stack, and a final NFA_MATCH state is appended.
 *
 * Return the NFA start state on success, NULL otherwise (NULL "postfix",
 * stack underflow, state allocation failure, leftover fragments on the
 * stack, or not enough room for the match state).
 */
static nfa_state_T *post2nfa(int *postfix, int *end, int nfa_calc_size)
{
  int *p;
  int mopen;
  int mclose;
  Frag_T *stack = NULL;
  Frag_T *stackp = NULL;
  Frag_T *stack_end = NULL;
  Frag_T e1;
  Frag_T e2;
  Frag_T e;
  nfa_state_T *s;
  nfa_state_T *s1;
  nfa_state_T *matchstate;
  nfa_state_T *ret = NULL;

  if (postfix == NULL)
    return NULL;

  // PUSH() and POP() operate on the local fragment stack.
  // NOTE: POP() is not an expression.  In "x = POP();" the st_pop() result is
  // assigned first, then the appended "if" checks for stack underflow; on
  // underflow it reports through st_error(), frees the stack and returns NULL
  // from post2nfa() itself.
#define PUSH(s) st_push((s), &stackp, stack_end)
#define POP() st_pop(&stackp, stack); \
  if (stackp < stack) { \
    st_error(postfix, end, p); \
    xfree(stack); \
    return NULL; \
  }

  if (nfa_calc_size == false) {
    // Allocate space for the stack. Max states on the stack: "nstate".
    stack = xmalloc((nstate + 1) * sizeof(Frag_T));
    stackp = stack;
    stack_end = stack + (nstate + 1);
  }

  for (p = postfix; p < end; ++p) {
    switch (*p) {
    case NFA_CONCAT:
      /* Concatenation.
       * Pay attention: this operator does not exist in the r.e. itself
       * (it is implicit, really).  It is added when r.e. is translated
       * to postfix form in re2post(). */
      if (nfa_calc_size == TRUE) {
        /* nstate += 0; */
        break;
      }
      e2 = POP();
      e1 = POP();
      patch(e1.out, e2.start);
      PUSH(frag(e1.start, e2.out));
      break;

    case NFA_OR:
      /* Alternation */
      if (nfa_calc_size == TRUE) {
        nstate++;
        break;
      }
      e2 = POP();
      e1 = POP();
      s = alloc_state(NFA_SPLIT, e1.start, e2.start);
      if (s == NULL)
        goto theend;
      PUSH(frag(s, append(e1.out, e2.out)));
      break;

    case NFA_STAR:
      /* Zero or more, prefer more */
      if (nfa_calc_size == TRUE) {
        nstate++;
        break;
      }
      e = POP();
      s = alloc_state(NFA_SPLIT, e.start, NULL);
      if (s == NULL)
        goto theend;
      patch(e.out, s);
      PUSH(frag(s, list1(&s->out1)));
      break;

    case NFA_STAR_NONGREEDY:
      /* Zero or more, prefer zero */
      if (nfa_calc_size == TRUE) {
        nstate++;
        break;
      }
      e = POP();
      s = alloc_state(NFA_SPLIT, NULL, e.start);
      if (s == NULL)
        goto theend;
      patch(e.out, s);
      PUSH(frag(s, list1(&s->out)));
      break;

    case NFA_QUEST:
      /* one or zero atoms=> greedy match */
      if (nfa_calc_size == TRUE) {
        nstate++;
        break;
      }
      e = POP();
      s = alloc_state(NFA_SPLIT, e.start, NULL);
      if (s == NULL)
        goto theend;
      PUSH(frag(s, append(e.out, list1(&s->out1))));
      break;

    case NFA_QUEST_NONGREEDY:
      /* zero or one atoms => non-greedy match */
      if (nfa_calc_size == TRUE) {
        nstate++;
        break;
      }
      e = POP();
      s = alloc_state(NFA_SPLIT, NULL, e.start);
      if (s == NULL)
        goto theend;
      PUSH(frag(s, append(e.out, list1(&s->out))));
      break;

    case NFA_END_COLL:
    case NFA_END_NEG_COLL:
      /* On the stack is the sequence starting with NFA_START_COLL or
       * NFA_START_NEG_COLL and all possible characters. Patch it to
       * add the output to the start. */
      if (nfa_calc_size == TRUE) {
        nstate++;
        break;
      }
      e = POP();
      // Note: the end state always gets code NFA_END_COLL, also when the
      // postfix item is NFA_END_NEG_COLL.
      s = alloc_state(NFA_END_COLL, NULL, NULL);
      if (s == NULL)
        goto theend;
      patch(e.out, s);
      e.start->out1 = s;
      PUSH(frag(e.start, list1(&s->out)));
      break;

    case NFA_RANGE:
      /* Before this are two characters, the low and high end of a
       * range.  Turn them into two states with MIN and MAX. */
      if (nfa_calc_size == TRUE) {
        /* nstate += 0; */
        break;
      }
      e2 = POP();
      e1 = POP();
      // The character moves into "val"; "c" becomes the range marker.
      e2.start->val = e2.start->c;
      e2.start->c = NFA_RANGE_MAX;
      e1.start->val = e1.start->c;
      e1.start->c = NFA_RANGE_MIN;
      patch(e1.out, e2.start);
      PUSH(frag(e1.start, e2.out));
      break;

    case NFA_EMPTY:
      /* 0-length, used in a repetition with max/min count of 0 */
      if (nfa_calc_size == TRUE) {
        nstate++;
        break;
      }
      s = alloc_state(NFA_EMPTY, NULL, NULL);
      if (s == NULL)
        goto theend;
      PUSH(frag(s, list1(&s->out)));
      break;

    case NFA_OPT_CHARS:
    {
      int n;

      /* \%[abc] implemented as:
       *    NFA_SPLIT
       *    +-CHAR(a)
       *    | +-NFA_SPLIT
       *    |   +-CHAR(b)
       *    |   | +-NFA_SPLIT
       *    |   |   +-CHAR(c)
       *    |   |   | +-next
       *    |   |   +- next
       *    |   +- next
       *    +- next
       */
      n = *++p;             /* get number of characters */
      if (nfa_calc_size == TRUE) {
        nstate += n;
        break;
      }
      s = NULL;             /* avoid compiler warning */
      e1.out = NULL;        /* stores list with out1's */
      s1 = NULL;            /* previous NFA_SPLIT to connect to */
      // Characters come off the stack last-first, so the chain of splits is
      // built from the innermost one outwards.
      while (n-- > 0) {
        e = POP();          /* get character */
        s = alloc_state(NFA_SPLIT, e.start, NULL);
        if (s == NULL)
          goto theend;
        if (e1.out == NULL)
          e1 = e;
        patch(e.out, s1);
        append(e1.out, list1(&s->out1));
        s1 = s;
      }
      PUSH(frag(s, e1.out));
      break;
    }

    case NFA_PREV_ATOM_NO_WIDTH:
    case NFA_PREV_ATOM_NO_WIDTH_NEG:
    case NFA_PREV_ATOM_JUST_BEFORE:
    case NFA_PREV_ATOM_JUST_BEFORE_NEG:
    case NFA_PREV_ATOM_LIKE_PATTERN:
    {
      int before = (*p == NFA_PREV_ATOM_JUST_BEFORE
                    || *p == NFA_PREV_ATOM_JUST_BEFORE_NEG);
      int pattern = (*p == NFA_PREV_ATOM_LIKE_PATTERN);
      int start_state;
      int end_state;
      int n = 0;
      nfa_state_T *zend;
      nfa_state_T *skip;

      // Pick the state codes that will bracket the preceding atom.
      switch (*p) {
      case NFA_PREV_ATOM_NO_WIDTH:
        start_state = NFA_START_INVISIBLE;
        end_state = NFA_END_INVISIBLE;
        break;
      case NFA_PREV_ATOM_NO_WIDTH_NEG:
        start_state = NFA_START_INVISIBLE_NEG;
        end_state = NFA_END_INVISIBLE_NEG;
        break;
      case NFA_PREV_ATOM_JUST_BEFORE:
        start_state = NFA_START_INVISIBLE_BEFORE;
        end_state = NFA_END_INVISIBLE;
        break;
      case NFA_PREV_ATOM_JUST_BEFORE_NEG:
        start_state = NFA_START_INVISIBLE_BEFORE_NEG;
        end_state = NFA_END_INVISIBLE_NEG;
        break;
      default:       /* NFA_PREV_ATOM_LIKE_PATTERN: */
        start_state = NFA_START_PATTERN;
        end_state = NFA_END_PATTERN;
        break;
      }

      // The "before" items carry one extra postfix slot; it must be consumed
      // in both modes so that "p" stays in sync.
      if (before)
        n = *++p;         /* get the count */

      /* The \@= operator: match the preceding atom with zero width.
       * The \@! operator: no match for the preceding atom.
       * The \@<= operator: match for the preceding atom.
       * The \@<! operator: no match for the preceding atom.
       * Surrounds the preceding atom with START_INVISIBLE and
       * END_INVISIBLE, similarly to MOPEN. */

      if (nfa_calc_size == TRUE) {
        nstate += pattern ? 4 : 2;
        break;
      }
      e = POP();
      s1 = alloc_state(end_state, NULL, NULL);
      if (s1 == NULL)
        goto theend;

      s = alloc_state(start_state, e.start, s1);
      if (s == NULL)
        goto theend;
      if (pattern) {
        /* NFA_ZEND -> NFA_END_PATTERN -> NFA_SKIP -> what follows. */
        skip = alloc_state(NFA_SKIP, NULL, NULL);
        if (skip == NULL) {
          goto theend;
        }
        zend = alloc_state(NFA_ZEND, s1, NULL);
        if (zend == NULL) {
          goto theend;
        }
        s1->out= skip;
        patch(e.out, zend);
        PUSH(frag(s, list1(&skip->out)));
      } else {
        patch(e.out, s1);
        PUSH(frag(s, list1(&s1->out)));
        if (before) {
          if (n <= 0)
            /* See if we can guess the maximum width, it avoids a
             * lot of pointless tries. */
            n = nfa_max_width(e.start, 0);
          s->val = n;           /* store the count */
        }
      }
      break;
    }

    case NFA_COMPOSING:  // char with composing char
      FALLTHROUGH;

    case NFA_MOPEN:     /* \( \) Submatch */
    case NFA_MOPEN1:
    case NFA_MOPEN2:
    case NFA_MOPEN3:
    case NFA_MOPEN4:
    case NFA_MOPEN5:
    case NFA_MOPEN6:
    case NFA_MOPEN7:
    case NFA_MOPEN8:
    case NFA_MOPEN9:
    case NFA_ZOPEN:     /* \z( \) Submatch */
    case NFA_ZOPEN1:
    case NFA_ZOPEN2:
    case NFA_ZOPEN3:
    case NFA_ZOPEN4:
    case NFA_ZOPEN5:
    case NFA_ZOPEN6:
    case NFA_ZOPEN7:
    case NFA_ZOPEN8:
    case NFA_ZOPEN9:
    case NFA_NOPEN:     /* \%( \) "Invisible Submatch" */
      if (nfa_calc_size == TRUE) {
        nstate += 2;
        break;
      }

      // Find the closing code that pairs with this opening code.
      mopen = *p;
      switch (*p) {
      case NFA_NOPEN: mclose = NFA_NCLOSE; break;
      case NFA_ZOPEN: mclose = NFA_ZCLOSE; break;
      case NFA_ZOPEN1: mclose = NFA_ZCLOSE1; break;
      case NFA_ZOPEN2: mclose = NFA_ZCLOSE2; break;
      case NFA_ZOPEN3: mclose = NFA_ZCLOSE3; break;
      case NFA_ZOPEN4: mclose = NFA_ZCLOSE4; break;
      case NFA_ZOPEN5: mclose = NFA_ZCLOSE5; break;
      case NFA_ZOPEN6: mclose = NFA_ZCLOSE6; break;
      case NFA_ZOPEN7: mclose = NFA_ZCLOSE7; break;
      case NFA_ZOPEN8: mclose = NFA_ZCLOSE8; break;
      case NFA_ZOPEN9: mclose = NFA_ZCLOSE9; break;
      case NFA_COMPOSING: mclose = NFA_END_COMPOSING; break;
      default:
        /* NFA_MOPEN, NFA_MOPEN1 .. NFA_MOPEN9 */
        mclose = *p + NSUBEXP;
        break;
      }

      /* Allow "NFA_MOPEN" as a valid postfix representation for
       * the empty regexp "". In this case, the NFA will be
       * NFA_MOPEN -> NFA_MCLOSE. Note that this also allows
       * empty groups of parenthesis, and empty mbyte chars */
      if (stackp == stack) {
        s = alloc_state(mopen, NULL, NULL);
        if (s == NULL)
          goto theend;
        s1 = alloc_state(mclose, NULL, NULL);
        if (s1 == NULL)
          goto theend;
        patch(list1(&s->out), s1);
        PUSH(frag(s, list1(&s1->out)));
        break;
      }

      /* At least one node was emitted before NFA_MOPEN, so
       * at least one node will be between NFA_MOPEN and NFA_MCLOSE */
      e = POP();
      s = alloc_state(mopen, e.start, NULL);         /* `(' */
      if (s == NULL)
        goto theend;

      s1 = alloc_state(mclose, NULL, NULL);         /* `)' */
      if (s1 == NULL)
        goto theend;
      patch(e.out, s1);

      if (mopen == NFA_COMPOSING)
        /* COMPOSING->out1 = END_COMPOSING */
        patch(list1(&s->out1), s1);

      PUSH(frag(s, list1(&s1->out)));
      break;

    case NFA_BACKREF1:
    case NFA_BACKREF2:
    case NFA_BACKREF3:
    case NFA_BACKREF4:
    case NFA_BACKREF5:
    case NFA_BACKREF6:
    case NFA_BACKREF7:
    case NFA_BACKREF8:
    case NFA_BACKREF9:
    case NFA_ZREF1:
    case NFA_ZREF2:
    case NFA_ZREF3:
    case NFA_ZREF4:
    case NFA_ZREF5:
    case NFA_ZREF6:
    case NFA_ZREF7:
    case NFA_ZREF8:
    case NFA_ZREF9:
      // A back-reference becomes two states: the reference itself followed by
      // an NFA_SKIP state.
      if (nfa_calc_size == TRUE) {
        nstate += 2;
        break;
      }
      s = alloc_state(*p, NULL, NULL);
      if (s == NULL)
        goto theend;
      s1 = alloc_state(NFA_SKIP, NULL, NULL);
      if (s1 == NULL)
        goto theend;
      patch(list1(&s->out), s1);
      PUSH(frag(s, list1(&s1->out)));
      break;

    case NFA_LNUM:
    case NFA_LNUM_GT:
    case NFA_LNUM_LT:
    case NFA_VCOL:
    case NFA_VCOL_GT:
    case NFA_VCOL_LT:
    case NFA_COL:
    case NFA_COL_GT:
    case NFA_COL_LT:
    case NFA_MARK:
    case NFA_MARK_GT:
    case NFA_MARK_LT:
    {
      // These items occupy two postfix slots: the code and its argument.
      int n = *++p;       /* lnum, col or mark name */

      if (nfa_calc_size == TRUE) {
        nstate += 1;
        break;
      }
      // "p" was advanced above, the state code is in the previous slot.
      s = alloc_state(p[-1], NULL, NULL);
      if (s == NULL)
        goto theend;
      s->val = n;
      PUSH(frag(s, list1(&s->out)));
      break;
    }

    case NFA_ZSTART:
    case NFA_ZEND:
    default:
      /* Operands */
      if (nfa_calc_size == TRUE) {
        nstate++;
        break;
      }
      s = alloc_state(*p, NULL, NULL);
      if (s == NULL)
        goto theend;
      PUSH(frag(s, list1(&s->out)));
      break;

    }     /* switch(*p) */

  }   /* for (p = postfix; p < end; ++p) */

  if (nfa_calc_size == TRUE) {
    nstate++;           // one more for the NFA_MATCH state added below
    goto theend;        /* Return value when counting size is ignored anyway */
  }

  // Exactly one fragment, the whole pattern, must be left on the stack.
  e = POP();
  if (stackp != stack) {
    xfree(stack);
    EMSG_RET_NULL(_("E875: (NFA regexp) (While converting from postfix to NFA),"
                    "too many states left on stack" ));
  }

  if (istate >= nstate) {
    xfree(stack);
    EMSG_RET_NULL(_("E876: (NFA regexp) "
                    "Not enough space to store the whole NFA " ));
  }

  matchstate = &state_ptr[istate++];   /* the match state */
  matchstate->c = NFA_MATCH;
  matchstate->out = matchstate->out1 = NULL;
  matchstate->id = 0;

  patch(e.out, matchstate);
  ret = e.start;

theend:
  // "stack" is still NULL in size-calculation mode; it is passed to xfree()
  // as-is in that case.
  xfree(stack);
  return ret;

  // NOTE(review): POP1/PUSH1/POP2/PUSH2 are not defined anywhere in this
  // function, so their #undef lines look like leftovers; only POP and PUSH
  // are defined above.
#undef POP1
#undef PUSH1
#undef POP2
#undef PUSH2
#undef POP
#undef PUSH
}
3473 | |
3474 | /* |
3475 | * After building the NFA program, inspect it to add optimization hints. |
3476 | */ |
3477 | static void nfa_postprocess(nfa_regprog_T *prog) |
3478 | { |
3479 | int i; |
3480 | int c; |
3481 | |
3482 | for (i = 0; i < prog->nstate; ++i) { |
3483 | c = prog->state[i].c; |
3484 | if (c == NFA_START_INVISIBLE |
3485 | || c == NFA_START_INVISIBLE_NEG |
3486 | || c == NFA_START_INVISIBLE_BEFORE |
3487 | || c == NFA_START_INVISIBLE_BEFORE_NEG) { |
3488 | int directly; |
3489 | |
3490 | /* Do it directly when what follows is possibly the end of the |
3491 | * match. */ |
3492 | if (match_follows(prog->state[i].out1->out, 0)) |
3493 | directly = TRUE; |
3494 | else { |
3495 | int ch_invisible = failure_chance(prog->state[i].out, 0); |
3496 | int ch_follows = failure_chance(prog->state[i].out1->out, 0); |
3497 | |
3498 | /* Postpone when the invisible match is expensive or has a |
3499 | * lower chance of failing. */ |
3500 | if (c == NFA_START_INVISIBLE_BEFORE |
3501 | || c == NFA_START_INVISIBLE_BEFORE_NEG) { |
3502 | /* "before" matches are very expensive when |
3503 | * unbounded, always prefer what follows then, |
3504 | * unless what follows will always match. |
3505 | * Otherwise strongly prefer what follows. */ |
3506 | if (prog->state[i].val <= 0 && ch_follows > 0) |
3507 | directly = FALSE; |
3508 | else |
3509 | directly = ch_follows * 10 < ch_invisible; |
3510 | } else { |
3511 | /* normal invisible, first do the one with the |
3512 | * highest failure chance */ |
3513 | directly = ch_follows < ch_invisible; |
3514 | } |
3515 | } |
3516 | if (directly) |
3517 | /* switch to the _FIRST state */ |
3518 | ++prog->state[i].c; |
3519 | } |
3520 | } |
3521 | } |
3522 | |
3523 | /**************************************************************** |
3524 | * NFA execution code. |
3525 | ****************************************************************/ |
3526 | |
/* Values for the "result" field of nfa_pim_T (pim = postponed invisible
 * match). */
#define NFA_PIM_UNUSED   0      /* pim not used */
#define NFA_PIM_TODO     1      /* pim not done yet */
#define NFA_PIM_MATCH    2      /* pim executed, matches */
#define NFA_PIM_NOMATCH  3      /* pim executed, no match */
3532 | |
3533 | |
3534 | #ifdef REGEXP_DEBUG |
3535 | static void log_subsexpr(regsubs_T *subs) |
3536 | { |
3537 | log_subexpr(&subs->norm); |
3538 | if (nfa_has_zsubexpr) |
3539 | log_subexpr(&subs->synt); |
3540 | } |
3541 | |
3542 | static void log_subexpr(regsub_T *sub) |
3543 | { |
3544 | int j; |
3545 | |
3546 | for (j = 0; j < sub->in_use; j++) |
3547 | if (REG_MULTI) |
3548 | fprintf(log_fd, "*** group %d, start: c=%d, l=%d, end: c=%d, l=%d\n" , |
3549 | j, |
3550 | sub->list.multi[j].start_col, |
3551 | (int)sub->list.multi[j].start_lnum, |
3552 | sub->list.multi[j].end_col, |
3553 | (int)sub->list.multi[j].end_lnum); |
3554 | else { |
3555 | char *s = (char *)sub->list.line[j].start; |
3556 | char *e = (char *)sub->list.line[j].end; |
3557 | |
3558 | fprintf(log_fd, "*** group %d, start: \"%s\", end: \"%s\"\n" , |
3559 | j, |
3560 | s == NULL ? "NULL" : s, |
3561 | e == NULL ? "NULL" : e); |
3562 | } |
3563 | } |
3564 | |
3565 | static char *pim_info(nfa_pim_T *pim) |
3566 | { |
3567 | static char buf[30]; |
3568 | |
3569 | if (pim == NULL || pim->result == NFA_PIM_UNUSED) |
3570 | buf[0] = NUL; |
3571 | else { |
3572 | sprintf(buf, " PIM col %d" , REG_MULTI ? (int)pim->end.pos.col |
3573 | : (int)(pim->end.ptr - reginput)); |
3574 | } |
3575 | return buf; |
3576 | } |
3577 | |
3578 | #endif |
3579 | |
// State shared by the NFA execution code.

// Whether a match has been found.
static int nfa_match;
// NOTE(review): the next three are not used in the part of the file next to
// these declarations; judging by the names they implement a time limit for
// matching (deadline, "timed out" flag, call counter) - confirm against the
// matching functions before relying on this.
static proftime_T  *nfa_time_limit;
static int         *nfa_timed_out;
static int         nfa_time_count;
3585 | |
3586 | // Copy postponed invisible match info from "from" to "to". |
3587 | static void copy_pim(nfa_pim_T *to, nfa_pim_T *from) |
3588 | { |
3589 | to->result = from->result; |
3590 | to->state = from->state; |
3591 | copy_sub(&to->subs.norm, &from->subs.norm); |
3592 | if (nfa_has_zsubexpr) |
3593 | copy_sub(&to->subs.synt, &from->subs.synt); |
3594 | to->end = from->end; |
3595 | } |
3596 | |
3597 | static void clear_sub(regsub_T *sub) |
3598 | { |
3599 | if (REG_MULTI) |
3600 | /* Use 0xff to set lnum to -1 */ |
3601 | memset(sub->list.multi, 0xff, |
3602 | sizeof(struct multipos) * nfa_nsubexpr); |
3603 | else |
3604 | memset(sub->list.line, 0, sizeof(struct linepos) * nfa_nsubexpr); |
3605 | sub->in_use = 0; |
3606 | } |
3607 | |
3608 | /* |
3609 | * Copy the submatches from "from" to "to". |
3610 | */ |
3611 | static void copy_sub(regsub_T *to, regsub_T *from) |
3612 | { |
3613 | to->in_use = from->in_use; |
3614 | if (from->in_use > 0) { |
3615 | /* Copy the match start and end positions. */ |
3616 | if (REG_MULTI) |
3617 | memmove(&to->list.multi[0], |
3618 | &from->list.multi[0], |
3619 | sizeof(struct multipos) * from->in_use); |
3620 | else |
3621 | memmove(&to->list.line[0], |
3622 | &from->list.line[0], |
3623 | sizeof(struct linepos) * from->in_use); |
3624 | } |
3625 | } |
3626 | |
3627 | /* |
3628 | * Like copy_sub() but exclude the main match. |
3629 | */ |
3630 | static void copy_sub_off(regsub_T *to, regsub_T *from) |
3631 | { |
3632 | if (to->in_use < from->in_use) |
3633 | to->in_use = from->in_use; |
3634 | if (from->in_use > 1) { |
3635 | /* Copy the match start and end positions. */ |
3636 | if (REG_MULTI) |
3637 | memmove(&to->list.multi[1], |
3638 | &from->list.multi[1], |
3639 | sizeof(struct multipos) * (from->in_use - 1)); |
3640 | else |
3641 | memmove(&to->list.line[1], |
3642 | &from->list.line[1], |
3643 | sizeof(struct linepos) * (from->in_use - 1)); |
3644 | } |
3645 | } |
3646 | |
3647 | /* |
3648 | * Like copy_sub() but only do the end of the main match if \ze is present. |
3649 | */ |
3650 | static void copy_ze_off(regsub_T *to, regsub_T *from) |
3651 | { |
3652 | if (nfa_has_zend) { |
3653 | if (REG_MULTI) { |
3654 | if (from->list.multi[0].end_lnum >= 0){ |
3655 | to->list.multi[0].end_lnum = from->list.multi[0].end_lnum; |
3656 | to->list.multi[0].end_col = from->list.multi[0].end_col; |
3657 | } |
3658 | } else { |
3659 | if (from->list.line[0].end != NULL) |
3660 | to->list.line[0].end = from->list.line[0].end; |
3661 | } |
3662 | } |
3663 | } |
3664 | |
3665 | // Return TRUE if "sub1" and "sub2" have the same start positions. |
3666 | // When using back-references also check the end position. |
3667 | static int sub_equal(regsub_T *sub1, regsub_T *sub2) |
3668 | { |
3669 | int i; |
3670 | int todo; |
3671 | linenr_T s1; |
3672 | linenr_T s2; |
3673 | char_u *sp1; |
3674 | char_u *sp2; |
3675 | |
3676 | todo = sub1->in_use > sub2->in_use ? sub1->in_use : sub2->in_use; |
3677 | if (REG_MULTI) { |
3678 | for (i = 0; i < todo; ++i) { |
3679 | if (i < sub1->in_use) |
3680 | s1 = sub1->list.multi[i].start_lnum; |
3681 | else |
3682 | s1 = -1; |
3683 | if (i < sub2->in_use) |
3684 | s2 = sub2->list.multi[i].start_lnum; |
3685 | else |
3686 | s2 = -1; |
3687 | if (s1 != s2) |
3688 | return FALSE; |
3689 | if (s1 != -1 && sub1->list.multi[i].start_col |
3690 | != sub2->list.multi[i].start_col) |
3691 | return FALSE; |
3692 | |
3693 | if (nfa_has_backref) { |
3694 | if (i < sub1->in_use) { |
3695 | s1 = sub1->list.multi[i].end_lnum; |
3696 | } else { |
3697 | s1 = -1; |
3698 | } |
3699 | if (i < sub2->in_use) { |
3700 | s2 = sub2->list.multi[i].end_lnum; |
3701 | } else { |
3702 | s2 = -1; |
3703 | } |
3704 | if (s1 != s2) { |
3705 | return FALSE; |
3706 | } |
3707 | if (s1 != -1 |
3708 | && sub1->list.multi[i].end_col != sub2->list.multi[i].end_col) { |
3709 | return FALSE; |
3710 | } |
3711 | } |
3712 | } |
3713 | } else { |
3714 | for (i = 0; i < todo; ++i) { |
3715 | if (i < sub1->in_use) |
3716 | sp1 = sub1->list.line[i].start; |
3717 | else |
3718 | sp1 = NULL; |
3719 | if (i < sub2->in_use) |
3720 | sp2 = sub2->list.line[i].start; |
3721 | else |
3722 | sp2 = NULL; |
3723 | if (sp1 != sp2) |
3724 | return FALSE; |
3725 | |
3726 | if (nfa_has_backref) { |
3727 | if (i < sub1->in_use) { |
3728 | sp1 = sub1->list.line[i].end; |
3729 | } else { |
3730 | sp1 = NULL; |
3731 | } |
3732 | if (i < sub2->in_use) { |
3733 | sp2 = sub2->list.line[i].end; |
3734 | } else { |
3735 | sp2 = NULL; |
3736 | } |
3737 | if (sp1 != sp2) { |
3738 | return FALSE; |
3739 | } |
3740 | } |
3741 | } |
3742 | } |
3743 | |
3744 | return TRUE; |
3745 | } |
3746 | |
3747 | #ifdef REGEXP_DEBUG |
3748 | static void report_state(char *action, |
3749 | regsub_T *sub, |
3750 | nfa_state_T *state, |
3751 | int lid, |
3752 | nfa_pim_T *pim) { |
3753 | int col; |
3754 | |
3755 | if (sub->in_use <= 0) |
3756 | col = -1; |
3757 | else if (REG_MULTI) |
3758 | col = sub->list.multi[0].start_col; |
3759 | else |
3760 | col = (int)(sub->list.line[0].start - regline); |
3761 | nfa_set_code(state->c); |
3762 | fprintf(log_fd, "> %s state %d to list %d. char %d: %s (start col %d)%s\n" , |
3763 | action, abs(state->id), lid, state->c, code, col, |
3764 | pim_info(pim)); |
3765 | } |
3766 | |
3767 | #endif |
3768 | |
3769 | /* |
3770 | * Return TRUE if the same state is already in list "l" with the same |
3771 | * positions as "subs". |
3772 | */ |
3773 | static int |
3774 | has_state_with_pos ( |
3775 | nfa_list_T *l, /* runtime state list */ |
3776 | nfa_state_T *state, /* state to update */ |
3777 | regsubs_T *subs, /* pointers to subexpressions */ |
3778 | nfa_pim_T *pim /* postponed match or NULL */ |
3779 | ) |
3780 | { |
3781 | nfa_thread_T *thread; |
3782 | int i; |
3783 | |
3784 | for (i = 0; i < l->n; ++i) { |
3785 | thread = &l->t[i]; |
3786 | if (thread->state->id == state->id |
3787 | && sub_equal(&thread->subs.norm, &subs->norm) |
3788 | && (!nfa_has_zsubexpr |
3789 | || sub_equal(&thread->subs.synt, &subs->synt)) |
3790 | && pim_equal(&thread->pim, pim)) |
3791 | return TRUE; |
3792 | } |
3793 | return FALSE; |
3794 | } |
3795 | |
3796 | /* |
3797 | * Return TRUE if "one" and "two" are equal. That includes when both are not |
3798 | * set. |
3799 | */ |
3800 | static int pim_equal(nfa_pim_T *one, nfa_pim_T *two) |
3801 | { |
3802 | int one_unused = (one == NULL || one->result == NFA_PIM_UNUSED); |
3803 | int two_unused = (two == NULL || two->result == NFA_PIM_UNUSED); |
3804 | |
3805 | if (one_unused) |
3806 | /* one is unused: equal when two is also unused */ |
3807 | return two_unused; |
3808 | if (two_unused) |
3809 | /* one is used and two is not: not equal */ |
3810 | return FALSE; |
3811 | /* compare the state id */ |
3812 | if (one->state->id != two->state->id) |
3813 | return FALSE; |
3814 | /* compare the position */ |
3815 | if (REG_MULTI) |
3816 | return one->end.pos.lnum == two->end.pos.lnum |
3817 | && one->end.pos.col == two->end.pos.col; |
3818 | return one->end.ptr == two->end.ptr; |
3819 | } |
3820 | |
3821 | /* |
3822 | * Return TRUE if "state" leads to a NFA_MATCH without advancing the input. |
3823 | */ |
3824 | static int match_follows(nfa_state_T *startstate, int depth) |
3825 | { |
3826 | nfa_state_T *state = startstate; |
3827 | |
3828 | /* avoid too much recursion */ |
3829 | if (depth > 10) |
3830 | return FALSE; |
3831 | |
3832 | while (state != NULL) { |
3833 | switch (state->c) { |
3834 | case NFA_MATCH: |
3835 | case NFA_MCLOSE: |
3836 | case NFA_END_INVISIBLE: |
3837 | case NFA_END_INVISIBLE_NEG: |
3838 | case NFA_END_PATTERN: |
3839 | return TRUE; |
3840 | |
3841 | case NFA_SPLIT: |
3842 | return match_follows(state->out, depth + 1) |
3843 | || match_follows(state->out1, depth + 1); |
3844 | |
3845 | case NFA_START_INVISIBLE: |
3846 | case NFA_START_INVISIBLE_FIRST: |
3847 | case NFA_START_INVISIBLE_BEFORE: |
3848 | case NFA_START_INVISIBLE_BEFORE_FIRST: |
3849 | case NFA_START_INVISIBLE_NEG: |
3850 | case NFA_START_INVISIBLE_NEG_FIRST: |
3851 | case NFA_START_INVISIBLE_BEFORE_NEG: |
3852 | case NFA_START_INVISIBLE_BEFORE_NEG_FIRST: |
3853 | case NFA_COMPOSING: |
3854 | /* skip ahead to next state */ |
3855 | state = state->out1->out; |
3856 | continue; |
3857 | |
3858 | case NFA_ANY: |
3859 | case NFA_ANY_COMPOSING: |
3860 | case NFA_IDENT: |
3861 | case NFA_SIDENT: |
3862 | case NFA_KWORD: |
3863 | case NFA_SKWORD: |
3864 | case NFA_FNAME: |
3865 | case NFA_SFNAME: |
3866 | case NFA_PRINT: |
3867 | case NFA_SPRINT: |
3868 | case NFA_WHITE: |
3869 | case NFA_NWHITE: |
3870 | case NFA_DIGIT: |
3871 | case NFA_NDIGIT: |
3872 | case NFA_HEX: |
3873 | case NFA_NHEX: |
3874 | case NFA_OCTAL: |
3875 | case NFA_NOCTAL: |
3876 | case NFA_WORD: |
3877 | case NFA_NWORD: |
3878 | case NFA_HEAD: |
3879 | case NFA_NHEAD: |
3880 | case NFA_ALPHA: |
3881 | case NFA_NALPHA: |
3882 | case NFA_LOWER: |
3883 | case NFA_NLOWER: |
3884 | case NFA_UPPER: |
3885 | case NFA_NUPPER: |
3886 | case NFA_LOWER_IC: |
3887 | case NFA_NLOWER_IC: |
3888 | case NFA_UPPER_IC: |
3889 | case NFA_NUPPER_IC: |
3890 | case NFA_START_COLL: |
3891 | case NFA_START_NEG_COLL: |
3892 | case NFA_NEWL: |
3893 | /* state will advance input */ |
3894 | return FALSE; |
3895 | |
3896 | default: |
3897 | if (state->c > 0) |
3898 | /* state will advance input */ |
3899 | return FALSE; |
3900 | |
3901 | /* Others: zero-width or possibly zero-width, might still find |
3902 | * a match at the same position, keep looking. */ |
3903 | break; |
3904 | } |
3905 | state = state->out; |
3906 | } |
3907 | return FALSE; |
3908 | } |
3909 | |
3910 | |
3911 | /* |
3912 | * Return TRUE if "state" is already in list "l". |
3913 | */ |
3914 | static int |
3915 | state_in_list ( |
3916 | nfa_list_T *l, /* runtime state list */ |
3917 | nfa_state_T *state, /* state to update */ |
3918 | regsubs_T *subs /* pointers to subexpressions */ |
3919 | ) |
3920 | { |
3921 | if (state->lastlist[nfa_ll_index] == l->id) { |
3922 | if (!nfa_has_backref || has_state_with_pos(l, state, subs, NULL)) |
3923 | return TRUE; |
3924 | } |
3925 | return FALSE; |
3926 | } |
3927 | |
// Offset used for "off" by addstate_here().
// addstate() treats an "off_arg" of -ADDSTATE_HERE_OFFSET or lower as the
// request to insert at a list position: it then uses offset 0 and takes
// -(off_arg + ADDSTATE_HERE_OFFSET) as the list index.
#define ADDSTATE_HERE_OFFSET 10
3930 | |
// Add "state" and possibly what follows to state list "l".
3932 | // Returns "subs_arg", possibly copied into temp_subs. |
3933 | // Returns NULL when recursiveness is too deep. |
3934 | static regsubs_T *addstate( |
3935 | nfa_list_T *l, // runtime state list |
3936 | nfa_state_T *state, // state to update |
3937 | regsubs_T *subs_arg, // pointers to subexpressions |
3938 | nfa_pim_T *pim, // postponed look-behind match |
3939 | int off_arg) // byte offset, when -1 go to next line |
3940 | FUNC_ATTR_NONNULL_ARG(1, 2) FUNC_ATTR_WARN_UNUSED_RESULT |
3941 | { |
3942 | int subidx; |
3943 | int off = off_arg; |
3944 | int add_here = FALSE; |
3945 | int listindex = 0; |
3946 | int k; |
3947 | int found = FALSE; |
3948 | nfa_thread_T *thread; |
3949 | struct multipos save_multipos; |
3950 | int save_in_use; |
3951 | char_u *save_ptr; |
3952 | int i; |
3953 | regsub_T *sub; |
3954 | regsubs_T *subs = subs_arg; |
3955 | static regsubs_T temp_subs; |
3956 | #ifdef REGEXP_DEBUG |
3957 | int did_print = FALSE; |
3958 | #endif |
3959 | static int depth = 0; |
3960 | |
3961 | // This function is called recursively. When the depth is too much we run |
3962 | // out of stack and crash, limit recursiveness here. |
3963 | if (++depth >= 5000 || subs == NULL) { |
3964 | depth--; |
3965 | return NULL; |
3966 | } |
3967 | |
3968 | if (off_arg <= -ADDSTATE_HERE_OFFSET) { |
3969 | add_here = true; |
3970 | off = 0; |
3971 | listindex = -(off_arg + ADDSTATE_HERE_OFFSET); |
3972 | } |
3973 | |
3974 | switch (state->c) { |
3975 | case NFA_NCLOSE: |
3976 | case NFA_MCLOSE: |
3977 | case NFA_MCLOSE1: |
3978 | case NFA_MCLOSE2: |
3979 | case NFA_MCLOSE3: |
3980 | case NFA_MCLOSE4: |
3981 | case NFA_MCLOSE5: |
3982 | case NFA_MCLOSE6: |
3983 | case NFA_MCLOSE7: |
3984 | case NFA_MCLOSE8: |
3985 | case NFA_MCLOSE9: |
3986 | case NFA_ZCLOSE: |
3987 | case NFA_ZCLOSE1: |
3988 | case NFA_ZCLOSE2: |
3989 | case NFA_ZCLOSE3: |
3990 | case NFA_ZCLOSE4: |
3991 | case NFA_ZCLOSE5: |
3992 | case NFA_ZCLOSE6: |
3993 | case NFA_ZCLOSE7: |
3994 | case NFA_ZCLOSE8: |
3995 | case NFA_ZCLOSE9: |
3996 | case NFA_MOPEN: |
3997 | case NFA_ZEND: |
3998 | case NFA_SPLIT: |
3999 | case NFA_EMPTY: |
4000 | /* These nodes are not added themselves but their "out" and/or |
4001 | * "out1" may be added below. */ |
4002 | break; |
4003 | |
4004 | case NFA_BOL: |
4005 | case NFA_BOF: |
4006 | /* "^" won't match past end-of-line, don't bother trying. |
4007 | * Except when at the end of the line, or when we are going to the |
4008 | * next line for a look-behind match. */ |
4009 | if (reginput > regline |
4010 | && *reginput != NUL |
4011 | && (nfa_endp == NULL |
4012 | || !REG_MULTI |
4013 | || reglnum == nfa_endp->se_u.pos.lnum)) |
4014 | goto skip_add; |
4015 | FALLTHROUGH; |
4016 | |
4017 | case NFA_MOPEN1: |
4018 | case NFA_MOPEN2: |
4019 | case NFA_MOPEN3: |
4020 | case NFA_MOPEN4: |
4021 | case NFA_MOPEN5: |
4022 | case NFA_MOPEN6: |
4023 | case NFA_MOPEN7: |
4024 | case NFA_MOPEN8: |
4025 | case NFA_MOPEN9: |
4026 | case NFA_ZOPEN: |
4027 | case NFA_ZOPEN1: |
4028 | case NFA_ZOPEN2: |
4029 | case NFA_ZOPEN3: |
4030 | case NFA_ZOPEN4: |
4031 | case NFA_ZOPEN5: |
4032 | case NFA_ZOPEN6: |
4033 | case NFA_ZOPEN7: |
4034 | case NFA_ZOPEN8: |
4035 | case NFA_ZOPEN9: |
4036 | case NFA_NOPEN: |
4037 | case NFA_ZSTART: |
4038 | /* These nodes need to be added so that we can bail out when it |
4039 | * was added to this list before at the same position to avoid an |
4040 | * endless loop for "\(\)*" */ |
4041 | |
4042 | default: |
4043 | if (state->lastlist[nfa_ll_index] == l->id && state->c != NFA_SKIP) { |
4044 | /* This state is already in the list, don't add it again, |
4045 | * unless it is an MOPEN that is used for a backreference or |
4046 | * when there is a PIM. For NFA_MATCH check the position, |
4047 | * lower position is preferred. */ |
4048 | if (!nfa_has_backref && pim == NULL && !l->has_pim |
4049 | && state->c != NFA_MATCH) { |
4050 | |
4051 | /* When called from addstate_here() do insert before |
4052 | * existing states. */ |
4053 | if (add_here) { |
4054 | for (k = 0; k < l->n && k < listindex; ++k) { |
4055 | if (l->t[k].state->id == state->id) { |
4056 | found = TRUE; |
4057 | break; |
4058 | } |
4059 | } |
4060 | } |
4061 | |
4062 | if (!add_here || found) { |
4063 | skip_add: |
4064 | #ifdef REGEXP_DEBUG |
4065 | nfa_set_code(state->c); |
4066 | fprintf(log_fd, "> Not adding state %d to list %d. char %d: %s pim: %s has_pim: %d found: %d\n" , |
4067 | abs(state->id), l->id, state->c, code, |
4068 | pim == NULL ? "NULL" : "yes" , l->has_pim, found); |
4069 | #endif |
4070 | depth--; |
4071 | return subs; |
4072 | } |
4073 | } |
4074 | |
4075 | /* Do not add the state again when it exists with the same |
4076 | * positions. */ |
4077 | if (has_state_with_pos(l, state, subs, pim)) |
4078 | goto skip_add; |
4079 | } |
4080 | |
4081 | // When there are backreferences or PIMs the number of states may |
4082 | // be (a lot) bigger than anticipated. |
4083 | if (l->n == l->len) { |
4084 | const int newlen = l->len * 3 / 2 + 50; |
4085 | const size_t newsize = newlen * sizeof(nfa_thread_T); |
4086 | |
4087 | if ((long)(newsize >> 10) >= p_mmp) { |
4088 | EMSG(_(e_maxmempat)); |
4089 | depth--; |
4090 | return NULL; |
4091 | } |
4092 | if (subs != &temp_subs) { |
4093 | /* "subs" may point into the current array, need to make a |
4094 | * copy before it becomes invalid. */ |
4095 | copy_sub(&temp_subs.norm, &subs->norm); |
4096 | if (nfa_has_zsubexpr) |
4097 | copy_sub(&temp_subs.synt, &subs->synt); |
4098 | subs = &temp_subs; |
4099 | } |
4100 | |
4101 | nfa_thread_T *const newt = xrealloc(l->t, newsize); |
4102 | l->t = newt; |
4103 | l->len = newlen; |
4104 | } |
4105 | |
4106 | /* add the state to the list */ |
4107 | state->lastlist[nfa_ll_index] = l->id; |
4108 | thread = &l->t[l->n++]; |
4109 | thread->state = state; |
4110 | if (pim == NULL) |
4111 | thread->pim.result = NFA_PIM_UNUSED; |
4112 | else { |
4113 | copy_pim(&thread->pim, pim); |
4114 | l->has_pim = TRUE; |
4115 | } |
4116 | copy_sub(&thread->subs.norm, &subs->norm); |
4117 | if (nfa_has_zsubexpr) |
4118 | copy_sub(&thread->subs.synt, &subs->synt); |
4119 | #ifdef REGEXP_DEBUG |
4120 | report_state("Adding" , &thread->subs.norm, state, l->id, pim); |
4121 | did_print = TRUE; |
4122 | #endif |
4123 | } |
4124 | |
4125 | #ifdef REGEXP_DEBUG |
4126 | if (!did_print) |
4127 | report_state("Processing" , &subs->norm, state, l->id, pim); |
4128 | #endif |
4129 | switch (state->c) { |
4130 | case NFA_MATCH: |
4131 | break; |
4132 | |
4133 | case NFA_SPLIT: |
4134 | /* order matters here */ |
4135 | subs = addstate(l, state->out, subs, pim, off_arg); |
4136 | subs = addstate(l, state->out1, subs, pim, off_arg); |
4137 | break; |
4138 | |
4139 | case NFA_EMPTY: |
4140 | case NFA_NOPEN: |
4141 | case NFA_NCLOSE: |
4142 | subs = addstate(l, state->out, subs, pim, off_arg); |
4143 | break; |
4144 | |
4145 | case NFA_MOPEN: |
4146 | case NFA_MOPEN1: |
4147 | case NFA_MOPEN2: |
4148 | case NFA_MOPEN3: |
4149 | case NFA_MOPEN4: |
4150 | case NFA_MOPEN5: |
4151 | case NFA_MOPEN6: |
4152 | case NFA_MOPEN7: |
4153 | case NFA_MOPEN8: |
4154 | case NFA_MOPEN9: |
4155 | case NFA_ZOPEN: |
4156 | case NFA_ZOPEN1: |
4157 | case NFA_ZOPEN2: |
4158 | case NFA_ZOPEN3: |
4159 | case NFA_ZOPEN4: |
4160 | case NFA_ZOPEN5: |
4161 | case NFA_ZOPEN6: |
4162 | case NFA_ZOPEN7: |
4163 | case NFA_ZOPEN8: |
4164 | case NFA_ZOPEN9: |
4165 | case NFA_ZSTART: |
4166 | if (state->c == NFA_ZSTART) { |
4167 | subidx = 0; |
4168 | sub = &subs->norm; |
4169 | } else if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9) { // -V560 |
4170 | subidx = state->c - NFA_ZOPEN; |
4171 | sub = &subs->synt; |
4172 | } else { |
4173 | subidx = state->c - NFA_MOPEN; |
4174 | sub = &subs->norm; |
4175 | } |
4176 | |
4177 | /* avoid compiler warnings */ |
4178 | save_ptr = NULL; |
4179 | memset(&save_multipos, 0, sizeof(save_multipos)); |
4180 | |
4181 | /* Set the position (with "off" added) in the subexpression. Save |
4182 | * and restore it when it was in use. Otherwise fill any gap. */ |
4183 | if (REG_MULTI) { |
4184 | if (subidx < sub->in_use) { |
4185 | save_multipos = sub->list.multi[subidx]; |
4186 | save_in_use = -1; |
4187 | } else { |
4188 | save_in_use = sub->in_use; |
4189 | for (i = sub->in_use; i < subidx; ++i) { |
4190 | sub->list.multi[i].start_lnum = -1; |
4191 | sub->list.multi[i].end_lnum = -1; |
4192 | } |
4193 | sub->in_use = subidx + 1; |
4194 | } |
4195 | if (off == -1) { |
4196 | sub->list.multi[subidx].start_lnum = reglnum + 1; |
4197 | sub->list.multi[subidx].start_col = 0; |
4198 | } else { |
4199 | |
4200 | sub->list.multi[subidx].start_lnum = reglnum; |
4201 | sub->list.multi[subidx].start_col = |
4202 | (colnr_T)(reginput - regline + off); |
4203 | } |
4204 | sub->list.multi[subidx].end_lnum = -1; |
4205 | } else { |
4206 | if (subidx < sub->in_use) { |
4207 | save_ptr = sub->list.line[subidx].start; |
4208 | save_in_use = -1; |
4209 | } else { |
4210 | save_in_use = sub->in_use; |
4211 | for (i = sub->in_use; i < subidx; ++i) { |
4212 | sub->list.line[i].start = NULL; |
4213 | sub->list.line[i].end = NULL; |
4214 | } |
4215 | sub->in_use = subidx + 1; |
4216 | } |
4217 | sub->list.line[subidx].start = reginput + off; |
4218 | } |
4219 | |
4220 | subs = addstate(l, state->out, subs, pim, off_arg); |
4221 | if (subs == NULL) { |
4222 | break; |
4223 | } |
4224 | // "subs" may have changed, need to set "sub" again. |
4225 | if (state->c >= NFA_ZOPEN && state->c <= NFA_ZOPEN9) { // -V560 |
4226 | sub = &subs->synt; |
4227 | } else { |
4228 | sub = &subs->norm; |
4229 | } |
4230 | |
4231 | if (save_in_use == -1) { |
4232 | if (REG_MULTI) { |
4233 | sub->list.multi[subidx] = save_multipos; |
4234 | } |
4235 | else |
4236 | sub->list.line[subidx].start = save_ptr; |
4237 | } else |
4238 | sub->in_use = save_in_use; |
4239 | break; |
4240 | |
4241 | case NFA_MCLOSE: |
4242 | if (nfa_has_zend && (REG_MULTI |
4243 | ? subs->norm.list.multi[0].end_lnum >= 0 |
4244 | : subs->norm.list.line[0].end != NULL)) { |
4245 | // Do not overwrite the position set by \ze. |
4246 | subs = addstate(l, state->out, subs, pim, off_arg); |
4247 | break; |
4248 | } |
4249 | FALLTHROUGH; |
4250 | case NFA_MCLOSE1: |
4251 | case NFA_MCLOSE2: |
4252 | case NFA_MCLOSE3: |
4253 | case NFA_MCLOSE4: |
4254 | case NFA_MCLOSE5: |
4255 | case NFA_MCLOSE6: |
4256 | case NFA_MCLOSE7: |
4257 | case NFA_MCLOSE8: |
4258 | case NFA_MCLOSE9: |
4259 | case NFA_ZCLOSE: |
4260 | case NFA_ZCLOSE1: |
4261 | case NFA_ZCLOSE2: |
4262 | case NFA_ZCLOSE3: |
4263 | case NFA_ZCLOSE4: |
4264 | case NFA_ZCLOSE5: |
4265 | case NFA_ZCLOSE6: |
4266 | case NFA_ZCLOSE7: |
4267 | case NFA_ZCLOSE8: |
4268 | case NFA_ZCLOSE9: |
4269 | case NFA_ZEND: |
4270 | if (state->c == NFA_ZEND) { |
4271 | subidx = 0; |
4272 | sub = &subs->norm; |
4273 | } else if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9) { // -V560 |
4274 | subidx = state->c - NFA_ZCLOSE; |
4275 | sub = &subs->synt; |
4276 | } else { |
4277 | subidx = state->c - NFA_MCLOSE; |
4278 | sub = &subs->norm; |
4279 | } |
4280 | |
4281 | /* We don't fill in gaps here, there must have been an MOPEN that |
4282 | * has done that. */ |
4283 | save_in_use = sub->in_use; |
4284 | if (sub->in_use <= subidx) |
4285 | sub->in_use = subidx + 1; |
4286 | if (REG_MULTI) { |
4287 | save_multipos = sub->list.multi[subidx]; |
4288 | if (off == -1) { |
4289 | sub->list.multi[subidx].end_lnum = reglnum + 1; |
4290 | sub->list.multi[subidx].end_col = 0; |
4291 | } else { |
4292 | sub->list.multi[subidx].end_lnum = reglnum; |
4293 | sub->list.multi[subidx].end_col = |
4294 | (colnr_T)(reginput - regline + off); |
4295 | } |
4296 | /* avoid compiler warnings */ |
4297 | save_ptr = NULL; |
4298 | } else { |
4299 | save_ptr = sub->list.line[subidx].end; |
4300 | sub->list.line[subidx].end = reginput + off; |
4301 | // avoid compiler warnings |
4302 | memset(&save_multipos, 0, sizeof(save_multipos)); |
4303 | } |
4304 | |
4305 | subs = addstate(l, state->out, subs, pim, off_arg); |
4306 | if (subs == NULL) { |
4307 | break; |
4308 | } |
4309 | // "subs" may have changed, need to set "sub" again. |
4310 | if (state->c >= NFA_ZCLOSE && state->c <= NFA_ZCLOSE9) { // -V560 |
4311 | sub = &subs->synt; |
4312 | } else { |
4313 | sub = &subs->norm; |
4314 | } |
4315 | |
4316 | if (REG_MULTI) { |
4317 | sub->list.multi[subidx] = save_multipos; |
4318 | } |
4319 | else |
4320 | sub->list.line[subidx].end = save_ptr; |
4321 | sub->in_use = save_in_use; |
4322 | break; |
4323 | } |
4324 | depth--; |
4325 | return subs; |
4326 | } |
4327 | |
4328 | /* |
4329 | * Like addstate(), but the new state(s) are put at position "*ip". |
4330 | * Used for zero-width matches, next state to use is the added one. |
4331 | * This makes sure the order of states to be tried does not change, which |
4332 | * matters for alternatives. |
4333 | */ |
4334 | static regsubs_T *addstate_here( |
4335 | nfa_list_T *l, // runtime state list |
4336 | nfa_state_T *state, // state to update |
4337 | regsubs_T *subs, // pointers to subexpressions |
4338 | nfa_pim_T *pim, // postponed look-behind match |
4339 | int *ip |
4340 | ) |
4341 | FUNC_ATTR_NONNULL_ARG(1, 2, 5) FUNC_ATTR_WARN_UNUSED_RESULT |
4342 | { |
4343 | int tlen = l->n; |
4344 | int count; |
4345 | int listidx = *ip; |
4346 | |
4347 | /* First add the state(s) at the end, so that we know how many there are. |
4348 | * Pass the listidx as offset (avoids adding another argument to |
4349 | * addstate(). */ |
4350 | regsubs_T *r = addstate(l, state, subs, pim, -listidx - ADDSTATE_HERE_OFFSET); |
4351 | if (r == NULL) { |
4352 | return NULL; |
4353 | } |
4354 | |
4355 | // when "*ip" was at the end of the list, nothing to do |
4356 | if (listidx + 1 == tlen) { |
4357 | return r; |
4358 | } |
4359 | |
4360 | // re-order to put the new state at the current position |
4361 | count = l->n - tlen; |
4362 | if (count == 0) { |
4363 | return r; // no state got added |
4364 | } |
4365 | if (count == 1) { |
4366 | // overwrite the current state |
4367 | l->t[listidx] = l->t[l->n - 1]; |
4368 | } else if (count > 1) { |
4369 | if (l->n + count - 1 >= l->len) { |
4370 | /* not enough space to move the new states, reallocate the list |
4371 | * and move the states to the right position */ |
4372 | const int newlen = l->len * 3 / 2 + 50; |
4373 | const size_t newsize = newlen * sizeof(nfa_thread_T); |
4374 | |
4375 | if ((long)(newsize >> 10) >= p_mmp) { |
4376 | EMSG(_(e_maxmempat)); |
4377 | return NULL; |
4378 | } |
4379 | nfa_thread_T *const newl = xmalloc(newsize); |
4380 | l->len = newlen; |
4381 | memmove(&(newl[0]), |
4382 | &(l->t[0]), |
4383 | sizeof(nfa_thread_T) * listidx); |
4384 | memmove(&(newl[listidx]), |
4385 | &(l->t[l->n - count]), |
4386 | sizeof(nfa_thread_T) * count); |
4387 | memmove(&(newl[listidx + count]), |
4388 | &(l->t[listidx + 1]), |
4389 | sizeof(nfa_thread_T) * (l->n - count - listidx - 1)); |
4390 | xfree(l->t); |
4391 | l->t = newl; |
4392 | } else { |
4393 | /* make space for new states, then move them from the |
4394 | * end to the current position */ |
4395 | memmove(&(l->t[listidx + count]), |
4396 | &(l->t[listidx + 1]), |
4397 | sizeof(nfa_thread_T) * (l->n - listidx - 1)); |
4398 | memmove(&(l->t[listidx]), |
4399 | &(l->t[l->n - 1]), |
4400 | sizeof(nfa_thread_T) * count); |
4401 | } |
4402 | } |
4403 | --l->n; |
4404 | *ip = listidx - 1; |
4405 | |
4406 | return r; |
4407 | } |
4408 | |
4409 | /* |
4410 | * Check character class "class" against current character c. |
4411 | */ |
4412 | static int check_char_class(int class, int c) |
4413 | { |
4414 | switch (class) { |
4415 | case NFA_CLASS_ALNUM: |
4416 | if (c >= 1 && c < 128 && isalnum(c)) { |
4417 | return OK; |
4418 | } |
4419 | break; |
4420 | case NFA_CLASS_ALPHA: |
4421 | if (c >= 1 && c < 128 && isalpha(c)) { |
4422 | return OK; |
4423 | } |
4424 | break; |
4425 | case NFA_CLASS_BLANK: |
4426 | if (c == ' ' || c == '\t') |
4427 | return OK; |
4428 | break; |
4429 | case NFA_CLASS_CNTRL: |
4430 | if (c >= 1 && c <= 127 && iscntrl(c)) { |
4431 | return OK; |
4432 | } |
4433 | break; |
4434 | case NFA_CLASS_DIGIT: |
4435 | if (ascii_isdigit(c)) |
4436 | return OK; |
4437 | break; |
4438 | case NFA_CLASS_GRAPH: |
4439 | if (c >= 1 && c <= 127 && isgraph(c)) { |
4440 | return OK; |
4441 | } |
4442 | break; |
4443 | case NFA_CLASS_LOWER: |
4444 | if (mb_islower(c) && c != 170 && c != 186) { |
4445 | return OK; |
4446 | } |
4447 | break; |
4448 | case NFA_CLASS_PRINT: |
4449 | if (vim_isprintc(c)) |
4450 | return OK; |
4451 | break; |
4452 | case NFA_CLASS_PUNCT: |
4453 | if (c >= 1 && c < 128 && ispunct(c)) { |
4454 | return OK; |
4455 | } |
4456 | break; |
4457 | case NFA_CLASS_SPACE: |
4458 | if ((c >= 9 && c <= 13) || (c == ' ')) |
4459 | return OK; |
4460 | break; |
4461 | case NFA_CLASS_UPPER: |
4462 | if (mb_isupper(c)) { |
4463 | return OK; |
4464 | } |
4465 | break; |
4466 | case NFA_CLASS_XDIGIT: |
4467 | if (ascii_isxdigit(c)) |
4468 | return OK; |
4469 | break; |
4470 | case NFA_CLASS_TAB: |
4471 | if (c == '\t') |
4472 | return OK; |
4473 | break; |
4474 | case NFA_CLASS_RETURN: |
4475 | if (c == '\r') |
4476 | return OK; |
4477 | break; |
4478 | case NFA_CLASS_BACKSPACE: |
4479 | if (c == '\b') |
4480 | return OK; |
4481 | break; |
4482 | case NFA_CLASS_ESCAPE: |
4483 | if (c == ESC) { |
4484 | return OK; |
4485 | } |
4486 | break; |
4487 | |
4488 | default: |
4489 | // should not be here :P |
4490 | IEMSGN(_(e_ill_char_class), class); |
4491 | return FAIL; |
4492 | } |
4493 | return FAIL; |
4494 | } |
4495 | |
4496 | /* |
4497 | * Check for a match with subexpression "subidx". |
4498 | * Return TRUE if it matches. |
4499 | */ |
4500 | static int |
4501 | match_backref ( |
4502 | regsub_T *sub, /* pointers to subexpressions */ |
4503 | int subidx, |
4504 | int *bytelen /* out: length of match in bytes */ |
4505 | ) |
4506 | { |
4507 | int len; |
4508 | |
4509 | if (sub->in_use <= subidx) { |
4510 | retempty: |
4511 | /* backref was not set, match an empty string */ |
4512 | *bytelen = 0; |
4513 | return TRUE; |
4514 | } |
4515 | |
4516 | if (REG_MULTI) { |
4517 | if (sub->list.multi[subidx].start_lnum < 0 |
4518 | || sub->list.multi[subidx].end_lnum < 0) |
4519 | goto retempty; |
4520 | if (sub->list.multi[subidx].start_lnum == reglnum |
4521 | && sub->list.multi[subidx].end_lnum == reglnum) { |
4522 | len = sub->list.multi[subidx].end_col |
4523 | - sub->list.multi[subidx].start_col; |
4524 | if (cstrncmp(regline + sub->list.multi[subidx].start_col, |
4525 | reginput, &len) == 0) { |
4526 | *bytelen = len; |
4527 | return TRUE; |
4528 | } |
4529 | } else { |
4530 | if (match_with_backref( |
4531 | sub->list.multi[subidx].start_lnum, |
4532 | sub->list.multi[subidx].start_col, |
4533 | sub->list.multi[subidx].end_lnum, |
4534 | sub->list.multi[subidx].end_col, |
4535 | bytelen) == RA_MATCH) |
4536 | return TRUE; |
4537 | } |
4538 | } else { |
4539 | if (sub->list.line[subidx].start == NULL |
4540 | || sub->list.line[subidx].end == NULL) |
4541 | goto retempty; |
4542 | len = (int)(sub->list.line[subidx].end - sub->list.line[subidx].start); |
4543 | if (cstrncmp(sub->list.line[subidx].start, reginput, &len) == 0) { |
4544 | *bytelen = len; |
4545 | return TRUE; |
4546 | } |
4547 | } |
4548 | return FALSE; |
4549 | } |
4550 | |
4551 | |
4552 | |
4553 | /* |
4554 | * Check for a match with \z subexpression "subidx". |
4555 | * Return TRUE if it matches. |
4556 | */ |
4557 | static int |
4558 | match_zref ( |
4559 | int subidx, |
4560 | int *bytelen /* out: length of match in bytes */ |
4561 | ) |
4562 | { |
4563 | int len; |
4564 | |
4565 | cleanup_zsubexpr(); |
4566 | if (re_extmatch_in == NULL || re_extmatch_in->matches[subidx] == NULL) { |
4567 | /* backref was not set, match an empty string */ |
4568 | *bytelen = 0; |
4569 | return TRUE; |
4570 | } |
4571 | |
4572 | len = (int)STRLEN(re_extmatch_in->matches[subidx]); |
4573 | if (cstrncmp(re_extmatch_in->matches[subidx], reginput, &len) == 0) { |
4574 | *bytelen = len; |
4575 | return TRUE; |
4576 | } |
4577 | return FALSE; |
4578 | } |
4579 | |
4580 | /* |
4581 | * Save list IDs for all NFA states of "prog" into "list". |
4582 | * Also reset the IDs to zero. |
4583 | * Only used for the recursive value lastlist[1]. |
4584 | */ |
4585 | static void nfa_save_listids(nfa_regprog_T *prog, int *list) |
4586 | { |
4587 | int i; |
4588 | nfa_state_T *p; |
4589 | |
4590 | /* Order in the list is reverse, it's a bit faster that way. */ |
4591 | p = &prog->state[0]; |
4592 | for (i = prog->nstate; --i >= 0; ) { |
4593 | list[i] = p->lastlist[1]; |
4594 | p->lastlist[1] = 0; |
4595 | ++p; |
4596 | } |
4597 | } |
4598 | |
4599 | /* |
4600 | * Restore list IDs from "list" to all NFA states. |
4601 | */ |
4602 | static void nfa_restore_listids(nfa_regprog_T *prog, int *list) |
4603 | { |
4604 | int i; |
4605 | nfa_state_T *p; |
4606 | |
4607 | p = &prog->state[0]; |
4608 | for (i = prog->nstate; --i >= 0; ) { |
4609 | p->lastlist[1] = list[i]; |
4610 | ++p; |
4611 | } |
4612 | } |
4613 | |
/// Compare position "pos" against the number "val" taken from the pattern.
///
/// @param val  number to compare against
/// @param op   1: test "pos > val", 2: test "pos < val",
///             anything else: test "pos == val"
/// @param pos  actual position
///
/// @return  result of the selected comparison.
static bool nfa_re_num_cmp(uintmax_t val, int op, uintmax_t pos)
{
  switch (op) {
  case 1:
    return val < pos;
  case 2:
    return val > pos;
  default:
    return pos == val;
  }
}
4620 | |
4621 | |
4622 | /* |
4623 | * Recursively call nfa_regmatch() |
4624 | * "pim" is NULL or contains info about a Postponed Invisible Match (start |
4625 | * position). |
4626 | */ |
4627 | static int recursive_regmatch( |
4628 | nfa_state_T *state, nfa_pim_T *pim, nfa_regprog_T *prog, |
4629 | regsubs_T *submatch, regsubs_T *m, int **listids, int *listids_len) |
4630 | { |
4631 | int save_reginput_col = (int)(reginput - regline); |
4632 | int save_reglnum = reglnum; |
4633 | int save_nfa_match = nfa_match; |
4634 | int save_nfa_listid = nfa_listid; |
4635 | save_se_T *save_nfa_endp = nfa_endp; |
4636 | save_se_T endpos; |
4637 | save_se_T *endposp = NULL; |
4638 | int result; |
4639 | int need_restore = FALSE; |
4640 | |
4641 | if (pim != NULL) { |
4642 | /* start at the position where the postponed match was */ |
4643 | if (REG_MULTI) |
4644 | reginput = regline + pim->end.pos.col; |
4645 | else |
4646 | reginput = pim->end.ptr; |
4647 | } |
4648 | |
4649 | if (state->c == NFA_START_INVISIBLE_BEFORE |
4650 | || state->c == NFA_START_INVISIBLE_BEFORE_FIRST |
4651 | || state->c == NFA_START_INVISIBLE_BEFORE_NEG |
4652 | || state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST) { |
4653 | /* The recursive match must end at the current position. When "pim" is |
4654 | * not NULL it specifies the current position. */ |
4655 | endposp = &endpos; |
4656 | if (REG_MULTI) { |
4657 | if (pim == NULL) { |
4658 | endpos.se_u.pos.col = (int)(reginput - regline); |
4659 | endpos.se_u.pos.lnum = reglnum; |
4660 | } else |
4661 | endpos.se_u.pos = pim->end.pos; |
4662 | } else { |
4663 | if (pim == NULL) |
4664 | endpos.se_u.ptr = reginput; |
4665 | else |
4666 | endpos.se_u.ptr = pim->end.ptr; |
4667 | } |
4668 | |
4669 | /* Go back the specified number of bytes, or as far as the |
4670 | * start of the previous line, to try matching "\@<=" or |
4671 | * not matching "\@<!". This is very inefficient, limit the number of |
4672 | * bytes if possible. */ |
4673 | if (state->val <= 0) { |
4674 | if (REG_MULTI) { |
4675 | regline = reg_getline(--reglnum); |
4676 | if (regline == NULL) |
4677 | /* can't go before the first line */ |
4678 | regline = reg_getline(++reglnum); |
4679 | } |
4680 | reginput = regline; |
4681 | } else { |
4682 | if (REG_MULTI && (int)(reginput - regline) < state->val) { |
4683 | /* Not enough bytes in this line, go to end of |
4684 | * previous line. */ |
4685 | regline = reg_getline(--reglnum); |
4686 | if (regline == NULL) { |
4687 | /* can't go before the first line */ |
4688 | regline = reg_getline(++reglnum); |
4689 | reginput = regline; |
4690 | } else |
4691 | reginput = regline + STRLEN(regline); |
4692 | } |
4693 | if ((int)(reginput - regline) >= state->val) { |
4694 | reginput -= state->val; |
4695 | reginput -= utf_head_off(regline, reginput); |
4696 | } else { |
4697 | reginput = regline; |
4698 | } |
4699 | } |
4700 | } |
4701 | |
4702 | #ifdef REGEXP_DEBUG |
4703 | if (log_fd != stderr) |
4704 | fclose(log_fd); |
4705 | log_fd = NULL; |
4706 | #endif |
4707 | /* Have to clear the lastlist field of the NFA nodes, so that |
4708 | * nfa_regmatch() and addstate() can run properly after recursion. */ |
4709 | if (nfa_ll_index == 1) { |
4710 | /* Already calling nfa_regmatch() recursively. Save the lastlist[1] |
4711 | * values and clear them. */ |
4712 | if (*listids == NULL || *listids_len < nstate) { |
4713 | xfree(*listids); |
4714 | *listids = xmalloc(sizeof(**listids) * nstate); |
4715 | *listids_len = nstate; |
4716 | } |
4717 | nfa_save_listids(prog, *listids); |
4718 | need_restore = TRUE; |
4719 | /* any value of nfa_listid will do */ |
4720 | } else { |
4721 | /* First recursive nfa_regmatch() call, switch to the second lastlist |
4722 | * entry. Make sure nfa_listid is different from a previous recursive |
4723 | * call, because some states may still have this ID. */ |
4724 | ++nfa_ll_index; |
4725 | if (nfa_listid <= nfa_alt_listid) |
4726 | nfa_listid = nfa_alt_listid; |
4727 | } |
4728 | |
4729 | /* Call nfa_regmatch() to check if the current concat matches at this |
4730 | * position. The concat ends with the node NFA_END_INVISIBLE */ |
4731 | nfa_endp = endposp; |
4732 | result = nfa_regmatch(prog, state->out, submatch, m); |
4733 | |
4734 | if (need_restore) |
4735 | nfa_restore_listids(prog, *listids); |
4736 | else { |
4737 | --nfa_ll_index; |
4738 | nfa_alt_listid = nfa_listid; |
4739 | } |
4740 | |
4741 | /* restore position in input text */ |
4742 | reglnum = save_reglnum; |
4743 | if (REG_MULTI) |
4744 | regline = reg_getline(reglnum); |
4745 | reginput = regline + save_reginput_col; |
4746 | if (result != NFA_TOO_EXPENSIVE) { |
4747 | nfa_match = save_nfa_match; |
4748 | nfa_listid = save_nfa_listid; |
4749 | } |
4750 | nfa_endp = save_nfa_endp; |
4751 | |
4752 | #ifdef REGEXP_DEBUG |
4753 | log_fd = fopen(NFA_REGEXP_RUN_LOG, "a" ); |
4754 | if (log_fd != NULL) { |
4755 | fprintf(log_fd, "****************************\n" ); |
4756 | fprintf(log_fd, "FINISHED RUNNING nfa_regmatch() recursively\n" ); |
4757 | fprintf(log_fd, "MATCH = %s\n" , !result ? "FALSE" : "OK" ); |
4758 | fprintf(log_fd, "****************************\n" ); |
4759 | } else { |
4760 | EMSG(_(e_log_open_failed)); |
4761 | log_fd = stderr; |
4762 | } |
4763 | #endif |
4764 | |
4765 | return result; |
4766 | } |
4767 | |
4768 | |
4769 | /* |
4770 | * Estimate the chance of a match with "state" failing. |
4771 | * empty match: 0 |
4772 | * NFA_ANY: 1 |
4773 | * specific character: 99 |
4774 | */ |
4775 | static int failure_chance(nfa_state_T *state, int depth) |
4776 | { |
4777 | int c = state->c; |
4778 | int l, r; |
4779 | |
4780 | /* detect looping */ |
4781 | if (depth > 4) |
4782 | return 1; |
4783 | |
4784 | switch (c) { |
4785 | case NFA_SPLIT: |
4786 | if (state->out->c == NFA_SPLIT || state->out1->c == NFA_SPLIT) |
4787 | /* avoid recursive stuff */ |
4788 | return 1; |
4789 | /* two alternatives, use the lowest failure chance */ |
4790 | l = failure_chance(state->out, depth + 1); |
4791 | r = failure_chance(state->out1, depth + 1); |
4792 | return l < r ? l : r; |
4793 | |
4794 | case NFA_ANY: |
4795 | /* matches anything, unlikely to fail */ |
4796 | return 1; |
4797 | |
4798 | case NFA_MATCH: |
4799 | case NFA_MCLOSE: |
4800 | case NFA_ANY_COMPOSING: |
4801 | /* empty match works always */ |
4802 | return 0; |
4803 | |
4804 | case NFA_START_INVISIBLE: |
4805 | case NFA_START_INVISIBLE_FIRST: |
4806 | case NFA_START_INVISIBLE_NEG: |
4807 | case NFA_START_INVISIBLE_NEG_FIRST: |
4808 | case NFA_START_INVISIBLE_BEFORE: |
4809 | case NFA_START_INVISIBLE_BEFORE_FIRST: |
4810 | case NFA_START_INVISIBLE_BEFORE_NEG: |
4811 | case NFA_START_INVISIBLE_BEFORE_NEG_FIRST: |
4812 | case NFA_START_PATTERN: |
4813 | /* recursive regmatch is expensive, use low failure chance */ |
4814 | return 5; |
4815 | |
4816 | case NFA_BOL: |
4817 | case NFA_EOL: |
4818 | case NFA_BOF: |
4819 | case NFA_EOF: |
4820 | case NFA_NEWL: |
4821 | return 99; |
4822 | |
4823 | case NFA_BOW: |
4824 | case NFA_EOW: |
4825 | return 90; |
4826 | |
4827 | case NFA_MOPEN: |
4828 | case NFA_MOPEN1: |
4829 | case NFA_MOPEN2: |
4830 | case NFA_MOPEN3: |
4831 | case NFA_MOPEN4: |
4832 | case NFA_MOPEN5: |
4833 | case NFA_MOPEN6: |
4834 | case NFA_MOPEN7: |
4835 | case NFA_MOPEN8: |
4836 | case NFA_MOPEN9: |
4837 | case NFA_ZOPEN: |
4838 | case NFA_ZOPEN1: |
4839 | case NFA_ZOPEN2: |
4840 | case NFA_ZOPEN3: |
4841 | case NFA_ZOPEN4: |
4842 | case NFA_ZOPEN5: |
4843 | case NFA_ZOPEN6: |
4844 | case NFA_ZOPEN7: |
4845 | case NFA_ZOPEN8: |
4846 | case NFA_ZOPEN9: |
4847 | case NFA_ZCLOSE: |
4848 | case NFA_ZCLOSE1: |
4849 | case NFA_ZCLOSE2: |
4850 | case NFA_ZCLOSE3: |
4851 | case NFA_ZCLOSE4: |
4852 | case NFA_ZCLOSE5: |
4853 | case NFA_ZCLOSE6: |
4854 | case NFA_ZCLOSE7: |
4855 | case NFA_ZCLOSE8: |
4856 | case NFA_ZCLOSE9: |
4857 | case NFA_NOPEN: |
4858 | case NFA_MCLOSE1: |
4859 | case NFA_MCLOSE2: |
4860 | case NFA_MCLOSE3: |
4861 | case NFA_MCLOSE4: |
4862 | case NFA_MCLOSE5: |
4863 | case NFA_MCLOSE6: |
4864 | case NFA_MCLOSE7: |
4865 | case NFA_MCLOSE8: |
4866 | case NFA_MCLOSE9: |
4867 | case NFA_NCLOSE: |
4868 | return failure_chance(state->out, depth + 1); |
4869 | |
4870 | case NFA_BACKREF1: |
4871 | case NFA_BACKREF2: |
4872 | case NFA_BACKREF3: |
4873 | case NFA_BACKREF4: |
4874 | case NFA_BACKREF5: |
4875 | case NFA_BACKREF6: |
4876 | case NFA_BACKREF7: |
4877 | case NFA_BACKREF8: |
4878 | case NFA_BACKREF9: |
4879 | case NFA_ZREF1: |
4880 | case NFA_ZREF2: |
4881 | case NFA_ZREF3: |
4882 | case NFA_ZREF4: |
4883 | case NFA_ZREF5: |
4884 | case NFA_ZREF6: |
4885 | case NFA_ZREF7: |
4886 | case NFA_ZREF8: |
4887 | case NFA_ZREF9: |
4888 | /* backreferences don't match in many places */ |
4889 | return 94; |
4890 | |
4891 | case NFA_LNUM_GT: |
4892 | case NFA_LNUM_LT: |
4893 | case NFA_COL_GT: |
4894 | case NFA_COL_LT: |
4895 | case NFA_VCOL_GT: |
4896 | case NFA_VCOL_LT: |
4897 | case NFA_MARK_GT: |
4898 | case NFA_MARK_LT: |
4899 | case NFA_VISUAL: |
4900 | /* before/after positions don't match very often */ |
4901 | return 85; |
4902 | |
4903 | case NFA_LNUM: |
4904 | return 90; |
4905 | |
4906 | case NFA_CURSOR: |
4907 | case NFA_COL: |
4908 | case NFA_VCOL: |
4909 | case NFA_MARK: |
4910 | /* specific positions rarely match */ |
4911 | return 98; |
4912 | |
4913 | case NFA_COMPOSING: |
4914 | return 95; |
4915 | |
4916 | default: |
4917 | if (c > 0) |
4918 | /* character match fails often */ |
4919 | return 95; |
4920 | } |
4921 | |
4922 | /* something else, includes character classes */ |
4923 | return 50; |
4924 | } |
4925 | |
4926 | /* |
4927 | * Skip until the char "c" we know a match must start with. |
4928 | */ |
4929 | static int skip_to_start(int c, colnr_T *colp) |
4930 | { |
4931 | const char_u *const s = cstrchr(regline + *colp, c); |
4932 | if (s == NULL) { |
4933 | return FAIL; |
4934 | } |
4935 | *colp = (int)(s - regline); |
4936 | return OK; |
4937 | } |
4938 | |
4939 | /* |
4940 | * Check for a match with match_text. |
4941 | * Called after skip_to_start() has found regstart. |
4942 | * Returns zero for no match, 1 for a match. |
4943 | */ |
4944 | static long find_match_text(colnr_T startcol, int regstart, char_u *match_text) |
4945 | { |
4946 | #define PTR2LEN(x) enc_utf8 ? utf_ptr2len(x) : MB_PTR2LEN(x) |
4947 | |
4948 | colnr_T col = startcol; |
4949 | int regstart_len = PTR2LEN(regline + startcol); |
4950 | |
4951 | for (;;) { |
4952 | bool match = true; |
4953 | char_u *s1 = match_text; |
4954 | char_u *s2 = regline + col + regstart_len; // skip regstart |
4955 | while (*s1) { |
4956 | int c1_len = PTR2LEN(s1); |
4957 | int c1 = PTR2CHAR(s1); |
4958 | int c2_len = PTR2LEN(s2); |
4959 | int c2 = PTR2CHAR(s2); |
4960 | |
4961 | if ((c1 != c2 && (!rex.reg_ic || mb_tolower(c1) != mb_tolower(c2))) |
4962 | || c1_len != c2_len) { |
4963 | match = false; |
4964 | break; |
4965 | } |
4966 | s1 += c1_len; |
4967 | s2 += c2_len; |
4968 | } |
4969 | if (match |
4970 | // check that no composing char follows |
4971 | && !(enc_utf8 && utf_iscomposing(PTR2CHAR(s2)))) { |
4972 | cleanup_subexpr(); |
4973 | if (REG_MULTI) { |
4974 | rex.reg_startpos[0].lnum = reglnum; |
4975 | rex.reg_startpos[0].col = col; |
4976 | rex.reg_endpos[0].lnum = reglnum; |
4977 | rex.reg_endpos[0].col = s2 - regline; |
4978 | } else { |
4979 | rex.reg_startp[0] = regline + col; |
4980 | rex.reg_endp[0] = s2; |
4981 | } |
4982 | return 1L; |
4983 | } |
4984 | |
4985 | // Try finding regstart after the current match. |
4986 | col += regstart_len; // skip regstart |
4987 | if (skip_to_start(regstart, &col) == FAIL) { |
4988 | break; |
4989 | } |
4990 | } |
4991 | return 0L; |
4992 | |
4993 | #undef PTR2LEN |
4994 | } |
4995 | |
4996 | static int nfa_did_time_out(void) |
4997 | { |
4998 | if (nfa_time_limit != NULL && profile_passed_limit(*nfa_time_limit)) { |
4999 | if (nfa_timed_out != NULL) { |
5000 | *nfa_timed_out = true; |
5001 | } |
5002 | return true; |
5003 | } |
5004 | return false; |
5005 | } |
5006 | |
5007 | /// Main matching routine. |
5008 | /// |
5009 | /// Run NFA to determine whether it matches reginput. |
5010 | /// |
5011 | /// When "nfa_endp" is not NULL it is a required end-of-match position. |
5012 | /// |
5013 | /// Return TRUE if there is a match, FALSE if there is no match, |
5014 | /// NFA_TOO_EXPENSIVE if we end up with too many states. |
5015 | /// When there is a match "submatch" contains the positions. |
5016 | /// |
5017 | /// Note: Caller must ensure that: start != NULL. |
5018 | static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, |
5019 | regsubs_T *submatch, regsubs_T *m) |
5020 | { |
5021 | int result = false; |
5022 | int flag = 0; |
5023 | bool go_to_nextline = false; |
5024 | nfa_thread_T *t; |
5025 | nfa_list_T list[2]; |
5026 | int listidx; |
5027 | nfa_list_T *thislist; |
5028 | nfa_list_T *nextlist; |
5029 | int *listids = NULL; |
5030 | int listids_len = 0; |
5031 | nfa_state_T *add_state; |
5032 | bool add_here; |
5033 | int add_count; |
5034 | int add_off = 0; |
5035 | int toplevel = start->c == NFA_MOPEN; |
5036 | regsubs_T *r; |
5037 | #ifdef NFA_REGEXP_DEBUG_LOG |
5038 | FILE *debug = fopen(NFA_REGEXP_DEBUG_LOG, "a" ); |
5039 | |
5040 | if (debug == NULL) { |
5041 | EMSG2("(NFA) COULD NOT OPEN %s!" , NFA_REGEXP_DEBUG_LOG); |
5042 | return false; |
5043 | } |
5044 | #endif |
5045 | // Some patterns may take a long time to match, especially when using |
5046 | // recursive_regmatch(). Allow interrupting them with CTRL-C. |
5047 | fast_breakcheck(); |
5048 | if (got_int) { |
5049 | #ifdef NFA_REGEXP_DEBUG_LOG |
5050 | fclose(debug); |
5051 | #endif |
5052 | return false; |
5053 | } |
5054 | if (nfa_did_time_out()) { |
5055 | #ifdef NFA_REGEXP_DEBUG_LOG |
5056 | fclose(debug); |
5057 | #endif |
5058 | return false; |
5059 | } |
5060 | |
5061 | nfa_match = false; |
5062 | |
5063 | // Allocate memory for the lists of nodes. |
5064 | size_t size = (nstate + 1) * sizeof(nfa_thread_T); |
5065 | list[0].t = xmalloc(size); |
5066 | list[0].len = nstate + 1; |
5067 | list[1].t = xmalloc(size); |
5068 | list[1].len = nstate + 1; |
5069 | |
5070 | #ifdef REGEXP_DEBUG |
5071 | log_fd = fopen(NFA_REGEXP_RUN_LOG, "a" ); |
5072 | if (log_fd != NULL) { |
5073 | fprintf(log_fd, "**********************************\n" ); |
5074 | nfa_set_code(start->c); |
5075 | fprintf(log_fd, " RUNNING nfa_regmatch() starting with state %d, code %s\n" , |
5076 | abs(start->id), code); |
5077 | fprintf(log_fd, "**********************************\n" ); |
5078 | } else { |
5079 | EMSG(_(e_log_open_failed)); |
5080 | log_fd = stderr; |
5081 | } |
5082 | #endif |
5083 | |
5084 | thislist = &list[0]; |
5085 | thislist->n = 0; |
5086 | thislist->has_pim = FALSE; |
5087 | nextlist = &list[1]; |
5088 | nextlist->n = 0; |
5089 | nextlist->has_pim = FALSE; |
5090 | #ifdef REGEXP_DEBUG |
5091 | fprintf(log_fd, "(---) STARTSTATE first\n" ); |
5092 | #endif |
5093 | thislist->id = nfa_listid + 1; |
5094 | |
5095 | /* Inline optimized code for addstate(thislist, start, m, 0) if we know |
5096 | * it's the first MOPEN. */ |
5097 | if (toplevel) { |
5098 | if (REG_MULTI) { |
5099 | m->norm.list.multi[0].start_lnum = reglnum; |
5100 | m->norm.list.multi[0].start_col = (colnr_T)(reginput - regline); |
5101 | } else |
5102 | m->norm.list.line[0].start = reginput; |
5103 | m->norm.in_use = 1; |
5104 | r = addstate(thislist, start->out, m, NULL, 0); |
5105 | } else { |
5106 | r = addstate(thislist, start, m, NULL, 0); |
5107 | } |
5108 | if (r == NULL) { |
5109 | nfa_match = NFA_TOO_EXPENSIVE; |
5110 | goto theend; |
5111 | } |
5112 | |
// ADD_STATE_IF_MATCH(state): when the enclosing local "result" is non-zero,
// record "(state)->out" in the local "add_state" and the local "clen" in the
// local "add_off".  When "result" is zero nothing is modified.
//
// The macro relies on "result", "add_state", "add_off" and "clen" being in
// scope at the point of use; "state" must be a pointer to a struct with an
// "out" member.
//
// The expansion is wrapped in do { ... } while (0) so that it forms exactly
// one statement: it must be followed by a semicolon and can then be used
// safely as the body of an unbraced if/else.  The argument is parenthesized
// so that any pointer expression may be passed.
#define ADD_STATE_IF_MATCH(state) \
  do { \
    if (result) { \
      add_state = (state)->out; \
      add_off = clen; \
    } \
  } while (0)
5118 | |
5119 | /* |
5120 | * Run for each character. |
5121 | */ |
5122 | for (;; ) { |
5123 | int curc = utf_ptr2char(reginput); |
5124 | int clen = utfc_ptr2len(reginput); |
5125 | if (curc == NUL) { |
5126 | clen = 0; |
5127 | go_to_nextline = false; |
5128 | } |
5129 | |
5130 | /* swap lists */ |
5131 | thislist = &list[flag]; |
5132 | nextlist = &list[flag ^= 1]; |
5133 | nextlist->n = 0; // clear nextlist |
5134 | nextlist->has_pim = false; |
5135 | nfa_listid++; |
5136 | if (prog->re_engine == AUTOMATIC_ENGINE |
5137 | && (nfa_listid >= NFA_MAX_STATES)) { |
5138 | // Too many states, retry with old engine. |
5139 | nfa_match = NFA_TOO_EXPENSIVE; |
5140 | goto theend; |
5141 | } |
5142 | |
5143 | thislist->id = nfa_listid; |
5144 | nextlist->id = nfa_listid + 1; |
5145 | |
5146 | #ifdef REGEXP_DEBUG |
5147 | fprintf(log_fd, "------------------------------------------\n" ); |
5148 | fprintf(log_fd, ">>> Reginput is \"%s\"\n" , reginput); |
5149 | fprintf(log_fd, |
5150 | ">>> Advanced one character... Current char is %c (code %d) \n" , |
5151 | curc, |
5152 | (int)curc); |
5153 | fprintf(log_fd, ">>> Thislist has %d states available: " , thislist->n); |
5154 | { |
5155 | int i; |
5156 | |
5157 | for (i = 0; i < thislist->n; i++) |
5158 | fprintf(log_fd, "%d " , abs(thislist->t[i].state->id)); |
5159 | } |
5160 | fprintf(log_fd, "\n" ); |
5161 | #endif |
5162 | |
5163 | #ifdef NFA_REGEXP_DEBUG_LOG |
5164 | fprintf(debug, "\n-------------------\n" ); |
5165 | #endif |
5166 | /* |
5167 | * If the state lists are empty we can stop. |
5168 | */ |
5169 | if (thislist->n == 0) |
5170 | break; |
5171 | |
5172 | // compute nextlist |
5173 | for (listidx = 0; listidx < thislist->n; listidx++) { |
5174 | // If the list gets very long there probably is something wrong. |
5175 | // At least allow interrupting with CTRL-C. |
5176 | fast_breakcheck(); |
5177 | if (got_int) { |
5178 | break; |
5179 | } |
5180 | if (nfa_time_limit != NULL && ++nfa_time_count == 20) { |
5181 | nfa_time_count = 0; |
5182 | if (nfa_did_time_out()) { |
5183 | break; |
5184 | } |
5185 | } |
5186 | t = &thislist->t[listidx]; |
5187 | |
5188 | #ifdef NFA_REGEXP_DEBUG_LOG |
5189 | nfa_set_code(t->state->c); |
5190 | fprintf(debug, "%s, " , code); |
5191 | #endif |
5192 | #ifdef REGEXP_DEBUG |
5193 | { |
5194 | int col; |
5195 | |
5196 | if (t->subs.norm.in_use <= 0) { |
5197 | col = -1; |
5198 | } else if (REG_MULTI) { |
5199 | col = t->subs.norm.list.multi[0].start_col; |
5200 | } else { |
5201 | col = (int)(t->subs.norm.list.line[0].start - regline); |
5202 | } |
5203 | nfa_set_code(t->state->c); |
5204 | fprintf(log_fd, "(%d) char %d %s (start col %d)%s... \n" , |
5205 | abs(t->state->id), (int)t->state->c, code, col, |
5206 | pim_info(&t->pim)); |
5207 | } |
5208 | #endif |
5209 | |
5210 | /* |
5211 | * Handle the possible codes of the current state. |
5212 | * The most important is NFA_MATCH. |
5213 | */ |
5214 | add_state = NULL; |
5215 | add_here = false; |
5216 | add_count = 0; |
5217 | switch (t->state->c) { |
5218 | case NFA_MATCH: |
5219 | { |
5220 | // If the match ends before a composing characters and |
5221 | // rex.reg_icombine is not set, that is not really a match. |
5222 | if (enc_utf8 && !rex.reg_icombine && utf_iscomposing(curc)) { |
5223 | break; |
5224 | } |
5225 | nfa_match = true; |
5226 | copy_sub(&submatch->norm, &t->subs.norm); |
5227 | if (nfa_has_zsubexpr) |
5228 | copy_sub(&submatch->synt, &t->subs.synt); |
5229 | #ifdef REGEXP_DEBUG |
5230 | log_subsexpr(&t->subs); |
5231 | #endif |
5232 | /* Found the left-most longest match, do not look at any other |
5233 | * states at this position. When the list of states is going |
5234 | * to be empty quit without advancing, so that "reginput" is |
5235 | * correct. */ |
5236 | if (nextlist->n == 0) |
5237 | clen = 0; |
5238 | goto nextchar; |
5239 | } |
5240 | |
5241 | case NFA_END_INVISIBLE: |
5242 | case NFA_END_INVISIBLE_NEG: |
5243 | case NFA_END_PATTERN: |
5244 | /* |
5245 | * This is only encountered after a NFA_START_INVISIBLE or |
5246 | * NFA_START_INVISIBLE_BEFORE node. |
5247 | * They surround a zero-width group, used with "\@=", "\&", |
5248 | * "\@!", "\@<=" and "\@<!". |
5249 | * If we got here, it means that the current "invisible" group |
5250 | * finished successfully, so return control to the parent |
5251 | * nfa_regmatch(). For a look-behind match only when it ends |
5252 | * in the position in "nfa_endp". |
5253 | * Submatches are stored in *m, and used in the parent call. |
5254 | */ |
5255 | #ifdef REGEXP_DEBUG |
5256 | if (nfa_endp != NULL) { |
5257 | if (REG_MULTI) |
5258 | fprintf( |
5259 | log_fd, |
5260 | "Current lnum: %d, endp lnum: %d; current col: %d, endp col: %d\n" , |
5261 | (int)reglnum, |
5262 | (int)nfa_endp->se_u.pos.lnum, |
5263 | (int)(reginput - regline), |
5264 | nfa_endp->se_u.pos.col); |
5265 | else |
5266 | fprintf(log_fd, "Current col: %d, endp col: %d\n" , |
5267 | (int)(reginput - regline), |
5268 | (int)(nfa_endp->se_u.ptr - reginput)); |
5269 | } |
5270 | #endif |
5271 | /* If "nfa_endp" is set it's only a match if it ends at |
5272 | * "nfa_endp" */ |
5273 | if (nfa_endp != NULL && (REG_MULTI |
5274 | ? (reglnum != nfa_endp->se_u.pos.lnum |
5275 | || (int)(reginput - regline) |
5276 | != nfa_endp->se_u.pos.col) |
5277 | : reginput != nfa_endp->se_u.ptr)) |
5278 | break; |
5279 | |
5280 | /* do not set submatches for \@! */ |
5281 | if (t->state->c != NFA_END_INVISIBLE_NEG) { |
5282 | copy_sub(&m->norm, &t->subs.norm); |
5283 | if (nfa_has_zsubexpr) |
5284 | copy_sub(&m->synt, &t->subs.synt); |
5285 | } |
5286 | #ifdef REGEXP_DEBUG |
5287 | fprintf(log_fd, "Match found:\n" ); |
5288 | log_subsexpr(m); |
5289 | #endif |
5290 | nfa_match = true; |
5291 | // See comment above at "goto nextchar". |
5292 | if (nextlist->n == 0) { |
5293 | clen = 0; |
5294 | } |
5295 | goto nextchar; |
5296 | |
5297 | case NFA_START_INVISIBLE: |
5298 | case NFA_START_INVISIBLE_FIRST: |
5299 | case NFA_START_INVISIBLE_NEG: |
5300 | case NFA_START_INVISIBLE_NEG_FIRST: |
5301 | case NFA_START_INVISIBLE_BEFORE: |
5302 | case NFA_START_INVISIBLE_BEFORE_FIRST: |
5303 | case NFA_START_INVISIBLE_BEFORE_NEG: |
5304 | case NFA_START_INVISIBLE_BEFORE_NEG_FIRST: |
5305 | { |
5306 | #ifdef REGEXP_DEBUG |
5307 | fprintf(log_fd, "Failure chance invisible: %d, what follows: %d\n" , |
5308 | failure_chance(t->state->out, 0), |
5309 | failure_chance(t->state->out1->out, 0)); |
5310 | #endif |
5311 | // Do it directly if there already is a PIM or when |
5312 | // nfa_postprocess() detected it will work better. |
5313 | if (t->pim.result != NFA_PIM_UNUSED |
5314 | || t->state->c == NFA_START_INVISIBLE_FIRST |
5315 | || t->state->c == NFA_START_INVISIBLE_NEG_FIRST |
5316 | || t->state->c == NFA_START_INVISIBLE_BEFORE_FIRST |
5317 | || t->state->c == NFA_START_INVISIBLE_BEFORE_NEG_FIRST) { |
5318 | int in_use = m->norm.in_use; |
5319 | |
5320 | // Copy submatch info for the recursive call, opposite |
5321 | // of what happens on success below. |
5322 | copy_sub_off(&m->norm, &t->subs.norm); |
5323 | if (nfa_has_zsubexpr) |
5324 | copy_sub_off(&m->synt, &t->subs.synt); |
5325 | |
5326 | // First try matching the invisible match, then what |
5327 | // follows. |
5328 | result = recursive_regmatch(t->state, NULL, prog, submatch, m, |
5329 | &listids, &listids_len); |
5330 | if (result == NFA_TOO_EXPENSIVE) { |
5331 | nfa_match = result; |
5332 | goto theend; |
5333 | } |
5334 | |
5335 | // for \@! and \@<! it is a match when the result is |
5336 | // FALSE |
5337 | if (result != (t->state->c == NFA_START_INVISIBLE_NEG |
5338 | || t->state->c == NFA_START_INVISIBLE_NEG_FIRST |
5339 | || t->state->c |
5340 | == NFA_START_INVISIBLE_BEFORE_NEG |
5341 | || t->state->c |
5342 | == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)) { |
5343 | // Copy submatch info from the recursive call |
5344 | copy_sub_off(&t->subs.norm, &m->norm); |
5345 | if (nfa_has_zsubexpr) |
5346 | copy_sub_off(&t->subs.synt, &m->synt); |
5347 | // If the pattern has \ze and it matched in the |
5348 | // sub pattern, use it. |
5349 | copy_ze_off(&t->subs.norm, &m->norm); |
5350 | |
5351 | // t->state->out1 is the corresponding |
5352 | // END_INVISIBLE node; Add its out to the current |
5353 | // list (zero-width match). |
5354 | add_here = true; |
5355 | add_state = t->state->out1->out; |
5356 | } |
5357 | m->norm.in_use = in_use; |
5358 | } else { |
5359 | nfa_pim_T pim; |
5360 | |
5361 | // First try matching what follows. Only if a match |
5362 | // is found verify the invisible match matches. Add a |
5363 | // nfa_pim_T to the following states, it contains info |
5364 | // about the invisible match. |
5365 | pim.state = t->state; |
5366 | pim.result = NFA_PIM_TODO; |
5367 | pim.subs.norm.in_use = 0; |
5368 | pim.subs.synt.in_use = 0; |
5369 | if (REG_MULTI) { |
5370 | pim.end.pos.col = (int)(reginput - regline); |
5371 | pim.end.pos.lnum = reglnum; |
5372 | } else |
5373 | pim.end.ptr = reginput; |
5374 | |
5375 | // t->state->out1 is the corresponding END_INVISIBLE |
5376 | // node; Add its out to the current list (zero-width |
5377 | // match). |
5378 | if (addstate_here(thislist, t->state->out1->out, &t->subs, |
5379 | &pim, &listidx) == NULL) { |
5380 | nfa_match = NFA_TOO_EXPENSIVE; |
5381 | goto theend; |
5382 | } |
5383 | } |
5384 | } |
5385 | break; |
5386 | |
5387 | case NFA_START_PATTERN: |
5388 | { |
5389 | nfa_state_T *skip = NULL; |
5390 | #ifdef REGEXP_DEBUG |
5391 | int skip_lid = 0; |
5392 | #endif |
5393 | |
5394 | // There is no point in trying to match the pattern if the |
5395 | // output state is not going to be added to the list. |
5396 | if (state_in_list(nextlist, t->state->out1->out, &t->subs)) { |
5397 | skip = t->state->out1->out; |
5398 | #ifdef REGEXP_DEBUG |
5399 | skip_lid = nextlist->id; |
5400 | #endif |
5401 | } else if (state_in_list(nextlist, |
5402 | t->state->out1->out->out, &t->subs)) { |
5403 | skip = t->state->out1->out->out; |
5404 | #ifdef REGEXP_DEBUG |
5405 | skip_lid = nextlist->id; |
5406 | #endif |
5407 | } else if (state_in_list(thislist, |
5408 | t->state->out1->out->out, &t->subs)) { |
5409 | skip = t->state->out1->out->out; |
5410 | #ifdef REGEXP_DEBUG |
5411 | skip_lid = thislist->id; |
5412 | #endif |
5413 | } |
5414 | if (skip != NULL) { |
5415 | #ifdef REGEXP_DEBUG |
5416 | nfa_set_code(skip->c); |
5417 | fprintf( |
5418 | log_fd, |
5419 | "> Not trying to match pattern, output state %d is already in list %d. char %d: %s\n" , |
5420 | abs(skip->id), skip_lid, skip->c, code); |
5421 | #endif |
5422 | break; |
5423 | } |
5424 | // Copy submatch info to the recursive call, opposite of what |
5425 | // happens afterwards. |
5426 | copy_sub_off(&m->norm, &t->subs.norm); |
5427 | if (nfa_has_zsubexpr) { |
5428 | copy_sub_off(&m->synt, &t->subs.synt); |
5429 | } |
5430 | |
5431 | // First try matching the pattern. |
5432 | result = recursive_regmatch(t->state, NULL, prog, submatch, m, |
5433 | &listids, &listids_len); |
5434 | if (result == NFA_TOO_EXPENSIVE) { |
5435 | nfa_match = result; |
5436 | goto theend; |
5437 | } |
5438 | if (result) { |
5439 | int bytelen; |
5440 | |
5441 | #ifdef REGEXP_DEBUG |
5442 | fprintf(log_fd, "NFA_START_PATTERN matches:\n" ); |
5443 | log_subsexpr(m); |
5444 | #endif |
5445 | // Copy submatch info from the recursive call |
5446 | copy_sub_off(&t->subs.norm, &m->norm); |
5447 | if (nfa_has_zsubexpr) { |
5448 | copy_sub_off(&t->subs.synt, &m->synt); |
5449 | } |
5450 | // Now we need to skip over the matched text and then |
5451 | // continue with what follows. |
5452 | if (REG_MULTI) { |
5453 | // TODO(RE): multi-line match |
5454 | bytelen = m->norm.list.multi[0].end_col |
5455 | - (int)(reginput - regline); |
5456 | } else { |
5457 | bytelen = (int)(m->norm.list.line[0].end - reginput); |
5458 | } |
5459 | |
5460 | #ifdef REGEXP_DEBUG |
5461 | fprintf(log_fd, "NFA_START_PATTERN length: %d\n" , bytelen); |
5462 | #endif |
5463 | if (bytelen == 0) { |
5464 | // empty match, output of corresponding |
5465 | // NFA_END_PATTERN/NFA_SKIP to be used at current |
5466 | // position |
5467 | add_here = true; |
5468 | add_state = t->state->out1->out->out; |
5469 | } else if (bytelen <= clen) { |
5470 | // match current character, output of corresponding |
5471 | // NFA_END_PATTERN to be used at next position. |
5472 | add_state = t->state->out1->out->out; |
5473 | add_off = clen; |
5474 | } else { |
5475 | // skip over the matched characters, set character |
5476 | // count in NFA_SKIP |
5477 | add_state = t->state->out1->out; |
5478 | add_off = bytelen; |
5479 | add_count = bytelen - clen; |
5480 | } |
5481 | } |
5482 | break; |
5483 | } |
5484 | |
5485 | case NFA_BOL: |
5486 | if (reginput == regline) { |
5487 | add_here = true; |
5488 | add_state = t->state->out; |
5489 | } |
5490 | break; |
5491 | |
5492 | case NFA_EOL: |
5493 | if (curc == NUL) { |
5494 | add_here = true; |
5495 | add_state = t->state->out; |
5496 | } |
5497 | break; |
5498 | |
5499 | case NFA_BOW: |
5500 | result = true; |
5501 | |
5502 | if (curc == NUL) { |
5503 | result = false; |
5504 | } else if (has_mbyte) { |
5505 | int this_class; |
5506 | |
5507 | // Get class of current and previous char (if it exists). |
5508 | this_class = mb_get_class_tab(reginput, rex.reg_buf->b_chartab); |
5509 | if (this_class <= 1) { |
5510 | result = false; |
5511 | } else if (reg_prev_class() == this_class) { |
5512 | result = false; |
5513 | } |
5514 | } else if (!vim_iswordc_buf(curc, rex.reg_buf) |
5515 | || (reginput > regline |
5516 | && vim_iswordc_buf(reginput[-1], rex.reg_buf))) { |
5517 | result = false; |
5518 | } |
5519 | if (result) { |
5520 | add_here = true; |
5521 | add_state = t->state->out; |
5522 | } |
5523 | break; |
5524 | |
5525 | case NFA_EOW: |
5526 | result = true; |
5527 | if (reginput == regline) { |
5528 | result = false; |
5529 | } else if (has_mbyte) { |
5530 | int this_class, prev_class; |
5531 | |
5532 | // Get class of current and previous char (if it exists). |
5533 | this_class = mb_get_class_tab(reginput, rex.reg_buf->b_chartab); |
5534 | prev_class = reg_prev_class(); |
5535 | if (this_class == prev_class |
5536 | || prev_class == 0 || prev_class == 1) { |
5537 | result = false; |
5538 | } |
5539 | } else if (!vim_iswordc_buf(reginput[-1], rex.reg_buf) |
5540 | || (reginput[0] != NUL |
5541 | && vim_iswordc_buf(curc, rex.reg_buf))) { |
5542 | result = false; |
5543 | } |
5544 | if (result) { |
5545 | add_here = true; |
5546 | add_state = t->state->out; |
5547 | } |
5548 | break; |
5549 | |
5550 | case NFA_BOF: |
5551 | if (reglnum == 0 && reginput == regline |
5552 | && (!REG_MULTI || rex.reg_firstlnum == 1)) { |
5553 | add_here = true; |
5554 | add_state = t->state->out; |
5555 | } |
5556 | break; |
5557 | |
5558 | case NFA_EOF: |
5559 | if (reglnum == rex.reg_maxline && curc == NUL) { |
5560 | add_here = true; |
5561 | add_state = t->state->out; |
5562 | } |
5563 | break; |
5564 | |
5565 | case NFA_COMPOSING: |
5566 | { |
5567 | int mc = curc; |
5568 | int len = 0; |
5569 | nfa_state_T *end; |
5570 | nfa_state_T *sta; |
5571 | int cchars[MAX_MCO]; |
5572 | int ccount = 0; |
5573 | int j; |
5574 | |
5575 | sta = t->state->out; |
5576 | len = 0; |
5577 | if (utf_iscomposing(sta->c)) { |
5578 | // Only match composing character(s), ignore base |
5579 | // character. Used for ".{composing}" and "{composing}" |
5580 | // (no preceding character). |
5581 | len += mb_char2len(mc); |
5582 | } |
5583 | if (rex.reg_icombine && len == 0) { |
5584 | // If \Z was present, then ignore composing characters. |
5585 | // When ignoring the base character this always matches. |
5586 | if (sta->c != curc) { |
5587 | result = FAIL; |
5588 | } else { |
5589 | result = OK; |
5590 | } |
5591 | while (sta->c != NFA_END_COMPOSING) { |
5592 | sta = sta->out; |
5593 | } |
5594 | } else if (len > 0 || mc == sta->c) { |
5595 | // Check base character matches first, unless ignored. |
5596 | if (len == 0) { |
5597 | len += mb_char2len(mc); |
5598 | sta = sta->out; |
5599 | } |
5600 | |
5601 | // We don't care about the order of composing characters. |
5602 | // Get them into cchars[] first. |
5603 | while (len < clen) { |
5604 | mc = utf_ptr2char(reginput + len); |
5605 | cchars[ccount++] = mc; |
5606 | len += mb_char2len(mc); |
5607 | if (ccount == MAX_MCO) |
5608 | break; |
5609 | } |
5610 | |
5611 | // Check that each composing char in the pattern matches a |
5612 | // composing char in the text. We do not check if all |
5613 | // composing chars are matched. |
5614 | result = OK; |
5615 | while (sta->c != NFA_END_COMPOSING) { |
5616 | for (j = 0; j < ccount; ++j) |
5617 | if (cchars[j] == sta->c) |
5618 | break; |
5619 | if (j == ccount) { |
5620 | result = FAIL; |
5621 | break; |
5622 | } |
5623 | sta = sta->out; |
5624 | } |
5625 | } else |
5626 | result = FAIL; |
5627 | |
5628 | end = t->state->out1; // NFA_END_COMPOSING |
5629 | ADD_STATE_IF_MATCH(end); |
5630 | break; |
5631 | } |
5632 | |
5633 | case NFA_NEWL: |
5634 | if (curc == NUL && !rex.reg_line_lbr && REG_MULTI |
5635 | && reglnum <= rex.reg_maxline) { |
5636 | go_to_nextline = true; |
5637 | // Pass -1 for the offset, which means taking the position |
5638 | // at the start of the next line. |
5639 | add_state = t->state->out; |
5640 | add_off = -1; |
5641 | } else if (curc == '\n' && rex.reg_line_lbr) { |
5642 | // match \n as if it is an ordinary character |
5643 | add_state = t->state->out; |
5644 | add_off = 1; |
5645 | } |
5646 | break; |
5647 | |
5648 | case NFA_START_COLL: |
5649 | case NFA_START_NEG_COLL: |
5650 | { |
5651 | // What follows is a list of characters, until NFA_END_COLL. |
5652 | // One of them must match or none of them must match. |
5653 | nfa_state_T *state; |
5654 | int result_if_matched; |
5655 | int c1, c2; |
5656 | |
5657 | // Never match EOL. If it's part of the collection it is added |
5658 | // as a separate state with an OR. |
5659 | if (curc == NUL) { |
5660 | break; |
5661 | } |
5662 | |
5663 | state = t->state->out; |
5664 | result_if_matched = (t->state->c == NFA_START_COLL); |
5665 | for (;; ) { |
5666 | if (state->c == NFA_END_COLL) { |
5667 | result = !result_if_matched; |
5668 | break; |
5669 | } |
5670 | if (state->c == NFA_RANGE_MIN) { |
5671 | c1 = state->val; |
5672 | state = state->out; // advance to NFA_RANGE_MAX |
5673 | c2 = state->val; |
5674 | #ifdef REGEXP_DEBUG |
5675 | fprintf(log_fd, "NFA_RANGE_MIN curc=%d c1=%d c2=%d\n" , |
5676 | curc, c1, c2); |
5677 | #endif |
5678 | if (curc >= c1 && curc <= c2) { |
5679 | result = result_if_matched; |
5680 | break; |
5681 | } |
5682 | if (rex.reg_ic) { |
5683 | int curc_low = mb_tolower(curc); |
5684 | int done = false; |
5685 | |
5686 | for (; c1 <= c2; c1++) { |
5687 | if (mb_tolower(c1) == curc_low) { |
5688 | result = result_if_matched; |
5689 | done = TRUE; |
5690 | break; |
5691 | } |
5692 | } |
5693 | if (done) { |
5694 | break; |
5695 | } |
5696 | } |
5697 | } else if (state->c < 0 ? check_char_class(state->c, curc) |
5698 | : (curc == state->c |
5699 | || (rex.reg_ic && mb_tolower(curc) |
5700 | == mb_tolower(state->c)))) { |
5701 | result = result_if_matched; |
5702 | break; |
5703 | } |
5704 | state = state->out; |
5705 | } |
5706 | if (result) { |
5707 | // next state is in out of the NFA_END_COLL, out1 of |
5708 | // START points to the END state |
5709 | add_state = t->state->out1->out; |
5710 | add_off = clen; |
5711 | } |
5712 | break; |
5713 | } |
5714 | |
5715 | case NFA_ANY: |
5716 | // Any char except '\0', (end of input) does not match. |
5717 | if (curc > 0) { |
5718 | add_state = t->state->out; |
5719 | add_off = clen; |
5720 | } |
5721 | break; |
5722 | |
5723 | case NFA_ANY_COMPOSING: |
5724 | // On a composing character skip over it. Otherwise do |
5725 | // nothing. Always matches. |
5726 | if (enc_utf8 && utf_iscomposing(curc)) { |
5727 | add_off = clen; |
5728 | } else { |
5729 | add_here = true; |
5730 | add_off = 0; |
5731 | } |
5732 | add_state = t->state->out; |
5733 | break; |
5734 | |
5735 | // Character classes like \a for alpha, \d for digit etc. |
5736 | case NFA_IDENT: // \i |
5737 | result = vim_isIDc(curc); |
5738 | ADD_STATE_IF_MATCH(t->state); |
5739 | break; |
5740 | |
5741 | case NFA_SIDENT: // \I |
5742 | result = !ascii_isdigit(curc) && vim_isIDc(curc); |
5743 | ADD_STATE_IF_MATCH(t->state); |
5744 | break; |
5745 | |
5746 | case NFA_KWORD: // \k |
5747 | result = vim_iswordp_buf(reginput, rex.reg_buf); |
5748 | ADD_STATE_IF_MATCH(t->state); |
5749 | break; |
5750 | |
5751 | case NFA_SKWORD: // \K |
5752 | result = !ascii_isdigit(curc) |
5753 | && vim_iswordp_buf(reginput, rex.reg_buf); |
5754 | ADD_STATE_IF_MATCH(t->state); |
5755 | break; |
5756 | |
5757 | case NFA_FNAME: // \f |
5758 | result = vim_isfilec(curc); |
5759 | ADD_STATE_IF_MATCH(t->state); |
5760 | break; |
5761 | |
5762 | case NFA_SFNAME: // \F |
5763 | result = !ascii_isdigit(curc) && vim_isfilec(curc); |
5764 | ADD_STATE_IF_MATCH(t->state); |
5765 | break; |
5766 | |
5767 | case NFA_PRINT: // \p |
5768 | result = vim_isprintc(PTR2CHAR(reginput)); |
5769 | ADD_STATE_IF_MATCH(t->state); |
5770 | break; |
5771 | |
5772 | case NFA_SPRINT: // \P |
5773 | result = !ascii_isdigit(curc) && vim_isprintc(PTR2CHAR(reginput)); |
5774 | ADD_STATE_IF_MATCH(t->state); |
5775 | break; |
5776 | |
5777 | case NFA_WHITE: // \s |
5778 | result = ascii_iswhite(curc); |
5779 | ADD_STATE_IF_MATCH(t->state); |
5780 | break; |
5781 | |
5782 | case NFA_NWHITE: // \S |
5783 | result = curc != NUL && !ascii_iswhite(curc); |
5784 | ADD_STATE_IF_MATCH(t->state); |
5785 | break; |
5786 | |
5787 | case NFA_DIGIT: // \d |
5788 | result = ri_digit(curc); |
5789 | ADD_STATE_IF_MATCH(t->state); |
5790 | break; |
5791 | |
5792 | case NFA_NDIGIT: // \D |
5793 | result = curc != NUL && !ri_digit(curc); |
5794 | ADD_STATE_IF_MATCH(t->state); |
5795 | break; |
5796 | |
5797 | case NFA_HEX: // \x |
5798 | result = ri_hex(curc); |
5799 | ADD_STATE_IF_MATCH(t->state); |
5800 | break; |
5801 | |
5802 | case NFA_NHEX: // \X |
5803 | result = curc != NUL && !ri_hex(curc); |
5804 | ADD_STATE_IF_MATCH(t->state); |
5805 | break; |
5806 | |
5807 | case NFA_OCTAL: // \o |
5808 | result = ri_octal(curc); |
5809 | ADD_STATE_IF_MATCH(t->state); |
5810 | break; |
5811 | |
5812 | case NFA_NOCTAL: // \O |
5813 | result = curc != NUL && !ri_octal(curc); |
5814 | ADD_STATE_IF_MATCH(t->state); |
5815 | break; |
5816 | |
5817 | case NFA_WORD: // \w |
5818 | result = ri_word(curc); |
5819 | ADD_STATE_IF_MATCH(t->state); |
5820 | break; |
5821 | |
5822 | case NFA_NWORD: // \W |
5823 | result = curc != NUL && !ri_word(curc); |
5824 | ADD_STATE_IF_MATCH(t->state); |
5825 | break; |
5826 | |
5827 | case NFA_HEAD: // \h |
5828 | result = ri_head(curc); |
5829 | ADD_STATE_IF_MATCH(t->state); |
5830 | break; |
5831 | |
5832 | case NFA_NHEAD: // \H |
5833 | result = curc != NUL && !ri_head(curc); |
5834 | ADD_STATE_IF_MATCH(t->state); |
5835 | break; |
5836 | |
5837 | case NFA_ALPHA: // \a |
5838 | result = ri_alpha(curc); |
5839 | ADD_STATE_IF_MATCH(t->state); |
5840 | break; |
5841 | |
5842 | case NFA_NALPHA: // \A |
5843 | result = curc != NUL && !ri_alpha(curc); |
5844 | ADD_STATE_IF_MATCH(t->state); |
5845 | break; |
5846 | |
5847 | case NFA_LOWER: // \l |
5848 | result = ri_lower(curc); |
5849 | ADD_STATE_IF_MATCH(t->state); |
5850 | break; |
5851 | |
5852 | case NFA_NLOWER: // \L |
5853 | result = curc != NUL && !ri_lower(curc); |
5854 | ADD_STATE_IF_MATCH(t->state); |
5855 | break; |
5856 | |
5857 | case NFA_UPPER: // \u |
5858 | result = ri_upper(curc); |
5859 | ADD_STATE_IF_MATCH(t->state); |
5860 | break; |
5861 | |
5862 | case NFA_NUPPER: // \U |
5863 | result = curc != NUL && !ri_upper(curc); |
5864 | ADD_STATE_IF_MATCH(t->state); |
5865 | break; |
5866 | |
5867 | case NFA_LOWER_IC: // [a-z] |
5868 | result = ri_lower(curc) || (rex.reg_ic && ri_upper(curc)); |
5869 | ADD_STATE_IF_MATCH(t->state); |
5870 | break; |
5871 | |
5872 | case NFA_NLOWER_IC: // [^a-z] |
5873 | result = curc != NUL |
5874 | && !(ri_lower(curc) || (rex.reg_ic && ri_upper(curc))); |
5875 | ADD_STATE_IF_MATCH(t->state); |
5876 | break; |
5877 | |
5878 | case NFA_UPPER_IC: // [A-Z] |
5879 | result = ri_upper(curc) || (rex.reg_ic && ri_lower(curc)); |
5880 | ADD_STATE_IF_MATCH(t->state); |
5881 | break; |
5882 | |
5883 | case NFA_NUPPER_IC: // [^A-Z] |
5884 | result = curc != NUL |
5885 | && !(ri_upper(curc) || (rex.reg_ic && ri_lower(curc))); |
5886 | ADD_STATE_IF_MATCH(t->state); |
5887 | break; |
5888 | |
5889 | case NFA_BACKREF1: |
5890 | case NFA_BACKREF2: |
5891 | case NFA_BACKREF3: |
5892 | case NFA_BACKREF4: |
5893 | case NFA_BACKREF5: |
5894 | case NFA_BACKREF6: |
5895 | case NFA_BACKREF7: |
5896 | case NFA_BACKREF8: |
5897 | case NFA_BACKREF9: |
5898 | case NFA_ZREF1: |
5899 | case NFA_ZREF2: |
5900 | case NFA_ZREF3: |
5901 | case NFA_ZREF4: |
5902 | case NFA_ZREF5: |
5903 | case NFA_ZREF6: |
5904 | case NFA_ZREF7: |
5905 | case NFA_ZREF8: |
5906 | case NFA_ZREF9: |
5907 | // \1 .. \9 \z1 .. \z9 |
5908 | { |
5909 | int subidx; |
5910 | int bytelen; |
5911 | |
5912 | if (t->state->c <= NFA_BACKREF9) { |
5913 | subidx = t->state->c - NFA_BACKREF1 + 1; |
5914 | result = match_backref(&t->subs.norm, subidx, &bytelen); |
5915 | } else { |
5916 | subidx = t->state->c - NFA_ZREF1 + 1; |
5917 | result = match_zref(subidx, &bytelen); |
5918 | } |
5919 | |
5920 | if (result) { |
5921 | if (bytelen == 0) { |
5922 | // empty match always works, output of NFA_SKIP to be |
5923 | // used next |
5924 | add_here = true; |
5925 | add_state = t->state->out->out; |
5926 | } else if (bytelen <= clen) { |
5927 | // match current character, jump ahead to out of |
5928 | // NFA_SKIP |
5929 | add_state = t->state->out->out; |
5930 | add_off = clen; |
5931 | } else { |
5932 | // skip over the matched characters, set character |
5933 | // count in NFA_SKIP |
5934 | add_state = t->state->out; |
5935 | add_off = bytelen; |
5936 | add_count = bytelen - clen; |
5937 | } |
5938 | } |
5939 | break; |
5940 | } |
5941 | case NFA_SKIP: |
5942 | // character of previous matching \1 .. \9 or \@> |
5943 | if (t->count - clen <= 0) { |
5944 | // end of match, go to what follows |
5945 | add_state = t->state->out; |
5946 | add_off = clen; |
5947 | } else { |
5948 | // add state again with decremented count |
5949 | add_state = t->state; |
5950 | add_off = 0; |
5951 | add_count = t->count - clen; |
5952 | } |
5953 | break; |
5954 | |
5955 | case NFA_LNUM: |
5956 | case NFA_LNUM_GT: |
5957 | case NFA_LNUM_LT: |
5958 | assert(t->state->val >= 0 |
5959 | && !((rex.reg_firstlnum > 0 |
5960 | && reglnum > LONG_MAX - rex.reg_firstlnum) |
5961 | || (rex.reg_firstlnum < 0 |
5962 | && reglnum < LONG_MIN + rex.reg_firstlnum)) |
5963 | && reglnum + rex.reg_firstlnum >= 0); |
5964 | result = (REG_MULTI |
5965 | && nfa_re_num_cmp((uintmax_t)t->state->val, |
5966 | t->state->c - NFA_LNUM, |
5967 | (uintmax_t)(reglnum + rex.reg_firstlnum))); |
5968 | if (result) { |
5969 | add_here = true; |
5970 | add_state = t->state->out; |
5971 | } |
5972 | break; |
5973 | |
5974 | case NFA_COL: |
5975 | case NFA_COL_GT: |
5976 | case NFA_COL_LT: |
5977 | assert(t->state->val >= 0 |
5978 | && reginput >= regline |
5979 | && (uintmax_t)(reginput - regline) <= UINTMAX_MAX - 1); |
5980 | result = nfa_re_num_cmp((uintmax_t)t->state->val, |
5981 | t->state->c - NFA_COL, |
5982 | (uintmax_t)(reginput - regline + 1)); |
5983 | if (result) { |
5984 | add_here = true; |
5985 | add_state = t->state->out; |
5986 | } |
5987 | break; |
5988 | |
5989 | case NFA_VCOL: |
5990 | case NFA_VCOL_GT: |
5991 | case NFA_VCOL_LT: |
5992 | { |
5993 | int op = t->state->c - NFA_VCOL; |
5994 | colnr_T col = (colnr_T)(reginput - regline); |
5995 | |
5996 | // Bail out quickly when there can't be a match, avoid the overhead of |
5997 | // win_linetabsize() on long lines. |
5998 | if (op != 1 && col > t->state->val * (has_mbyte ? MB_MAXBYTES : 1)) { |
5999 | break; |
6000 | } |
6001 | |
6002 | result = false; |
6003 | win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win; |
6004 | if (op == 1 && col - 1 > t->state->val && col > 100) { |
6005 | long ts = wp->w_buffer->b_p_ts; |
6006 | |
6007 | // Guess that a character won't use more columns than 'tabstop', |
6008 | // with a minimum of 4. |
6009 | if (ts < 4) { |
6010 | ts = 4; |
6011 | } |
6012 | result = col > t->state->val * ts; |
6013 | } |
6014 | if (!result) { |
6015 | uintmax_t lts = win_linetabsize(wp, regline, col); |
6016 | assert(t->state->val >= 0); |
6017 | result = nfa_re_num_cmp((uintmax_t)t->state->val, op, lts + 1); |
6018 | } |
6019 | if (result) { |
6020 | add_here = true; |
6021 | add_state = t->state->out; |
6022 | } |
6023 | } |
6024 | break; |
6025 | |
6026 | case NFA_MARK: |
6027 | case NFA_MARK_GT: |
6028 | case NFA_MARK_LT: |
6029 | { |
6030 | pos_T *pos = getmark_buf(rex.reg_buf, t->state->val, false); |
6031 | |
6032 | // Compare the mark position to the match position. |
6033 | result = (pos != NULL // mark doesn't exist |
6034 | && pos->lnum > 0 // mark isn't set in reg_buf |
6035 | && (pos->lnum == reglnum + rex.reg_firstlnum |
6036 | ? (pos->col == (colnr_T)(reginput - regline) |
6037 | ? t->state->c == NFA_MARK |
6038 | : (pos->col < (colnr_T)(reginput - regline) |
6039 | ? t->state->c == NFA_MARK_GT |
6040 | : t->state->c == NFA_MARK_LT)) |
6041 | : (pos->lnum < reglnum + rex.reg_firstlnum |
6042 | ? t->state->c == NFA_MARK_GT |
6043 | : t->state->c == NFA_MARK_LT))); |
6044 | if (result) { |
6045 | add_here = true; |
6046 | add_state = t->state->out; |
6047 | } |
6048 | break; |
6049 | } |
6050 | |
6051 | case NFA_CURSOR: |
6052 | result = (rex.reg_win != NULL |
6053 | && (reglnum + rex.reg_firstlnum == rex.reg_win->w_cursor.lnum) |
6054 | && ((colnr_T)(reginput - regline) |
6055 | == rex.reg_win->w_cursor.col)); |
6056 | if (result) { |
6057 | add_here = true; |
6058 | add_state = t->state->out; |
6059 | } |
6060 | break; |
6061 | |
6062 | case NFA_VISUAL: |
6063 | result = reg_match_visual(); |
6064 | if (result) { |
6065 | add_here = true; |
6066 | add_state = t->state->out; |
6067 | } |
6068 | break; |
6069 | |
6070 | case NFA_MOPEN1: |
6071 | case NFA_MOPEN2: |
6072 | case NFA_MOPEN3: |
6073 | case NFA_MOPEN4: |
6074 | case NFA_MOPEN5: |
6075 | case NFA_MOPEN6: |
6076 | case NFA_MOPEN7: |
6077 | case NFA_MOPEN8: |
6078 | case NFA_MOPEN9: |
6079 | case NFA_ZOPEN: |
6080 | case NFA_ZOPEN1: |
6081 | case NFA_ZOPEN2: |
6082 | case NFA_ZOPEN3: |
6083 | case NFA_ZOPEN4: |
6084 | case NFA_ZOPEN5: |
6085 | case NFA_ZOPEN6: |
6086 | case NFA_ZOPEN7: |
6087 | case NFA_ZOPEN8: |
6088 | case NFA_ZOPEN9: |
6089 | case NFA_NOPEN: |
6090 | case NFA_ZSTART: |
6091 | // These states are only added to be able to bail out when |
6092 | // they are added again, nothing is to be done. |
6093 | break; |
6094 | |
6095 | default: // regular character |
6096 | { |
6097 | int c = t->state->c; |
6098 | |
6099 | #ifdef REGEXP_DEBUG |
6100 | if (c < 0) { |
6101 | IEMSGN("INTERNAL: Negative state char: %" PRId64, c); |
6102 | } |
6103 | #endif |
6104 | result = (c == curc); |
6105 | |
6106 | if (!result && rex.reg_ic) { |
6107 | result = mb_tolower(c) == mb_tolower(curc); |
6108 | } |
6109 | |
6110 | // If rex.reg_icombine is not set only skip over the character |
6111 | // itself. When it is set skip over composing characters. |
6112 | if (result && enc_utf8 && !rex.reg_icombine) { |
6113 | clen = utf_ptr2len(reginput); |
6114 | } |
6115 | |
6116 | ADD_STATE_IF_MATCH(t->state); |
6117 | break; |
6118 | } |
6119 | } // switch (t->state->c) |
6120 | |
6121 | if (add_state != NULL) { |
6122 | nfa_pim_T *pim; |
6123 | nfa_pim_T pim_copy; |
6124 | |
6125 | if (t->pim.result == NFA_PIM_UNUSED) |
6126 | pim = NULL; |
6127 | else |
6128 | pim = &t->pim; |
6129 | |
6130 | // Handle the postponed invisible match if the match might end |
6131 | // without advancing and before the end of the line. |
6132 | if (pim != NULL && (clen == 0 || match_follows(add_state, 0))) { |
6133 | if (pim->result == NFA_PIM_TODO) { |
6134 | #ifdef REGEXP_DEBUG |
6135 | fprintf(log_fd, "\n" ); |
6136 | fprintf(log_fd, "==================================\n" ); |
6137 | fprintf(log_fd, "Postponed recursive nfa_regmatch()\n" ); |
6138 | fprintf(log_fd, "\n" ); |
6139 | #endif |
6140 | result = recursive_regmatch(pim->state, pim, prog, submatch, m, |
6141 | &listids, &listids_len); |
6142 | pim->result = result ? NFA_PIM_MATCH : NFA_PIM_NOMATCH; |
6143 | // for \@! and \@<! it is a match when the result is |
6144 | // FALSE |
6145 | if (result != (pim->state->c == NFA_START_INVISIBLE_NEG |
6146 | || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST |
6147 | || pim->state->c |
6148 | == NFA_START_INVISIBLE_BEFORE_NEG |
6149 | || pim->state->c |
6150 | == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)) { |
6151 | // Copy submatch info from the recursive call |
6152 | copy_sub_off(&pim->subs.norm, &m->norm); |
6153 | if (nfa_has_zsubexpr) |
6154 | copy_sub_off(&pim->subs.synt, &m->synt); |
6155 | } |
6156 | } else { |
6157 | result = (pim->result == NFA_PIM_MATCH); |
6158 | #ifdef REGEXP_DEBUG |
6159 | fprintf(log_fd, "\n" ); |
6160 | fprintf( |
6161 | log_fd, |
6162 | "Using previous recursive nfa_regmatch() result, result == %d\n" , |
6163 | pim->result); |
6164 | fprintf(log_fd, "MATCH = %s\n" , result ? "OK" : "FALSE" ); |
6165 | fprintf(log_fd, "\n" ); |
6166 | #endif |
6167 | } |
6168 | |
6169 | // for \@! and \@<! it is a match when result is FALSE |
6170 | if (result != (pim->state->c == NFA_START_INVISIBLE_NEG |
6171 | || pim->state->c == NFA_START_INVISIBLE_NEG_FIRST |
6172 | || pim->state->c |
6173 | == NFA_START_INVISIBLE_BEFORE_NEG |
6174 | || pim->state->c |
6175 | == NFA_START_INVISIBLE_BEFORE_NEG_FIRST)) { |
6176 | // Copy submatch info from the recursive call |
6177 | copy_sub_off(&t->subs.norm, &pim->subs.norm); |
6178 | if (nfa_has_zsubexpr) |
6179 | copy_sub_off(&t->subs.synt, &pim->subs.synt); |
6180 | } else { |
6181 | // look-behind match failed, don't add the state |
6182 | continue; |
6183 | } |
6184 | |
6185 | // Postponed invisible match was handled, don't add it to |
6186 | // following states. |
6187 | pim = NULL; |
6188 | } |
6189 | |
6190 | // If "pim" points into l->t it will become invalid when |
6191 | // adding the state causes the list to be reallocated. Make a |
6192 | // local copy to avoid that. |
6193 | if (pim == &t->pim) { |
6194 | copy_pim(&pim_copy, pim); |
6195 | pim = &pim_copy; |
6196 | } |
6197 | |
6198 | if (add_here) { |
6199 | r = addstate_here(thislist, add_state, &t->subs, pim, &listidx); |
6200 | } else { |
6201 | r = addstate(nextlist, add_state, &t->subs, pim, add_off); |
6202 | if (add_count > 0) { |
6203 | nextlist->t[nextlist->n - 1].count = add_count; |
6204 | } |
6205 | } |
6206 | if (r == NULL) { |
6207 | nfa_match = NFA_TOO_EXPENSIVE; |
6208 | goto theend; |
6209 | } |
6210 | } |
6211 | } // for (thislist = thislist; thislist->state; thislist++) |
6212 | |
6213 | // Look for the start of a match in the current position by adding the |
6214 | // start state to the list of states. |
6215 | // The first found match is the leftmost one, thus the order of states |
6216 | // matters! |
6217 | // Do not add the start state in recursive calls of nfa_regmatch(), |
6218 | // because recursive calls should only start in the first position. |
6219 | // Unless "nfa_endp" is not NULL, then we match the end position. |
6220 | // Also don't start a match past the first line. |
6221 | if (!nfa_match |
6222 | && ((toplevel |
6223 | && reglnum == 0 |
6224 | && clen != 0 |
6225 | && (rex.reg_maxcol == 0 |
6226 | || (colnr_T)(reginput - regline) < rex.reg_maxcol)) |
6227 | || (nfa_endp != NULL |
6228 | && (REG_MULTI |
6229 | ? (reglnum < nfa_endp->se_u.pos.lnum |
6230 | || (reglnum == nfa_endp->se_u.pos.lnum |
6231 | && (int)(reginput - regline) |
6232 | < nfa_endp->se_u.pos.col)) |
6233 | : reginput < nfa_endp->se_u.ptr)))) { |
6234 | #ifdef REGEXP_DEBUG |
6235 | fprintf(log_fd, "(---) STARTSTATE\n" ); |
6236 | #endif |
6237 | // Inline optimized code for addstate() if we know the state is |
6238 | // the first MOPEN. |
6239 | if (toplevel) { |
6240 | int add = TRUE; |
6241 | int c; |
6242 | |
6243 | if (prog->regstart != NUL && clen != 0) { |
6244 | if (nextlist->n == 0) { |
6245 | colnr_T col = (colnr_T)(reginput - regline) + clen; |
6246 | |
6247 | // Nextlist is empty, we can skip ahead to the |
6248 | // character that must appear at the start. |
6249 | if (skip_to_start(prog->regstart, &col) == FAIL) { |
6250 | break; |
6251 | } |
6252 | #ifdef REGEXP_DEBUG |
6253 | fprintf(log_fd, " Skipping ahead %d bytes to regstart\n" , |
6254 | col - ((colnr_T)(reginput - regline) + clen)); |
6255 | #endif |
6256 | reginput = regline + col - clen; |
6257 | } else { |
6258 | // Checking if the required start character matches is |
6259 | // cheaper than adding a state that won't match. |
6260 | c = PTR2CHAR(reginput + clen); |
6261 | if (c != prog->regstart && (!rex.reg_ic || mb_tolower(c) |
6262 | != mb_tolower(prog->regstart))) { |
6263 | #ifdef REGEXP_DEBUG |
6264 | fprintf(log_fd, |
6265 | " Skipping start state, regstart does not match\n" ); |
6266 | #endif |
6267 | add = FALSE; |
6268 | } |
6269 | } |
6270 | } |
6271 | |
6272 | if (add) { |
6273 | if (REG_MULTI) |
6274 | m->norm.list.multi[0].start_col = |
6275 | (colnr_T)(reginput - regline) + clen; |
6276 | else |
6277 | m->norm.list.line[0].start = reginput + clen; |
6278 | if (addstate(nextlist, start->out, m, NULL, clen) == NULL) { |
6279 | nfa_match = NFA_TOO_EXPENSIVE; |
6280 | goto theend; |
6281 | } |
6282 | } |
6283 | } else { |
6284 | if (addstate(nextlist, start, m, NULL, clen) == NULL) { |
6285 | nfa_match = NFA_TOO_EXPENSIVE; |
6286 | goto theend; |
6287 | } |
6288 | } |
6289 | } |
6290 | |
6291 | #ifdef REGEXP_DEBUG |
6292 | fprintf(log_fd, ">>> Thislist had %d states available: " , thislist->n); |
6293 | { |
6294 | int i; |
6295 | |
6296 | for (i = 0; i < thislist->n; i++) |
6297 | fprintf(log_fd, "%d " , abs(thislist->t[i].state->id)); |
6298 | } |
6299 | fprintf(log_fd, "\n" ); |
6300 | #endif |
6301 | |
6302 | nextchar: |
6303 | // Advance to the next character, or advance to the next line, or |
6304 | // finish. |
6305 | if (clen != 0) { |
6306 | reginput += clen; |
6307 | } else if (go_to_nextline || (nfa_endp != NULL && REG_MULTI |
6308 | && reglnum < nfa_endp->se_u.pos.lnum)) { |
6309 | reg_nextline(); |
6310 | } else { |
6311 | break; |
6312 | } |
6313 | |
6314 | // Allow interrupting with CTRL-C. |
6315 | line_breakcheck(); |
6316 | if (got_int) { |
6317 | break; |
6318 | } |
6319 | // Check for timeout once every twenty times to avoid overhead. |
6320 | if (nfa_time_limit != NULL && ++nfa_time_count == 20) { |
6321 | nfa_time_count = 0; |
6322 | if (nfa_did_time_out()) { |
6323 | break; |
6324 | } |
6325 | } |
6326 | } |
6327 | |
6328 | #ifdef REGEXP_DEBUG |
6329 | if (log_fd != stderr) |
6330 | fclose(log_fd); |
6331 | log_fd = NULL; |
6332 | #endif |
6333 | |
6334 | theend: |
6335 | // Free memory |
6336 | xfree(list[0].t); |
6337 | xfree(list[1].t); |
6338 | xfree(listids); |
6339 | #undef ADD_STATE_IF_MATCH |
6340 | #ifdef NFA_REGEXP_DEBUG_LOG |
6341 | fclose(debug); |
6342 | #endif |
6343 | |
6344 | return nfa_match; |
6345 | } |
6346 | |
6347 | // Try match of "prog" with at regline["col"]. |
6348 | // Returns <= 0 for failure, number of lines contained in the match otherwise. |
6349 | static long nfa_regtry(nfa_regprog_T *prog, |
6350 | colnr_T col, |
6351 | proftime_T *tm, // timeout limit or NULL |
6352 | int *timed_out) // flag set on timeout or NULL |
6353 | { |
6354 | int i; |
6355 | regsubs_T subs, m; |
6356 | nfa_state_T *start = prog->start; |
6357 | #ifdef REGEXP_DEBUG |
6358 | FILE *f; |
6359 | #endif |
6360 | |
6361 | reginput = regline + col; |
6362 | nfa_time_limit = tm; |
6363 | nfa_timed_out = timed_out; |
6364 | nfa_time_count = 0; |
6365 | |
6366 | #ifdef REGEXP_DEBUG |
6367 | f = fopen(NFA_REGEXP_RUN_LOG, "a" ); |
6368 | if (f != NULL) { |
6369 | fprintf(f, |
6370 | "\n\n\t=======================================================\n" ); |
6371 | #ifdef REGEXP_DEBUG |
6372 | fprintf(f, "\tRegexp is \"%s\"\n" , nfa_regengine.expr); |
6373 | #endif |
6374 | fprintf(f, "\tInput text is \"%s\" \n" , reginput); |
6375 | fprintf(f, "\t=======================================================\n\n" ); |
6376 | nfa_print_state(f, start); |
6377 | fprintf(f, "\n\n" ); |
6378 | fclose(f); |
6379 | } else { |
6380 | EMSG("Could not open temporary log file for writing" ); |
6381 | } |
6382 | #endif |
6383 | |
6384 | clear_sub(&subs.norm); |
6385 | clear_sub(&m.norm); |
6386 | clear_sub(&subs.synt); |
6387 | clear_sub(&m.synt); |
6388 | |
6389 | int result = nfa_regmatch(prog, start, &subs, &m); |
6390 | if (!result) { |
6391 | return 0; |
6392 | } else if (result == NFA_TOO_EXPENSIVE) { |
6393 | return result; |
6394 | } |
6395 | |
6396 | cleanup_subexpr(); |
6397 | if (REG_MULTI) { |
6398 | for (i = 0; i < subs.norm.in_use; i++) { |
6399 | rex.reg_startpos[i].lnum = subs.norm.list.multi[i].start_lnum; |
6400 | rex.reg_startpos[i].col = subs.norm.list.multi[i].start_col; |
6401 | |
6402 | rex.reg_endpos[i].lnum = subs.norm.list.multi[i].end_lnum; |
6403 | rex.reg_endpos[i].col = subs.norm.list.multi[i].end_col; |
6404 | } |
6405 | |
6406 | if (rex.reg_startpos[0].lnum < 0) { |
6407 | rex.reg_startpos[0].lnum = 0; |
6408 | rex.reg_startpos[0].col = col; |
6409 | } |
6410 | if (rex.reg_endpos[0].lnum < 0) { |
6411 | // pattern has a \ze but it didn't match, use current end |
6412 | rex.reg_endpos[0].lnum = reglnum; |
6413 | rex.reg_endpos[0].col = (int)(reginput - regline); |
6414 | } else { |
6415 | // Use line number of "\ze". |
6416 | reglnum = rex.reg_endpos[0].lnum; |
6417 | } |
6418 | } else { |
6419 | for (i = 0; i < subs.norm.in_use; i++) { |
6420 | rex.reg_startp[i] = subs.norm.list.line[i].start; |
6421 | rex.reg_endp[i] = subs.norm.list.line[i].end; |
6422 | } |
6423 | |
6424 | if (rex.reg_startp[0] == NULL) { |
6425 | rex.reg_startp[0] = regline + col; |
6426 | } |
6427 | if (rex.reg_endp[0] == NULL) { |
6428 | rex.reg_endp[0] = reginput; |
6429 | } |
6430 | } |
6431 | |
6432 | /* Package any found \z(...\) matches for export. Default is none. */ |
6433 | unref_extmatch(re_extmatch_out); |
6434 | re_extmatch_out = NULL; |
6435 | |
6436 | if (prog->reghasz == REX_SET) { |
6437 | cleanup_zsubexpr(); |
6438 | re_extmatch_out = make_extmatch(); |
6439 | // Loop over \z1, \z2, etc. There is no \z0. |
6440 | for (i = 1; i < subs.synt.in_use; i++) { |
6441 | if (REG_MULTI) { |
6442 | struct multipos *mpos = &subs.synt.list.multi[i]; |
6443 | |
6444 | // Only accept single line matches that are valid. |
6445 | if (mpos->start_lnum >= 0 |
6446 | && mpos->start_lnum == mpos->end_lnum |
6447 | && mpos->end_col >= mpos->start_col) { |
6448 | re_extmatch_out->matches[i] = |
6449 | vim_strnsave(reg_getline(mpos->start_lnum) + mpos->start_col, |
6450 | mpos->end_col - mpos->start_col); |
6451 | } |
6452 | } else { |
6453 | struct linepos *lpos = &subs.synt.list.line[i]; |
6454 | |
6455 | if (lpos->start != NULL && lpos->end != NULL) |
6456 | re_extmatch_out->matches[i] = |
6457 | vim_strnsave(lpos->start, |
6458 | (int)(lpos->end - lpos->start)); |
6459 | } |
6460 | } |
6461 | } |
6462 | |
6463 | return 1 + reglnum; |
6464 | } |
6465 | |
6466 | /// Match a regexp against a string ("line" points to the string) or multiple |
6467 | /// lines ("line" is NULL, use reg_getline()). |
6468 | /// |
6469 | /// @param line String in which to search or NULL |
6470 | /// @param startcol Column to start looking for match |
6471 | /// @param tm Timeout limit or NULL |
6472 | /// @param timed_out Flag set on timeout or NULL |
6473 | /// |
6474 | /// @return <= 0 if there is no match and number of lines contained in the |
6475 | /// match otherwise. |
6476 | static long nfa_regexec_both(char_u *line, colnr_T startcol, |
6477 | proftime_T *tm, int *timed_out) |
6478 | { |
6479 | nfa_regprog_T *prog; |
6480 | long retval = 0L; |
6481 | int i; |
6482 | colnr_T col = startcol; |
6483 | |
6484 | if (REG_MULTI) { |
6485 | prog = (nfa_regprog_T *)rex.reg_mmatch->regprog; |
6486 | line = reg_getline((linenr_T)0); // relative to the cursor |
6487 | rex.reg_startpos = rex.reg_mmatch->startpos; |
6488 | rex.reg_endpos = rex.reg_mmatch->endpos; |
6489 | } else { |
6490 | prog = (nfa_regprog_T *)rex.reg_match->regprog; |
6491 | rex.reg_startp = rex.reg_match->startp; |
6492 | rex.reg_endp = rex.reg_match->endp; |
6493 | } |
6494 | |
6495 | /* Be paranoid... */ |
6496 | if (prog == NULL || line == NULL) { |
6497 | EMSG(_(e_null)); |
6498 | goto theend; |
6499 | } |
6500 | |
6501 | // If pattern contains "\c" or "\C": overrule value of rex.reg_ic |
6502 | if (prog->regflags & RF_ICASE) { |
6503 | rex.reg_ic = true; |
6504 | } else if (prog->regflags & RF_NOICASE) { |
6505 | rex.reg_ic = false; |
6506 | } |
6507 | |
6508 | // If pattern contains "\Z" overrule value of rex.reg_icombine |
6509 | if (prog->regflags & RF_ICOMBINE) { |
6510 | rex.reg_icombine = true; |
6511 | } |
6512 | |
6513 | regline = line; |
6514 | reglnum = 0; /* relative to line */ |
6515 | |
6516 | nfa_has_zend = prog->has_zend; |
6517 | nfa_has_backref = prog->has_backref; |
6518 | nfa_nsubexpr = prog->nsubexp; |
6519 | nfa_listid = 1; |
6520 | nfa_alt_listid = 2; |
6521 | nfa_regengine.expr = prog->pattern; |
6522 | |
6523 | if (prog->reganch && col > 0) |
6524 | return 0L; |
6525 | |
6526 | need_clear_subexpr = TRUE; |
6527 | /* Clear the external match subpointers if necessary. */ |
6528 | if (prog->reghasz == REX_SET) { |
6529 | nfa_has_zsubexpr = TRUE; |
6530 | need_clear_zsubexpr = TRUE; |
6531 | } else |
6532 | nfa_has_zsubexpr = FALSE; |
6533 | |
6534 | if (prog->regstart != NUL) { |
6535 | /* Skip ahead until a character we know the match must start with. |
6536 | * When there is none there is no match. */ |
6537 | if (skip_to_start(prog->regstart, &col) == FAIL) |
6538 | return 0L; |
6539 | |
6540 | // If match_text is set it contains the full text that must match. |
6541 | // Nothing else to try. Doesn't handle combining chars well. |
6542 | if (prog->match_text != NULL && !rex.reg_icombine) { |
6543 | return find_match_text(col, prog->regstart, prog->match_text); |
6544 | } |
6545 | } |
6546 | |
6547 | // If the start column is past the maximum column: no need to try. |
6548 | if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol) { |
6549 | goto theend; |
6550 | } |
6551 | |
6552 | nstate = prog->nstate; |
6553 | for (i = 0; i < nstate; ++i) { |
6554 | prog->state[i].id = i; |
6555 | prog->state[i].lastlist[0] = 0; |
6556 | prog->state[i].lastlist[1] = 0; |
6557 | } |
6558 | |
6559 | retval = nfa_regtry(prog, col, tm, timed_out); |
6560 | |
6561 | nfa_regengine.expr = NULL; |
6562 | |
6563 | theend: |
6564 | return retval; |
6565 | } |
6566 | |
6567 | /* |
6568 | * Compile a regular expression into internal code for the NFA matcher. |
6569 | * Returns the program in allocated space. Returns NULL for an error. |
6570 | */ |
6571 | static regprog_T *nfa_regcomp(char_u *expr, int re_flags) |
6572 | { |
6573 | nfa_regprog_T *prog = NULL; |
6574 | int *postfix; |
6575 | |
6576 | if (expr == NULL) |
6577 | return NULL; |
6578 | |
6579 | nfa_regengine.expr = expr; |
6580 | nfa_re_flags = re_flags; |
6581 | |
6582 | init_class_tab(); |
6583 | |
6584 | nfa_regcomp_start(expr, re_flags); |
6585 | |
6586 | // Build postfix form of the regexp. Needed to build the NFA |
6587 | // (and count its size). |
6588 | postfix = re2post(); |
6589 | if (postfix == NULL) { |
6590 | goto fail; // Cascaded (syntax?) error |
6591 | } |
6592 | |
6593 | /* |
6594 | * In order to build the NFA, we parse the input regexp twice: |
6595 | * 1. first pass to count size (so we can allocate space) |
6596 | * 2. second to emit code |
6597 | */ |
6598 | #ifdef REGEXP_DEBUG |
6599 | { |
6600 | FILE *f = fopen(NFA_REGEXP_RUN_LOG, "a" ); |
6601 | |
6602 | if (f != NULL) { |
6603 | fprintf(f, |
6604 | "\n*****************************\n\n\n\n\t" |
6605 | "Compiling regexp \"%s\"... hold on !\n" , |
6606 | expr); |
6607 | fclose(f); |
6608 | } |
6609 | } |
6610 | #endif |
6611 | |
6612 | /* |
6613 | * PASS 1 |
6614 | * Count number of NFA states in "nstate". Do not build the NFA. |
6615 | */ |
6616 | post2nfa(postfix, post_ptr, TRUE); |
6617 | |
6618 | /* allocate the regprog with space for the compiled regexp */ |
6619 | size_t prog_size = sizeof(nfa_regprog_T) + sizeof(nfa_state_T) * (nstate - 1); |
6620 | prog = xmalloc(prog_size); |
6621 | state_ptr = prog->state; |
6622 | |
6623 | /* |
6624 | * PASS 2 |
6625 | * Build the NFA |
6626 | */ |
6627 | prog->start = post2nfa(postfix, post_ptr, FALSE); |
6628 | if (prog->start == NULL) |
6629 | goto fail; |
6630 | |
6631 | prog->regflags = regflags; |
6632 | prog->engine = &nfa_regengine; |
6633 | prog->nstate = nstate; |
6634 | prog->has_zend = nfa_has_zend; |
6635 | prog->has_backref = nfa_has_backref; |
6636 | prog->nsubexp = regnpar; |
6637 | |
6638 | nfa_postprocess(prog); |
6639 | |
6640 | prog->reganch = nfa_get_reganch(prog->start, 0); |
6641 | prog->regstart = nfa_get_regstart(prog->start, 0); |
6642 | prog->match_text = nfa_get_match_text(prog->start); |
6643 | |
6644 | #ifdef REGEXP_DEBUG |
6645 | nfa_postfix_dump(expr, OK); |
6646 | nfa_dump(prog); |
6647 | #endif |
6648 | /* Remember whether this pattern has any \z specials in it. */ |
6649 | prog->reghasz = re_has_z; |
6650 | prog->pattern = vim_strsave(expr); |
6651 | nfa_regengine.expr = NULL; |
6652 | |
6653 | out: |
6654 | xfree(post_start); |
6655 | post_start = post_ptr = post_end = NULL; |
6656 | state_ptr = NULL; |
6657 | return (regprog_T *)prog; |
6658 | |
6659 | fail: |
6660 | XFREE_CLEAR(prog); |
6661 | #ifdef REGEXP_DEBUG |
6662 | nfa_postfix_dump(expr, FAIL); |
6663 | #endif |
6664 | nfa_regengine.expr = NULL; |
6665 | goto out; |
6666 | } |
6667 | |
6668 | /* |
6669 | * Free a compiled regexp program, returned by nfa_regcomp(). |
6670 | */ |
6671 | static void nfa_regfree(regprog_T *prog) |
6672 | { |
6673 | if (prog != NULL) { |
6674 | xfree(((nfa_regprog_T *)prog)->match_text); |
6675 | xfree(((nfa_regprog_T *)prog)->pattern); |
6676 | xfree(prog); |
6677 | } |
6678 | } |
6679 | |
6680 | /* |
6681 | * Match a regexp against a string. |
6682 | * "rmp->regprog" is a compiled regexp as returned by nfa_regcomp(). |
6683 | * Uses curbuf for line count and 'iskeyword'. |
6684 | * If "line_lbr" is true, consider a "\n" in "line" to be a line break. |
6685 | * |
6686 | * Returns <= 0 for failure, number of lines contained in the match otherwise. |
6687 | */ |
6688 | static int |
6689 | nfa_regexec_nl ( |
6690 | regmatch_T *rmp, |
6691 | char_u *line, /* string to match against */ |
6692 | colnr_T col, /* column to start looking for match */ |
6693 | bool line_lbr |
6694 | ) |
6695 | { |
6696 | rex.reg_match = rmp; |
6697 | rex.reg_mmatch = NULL; |
6698 | rex.reg_maxline = 0; |
6699 | rex.reg_line_lbr = line_lbr; |
6700 | rex.reg_buf = curbuf; |
6701 | rex.reg_win = NULL; |
6702 | rex.reg_ic = rmp->rm_ic; |
6703 | rex.reg_icombine = false; |
6704 | rex.reg_maxcol = 0; |
6705 | return nfa_regexec_both(line, col, NULL, NULL); |
6706 | } |
6707 | |
6708 | /// Matches a regexp against multiple lines. |
6709 | /// "rmp->regprog" is a compiled regexp as returned by vim_regcomp(). |
6710 | /// Uses curbuf for line count and 'iskeyword'. |
6711 | /// |
6712 | /// @param win Window in which to search or NULL |
6713 | /// @param buf Buffer in which to search |
6714 | /// @param lnum Number of line to start looking for match |
6715 | /// @param col Column to start looking for match |
6716 | /// @param tm Timeout limit or NULL |
6717 | /// @param timed_out Flag set on timeout or NULL |
6718 | /// |
6719 | /// @return <= 0 if there is no match and number of lines contained in the match |
6720 | /// otherwise. |
6721 | /// |
6722 | /// @note The body is the same as bt_regexec() except for nfa_regexec_both() |
6723 | /// |
6724 | /// @warning |
6725 | /// Match may actually be in another line. e.g.: |
6726 | /// when r.e. is \nc, cursor is at 'a' and the text buffer looks like |
6727 | /// |
6728 | /// @par |
6729 | /// |
6730 | /// +-------------------------+ |
6731 | /// |a | |
6732 | /// |b | |
6733 | /// |c | |
6734 | /// | | |
6735 | /// +-------------------------+ |
6736 | /// |
6737 | /// @par |
6738 | /// then nfa_regexec_multi() returns 3. while the original vim_regexec_multi() |
6739 | /// returns 0 and a second call at line 2 will return 2. |
6740 | /// |
6741 | /// @par |
6742 | /// FIXME if this behavior is not compatible. |
6743 | static long nfa_regexec_multi(regmmatch_T *rmp, win_T *win, buf_T *buf, |
6744 | linenr_T lnum, colnr_T col, |
6745 | proftime_T *tm, int *timed_out) |
6746 | { |
6747 | rex.reg_match = NULL; |
6748 | rex.reg_mmatch = rmp; |
6749 | rex.reg_buf = buf; |
6750 | rex.reg_win = win; |
6751 | rex.reg_firstlnum = lnum; |
6752 | rex.reg_maxline = rex.reg_buf->b_ml.ml_line_count - lnum; |
6753 | rex.reg_line_lbr = false; |
6754 | rex.reg_ic = rmp->rmm_ic; |
6755 | rex.reg_icombine = false; |
6756 | rex.reg_maxcol = rmp->rmm_maxcol; |
6757 | |
6758 | return nfa_regexec_both(NULL, col, tm, timed_out); |
6759 | } |
6760 | |