1// Scintilla source code edit control
2/** @file RESearch.cxx
3 ** Regular expression search library.
4 **/
5
6/*
7 * regex - Regular expression pattern matching and replacement
8 *
9 * By: Ozan S. Yigit (oz)
10 * Dept. of Computer Science
11 * York University
12 *
13 * Original code available from http://www.cs.yorku.ca/~oz/
14 * Translation to C++ by Neil Hodgson neilh@scintilla.org
15 * Removed all use of register.
16 * Converted to modern function prototypes.
17 * Put all global/static variables into an object so this code can be
18 * used from multiple threads, etc.
19 * Some extensions by Philippe Lhoste PhiLho(a)GMX.net
20 * '?' extensions by Michael Mullin masmullin@gmail.com
21 *
22 * These routines are the PUBLIC DOMAIN equivalents of regex
23 * routines as found in 4.nBSD UN*X, with minor extensions.
24 *
25 * These routines are derived from various implementations found
26 * in software tools books, and Conroy's grep. They are NOT derived
27 * from licensed/restricted software.
28 * For more interesting/academic/complicated implementations,
29 * see Henry Spencer's regexp routines, or GNU Emacs pattern
30 * matching module.
31 *
32 * Modification history removed.
33 *
34 * Interfaces:
35 * RESearch::Compile: compile a regular expression into a NFA.
36 *
37 * const char *RESearch::Compile(const char *pattern, int length,
38 * bool caseSensitive, bool posix)
39 *
40 * Returns a short error string if they fail.
41 *
42 * RESearch::Execute: execute the NFA to match a pattern.
43 *
44 * int RESearch::Execute(characterIndexer &ci, int lp, int endp)
45 *
46 * re_fail: failure routine for RESearch::Execute. (no longer used)
47 *
48 * void re_fail(char *msg, char op)
49 *
50 * Regular Expressions:
51 *
52 * [1] char matches itself, unless it is a special
53 * character (metachar): . \ [ ] * + ? ^ $
54 * and ( ) if posix option.
55 *
56 * [2] . matches any character.
57 *
58 * [3] \ matches the character following it, except:
59 * - \a, \b, \f, \n, \r, \t, \v match the corresponding C
60 * escape char, respectively BEL, BS, FF, LF, CR, TAB and VT;
61 * Note that \r and \n are never matched because Scintilla
62 * regex searches are made line per line
63 * (stripped of end-of-line chars).
64 * - if not in posix mode, when followed by a
65 * left or right round bracket (see [8]);
66 * - when followed by a digit 1 to 9 (see [9]);
67 * - when followed by a left or right angle bracket
68 * (see [10]);
69 * - when followed by d, D, s, S, w or W (see [11]);
70 * - when followed by x and two hexa digits (see [12].
71 * Backslash is used as an escape character for all
72 * other meta-characters, and itself.
73 *
74 * [4] [set] matches one of the characters in the set.
75 * If the first character in the set is "^",
76 * it matches the characters NOT in the set, i.e.
77 * complements the set. A shorthand S-E (start dash end)
78 * is used to specify a set of characters S up to
79 * E, inclusive. S and E must be characters, otherwise
80 * the dash is taken literally (eg. in expression [\d-a]).
81 * The special characters "]" and "-" have no special
82 * meaning if they appear as the first chars in the set.
83 * To include both, put - first: [-]A-Z]
84 * (or just backslash them).
85 * examples: match:
86 *
87 * [-]|] matches these 3 chars,
88 *
89 * []-|] matches from ] to | chars
90 *
91 * [a-z] any lowercase alpha
92 *
93 * [^-]] any char except - and ]
94 *
95 * [^A-Z] any char except uppercase
96 * alpha
97 *
98 * [a-zA-Z] any alpha
99 *
100 * [5] * any regular expression form [1] to [4]
101 * (except [8], [9] and [10] forms of [3]),
102 * followed by closure char (*)
103 * matches zero or more matches of that form.
104 *
105 * [6] + same as [5], except it matches one or more.
106 *
107 * [5-6] Both [5] and [6] are greedy (they match as much as possible).
108 * Unless they are followed by the 'lazy' quantifier (?)
109 * In which case both [5] and [6] try to match as little as possible
110 *
111 * [7] ? same as [5] except it matches zero or one.
112 *
113 * [8] a regular expression in the form [1] to [13], enclosed
114 * as \(form\) (or (form) with posix flag) matches what
115 * form matches. The enclosure creates a set of tags,
116 * used for [9] and for pattern substitution.
117 * The tagged forms are numbered starting from 1.
118 *
119 * [9] a \ followed by a digit 1 to 9 matches whatever a
120 * previously tagged regular expression ([8]) matched.
121 *
122 * [10] \< a regular expression starting with a \< construct
123 * \> and/or ending with a \> construct, restricts the
124 * pattern matching to the beginning of a word, and/or
125 * the end of a word. A word is defined to be a character
126 * string beginning and/or ending with the characters
127 * A-Z a-z 0-9 and _. Scintilla extends this definition
128 * by user setting. The word must also be preceded and/or
129 * followed by any character outside those mentioned.
130 *
131 * [11] \l a backslash followed by d, D, s, S, w or W,
132 * becomes a character class (both inside and
133 * outside sets []).
134 * d: decimal digits
135 * D: any char except decimal digits
136 * s: whitespace (space, \t \n \r \f \v)
137 * S: any char except whitespace (see above)
138 * w: alphanumeric & underscore (changed by user setting)
139 * W: any char except alphanumeric & underscore (see above)
140 *
141 * [12] \xHH a backslash followed by x and two hexa digits,
142 * becomes the character whose ASCII code is equal
143 * to these digits. If not followed by two digits,
144 * it is 'x' char itself.
145 *
146 * [13] a composite regular expression xy where x and y
147 * are in the form [1] to [12] matches the longest
148 * match of x followed by a match for y.
149 *
150 * [14] ^ a regular expression starting with a ^ character
151 * $ and/or ending with a $ character, restricts the
152 * pattern matching to the beginning of the line,
153 * or the end of line. [anchors] Elsewhere in the
154 * pattern, ^ and $ are treated as ordinary characters.
155 *
156 *
157 * Acknowledgements:
158 *
159 * HCR's Hugh Redelmeier has been most helpful in various
160 * stages of development. He convinced me to include BOW
161 * and EOW constructs, originally invented by Rob Pike at
162 * the University of Toronto.
163 *
164 * References:
165 * Software tools Kernighan & Plauger
166 * Software tools in Pascal Kernighan & Plauger
167 * Grep [rsx-11 C dist] David Conroy
168 * ed - text editor Un*x Programmer's Manual
169 * Advanced editing on Un*x B. W. Kernighan
170 * RegExp routines Henry Spencer
171 *
172 * Notes:
173 *
174 * This implementation uses a bit-set representation for character
175 * classes for speed and compactness. Each character is represented
176 * by one bit in a 256-bit block. Thus, CCL always takes a
177 * constant 32 bytes in the internal nfa, and RESearch::Execute does a single
178 * bit comparison to locate the character in the set.
179 *
180 * Examples:
181 *
182 * pattern: foo*.*
183 * compile: CHR f CHR o CLO CHR o END CLO ANY END END
184 * matches: fo foo fooo foobar fobar foxx ...
185 *
186 * pattern: fo[ob]a[rz]
187 * compile: CHR f CHR o CCL bitset CHR a CCL bitset END
188 * matches: fobar fooar fobaz fooaz
189 *
190 * pattern: foo\\+
191 * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
192 * matches: foo\ foo\\ foo\\\ ...
193 *
194 * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
195 * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
196 * matches: foo1foo foo2foo foo3foo
197 *
198 * pattern: \(fo.*\)-\1
199 * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
200 * matches: foo-foo fo-fo fob-fob foobar-foobar ...
201 */
202
203#include <cstddef>
204#include <cstdlib>
205
206#include <stdexcept>
207#include <string>
208#include <algorithm>
209#include <iterator>
210
211#include "Position.h"
212#include "CharClassify.h"
213#include "RESearch.h"
214
215using namespace Scintilla::Internal;
216
217#define OKP 1
218#define NOP 0
219
220#define CHR 1
221#define ANY 2
222#define CCL 3
223#define BOL 4
224#define EOL 5
225#define BOT 6
226#define EOT 7
227#define BOW 8
228#define EOW 9
229#define REF 10
230#define CLO 11
231#define CLQ 12 /* 0 to 1 closure */
232#define LCLO 13 /* lazy closure */
233
234#define END 0
235
236/*
237 * The following defines are not meant to be changeable.
238 * They are for readability only.
239 */
240#define BITIND 07
241
242#define badpat(x) (*nfa = END, x)
243
244/*
245 * Character classification table for word boundary operators BOW
246 * and EOW is passed in by the creator of this object (Scintilla
247 * Document). The Document default state is that word chars are:
248 * 0-9, a-z, A-Z and _
249 */
250
251RESearch::RESearch(CharClassify *charClassTable) {
252 failure = 0;
253 charClass = charClassTable;
254 sta = NOP; /* status of lastpat */
255 bol = 0;
256 constexpr unsigned char nul = 0;
257 std::fill(bittab, std::end(bittab), nul);
258 std::fill(tagstk, std::end(tagstk), 0);
259 std::fill(nfa, std::end(nfa), '\0');
260 Clear();
261}
262
263void RESearch::Clear() noexcept {
264 for (int i = 0; i < MAXTAG; i++) {
265 pat[i].clear();
266 bopat[i] = NOTFOUND;
267 eopat[i] = NOTFOUND;
268 }
269}
270
271void RESearch::GrabMatches(const CharacterIndexer &ci) {
272 for (unsigned int i = 0; i < MAXTAG; i++) {
273 if ((bopat[i] != NOTFOUND) && (eopat[i] != NOTFOUND)) {
274 const Sci::Position len = eopat[i] - bopat[i];
275 pat[i].resize(len);
276 for (Sci::Position j = 0; j < len; j++)
277 pat[i][j] = ci.CharAt(bopat[i] + j);
278 }
279 }
280}
281
282void RESearch::ChSet(unsigned char c) noexcept {
283 bittab[c >> 3] |= 1 << (c & BITIND);
284}
285
286void RESearch::ChSetWithCase(unsigned char c, bool caseSensitive) noexcept {
287 ChSet(c);
288 if (!caseSensitive) {
289 if ((c >= 'a') && (c <= 'z')) {
290 ChSet(c - 'a' + 'A');
291 } else if ((c >= 'A') && (c <= 'Z')) {
292 ChSet(c - 'A' + 'a');
293 }
294 }
295}
296
297namespace {
298
299constexpr unsigned char escapeValue(unsigned char ch) noexcept {
300 switch (ch) {
301 case 'a': return '\a';
302 case 'b': return '\b';
303 case 'f': return '\f';
304 case 'n': return '\n';
305 case 'r': return '\r';
306 case 't': return '\t';
307 case 'v': return '\v';
308 }
309 return 0;
310}
311
312constexpr int GetHexaChar(unsigned char hd1, unsigned char hd2) noexcept {
313 int hexValue = 0;
314 if (hd1 >= '0' && hd1 <= '9') {
315 hexValue += 16 * (hd1 - '0');
316 } else if (hd1 >= 'A' && hd1 <= 'F') {
317 hexValue += 16 * (hd1 - 'A' + 10);
318 } else if (hd1 >= 'a' && hd1 <= 'f') {
319 hexValue += 16 * (hd1 - 'a' + 10);
320 } else {
321 return -1;
322 }
323 if (hd2 >= '0' && hd2 <= '9') {
324 hexValue += hd2 - '0';
325 } else if (hd2 >= 'A' && hd2 <= 'F') {
326 hexValue += hd2 - 'A' + 10;
327 } else if (hd2 >= 'a' && hd2 <= 'f') {
328 hexValue += hd2 - 'a' + 10;
329 } else {
330 return -1;
331 }
332 return hexValue;
333}
334
335constexpr int isinset(const char *ap, unsigned char c) noexcept {
336 return ap[c >> 3] & (1 << (c & BITIND));
337}
338
339}
340
341/**
342 * Called when the parser finds a backslash not followed
343 * by a valid expression (like \( in non-Posix mode).
344 * @param pattern : pointer on the char after the backslash.
345 * @param incr : (out) number of chars to skip after expression evaluation.
346 * @return the char if it resolves to a simple char,
347 * or -1 for a char class. In this case, bittab is changed.
348 */
349int RESearch::GetBackslashExpression(
350 const char *pattern,
351 int &incr) noexcept {
352 // Since error reporting is primitive and messages are not used anyway,
353 // I choose to interpret unexpected syntax in a logical way instead
354 // of reporting errors. Otherwise, we can stick on, eg., PCRE behaviour.
355 incr = 0; // Most of the time, will skip the char "naturally".
356 int c;
357 int result = -1;
358 const unsigned char bsc = *pattern;
359 if (!bsc) {
360 // Avoid overrun
361 result = '\\'; // \ at end of pattern, take it literally
362 return result;
363 }
364
365 switch (bsc) {
366 case 'a':
367 case 'b':
368 case 'n':
369 case 'f':
370 case 'r':
371 case 't':
372 case 'v':
373 result = escapeValue(bsc);
374 break;
375 case 'x': {
376 const unsigned char hd1 = *(pattern + 1);
377 const unsigned char hd2 = *(pattern + 2);
378 const int hexValue = GetHexaChar(hd1, hd2);
379 if (hexValue >= 0) {
380 result = hexValue;
381 incr = 2; // Must skip the digits
382 } else {
383 result = 'x'; // \x without 2 digits: see it as 'x'
384 }
385 }
386 break;
387 case 'd':
388 for (c = '0'; c <= '9'; c++) {
389 ChSet(static_cast<unsigned char>(c));
390 }
391 break;
392 case 'D':
393 for (c = 0; c < MAXCHR; c++) {
394 if (c < '0' || c > '9') {
395 ChSet(static_cast<unsigned char>(c));
396 }
397 }
398 break;
399 case 's':
400 ChSet(' ');
401 ChSet('\t');
402 ChSet('\n');
403 ChSet('\r');
404 ChSet('\f');
405 ChSet('\v');
406 break;
407 case 'S':
408 for (c = 0; c < MAXCHR; c++) {
409 if (c != ' ' && !(c >= 0x09 && c <= 0x0D)) {
410 ChSet(static_cast<unsigned char>(c));
411 }
412 }
413 break;
414 case 'w':
415 for (c = 0; c < MAXCHR; c++) {
416 if (iswordc(static_cast<unsigned char>(c))) {
417 ChSet(static_cast<unsigned char>(c));
418 }
419 }
420 break;
421 case 'W':
422 for (c = 0; c < MAXCHR; c++) {
423 if (!iswordc(static_cast<unsigned char>(c))) {
424 ChSet(static_cast<unsigned char>(c));
425 }
426 }
427 break;
428 default:
429 result = bsc;
430 }
431 return result;
432}
433
434const char *RESearch::Compile(const char *pattern, Sci::Position length, bool caseSensitive, bool posix) noexcept {
435 char *mp=nfa; /* nfa pointer */
436 char *lp; /* saved pointer */
437 char *sp=nfa; /* another one */
438 char *mpMax = mp + MAXNFA - BITBLK - 10;
439
440 int tagi = 0; /* tag stack index */
441 int tagc = 1; /* actual tag count */
442
443 int n;
444 char mask; /* xor mask -CCL/NCL */
445 int c1, c2, prevChar;
446
447 if (!pattern || !length) {
448 if (sta)
449 return nullptr;
450 else
451 return badpat("No previous regular expression");
452 }
453 sta = NOP;
454
455 const char *p=pattern; /* pattern pointer */
456 for (int i=0; i<length; i++, p++) {
457 if (mp > mpMax)
458 return badpat("Pattern too long");
459 lp = mp;
460 switch (*p) {
461
462 case '.': /* match any char */
463 *mp++ = ANY;
464 break;
465
466 case '^': /* match beginning */
467 if (p == pattern) {
468 *mp++ = BOL;
469 } else {
470 *mp++ = CHR;
471 *mp++ = *p;
472 }
473 break;
474
475 case '$': /* match endofline */
476 if (!*(p+1)) {
477 *mp++ = EOL;
478 } else {
479 *mp++ = CHR;
480 *mp++ = *p;
481 }
482 break;
483
484 case '[': /* match char class */
485 *mp++ = CCL;
486 prevChar = 0;
487
488 i++;
489 if (*++p == '^') {
490 mask = '\377';
491 i++;
492 p++;
493 } else {
494 mask = 0;
495 }
496
497 if (*p == '-') { /* real dash */
498 i++;
499 prevChar = *p;
500 ChSet(*p++);
501 }
502 if (*p == ']') { /* real brace */
503 i++;
504 prevChar = *p;
505 ChSet(*p++);
506 }
507 while (*p && *p != ']') {
508 if (*p == '-') {
509 if (prevChar < 0) {
510 // Previous def. was a char class like \d, take dash literally
511 prevChar = *p;
512 ChSet(*p);
513 } else if (*(p+1)) {
514 if (*(p+1) != ']') {
515 c1 = prevChar + 1;
516 i++;
517 c2 = static_cast<unsigned char>(*++p);
518 if (c2 == '\\') {
519 if (!*(p+1)) { // End of RE
520 return badpat("Missing ]");
521 } else {
522 i++;
523 p++;
524 int incr;
525 c2 = GetBackslashExpression(p, incr);
526 i += incr;
527 p += incr;
528 if (c2 >= 0) {
529 // Convention: \c (c is any char) is case sensitive, whatever the option
530 ChSet(static_cast<unsigned char>(c2));
531 prevChar = c2;
532 } else {
533 // bittab is already changed
534 prevChar = -1;
535 }
536 }
537 }
538 if (prevChar < 0) {
539 // Char after dash is char class like \d, take dash literally
540 prevChar = '-';
541 ChSet('-');
542 } else {
543 // Put all chars between c1 and c2 included in the char set
544 while (c1 <= c2) {
545 ChSetWithCase(static_cast<unsigned char>(c1++), caseSensitive);
546 }
547 }
548 } else {
549 // Dash before the ], take it literally
550 prevChar = *p;
551 ChSet(*p);
552 }
553 } else {
554 return badpat("Missing ]");
555 }
556 } else if (*p == '\\' && *(p+1)) {
557 i++;
558 p++;
559 int incr;
560 const int c = GetBackslashExpression(p, incr);
561 i += incr;
562 p += incr;
563 if (c >= 0) {
564 // Convention: \c (c is any char) is case sensitive, whatever the option
565 ChSet(static_cast<unsigned char>(c));
566 prevChar = c;
567 } else {
568 // bittab is already changed
569 prevChar = -1;
570 }
571 } else {
572 prevChar = static_cast<unsigned char>(*p);
573 ChSetWithCase(*p, caseSensitive);
574 }
575 i++;
576 p++;
577 }
578 if (!*p)
579 return badpat("Missing ]");
580
581 for (n = 0; n < BITBLK; bittab[n++] = 0)
582 *mp++ = static_cast<char>(mask ^ bittab[n]);
583
584 break;
585
586 case '*': /* match 0 or more... */
587 case '+': /* match 1 or more... */
588 case '?':
589 if (p == pattern)
590 return badpat("Empty closure");
591 lp = sp; /* previous opcode */
592 if (*lp == CLO || *lp == LCLO) /* equivalence... */
593 break;
594 switch (*lp) {
595
596 case BOL:
597 case BOT:
598 case EOT:
599 case BOW:
600 case EOW:
601 case REF:
602 return badpat("Illegal closure");
603 default:
604 break;
605 }
606
607 if (*p == '+')
608 for (sp = mp; lp < sp; lp++)
609 *mp++ = *lp;
610
611 *mp++ = END;
612 *mp++ = END;
613 sp = mp;
614
615 while (--mp > lp)
616 *mp = mp[-1];
617 if (*p == '?') *mp = CLQ;
618 else if (*(p+1) == '?') *mp = LCLO;
619 else *mp = CLO;
620
621 mp = sp;
622 break;
623
624 case '\\': /* tags, backrefs... */
625 i++;
626 switch (*++p) {
627 case '<':
628 *mp++ = BOW;
629 break;
630 case '>':
631 if (*sp == BOW)
632 return badpat("Null pattern inside \\<\\>");
633 *mp++ = EOW;
634 break;
635 case '1':
636 case '2':
637 case '3':
638 case '4':
639 case '5':
640 case '6':
641 case '7':
642 case '8':
643 case '9':
644 n = *p-'0';
645 if (tagi > 0 && tagstk[tagi] == n)
646 return badpat("Cyclical reference");
647 if (tagc > n) {
648 *mp++ = REF;
649 *mp++ = static_cast<char>(n);
650 } else {
651 return badpat("Undetermined reference");
652 }
653 break;
654 default:
655 if (!posix && *p == '(') {
656 if (tagc < MAXTAG) {
657 tagstk[++tagi] = tagc;
658 *mp++ = BOT;
659 *mp++ = static_cast<char>(tagc++);
660 } else {
661 return badpat("Too many \\(\\) pairs");
662 }
663 } else if (!posix && *p == ')') {
664 if (*sp == BOT)
665 return badpat("Null pattern inside \\(\\)");
666 if (tagi > 0) {
667 *mp++ = EOT;
668 *mp++ = static_cast<char>(tagstk[tagi--]);
669 } else {
670 return badpat("Unmatched \\)");
671 }
672 } else {
673 int incr;
674 int c = GetBackslashExpression(p, incr);
675 i += incr;
676 p += incr;
677 if (c >= 0) {
678 *mp++ = CHR;
679 *mp++ = static_cast<unsigned char>(c);
680 } else {
681 *mp++ = CCL;
682 mask = 0;
683 for (n = 0; n < BITBLK; bittab[n++] = 0)
684 *mp++ = static_cast<char>(mask ^ bittab[n]);
685 }
686 }
687 }
688 break;
689
690 default : /* an ordinary char */
691 if (posix && *p == '(') {
692 if (tagc < MAXTAG) {
693 tagstk[++tagi] = tagc;
694 *mp++ = BOT;
695 *mp++ = static_cast<char>(tagc++);
696 } else {
697 return badpat("Too many () pairs");
698 }
699 } else if (posix && *p == ')') {
700 if (*sp == BOT)
701 return badpat("Null pattern inside ()");
702 if (tagi > 0) {
703 *mp++ = EOT;
704 *mp++ = static_cast<char>(tagstk[tagi--]);
705 } else {
706 return badpat("Unmatched )");
707 }
708 } else {
709 unsigned char c = *p;
710 if (!c) // End of RE
711 c = '\\'; // We take it as raw backslash
712 if (caseSensitive || !iswordc(c)) {
713 *mp++ = CHR;
714 *mp++ = c;
715 } else {
716 *mp++ = CCL;
717 mask = 0;
718 ChSetWithCase(c, false);
719 for (n = 0; n < BITBLK; bittab[n++] = 0)
720 *mp++ = static_cast<char>(mask ^ bittab[n]);
721 }
722 }
723 break;
724 }
725 sp = lp;
726 }
727 if (tagi > 0)
728 return badpat((posix ? "Unmatched (" : "Unmatched \\("));
729 *mp = END;
730 sta = OKP;
731 return nullptr;
732}
733
734/*
735 * RESearch::Execute:
736 * execute nfa to find a match.
737 *
738 * special cases: (nfa[0])
739 * BOL
740 * Match only once, starting from the
741 * beginning.
742 * CHR
743 * First locate the character without
744 * calling PMatch, and if found, call
745 * PMatch for the remaining string.
746 * END
747 * RESearch::Compile failed, poor luser did not
748 * check for it. Fail fast.
749 *
750 * If a match is found, bopat[0] and eopat[0] are set
751 * to the beginning and the end of the matched fragment,
752 * respectively.
753 *
754 */
755int RESearch::Execute(const CharacterIndexer &ci, Sci::Position lp, Sci::Position endp) {
756 unsigned char c;
757 Sci::Position ep = NOTFOUND;
758 char *ap = nfa;
759
760 bol = lp;
761 failure = 0;
762
763 Clear();
764
765 switch (*ap) {
766
767 case BOL: /* anchored: match from BOL only */
768 ep = PMatch(ci, lp, endp, ap);
769 break;
770 case EOL: /* just searching for end of line normal path doesn't work */
771 if (*(ap+1) == END) {
772 lp = endp;
773 ep = lp;
774 break;
775 } else {
776 return 0;
777 }
778 case CHR: /* ordinary char: locate it fast */
779 c = *(ap+1);
780 while ((lp < endp) && (static_cast<unsigned char>(ci.CharAt(lp)) != c))
781 lp++;
782 if (lp >= endp) /* if EOS, fail, else fall through. */
783 return 0;
784 [[fallthrough]];
785 default: /* regular matching all the way. */
786 while (lp < endp) {
787 ep = PMatch(ci, lp, endp, ap);
788 if (ep != NOTFOUND)
789 break;
790 lp++;
791 }
792 break;
793 case END: /* munged automaton. fail always */
794 return 0;
795 }
796 if (ep == NOTFOUND)
797 return 0;
798
799 bopat[0] = lp;
800 eopat[0] = ep;
801 return 1;
802}
803
804/*
805 * PMatch: internal routine for the hard part
806 *
807 * This code is partly snarfed from an early grep written by
808 * David Conroy. The backref and tag stuff, and various other
809 * innovations are by oz.
810 *
811 * special case optimizations: (nfa[n], nfa[n+1])
812 * CLO ANY
813 * We KNOW .* will match everything up to the
814 * end of line. Thus, directly go to the end of
815 * line, without recursive PMatch calls. As in
816 * the other closure cases, the remaining pattern
817 * must be matched by moving backwards on the
818 * string recursively, to find a match for xy
819 * (x is ".*" and y is the remaining pattern)
820 * where the match satisfies the LONGEST match for
821 * x followed by a match for y.
822 * CLO CHR
823 * We can again scan the string forward for the
824 * single char and at the point of failure, we
825 * execute the remaining nfa recursively, same as
826 * above.
827 *
828 * At the end of a successful match, bopat[n] and eopat[n]
829 * are set to the beginning and end of subpatterns matched
830 * by tagged expressions (n = 1 to 9).
831 */
832
833//extern void re_fail(char *,char);
834
835/*
836 * skip values for CLO XXX to skip past the closure
837 */
838
839#define ANYSKIP 2 /* [CLO] ANY END */
840#define CHRSKIP 3 /* [CLO] CHR chr END */
841#define CCLSKIP 34 /* [CLO] CCL 32 bytes END */
842
843Sci::Position RESearch::PMatch(const CharacterIndexer &ci, Sci::Position lp, Sci::Position endp, char *ap) {
844 int op, c, n;
845 Sci::Position e; /* extra pointer for CLO */
846 Sci::Position bp; /* beginning of subpat... */
847 Sci::Position ep; /* ending of subpat... */
848 Sci::Position are; /* to save the line ptr. */
849 Sci::Position llp; /* lazy lp for LCLO */
850
851 while ((op = *ap++) != END)
852 switch (op) {
853
854 case CHR:
855 if (ci.CharAt(lp++) != *ap++)
856 return NOTFOUND;
857 break;
858 case ANY:
859 if (lp++ >= endp)
860 return NOTFOUND;
861 break;
862 case CCL:
863 if (lp >= endp)
864 return NOTFOUND;
865 if (!isinset(ap, ci.CharAt(lp++)))
866 return NOTFOUND;
867 ap += BITBLK;
868 break;
869 case BOL:
870 if (lp != bol)
871 return NOTFOUND;
872 break;
873 case EOL:
874 if (lp < endp)
875 return NOTFOUND;
876 break;
877 case BOT:
878 bopat[static_cast<int>(*ap++)] = lp;
879 break;
880 case EOT:
881 eopat[static_cast<int>(*ap++)] = lp;
882 break;
883 case BOW:
884 if ((lp!=bol && iswordc(ci.CharAt(lp-1))) || !iswordc(ci.CharAt(lp)))
885 return NOTFOUND;
886 break;
887 case EOW:
888 if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp)))
889 return NOTFOUND;
890 break;
891 case REF:
892 n = *ap++;
893 bp = bopat[n];
894 ep = eopat[n];
895 while (bp < ep)
896 if (ci.CharAt(bp++) != ci.CharAt(lp++))
897 return NOTFOUND;
898 break;
899 case LCLO:
900 case CLQ:
901 case CLO:
902 are = lp;
903 switch (*ap) {
904
905 case ANY:
906 if (op == CLO || op == LCLO)
907 while (lp < endp)
908 lp++;
909 else if (lp < endp)
910 lp++;
911
912 n = ANYSKIP;
913 break;
914 case CHR:
915 c = *(ap+1);
916 if (op == CLO || op == LCLO)
917 while ((lp < endp) && (c == ci.CharAt(lp)))
918 lp++;
919 else if ((lp < endp) && (c == ci.CharAt(lp)))
920 lp++;
921 n = CHRSKIP;
922 break;
923 case CCL:
924 while ((lp < endp) && isinset(ap+1, ci.CharAt(lp)))
925 lp++;
926 n = CCLSKIP;
927 break;
928 default:
929 failure = true;
930 //re_fail("closure: bad nfa.", *ap);
931 return NOTFOUND;
932 }
933 ap += n;
934
935 llp = lp;
936 e = NOTFOUND;
937 while (llp >= are) {
938 Sci::Position q;
939 if ((q = PMatch(ci, llp, endp, ap)) != NOTFOUND) {
940 e = q;
941 lp = llp;
942 if (op != LCLO) return e;
943 }
944 if (*ap == END) return e;
945 --llp;
946 }
947 if (*ap == EOT)
948 PMatch(ci, lp, endp, ap);
949 return e;
950 default:
951 //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
952 return NOTFOUND;
953 }
954 return lp;
955}
956
957
958