RESearch.cxx source code [DeepinIDE/3rdparty/unioncode-scintilla515/scintilla/src/RESearch.cxx]

1	// Scintilla source code edit control
2	/* @file RESearch.cxx*
3	** Regular expression search library.
4	**/
5
6	/*
7	* regex - Regular expression pattern matching and replacement
8	*
9	* By: Ozan S. Yigit (oz)
10	* Dept. of Computer Science
11	* York University
12	*
13	* Original code available from http://www.cs.yorku.ca/~oz/
14	* Translation to C++ by Neil Hodgson neilh@scintilla.org
15	* Removed all use of register.
16	* Converted to modern function prototypes.
17	* Put all global/static variables into an object so this code can be
18	* used from multiple threads, etc.
19	* Some extensions by Philippe Lhoste PhiLho(a)GMX.net
20	* '?' extensions by Michael Mullin masmullin@gmail.com
21	*
22	* These routines are the PUBLIC DOMAIN equivalents of regex
23	* routines as found in 4.nBSD UN*X, with minor extensions.
24	*
25	* These routines are derived from various implementations found
26	* in software tools books, and Conroy's grep. They are NOT derived
27	* from licensed/restricted software.
28	* For more interesting/academic/complicated implementations,
29	* see Henry Spencer's regexp routines, or GNU Emacs pattern
30	* matching module.
31	*
32	* Modification history removed.
33	*
34	* Interfaces:
35	* RESearch::Compile: compile a regular expression into a NFA.
36	*
37	* const char RESearch::Compile(const char pattern, int length,
38	* bool caseSensitive, bool posix)
39	*
40	* Returns a short error string if they fail.
41	*
42	* RESearch::Execute: execute the NFA to match a pattern.
43	*
44	* int RESearch::Execute(characterIndexer &ci, int lp, int endp)
45	*
46	* re_fail: failure routine for RESearch::Execute. (no longer used)
47	*
48	* void re_fail(char *msg, char op)
49	*
50	* Regular Expressions:
51	*
52	* [1] char matches itself, unless it is a special
53	* character (metachar): . \ [ ] * + ? ^ $
54	* and ( ) if posix option.
55	*
56	* [2] . matches any character.
57	*
58	* [3] \ matches the character following it, except:
59	* - \a, \b, \f, \n, \r, \t, \v match the corresponding C
60	* escape char, respectively BEL, BS, FF, LF, CR, TAB and VT;
61	* Note that \r and \n are never matched because Scintilla
62	* regex searches are made line per line
63	* (stripped of end-of-line chars).
64	* - if not in posix mode, when followed by a
65	* left or right round bracket (see [8]);
66	* - when followed by a digit 1 to 9 (see [9]);
67	* - when followed by a left or right angle bracket
68	* (see [10]);
69	* - when followed by d, D, s, S, w or W (see [11]);
70	* - when followed by x and two hexa digits (see [12].
71	* Backslash is used as an escape character for all
72	* other meta-characters, and itself.
73	*
74	* [4] [set] matches one of the characters in the set.
75	* If the first character in the set is "^",
76	* it matches the characters NOT in the set, i.e.
77	* complements the set. A shorthand S-E (start dash end)
78	* is used to specify a set of characters S up to
79	* E, inclusive. S and E must be characters, otherwise
80	* the dash is taken literally (eg. in expression [\d-a]).
81	* The special characters "]" and "-" have no special
82	* meaning if they appear as the first chars in the set.
83	* To include both, put - first: [-]A-Z]
84	* (or just backslash them).
85	* examples: match:
86	*
87	* [-]\|] matches these 3 chars,
88	*
89	* []-\|] matches from ] to \| chars
90	*
91	* [a-z] any lowercase alpha
92	*
93	* [^-]] any char except - and ]
94	*
95	* [^A-Z] any char except uppercase
96	* alpha
97	*
98	* [a-zA-Z] any alpha
99	*
100	* [5] * any regular expression form [1] to [4]
101	* (except [8], [9] and [10] forms of [3]),
102	* followed by closure char (*)
103	* matches zero or more matches of that form.
104	*
105	* [6] + same as [5], except it matches one or more.
106	*
107	* [5-6] Both [5] and [6] are greedy (they match as much as possible).
108	* Unless they are followed by the 'lazy' quantifier (?)
109	* In which case both [5] and [6] try to match as little as possible
110	*
111	* [7] ? same as [5] except it matches zero or one.
112	*
113	* [8] a regular expression in the form [1] to [13], enclosed
114	* as $form$ (or (form) with posix flag) matches what
115	* form matches. The enclosure creates a set of tags,
116	* used for [9] and for pattern substitution.
117	* The tagged forms are numbered starting from 1.
118	*
119	* [9] a \ followed by a digit 1 to 9 matches whatever a
120	* previously tagged regular expression ([8]) matched.
121	*
122	* [10] \< a regular expression starting with a \< construct
123	* \> and/or ending with a \> construct, restricts the
124	* pattern matching to the beginning of a word, and/or
125	* the end of a word. A word is defined to be a character
126	* string beginning and/or ending with the characters
127	* A-Z a-z 0-9 and _. Scintilla extends this definition
128	* by user setting. The word must also be preceded and/or
129	* followed by any character outside those mentioned.
130	*
131	* [11] \l a backslash followed by d, D, s, S, w or W,
132	* becomes a character class (both inside and
133	* outside sets []).
134	* d: decimal digits
135	* D: any char except decimal digits
136	* s: whitespace (space, \t \n \r \f \v)
137	* S: any char except whitespace (see above)
138	* w: alphanumeric & underscore (changed by user setting)
139	* W: any char except alphanumeric & underscore (see above)
140	*
141	* [12] \xHH a backslash followed by x and two hexa digits,
142	* becomes the character whose ASCII code is equal
143	* to these digits. If not followed by two digits,
144	* it is 'x' char itself.
145	*
146	* [13] a composite regular expression xy where x and y
147	* are in the form [1] to [12] matches the longest
148	* match of x followed by a match for y.
149	*
150	* [14] ^ a regular expression starting with a ^ character
151	* $ and/or ending with a $ character, restricts the
152	* pattern matching to the beginning of the line,
153	* or the end of line. [anchors] Elsewhere in the
154	* pattern, ^ and $ are treated as ordinary characters.
155	*
156	*
157	* Acknowledgements:
158	*
159	* HCR's Hugh Redelmeier has been most helpful in various
160	* stages of development. He convinced me to include BOW
161	* and EOW constructs, originally invented by Rob Pike at
162	* the University of Toronto.
163	*
164	* References:
165	* Software tools Kernighan & Plauger
166	* Software tools in Pascal Kernighan & Plauger
167	* Grep [rsx-11 C dist] David Conroy
168	* ed - text editor Un*x Programmer's Manual
169	* Advanced editing on Un*x B. W. Kernighan
170	* RegExp routines Henry Spencer
171	*
172	* Notes:
173	*
174	* This implementation uses a bit-set representation for character
175	* classes for speed and compactness. Each character is represented
176	* by one bit in a 256-bit block. Thus, CCL always takes a
177	* constant 32 bytes in the internal nfa, and RESearch::Execute does a single
178	* bit comparison to locate the character in the set.
179	*
180	* Examples:
181	*
182	* pattern: foo.
183	* compile: CHR f CHR o CLO CHR o END CLO ANY END END
184	* matches: fo foo fooo foobar fobar foxx ...
185	*
186	* pattern: fo[ob]a[rz]
187	* compile: CHR f CHR o CCL bitset CHR a CCL bitset END
188	* matches: fobar fooar fobaz fooaz
189	*
190	* pattern: foo\\+
191	* compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
192	* matches: foo\ foo\\ foo\\\ ...
193	*
194	* pattern: $foo$[1-3]\1 (same as foo[1-3]foo)
195	* compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
196	* matches: foo1foo foo2foo foo3foo
197	*
198	* pattern: $fo.*$-\1
199	* compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
200	* matches: foo-foo fo-fo fob-fob foobar-foobar ...
201	*/
202
203	#include <cstddef>
204	#include <cstdlib>
205
206	#include <stdexcept>
207	#include <string>
208	#include <algorithm>
209	#include <iterator>
210
211	#include "Position.h"
212	#include "CharClassify.h"
213	#include "RESearch.h"
214
215	using namespace Scintilla::Internal;
216
217	#define OKP 1
218	#define NOP 0
219
220	#define CHR 1
221	#define ANY 2
222	#define CCL 3
223	#define BOL 4
224	#define EOL 5
225	#define BOT 6
226	#define EOT 7
227	#define BOW 8
228	#define EOW 9
229	#define REF 10
230	#define CLO 11
231	#define CLQ 12 /* 0 to 1 closure */
232	#define LCLO 13 /* lazy closure */
233
234	#define END 0
235
236	/*
237	* The following defines are not meant to be changeable.
238	* They are for readability only.
239	*/
240	#define BITIND 07
241
242	#define badpat(x) (*nfa = END, x)
243
244	/*
245	* Character classification table for word boundary operators BOW
246	* and EOW is passed in by the creator of this object (Scintilla
247	* Document). The Document default state is that word chars are:
248	* 0-9, a-z, A-Z and _
249	*/
250
251	RESearch::RESearch(CharClassify *charClassTable) {
252	failure = `0`;
253	charClass = charClassTable;
254	sta = NOP; / status of lastpat /
255	bol = `0`;
256	constexpr unsigned char nul = `0`;
257	std::fill(bittab, std::end(bittab), nul);
258	std::fill(tagstk, std::end(tagstk), `0`);
259	std::fill(nfa, std::end(nfa), `'\0'`);
260	Clear();
261	}
262
263	void RESearch::Clear() noexcept {
264	for (int i = `0`; i < MAXTAG; i++) {
265	pat[i].clear();
266	bopat[i] = NOTFOUND;
267	eopat[i] = NOTFOUND;
268	}
269	}
270
271	void RESearch::GrabMatches(const CharacterIndexer &ci) {
272	for (unsigned int i = `0`; i < MAXTAG; i++) {
273	if ((bopat[i] != NOTFOUND) && (eopat[i] != NOTFOUND)) {
274	const Sci::Position len = eopat[i] - bopat[i];
275	pat[i].resize(len);
276	for (Sci::Position j = `0`; j < len; j++)
277	pat[i][j] = ci.CharAt(bopat[i] + j);
278	}
279	}
280	}
281
282	void RESearch::ChSet(unsigned char c) noexcept {
283	bittab[c >> `3`] \|= `1` << (c & BITIND);
284	}
285
286	void RESearch::ChSetWithCase(unsigned char c, bool caseSensitive) noexcept {
287	ChSet(c);
288	if (!caseSensitive) {
289	if ((c >= `'a'`) && (c <= `'z'`)) {
290	ChSet(c - `'a'` + `'A'`);
291	} else if ((c >= `'A'`) && (c <= `'Z'`)) {
292	ChSet(c - `'A'` + `'a'`);
293	}
294	}
295	}
296
297	namespace {
298
299	constexpr unsigned char escapeValue(unsigned char ch) noexcept {
300	switch (ch) {
301	case `'a'`: return `'\a'`;
302	case `'b'`: return `'\b'`;
303	case `'f'`: return `'\f'`;
304	case `'n'`: return `'\n'`;
305	case `'r'`: return `'\r'`;
306	case `'t'`: return `'\t'`;
307	case `'v'`: return `'\v'`;
308	}
309	return `0`;
310	}
311
312	constexpr int GetHexaChar(unsigned char hd1, unsigned char hd2) noexcept {
313	int hexValue = `0`;
314	if (hd1 >= `'0'` && hd1 <= `'9'`) {
315	hexValue += `16` * (hd1 - `'0'`);
316	} else if (hd1 >= `'A'` && hd1 <= `'F'`) {
317	hexValue += `16` * (hd1 - `'A'` + `10`);
318	} else if (hd1 >= `'a'` && hd1 <= `'f'`) {
319	hexValue += `16` * (hd1 - `'a'` + `10`);
320	} else {
321	return -`1`;
322	}
323	if (hd2 >= `'0'` && hd2 <= `'9'`) {
324	hexValue += hd2 - `'0'`;
325	} else if (hd2 >= `'A'` && hd2 <= `'F'`) {
326	hexValue += hd2 - `'A'` + `10`;
327	} else if (hd2 >= `'a'` && hd2 <= `'f'`) {
328	hexValue += hd2 - `'a'` + `10`;
329	} else {
330	return -`1`;
331	}
332	return hexValue;
333	}
334
335	constexpr int isinset(const char ap, unsigned* char c) noexcept {
336	return ap[c >> `3`] & (`1` << (c & BITIND));
337	}
338
339	}
340
341	/**
342	* Called when the parser finds a backslash not followed
343	* by a valid expression (like \( in non-Posix mode).
344	* @param pattern : pointer on the char after the backslash.
345	* @param incr : (out) number of chars to skip after expression evaluation.
346	* @return the char if it resolves to a simple char,
347	* or -1 for a char class. In this case, bittab is changed.
348	*/
349	int RESearch::GetBackslashExpression(
350	const char *pattern,
351	int &incr) noexcept {
352	// Since error reporting is primitive and messages are not used anyway,
353	// I choose to interpret unexpected syntax in a logical way instead
354	// of reporting errors. Otherwise, we can stick on, eg., PCRE behaviour.
355	incr = `0`; // Most of the time, will skip the char "naturally".
356	int c;
357	int result = -`1`;
358	const unsigned char bsc = *pattern;
359	if (!bsc) {
360	// Avoid overrun
361	result = `'\\'`; // \ at end of pattern, take it literally
362	return result;
363	}
364
365	switch (bsc) {
366	case `'a'`:
367	case `'b'`:
368	case `'n'`:
369	case `'f'`:
370	case `'r'`:
371	case `'t'`:
372	case `'v'`:
373	result = escapeValue(bsc);
374	break;
375	case `'x'`: {
376	const unsigned char hd1 = *(pattern + `1`);
377	const unsigned char hd2 = *(pattern + `2`);
378	const int hexValue = GetHexaChar(hd1, hd2);
379	if (hexValue >= `0`) {
380	result = hexValue;
381	incr = `2`; // Must skip the digits
382	} else {
383	result = `'x'`; // \x without 2 digits: see it as 'x'
384	}
385	}
386	break;
387	case `'d'`:
388	for (c = `'0'`; c <= `'9'`; c++) {
389	ChSet(static_cast<unsigned char>(c));
390	}
391	break;
392	case `'D'`:
393	for (c = `0`; c < MAXCHR; c++) {
394	if (c < `'0'` \|\| c > `'9'`) {
395	ChSet(static_cast<unsigned char>(c));
396	}
397	}
398	break;
399	case `'s'`:
400	ChSet(`' '`);
401	ChSet(`'\t'`);
402	ChSet(`'\n'`);
403	ChSet(`'\r'`);
404	ChSet(`'\f'`);
405	ChSet(`'\v'`);
406	break;
407	case `'S'`:
408	for (c = `0`; c < MAXCHR; c++) {
409	if (c != `' '` && !(c >= `0x09` && c <= `0x0D`)) {
410	ChSet(static_cast<unsigned char>(c));
411	}
412	}
413	break;
414	case `'w'`:
415	for (c = `0`; c < MAXCHR; c++) {
416	if (iswordc(static_cast<unsigned char>(c))) {
417	ChSet(static_cast<unsigned char>(c));
418	}
419	}
420	break;
421	case `'W'`:
422	for (c = `0`; c < MAXCHR; c++) {
423	if (!iswordc(static_cast<unsigned char>(c))) {
424	ChSet(static_cast<unsigned char>(c));
425	}
426	}
427	break;
428	default:
429	result = bsc;
430	}
431	return result;
432	}
433
434	const char RESearch::Compile(const* char pattern, Sci::Position length, bool* caseSensitive, bool posix) noexcept {
435	char mp=nfa; /* nfa pointer /
436	char lp; /* saved pointer /
437	char sp=nfa; /* another one /
438	char *mpMax = mp + MAXNFA - BITBLK - `10`;
439
440	int tagi = `0`; / tag stack index /
441	int tagc = `1`; / actual tag count /
442
443	int n;
444	char mask; / xor mask -CCL/NCL /
445	int c1, c2, prevChar;
446
447	if (!pattern \|\| !length) {
448	if (sta)
449	return nullptr;
450	else
451	return badpat("No previous regular expression");
452	}
453	sta = NOP;
454
455	const char p=pattern; /* pattern pointer /
456	for (int i=`0`; i<length; i++, p++) {
457	if (mp > mpMax)
458	return badpat("Pattern too long");
459	lp = mp;
460	switch (*p) {
461
462	case `'.'`: / match any char /
463	*mp++ = ANY;
464	break;
465
466	case `'^'`: / match beginning /
467	if (p == pattern) {
468	*mp++ = BOL;
469	} else {
470	*mp++ = CHR;
471	mp++ = p;
472	}
473	break;
474
475	case `'$'`: / match endofline /
476	if (!*(p+`1`)) {
477	*mp++ = EOL;
478	} else {
479	*mp++ = CHR;
480	mp++ = p;
481	}
482	break;
483
484	case `'['`: / match char class /
485	*mp++ = CCL;
486	prevChar = `0`;
487
488	i++;
489	if (*++p == `'^'`) {
490	mask = `'\377'`;
491	i++;
492	p++;
493	} else {
494	mask = `0`;
495	}
496
497	if (p == `'-'`) { /* real dash /
498	i++;
499	prevChar = *p;
500	ChSet(*p++);
501	}
502	if (p == `']'`) { /* real brace /
503	i++;
504	prevChar = *p;
505	ChSet(*p++);
506	}
507	while (p && p != `']'`) {
508	if (*p == `'-'`) {
509	if (prevChar < `0`) {
510	// Previous def. was a char class like \d, take dash literally
511	prevChar = *p;
512	ChSet(*p);
513	} else if (*(p+`1`)) {
514	if (*(p+`1`) != `']'`) {
515	c1 = prevChar + `1`;
516	i++;
517	c2 = static_cast<unsigned char>(*++p);
518	if (c2 == `'\\'`) {
519	if (!(p+`1`)) { // End of RE*
520	return badpat("Missing ]");
521	} else {
522	i++;
523	p++;
524	int incr;
525	c2 = GetBackslashExpression(p, incr);
526	i += incr;
527	p += incr;
528	if (c2 >= `0`) {
529	// Convention: \c (c is any char) is case sensitive, whatever the option
530	ChSet(static_cast<unsigned char>(c2));
531	prevChar = c2;
532	} else {
533	// bittab is already changed
534	prevChar = -`1`;
535	}
536	}
537	}
538	if (prevChar < `0`) {
539	// Char after dash is char class like \d, take dash literally
540	prevChar = `'-'`;
541	ChSet(`'-'`);
542	} else {
543	// Put all chars between c1 and c2 included in the char set
544	while (c1 <= c2) {
545	ChSetWithCase(static_cast<unsigned char>(c1++), caseSensitive);
546	}
547	}
548	} else {
549	// Dash before the ], take it literally
550	prevChar = *p;
551	ChSet(*p);
552	}
553	} else {
554	return badpat("Missing ]");
555	}
556	} else if (p == `'\\'` && (p+`1`)) {
557	i++;
558	p++;
559	int incr;
560	const int c = GetBackslashExpression(p, incr);
561	i += incr;
562	p += incr;
563	if (c >= `0`) {
564	// Convention: \c (c is any char) is case sensitive, whatever the option
565	ChSet(static_cast<unsigned char>(c));
566	prevChar = c;
567	} else {
568	// bittab is already changed
569	prevChar = -`1`;
570	}
571	} else {
572	prevChar = static_cast<unsigned char>(*p);
573	ChSetWithCase(*p, caseSensitive);
574	}
575	i++;
576	p++;
577	}
578	if (!*p)
579	return badpat("Missing ]");
580
581	for (n = `0`; n < BITBLK; bittab[n++] = `0`)
582	mp++ = static_cast<char*>(mask ^ bittab[n]);
583
584	break;
585
586	case `''`: /* match 0 or more... /
587	case `'+'`: / match 1 or more... /
588	case `'?'`:
589	if (p == pattern)
590	return badpat("Empty closure");
591	lp = sp; / previous opcode /
592	if (lp == CLO \|\| lp == LCLO) / equivalence... /
593	break;
594	switch (*lp) {
595
596	case BOL:
597	case BOT:
598	case EOT:
599	case BOW:
600	case EOW:
601	case REF:
602	return badpat("Illegal closure");
603	default:
604	break;
605	}
606
607	if (*p == `'+'`)
608	for (sp = mp; lp < sp; lp++)
609	mp++ = lp;
610
611	*mp++ = END;
612	*mp++ = END;
613	sp = mp;
614
615	while (--mp > lp)
616	*mp = mp[-`1`];
617	if (p == `'?'`) mp = CLQ;
618	else if ((p+`1`) == `'?'`) mp = LCLO;
619	else *mp = CLO;
620
621	mp = sp;
622	break;
623
624	case `'\\'`: / tags, backrefs... /
625	i++;
626	switch (*++p) {
627	case `'<'`:
628	*mp++ = BOW;
629	break;
630	case `'>'`:
631	if (*sp == BOW)
632	return badpat("Null pattern inside \\<\\>");
633	*mp++ = EOW;
634	break;
635	case `'1'`:
636	case `'2'`:
637	case `'3'`:
638	case `'4'`:
639	case `'5'`:
640	case `'6'`:
641	case `'7'`:
642	case `'8'`:
643	case `'9'`:
644	n = *p-`'0'`;
645	if (tagi > `0` && tagstk[tagi] == n)
646	return badpat("Cyclical reference");
647	if (tagc > n) {
648	*mp++ = REF;
649	mp++ = static_cast<char*>(n);
650	} else {
651	return badpat("Undetermined reference");
652	}
653	break;
654	default:
655	if (!posix && *p == `'('`) {
656	if (tagc < MAXTAG) {
657	tagstk[++tagi] = tagc;
658	*mp++ = BOT;
659	mp++ = static_cast<char*>(tagc++);
660	} else {
661	return badpat("Too many \$\$ pairs");
662	}
663	} else if (!posix && *p == `')'`) {
664	if (*sp == BOT)
665	return badpat("Null pattern inside \$\$");
666	if (tagi > `0`) {
667	*mp++ = EOT;
668	mp++ = static_cast<char*>(tagstk[tagi--]);
669	} else {
670	return badpat("Unmatched \\)");
671	}
672	} else {
673	int incr;
674	int c = GetBackslashExpression(p, incr);
675	i += incr;
676	p += incr;
677	if (c >= `0`) {
678	*mp++ = CHR;
679	mp++ = static_cast<unsigned* char>(c);
680	} else {
681	*mp++ = CCL;
682	mask = `0`;
683	for (n = `0`; n < BITBLK; bittab[n++] = `0`)
684	mp++ = static_cast<char*>(mask ^ bittab[n]);
685	}
686	}
687	}
688	break;
689
690	default : / an ordinary char /
691	if (posix && *p == `'('`) {
692	if (tagc < MAXTAG) {
693	tagstk[++tagi] = tagc;
694	*mp++ = BOT;
695	mp++ = static_cast<char*>(tagc++);
696	} else {
697	return badpat("Too many () pairs");
698	}
699	} else if (posix && *p == `')'`) {
700	if (*sp == BOT)
701	return badpat("Null pattern inside ()");
702	if (tagi > `0`) {
703	*mp++ = EOT;
704	mp++ = static_cast<char*>(tagstk[tagi--]);
705	} else {
706	return badpat("Unmatched )");
707	}
708	} else {
709	unsigned char c = *p;
710	if (!c) // End of RE
711	c = `'\\'`; // We take it as raw backslash
712	if (caseSensitive \|\| !iswordc(c)) {
713	*mp++ = CHR;
714	*mp++ = c;
715	} else {
716	*mp++ = CCL;
717	mask = `0`;
718	ChSetWithCase(c, false);
719	for (n = `0`; n < BITBLK; bittab[n++] = `0`)
720	mp++ = static_cast<char*>(mask ^ bittab[n]);
721	}
722	}
723	break;
724	}
725	sp = lp;
726	}
727	if (tagi > `0`)
728	return badpat((posix ? "Unmatched (" : "Unmatched \\("));
729	*mp = END;
730	sta = OKP;
731	return nullptr;
732	}
733
734	/*
735	* RESearch::Execute:
736	* execute nfa to find a match.
737	*
738	* special cases: (nfa[0])
739	* BOL
740	* Match only once, starting from the
741	* beginning.
742	* CHR
743	* First locate the character without
744	* calling PMatch, and if found, call
745	* PMatch for the remaining string.
746	* END
747	* RESearch::Compile failed, poor luser did not
748	* check for it. Fail fast.
749	*
750	* If a match is found, bopat[0] and eopat[0] are set
751	* to the beginning and the end of the matched fragment,
752	* respectively.
753	*
754	*/
755	int RESearch::Execute(const CharacterIndexer &ci, Sci::Position lp, Sci::Position endp) {
756	unsigned char c;
757	Sci::Position ep = NOTFOUND;
758	char *ap = nfa;
759
760	bol = lp;
761	failure = `0`;
762
763	Clear();
764
765	switch (*ap) {
766
767	case BOL: / anchored: match from BOL only /
768	ep = PMatch(ci, lp, endp, ap);
769	break;
770	case EOL: / just searching for end of line normal path doesn't work /
771	if (*(ap+`1`) == END) {
772	lp = endp;
773	ep = lp;
774	break;
775	} else {
776	return `0`;
777	}
778	case CHR: / ordinary char: locate it fast /
779	c = *(ap+`1`);
780	while ((lp < endp) && (static_cast<unsigned char>(ci.CharAt(lp)) != c))
781	lp++;
782	if (lp >= endp) / if EOS, fail, else fall through. /
783	return `0`;
784	[[fallthrough]];
785	default: / regular matching all the way. /
786	while (lp < endp) {
787	ep = PMatch(ci, lp, endp, ap);
788	if (ep != NOTFOUND)
789	break;
790	lp++;
791	}
792	break;
793	case END: / munged automaton. fail always /
794	return `0`;
795	}
796	if (ep == NOTFOUND)
797	return `0`;
798
799	bopat[`0`] = lp;
800	eopat[`0`] = ep;
801	return `1`;
802	}
803
804	/*
805	* PMatch: internal routine for the hard part
806	*
807	* This code is partly snarfed from an early grep written by
808	* David Conroy. The backref and tag stuff, and various other
809	* innovations are by oz.
810	*
811	* special case optimizations: (nfa[n], nfa[n+1])
812	* CLO ANY
813	* We KNOW .* will match everything up to the
814	* end of line. Thus, directly go to the end of
815	* line, without recursive PMatch calls. As in
816	* the other closure cases, the remaining pattern
817	* must be matched by moving backwards on the
818	* string recursively, to find a match for xy
819	* (x is ".*" and y is the remaining pattern)
820	* where the match satisfies the LONGEST match for
821	* x followed by a match for y.
822	* CLO CHR
823	* We can again scan the string forward for the
824	* single char and at the point of failure, we
825	* execute the remaining nfa recursively, same as
826	* above.
827	*
828	* At the end of a successful match, bopat[n] and eopat[n]
829	* are set to the beginning and end of subpatterns matched
830	* by tagged expressions (n = 1 to 9).
831	*/
832
833	//extern void re_fail(char ,char);*
834
835	/*
836	* skip values for CLO XXX to skip past the closure
837	*/
838
839	#define ANYSKIP 2 /* [CLO] ANY END */
840	#define CHRSKIP 3 /* [CLO] CHR chr END */
841	#define CCLSKIP 34 /* [CLO] CCL 32 bytes END */
842
843	Sci::Position RESearch::PMatch(const CharacterIndexer &ci, Sci::Position lp, Sci::Position endp, char *ap) {
844	int op, c, n;
845	Sci::Position e; / extra pointer for CLO /
846	Sci::Position bp; / beginning of subpat... /
847	Sci::Position ep; / ending of subpat... /
848	Sci::Position are; / to save the line ptr. /
849	Sci::Position llp; / lazy lp for LCLO /
850
851	while ((op = *ap++) != END)
852	switch (op) {
853
854	case CHR:
855	if (ci.CharAt(lp++) != *ap++)
856	return NOTFOUND;
857	break;
858	case ANY:
859	if (lp++ >= endp)
860	return NOTFOUND;
861	break;
862	case CCL:
863	if (lp >= endp)
864	return NOTFOUND;
865	if (!isinset(ap, ci.CharAt(lp++)))
866	return NOTFOUND;
867	ap += BITBLK;
868	break;
869	case BOL:
870	if (lp != bol)
871	return NOTFOUND;
872	break;
873	case EOL:
874	if (lp < endp)
875	return NOTFOUND;
876	break;
877	case BOT:
878	bopat[static_cast<int>(*ap++)] = lp;
879	break;
880	case EOT:
881	eopat[static_cast<int>(*ap++)] = lp;
882	break;
883	case BOW:
884	if ((lp!=bol && iswordc(ci.CharAt(lp-`1`))) \|\| !iswordc(ci.CharAt(lp)))
885	return NOTFOUND;
886	break;
887	case EOW:
888	if (lp==bol \|\| !iswordc(ci.CharAt(lp-`1`)) \|\| iswordc(ci.CharAt(lp)))
889	return NOTFOUND;
890	break;
891	case REF:
892	n = *ap++;
893	bp = bopat[n];
894	ep = eopat[n];
895	while (bp < ep)
896	if (ci.CharAt(bp++) != ci.CharAt(lp++))
897	return NOTFOUND;
898	break;
899	case LCLO:
900	case CLQ:
901	case CLO:
902	are = lp;
903	switch (*ap) {
904
905	case ANY:
906	if (op == CLO \|\| op == LCLO)
907	while (lp < endp)
908	lp++;
909	else if (lp < endp)
910	lp++;
911
912	n = ANYSKIP;
913	break;
914	case CHR:
915	c = *(ap+`1`);
916	if (op == CLO \|\| op == LCLO)
917	while ((lp < endp) && (c == ci.CharAt(lp)))
918	lp++;
919	else if ((lp < endp) && (c == ci.CharAt(lp)))
920	lp++;
921	n = CHRSKIP;
922	break;
923	case CCL:
924	while ((lp < endp) && isinset(ap+`1`, ci.CharAt(lp)))
925	lp++;
926	n = CCLSKIP;
927	break;
928	default:
929	failure = true;
930	//re_fail("closure: bad nfa.", ap);*
931	return NOTFOUND;
932	}
933	ap += n;
934
935	llp = lp;
936	e = NOTFOUND;
937	while (llp >= are) {
938	Sci::Position q;
939	if ((q = PMatch(ci, llp, endp, ap)) != NOTFOUND) {
940	e = q;
941	lp = llp;
942	if (op != LCLO) return e;
943	}
944	if (ap == END) return* e;
945	--llp;
946	}
947	if (*ap == EOT)
948	PMatch(ci, lp, endp, ap);
949	return e;
950	default:
951	//re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
952	return NOTFOUND;
953	}
954	return lp;
955	}
956
957
958

Browse the source code of DeepinIDE/3rdparty/unioncode-scintilla515/scintilla/src/RESearch.cxx