match.c source code [SmallBASIC/lib/match.c]

1	// This file is part of SmallBASIC
2	//
3	// The regular expression routines are based on match.c by J. Kercheval:
4	//
5	// This program is distributed under the terms of the GPL v2.0 or later
6	// Download the GNU Public License (GPL) from www.gnu.org
7	//
8	// Copyright(C) 2000 Nicholas Christopoulos
9
10	/*
11	Author: J. Kercheval
12	Created: Sat, 01/05/1991 22:21:49
13
14	J. Kercheval Wed, 02/20/1991 22:29:01 Released to Public Domain
15	J. Kercheval Fri, 02/22/1991 15:29:01 fix '\' bugs (two :( of them)
16	J. Kercheval Sun, 03/10/1991 19:31:29 add error return to RegMatche()
17	J. Kercheval Sun, 03/10/1991 20:11:11 add IsValidRegPattern code
18	J. Kercheval Sun, 03/10/1991 20:37:11 beef up main()
19	J. Kercheval Tue, 03/12/1991 22:25:10 Released as V1.1 to Public Domain
20
21	The file match.c coexists in the same directory with the string class.
22	*/
23
24	/**
25	* In the pattern string:
26	* `*' RegMatches any sequence of characters (zero or more)
27	* `?' RegMatches any character
28	* [SET] RegMatches any character in the specified set,
29	* [!SET] or [^SET] RegMatches any character not in the specified set.
30	*
31	* A set is composed of characters or ranges; a range looks like
32	* character hyphen character (as in 0-9 or A-Z). [0-9a-zA-Z_] is the
33	* minimal set of characters allowed in the [..] pattern construct.
34	* Other characters are allowed (ie. 8 bit characters) if your system
35	* will support them.
36	*
37	*
38	* To suppress the special syntactic significance of any of `[]*?!^-\',
39	* and RegMatch the character exactly, precede it with a `\'.
40	*/
41
42	#include "lib/match.h"
43	#include "common/smbas.h"
44	#include "common/sberr.h"
45
46	#ifdef USE_PCRE
47	#include <pcre.h>
48	#define OVECCOUNT 30 /* should be a multiple of 3 */
49	#endif
50
/*
 * Forward declarations: reg_match_jk() and reg_match_after_star() call each
 * other (the latter indirectly, through reg_match()).
 *
 * NOTE(review): the pointer declarators were reconstructed from how the
 * arguments are used in this file (both are walked as C strings) - confirm
 * against lib/match.h.
 */
int reg_match_after_star(const char *p, const char *t);
int reg_match_jk(const char *p, const char *t);
53
54	int reg_match_jk(const char p, char* *t) {
55	char range_start, range_end; / start and end in range /
56	int invert; / is this [..] or [!..] /
57	int member_match; / have I matched the [..] construct? /
58	int loop; / should I terminate? /
59
60	for (; *p; p++, t++) {
61	/*
62	* if this is the end of the text then this is the end of the reg_match
63	*/
64	if (*t == `'\0'`)
65	return (p == `''` && *++p == `'\0'`) ? reg_match_valid : reg_match_abort;
66
67	/*
68	* determine and react to pattern type
69	*/
70	switch (*p) {
71	case `'?'`: / single any character RegMatch /
72	break;
73	case `''`: /* multiple any character RegMatch /
74	return reg_match_after_star(p, t);
75	case `'['`: /* [..] construct, single member/exclusion *
76	* character RegMatch */
77	{
78	/*
79	* move to beginning of range
80	*/
81	p++;
82
83	/*
84	* check if this is a member reg_match or exclusion reg_match
85	*/
86	invert = `0`; // false
87	if (p == `'!'` \|\| p == `'^'`) {
88	invert = -`1`; // true
89	p++;
90	}
91
92	/*
93	* if closing bracket here or at range start then we have a malformed
94	* pattern
95	*/
96	if (*p == `']'`)
97	return reg_match_bad_pattern;
98
99	member_match = `0`; // false
100	loop = -`1`; // true
101
102	while (loop) { / if end of construct then loop is done /
103	if (*p == `']'`) {
104	loop = `0`; // false
105	continue;
106	}
107
108	/*
109	* RegMatching a '!', '^', '-', '\' or a ']'
110	*/
111	if (*p == `'\\'`)
112	range_start = range_end = *++p;
113	else
114	range_start = range_end = *p;
115
116	/*
117	* if end of pattern then bad pattern (Missing ']')
118	*/
119	if (*p == `'\0'`)
120	return reg_match_bad_pattern;
121
122	/*
123	* check for range bar
124	*/
125	if (*++p == `'-'`) {
126	/*
127	* get the range end
128	*/
129	range_end = *++p;
130
131	/*
132	* if end of pattern or construct then bad pattern
133	*/
134	if (range_end == `'\0'` \|\| range_end == `']'`)
135	return reg_match_bad_pattern;
136
137	/*
138	* special character range end
139	*/
140	if (range_end == `'\\'`) {
141	range_end = *++p;
142
143	/*
144	* if end of text then we have a bad pattern
145	*/
146	if (!range_end)
147	return reg_match_bad_pattern;
148	}
149
150	/*
151	* move just beyond this range
152	*/
153	p++;
154	}
155
156	/*
157	* if the text character is in range then RegMatch found. make sure
158	* the range letters have the proper relationship to one another
159	* before comparison
160	*/
161
162	if (range_start < range_end) {
163	if (t >= range_start && t <= range_end) {
164	member_match = -`1`; // true
165	loop = `0`; // false
166	}
167	} else {
168	if (t >= range_end && t <= range_start) {
169	member_match = -`1`; // true
170	loop = `0`; // false
171	}
172	}
173	} // while ?
174
175	/*
176	* if there was a match in an exclusion set then no match
177	*/
178	/*
179	* if there was no match in a member set then no match
180	*/
181
182	if ((invert && member_match) \|\| !(invert \|\| member_match))
183	return reg_match_range_failure;
184
185	/*
186	* if this is not an exclusion then skip the rest of the [...]
187	* construct that already RegMatched.
188	*/
189
190	if (member_match) {
191	while (*p != `']'`) {
192	/*
193	* bad pattern (Missing ']')
194	*/
195	if (*p == `'\0'`)
196	return reg_match_bad_pattern;
197
198	/*
199	* skip exact RegMatch
200	*/
201	if (*p == `'\\'`) {
202	p++;
203
204	/*
205	* if end of text then we have a bad pattern
206	*/
207	if (*p == `'\0'`)
208	return reg_match_bad_pattern;
209	}
210
211	/*
212	* move to next pattern char
213	*/
214	p++;
215	} // while
216	}
217	break;
218	}
219	case `'\\'`: /* next character is quoted and must match *
220	* exactly */
221	/*
222	* move pattern pointer to quoted char and fall through
223	*/
224	p++;
225
226	/*
227	* if end of text then we have a bad pattern
228	*/
229	if (*p == `'\0'`)
230	return reg_match_bad_pattern;
231
232	/*
233	* must match this character exactly
234	*/
235	default:
236	if (p != t)
237	return reg_match_literal_failure;
238	} // switch!
239	} // first for
240
241	/*
242	* if end of text not reached then the pattern fails
243	*/
244	if (*t)
245	return reg_match_premature_end;
246	return reg_match_valid;
247	}
248
249	/*
250	*/
#ifdef USE_PCRE
/**
 * Match the text `t` against the Perl-compatible regular expression `p`.
 *
 * The pattern is compiled case-insensitively when opt_usepcre == 2.
 *
 * @param p NUL-terminated regular expression
 * @param t NUL-terminated subject text
 * @return reg_match_valid if pcre_exec() reports a match,
 *         reg_match_bad_pattern if the expression does not compile (a
 *         runtime error is raised as well), reg_match_literal_failure
 *         otherwise
 */
int reg_match_pcre(const char *p, const char *t) {
  const char *error;
  int errofs;
  int ovector[OVECCOUNT];
  int rc;
  pcre *re;

  re = pcre_compile(p, (opt_usepcre == 2) ? PCRE_CASELESS : 0, &error, &errofs, NULL);
  if (!re) {
    /* argument order must follow the format: offset (%d) first, text (%s) second */
    rt_raise("REGULAR EXPRESSION SYNTAX ERROR (offset %d) -> %s", errofs, error);
    return reg_match_bad_pattern;
  }

  rc = pcre_exec(re, NULL, t, (int)strlen(t), 0, 0, ovector, OVECCOUNT);

  /* the compiled expression is only needed for this one call */
  pcre_free(re);

  return (rc >= 0) ? reg_match_valid : reg_match_literal_failure;
}
#endif
276
277	/*
278	*/
/**
 * Public entry point: match the text `t` against the pattern `p`.
 *
 * When built with USE_PCRE and opt_usepcre is non-zero the pattern is handled
 * by reg_match_pcre(); in every other case the built-in wildcard matcher
 * reg_match_jk() is used.
 *
 * NOTE(review): parameter types were reconstructed from usage in this file -
 * confirm they agree with the declaration in lib/match.h.
 *
 * @param p NUL-terminated pattern
 * @param t NUL-terminated text
 * @return the status code produced by the selected matcher
 */
int reg_match(const char *p, const char *t) {
#ifdef USE_PCRE
  if (opt_usepcre)
    return reg_match_pcre(p, t);
#endif
  return reg_match_jk(p, t);
}
286
287	/----------------------------------------------------------------------------*
288	*
289	* recursively call RegMatche() with final segment of PATTERN and of TEXT.
290	*
291	----------------------------------------------------------------------------/*
292	int reg_match_after_star(const char p, char* *t) {
293	int RegMatch = `1`; // unused code
294	int nextp;
295
296	/*
297	* pass over existing ? and * in pattern
298	*/
299	while (p == `'?'` \|\| p == `'*'`) {
300	/*
301	* take one char for each ? and +
302	*/
303	if (*p == `'?'`) {
304	/*
305	* if end of text then no RegMatch
306	*/
307	if (!*t++)
308	return reg_match_abort;
309	}
310
311	/*
312	* move to next char in pattern
313	*/
314	p++;
315	}
316
317	/*
318	* if end of pattern we have RegMatched regardless of text left
319	*/
320	if (!*p)
321	return reg_match_valid;
322
323	/*
324	* get the next character to RegMatch which must be a literal or '['
325	*/
326	nextp = *p;
327	if (nextp == `'\\'`) {
328	nextp = p[`1`];
329
330	/*
331	* if end of text then we have a bad pattern
332	*/
333	if (!nextp)
334	return reg_match_bad_pattern;
335	}
336
337	/*
338	* Continue until we run out of text or definite result seen
339	*/
340	do {
341	/*
342	* a precondition for RegMatching is that the next character in the pattern
343	* RegMatch the next character in the text or that the next pattern char is
344	* the beginning of a range. Increment text pointer as we go here
345	*/
346
347	if (nextp == *t \|\| nextp == `'['`)
348	RegMatch = reg_match(p, t);
349
350	/*
351	* if the end of text is reached then no RegMatch
352	*/
353
354	if (!*t++)
355	RegMatch = reg_match_abort;
356
357	} while (RegMatch != reg_match_valid && RegMatch != reg_match_abort && RegMatch != reg_match_bad_pattern);
358
359	/*
360	* return result
361	*/
362	return RegMatch;
363	}
364

Browse the source code of SmallBASIC/lib/match.c