1// This file is part of SmallBASIC
2//
// The regular expression routines are based on match.c by J. Kercheval:
4//
5// This program is distributed under the terms of the GPL v2.0 or later
6// Download the GNU Public License (GPL) from www.gnu.org
7//
8// Copyright(C) 2000 Nicholas Christopoulos
9
10/*
11 Author: J. Kercheval
12 Created: Sat, 01/05/1991 22:21:49
13
14 J. Kercheval Wed, 02/20/1991 22:29:01 Released to Public Domain
15 J. Kercheval Fri, 02/22/1991 15:29:01 fix '\' bugs (two :( of them)
16 J. Kercheval Sun, 03/10/1991 19:31:29 add error return to RegMatche()
17 J. Kercheval Sun, 03/10/1991 20:11:11 add IsValidRegPattern code
18 J. Kercheval Sun, 03/10/1991 20:37:11 beef up main()
19 J. Kercheval Tue, 03/12/1991 22:25:10 Released as V1.1 to Public Domain
20
21 The file match.c coexists in the same directory with the string class.
22 */
23
/**
 * In the pattern string:
 *   `*'              matches any sequence of characters (zero or more)
 *   `?'              matches any single character
 *   [SET]            matches any character in the specified set
 *   [!SET] or [^SET] matches any character not in the specified set
 *
 * A set is composed of characters or ranges; a range looks like
 * character hyphen character (as in 0-9 or A-Z). [0-9a-zA-Z_] is the
 * minimal set of characters allowed in the [..] pattern construct.
 * Other characters are allowed (i.e. 8-bit characters) if your system
 * will support them.
 *
 * To suppress the special syntactic significance of any of `[]*?!^-\',
 * and match the character exactly, precede it with a `\'.
 */
41
42#include "lib/match.h"
43#include "common/smbas.h"
44#include "common/sberr.h"
45
46#ifdef USE_PCRE
47#include <pcre.h>
48#define OVECCOUNT 30 /* should be a multiple of 3 */
49#endif
50
51int reg_match_after_star(const char *p, char *t);
52int reg_match_jk(const char *p, char *t);
53
54int reg_match_jk(const char *p, char *t) {
55 char range_start, range_end; /* start and end in range */
56 int invert; /* is this [..] or [!..] */
57 int member_match; /* have I matched the [..] construct? */
58 int loop; /* should I terminate? */
59
60 for (; *p; p++, t++) {
61 /*
62 * if this is the end of the text then this is the end of the reg_match
63 */
64 if (*t == '\0')
65 return (*p == '*' && *++p == '\0') ? reg_match_valid : reg_match_abort;
66
67 /*
68 * determine and react to pattern type
69 */
70 switch (*p) {
71 case '?': /* single any character RegMatch */
72 break;
73 case '*': /* multiple any character RegMatch */
74 return reg_match_after_star(p, t);
75 case '[': /* [..] construct, single member/exclusion *
76 * character RegMatch */
77 {
78 /*
79 * move to beginning of range
80 */
81 p++;
82
83 /*
84 * check if this is a member reg_match or exclusion reg_match
85 */
86 invert = 0; // false
87 if (*p == '!' || *p == '^') {
88 invert = -1; // true
89 p++;
90 }
91
92 /*
93 * if closing bracket here or at range start then we have a malformed
94 * pattern
95 */
96 if (*p == ']')
97 return reg_match_bad_pattern;
98
99 member_match = 0; // false
100 loop = -1; // true
101
102 while (loop) { /* if end of construct then loop is done */
103 if (*p == ']') {
104 loop = 0; // false
105 continue;
106 }
107
108 /*
109 * RegMatching a '!', '^', '-', '\' or a ']'
110 */
111 if (*p == '\\')
112 range_start = range_end = *++p;
113 else
114 range_start = range_end = *p;
115
116 /*
117 * if end of pattern then bad pattern (Missing ']')
118 */
119 if (*p == '\0')
120 return reg_match_bad_pattern;
121
122 /*
123 * check for range bar
124 */
125 if (*++p == '-') {
126 /*
127 * get the range end
128 */
129 range_end = *++p;
130
131 /*
132 * if end of pattern or construct then bad pattern
133 */
134 if (range_end == '\0' || range_end == ']')
135 return reg_match_bad_pattern;
136
137 /*
138 * special character range end
139 */
140 if (range_end == '\\') {
141 range_end = *++p;
142
143 /*
144 * if end of text then we have a bad pattern
145 */
146 if (!range_end)
147 return reg_match_bad_pattern;
148 }
149
150 /*
151 * move just beyond this range
152 */
153 p++;
154 }
155
156 /*
157 * if the text character is in range then RegMatch found. make sure
158 * the range letters have the proper relationship to one another
159 * before comparison
160 */
161
162 if (range_start < range_end) {
163 if (*t >= range_start && *t <= range_end) {
164 member_match = -1; // true
165 loop = 0; // false
166 }
167 } else {
168 if (*t >= range_end && *t <= range_start) {
169 member_match = -1; // true
170 loop = 0; // false
171 }
172 }
173 } // while ?
174
175 /*
176 * if there was a match in an exclusion set then no match
177 */
178 /*
179 * if there was no match in a member set then no match
180 */
181
182 if ((invert && member_match) || !(invert || member_match))
183 return reg_match_range_failure;
184
185 /*
186 * if this is not an exclusion then skip the rest of the [...]
187 * construct that already RegMatched.
188 */
189
190 if (member_match) {
191 while (*p != ']') {
192 /*
193 * bad pattern (Missing ']')
194 */
195 if (*p == '\0')
196 return reg_match_bad_pattern;
197
198 /*
199 * skip exact RegMatch
200 */
201 if (*p == '\\') {
202 p++;
203
204 /*
205 * if end of text then we have a bad pattern
206 */
207 if (*p == '\0')
208 return reg_match_bad_pattern;
209 }
210
211 /*
212 * move to next pattern char
213 */
214 p++;
215 } // while
216 }
217 break;
218 }
219 case '\\': /* next character is quoted and must match *
220 * exactly */
221 /*
222 * move pattern pointer to quoted char and fall through
223 */
224 p++;
225
226 /*
227 * if end of text then we have a bad pattern
228 */
229 if (*p == '\0')
230 return reg_match_bad_pattern;
231
232 /*
233 * must match this character exactly
234 */
235 default:
236 if (*p != *t)
237 return reg_match_literal_failure;
238 } // switch!
239 } // first for
240
241 /*
242 * if end of text not reached then the pattern fails
243 */
244 if (*t)
245 return reg_match_premature_end;
246 return reg_match_valid;
247}
248
249/*
250 */
251#ifdef USE_PCRE
252int reg_match_pcre(const char *p, char *t)
253{
254 pcre *re;
255 const char *error;
256 int errofs;
257
258 re =
259 pcre_compile(p, (opt_usepcre == 2) ? PCRE_CASELESS : 0, &error, &errofs, NULL);
260 if (!re) {
261 rt_raise("REGULAR EXPRESSION SYNTAX ERROR (offset %d) -> %s", error, errofs);
262 return reg_match_bad_pattern;
263 }
264 else {
265 int rc;
266 int ovector[OVECCOUNT];
267
268 rc = pcre_exec(re, NULL, t, strlen(t), 0, 0, ovector, OVECCOUNT);
269 if (rc >= 0)
270 return reg_match_valid;
271 }
272
273 return reg_match_literal_failure;
274}
275#endif
276
277/*
278 */
/*
 * Matches the text `t` against the pattern `p`.
 *
 * When built with USE_PCRE and opt_usepcre is non-zero the pattern is
 * handled by reg_match_pcre(); otherwise it is handled by the wildcard
 * matcher reg_match_jk(). The selected matcher's result code is returned
 * unchanged.
 */
int reg_match(const char *p, char *t) {
  int result;

#ifdef USE_PCRE
  result = opt_usepcre ? reg_match_pcre(p, t) : reg_match_jk(p, t);
#else
  result = reg_match_jk(p, t);
#endif

  return result;
}
286
/*----------------------------------------------------------------------------
 *
 * Recursively calls reg_match() with the final segment of PATTERN and of TEXT.
 *
 ----------------------------------------------------------------------------*/
292int reg_match_after_star(const char *p, char *t) {
293 int RegMatch = 1; // unused code
294 int nextp;
295
296 /*
297 * pass over existing ? and * in pattern
298 */
299 while (*p == '?' || *p == '*') {
300 /*
301 * take one char for each ? and +
302 */
303 if (*p == '?') {
304 /*
305 * if end of text then no RegMatch
306 */
307 if (!*t++)
308 return reg_match_abort;
309 }
310
311 /*
312 * move to next char in pattern
313 */
314 p++;
315 }
316
317 /*
318 * if end of pattern we have RegMatched regardless of text left
319 */
320 if (!*p)
321 return reg_match_valid;
322
323 /*
324 * get the next character to RegMatch which must be a literal or '['
325 */
326 nextp = *p;
327 if (nextp == '\\') {
328 nextp = p[1];
329
330 /*
331 * if end of text then we have a bad pattern
332 */
333 if (!nextp)
334 return reg_match_bad_pattern;
335 }
336
337 /*
338 * Continue until we run out of text or definite result seen
339 */
340 do {
341 /*
342 * a precondition for RegMatching is that the next character in the pattern
343 * RegMatch the next character in the text or that the next pattern char is
344 * the beginning of a range. Increment text pointer as we go here
345 */
346
347 if (nextp == *t || nextp == '[')
348 RegMatch = reg_match(p, t);
349
350 /*
351 * if the end of text is reached then no RegMatch
352 */
353
354 if (!*t++)
355 RegMatch = reg_match_abort;
356
357 } while (RegMatch != reg_match_valid && RegMatch != reg_match_abort && RegMatch != reg_match_bad_pattern);
358
359 /*
360 * return result
361 */
362 return RegMatch;
363}
364