1 | // This file is part of SmallBASIC |
2 | // |
3 | // The regular expressions routines is based on match.c by J. Kercheval: |
4 | // |
5 | // This program is distributed under the terms of the GPL v2.0 or later |
6 | // Download the GNU Public License (GPL) from www.gnu.org |
7 | // |
8 | // Copyright(C) 2000 Nicholas Christopoulos |
9 | |
10 | /* |
11 | Author: J. Kercheval |
12 | Created: Sat, 01/05/1991 22:21:49 |
13 | |
14 | J. Kercheval Wed, 02/20/1991 22:29:01 Released to Public Domain |
15 | J. Kercheval Fri, 02/22/1991 15:29:01 fix '\' bugs (two :( of them) |
16 | J. Kercheval Sun, 03/10/1991 19:31:29 add error return to RegMatche() |
17 | J. Kercheval Sun, 03/10/1991 20:11:11 add IsValidRegPattern code |
18 | J. Kercheval Sun, 03/10/1991 20:37:11 beef up main() |
19 | J. Kercheval Tue, 03/12/1991 22:25:10 Released as V1.1 to Public Domain |
20 | |
21 | The file match.c coexists in the same directory with the string class. |
22 | */ |
23 | |
24 | /** |
25 | * In the pattern string: |
26 | * `*' RegMatches any sequence of characters (zero or more) |
27 | * `?' RegMatches any character |
28 | * [SET] RegMatches any character in the specified set, |
29 | * [!SET] or [^SET] RegMatches any character not in the specified set. |
30 | * |
31 | * A set is composed of characters or ranges; a range looks like |
32 | * character hyphen character (as in 0-9 or A-Z). [0-9a-zA-Z_] is the |
33 | * minimal set of characters allowed in the [..] pattern construct. |
34 | * Other characters are allowed (ie. 8 bit characters) if your system |
35 | * will support them. |
36 | * |
37 | * |
38 | * To suppress the special syntactic significance of any of `[]*?!^-\', |
39 | * and RegMatch the character exactly, precede it with a `\'. |
40 | */ |
41 | |
42 | #include "lib/match.h" |
43 | #include "common/smbas.h" |
44 | #include "common/sberr.h" |
45 | |
46 | #ifdef USE_PCRE |
47 | #include <pcre.h> |
48 | #define OVECCOUNT 30 /* should be a multiple of 3 */ |
49 | #endif |
50 | |
51 | int reg_match_after_star(const char *p, char *t); |
52 | int reg_match_jk(const char *p, char *t); |
53 | |
54 | int reg_match_jk(const char *p, char *t) { |
55 | char range_start, range_end; /* start and end in range */ |
56 | int invert; /* is this [..] or [!..] */ |
57 | int member_match; /* have I matched the [..] construct? */ |
58 | int loop; /* should I terminate? */ |
59 | |
60 | for (; *p; p++, t++) { |
61 | /* |
62 | * if this is the end of the text then this is the end of the reg_match |
63 | */ |
64 | if (*t == '\0') |
65 | return (*p == '*' && *++p == '\0') ? reg_match_valid : reg_match_abort; |
66 | |
67 | /* |
68 | * determine and react to pattern type |
69 | */ |
70 | switch (*p) { |
71 | case '?': /* single any character RegMatch */ |
72 | break; |
73 | case '*': /* multiple any character RegMatch */ |
74 | return reg_match_after_star(p, t); |
75 | case '[': /* [..] construct, single member/exclusion * |
76 | * character RegMatch */ |
77 | { |
78 | /* |
79 | * move to beginning of range |
80 | */ |
81 | p++; |
82 | |
83 | /* |
84 | * check if this is a member reg_match or exclusion reg_match |
85 | */ |
86 | invert = 0; // false |
87 | if (*p == '!' || *p == '^') { |
88 | invert = -1; // true |
89 | p++; |
90 | } |
91 | |
92 | /* |
93 | * if closing bracket here or at range start then we have a malformed |
94 | * pattern |
95 | */ |
96 | if (*p == ']') |
97 | return reg_match_bad_pattern; |
98 | |
99 | member_match = 0; // false |
100 | loop = -1; // true |
101 | |
102 | while (loop) { /* if end of construct then loop is done */ |
103 | if (*p == ']') { |
104 | loop = 0; // false |
105 | continue; |
106 | } |
107 | |
108 | /* |
109 | * RegMatching a '!', '^', '-', '\' or a ']' |
110 | */ |
111 | if (*p == '\\') |
112 | range_start = range_end = *++p; |
113 | else |
114 | range_start = range_end = *p; |
115 | |
116 | /* |
117 | * if end of pattern then bad pattern (Missing ']') |
118 | */ |
119 | if (*p == '\0') |
120 | return reg_match_bad_pattern; |
121 | |
122 | /* |
123 | * check for range bar |
124 | */ |
125 | if (*++p == '-') { |
126 | /* |
127 | * get the range end |
128 | */ |
129 | range_end = *++p; |
130 | |
131 | /* |
132 | * if end of pattern or construct then bad pattern |
133 | */ |
134 | if (range_end == '\0' || range_end == ']') |
135 | return reg_match_bad_pattern; |
136 | |
137 | /* |
138 | * special character range end |
139 | */ |
140 | if (range_end == '\\') { |
141 | range_end = *++p; |
142 | |
143 | /* |
144 | * if end of text then we have a bad pattern |
145 | */ |
146 | if (!range_end) |
147 | return reg_match_bad_pattern; |
148 | } |
149 | |
150 | /* |
151 | * move just beyond this range |
152 | */ |
153 | p++; |
154 | } |
155 | |
156 | /* |
157 | * if the text character is in range then RegMatch found. make sure |
158 | * the range letters have the proper relationship to one another |
159 | * before comparison |
160 | */ |
161 | |
162 | if (range_start < range_end) { |
163 | if (*t >= range_start && *t <= range_end) { |
164 | member_match = -1; // true |
165 | loop = 0; // false |
166 | } |
167 | } else { |
168 | if (*t >= range_end && *t <= range_start) { |
169 | member_match = -1; // true |
170 | loop = 0; // false |
171 | } |
172 | } |
173 | } // while ? |
174 | |
175 | /* |
176 | * if there was a match in an exclusion set then no match |
177 | */ |
178 | /* |
179 | * if there was no match in a member set then no match |
180 | */ |
181 | |
182 | if ((invert && member_match) || !(invert || member_match)) |
183 | return reg_match_range_failure; |
184 | |
185 | /* |
186 | * if this is not an exclusion then skip the rest of the [...] |
187 | * construct that already RegMatched. |
188 | */ |
189 | |
190 | if (member_match) { |
191 | while (*p != ']') { |
192 | /* |
193 | * bad pattern (Missing ']') |
194 | */ |
195 | if (*p == '\0') |
196 | return reg_match_bad_pattern; |
197 | |
198 | /* |
199 | * skip exact RegMatch |
200 | */ |
201 | if (*p == '\\') { |
202 | p++; |
203 | |
204 | /* |
205 | * if end of text then we have a bad pattern |
206 | */ |
207 | if (*p == '\0') |
208 | return reg_match_bad_pattern; |
209 | } |
210 | |
211 | /* |
212 | * move to next pattern char |
213 | */ |
214 | p++; |
215 | } // while |
216 | } |
217 | break; |
218 | } |
219 | case '\\': /* next character is quoted and must match * |
220 | * exactly */ |
221 | /* |
222 | * move pattern pointer to quoted char and fall through |
223 | */ |
224 | p++; |
225 | |
226 | /* |
227 | * if end of text then we have a bad pattern |
228 | */ |
229 | if (*p == '\0') |
230 | return reg_match_bad_pattern; |
231 | |
232 | /* |
233 | * must match this character exactly |
234 | */ |
235 | default: |
236 | if (*p != *t) |
237 | return reg_match_literal_failure; |
238 | } // switch! |
239 | } // first for |
240 | |
241 | /* |
242 | * if end of text not reached then the pattern fails |
243 | */ |
244 | if (*t) |
245 | return reg_match_premature_end; |
246 | return reg_match_valid; |
247 | } |
248 | |
249 | /* |
250 | */ |
251 | #ifdef USE_PCRE |
252 | int reg_match_pcre(const char *p, char *t) |
253 | { |
254 | pcre *re; |
255 | const char *error; |
256 | int errofs; |
257 | |
258 | re = |
259 | pcre_compile(p, (opt_usepcre == 2) ? PCRE_CASELESS : 0, &error, &errofs, NULL); |
260 | if (!re) { |
261 | rt_raise("REGULAR EXPRESSION SYNTAX ERROR (offset %d) -> %s" , error, errofs); |
262 | return reg_match_bad_pattern; |
263 | } |
264 | else { |
265 | int rc; |
266 | int ovector[OVECCOUNT]; |
267 | |
268 | rc = pcre_exec(re, NULL, t, strlen(t), 0, 0, ovector, OVECCOUNT); |
269 | if (rc >= 0) |
270 | return reg_match_valid; |
271 | } |
272 | |
273 | return reg_match_literal_failure; |
274 | } |
275 | #endif |
276 | |
277 | /* |
278 | */ |
279 | int reg_match(const char *p, char *t) { |
280 | #ifdef USE_PCRE |
281 | if (opt_usepcre) |
282 | return reg_match_pcre(p, t); |
283 | #endif |
284 | return reg_match_jk(p, t); |
285 | } |
286 | |
287 | /*---------------------------------------------------------------------------- |
288 | * |
289 | * recursively call RegMatche() with final segment of PATTERN and of TEXT. |
290 | * |
291 | ----------------------------------------------------------------------------*/ |
292 | int reg_match_after_star(const char *p, char *t) { |
293 | int RegMatch = 1; // unused code |
294 | int nextp; |
295 | |
296 | /* |
297 | * pass over existing ? and * in pattern |
298 | */ |
299 | while (*p == '?' || *p == '*') { |
300 | /* |
301 | * take one char for each ? and + |
302 | */ |
303 | if (*p == '?') { |
304 | /* |
305 | * if end of text then no RegMatch |
306 | */ |
307 | if (!*t++) |
308 | return reg_match_abort; |
309 | } |
310 | |
311 | /* |
312 | * move to next char in pattern |
313 | */ |
314 | p++; |
315 | } |
316 | |
317 | /* |
318 | * if end of pattern we have RegMatched regardless of text left |
319 | */ |
320 | if (!*p) |
321 | return reg_match_valid; |
322 | |
323 | /* |
324 | * get the next character to RegMatch which must be a literal or '[' |
325 | */ |
326 | nextp = *p; |
327 | if (nextp == '\\') { |
328 | nextp = p[1]; |
329 | |
330 | /* |
331 | * if end of text then we have a bad pattern |
332 | */ |
333 | if (!nextp) |
334 | return reg_match_bad_pattern; |
335 | } |
336 | |
337 | /* |
338 | * Continue until we run out of text or definite result seen |
339 | */ |
340 | do { |
341 | /* |
342 | * a precondition for RegMatching is that the next character in the pattern |
343 | * RegMatch the next character in the text or that the next pattern char is |
344 | * the beginning of a range. Increment text pointer as we go here |
345 | */ |
346 | |
347 | if (nextp == *t || nextp == '[') |
348 | RegMatch = reg_match(p, t); |
349 | |
350 | /* |
351 | * if the end of text is reached then no RegMatch |
352 | */ |
353 | |
354 | if (!*t++) |
355 | RegMatch = reg_match_abort; |
356 | |
357 | } while (RegMatch != reg_match_valid && RegMatch != reg_match_abort && RegMatch != reg_match_bad_pattern); |
358 | |
359 | /* |
360 | * return result |
361 | */ |
362 | return RegMatch; |
363 | } |
364 | |