| 1 | // This file is part of SmallBASIC |
| 2 | // |
// The regular expression routines are based on match.c by J. Kercheval:
| 4 | // |
| 5 | // This program is distributed under the terms of the GPL v2.0 or later |
| 6 | // Download the GNU Public License (GPL) from www.gnu.org |
| 7 | // |
| 8 | // Copyright(C) 2000 Nicholas Christopoulos |
| 9 | |
| 10 | /* |
| 11 | Author: J. Kercheval |
| 12 | Created: Sat, 01/05/1991 22:21:49 |
| 13 | |
| 14 | J. Kercheval Wed, 02/20/1991 22:29:01 Released to Public Domain |
| 15 | J. Kercheval Fri, 02/22/1991 15:29:01 fix '\' bugs (two :( of them) |
| 16 | J. Kercheval Sun, 03/10/1991 19:31:29 add error return to RegMatche() |
| 17 | J. Kercheval Sun, 03/10/1991 20:11:11 add IsValidRegPattern code |
| 18 | J. Kercheval Sun, 03/10/1991 20:37:11 beef up main() |
| 19 | J. Kercheval Tue, 03/12/1991 22:25:10 Released as V1.1 to Public Domain |
| 20 | |
| 21 | The file match.c coexists in the same directory with the string class. |
| 22 | */ |
| 23 | |
/**
 * In the pattern string:
 *   `*'              matches any sequence of characters (zero or more)
 *   `?'              matches any single character
 *   [SET]            matches any character in the specified set
 *   [!SET] or [^SET] matches any character not in the specified set
 *
 * A set is composed of characters or ranges; a range looks like
 * character hyphen character (as in 0-9 or A-Z). [0-9a-zA-Z_] is the
 * minimal set of characters allowed in the [..] pattern construct.
 * Other characters are allowed (i.e. 8-bit characters) if your system
 * supports them.
 *
 * To suppress the special syntactic significance of any of `[]*?!^-\'
 * and match the character exactly, precede it with a `\'.
 */
| 41 | |
| 42 | #include "lib/match.h" |
| 43 | #include "common/smbas.h" |
| 44 | #include "common/sberr.h" |
| 45 | |
| 46 | #ifdef USE_PCRE |
| 47 | #include <pcre.h> |
| 48 | #define OVECCOUNT 30 /* should be a multiple of 3 */ |
| 49 | #endif |
| 50 | |
| 51 | int reg_match_after_star(const char *p, char *t); |
| 52 | int reg_match_jk(const char *p, char *t); |
| 53 | |
| 54 | int reg_match_jk(const char *p, char *t) { |
| 55 | char range_start, range_end; /* start and end in range */ |
| 56 | int invert; /* is this [..] or [!..] */ |
| 57 | int member_match; /* have I matched the [..] construct? */ |
| 58 | int loop; /* should I terminate? */ |
| 59 | |
| 60 | for (; *p; p++, t++) { |
| 61 | /* |
| 62 | * if this is the end of the text then this is the end of the reg_match |
| 63 | */ |
| 64 | if (*t == '\0') |
| 65 | return (*p == '*' && *++p == '\0') ? reg_match_valid : reg_match_abort; |
| 66 | |
| 67 | /* |
| 68 | * determine and react to pattern type |
| 69 | */ |
| 70 | switch (*p) { |
| 71 | case '?': /* single any character RegMatch */ |
| 72 | break; |
| 73 | case '*': /* multiple any character RegMatch */ |
| 74 | return reg_match_after_star(p, t); |
| 75 | case '[': /* [..] construct, single member/exclusion * |
| 76 | * character RegMatch */ |
| 77 | { |
| 78 | /* |
| 79 | * move to beginning of range |
| 80 | */ |
| 81 | p++; |
| 82 | |
| 83 | /* |
| 84 | * check if this is a member reg_match or exclusion reg_match |
| 85 | */ |
| 86 | invert = 0; // false |
| 87 | if (*p == '!' || *p == '^') { |
| 88 | invert = -1; // true |
| 89 | p++; |
| 90 | } |
| 91 | |
| 92 | /* |
| 93 | * if closing bracket here or at range start then we have a malformed |
| 94 | * pattern |
| 95 | */ |
| 96 | if (*p == ']') |
| 97 | return reg_match_bad_pattern; |
| 98 | |
| 99 | member_match = 0; // false |
| 100 | loop = -1; // true |
| 101 | |
| 102 | while (loop) { /* if end of construct then loop is done */ |
| 103 | if (*p == ']') { |
| 104 | loop = 0; // false |
| 105 | continue; |
| 106 | } |
| 107 | |
| 108 | /* |
| 109 | * RegMatching a '!', '^', '-', '\' or a ']' |
| 110 | */ |
| 111 | if (*p == '\\') |
| 112 | range_start = range_end = *++p; |
| 113 | else |
| 114 | range_start = range_end = *p; |
| 115 | |
| 116 | /* |
| 117 | * if end of pattern then bad pattern (Missing ']') |
| 118 | */ |
| 119 | if (*p == '\0') |
| 120 | return reg_match_bad_pattern; |
| 121 | |
| 122 | /* |
| 123 | * check for range bar |
| 124 | */ |
| 125 | if (*++p == '-') { |
| 126 | /* |
| 127 | * get the range end |
| 128 | */ |
| 129 | range_end = *++p; |
| 130 | |
| 131 | /* |
| 132 | * if end of pattern or construct then bad pattern |
| 133 | */ |
| 134 | if (range_end == '\0' || range_end == ']') |
| 135 | return reg_match_bad_pattern; |
| 136 | |
| 137 | /* |
| 138 | * special character range end |
| 139 | */ |
| 140 | if (range_end == '\\') { |
| 141 | range_end = *++p; |
| 142 | |
| 143 | /* |
| 144 | * if end of text then we have a bad pattern |
| 145 | */ |
| 146 | if (!range_end) |
| 147 | return reg_match_bad_pattern; |
| 148 | } |
| 149 | |
| 150 | /* |
| 151 | * move just beyond this range |
| 152 | */ |
| 153 | p++; |
| 154 | } |
| 155 | |
| 156 | /* |
| 157 | * if the text character is in range then RegMatch found. make sure |
| 158 | * the range letters have the proper relationship to one another |
| 159 | * before comparison |
| 160 | */ |
| 161 | |
| 162 | if (range_start < range_end) { |
| 163 | if (*t >= range_start && *t <= range_end) { |
| 164 | member_match = -1; // true |
| 165 | loop = 0; // false |
| 166 | } |
| 167 | } else { |
| 168 | if (*t >= range_end && *t <= range_start) { |
| 169 | member_match = -1; // true |
| 170 | loop = 0; // false |
| 171 | } |
| 172 | } |
| 173 | } // while ? |
| 174 | |
| 175 | /* |
| 176 | * if there was a match in an exclusion set then no match |
| 177 | */ |
| 178 | /* |
| 179 | * if there was no match in a member set then no match |
| 180 | */ |
| 181 | |
| 182 | if ((invert && member_match) || !(invert || member_match)) |
| 183 | return reg_match_range_failure; |
| 184 | |
| 185 | /* |
| 186 | * if this is not an exclusion then skip the rest of the [...] |
| 187 | * construct that already RegMatched. |
| 188 | */ |
| 189 | |
| 190 | if (member_match) { |
| 191 | while (*p != ']') { |
| 192 | /* |
| 193 | * bad pattern (Missing ']') |
| 194 | */ |
| 195 | if (*p == '\0') |
| 196 | return reg_match_bad_pattern; |
| 197 | |
| 198 | /* |
| 199 | * skip exact RegMatch |
| 200 | */ |
| 201 | if (*p == '\\') { |
| 202 | p++; |
| 203 | |
| 204 | /* |
| 205 | * if end of text then we have a bad pattern |
| 206 | */ |
| 207 | if (*p == '\0') |
| 208 | return reg_match_bad_pattern; |
| 209 | } |
| 210 | |
| 211 | /* |
| 212 | * move to next pattern char |
| 213 | */ |
| 214 | p++; |
| 215 | } // while |
| 216 | } |
| 217 | break; |
| 218 | } |
| 219 | case '\\': /* next character is quoted and must match * |
| 220 | * exactly */ |
| 221 | /* |
| 222 | * move pattern pointer to quoted char and fall through |
| 223 | */ |
| 224 | p++; |
| 225 | |
| 226 | /* |
| 227 | * if end of text then we have a bad pattern |
| 228 | */ |
| 229 | if (*p == '\0') |
| 230 | return reg_match_bad_pattern; |
| 231 | |
| 232 | /* |
| 233 | * must match this character exactly |
| 234 | */ |
| 235 | default: |
| 236 | if (*p != *t) |
| 237 | return reg_match_literal_failure; |
| 238 | } // switch! |
| 239 | } // first for |
| 240 | |
| 241 | /* |
| 242 | * if end of text not reached then the pattern fails |
| 243 | */ |
| 244 | if (*t) |
| 245 | return reg_match_premature_end; |
| 246 | return reg_match_valid; |
| 247 | } |
| 248 | |
#ifdef USE_PCRE
/**
 * Matches the text t against the Perl-compatible regular expression p.
 *
 * The pattern is compiled case-insensitively (PCRE_CASELESS) when
 * opt_usepcre equals 2. A pattern that fails to compile is reported
 * through rt_raise together with the error offset and message.
 *
 * @param p the regular expression
 * @param t the subject text
 * @return reg_match_bad_pattern when the expression does not compile,
 *         reg_match_valid when pcre_exec reports a match (result >= 0),
 *         reg_match_literal_failure otherwise
 */
int reg_match_pcre(const char *p, char *t)
{
  const char *error;
  int errofs;
  int ovector[OVECCOUNT];
  int rc;
  pcre *re;

  re = pcre_compile(p, (opt_usepcre == 2) ? PCRE_CASELESS : 0, &error, &errofs, NULL);
  if (!re) {
    /* argument order must follow the format: offset (%d) first, message (%s) second */
    rt_raise("REGULAR EXPRESSION SYNTAX ERROR (offset %d) -> %s", errofs, error);
    return reg_match_bad_pattern;
  }

  rc = pcre_exec(re, NULL, t, (int)strlen(t), 0, 0, ovector, OVECCOUNT);

  /* the compiled pattern is owned by this call; release it on every path */
  pcre_free(re);

  return (rc >= 0) ? reg_match_valid : reg_match_literal_failure;
}
#endif
| 276 | |
/**
 * Matches the text t against the pattern p.
 *
 * In builds with USE_PCRE, a non-zero opt_usepcre sends the request to
 * reg_match_pcre; in every other case the wildcard matcher reg_match_jk
 * is used.
 *
 * @param p the pattern
 * @param t the text to test
 * @return the result code of the matcher that handled the request
 */
int reg_match(const char *p, char *t) {
#ifdef USE_PCRE
  return opt_usepcre ? reg_match_pcre(p, t) : reg_match_jk(p, t);
#else
  return reg_match_jk(p, t);
#endif
}
| 286 | |
| 287 | /*---------------------------------------------------------------------------- |
| 288 | * |
| 289 | * recursively call RegMatche() with final segment of PATTERN and of TEXT. |
| 290 | * |
| 291 | ----------------------------------------------------------------------------*/ |
| 292 | int reg_match_after_star(const char *p, char *t) { |
| 293 | int RegMatch = 1; // unused code |
| 294 | int nextp; |
| 295 | |
| 296 | /* |
| 297 | * pass over existing ? and * in pattern |
| 298 | */ |
| 299 | while (*p == '?' || *p == '*') { |
| 300 | /* |
| 301 | * take one char for each ? and + |
| 302 | */ |
| 303 | if (*p == '?') { |
| 304 | /* |
| 305 | * if end of text then no RegMatch |
| 306 | */ |
| 307 | if (!*t++) |
| 308 | return reg_match_abort; |
| 309 | } |
| 310 | |
| 311 | /* |
| 312 | * move to next char in pattern |
| 313 | */ |
| 314 | p++; |
| 315 | } |
| 316 | |
| 317 | /* |
| 318 | * if end of pattern we have RegMatched regardless of text left |
| 319 | */ |
| 320 | if (!*p) |
| 321 | return reg_match_valid; |
| 322 | |
| 323 | /* |
| 324 | * get the next character to RegMatch which must be a literal or '[' |
| 325 | */ |
| 326 | nextp = *p; |
| 327 | if (nextp == '\\') { |
| 328 | nextp = p[1]; |
| 329 | |
| 330 | /* |
| 331 | * if end of text then we have a bad pattern |
| 332 | */ |
| 333 | if (!nextp) |
| 334 | return reg_match_bad_pattern; |
| 335 | } |
| 336 | |
| 337 | /* |
| 338 | * Continue until we run out of text or definite result seen |
| 339 | */ |
| 340 | do { |
| 341 | /* |
| 342 | * a precondition for RegMatching is that the next character in the pattern |
| 343 | * RegMatch the next character in the text or that the next pattern char is |
| 344 | * the beginning of a range. Increment text pointer as we go here |
| 345 | */ |
| 346 | |
| 347 | if (nextp == *t || nextp == '[') |
| 348 | RegMatch = reg_match(p, t); |
| 349 | |
| 350 | /* |
| 351 | * if the end of text is reached then no RegMatch |
| 352 | */ |
| 353 | |
| 354 | if (!*t++) |
| 355 | RegMatch = reg_match_abort; |
| 356 | |
| 357 | } while (RegMatch != reg_match_valid && RegMatch != reg_match_abort && RegMatch != reg_match_bad_pattern); |
| 358 | |
| 359 | /* |
| 360 | * return result |
| 361 | */ |
| 362 | return RegMatch; |
| 363 | } |
| 364 | |