| 1 | // This file is part of SmallBASIC | 
|---|
| 2 | // | 
|---|
| 3 | // The regular expression routines are based on match.c by J. Kercheval: | 
|---|
| 4 | // | 
|---|
| 5 | // This program is distributed under the terms of the GPL v2.0 or later | 
|---|
| 6 | // Download the GNU Public License (GPL) from www.gnu.org | 
|---|
| 7 | // | 
|---|
| 8 | // Copyright(C) 2000 Nicholas Christopoulos | 
|---|
| 9 |  | 
|---|
| 10 | /* | 
|---|
| 11 | Author: J. Kercheval | 
|---|
| 12 | Created: Sat, 01/05/1991  22:21:49 | 
|---|
| 13 |  | 
|---|
| 14 | J. Kercheval  Wed, 02/20/1991  22:29:01  Released to Public Domain | 
|---|
| 15 | J. Kercheval  Fri, 02/22/1991  15:29:01  fix '\' bugs (two :( of them) | 
|---|
| 16 | J. Kercheval  Sun, 03/10/1991  19:31:29  add error return to RegMatche() | 
|---|
| 17 | J. Kercheval  Sun, 03/10/1991  20:11:11  add IsValidRegPattern code | 
|---|
| 18 | J. Kercheval  Sun, 03/10/1991  20:37:11  beef up main() | 
|---|
| 19 | J. Kercheval  Tue, 03/12/1991  22:25:10  Released as V1.1 to Public Domain | 
|---|
| 20 |  | 
|---|
| 21 | The file match.c coexists in the same directory with the string class. | 
|---|
| 22 | */ | 
|---|
| 23 |  | 
|---|
| 24 | /** | 
|---|
| 25 | *  In the pattern string: | 
|---|
| 26 | *       `*' matches any sequence of characters (zero or more) | 
|---|
| 27 | *       `?' matches any single character | 
|---|
| 28 | *       [SET] matches any character in the specified set, | 
|---|
| 29 | *       [!SET] or [^SET] matches any character not in the specified set. | 
|---|
| 30 | * | 
|---|
| 31 | *  A set is composed of characters or ranges; a range looks like | 
|---|
| 32 | *  character hyphen character (as in 0-9 or A-Z).  [0-9a-zA-Z_] is the | 
|---|
| 33 | *  minimal set of characters allowed in the [..] pattern construct. | 
|---|
| 34 | *  Other characters are allowed (ie. 8 bit characters) if your system | 
|---|
| 35 | *  will support them. | 
|---|
| 36 | * | 
|---|
| 37 | * | 
|---|
| 38 | *  To suppress the special syntactic significance of any of `[]*?!^-\', | 
|---|
| 39 | *  and match the character exactly, precede it with a `\'. | 
|---|
| 40 | */ | 
|---|
| 41 |  | 
|---|
| 42 | #include "lib/match.h" | 
|---|
| 43 | #include "common/smbas.h" | 
|---|
| 44 | #include "common/sberr.h" | 
|---|
| 45 |  | 
|---|
| 46 | #ifdef USE_PCRE | 
|---|
| 47 | #include <pcre.h> | 
|---|
| 48 | #define OVECCOUNT 30            /* should be a multiple of 3 */ | 
|---|
| 49 | #endif | 
|---|
| 50 |  | 
|---|
/*
 * Forward declarations. The glob matcher and its `*' helper recurse into
 * one another, so both are declared before either is defined.
 */
int reg_match_jk(const char *p, char *t);
int reg_match_after_star(const char *p, char *t);
|---|
| 53 |  | 
|---|
| 54 | int reg_match_jk(const char *p, char *t) { | 
|---|
| 55 | char range_start, range_end; /* start and end in range */ | 
|---|
| 56 | int invert; /* is this [..] or [!..] */ | 
|---|
| 57 | int member_match; /* have I matched the [..] construct? */ | 
|---|
| 58 | int loop; /* should I terminate? */ | 
|---|
| 59 |  | 
|---|
| 60 | for (; *p; p++, t++) { | 
|---|
| 61 | /* | 
|---|
| 62 | * if this is the end of the text then this is the end of the reg_match | 
|---|
| 63 | */ | 
|---|
| 64 | if (*t == '\0') | 
|---|
| 65 | return (*p == '*' && *++p == '\0') ? reg_match_valid : reg_match_abort; | 
|---|
| 66 |  | 
|---|
| 67 | /* | 
|---|
| 68 | * determine and react to pattern type | 
|---|
| 69 | */ | 
|---|
| 70 | switch (*p) { | 
|---|
| 71 | case '?': /* single any character RegMatch */ | 
|---|
| 72 | break; | 
|---|
| 73 | case '*': /* multiple any character RegMatch */ | 
|---|
| 74 | return reg_match_after_star(p, t); | 
|---|
| 75 | case '[': /* [..] construct, single member/exclusion * | 
|---|
| 76 | * character RegMatch */ | 
|---|
| 77 | { | 
|---|
| 78 | /* | 
|---|
| 79 | * move to beginning of range | 
|---|
| 80 | */ | 
|---|
| 81 | p++; | 
|---|
| 82 |  | 
|---|
| 83 | /* | 
|---|
| 84 | * check if this is a member reg_match or exclusion reg_match | 
|---|
| 85 | */ | 
|---|
| 86 | invert = 0;             // false | 
|---|
| 87 | if (*p == '!' || *p == '^') { | 
|---|
| 88 | invert = -1;          // true | 
|---|
| 89 | p++; | 
|---|
| 90 | } | 
|---|
| 91 |  | 
|---|
| 92 | /* | 
|---|
| 93 | * if closing bracket here or at range start then we have a malformed | 
|---|
| 94 | * pattern | 
|---|
| 95 | */ | 
|---|
| 96 | if (*p == ']') | 
|---|
| 97 | return reg_match_bad_pattern; | 
|---|
| 98 |  | 
|---|
| 99 | member_match = 0;       // false | 
|---|
| 100 | loop = -1;              // true | 
|---|
| 101 |  | 
|---|
| 102 | while (loop) { /* if end of construct then loop is done */ | 
|---|
| 103 | if (*p == ']') { | 
|---|
| 104 | loop = 0;           // false | 
|---|
| 105 | continue; | 
|---|
| 106 | } | 
|---|
| 107 |  | 
|---|
| 108 | /* | 
|---|
| 109 | * RegMatching a '!', '^', '-', '\' or a ']' | 
|---|
| 110 | */ | 
|---|
| 111 | if (*p == '\\') | 
|---|
| 112 | range_start = range_end = *++p; | 
|---|
| 113 | else | 
|---|
| 114 | range_start = range_end = *p; | 
|---|
| 115 |  | 
|---|
| 116 | /* | 
|---|
| 117 | * if end of pattern then bad pattern (Missing ']') | 
|---|
| 118 | */ | 
|---|
| 119 | if (*p == '\0') | 
|---|
| 120 | return reg_match_bad_pattern; | 
|---|
| 121 |  | 
|---|
| 122 | /* | 
|---|
| 123 | * check for range bar | 
|---|
| 124 | */ | 
|---|
| 125 | if (*++p == '-') { | 
|---|
| 126 | /* | 
|---|
| 127 | * get the range end | 
|---|
| 128 | */ | 
|---|
| 129 | range_end = *++p; | 
|---|
| 130 |  | 
|---|
| 131 | /* | 
|---|
| 132 | * if end of pattern or construct then bad pattern | 
|---|
| 133 | */ | 
|---|
| 134 | if (range_end == '\0' || range_end == ']') | 
|---|
| 135 | return reg_match_bad_pattern; | 
|---|
| 136 |  | 
|---|
| 137 | /* | 
|---|
| 138 | * special character range end | 
|---|
| 139 | */ | 
|---|
| 140 | if (range_end == '\\') { | 
|---|
| 141 | range_end = *++p; | 
|---|
| 142 |  | 
|---|
| 143 | /* | 
|---|
| 144 | * if end of text then we have a bad pattern | 
|---|
| 145 | */ | 
|---|
| 146 | if (!range_end) | 
|---|
| 147 | return reg_match_bad_pattern; | 
|---|
| 148 | } | 
|---|
| 149 |  | 
|---|
| 150 | /* | 
|---|
| 151 | * move just beyond this range | 
|---|
| 152 | */ | 
|---|
| 153 | p++; | 
|---|
| 154 | } | 
|---|
| 155 |  | 
|---|
| 156 | /* | 
|---|
| 157 | * if the text character is in range then RegMatch found. make sure | 
|---|
| 158 | * the range letters have the proper relationship to one another | 
|---|
| 159 | * before comparison | 
|---|
| 160 | */ | 
|---|
| 161 |  | 
|---|
| 162 | if (range_start < range_end) { | 
|---|
| 163 | if (*t >= range_start && *t <= range_end) { | 
|---|
| 164 | member_match = -1;  // true | 
|---|
| 165 | loop = 0;         // false | 
|---|
| 166 | } | 
|---|
| 167 | } else { | 
|---|
| 168 | if (*t >= range_end && *t <= range_start) { | 
|---|
| 169 | member_match = -1;  // true | 
|---|
| 170 | loop = 0;         // false | 
|---|
| 171 | } | 
|---|
| 172 | } | 
|---|
| 173 | }                       // while ? | 
|---|
| 174 |  | 
|---|
| 175 | /* | 
|---|
| 176 | * if there was a match in an exclusion set then no match | 
|---|
| 177 | */ | 
|---|
| 178 | /* | 
|---|
| 179 | * if there was no match in a member set then no match | 
|---|
| 180 | */ | 
|---|
| 181 |  | 
|---|
| 182 | if ((invert && member_match) || !(invert || member_match)) | 
|---|
| 183 | return reg_match_range_failure; | 
|---|
| 184 |  | 
|---|
| 185 | /* | 
|---|
| 186 | * if this is not an exclusion then skip the rest of the [...] | 
|---|
| 187 | * construct that already RegMatched. | 
|---|
| 188 | */ | 
|---|
| 189 |  | 
|---|
| 190 | if (member_match) { | 
|---|
| 191 | while (*p != ']') { | 
|---|
| 192 | /* | 
|---|
| 193 | * bad pattern (Missing ']') | 
|---|
| 194 | */ | 
|---|
| 195 | if (*p == '\0') | 
|---|
| 196 | return reg_match_bad_pattern; | 
|---|
| 197 |  | 
|---|
| 198 | /* | 
|---|
| 199 | * skip exact RegMatch | 
|---|
| 200 | */ | 
|---|
| 201 | if (*p == '\\') { | 
|---|
| 202 | p++; | 
|---|
| 203 |  | 
|---|
| 204 | /* | 
|---|
| 205 | * if end of text then we have a bad pattern | 
|---|
| 206 | */ | 
|---|
| 207 | if (*p == '\0') | 
|---|
| 208 | return reg_match_bad_pattern; | 
|---|
| 209 | } | 
|---|
| 210 |  | 
|---|
| 211 | /* | 
|---|
| 212 | * move to next pattern char | 
|---|
| 213 | */ | 
|---|
| 214 | p++; | 
|---|
| 215 | }                     // while | 
|---|
| 216 | } | 
|---|
| 217 | break; | 
|---|
| 218 | } | 
|---|
| 219 | case '\\': /* next character is quoted and must match * | 
|---|
| 220 | * exactly */ | 
|---|
| 221 | /* | 
|---|
| 222 | * move pattern pointer to quoted char and fall through | 
|---|
| 223 | */ | 
|---|
| 224 | p++; | 
|---|
| 225 |  | 
|---|
| 226 | /* | 
|---|
| 227 | * if end of text then we have a bad pattern | 
|---|
| 228 | */ | 
|---|
| 229 | if (*p == '\0') | 
|---|
| 230 | return reg_match_bad_pattern; | 
|---|
| 231 |  | 
|---|
| 232 | /* | 
|---|
| 233 | * must match this character exactly | 
|---|
| 234 | */ | 
|---|
| 235 | default: | 
|---|
| 236 | if (*p != *t) | 
|---|
| 237 | return reg_match_literal_failure; | 
|---|
| 238 | }                           // switch! | 
|---|
| 239 | }                             // first for | 
|---|
| 240 |  | 
|---|
| 241 | /* | 
|---|
| 242 | * if end of text not reached then the pattern fails | 
|---|
| 243 | */ | 
|---|
| 244 | if (*t) | 
|---|
| 245 | return reg_match_premature_end; | 
|---|
| 246 | return reg_match_valid; | 
|---|
| 247 | } | 
|---|
| 248 |  | 
|---|
| 249 | /* | 
|---|
| 250 | */ | 
|---|
| 251 | #ifdef USE_PCRE | 
|---|
| 252 | int reg_match_pcre(const char *p, char *t) | 
|---|
| 253 | { | 
|---|
| 254 | pcre *re; | 
|---|
| 255 | const char *error; | 
|---|
| 256 | int errofs; | 
|---|
| 257 |  | 
|---|
| 258 | re = | 
|---|
| 259 | pcre_compile(p, (opt_usepcre == 2) ? PCRE_CASELESS : 0, &error, &errofs, NULL); | 
|---|
| 260 | if (!re) { | 
|---|
| 261 | rt_raise( "REGULAR EXPRESSION SYNTAX ERROR (offset %d) -> %s", error, errofs); | 
|---|
| 262 | return reg_match_bad_pattern; | 
|---|
| 263 | } | 
|---|
| 264 | else { | 
|---|
| 265 | int rc; | 
|---|
| 266 | int ovector[OVECCOUNT]; | 
|---|
| 267 |  | 
|---|
| 268 | rc = pcre_exec(re, NULL, t, strlen(t), 0, 0, ovector, OVECCOUNT); | 
|---|
| 269 | if (rc >= 0) | 
|---|
| 270 | return reg_match_valid; | 
|---|
| 271 | } | 
|---|
| 272 |  | 
|---|
| 273 | return reg_match_literal_failure; | 
|---|
| 274 | } | 
|---|
| 275 | #endif | 
|---|
| 276 |  | 
|---|
| 277 | /* | 
|---|
| 278 | */ | 
|---|
/**
 * Matching entry point: selects the matching engine for a pattern.
 *
 * When the file is built with USE_PCRE and opt_usepcre is non-zero the
 * pattern is handed to reg_match_pcre(); in every other case the built-in
 * glob matcher reg_match_jk() is used.
 *
 * @param p  NUL-terminated pattern
 * @param t  NUL-terminated text
 * @return   the status code produced by the selected matcher
 */
int reg_match(const char *p, char *t) {
#ifdef USE_PCRE
  if (opt_usepcre != 0) {
    return reg_match_pcre(p, t);
  }
#endif
  return reg_match_jk(p, t);
}
|---|
| 286 |  | 
|---|
| 287 | /*---------------------------------------------------------------------------- | 
|---|
| 288 | * | 
|---|
| 289 | * recursively call RegMatche() with final segment of PATTERN and of TEXT. | 
|---|
| 290 | * | 
|---|
| 291 | ----------------------------------------------------------------------------*/ | 
|---|
| 292 | int reg_match_after_star(const char *p, char *t) { | 
|---|
| 293 | int RegMatch = 1;             // unused code | 
|---|
| 294 | int nextp; | 
|---|
| 295 |  | 
|---|
| 296 | /* | 
|---|
| 297 | * pass over existing ? and * in pattern | 
|---|
| 298 | */ | 
|---|
| 299 | while (*p == '?' || *p == '*') { | 
|---|
| 300 | /* | 
|---|
| 301 | * take one char for each ? and + | 
|---|
| 302 | */ | 
|---|
| 303 | if (*p == '?') { | 
|---|
| 304 | /* | 
|---|
| 305 | * if end of text then no RegMatch | 
|---|
| 306 | */ | 
|---|
| 307 | if (!*t++) | 
|---|
| 308 | return reg_match_abort; | 
|---|
| 309 | } | 
|---|
| 310 |  | 
|---|
| 311 | /* | 
|---|
| 312 | * move to next char in pattern | 
|---|
| 313 | */ | 
|---|
| 314 | p++; | 
|---|
| 315 | } | 
|---|
| 316 |  | 
|---|
| 317 | /* | 
|---|
| 318 | * if end of pattern we have RegMatched regardless of text left | 
|---|
| 319 | */ | 
|---|
| 320 | if (!*p) | 
|---|
| 321 | return reg_match_valid; | 
|---|
| 322 |  | 
|---|
| 323 | /* | 
|---|
| 324 | * get the next character to RegMatch which must be a literal or '[' | 
|---|
| 325 | */ | 
|---|
| 326 | nextp = *p; | 
|---|
| 327 | if (nextp == '\\') { | 
|---|
| 328 | nextp = p[1]; | 
|---|
| 329 |  | 
|---|
| 330 | /* | 
|---|
| 331 | * if end of text then we have a bad pattern | 
|---|
| 332 | */ | 
|---|
| 333 | if (!nextp) | 
|---|
| 334 | return reg_match_bad_pattern; | 
|---|
| 335 | } | 
|---|
| 336 |  | 
|---|
| 337 | /* | 
|---|
| 338 | * Continue until we run out of text or definite result seen | 
|---|
| 339 | */ | 
|---|
| 340 | do { | 
|---|
| 341 | /* | 
|---|
| 342 | * a precondition for RegMatching is that the next character in the pattern | 
|---|
| 343 | * RegMatch the next character in the text or that the next pattern char is | 
|---|
| 344 | * the beginning of a range.  Increment text pointer as we go here | 
|---|
| 345 | */ | 
|---|
| 346 |  | 
|---|
| 347 | if (nextp == *t || nextp == '[') | 
|---|
| 348 | RegMatch = reg_match(p, t); | 
|---|
| 349 |  | 
|---|
| 350 | /* | 
|---|
| 351 | * if the end of text is reached then no RegMatch | 
|---|
| 352 | */ | 
|---|
| 353 |  | 
|---|
| 354 | if (!*t++) | 
|---|
| 355 | RegMatch = reg_match_abort; | 
|---|
| 356 |  | 
|---|
| 357 | } while (RegMatch != reg_match_valid && RegMatch != reg_match_abort && RegMatch != reg_match_bad_pattern); | 
|---|
| 358 |  | 
|---|
| 359 | /* | 
|---|
| 360 | * return result | 
|---|
| 361 | */ | 
|---|
| 362 | return RegMatch; | 
|---|
| 363 | } | 
|---|
| 364 |  | 
|---|