| 1 | /* |
| 2 | * This Source Code Form is subject to the terms of the Mozilla Public |
| 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
| 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| 5 | * |
| 6 | * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V. |
| 7 | */ |
| 8 | |
| 9 | /* |
| 10 | * N. Nes |
| 11 | * PCRE library interface |
| 12 | * The PCRE library is a set of functions that implement regular |
| 13 | * expression pattern matching using the same syntax and semantics as Perl, |
| 14 | * with just a few differences. The current implementation of PCRE |
| 15 | * (release 4.x) corresponds approximately with Perl 5.8, including support |
| 16 | * for UTF-8 encoded strings. However, this support has to be |
| 17 | * explicitly enabled; it is not the default. |
| 18 | * |
| 19 | * ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre |
| 20 | */ |
| 21 | #include "monetdb_config.h" |
| 22 | #include <string.h> |
| 23 | |
| 24 | #include "mal.h" |
| 25 | #include "mal_exception.h" |
| 26 | |
| 27 | #include <wchar.h> |
| 28 | #include <wctype.h> |
| 29 | |
| 30 | #ifdef HAVE_LIBPCRE |
| 31 | #include <pcre.h> |
| 32 | #ifndef PCRE_STUDY_JIT_COMPILE |
| 33 | /* old library version on e.g. EPEL 6 */ |
| 34 | #define pcre_free_study(x) pcre_free(x) |
| 35 | #define PCRE_STUDY_JIT_COMPILE 0 |
| 36 | #endif |
| 37 | #define JIT_COMPILE_MIN 1024 /* when to try JIT compilation of patterns */ |
| 38 | |
| 39 | #else |
| 40 | |
| 41 | #include <regex.h> |
| 42 | |
| 43 | typedef regex_t pcre; |
| 44 | #endif |
| 45 | |
| 46 | mal_export str pcre_init(void *ret); |
| 47 | |
| 48 | mal_export str PCREquote(str *r, const str *v); |
| 49 | mal_export str PCREmatch(bit *ret, const str *val, const str *pat); |
| 50 | mal_export str PCREimatch(bit *ret, const str *val, const str *pat); |
| 51 | mal_export str PCREindex(int *ret, const pcre *pat, const str *val); |
| 52 | mal_export str PCREpatindex(int *ret, const str *pat, const str *val); |
| 53 | mal_export str PCREreplace_wrap(str *res, const str *or, const str *pat, const str *repl, const str *flags); |
| 54 | mal_export str PCREreplace_bat_wrap(bat *res, const bat *or, const str *pat, const str *repl, const str *flags); |
| 55 | mal_export str PCREreplacefirst_wrap(str *res, const str *or, const str *pat, const str *repl, const str *flags); |
| 56 | mal_export str PCREreplacefirst_bat_wrap(bat *res, const bat *or, const str *pat, const str *repl, const str *flags); |
| 57 | mal_export str PCREsql2pcre(str *ret, const str *pat, const str *esc); |
| 58 | |
| 59 | mal_export str PCRElike3(bit *ret, const str *s, const str *pat, const str *esc); |
| 60 | mal_export str PCRElike2(bit *ret, const str *s, const str *pat); |
| 61 | mal_export str PCREnotlike3(bit *ret, const str *s, const str *pat, const str *esc); |
| 62 | mal_export str PCREnotlike2(bit *ret, const str *s, const str *pat); |
| 63 | mal_export str BATPCRElike(bat *ret, const bat *b, const str *pat, const str *esc); |
| 64 | mal_export str BATPCRElike2(bat *ret, const bat *b, const str *pat); |
| 65 | mal_export str BATPCREnotlike(bat *ret, const bat *b, const str *pat, const str *esc); |
| 66 | mal_export str BATPCREnotlike2(bat *ret, const bat *b, const str *pat); |
| 67 | mal_export str PCREilike3(bit *ret, const str *s, const str *pat, const str *esc); |
| 68 | mal_export str PCREilike2(bit *ret, const str *s, const str *pat); |
| 69 | mal_export str PCREnotilike3(bit *ret, const str *s, const str *pat, const str *esc); |
| 70 | mal_export str PCREnotilike2(bit *ret, const str *s, const str *pat); |
| 71 | mal_export str BATPCREilike(bat *ret, const bat *b, const str *pat, const str *esc); |
| 72 | mal_export str BATPCREilike2(bat *ret, const bat *b, const str *pat); |
| 73 | mal_export str BATPCREnotilike(bat *ret, const bat *b, const str *pat, const str *esc); |
| 74 | mal_export str BATPCREnotilike2(bat *ret, const bat *b, const str *pat); |
| 75 | |
| 76 | mal_export str PCRElikeselect2(bat *ret, const bat *bid, const bat *sid, const str *pat, const str *esc, const bit *caseignore, const bit *anti); |
| 77 | mal_export str PCRElikeselect1(bat *ret, const bat *bid, const bat *cid, const str *pat, const str *esc, const bit *anti); |
| 78 | mal_export str PCRElikeselect3(bat *ret, const bat *bid, const bat *sid, const str *pat, const str *esc, const bit *anti); |
| 79 | mal_export str PCRElikeselect4(bat *ret, const bat *bid, const bat *cid, const str *pat, const bit *anti); |
| 80 | mal_export str PCRElikeselect5(bat *ret, const bat *bid, const bat *sid, const str *pat, const bit *anti); |
| 81 | |
| 82 | mal_export str LIKEjoin(bat *r1, bat *r2, const bat *lid, const bat *rid, const str *esc, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate); |
| 83 | mal_export str LIKEjoin1(bat *r1, bat *r2, const bat *lid, const bat *rid, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate); |
| 84 | mal_export str ILIKEjoin(bat *r1, bat *r2, const bat *lid, const bat *rid, const str *esc, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate); |
| 85 | mal_export str ILIKEjoin1(bat *r1, bat *r2, const bat *lid, const bat *rid, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate); |
| 86 | |
/* One segment of a simple LIKE pattern.  A pattern is represented as
 * a singly linked list of these segments, built by re_create and
 * walked by re_match_ignore / re_match_no_ignore. */
| 87 | /* current implementation assumes simple %keyword% [keyw%]* */ |
| 88 | typedef struct RE { |
/* segment text as bytes; this is the field read by the case-sensitive
 * matcher (re_match_no_ignore) */
| 89 | char *k; |
/* segment text as Unicode code points; this is the field read by the
 * case-insensitive matcher (re_match_ignore) */
| 90 | uint32_t *w; |
/* true when the segment may be found anywhere in the remaining
 * subject (it was preceded by %), false when it must match at the
 * current position */
| 91 | bool search; |
/* length recorded by re_create when the % that ends the segment is
 * seen; the matchers use it to step over the matched text */
| 92 | size_t len; |
/* next segment of the pattern, or NULL for the last one */
| 93 | struct RE *n; |
| 94 | } RE; |
| 95 | |
| 96 | /* We cannot use strcasecmp and strncasecmp since they work byte for |
| 97 | * byte and don't deal with multibyte encodings (such as UTF-8). |
| 98 | * |
| 99 | * We implement our own conversion from UTF-8 encoding to Unicode code |
| 100 | * points which we store in uint32_t. The reason for this is, |
| 101 | * functions like mbsrtowcs are locale-dependent (so we need a UTF-8 |
| 102 | * locale to use them), and on Windows, wchar_t is only 2 bytes and |
| 103 | * therefore cannot hold all Unicode code points. We do use functions |
| 104 | * such as towlower to convert a Unicode code point to its lower-case |
| 105 | * equivalent, but again on Windows, if the code point doesn't fit in |
| 106 | * 2 bytes, we skip this conversion and compare the unconverted code |
| 107 | * points. |
| 108 | * |
| 109 | * Note, towlower is also locale-dependent, but we don't need a UTF-8 |
| 110 | * locale in order to use it. */ |
| 111 | |
/* Helper function to convert one UTF-8 multibyte character to a
 * Unicode code point.
 *
 * dest receives the decoded code point; it is left untouched when
 *      the input is invalid.
 * src  points at a NUL-terminated UTF-8 string.
 *
 * Returns the number of bytes consumed (1 to 4), 0 when src points at
 * the terminating NUL (in which case *dest is set to 0), or
 * (size_t) -1 when the bytes at src are not a valid UTF-8 sequence:
 * a bad lead or continuation byte, an overlong encoding, a UTF-16
 * surrogate (U+D800..U+DFFF) or a value above U+10FFFF. */
static size_t
utfc8touc(uint32_t *restrict dest, const char *restrict src)
{
	const unsigned char *u = (const unsigned char *) src;
	uint32_t c;

	if (u[0] < 0x80) {
		/* plain ASCII, including the terminating NUL */
		*dest = u[0];
		return u[0] != 0;
	}
	/* the continuation bytes are tested left to right so that we
	 * never read beyond a terminating NUL */
	if ((u[0] & 0xE0) == 0xC0) {
		if ((u[1] & 0xC0) != 0x80)
			return (size_t) -1;
		c = (uint32_t) (u[0] & 0x1F) << 6
			| (uint32_t) (u[1] & 0x3F);
		if (c < 0x80)
			return (size_t) -1;	/* overlong encoding */
		*dest = c;
		return 2;
	}
	if ((u[0] & 0xF0) == 0xE0) {
		if ((u[1] & 0xC0) != 0x80 || (u[2] & 0xC0) != 0x80)
			return (size_t) -1;
		c = (uint32_t) (u[0] & 0x0F) << 12
			| (uint32_t) (u[1] & 0x3F) << 6
			| (uint32_t) (u[2] & 0x3F);
		if (c < 0x800)
			return (size_t) -1;	/* overlong encoding */
		/* surrogates can only show up in the three byte form,
		 * so this is where they have to be rejected */
		if ((c & 0xF800) == 0xD800)
			return (size_t) -1;
		*dest = c;
		return 3;
	}
	if ((u[0] & 0xF8) == 0xF0) {
		if ((u[1] & 0xC0) != 0x80
			|| (u[2] & 0xC0) != 0x80
			|| (u[3] & 0xC0) != 0x80)
			return (size_t) -1;
		c = (uint32_t) (u[0] & 0x07) << 18
			| (uint32_t) (u[1] & 0x3F) << 12
			| (uint32_t) (u[2] & 0x3F) << 6
			| (uint32_t) (u[3] & 0x3F);
		if (c < 0x10000 || c > 0x10FFFF)
			return (size_t) -1;	/* overlong or out of range */
		*dest = c;
		return 4;
	}
	return (size_t) -1;
}
| 152 | |
/* Helper function to convert a UTF-8 string to a string of Unicode
 * code points.
 *
 * src is a NUL-terminated UTF-8 string.
 *
 * Returns a buffer allocated with GDKmalloc that holds the code
 * points followed by a terminating 0; the caller must release it with
 * GDKfree.  Returns NULL when src is not valid UTF-8 (as judged by
 * utfc8touc) or when the allocation fails.
 *
 * Validation and decoding are both delegated to utfc8touc so that
 * there is only a single UTF-8 decoder to maintain. */
static uint32_t *
utf8stoucs(const char *src)
{
	uint32_t *dest;
	uint32_t c;
	const char *p;
	size_t nbytes;
	size_t ncodes = 0;

	/* first pass: count the code points and validate the input */
	for (p = src; (nbytes = utfc8touc(&c, p)) != 0; p += nbytes) {
		if (nbytes == (size_t) -1)
			return NULL;
		ncodes++;
	}
	dest = GDKmalloc((ncodes + 1) * sizeof(uint32_t));
	if (dest == NULL)
		return NULL;
	/* second pass: the input is known to be valid, so utfc8touc
	 * cannot fail here */
	ncodes = 0;
	for (p = src; (nbytes = utfc8touc(&c, p)) != 0; p += nbytes)
		dest[ncodes++] = c;
	dest[ncodes] = 0;
	return dest;
}
| 225 | |
/* Return the number of code points in the 0-terminated code point
 * string ucs, not counting the terminator. */
static size_t
myucslen(const uint32_t *ucs)
{
	const uint32_t *end = ucs;

	while (*end != 0)
		end++;
	return (size_t) (end - ucs);
}
| 235 | |
/* Case-insensitively compare the first n2 code points of the UTF-8
 * string s1 with the code point string s2.
 *
 * Returns 0 when the compared parts are equal and a non-zero value
 * otherwise.  When s1 ends (or stops being valid UTF-8) before n2
 * code points were compared, the result is 0 if s2 ends at the same
 * point and -1 if it does not; when only s2 ends early the result is
 * 1.
 *
 * utfc8touc only ever reports 0, 1..4 or (size_t) -1, and the first
 * two are handled up front, so no further error check is needed. */
static int
mywstrncasecmp(const char *restrict s1, const uint32_t *restrict s2, size_t n2)
{
	for (; n2 > 0; n2--, s2++) {
		uint32_t c1;
		size_t nn1 = utfc8touc(&c1, s1);

		if (nn1 == 0 || nn1 == (size_t) -1)
			return -(*s2 != 0);
		if (*s2 == 0)
			return 1;
#if SIZEOF_WCHAR_T == 2
		/* towlower cannot handle code points that do not fit
		 * in a 2 byte wchar_t, compare those unconverted */
		if (c1 > 0xFFFF || *s2 > 0xFFFF) {
			if (c1 != *s2)
				return c1 - *s2;
		} else
#endif
		{
			wint_t l1 = towlower((wint_t) c1);
			wint_t l2 = towlower((wint_t) *s2);

			if (l1 != l2)
				return l1 - l2;
		}
		s1 += nn1;
	}
	return 0;
}
| 263 | |
/* Case-insensitively compare the UTF-8 strings s1 and s2, code point
 * by code point.
 *
 * Returns 0 when the strings are equal and a non-zero value
 * otherwise.  The end of a string and an invalid UTF-8 sequence are
 * treated alike: when s1 stops first the result is 0 if s2 stops at
 * the same point and -1 if it does not; when only s2 stops the result
 * is 1.
 *
 * utfc8touc only ever reports 0, 1..4 or (size_t) -1, and the first
 * two are handled up front, so no further error check is needed. */
static int
mystrcasecmp(const char *s1, const char *s2)
{
	for (;;) {
		uint32_t c1, c2;
		size_t nn1 = utfc8touc(&c1, s1);
		size_t nn2 = utfc8touc(&c2, s2);

		if (nn1 == 0 || nn1 == (size_t) -1)
			return -(nn2 != 0 && nn2 != (size_t) -1);
		if (nn2 == 0 || nn2 == (size_t) -1)
			return 1;
#if SIZEOF_WCHAR_T == 2
		/* towlower cannot handle code points that do not fit
		 * in a 2 byte wchar_t, compare those unconverted */
		if (c1 > 0xFFFF || c2 > 0xFFFF) {
			if (c1 != c2)
				return c1 - c2;
		} else
#endif
		{
			wint_t l1 = towlower((wint_t) c1);
			wint_t l2 = towlower((wint_t) c2);

			if (l1 != l2)
				return l1 - l2;
		}
		s1 += nn1;
		s2 += nn2;
	}
}
| 291 | |
/* Case-insensitively compare the UTF-8 string s1 with the
 * 0-terminated code point string s2.
 *
 * Returns 0 when the strings are equal and a non-zero value
 * otherwise.  When s1 ends (or stops being valid UTF-8) the result is
 * 0 if s2 ends at the same point and -1 if it does not; when only s2
 * ends the result is 1.
 *
 * utfc8touc only ever reports 0, 1..4 or (size_t) -1, and the first
 * two are handled up front, so no further error check is needed. */
static int
mywstrcasecmp(const char *restrict s1, const uint32_t *restrict s2)
{
	for (;; s2++) {
		uint32_t c1;
		size_t nn1 = utfc8touc(&c1, s1);

		if (nn1 == 0 || nn1 == (size_t) -1)
			return -(*s2 != 0);
		if (*s2 == 0)
			return 1;
#if SIZEOF_WCHAR_T == 2
		/* towlower cannot handle code points that do not fit
		 * in a 2 byte wchar_t, compare those unconverted */
		if (c1 > 0xFFFF || *s2 > 0xFFFF) {
			if (c1 != *s2)
				return c1 - *s2;
		} else
#endif
		{
			wint_t l1 = towlower((wint_t) c1);
			wint_t l2 = towlower((wint_t) *s2);

			if (l1 != l2)
				return l1 - l2;
		}
		s1 += nn1;
	}
}
| 317 | |
/* Case-insensitive substring search: find the first occurrence of the
 * code point string wneedle in the UTF-8 string haystack.
 *
 * Returns a pointer to the start of the match inside haystack,
 * haystack itself when wneedle is empty, or NULL when there is no
 * match.  NULL is also returned as soon as an invalid UTF-8 sequence
 * is encountered in haystack. */
static const char *
mywstrcasestr(const char *restrict haystack, const uint32_t *restrict wneedle)
{
	size_t nlen = myucslen(wneedle);

	if (nlen == 0)
		return haystack;

	while (*haystack) {
		size_t i;
		size_t h = 0;		/* byte offset within haystack */
		size_t step = 0;	/* byte length of first character */

		for (i = 0; i < nlen; i++) {
			uint32_t c;
			size_t j = utfc8touc(&c, haystack + h);

			/* running into the end of the haystack means no
			 * later start position can match either */
			if (j == 0 || j == (size_t) -1)
				return NULL;
			if (i == 0)
				step = j;
#if SIZEOF_WCHAR_T == 2
			/* towlower cannot handle code points that do
			 * not fit in a 2 byte wchar_t */
			if (c > 0xFFFF || wneedle[i] > 0xFFFF) {
				if (c != wneedle[i])
					break;
			} else
#endif
			if (towlower((wint_t) c) != towlower((wint_t) wneedle[i]))
				break;
			h += j;
		}
		if (i == nlen)
			return haystack;
		/* retry one whole character further on */
		haystack += step;
	}
	return NULL;
}
| 357 | |
/* Tell whether a LIKE pattern is simple enough for the RE matcher:
 * returns true if pat contains no unescaped `_' (single character
 * match) and its last character is an unescaped `%' (any sequence
 * match).  esc is the escape character; a character that follows it
 * is taken literally.  A NULL pattern is not simple. */
static bool
re_simple(const char *pat, unsigned char esc)
{
	bool ends_with_percent = false;
	bool in_escape = false;
	const char *p = pat;

	if (p == NULL)
		return false;

	/* a leading % is never treated as an escape character */
	if (*p == '%') {
		ends_with_percent = true;
		p++;
	}
	for (; *p != '\0'; p++) {
		unsigned char ch = (unsigned char) *p;

		/* only the very last character decides the result */
		ends_with_percent = false;
		if (in_escape) {
			in_escape = false;
			continue;
		}
		if (ch == esc) {
			in_escape = true;
			continue;
		}
		if (ch == '_')
			return false;
		if (ch == '%')
			ends_with_percent = true;
	}
	return ends_with_percent;
}
| 388 | |
| 389 | static bool |
| 390 | is_strcmpable(const char *pat, const char *esc) |
| 391 | { |
| 392 | if (pat[strcspn(pat, "%_" )]) |
| 393 | return false; |
| 394 | return strlen(esc) == 0 || strcmp(esc, str_nil) == 0 || strstr(pat, esc) == NULL; |
| 395 | } |
| 396 | |
| 397 | static bool |
| 398 | re_match_ignore(const char *s, RE *pattern) |
| 399 | { |
| 400 | RE *r; |
| 401 | |
| 402 | for (r = pattern; r; r = r->n) { |
| 403 | if (*r->w == 0 && (r->search || *s == 0)) |
| 404 | return true; |
| 405 | if (!*s || |
| 406 | (r->search ? (s = mywstrcasestr(s, r->w)) == NULL : mywstrncasecmp(s, r->w, r->len) != 0)) |
| 407 | return false; |
| 408 | s += r->len; |
| 409 | } |
| 410 | return true; |
| 411 | } |
| 412 | |
| 413 | static bool |
| 414 | re_match_no_ignore(const char *s, RE *pattern) |
| 415 | { |
| 416 | RE *r; |
| 417 | |
| 418 | for (r = pattern; r; r = r->n) { |
| 419 | if (*r->k == 0 && (r->search || *s == 0)) |
| 420 | return true; |
| 421 | if (!*s || |
| 422 | (r->search ? (s = strstr(s, r->k)) == NULL : strncmp(s, r->k, r->len) != 0)) |
| 423 | return false; |
| 424 | s += r->len; |
| 425 | } |
| 426 | return true; |
| 427 | } |
| 428 | |
| 429 | static void |
| 430 | re_destroy(RE *p) |
| 431 | { |
| 432 | if (p) { |
| 433 | GDKfree(p->k); |
| 434 | GDKfree(p->w); |
| 435 | do { |
| 436 | RE *n = p->n; |
| 437 | |
| 438 | GDKfree(p); |
| 439 | p = n; |
| 440 | } while (p); |
| 441 | } |
| 442 | } |
| 443 | |
| 444 | /* Create a linked list of RE structures. Depending on the caseignore |
| 445 | * flag, the w (if true) or the k (if false) field is used. These |
| 446 | * fields in the first structure are allocated, whereas in all |
| 447 | * subsequent structures the fields point into the allocated buffer of |
| 448 | * the first. */ |
| 449 | static RE * |
| 450 | re_create(const char *pat, bool caseignore, uint32_t esc) |
| 451 | { |
| 452 | RE *r = (RE*)GDKmalloc(sizeof(RE)), *n = r; |
| 453 | bool escaped = false; |
| 454 | |
| 455 | if (r == NULL) |
| 456 | return NULL; |
| 457 | *r = (struct RE) {.search = false}; |
| 458 | |
| 459 | while (esc != '%' && *pat == '%') { |
| 460 | pat++; /* skip % */ |
| 461 | r->search = true; |
| 462 | } |
| 463 | if (caseignore) { |
| 464 | uint32_t *wp; |
| 465 | uint32_t *wq; |
| 466 | wp = utf8stoucs(pat); |
| 467 | if (wp == NULL) { |
| 468 | GDKfree(r); |
| 469 | return NULL; |
| 470 | } |
| 471 | r->w = wp; |
| 472 | wq = wp; |
| 473 | while (*wp) { |
| 474 | if (escaped) { |
| 475 | *wq++ = *wp; |
| 476 | escaped = false; |
| 477 | } else if (*wp == esc) { |
| 478 | escaped = true; |
| 479 | } else if (*wp == '%') { |
| 480 | n->len = (size_t) (wq - r->w); |
| 481 | while (wp[1] == '%') |
| 482 | wp++; |
| 483 | if (wp[1]) { |
| 484 | n = n->n = GDKmalloc(sizeof(RE)); |
| 485 | if (n == NULL) |
| 486 | goto bailout; |
| 487 | *n = (struct RE) {.search = true, .w = wp + 1}; |
| 488 | } |
| 489 | *wq++ = 0; |
| 490 | } else { |
| 491 | *wq++ = *wp; |
| 492 | } |
| 493 | wp++; |
| 494 | } |
| 495 | } else { |
| 496 | char *p, *q; |
| 497 | if ((p = GDKstrdup(pat)) == NULL) { |
| 498 | GDKfree(r); |
| 499 | return NULL; |
| 500 | } |
| 501 | r->k = p; |
| 502 | q = p; |
| 503 | while (*p) { |
| 504 | if (escaped) { |
| 505 | *q++ = *p; |
| 506 | escaped = false; |
| 507 | } else if ((unsigned char) *p == esc) { |
| 508 | escaped = true; |
| 509 | } else if (*p == '%') { |
| 510 | n->len = (size_t) (q - r->k); |
| 511 | while (p[1] == '%') |
| 512 | p++; |
| 513 | if (p[1]) { |
| 514 | n = n->n = GDKmalloc(sizeof(RE)); |
| 515 | if (n == NULL) |
| 516 | goto bailout; |
| 517 | *n = (struct RE) {.search = true, .k = p + 1}; |
| 518 | } |
| 519 | *q++ = 0; |
| 520 | } else { |
| 521 | *q++ = *p; |
| 522 | } |
| 523 | p++; |
| 524 | } |
| 525 | } |
| 526 | return r; |
| 527 | bailout: |
| 528 | re_destroy(r); |
| 529 | return NULL; |
| 530 | } |
| 531 | |
| 532 | #ifdef HAVE_LIBPCRE |
| 533 | static str |
| 534 | pcre_compile_wrap(pcre **res, const char *pattern, bit insensitive) |
| 535 | { |
| 536 | pcre *r; |
| 537 | const char *err_p = NULL; |
| 538 | int errpos = 0; |
| 539 | int options = PCRE_UTF8 | PCRE_MULTILINE; |
| 540 | if (insensitive) |
| 541 | options |= PCRE_CASELESS; |
| 542 | |
| 543 | if ((r = pcre_compile(pattern, options, &err_p, &errpos, NULL)) == NULL) { |
| 544 | throw(MAL, "pcre.compile" , OPERATION_FAILED |
| 545 | " with\n'%s'\nat %d in\n'%s'.\n" , |
| 546 | err_p, errpos, pattern); |
| 547 | } |
| 548 | *res = r; |
| 549 | return MAL_SUCCEED; |
| 550 | } |
| 551 | #endif |
| 552 | |
| 553 | /* these two defines are copies from gdk_select.c */ |
| 554 | |
/* scan select loop with candidates: for each of the ci.ncand
 * candidates, fetch the candidate oid into o, load the corresponding
 * value of b into v and append o to bn when TEST holds; uses the
 * variables b, s, anti, bi, bn, ci, off, o, p, r and v of the
 * enclosing function */
#define candscanloop(TEST) \
	do { \
		ALGODEBUG fprintf(stderr, \
			"#BATselect(b=%s#"BUNFMT",s=%s,anti=%d): " \
			"scanselect %s\n", BATgetId(b), BATcount(b), \
			s ? BATgetId(s) : "NULL", anti, #TEST); \
		for (p = 0; p < ci.ncand; p++) { \
			o = canditer_next(&ci); \
			r = (BUN) (o - off); \
			v = BUNtvar(bi, r); \
			if (!(TEST)) \
				continue; \
			bunfastappTYPE(oid, bn, &o); \
		} \
	} while (0)
| 570 | |
/* scan select loop without candidates: for every position p up to
 * (not including) q, load the value at p-off of b into v and append p
 * as oid to bn when TEST holds; uses the variables b, s, anti, bi,
 * bn, off, o, p, q and v of the enclosing function */
#define scanloop(TEST) \
	do { \
		ALGODEBUG fprintf(stderr, \
			"#BATselect(b=%s#"BUNFMT",s=%s,anti=%d): " \
			"scanselect %s\n", BATgetId(b), BATcount(b), \
			s ? BATgetId(s) : "NULL", anti, #TEST); \
		for (; p < q; p++) { \
			v = BUNtvar(bi, p-off); \
			if (!(TEST)) \
				continue; \
			o = (oid) p; \
			bunfastappTYPE(oid, bn, &o); \
		} \
	} while (0)
| 587 | |
/* Regular expression based select.
 *
 * Collects in a new oid BAT, returned through *bnp, the oids of the
 * values of b (restricted to the candidates in s when s is given)
 * whose string matches the regular expression pat or, when anti is
 * set, does not match it.  A value that is a NULL pointer or whose
 * first byte is '\200' is never selected (NOTE(review): '\200' is
 * presumably the first byte of the str nil value -- confirm).
 *
 * pat is compiled with PCRE when HAVE_LIBPCRE is defined and with
 * POSIX regcomp otherwise; caseignore adds the case-insensitive
 * option.
 *
 * Returns MAL_SUCCEED, or an exception when compiling or studying the
 * pattern fails, when the result BAT cannot be allocated, or when
 * control reaches the bunins_failed label. */
| 588 | static str |
| 589 | pcre_likeselect(BAT **bnp, BAT *b, BAT *s, const char *pat, bool caseignore, bool anti) |
| 590 | { |
| 591 | #ifdef HAVE_LIBPCRE |
| 592 | int options = PCRE_UTF8 | PCRE_MULTILINE | PCRE_DOTALL; |
| 593 | pcre *re; |
| 594 | pcre_extra *pe; |
| 595 | const char *error; |
| 596 | int errpos; |
| 597 | int ovector[9]; |
| 598 | #else |
| 599 | int options = REG_NEWLINE | REG_NOSUB | REG_EXTENDED; |
| 600 | regex_t re; |
| 601 | int errcode; |
| 602 | #endif |
| 603 | BATiter bi = bat_iterator(b); |
| 604 | BAT *bn; |
| 605 | BUN p, q; |
| 606 | oid o, off; |
| 607 | const char *v; |
| 608 | struct canditer ci; |
| 609 | |
| 610 | canditer_init(&ci, b, s); |
| 611 | |
| 612 | assert(ATOMstorage(b->ttype) == TYPE_str); |
| 613 | |
| 614 | if (caseignore) { |
| 615 | #ifdef HAVE_LIBPCRE |
| 616 | options |= PCRE_CASELESS; |
| 617 | #else |
| 618 | options |= REG_ICASE; |
| 619 | #endif |
| 620 | } |
/* compile the pattern; nothing has been allocated yet, so a failure
 * here can simply throw */
| 621 | #ifdef HAVE_LIBPCRE |
| 622 | if ((re = pcre_compile(pat, options, &error, &errpos, NULL)) == NULL) |
| 623 | throw(MAL, "pcre.likeselect" , |
| 624 | OPERATION_FAILED ": compilation of pattern \"%s\" failed\n" , pat); |
/* study the pattern; JIT compilation is only requested when more than
 * JIT_COMPILE_MIN values (count of s if given, else of b) are scanned */
| 625 | pe = pcre_study(re, (s ? BATcount(s) : BATcount(b)) > JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0, &error); |
| 626 | if (error != NULL) { |
| 627 | pcre_free(re); |
| 628 | throw(MAL, "pcre.likeselect" , |
| 629 | OPERATION_FAILED ": studying pattern \"%s\" failed\n" , pat); |
| 630 | } |
| 631 | #else |
| 632 | if ((errcode = regcomp(&re, pat, options)) != 0) { |
| 633 | throw(MAL, "pcre.likeselect" , |
| 634 | OPERATION_FAILED ": compilation of pattern \"%s\" failed\n" , pat); |
| 635 | } |
| 636 | #endif |
/* the result can hold at most one oid per candidate */
| 637 | bn = COLnew(0, TYPE_oid, ci.ncand, TRANSIENT); |
| 638 | if (bn == NULL) { |
| 639 | #ifdef HAVE_LIBPCRE |
| 640 | pcre_free_study(pe); |
| 641 | pcre_free(re); |
| 642 | #else |
| 643 | regfree(&re); |
| 644 | #endif |
| 645 | throw(MAL, "pcre.likeselect" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 646 | } |
| 647 | off = b->hseqbase; |
| 648 | |
/* non-dense candidate list: visit the candidates through ci */
| 649 | if (s && !BATtdense(s)) { |
| 650 | BUN r; |
| 651 | |
/* BODY is the per-value match test used by all scan loops below; it
 * reads the loop variable v and the compiled pattern */
| 652 | #ifdef HAVE_LIBPCRE |
| 653 | #define BODY (pcre_exec(re, pe, v, (int) strlen(v), 0, 0, ovector, 9) >= 0) |
| 654 | #else |
| 655 | #define BODY (regexec(&re, v, (size_t) 0, NULL, 0) != REG_NOMATCH) |
| 656 | #endif |
| 657 | if (anti) |
| 658 | candscanloop(v && *v != '\200' && !BODY); |
| 659 | else |
| 660 | candscanloop(v && *v != '\200' && BODY); |
| 661 | } else { |
/* dense candidate list or no candidates: scan the oid range [p, q),
 * clipped to the oids that are present in b */
| 662 | if (s) { |
| 663 | assert(BATtdense(s)); |
| 664 | p = (BUN) s->tseqbase; |
| 665 | q = p + BATcount(s); |
| 666 | if ((oid) p < b->hseqbase) |
| 667 | p = b->hseqbase; |
| 668 | if ((oid) q > b->hseqbase + BATcount(b)) |
| 669 | q = b->hseqbase + BATcount(b); |
| 670 | } else { |
| 671 | p = off; |
| 672 | q = BUNlast(b) + off; |
| 673 | } |
| 674 | if (anti) |
| 675 | scanloop(v && *v != '\200' && !BODY); |
| 676 | else |
| 677 | scanloop(v && *v != '\200' && BODY); |
| 678 | } |
| 679 | #ifdef HAVE_LIBPCRE |
| 680 | pcre_free_study(pe); |
| 681 | pcre_free(re); |
| 682 | #else |
| 683 | regfree(&re); |
| 684 | #endif |
/* the oids were appended in scan order: mark the result as sorted
 * and key, and give it a sequence base only when it has at most one
 * element */
| 685 | BATsetcount(bn, BATcount(bn)); /* set some properties */ |
| 686 | bn->theap.dirty |= BATcount(bn) > 0; |
| 687 | bn->tsorted = true; |
| 688 | bn->trevsorted = bn->batCount <= 1; |
| 689 | bn->tkey = true; |
| 690 | bn->tseqbase = bn->batCount == 0 ? 0 : bn->batCount == 1 ? * (oid *) Tloc(bn, 0) : oid_nil; |
| 691 | *bnp = bn; |
| 692 | return MAL_SUCCEED; |
| 693 | |
/* NOTE(review): this label is not referenced explicitly in this
 * function; presumably bunfastappTYPE (used inside the scan loop
 * macros) jumps here when appending to bn fails -- confirm */
| 694 | bunins_failed: |
| 695 | BBPreclaim(bn); |
| 696 | #ifdef HAVE_LIBPCRE |
| 697 | pcre_free_study(pe); |
| 698 | pcre_free(re); |
| 699 | #else |
| 700 | regfree(&re); |
| 701 | #endif |
| 702 | *bnp = NULL; |
| 703 | throw(MAL, "pcre.likeselect" , OPERATION_FAILED); |
| 704 | } |
| 705 | |
| 706 | static str |
| 707 | re_likeselect(BAT **bnp, BAT *b, BAT *s, const char *pat, bool caseignore, bool anti, bool use_strcmp, uint32_t esc) |
| 708 | { |
| 709 | BATiter bi = bat_iterator(b); |
| 710 | BAT *bn; |
| 711 | BUN p, q; |
| 712 | oid o, off; |
| 713 | const char *v; |
| 714 | RE *re = NULL; |
| 715 | |
| 716 | assert(ATOMstorage(b->ttype) == TYPE_str); |
| 717 | |
| 718 | bn = COLnew(0, TYPE_oid, s ? BATcount(s) : BATcount(b), TRANSIENT); |
| 719 | if (bn == NULL) |
| 720 | throw(MAL, "pcre.likeselect" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 721 | off = b->hseqbase; |
| 722 | |
| 723 | if (!use_strcmp) { |
| 724 | re = re_create(pat, caseignore, esc); |
| 725 | if (!re) |
| 726 | throw(MAL, "pcre.likeselect" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 727 | } |
| 728 | if (s && !BATtdense(s)) { |
| 729 | struct canditer ci; |
| 730 | BUN r; |
| 731 | |
| 732 | canditer_init(&ci, b, s); |
| 733 | |
| 734 | if (use_strcmp) { |
| 735 | if (caseignore) { |
| 736 | uint32_t *wpat; |
| 737 | wpat = utf8stoucs(pat); |
| 738 | if (wpat == NULL) |
| 739 | throw(MAL, "pcre.likeselect" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 740 | if (anti) |
| 741 | candscanloop(v && *v != '\200' && |
| 742 | mywstrcasecmp(v, wpat) != 0); |
| 743 | else |
| 744 | candscanloop(v && *v != '\200' && |
| 745 | mywstrcasecmp(v, wpat) == 0); |
| 746 | GDKfree(wpat); |
| 747 | } else { |
| 748 | if (anti) |
| 749 | candscanloop(v && *v != '\200' && |
| 750 | strcmp(v, pat) != 0); |
| 751 | else |
| 752 | candscanloop(v && *v != '\200' && |
| 753 | strcmp(v, pat) == 0); |
| 754 | } |
| 755 | } else { |
| 756 | if (caseignore) { |
| 757 | if (anti) |
| 758 | candscanloop(v && *v != '\200' && |
| 759 | re_match_ignore(v, re) == 0); |
| 760 | else |
| 761 | candscanloop(v && *v != '\200' && |
| 762 | re_match_ignore(v, re)); |
| 763 | } else { |
| 764 | if (anti) |
| 765 | candscanloop(v && *v != '\200' && |
| 766 | re_match_no_ignore(v, re) == 0); |
| 767 | else |
| 768 | candscanloop(v && *v != '\200' && |
| 769 | re_match_no_ignore(v, re)); |
| 770 | } |
| 771 | } |
| 772 | } else { |
| 773 | if (s) { |
| 774 | assert(BATtdense(s)); |
| 775 | p = (BUN) s->tseqbase; |
| 776 | q = p + BATcount(s); |
| 777 | if ((oid) p < b->hseqbase) |
| 778 | p = b->hseqbase; |
| 779 | if ((oid) q > b->hseqbase + BATcount(b)) |
| 780 | q = b->hseqbase + BATcount(b); |
| 781 | } else { |
| 782 | p = off; |
| 783 | q = BUNlast(b) + off; |
| 784 | } |
| 785 | if (use_strcmp) { |
| 786 | if (caseignore) { |
| 787 | uint32_t *wpat; |
| 788 | wpat = utf8stoucs(pat); |
| 789 | if (wpat == NULL) |
| 790 | throw(MAL, "pcre.likeselect" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 791 | if (anti) |
| 792 | scanloop(v && *v != '\200' && |
| 793 | mywstrcasecmp(v, wpat) != 0); |
| 794 | else |
| 795 | scanloop(v && *v != '\200' && |
| 796 | mywstrcasecmp(v, wpat) == 0); |
| 797 | GDKfree(wpat); |
| 798 | } else { |
| 799 | if (anti) |
| 800 | scanloop(v && *v != '\200' && |
| 801 | strcmp(v, pat) != 0); |
| 802 | else |
| 803 | scanloop(v && *v != '\200' && |
| 804 | strcmp(v, pat) == 0); |
| 805 | } |
| 806 | } else { |
| 807 | if (caseignore) { |
| 808 | if (anti) |
| 809 | scanloop(v && *v != '\200' && |
| 810 | re_match_ignore(v, re) == 0); |
| 811 | else |
| 812 | scanloop(v && *v != '\200' && |
| 813 | re_match_ignore(v, re)); |
| 814 | } else { |
| 815 | if (anti) |
| 816 | scanloop(v && *v != '\200' && |
| 817 | re_match_no_ignore(v, re) == 0); |
| 818 | else |
| 819 | scanloop(v && *v != '\200' && |
| 820 | re_match_no_ignore(v, re)); |
| 821 | } |
| 822 | } |
| 823 | } |
| 824 | BATsetcount(bn, BATcount(bn)); /* set some properties */ |
| 825 | bn->tsorted = true; |
| 826 | bn->trevsorted = bn->batCount <= 1; |
| 827 | bn->tkey = true; |
| 828 | bn->tseqbase = bn->batCount == 0 ? 0 : bn->batCount == 1 ? * (oid *) Tloc(bn, 0) : oid_nil; |
| 829 | *bnp = bn; |
| 830 | re_destroy(re); |
| 831 | return MAL_SUCCEED; |
| 832 | |
| 833 | bunins_failed: |
| 834 | re_destroy(re); |
| 835 | BBPreclaim(bn); |
| 836 | *bnp = NULL; |
| 837 | throw(MAL, "pcre.likeselect" , OPERATION_FAILED); |
| 838 | } |
| 839 | |
| 840 | /* maximum number of back references and quoted \ or $ in replacement string */ |
| 841 | #define MAX_NR_REFS 20 |
| 842 | |
/* Describes one reference found in a replacement string by
 * parse_replacement: either a back reference (a `$' or `\' followed
 * by a number) or a doubled `$' or `\'. */
| 843 | struct backref { |
/* number of the referenced group; INT_MAX for a doubled `$' or `\' */
| 844 | int idx; |
/* offset in the replacement string of the introducing `$' or `\' */
| 845 | int start; |
/* offset in the replacement string where copying of the literal text
 * resumes after this reference */
| 846 | int end; |
| 847 | }; |
| 848 | |
| 849 | #ifdef HAVE_LIBPCRE |
| 850 | /* fill in parameter backrefs (length maxrefs) with information about |
| 851 | * back references in the replacement string; a back reference is a |
| 852 | * dollar or backslash followed by a number */ |
| 853 | static int |
| 854 | parse_replacement(const char *replacement, int len_replacement, |
| 855 | struct backref *backrefs, int maxrefs) |
| 856 | { |
| 857 | int nbackrefs = 0; |
| 858 | |
| 859 | for (int i = 0; i < len_replacement && nbackrefs < maxrefs; i++) { |
| 860 | if (replacement[i] == '$' || replacement[i] == '\\') { |
| 861 | char *endptr; |
| 862 | backrefs[nbackrefs].idx = strtol(replacement + i + 1, &endptr, 10); |
| 863 | if (endptr > replacement + i + 1) { |
| 864 | int k = (int) (endptr - (replacement + i + 1)); |
| 865 | backrefs[nbackrefs].start = i; |
| 866 | backrefs[nbackrefs].end = i + k + 1; |
| 867 | nbackrefs++; |
| 868 | } else if (replacement[i] == replacement[i + 1]) { |
| 869 | /* doubled $ or \, we must copy just one to the output */ |
| 870 | backrefs[nbackrefs].idx = INT_MAX; /* impossible value > 0 */ |
| 871 | backrefs[nbackrefs].start = i; |
| 872 | backrefs[nbackrefs].end = i + 1; |
| 873 | i++; /* don't look at second $ or \ again */ |
| 874 | nbackrefs++; |
| 875 | } |
| 876 | /* else: $ or \ followed by something we don't recognize, |
| 877 | * so just leave it */ |
| 878 | } |
| 879 | } |
| 880 | return nbackrefs; |
| 881 | } |
| 882 | |
| 883 | static char * |
| 884 | single_replace(pcre *pcre_code, pcre_extra *, |
| 885 | const char *origin_str, int len_origin_str, |
| 886 | int exec_options, int *ovector, int ovecsize, |
| 887 | const char *replacement, int len_replacement, |
| 888 | struct backref *backrefs, int nbackrefs, |
| 889 | bool global, char *result, int *max_result) |
| 890 | { |
| 891 | int offset = 0; |
| 892 | int len_result = 0; |
| 893 | int addlen; |
| 894 | char *tmp; |
| 895 | |
| 896 | do { |
| 897 | int j = pcre_exec(pcre_code, extra, origin_str, len_origin_str, offset, |
| 898 | exec_options, ovector, ovecsize); |
| 899 | if (j <= 0) |
| 900 | break; |
| 901 | addlen = ovector[0] - offset + (nbackrefs == 0 ? len_replacement : 0); |
| 902 | if (len_result + addlen >= *max_result) { |
| 903 | tmp = GDKrealloc(result, len_result + addlen + 1); |
| 904 | if (tmp == NULL) { |
| 905 | GDKfree(result); |
| 906 | return NULL; |
| 907 | } |
| 908 | result = tmp; |
| 909 | *max_result = len_result + addlen + 1; |
| 910 | } |
| 911 | if (ovector[0] > offset) { |
| 912 | strncpy(result + len_result, origin_str + offset, |
| 913 | ovector[0] - offset); |
| 914 | len_result += ovector[0] - offset; |
| 915 | } |
| 916 | if (nbackrefs == 0) { |
| 917 | strncpy(result + len_result, replacement, len_replacement); |
| 918 | len_result += len_replacement; |
| 919 | } else { |
| 920 | int prevend = 0; |
| 921 | for (int i = 0; i < nbackrefs; i++) { |
| 922 | int off, len; |
| 923 | if (backrefs[i].idx >= ovecsize / 3) { |
| 924 | /* out of bounds, replace with empty string */ |
| 925 | off = 0; |
| 926 | len = 0; |
| 927 | } else { |
| 928 | off = ovector[backrefs[i].idx * 2]; |
| 929 | len = ovector[backrefs[i].idx * 2 + 1] - off; |
| 930 | } |
| 931 | addlen = backrefs[i].start - prevend + len; |
| 932 | if (len_result + addlen >= *max_result) { |
| 933 | tmp = GDKrealloc(result, len_result + addlen + 1); |
| 934 | if (tmp == NULL) { |
| 935 | GDKfree(result); |
| 936 | return NULL; |
| 937 | } |
| 938 | result = tmp; |
| 939 | *max_result = len_result + addlen + 1; |
| 940 | } |
| 941 | if (backrefs[i].start > prevend) { |
| 942 | strncpy(result + len_result, replacement + prevend, |
| 943 | backrefs[i].start - prevend); |
| 944 | len_result += backrefs[i].start - prevend; |
| 945 | } |
| 946 | if (len > 0) { |
| 947 | strncpy(result + len_result, origin_str + off, len); |
| 948 | len_result += len; |
| 949 | } |
| 950 | prevend = backrefs[i].end; |
| 951 | } |
| 952 | /* copy rest of replacement string (after last backref) */ |
| 953 | addlen = len_replacement - prevend; |
| 954 | if (addlen > 0) { |
| 955 | if (len_result + addlen >= *max_result) { |
| 956 | tmp = GDKrealloc(result, len_result + addlen + 1); |
| 957 | if (tmp == NULL) { |
| 958 | GDKfree(result); |
| 959 | return NULL; |
| 960 | } |
| 961 | result = tmp; |
| 962 | *max_result = len_result + addlen + 1; |
| 963 | } |
| 964 | strncpy(result + len_result, replacement + prevend, addlen); |
| 965 | len_result += addlen; |
| 966 | } |
| 967 | } |
| 968 | offset = ovector[1]; |
| 969 | } while (offset < len_origin_str && global); |
| 970 | if (offset < len_origin_str) { |
| 971 | addlen = len_origin_str - offset; |
| 972 | if (len_result + addlen >= *max_result) { |
| 973 | tmp = GDKrealloc(result, len_result + addlen + 1); |
| 974 | if (tmp == NULL) { |
| 975 | GDKfree(result); |
| 976 | return NULL; |
| 977 | } |
| 978 | result = tmp; |
| 979 | *max_result = len_result + addlen + 1; |
| 980 | } |
| 981 | strncpy(result + len_result, origin_str + offset, addlen); |
| 982 | len_result += addlen; |
| 983 | } |
| 984 | /* null terminate string */ |
| 985 | result[len_result] = '\0'; |
| 986 | return result; |
| 987 | } |
| 988 | #endif |
| 989 | |
| 990 | static str |
| 991 | pcre_replace(str *res, const char *origin_str, const char *pattern, |
| 992 | const char *replacement, const char *flags, bool global) |
| 993 | { |
| 994 | #ifdef HAVE_LIBPCRE |
| 995 | const char *err_p = NULL; |
| 996 | pcre *pcre_code = NULL; |
| 997 | pcre_extra *; |
| 998 | char *tmpres; |
| 999 | int max_result; |
| 1000 | int i, errpos = 0; |
| 1001 | int compile_options = PCRE_UTF8, exec_options = PCRE_NOTEMPTY; |
| 1002 | int *ovector, ovecsize; |
| 1003 | int len_origin_str = (int) strlen(origin_str); |
| 1004 | int len_replacement = (int) strlen(replacement); |
| 1005 | struct backref backrefs[MAX_NR_REFS]; |
| 1006 | int nbackrefs = 0; |
| 1007 | |
| 1008 | while (*flags) { |
| 1009 | switch (*flags) { |
| 1010 | case 'e': |
| 1011 | exec_options &= ~PCRE_NOTEMPTY; |
| 1012 | break; |
| 1013 | case 'i': |
| 1014 | compile_options |= PCRE_CASELESS; |
| 1015 | break; |
| 1016 | case 'm': |
| 1017 | compile_options |= PCRE_MULTILINE; |
| 1018 | break; |
| 1019 | case 's': |
| 1020 | compile_options |= PCRE_DOTALL; |
| 1021 | break; |
| 1022 | case 'x': |
| 1023 | compile_options |= PCRE_EXTENDED; |
| 1024 | break; |
| 1025 | default: |
| 1026 | throw(MAL, global ? "pcre.replace" : "pcre.replace_first" , |
| 1027 | ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n" , |
| 1028 | *flags); |
| 1029 | } |
| 1030 | flags++; |
| 1031 | } |
| 1032 | |
| 1033 | if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) { |
| 1034 | throw(MAL, global ? "pcre.replace" : "pcre.replace_first" , |
| 1035 | OPERATION_FAILED ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n" , |
| 1036 | pattern, errpos, err_p); |
| 1037 | } |
| 1038 | |
| 1039 | /* Since the compiled pattern is going to be used several times, it is |
| 1040 | * worth spending more time analyzing it in order to speed up the time |
| 1041 | * taken for matching. |
| 1042 | */ |
| 1043 | extra = pcre_study(pcre_code, 0, &err_p); |
| 1044 | if (err_p != NULL) { |
| 1045 | pcre_free(pcre_code); |
| 1046 | throw(MAL, global ? "pcre.replace" : "pcre.replace_first" , |
| 1047 | OPERATION_FAILED ": pcre study of pattern (%s) failed with '%s'.\n" , |
| 1048 | pattern, err_p); |
| 1049 | } |
| 1050 | pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i); |
| 1051 | ovecsize = (i + 1) * 3; |
| 1052 | if ((ovector = (int *) GDKmalloc(sizeof(int) * ovecsize)) == NULL) { |
| 1053 | pcre_free_study(extra); |
| 1054 | pcre_free(pcre_code); |
| 1055 | throw(MAL, global ? "pcre.replace" : "pcre.replace_first" , |
| 1056 | SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 1057 | } |
| 1058 | |
| 1059 | /* identify back references in the replacement string */ |
| 1060 | nbackrefs = parse_replacement(replacement, len_replacement, |
| 1061 | backrefs, MAX_NR_REFS); |
| 1062 | |
| 1063 | max_result = len_origin_str + 1; |
| 1064 | tmpres = GDKmalloc(max_result); |
| 1065 | if (tmpres == NULL) { |
| 1066 | GDKfree(ovector); |
| 1067 | pcre_free_study(extra); |
| 1068 | pcre_free(pcre_code); |
| 1069 | throw(MAL, global ? "pcre.replace" : "pcre.replace_first" , |
| 1070 | SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 1071 | } |
| 1072 | |
| 1073 | tmpres = single_replace(pcre_code, extra, origin_str, len_origin_str, |
| 1074 | exec_options, ovector, ovecsize, replacement, |
| 1075 | len_replacement, backrefs, nbackrefs, global, |
| 1076 | tmpres, &max_result); |
| 1077 | GDKfree(ovector); |
| 1078 | pcre_free_study(extra); |
| 1079 | pcre_free(pcre_code); |
| 1080 | if (tmpres == NULL) |
| 1081 | throw(MAL, global ? "pcre.replace" : "pcre.replace_first" , |
| 1082 | SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 1083 | |
| 1084 | *res = tmpres; |
| 1085 | return MAL_SUCCEED; |
| 1086 | #else |
| 1087 | (void) res; |
| 1088 | (void) origin_str; |
| 1089 | (void) pattern; |
| 1090 | (void) replacement; |
| 1091 | (void) flags; |
| 1092 | (void) global; |
| 1093 | throw(MAL, global ? "pcre.replace" : "pcre.replace_first" , |
| 1094 | "Database was compiled without PCRE support." ); |
| 1095 | #endif |
| 1096 | } |
| 1097 | |
| 1098 | static str |
| 1099 | pcre_replace_bat(BAT **res, BAT *origin_strs, const char *pattern, |
| 1100 | const char *replacement, const char *flags, bool global) |
| 1101 | { |
| 1102 | #ifdef HAVE_LIBPCRE |
| 1103 | BATiter origin_strsi = bat_iterator(origin_strs); |
| 1104 | const char *err_p = NULL; |
| 1105 | char *tmpres; |
| 1106 | int i, errpos = 0; |
| 1107 | int compile_options = PCRE_UTF8, exec_options = PCRE_NOTEMPTY; |
| 1108 | pcre *pcre_code = NULL; |
| 1109 | pcre_extra *; |
| 1110 | BAT *tmpbat; |
| 1111 | BUN p, q; |
| 1112 | int *ovector, ovecsize; |
| 1113 | int len_replacement = (int) strlen(replacement); |
| 1114 | struct backref backrefs[MAX_NR_REFS]; |
| 1115 | int nbackrefs = 0; |
| 1116 | const char *origin_str; |
| 1117 | int max_dest_size = 0; |
| 1118 | |
| 1119 | while (*flags) { |
| 1120 | switch (*flags) { |
| 1121 | case 'e': |
| 1122 | exec_options &= ~PCRE_NOTEMPTY; |
| 1123 | break; |
| 1124 | case 'i': |
| 1125 | compile_options |= PCRE_CASELESS; |
| 1126 | break; |
| 1127 | case 'm': |
| 1128 | compile_options |= PCRE_MULTILINE; |
| 1129 | break; |
| 1130 | case 's': |
| 1131 | compile_options |= PCRE_DOTALL; |
| 1132 | break; |
| 1133 | case 'x': |
| 1134 | compile_options |= PCRE_EXTENDED; |
| 1135 | break; |
| 1136 | default: |
| 1137 | throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first" , |
| 1138 | ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n" , |
| 1139 | *flags); |
| 1140 | } |
| 1141 | flags++; |
| 1142 | } |
| 1143 | |
| 1144 | if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) { |
| 1145 | throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first" , |
| 1146 | OPERATION_FAILED |
| 1147 | ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n" , |
| 1148 | pattern, errpos, err_p); |
| 1149 | } |
| 1150 | |
| 1151 | /* Since the compiled pattern is going to be used several times, |
| 1152 | * it is worth spending more time analyzing it in order to speed |
| 1153 | * up the time taken for matching. |
| 1154 | */ |
| 1155 | extra = pcre_study(pcre_code, BATcount(origin_strs) > JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0, &err_p); |
| 1156 | if (err_p != NULL) { |
| 1157 | pcre_free(pcre_code); |
| 1158 | throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first" , |
| 1159 | OPERATION_FAILED); |
| 1160 | } |
| 1161 | pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i); |
| 1162 | ovecsize = (i + 1) * 3; |
| 1163 | if ((ovector = (int *) GDKzalloc(sizeof(int) * ovecsize)) == NULL) { |
| 1164 | pcre_free_study(extra); |
| 1165 | pcre_free(pcre_code); |
| 1166 | throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first" , |
| 1167 | SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 1168 | } |
| 1169 | |
| 1170 | /* identify back references in the replacement string */ |
| 1171 | nbackrefs = parse_replacement(replacement, len_replacement, |
| 1172 | backrefs, MAX_NR_REFS); |
| 1173 | |
| 1174 | tmpbat = COLnew(origin_strs->hseqbase, TYPE_str, BATcount(origin_strs), TRANSIENT); |
| 1175 | |
| 1176 | /* the buffer for all destination strings is allocated only once, |
| 1177 | * and extended when needed */ |
| 1178 | max_dest_size = len_replacement + 1; |
| 1179 | tmpres = GDKmalloc(max_dest_size); |
| 1180 | if (tmpbat == NULL || tmpres == NULL) { |
| 1181 | pcre_free_study(extra); |
| 1182 | pcre_free(pcre_code); |
| 1183 | GDKfree(ovector); |
| 1184 | BBPreclaim(tmpbat); |
| 1185 | GDKfree(tmpres); |
| 1186 | throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first" , |
| 1187 | SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 1188 | } |
| 1189 | BATloop(origin_strs, p, q) { |
| 1190 | origin_str = BUNtvar(origin_strsi, p); |
| 1191 | tmpres = single_replace(pcre_code, extra, origin_str, |
| 1192 | (int) strlen(origin_str), exec_options, |
| 1193 | ovector, ovecsize, replacement, |
| 1194 | len_replacement, backrefs, nbackrefs, global, |
| 1195 | tmpres, &max_dest_size); |
| 1196 | if (tmpres == NULL || BUNappend(tmpbat, tmpres, false) != GDK_SUCCEED) { |
| 1197 | pcre_free_study(extra); |
| 1198 | pcre_free(pcre_code); |
| 1199 | GDKfree(ovector); |
| 1200 | GDKfree(tmpres); |
| 1201 | BBPreclaim(tmpbat); |
| 1202 | throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first" , |
| 1203 | SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 1204 | } |
| 1205 | } |
| 1206 | pcre_free_study(extra); |
| 1207 | pcre_free(pcre_code); |
| 1208 | GDKfree(ovector); |
| 1209 | GDKfree(tmpres); |
| 1210 | *res = tmpbat; |
| 1211 | return MAL_SUCCEED; |
| 1212 | #else |
| 1213 | (void) res; |
| 1214 | (void) origin_strs; |
| 1215 | (void) pattern; |
| 1216 | (void) replacement; |
| 1217 | (void) flags; |
| 1218 | (void) global; |
| 1219 | throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first" , |
| 1220 | "Database was compiled without PCRE support." ); |
| 1221 | #endif |
| 1222 | } |
| 1223 | |
| 1224 | str |
| 1225 | pcre_init(void *ret) |
| 1226 | { |
| 1227 | (void) ret; |
| 1228 | return NULL; |
| 1229 | } |
| 1230 | |
| 1231 | static str |
| 1232 | pcre_match_with_flags(bit *ret, const char *val, const char *pat, const char *flags) |
| 1233 | { |
| 1234 | int pos; |
| 1235 | #ifdef HAVE_LIBPCRE |
| 1236 | const char *err_p = NULL; |
| 1237 | int errpos = 0; |
| 1238 | int options = PCRE_UTF8; |
| 1239 | pcre *re; |
| 1240 | #else |
| 1241 | int options = REG_NOSUB; |
| 1242 | regex_t re; |
| 1243 | int errcode; |
| 1244 | int retval; |
| 1245 | #endif |
| 1246 | |
| 1247 | while (*flags) { |
| 1248 | switch (*flags) { |
| 1249 | case 'i': |
| 1250 | #ifdef HAVE_LIBPCRE |
| 1251 | options |= PCRE_CASELESS; |
| 1252 | #else |
| 1253 | options |= REG_ICASE; |
| 1254 | #endif |
| 1255 | break; |
| 1256 | case 'm': |
| 1257 | #ifdef HAVE_LIBPCRE |
| 1258 | options |= PCRE_MULTILINE; |
| 1259 | #else |
| 1260 | options |= REG_NEWLINE; |
| 1261 | #endif |
| 1262 | break; |
| 1263 | #ifdef HAVE_LIBPCRE |
| 1264 | case 's': |
| 1265 | options |= PCRE_DOTALL; |
| 1266 | break; |
| 1267 | #endif |
| 1268 | case 'x': |
| 1269 | #ifdef HAVE_LIBPCRE |
| 1270 | options |= PCRE_EXTENDED; |
| 1271 | #else |
| 1272 | options |= REG_EXTENDED; |
| 1273 | #endif |
| 1274 | break; |
| 1275 | default: |
| 1276 | throw(MAL, "pcre.match" , ILLEGAL_ARGUMENT |
| 1277 | ": unsupported flag character '%c'\n" , *flags); |
| 1278 | } |
| 1279 | flags++; |
| 1280 | } |
| 1281 | if (strcmp(val, str_nil) == 0) { |
| 1282 | *ret = FALSE; |
| 1283 | return MAL_SUCCEED; |
| 1284 | } |
| 1285 | |
| 1286 | #ifdef HAVE_LIBPCRE |
| 1287 | if ((re = pcre_compile(pat, options, &err_p, &errpos, NULL)) == NULL) |
| 1288 | #else |
| 1289 | if ((errcode = regcomp(&re, pat, options)) != 0) |
| 1290 | #endif |
| 1291 | { |
| 1292 | throw(MAL, "pcre.match" , OPERATION_FAILED |
| 1293 | ": compilation of regular expression (%s) failed " |
| 1294 | #ifdef HAVE_LIBPCRE |
| 1295 | "at %d with '%s'" , pat, errpos, err_p |
| 1296 | #else |
| 1297 | , pat |
| 1298 | #endif |
| 1299 | ); |
| 1300 | } |
| 1301 | #ifdef HAVE_LIBPCRE |
| 1302 | pos = pcre_exec(re, NULL, val, (int) strlen(val), 0, 0, NULL, 0); |
| 1303 | pcre_free(re); |
| 1304 | #else |
| 1305 | retval = regexec(&re, val, (size_t) 0, NULL, 0); |
| 1306 | pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0); |
| 1307 | regfree(&re); |
| 1308 | #endif |
| 1309 | if (pos >= 0) |
| 1310 | *ret = TRUE; |
| 1311 | else if (pos == -1) |
| 1312 | *ret = FALSE; |
| 1313 | else |
| 1314 | throw(MAL, "pcre.match" , OPERATION_FAILED |
| 1315 | ": matching of regular expression (%s) failed with %d" , |
| 1316 | pat, pos); |
| 1317 | return MAL_SUCCEED; |
| 1318 | } |
| 1319 | |
#ifdef HAVE_LIBPCRE
/* Characters that are special in a PCRE pattern and therefore need to
 * be escaped when they have to be matched literally.  The pattern
 * translation functions in this file look characters up in this string
 * with strchr. */
static const char *pcre_specials = ".+?*()[]{}|^$\\";
#else
/* Characters that are special in POSIX basic regular expressions and
 * therefore need to be escaped when they have to be matched literally.
 * The pattern translation functions in this file look characters up in
 * this string with strchr. */
static const char *pcre_specials = ".*[]^$\\";
#endif
| 1328 | |
| 1329 | /* change SQL LIKE pattern into PCRE pattern */ |
| 1330 | static str |
| 1331 | sql2pcre(str *r, const char *pat, const char *esc_str) |
| 1332 | { |
| 1333 | int escaped = 0; |
| 1334 | int hasWildcard = 0; |
| 1335 | char *ppat; |
| 1336 | int esc = esc_str[0] == '\200' ? 0 : esc_str[0]; /* should change to utf8_convert() */ |
| 1337 | int specials; |
| 1338 | int c; |
| 1339 | |
| 1340 | if (strlen(esc_str) > 1) |
| 1341 | throw(MAL, "pcre.sql2pcre" , SQLSTATE(22019) ILLEGAL_ARGUMENT ": ESCAPE string must have length 1" ); |
| 1342 | if (pat == NULL ) |
| 1343 | throw(MAL, "pcre.sql2pcre" , OPERATION_FAILED); |
| 1344 | ppat = GDKmalloc(strlen(pat)*3+3 /* 3 = "^'the translated regexp'$0" */); |
| 1345 | if (ppat == NULL) |
| 1346 | throw(MAL, "pcre.sql2pcre" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 1347 | |
| 1348 | *r = ppat; |
| 1349 | /* The escape character can be a char which is special in a PCRE |
| 1350 | * expression. If the user used the "+" char as escape and has "++" |
| 1351 | * in their pattern, then replacing this with "+" is not correct and |
| 1352 | * should be "\+" instead. */ |
| 1353 | specials = (esc && strchr(pcre_specials, esc) != NULL); |
| 1354 | |
| 1355 | *ppat++ = '^'; |
| 1356 | while ((c = *pat++) != 0) { |
| 1357 | if (c == esc) { |
| 1358 | if (escaped) { |
| 1359 | if (specials) { /* change ++ into \+ */ |
| 1360 | *ppat++ = esc; |
| 1361 | } else { /* do not escape simple escape symbols */ |
| 1362 | ppat[-1] = esc; /* overwrite backslash */ |
| 1363 | } |
| 1364 | escaped = 0; |
| 1365 | } else { |
| 1366 | *ppat++ = '\\'; |
| 1367 | escaped = 1; |
| 1368 | } |
| 1369 | hasWildcard = 1; |
| 1370 | } else if (strchr(pcre_specials, c) != NULL) { |
| 1371 | /* escape PCRE special chars, avoid double backslash if the |
| 1372 | * user uses an invalid escape sequence */ |
| 1373 | if (!escaped) |
| 1374 | *ppat++ = '\\'; |
| 1375 | *ppat++ = c; |
| 1376 | hasWildcard = 1; |
| 1377 | escaped = 0; |
| 1378 | } else if (c == '%' && !escaped) { |
| 1379 | *ppat++ = '.'; |
| 1380 | *ppat++ = '*'; |
| 1381 | *ppat++ = '?'; |
| 1382 | hasWildcard = 1; |
| 1383 | /* collapse multiple %, but only if it isn't the escape */ |
| 1384 | if (esc != '%') |
| 1385 | while (*pat == '%') |
| 1386 | pat++; |
| 1387 | } else if (c == '_' && !escaped) { |
| 1388 | *ppat++ = '.'; |
| 1389 | hasWildcard = 1; |
| 1390 | } else { |
| 1391 | if (escaped) { |
| 1392 | ppat[-1] = c; /* overwrite backslash of invalid escape */ |
| 1393 | } else { |
| 1394 | *ppat++ = c; |
| 1395 | } |
| 1396 | escaped = 0; |
| 1397 | } |
| 1398 | } |
| 1399 | /* no wildcard or escape character at end of string */ |
| 1400 | if (!hasWildcard || escaped) { |
| 1401 | GDKfree(*r); |
| 1402 | *r = NULL; |
| 1403 | if (escaped) |
| 1404 | throw(MAL, "pcre.sql2pcre" , OPERATION_FAILED); |
| 1405 | *r = GDKstrdup(str_nil); |
| 1406 | if (*r == NULL) |
| 1407 | throw(MAL, "pcre.sql2pcre" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 1408 | } else { |
| 1409 | *ppat++ = '$'; |
| 1410 | *ppat = 0; |
| 1411 | } |
| 1412 | return MAL_SUCCEED; |
| 1413 | } |
| 1414 | |
| 1415 | #ifdef HAVE_LIBPCRE |
| 1416 | /* change SQL PATINDEX pattern into PCRE pattern */ |
| 1417 | static str |
| 1418 | pat2pcre(str *r, const char *pat) |
| 1419 | { |
| 1420 | size_t len = strlen(pat); |
| 1421 | char *ppat = GDKmalloc(len*2+3 /* 3 = "^'the translated regexp'$0" */); |
| 1422 | int start = 0; |
| 1423 | |
| 1424 | if (ppat == NULL) |
| 1425 | throw(MAL, "pcre.sql2pcre" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 1426 | *r = ppat; |
| 1427 | while (*pat) { |
| 1428 | int c = *pat++; |
| 1429 | |
| 1430 | if (strchr(pcre_specials, c) != NULL) { |
| 1431 | *ppat++ = '\\'; |
| 1432 | *ppat++ = c; |
| 1433 | } else if (c == '%') { |
| 1434 | if (start && *pat) { |
| 1435 | *ppat++ = '.'; |
| 1436 | *ppat++ = '*'; |
| 1437 | } |
| 1438 | start++; |
| 1439 | } else if (c == '_') { |
| 1440 | *ppat++ = '.'; |
| 1441 | } else { |
| 1442 | *ppat++ = c; |
| 1443 | } |
| 1444 | } |
| 1445 | *ppat = 0; |
| 1446 | return MAL_SUCCEED; |
| 1447 | } |
| 1448 | #endif |
| 1449 | |
| 1450 | /* |
| 1451 | * @+ Wrapping |
| 1452 | */ |
| 1453 | #include "mal.h" |
| 1454 | str |
| 1455 | PCREreplace_wrap(str *res, const str *or, const str *pat, const str *repl, const str *flags) |
| 1456 | { |
| 1457 | return pcre_replace(res, *or, *pat, *repl, *flags, true); |
| 1458 | } |
| 1459 | |
| 1460 | str |
| 1461 | PCREreplace_bat_wrap(bat *res, const bat *bid, const str *pat, const str *repl, const str *flags) |
| 1462 | { |
| 1463 | BAT *b, *bn = NULL; |
| 1464 | str msg; |
| 1465 | if ((b = BATdescriptor(*bid)) == NULL) |
| 1466 | throw(MAL, "batpcre.replace" , SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); |
| 1467 | |
| 1468 | msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, true); |
| 1469 | if (msg == MAL_SUCCEED) { |
| 1470 | *res = bn->batCacheid; |
| 1471 | BBPkeepref(*res); |
| 1472 | } |
| 1473 | BBPunfix(b->batCacheid); |
| 1474 | return msg; |
| 1475 | } |
| 1476 | |
| 1477 | str |
| 1478 | PCREreplacefirst_wrap(str *res, const str *or, const str *pat, const str *repl, const str *flags) |
| 1479 | { |
| 1480 | return pcre_replace(res, *or, *pat, *repl, *flags, false); |
| 1481 | } |
| 1482 | |
| 1483 | str |
| 1484 | PCREreplacefirst_bat_wrap(bat *res, const bat *bid, const str *pat, const str *repl, const str *flags) |
| 1485 | { |
| 1486 | BAT *b,*bn = NULL; |
| 1487 | str msg; |
| 1488 | if ((b = BATdescriptor(*bid)) == NULL) |
| 1489 | throw(MAL, "batpcre.replace_first" , RUNTIME_OBJECT_MISSING); |
| 1490 | |
| 1491 | msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, false); |
| 1492 | if (msg == MAL_SUCCEED) { |
| 1493 | *res = bn->batCacheid; |
| 1494 | BBPkeepref(*res); |
| 1495 | } |
| 1496 | BBPunfix(b->batCacheid); |
| 1497 | return msg; |
| 1498 | } |
| 1499 | |
| 1500 | str |
| 1501 | PCREmatch(bit *ret, const str *val, const str *pat) |
| 1502 | { |
| 1503 | return pcre_match_with_flags(ret, *val, *pat, |
| 1504 | #ifdef HAVE_LIBPCRE |
| 1505 | "s" |
| 1506 | #else |
| 1507 | "x" |
| 1508 | #endif |
| 1509 | ); |
| 1510 | } |
| 1511 | |
| 1512 | str |
| 1513 | PCREimatch(bit *ret, const str *val, const str *pat) |
| 1514 | { |
| 1515 | return pcre_match_with_flags(ret, *val, *pat, "i" |
| 1516 | #ifndef HAVE_LIBPCRE |
| 1517 | "x" |
| 1518 | #endif |
| 1519 | ); |
| 1520 | } |
| 1521 | |
| 1522 | str |
| 1523 | PCREindex(int *res, const pcre *pattern, const str *s) |
| 1524 | { |
| 1525 | #ifdef HAVE_LIBPCRE |
| 1526 | int v[3]; |
| 1527 | |
| 1528 | v[0] = v[1] = *res = 0; |
| 1529 | if (pcre_exec(pattern, NULL, *s, (int) strlen(*s), 0, 0, v, 3) >= 0) { |
| 1530 | *res = v[1]; |
| 1531 | } |
| 1532 | return MAL_SUCCEED; |
| 1533 | #else |
| 1534 | (void) res; |
| 1535 | (void) pattern; |
| 1536 | (void) s; |
| 1537 | throw(MAL, "pcre.index" , "Database was compiled without PCRE support." ); |
| 1538 | #endif |
| 1539 | } |
| 1540 | |
| 1541 | |
| 1542 | str |
| 1543 | PCREpatindex(int *ret, const str *pat, const str *val) |
| 1544 | { |
| 1545 | #ifdef HAVE_LIBPCRE |
| 1546 | pcre *re = NULL; |
| 1547 | char *ppat = NULL, *msg; |
| 1548 | |
| 1549 | if ((msg = pat2pcre(&ppat, *pat)) != MAL_SUCCEED) |
| 1550 | return msg; |
| 1551 | if ((msg = pcre_compile_wrap(&re, ppat, FALSE)) != MAL_SUCCEED) { |
| 1552 | GDKfree(ppat); |
| 1553 | return msg; |
| 1554 | } |
| 1555 | GDKfree(ppat); |
| 1556 | msg = PCREindex(ret, re, val); |
| 1557 | pcre_free(re); |
| 1558 | return msg; |
| 1559 | #else |
| 1560 | (void) ret; |
| 1561 | (void) pat; |
| 1562 | (void) val; |
| 1563 | throw(MAL, "pcre.patindex" , "Database was compiled without PCRE support." ); |
| 1564 | #endif |
| 1565 | } |
| 1566 | |
| 1567 | str |
| 1568 | PCREquote(str *ret, const str *val) |
| 1569 | { |
| 1570 | char *p; |
| 1571 | const char *s = *val; |
| 1572 | |
| 1573 | *ret = p = GDKmalloc(strlen(s) * 2 + 1); /* certainly long enough */ |
| 1574 | if (p == NULL) |
| 1575 | throw(MAL, "pcre.quote" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 1576 | /* quote all non-alphanumeric ASCII characters (i.e. leave |
| 1577 | non-ASCII and alphanumeric alone) */ |
| 1578 | while (*s) { |
| 1579 | if (!((*s & 0x80) != 0 || |
| 1580 | ('a' <= *s && *s <= 'z') || |
| 1581 | ('A' <= *s && *s <= 'Z') || |
| 1582 | isdigit((unsigned char) *s))) |
| 1583 | *p++ = '\\'; |
| 1584 | *p++ = *s++; |
| 1585 | } |
| 1586 | *p = 0; |
| 1587 | return MAL_SUCCEED; |
| 1588 | } |
| 1589 | |
| 1590 | |
| 1591 | str |
| 1592 | PCREsql2pcre(str *ret, const str *pat, const str *esc) |
| 1593 | { |
| 1594 | return sql2pcre(ret, *pat, *esc); |
| 1595 | } |
| 1596 | |
| 1597 | static str |
| 1598 | PCRElike4(bit *ret, const str *s, const str *pat, const str *esc, const bit *isens) |
| 1599 | { |
| 1600 | char *ppat = NULL; |
| 1601 | str r = sql2pcre(&ppat, *pat, *esc); |
| 1602 | |
| 1603 | if (!r) { |
| 1604 | assert(ppat); |
| 1605 | if (strcmp(ppat, str_nil) == 0) { |
| 1606 | *ret = FALSE; |
| 1607 | if (*isens) { |
| 1608 | if (mystrcasecmp(*s, *pat) == 0) |
| 1609 | *ret = TRUE; |
| 1610 | } else { |
| 1611 | if (strcmp(*s, *pat) == 0) |
| 1612 | *ret = TRUE; |
| 1613 | } |
| 1614 | } else { |
| 1615 | if (*isens) { |
| 1616 | r = PCREimatch(ret, s, &ppat); |
| 1617 | } else { |
| 1618 | r = PCREmatch(ret, s, &ppat); |
| 1619 | } |
| 1620 | } |
| 1621 | } |
| 1622 | if (ppat) |
| 1623 | GDKfree(ppat); |
| 1624 | return r; |
| 1625 | } |
| 1626 | |
| 1627 | str |
| 1628 | PCRElike3(bit *ret, const str *s, const str *pat, const str *esc) |
| 1629 | { |
| 1630 | bit no = FALSE; |
| 1631 | |
| 1632 | return PCRElike4(ret, s, pat, esc, &no); |
| 1633 | } |
| 1634 | |
| 1635 | str |
| 1636 | PCRElike2(bit *ret, const str *s, const str *pat) |
| 1637 | { |
| 1638 | char *esc = "" ; |
| 1639 | |
| 1640 | return PCRElike3(ret, s, pat, &esc); |
| 1641 | } |
| 1642 | |
| 1643 | str |
| 1644 | PCREnotlike3(bit *ret, const str *s, const str *pat, const str *esc) |
| 1645 | { |
| 1646 | str tmp; |
| 1647 | bit r; |
| 1648 | |
| 1649 | rethrow("str.not_like" , tmp, PCRElike3(&r, s, pat, esc)); |
| 1650 | *ret = !r; |
| 1651 | return MAL_SUCCEED; |
| 1652 | } |
| 1653 | |
| 1654 | str |
| 1655 | PCREnotlike2(bit *ret, const str *s, const str *pat) |
| 1656 | { |
| 1657 | str tmp; |
| 1658 | bit r; |
| 1659 | |
| 1660 | rethrow("str.not_like" , tmp, PCRElike2(&r, s, pat)); |
| 1661 | *ret = !r; |
| 1662 | return MAL_SUCCEED; |
| 1663 | } |
| 1664 | |
| 1665 | str |
| 1666 | PCREilike3(bit *ret, const str *s, const str *pat, const str *esc) |
| 1667 | { |
| 1668 | bit yes = TRUE; |
| 1669 | |
| 1670 | return PCRElike4(ret, s, pat, esc, &yes); |
| 1671 | } |
| 1672 | |
| 1673 | str |
| 1674 | PCREilike2(bit *ret, const str *s, const str *pat) |
| 1675 | { |
| 1676 | char *esc = "\\" ; |
| 1677 | |
| 1678 | return PCREilike3(ret, s, pat, &esc); |
| 1679 | } |
| 1680 | |
| 1681 | str |
| 1682 | PCREnotilike3(bit *ret, const str *s, const str *pat, const str *esc) |
| 1683 | { |
| 1684 | str tmp; |
| 1685 | bit r; |
| 1686 | |
| 1687 | rethrow("str.not_ilike" , tmp, PCREilike3(&r, s, pat, esc)); |
| 1688 | *ret = !r; |
| 1689 | return MAL_SUCCEED; |
| 1690 | } |
| 1691 | |
| 1692 | str |
| 1693 | PCREnotilike2(bit *ret, const str *s, const str *pat) |
| 1694 | { |
| 1695 | str tmp; |
| 1696 | bit r; |
| 1697 | |
| 1698 | rethrow("str.not_ilike" , tmp, PCREilike2(&r, s, pat)); |
| 1699 | *ret = !r; |
| 1700 | return MAL_SUCCEED; |
| 1701 | } |
| 1702 | |
| 1703 | static str |
| 1704 | BATPCRElike3(bat *ret, const bat *bid, const str *pat, const str *esc, const bit *isens, const bit *not) |
| 1705 | { |
| 1706 | char *ppat = NULL; |
| 1707 | str res = sql2pcre(&ppat, *pat, *esc); |
| 1708 | |
| 1709 | if (res == MAL_SUCCEED) { |
| 1710 | BAT *strs = BATdescriptor(*bid); |
| 1711 | BATiter strsi; |
| 1712 | BAT *r; |
| 1713 | bit *br; |
| 1714 | BUN p, q, i = 0; |
| 1715 | |
| 1716 | if (strs == NULL) { |
| 1717 | GDKfree(ppat); |
| 1718 | throw(MAL, "batstr.like" , OPERATION_FAILED); |
| 1719 | } |
| 1720 | |
| 1721 | r = COLnew(strs->hseqbase, TYPE_bit, BATcount(strs), TRANSIENT); |
| 1722 | if (r==NULL) { |
| 1723 | GDKfree(ppat); |
| 1724 | BBPunfix(strs->batCacheid); |
| 1725 | throw(MAL, "pcre.like3" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 1726 | } |
| 1727 | br = (bit*)Tloc(r, 0); |
| 1728 | strsi = bat_iterator(strs); |
| 1729 | |
| 1730 | if (strcmp(ppat, str_nil) == 0) { |
| 1731 | BATloop(strs, p, q) { |
| 1732 | const char *s = (str)BUNtvar(strsi, p); |
| 1733 | |
| 1734 | if (strcmp(s, *pat) == 0) |
| 1735 | br[i] = TRUE; |
| 1736 | else |
| 1737 | br[i] = FALSE; |
| 1738 | if (*not) |
| 1739 | br[i] = !br[i]; |
| 1740 | i++; |
| 1741 | } |
| 1742 | } else { |
| 1743 | int pos; |
| 1744 | #ifdef HAVE_LIBPCRE |
| 1745 | const char *err_p = NULL; |
| 1746 | int errpos = 0; |
| 1747 | int options = PCRE_UTF8 | PCRE_DOTALL; |
| 1748 | pcre *re; |
| 1749 | #else |
| 1750 | regex_t re; |
| 1751 | int options = REG_NEWLINE | REG_NOSUB | REG_EXTENDED; |
| 1752 | int errcode; |
| 1753 | #endif |
| 1754 | |
| 1755 | if (*isens) { |
| 1756 | #ifdef HAVE_LIBPCRE |
| 1757 | options |= PCRE_CASELESS; |
| 1758 | #else |
| 1759 | options |= REG_ICASE; |
| 1760 | #endif |
| 1761 | } |
| 1762 | if ( |
| 1763 | #ifdef HAVE_LIBPCRE |
| 1764 | (re = pcre_compile(ppat, options, &err_p, &errpos, NULL)) == NULL |
| 1765 | #else |
| 1766 | (errcode = regcomp(&re, ppat, options)) != 0 |
| 1767 | #endif |
| 1768 | ) { |
| 1769 | BBPunfix(strs->batCacheid); |
| 1770 | BBPunfix(r->batCacheid); |
| 1771 | res = createException(MAL, "pcre.match" , OPERATION_FAILED |
| 1772 | ": compilation of regular expression (%s) failed" |
| 1773 | #ifdef HAVE_LIBPCRE |
| 1774 | " at %d with '%s'" , ppat, errpos, err_p |
| 1775 | #else |
| 1776 | , ppat |
| 1777 | #endif |
| 1778 | ); |
| 1779 | GDKfree(ppat); |
| 1780 | return res; |
| 1781 | } |
| 1782 | |
| 1783 | BATloop(strs, p, q) { |
| 1784 | const char *s = (str)BUNtvar(strsi, p); |
| 1785 | |
| 1786 | if (*s == '\200') { |
| 1787 | br[i] = bit_nil; |
| 1788 | r->tnonil = false; |
| 1789 | r->tnil = true; |
| 1790 | } else { |
| 1791 | #ifdef HAVE_LIBPCRE |
| 1792 | pos = pcre_exec(re, NULL, s, (int) strlen(s), 0, 0, NULL, 0); |
| 1793 | #else |
| 1794 | int retval = regexec(&re, s, (size_t) 0, NULL, 0); |
| 1795 | pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0); |
| 1796 | #endif |
| 1797 | if (pos >= 0) |
| 1798 | br[i] = *not? FALSE:TRUE; |
| 1799 | else if (pos == -1) |
| 1800 | br[i] = *not? TRUE: FALSE; |
| 1801 | else { |
| 1802 | BBPunfix(strs->batCacheid); |
| 1803 | BBPunfix(r->batCacheid); |
| 1804 | res = createException(MAL, "pcre.match" , OPERATION_FAILED |
| 1805 | ": matching of regular expression (%s) failed with %d" , ppat, pos); |
| 1806 | GDKfree(ppat); |
| 1807 | return res; |
| 1808 | } |
| 1809 | } |
| 1810 | i++; |
| 1811 | } |
| 1812 | #ifdef HAVE_LIBPCRE |
| 1813 | pcre_free(re); |
| 1814 | #else |
| 1815 | regfree(&re); |
| 1816 | #endif |
| 1817 | } |
| 1818 | BATsetcount(r, i); |
| 1819 | r->tsorted = false; |
| 1820 | r->trevsorted = false; |
| 1821 | BATkey(r, false); |
| 1822 | |
| 1823 | BBPkeepref(*ret = r->batCacheid); |
| 1824 | BBPunfix(strs->batCacheid); |
| 1825 | GDKfree(ppat); |
| 1826 | } |
| 1827 | return res; |
| 1828 | } |
| 1829 | |
| 1830 | str |
| 1831 | BATPCRElike(bat *ret, const bat *bid, const str *pat, const str *esc) |
| 1832 | { |
| 1833 | bit no = FALSE; |
| 1834 | |
| 1835 | return BATPCRElike3(ret, bid, pat, esc, &no, &no); |
| 1836 | } |
| 1837 | |
| 1838 | str |
| 1839 | BATPCRElike2(bat *ret, const bat *bid, const str *pat) |
| 1840 | { |
| 1841 | char *esc = "\\" ; |
| 1842 | |
| 1843 | return BATPCRElike(ret, bid, pat, &esc); |
| 1844 | } |
| 1845 | |
| 1846 | str |
| 1847 | BATPCREnotlike(bat *ret, const bat *bid, const str *pat, const str *esc) |
| 1848 | { |
| 1849 | bit no = FALSE; |
| 1850 | bit yes = TRUE; |
| 1851 | |
| 1852 | return BATPCRElike3(ret, bid, pat, esc, &no, &yes); |
| 1853 | } |
| 1854 | |
| 1855 | str |
| 1856 | BATPCREnotlike2(bat *ret, const bat *bid, const str *pat) |
| 1857 | { |
| 1858 | char *esc = "\\" ; |
| 1859 | |
| 1860 | return BATPCREnotlike(ret, bid, pat, &esc); |
| 1861 | } |
| 1862 | |
| 1863 | str |
| 1864 | BATPCREilike(bat *ret, const bat *bid, const str *pat, const str *esc) |
| 1865 | { |
| 1866 | bit yes = TRUE; |
| 1867 | bit no = FALSE; |
| 1868 | |
| 1869 | return BATPCRElike3(ret, bid, pat, esc, &yes, &no); |
| 1870 | } |
| 1871 | |
| 1872 | str |
| 1873 | BATPCREilike2(bat *ret, const bat *bid, const str *pat) |
| 1874 | { |
| 1875 | char *esc = "\\" ; |
| 1876 | |
| 1877 | return BATPCREilike(ret, bid, pat, &esc); |
| 1878 | } |
| 1879 | |
| 1880 | str |
| 1881 | BATPCREnotilike(bat *ret, const bat *bid, const str *pat, const str *esc) |
| 1882 | { |
| 1883 | bit yes = TRUE; |
| 1884 | |
| 1885 | return BATPCRElike3(ret, bid, pat, esc, &yes, &yes); |
| 1886 | } |
| 1887 | |
| 1888 | str |
| 1889 | BATPCREnotilike2(bat *ret, const bat *bid, const str *pat) |
| 1890 | { |
| 1891 | char *esc = "\\" ; |
| 1892 | |
| 1893 | return BATPCREnotilike(ret, bid, pat, &esc); |
| 1894 | } |
| 1895 | |
| 1896 | str |
| 1897 | PCRElikeselect2(bat *ret, const bat *bid, const bat *sid, const str *pat, const str *esc, const bit *caseignore, const bit *anti) |
| 1898 | { |
| 1899 | BAT *b, *s = NULL, *bn = NULL; |
| 1900 | str res; |
| 1901 | char *ppat = NULL; |
| 1902 | bool use_re = false; |
| 1903 | bool use_strcmp = false; |
| 1904 | |
| 1905 | if ((b = BATdescriptor(*bid)) == NULL) { |
| 1906 | throw(MAL, "algebra.likeselect" , SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); |
| 1907 | } |
| 1908 | if (sid && !is_bat_nil(*sid) && *sid && (s = BATdescriptor(*sid)) == NULL) { |
| 1909 | BBPunfix(b->batCacheid); |
| 1910 | throw(MAL, "algebra.likeselect" , SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); |
| 1911 | } |
| 1912 | |
| 1913 | /* no escape, try if a simple list of keywords works */ |
| 1914 | if (is_strcmpable(*pat, *esc)) { |
| 1915 | use_re = true; |
| 1916 | use_strcmp = true; |
| 1917 | } else if (re_simple(*pat, **esc == '\200' ? 0 : (unsigned char) **esc)) { |
| 1918 | use_re = true; |
| 1919 | } else { |
| 1920 | res = sql2pcre(&ppat, *pat, *esc); |
| 1921 | if (res != MAL_SUCCEED) { |
| 1922 | BBPunfix(b->batCacheid); |
| 1923 | if (s) |
| 1924 | BBPunfix(s->batCacheid); |
| 1925 | return res; |
| 1926 | } |
| 1927 | if (strcmp(ppat, str_nil) == 0) { |
| 1928 | GDKfree(ppat); |
| 1929 | ppat = NULL; |
| 1930 | if (*caseignore) { |
| 1931 | ppat = GDKmalloc(strlen(*pat) + 3); |
| 1932 | if (ppat == NULL) { |
| 1933 | BBPunfix(b->batCacheid); |
| 1934 | if (s) |
| 1935 | BBPunfix(s->batCacheid); |
| 1936 | throw(MAL, "algebra.likeselect" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 1937 | } |
| 1938 | ppat[0] = '^'; |
| 1939 | strcpy(ppat + 1, *pat); |
| 1940 | strcat(ppat, "$" ); |
| 1941 | } |
| 1942 | } |
| 1943 | } |
| 1944 | |
| 1945 | if (use_re) { |
| 1946 | res = re_likeselect(&bn, b, s, *pat, (bool) *caseignore, (bool) *anti, use_strcmp, **esc == '\200' ? 0 : (unsigned char) **esc); |
| 1947 | } else if (ppat == NULL) { |
| 1948 | /* no pattern and no special characters: can use normal select */ |
| 1949 | bn = BATselect(b, s, *pat, NULL, true, true, *anti); |
| 1950 | if (bn == NULL) |
| 1951 | res = createException(MAL, "algebra.likeselect" , GDK_EXCEPTION); |
| 1952 | else |
| 1953 | res = MAL_SUCCEED; |
| 1954 | } else { |
| 1955 | res = pcre_likeselect(&bn, b, s, ppat, (bool) *caseignore, (bool) *anti); |
| 1956 | } |
| 1957 | BBPunfix(b->batCacheid); |
| 1958 | if (s) |
| 1959 | BBPunfix(s->batCacheid); |
| 1960 | GDKfree(ppat); |
| 1961 | if (res != MAL_SUCCEED) |
| 1962 | return res; |
| 1963 | assert(bn); |
| 1964 | *ret = bn->batCacheid; |
| 1965 | BBPkeepref(bn->batCacheid); |
| 1966 | return MAL_SUCCEED; |
| 1967 | } |
| 1968 | |
| 1969 | str |
| 1970 | PCRElikeselect1(bat *ret, const bat *bid, const bat *cid, const str *pat, const str *esc, const bit *anti) |
| 1971 | { |
| 1972 | const bit f = TRUE; |
| 1973 | return PCRElikeselect2(ret, bid, cid, pat, esc, &f, anti); |
| 1974 | } |
| 1975 | |
| 1976 | str |
| 1977 | PCRElikeselect3(bat *ret, const bat *bid, const bat *sid, const str *pat, const str *esc, const bit *anti) |
| 1978 | { |
| 1979 | const bit f = FALSE; |
| 1980 | return PCRElikeselect2(ret, bid, sid, pat, esc, &f, anti); |
| 1981 | } |
| 1982 | |
| 1983 | str |
| 1984 | PCRElikeselect4(bat *ret, const bat *bid, const bat *cid, const str *pat, const bit *anti) |
| 1985 | { |
| 1986 | const bit f = TRUE; |
| 1987 | const str esc ="" ; |
| 1988 | return PCRElikeselect2(ret, bid, cid, pat, &esc, &f, anti); |
| 1989 | } |
| 1990 | |
| 1991 | str |
| 1992 | PCRElikeselect5(bat *ret, const bat *bid, const bat *sid, const str *pat, const bit *anti) |
| 1993 | { |
| 1994 | const bit f = FALSE; |
| 1995 | const str esc ="" ; |
| 1996 | return PCRElikeselect2(ret, bid, sid, pat, &esc, &f, anti); |
| 1997 | } |
| 1998 | |
/*
 * APPEND(b, o): store o in the next free slot of b's tail heap, viewed as
 * an array of oid, and bump b->batCount.  No capacity check is done here;
 * the caller must make sure there is room.  Note that b is evaluated
 * twice, so it must not have side effects.
 */
#define APPEND(b, o)	(((oid *) (b)->theap.base)[(b)->batCount++] = (o))
/*
 * VALUE(s, x): address of the x-th string of the side named by prefix s
 * (e.g. VALUE(l, i) uses the local variables lvars, lvals and lwidth).
 * The prefix is token-pasted and therefore cannot be parenthesized.
 */
#define VALUE(s, x)	(s##vars + VarHeapVal(s##vals, (x), s##width))
| 2001 | |
| 2002 | static char * |
| 2003 | pcrejoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, |
| 2004 | const char *esc, bool caseignore) |
| 2005 | { |
| 2006 | struct canditer lci, rci; |
| 2007 | const char *lvals, *rvals; |
| 2008 | const char *lvars, *rvars; |
| 2009 | int lwidth, rwidth; |
| 2010 | const char *vl, *vr; |
| 2011 | oid lastl = 0; /* last value inserted into r1 */ |
| 2012 | BUN nl; |
| 2013 | BUN newcap; |
| 2014 | oid lo, ro; |
| 2015 | int rskipped = 0; /* whether we skipped values in r */ |
| 2016 | char *msg = MAL_SUCCEED; |
| 2017 | RE *re = NULL; |
| 2018 | char *pcrepat = NULL; |
| 2019 | #ifdef HAVE_LIBPCRE |
| 2020 | pcre *pcrere = NULL; |
| 2021 | pcre_extra *pcreex = NULL; |
| 2022 | const char *err_p = NULL; |
| 2023 | int errpos; |
| 2024 | int pcreopt = PCRE_UTF8 | PCRE_MULTILINE; |
| 2025 | int pcrestopt = (sl ? BATcount(sl) : BATcount(l)) > JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0; |
| 2026 | #else |
| 2027 | int pcrere = 0; |
| 2028 | regex_t regex; |
| 2029 | int options = REG_NEWLINE | REG_NOSUB | REG_EXTENDED; |
| 2030 | int errcode = -1; |
| 2031 | #endif |
| 2032 | |
| 2033 | |
| 2034 | if (caseignore) |
| 2035 | #ifdef HAVE_LIBPCRE |
| 2036 | pcreopt |= PCRE_CASELESS; |
| 2037 | #else |
| 2038 | options |= REG_ICASE; |
| 2039 | #endif |
| 2040 | |
| 2041 | ALGODEBUG fprintf(stderr, "#pcrejoin(l=%s#" BUNFMT "[%s]%s%s," |
| 2042 | "r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s," |
| 2043 | "sr=%s#" BUNFMT "%s%s)\n" , |
| 2044 | BATgetId(l), BATcount(l), ATOMname(l->ttype), |
| 2045 | l->tsorted ? "-sorted" : "" , |
| 2046 | l->trevsorted ? "-revsorted" : "" , |
| 2047 | BATgetId(r), BATcount(r), ATOMname(r->ttype), |
| 2048 | r->tsorted ? "-sorted" : "" , |
| 2049 | r->trevsorted ? "-revsorted" : "" , |
| 2050 | sl ? BATgetId(sl) : "NULL" , sl ? BATcount(sl) : 0, |
| 2051 | sl && sl->tsorted ? "-sorted" : "" , |
| 2052 | sl && sl->trevsorted ? "-revsorted" : "" , |
| 2053 | sr ? BATgetId(sr) : "NULL" , sr ? BATcount(sr) : 0, |
| 2054 | sr && sr->tsorted ? "-sorted" : "" , |
| 2055 | sr && sr->trevsorted ? "-revsorted" : "" ); |
| 2056 | |
| 2057 | assert(ATOMtype(l->ttype) == ATOMtype(r->ttype)); |
| 2058 | assert(ATOMtype(l->ttype) == TYPE_str); |
| 2059 | assert(sl == NULL || sl->tsorted); |
| 2060 | assert(sr == NULL || sr->tsorted); |
| 2061 | |
| 2062 | canditer_init(&lci, l, sl); |
| 2063 | canditer_init(&rci, r, sr); |
| 2064 | |
| 2065 | lvals = (const char *) Tloc(l, 0); |
| 2066 | rvals = (const char *) Tloc(r, 0); |
| 2067 | assert(r->tvarsized && r->ttype); |
| 2068 | lvars = l->tvheap->base; |
| 2069 | rvars = r->tvheap->base; |
| 2070 | lwidth = l->twidth; |
| 2071 | rwidth = r->twidth; |
| 2072 | |
| 2073 | r1->tkey = true; |
| 2074 | r1->tsorted = true; |
| 2075 | r1->trevsorted = true; |
| 2076 | r2->tkey = true; |
| 2077 | r2->tsorted = true; |
| 2078 | r2->trevsorted = true; |
| 2079 | |
| 2080 | /* nested loop implementation for PCRE join */ |
| 2081 | for (BUN ri = 0; ri < rci.ncand; ri++) { |
| 2082 | ro = canditer_next(&rci); |
| 2083 | vr = VALUE(r, ro - r->hseqbase); |
| 2084 | if (strcmp(vr, str_nil) == 0) |
| 2085 | continue; |
| 2086 | if (re_simple(vr, esc && *esc != '\200' ? (unsigned char) *esc : 0)) { |
| 2087 | re = re_create(vr, caseignore, esc && *esc != '\200' ? (unsigned char) *esc : 0); |
| 2088 | if (re == NULL) { |
| 2089 | msg = createException(MAL, "pcre.join" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 2090 | goto bailout; |
| 2091 | } |
| 2092 | } else { |
| 2093 | assert(pcrepat == NULL); |
| 2094 | msg = sql2pcre(&pcrepat, vr, esc); |
| 2095 | if (msg != MAL_SUCCEED) |
| 2096 | goto bailout; |
| 2097 | if (strcmp(pcrepat, str_nil) == 0) { |
| 2098 | GDKfree(pcrepat); |
| 2099 | if (caseignore) { |
| 2100 | pcrepat = GDKmalloc(strlen(vr) + 3); |
| 2101 | if (pcrepat == NULL) { |
| 2102 | msg = createException(MAL, "pcre.join" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 2103 | goto bailout; |
| 2104 | } |
| 2105 | sprintf(pcrepat, "^%s$" , vr); |
| 2106 | } else { |
| 2107 | /* a simple strcmp suffices */ |
| 2108 | pcrepat = NULL; |
| 2109 | } |
| 2110 | } |
| 2111 | if (pcrepat) { |
| 2112 | #ifdef HAVE_LIBPCRE |
| 2113 | pcrere = pcre_compile(pcrepat, pcreopt, &err_p, &errpos, NULL); |
| 2114 | if (pcrere == NULL) { |
| 2115 | msg = createException(MAL, "pcre.join" , OPERATION_FAILED |
| 2116 | ": pcre compile of pattern (%s) " |
| 2117 | "failed at %d with '%s'" , |
| 2118 | pcrepat, errpos, err_p); |
| 2119 | goto bailout; |
| 2120 | } |
| 2121 | pcreex = pcre_study(pcrere, pcrestopt, &err_p); |
| 2122 | if (err_p != NULL) { |
| 2123 | msg = createException(MAL, "pcre.join" , OPERATION_FAILED |
| 2124 | ": pcre study of pattern (%s) " |
| 2125 | "failed with '%s'" , pcrepat, err_p); |
| 2126 | goto bailout; |
| 2127 | } |
| 2128 | #else |
| 2129 | if ((errcode = regcomp(®ex, pcrepat, options)) != 0) { |
| 2130 | msg = createException(MAL, "pcre.join" , OPERATION_FAILED |
| 2131 | ": pcre compile of pattern (%s)" , |
| 2132 | pcrepat); |
| 2133 | goto bailout; |
| 2134 | } |
| 2135 | pcrere = 1; |
| 2136 | #endif |
| 2137 | GDKfree(pcrepat); |
| 2138 | pcrepat = NULL; |
| 2139 | } |
| 2140 | } |
| 2141 | nl = 0; |
| 2142 | canditer_reset(&lci); |
| 2143 | for (BUN li = 0; li < lci.ncand; li++) { |
| 2144 | lo = canditer_next(&lci); |
| 2145 | vl = VALUE(l, lo - l->hseqbase); |
| 2146 | if (strcmp(vl, str_nil) == 0) |
| 2147 | continue; |
| 2148 | if (re) { |
| 2149 | if (caseignore) { |
| 2150 | if (!re_match_ignore(vl, re)) |
| 2151 | continue; |
| 2152 | } else { |
| 2153 | if (!re_match_no_ignore(vl, re)) |
| 2154 | continue; |
| 2155 | } |
| 2156 | } else if (pcrere) { |
| 2157 | #ifdef HAVE_LIBPCRE |
| 2158 | if (pcre_exec(pcrere, pcreex, vl, (int) strlen(vl), 0, 0, NULL, 0) < 0) |
| 2159 | continue; |
| 2160 | #else |
| 2161 | int retval = regexec(®ex, vl, (size_t) 0, NULL, 0); |
| 2162 | if (retval == REG_NOMATCH || retval == REG_ENOSYS) |
| 2163 | continue; |
| 2164 | #endif |
| 2165 | } else { |
| 2166 | if (strcmp(vl, vr) != 0) |
| 2167 | continue; |
| 2168 | } |
| 2169 | if (BUNlast(r1) == BATcapacity(r1)) { |
| 2170 | newcap = BATgrows(r1); |
| 2171 | BATsetcount(r1, BATcount(r1)); |
| 2172 | BATsetcount(r2, BATcount(r2)); |
| 2173 | if (BATextend(r1, newcap) != GDK_SUCCEED || |
| 2174 | BATextend(r2, newcap) != GDK_SUCCEED) { |
| 2175 | msg = createException(MAL, "pcre.join" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 2176 | goto bailout; |
| 2177 | } |
| 2178 | assert(BATcapacity(r1) == BATcapacity(r2)); |
| 2179 | } |
| 2180 | if (BATcount(r1) > 0) { |
| 2181 | if (lastl + 1 != lo) |
| 2182 | r1->tseqbase = oid_nil; |
| 2183 | if (nl == 0) { |
| 2184 | r2->trevsorted = false; |
| 2185 | if (lastl > lo) { |
| 2186 | r1->tsorted = false; |
| 2187 | r1->tkey = false; |
| 2188 | } else if (lastl < lo) { |
| 2189 | r1->trevsorted = false; |
| 2190 | } else { |
| 2191 | r1->tkey = false; |
| 2192 | } |
| 2193 | } |
| 2194 | } |
| 2195 | APPEND(r1, lo); |
| 2196 | APPEND(r2, ro); |
| 2197 | lastl = lo; |
| 2198 | nl++; |
| 2199 | } |
| 2200 | if (re) { |
| 2201 | re_destroy(re); |
| 2202 | re = NULL; |
| 2203 | } |
| 2204 | if (pcrere) { |
| 2205 | #ifdef HAVE_LIBPCRE |
| 2206 | pcre_free_study(pcreex); |
| 2207 | pcre_free(pcrere); |
| 2208 | pcrere = NULL; |
| 2209 | pcreex = NULL; |
| 2210 | #else |
| 2211 | regfree(®ex); |
| 2212 | pcrere = 0; |
| 2213 | #endif |
| 2214 | } |
| 2215 | if (nl > 1) { |
| 2216 | r2->tkey = false; |
| 2217 | r2->tseqbase = oid_nil; |
| 2218 | r1->trevsorted = false; |
| 2219 | } else if (nl == 0) { |
| 2220 | rskipped = BATcount(r2) > 0; |
| 2221 | } else if (rskipped) { |
| 2222 | r2->tseqbase = oid_nil; |
| 2223 | } |
| 2224 | } |
| 2225 | assert(BATcount(r1) == BATcount(r2)); |
| 2226 | /* also set other bits of heap to correct value to indicate size */ |
| 2227 | BATsetcount(r1, BATcount(r1)); |
| 2228 | BATsetcount(r2, BATcount(r2)); |
| 2229 | if (BATcount(r1) > 0) { |
| 2230 | if (BATtdense(r1)) |
| 2231 | r1->tseqbase = ((oid *) r1->theap.base)[0]; |
| 2232 | if (BATtdense(r2)) |
| 2233 | r2->tseqbase = ((oid *) r2->theap.base)[0]; |
| 2234 | } else { |
| 2235 | r1->tseqbase = r2->tseqbase = 0; |
| 2236 | } |
| 2237 | ALGODEBUG fprintf(stderr, "#pcrejoin(l=%s,r=%s)=(%s#" BUNFMT"%s%s,%s#" BUNFMT"%s%s\n" , |
| 2238 | BATgetId(l), BATgetId(r), |
| 2239 | BATgetId(r1), BATcount(r1), |
| 2240 | r1->tsorted ? "-sorted" : "" , |
| 2241 | r1->trevsorted ? "-revsorted" : "" , |
| 2242 | BATgetId(r2), BATcount(r2), |
| 2243 | r2->tsorted ? "-sorted" : "" , |
| 2244 | r2->trevsorted ? "-revsorted" : "" ); |
| 2245 | return MAL_SUCCEED; |
| 2246 | |
| 2247 | bailout: |
| 2248 | if (re) |
| 2249 | re_destroy(re); |
| 2250 | if (pcrepat) |
| 2251 | GDKfree(pcrepat); |
| 2252 | #ifdef HAVE_LIBPCRE |
| 2253 | if (pcreex) |
| 2254 | pcre_free_study(pcreex); |
| 2255 | if (pcrere) |
| 2256 | pcre_free(pcrere); |
| 2257 | #else |
| 2258 | if (pcrere) |
| 2259 | regfree(®ex); |
| 2260 | #endif |
| 2261 | |
| 2262 | assert(msg != MAL_SUCCEED); |
| 2263 | return msg; |
| 2264 | } |
| 2265 | |
| 2266 | static str |
| 2267 | PCREjoin(bat *r1, bat *r2, bat lid, bat rid, bat slid, bat srid, |
| 2268 | const char *esc, bool caseignore) |
| 2269 | { |
| 2270 | BAT *left = NULL, *right = NULL, *candleft = NULL, *candright = NULL; |
| 2271 | BAT *result1 = NULL, *result2 = NULL; |
| 2272 | char *msg = MAL_SUCCEED; |
| 2273 | |
| 2274 | if ((left = BATdescriptor(lid)) == NULL) |
| 2275 | goto fail; |
| 2276 | if ((right = BATdescriptor(rid)) == NULL) |
| 2277 | goto fail; |
| 2278 | if (!is_bat_nil(slid) && (candleft = BATdescriptor(slid)) == NULL) |
| 2279 | goto fail; |
| 2280 | if (!is_bat_nil(srid) && (candright = BATdescriptor(srid)) == NULL) |
| 2281 | goto fail; |
| 2282 | result1 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT); |
| 2283 | result2 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT); |
| 2284 | if (result1 == NULL || result2 == NULL) { |
| 2285 | msg = createException(MAL, "pcre.join" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 2286 | goto fail; |
| 2287 | } |
| 2288 | result1->tnil = false; |
| 2289 | result1->tnonil = true; |
| 2290 | result1->tkey = true; |
| 2291 | result1->tsorted = true; |
| 2292 | result1->trevsorted = true; |
| 2293 | result1->tseqbase = 0; |
| 2294 | result2->tnil = false; |
| 2295 | result2->tnonil = true; |
| 2296 | result2->tkey = true; |
| 2297 | result2->tsorted = true; |
| 2298 | result2->trevsorted = true; |
| 2299 | result2->tseqbase = 0; |
| 2300 | msg = pcrejoin(result1, result2, left, right, candleft, candright, |
| 2301 | esc, caseignore); |
| 2302 | if (msg) |
| 2303 | goto fail; |
| 2304 | *r1 = result1->batCacheid; |
| 2305 | *r2 = result2->batCacheid; |
| 2306 | BBPkeepref(*r1); |
| 2307 | BBPkeepref(*r2); |
| 2308 | BBPunfix(left->batCacheid); |
| 2309 | BBPunfix(right->batCacheid); |
| 2310 | if (candleft) |
| 2311 | BBPunfix(candleft->batCacheid); |
| 2312 | if (candright) |
| 2313 | BBPunfix(candright->batCacheid); |
| 2314 | return MAL_SUCCEED; |
| 2315 | |
| 2316 | fail: |
| 2317 | if (left) |
| 2318 | BBPunfix(left->batCacheid); |
| 2319 | if (right) |
| 2320 | BBPunfix(right->batCacheid); |
| 2321 | if (candleft) |
| 2322 | BBPunfix(candleft->batCacheid); |
| 2323 | if (candright) |
| 2324 | BBPunfix(candright->batCacheid); |
| 2325 | if (result1) |
| 2326 | BBPunfix(result1->batCacheid); |
| 2327 | if (result2) |
| 2328 | BBPunfix(result2->batCacheid); |
| 2329 | if (msg) |
| 2330 | return msg; |
| 2331 | throw(MAL, "pcre.join" , SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); |
| 2332 | } |
| 2333 | |
| 2334 | str |
| 2335 | LIKEjoin(bat *r1, bat *r2, const bat *lid, const bat *rid, const str *esc, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate) |
| 2336 | { |
| 2337 | (void) nil_matches; |
| 2338 | (void) estimate; |
| 2339 | return PCREjoin(r1, r2, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0, *esc, 0); |
| 2340 | } |
| 2341 | |
| 2342 | str |
| 2343 | LIKEjoin1(bat *r1, bat *r2, const bat *lid, const bat *rid, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate) |
| 2344 | { |
| 2345 | (void) nil_matches; |
| 2346 | (void) estimate; |
| 2347 | return PCREjoin(r1, r2, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0, "" , 0); |
| 2348 | } |
| 2349 | |
| 2350 | str |
| 2351 | ILIKEjoin(bat *r1, bat *r2, const bat *lid, const bat *rid, const str *esc, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate) |
| 2352 | { |
| 2353 | (void) nil_matches; |
| 2354 | (void) estimate; |
| 2355 | return PCREjoin(r1, r2, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0, *esc, 1); |
| 2356 | } |
| 2357 | |
| 2358 | str |
| 2359 | ILIKEjoin1(bat *r1, bat *r2, const bat *lid, const bat *rid, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate) |
| 2360 | { |
| 2361 | (void) nil_matches; |
| 2362 | (void) estimate; |
| 2363 | return PCREjoin(r1, r2, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0, "" , 1); |
| 2364 | } |
| 2365 | |