1/*
2 * This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
5 *
6 * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V.
7 */
8
9/*
10 * N. Nes
11 * PCRE library interface
12 * The PCRE library is a set of functions that implement regular
13 * expression pattern matching using the same syntax and semantics as Perl,
14 * with just a few differences. The current implementation of PCRE
15 * (release 4.x) corresponds approximately with Perl 5.8, including support
16 * for UTF-8 encoded strings. However, this support has to be
17 * explicitly enabled; it is not the default.
18 *
19 * ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre
20 */
21#include "monetdb_config.h"
22#include <string.h>
23
24#include "mal.h"
25#include "mal_exception.h"
26
27#include <wchar.h>
28#include <wctype.h>
29
30#ifdef HAVE_LIBPCRE
31#include <pcre.h>
32#ifndef PCRE_STUDY_JIT_COMPILE
33/* old library version on e.g. EPEL 6 */
34#define pcre_free_study(x) pcre_free(x)
35#define PCRE_STUDY_JIT_COMPILE 0
36#endif
37#define JIT_COMPILE_MIN 1024 /* when to try JIT compilation of patterns */
38
39#else
40
41#include <regex.h>
42
43typedef regex_t pcre;
44#endif
45
/* Exported entry points of this module.  Only the prototypes are
 * given here.
 * NOTE(review): the groupings below are inferred from the names and
 * parameter types only -- confirm against the definitions. */

/* module initialisation */
mal_export str pcre_init(void *ret);

/* quoting, matching, indexing and replacement on single strings, plus
 * variants that take a BAT id (bat *) for the input and the result */
mal_export str PCREquote(str *r, const str *v);
mal_export str PCREmatch(bit *ret, const str *val, const str *pat);
mal_export str PCREimatch(bit *ret, const str *val, const str *pat);
mal_export str PCREindex(int *ret, const pcre *pat, const str *val);
mal_export str PCREpatindex(int *ret, const str *pat, const str *val);
mal_export str PCREreplace_wrap(str *res, const str *or, const str *pat, const str *repl, const str *flags);
mal_export str PCREreplace_bat_wrap(bat *res, const bat *or, const str *pat, const str *repl, const str *flags);
mal_export str PCREreplacefirst_wrap(str *res, const str *or, const str *pat, const str *repl, const str *flags);
mal_export str PCREreplacefirst_bat_wrap(bat *res, const bat *or, const str *pat, const str *repl, const str *flags);
mal_export str PCREsql2pcre(str *ret, const str *pat, const str *esc);

/* (NOT) LIKE / (NOT) ILIKE; the "3" variants take an escape string,
 * the "2" variants do not, and the BAT* variants take a BAT id */
mal_export str PCRElike3(bit *ret, const str *s, const str *pat, const str *esc);
mal_export str PCRElike2(bit *ret, const str *s, const str *pat);
mal_export str PCREnotlike3(bit *ret, const str *s, const str *pat, const str *esc);
mal_export str PCREnotlike2(bit *ret, const str *s, const str *pat);
mal_export str BATPCRElike(bat *ret, const bat *b, const str *pat, const str *esc);
mal_export str BATPCRElike2(bat *ret, const bat *b, const str *pat);
mal_export str BATPCREnotlike(bat *ret, const bat *b, const str *pat, const str *esc);
mal_export str BATPCREnotlike2(bat *ret, const bat *b, const str *pat);
mal_export str PCREilike3(bit *ret, const str *s, const str *pat, const str *esc);
mal_export str PCREilike2(bit *ret, const str *s, const str *pat);
mal_export str PCREnotilike3(bit *ret, const str *s, const str *pat, const str *esc);
mal_export str PCREnotilike2(bit *ret, const str *s, const str *pat);
mal_export str BATPCREilike(bat *ret, const bat *b, const str *pat, const str *esc);
mal_export str BATPCREilike2(bat *ret, const bat *b, const str *pat);
mal_export str BATPCREnotilike(bat *ret, const bat *b, const str *pat, const str *esc);
mal_export str BATPCREnotilike2(bat *ret, const bat *b, const str *pat);

/* LIKE selections with an optional candidate list and an anti flag */
mal_export str PCRElikeselect2(bat *ret, const bat *bid, const bat *sid, const str *pat, const str *esc, const bit *caseignore, const bit *anti);
mal_export str PCRElikeselect1(bat *ret, const bat *bid, const bat *cid, const str *pat, const str *esc, const bit *anti);
mal_export str PCRElikeselect3(bat *ret, const bat *bid, const bat *sid, const str *pat, const str *esc, const bit *anti);
mal_export str PCRElikeselect4(bat *ret, const bat *bid, const bat *cid, const str *pat, const bit *anti);
mal_export str PCRElikeselect5(bat *ret, const bat *bid, const bat *sid, const str *pat, const bit *anti);

/* LIKE / ILIKE joins, with (no suffix) or without ("1") escape string */
mal_export str LIKEjoin(bat *r1, bat *r2, const bat *lid, const bat *rid, const str *esc, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate);
mal_export str LIKEjoin1(bat *r1, bat *r2, const bat *lid, const bat *rid, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate);
mal_export str ILIKEjoin(bat *r1, bat *r2, const bat *lid, const bat *rid, const str *esc, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate);
mal_export str ILIKEjoin1(bat *r1, bat *r2, const bat *lid, const bat *rid, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate);
86
/* current implementation assumes simple %keyword% [keyw%]* */
/* One literal segment of a simplified LIKE pattern.  The segments of
 * a pattern form a singly linked list that is built by re_create and
 * released by re_destroy. */
typedef struct RE {
	char *k;	/* segment as bytes; used for case-sensitive matching */
	uint32_t *w;	/* segment as code points; used for case-insensitive matching */
	bool search;	/* true: segment is searched for from the current
			 * position onward; false: it must match at the
			 * current position */
	size_t len;	/* segment length as recorded by re_create */
	struct RE *n;	/* next segment, NULL for the last one */
} RE;
95
96/* We cannot use strcasecmp and strncasecmp since they work byte for
97 * byte and don't deal with multibyte encodings (such as UTF-8).
98 *
99 * We implement our own conversion from UTF-8 encoding to Unicode code
100 * points which we store in uint32_t. The reason for this is,
101 * functions like mbsrtowcs are locale-dependent (so we need a UTF-8
102 * locale to use them), and on Windows, wchar_t is only 2 bytes and
103 * therefore cannot hold all Unicode code points. We do use functions
104 * such as towlower to convert a Unicode code point to its lower-case
105 * equivalent, but again on Windows, if the code point doesn't fit in
106 * 2 bytes, we skip this conversion and compare the unconverted code
107 * points.
108 *
109 * Note, towlower is also locale-dependent, but we don't need a UTF-8
110 * locale in order to use it. */
111
/* Decode the single UTF-8 encoded character at the start of src and
 * store its Unicode code point in *dest.
 *
 * Returns the number of bytes that make up the character (1 to 4),
 * 0 if src points at the terminating NUL byte (*dest is then set to
 * 0), or (size_t) -1 if the bytes do not form a valid UTF-8 sequence
 * (*dest is then left untouched).  Overlong encodings, code points
 * above U+10FFFF and the UTF-16 surrogates U+D800..U+DFFF are all
 * rejected.  Bytes after the first are only inspected when all the
 * preceding ones were acceptable, so the function never reads past
 * the terminating NUL byte. */
static size_t
utfc8touc(uint32_t *restrict dest, const char *restrict src)
{
	/* work on unsigned bytes so the code does not depend on the
	 * signedness of plain char */
	const unsigned char *u = (const unsigned char *) src;
	uint32_t c;

	if (u[0] < 0x80) {
		/* plain ASCII, including the terminating NUL */
		*dest = u[0];
		return u[0] != 0;
	}
	if ((u[0] & 0xE0) == 0xC0) {
		/* two bytes; lead bytes 0xC0 and 0xC1 would be overlong */
		if ((u[1] & 0xC0) != 0x80 || (u[0] & 0x1E) == 0)
			return (size_t) -1;
		*dest = (uint32_t) (u[0] & 0x1F) << 6
			| (uint32_t) (u[1] & 0x3F);
		return 2;
	}
	if ((u[0] & 0xF0) == 0xE0) {
		if ((u[1] & 0xC0) != 0x80 || (u[2] & 0xC0) != 0x80)
			return (size_t) -1;
		c = (uint32_t) (u[0] & 0x0F) << 12
			| (uint32_t) (u[1] & 0x3F) << 6
			| (uint32_t) (u[2] & 0x3F);
		/* overlong encoding, or a UTF-16 surrogate: surrogates
		 * can only show up in the three byte form, so this is
		 * where they have to be caught */
		if (c < 0x800 || (c & 0xF800) == 0xD800)
			return (size_t) -1;
		*dest = c;
		return 3;
	}
	if ((u[0] & 0xF8) == 0xF0) {
		if ((u[1] & 0xC0) != 0x80
		    || (u[2] & 0xC0) != 0x80
		    || (u[3] & 0xC0) != 0x80)
			return (size_t) -1;
		c = (uint32_t) (u[0] & 0x07) << 18
			| (uint32_t) (u[1] & 0x3F) << 12
			| (uint32_t) (u[2] & 0x3F) << 6
			| (uint32_t) (u[3] & 0x3F);
		/* overlong encoding, or beyond the Unicode range */
		if (c < 0x10000 || c > 0x10FFFF)
			return (size_t) -1;
		*dest = c;
		return 4;
	}
	/* stray continuation byte or illegal lead byte */
	return (size_t) -1;
}
152
/* Convert the NUL-terminated UTF-8 string src to a newly allocated,
 * 0-terminated array of Unicode code points.
 *
 * Returns NULL if src is not valid UTF-8 (overlong encodings, code
 * points above U+10FFFF and the UTF-16 surrogates U+D800..U+DFFF are
 * rejected) or if the allocation fails.  The result is allocated with
 * GDKmalloc; the users in this file release it with GDKfree. */
static uint32_t *
utf8stoucs(const char *src)
{
	/* work on unsigned bytes so the code does not depend on the
	 * signedness of plain char */
	const unsigned char *u = (const unsigned char *) src;
	uint32_t *dest;
	size_t i = 0;		/* number of code points */
	size_t j = 0;		/* byte position in src */

	/* count how many uint32_t's we need, while also checking for
	 * correctness of the input */
	while (u[j]) {
		if ((u[j] & 0x80) == 0) {
			j += 1;
		} else if ((u[j] & 0xE0) == 0xC0
			   && (u[j + 1] & 0xC0) == 0x80
			   && (u[j] & 0x1E) != 0) {
			j += 2;
		} else if ((u[j] & 0xF0) == 0xE0
			   && (u[j + 1] & 0xC0) == 0x80
			   && (u[j + 2] & 0xC0) == 0x80) {
			uint32_t c = (uint32_t) (u[j] & 0x0F) << 12
				| (uint32_t) (u[j + 1] & 0x3F) << 6
				| (uint32_t) (u[j + 2] & 0x3F);
			/* overlong encoding, or a UTF-16 surrogate:
			 * surrogates can only show up in the three
			 * byte form */
			if (c < 0x800 || (c & 0xF800) == 0xD800)
				return NULL;
			j += 3;
		} else if ((u[j] & 0xF8) == 0xF0
			   && (u[j + 1] & 0xC0) == 0x80
			   && (u[j + 2] & 0xC0) == 0x80
			   && (u[j + 3] & 0xC0) == 0x80) {
			uint32_t c = (uint32_t) (u[j] & 0x07) << 18
				| (uint32_t) (u[j + 1] & 0x3F) << 12
				| (uint32_t) (u[j + 2] & 0x3F) << 6
				| (uint32_t) (u[j + 3] & 0x3F);
			/* overlong encoding, or beyond the Unicode range */
			if (c < 0x10000 || c > 0x10FFFF)
				return NULL;
			j += 4;
		} else {
			return NULL;
		}
		i++;
	}
	dest = GDKmalloc((i + 1) * sizeof(uint32_t));
	if (dest == NULL)
		return NULL;
	/* go through the source string again, this time we can skip
	 * the correctness tests: every lead byte is known to be
	 * followed by the right number of continuation bytes */
	i = j = 0;
	while (u[j]) {
		if ((u[j] & 0x80) == 0) {
			dest[i++] = u[j];
			j += 1;
		} else if ((u[j] & 0xE0) == 0xC0) {
			dest[i++] = (uint32_t) (u[j] & 0x1F) << 6
				| (uint32_t) (u[j + 1] & 0x3F);
			j += 2;
		} else if ((u[j] & 0xF0) == 0xE0) {
			dest[i++] = (uint32_t) (u[j] & 0x0F) << 12
				| (uint32_t) (u[j + 1] & 0x3F) << 6
				| (uint32_t) (u[j + 2] & 0x3F);
			j += 3;
		} else {
			dest[i++] = (uint32_t) (u[j] & 0x07) << 18
				| (uint32_t) (u[j + 1] & 0x3F) << 12
				| (uint32_t) (u[j + 2] & 0x3F) << 6
				| (uint32_t) (u[j + 3] & 0x3F);
			j += 4;
		}
	}
	dest[i] = 0;
	return dest;
}
225
/* Return the number of code points in the 0-terminated code point
 * string ucs, not counting the terminator. */
static size_t
myucslen(const uint32_t *ucs)
{
	const uint32_t *end = ucs;

	while (*end != 0)
		end++;
	return (size_t) (end - ucs);
}
235
/* Compare, ignoring case, the UTF-8 string s1 with at most the first
 * n2 code points of the code point string s2.
 *
 * Returns 0 if the first n2 code points are equal after conversion
 * with towlower (or if both strings end together before that), a
 * negative value if s1 ends first, 1 if s2 ends first, and otherwise
 * the difference between the first pair of differing (lower-cased)
 * code points.  An invalid UTF-8 sequence in s1 is treated as the end
 * of s1. */
static int
mywstrncasecmp(const char *restrict s1, const uint32_t *restrict s2, size_t n2)
{
	uint32_t c1;

	while (n2 > 0) {
		size_t nn1 = utfc8touc(&c1, s1);

		/* end of s1, or s1 cannot be decoded any further */
		if (nn1 == 0 || nn1 == (size_t) -1)
			return -(*s2 != 0);
		if (*s2 == 0)
			return 1;
#if SIZEOF_WCHAR_T == 2
		/* towlower cannot handle code points that do not fit
		 * in wchar_t, so compare those unconverted */
		if (c1 > 0xFFFF || *s2 > 0xFFFF) {
			if (c1 != *s2)
				return c1 - *s2;
		} else
#endif
		{
			wint_t l1 = towlower((wint_t) c1);
			wint_t l2 = towlower((wint_t) *s2);

			if (l1 != l2)
				return l1 - l2;
		}
		s1 += nn1;
		n2--;
		s2++;
	}
	return 0;
}
263
/* Compare, ignoring case, the two UTF-8 strings s1 and s2.
 *
 * Returns 0 if both strings are equal after conversion of every code
 * point with towlower, a negative value if s1 ends first, 1 if s2
 * ends first, and otherwise the difference between the first pair of
 * differing (lower-cased) code points.  An invalid UTF-8 sequence is
 * treated as the end of the string it occurs in. */
static int
mystrcasecmp(const char *s1, const char *s2)
{
	uint32_t c1, c2;

	for (;;) {
		size_t nn1 = utfc8touc(&c1, s1);
		size_t nn2 = utfc8touc(&c2, s2);

		/* end of s1, or s1 cannot be decoded any further */
		if (nn1 == 0 || nn1 == (size_t) -1)
			return -(nn2 != 0 && nn2 != (size_t) -1);
		/* same for s2 */
		if (nn2 == 0 || nn2 == (size_t) -1)
			return 1;
#if SIZEOF_WCHAR_T == 2
		/* towlower cannot handle code points that do not fit
		 * in wchar_t, so compare those unconverted */
		if (c1 > 0xFFFF || c2 > 0xFFFF) {
			if (c1 != c2)
				return c1 - c2;
		} else
#endif
		{
			wint_t l1 = towlower((wint_t) c1);
			wint_t l2 = towlower((wint_t) c2);

			if (l1 != l2)
				return l1 - l2;
		}
		s1 += nn1;
		s2 += nn2;
	}
}
291
/* Compare, ignoring case, the UTF-8 string s1 with the 0-terminated
 * code point string s2.
 *
 * Returns 0 if both strings are equal after conversion of every code
 * point with towlower, a negative value if s1 ends first, 1 if s2
 * ends first, and otherwise the difference between the first pair of
 * differing (lower-cased) code points.  An invalid UTF-8 sequence in
 * s1 is treated as the end of s1. */
static int
mywstrcasecmp(const char *restrict s1, const uint32_t *restrict s2)
{
	uint32_t c1;

	for (;;) {
		size_t nn1 = utfc8touc(&c1, s1);

		/* end of s1, or s1 cannot be decoded any further */
		if (nn1 == 0 || nn1 == (size_t) -1)
			return -(*s2 != 0);
		if (*s2 == 0)
			return 1;
#if SIZEOF_WCHAR_T == 2
		/* towlower cannot handle code points that do not fit
		 * in wchar_t, so compare those unconverted */
		if (c1 > 0xFFFF || *s2 > 0xFFFF) {
			if (c1 != *s2)
				return c1 - *s2;
		} else
#endif
		{
			wint_t l1 = towlower((wint_t) c1);
			wint_t l2 = towlower((wint_t) *s2);

			if (l1 != l2)
				return l1 - l2;
		}
		s1 += nn1;
		s2++;
	}
}
317
/* Find, ignoring case, the first occurrence of the code point string
 * wneedle in the UTF-8 string haystack.
 *
 * Returns a pointer to the start of the match inside haystack,
 * haystack itself if wneedle is empty, or NULL if there is no match.
 * NULL is also returned as soon as an attempt runs into the end of
 * haystack or into an invalid UTF-8 sequence. */
static const char *
mywstrcasestr(const char *restrict haystack, const uint32_t *restrict wneedle)
{
	size_t nlen = myucslen(wneedle);

	if (nlen == 0)
		return haystack;

	while (*haystack) {
		size_t i;
		size_t h = 0;		/* byte offset into haystack */
		size_t step = 0;	/* byte length of first character */

		for (i = 0; i < nlen; i++) {
			uint32_t c;
			size_t j = utfc8touc(&c, haystack + h);

			/* haystack is too short for the needle (or is
			 * broken), so no later position can match */
			if (j == 0 || j == (size_t) -1)
				return NULL;
			if (i == 0)
				step = j;
#if SIZEOF_WCHAR_T == 2
			/* towlower cannot handle code points that do
			 * not fit in wchar_t */
			if (c > 0xFFFF || wneedle[i] > 0xFFFF) {
				if (c != wneedle[i])
					break;
			} else
#endif
			if (towlower((wint_t) c) != towlower((wint_t) wneedle[i]))
				break;
			h += j;
		}
		if (i == nlen)
			return haystack;
		/* retry one whole character further on */
		haystack += step;
	}
	return NULL;
}
357
/* Tell whether pat is a "simple" LIKE pattern: it contains no
 * unescaped `_' (single character match) and its last character is an
 * unescaped `%' (any sequence match).  A `%' in the very first
 * position always counts as a wildcard.  esc is the escape character;
 * a character following it is taken literally.  Returns false for a
 * NULL pattern. */
static bool
re_simple(const char *pat, unsigned char esc)
{
	bool after_escape = false;
	bool ends_in_percent = false;
	const char *cp = pat;

	if (cp == 0)
		return false;
	if (*cp == '%') {
		ends_in_percent = true;
		cp++;
	}
	for (; *cp != '\0'; cp++) {
		unsigned char ch = (unsigned char) *cp;

		if (after_escape) {
			/* escaped character: taken literally */
			after_escape = false;
			ends_in_percent = false;
			continue;
		}
		if (ch == esc) {
			after_escape = true;
			ends_in_percent = false;
			continue;
		}
		if (ch == '_')
			return false;
		ends_in_percent = (ch == '%');
	}
	return ends_in_percent;
}
388
389static bool
390is_strcmpable(const char *pat, const char *esc)
391{
392 if (pat[strcspn(pat, "%_")])
393 return false;
394 return strlen(esc) == 0 || strcmp(esc, str_nil) == 0 || strstr(pat, esc) == NULL;
395}
396
397static bool
398re_match_ignore(const char *s, RE *pattern)
399{
400 RE *r;
401
402 for (r = pattern; r; r = r->n) {
403 if (*r->w == 0 && (r->search || *s == 0))
404 return true;
405 if (!*s ||
406 (r->search ? (s = mywstrcasestr(s, r->w)) == NULL : mywstrncasecmp(s, r->w, r->len) != 0))
407 return false;
408 s += r->len;
409 }
410 return true;
411}
412
413static bool
414re_match_no_ignore(const char *s, RE *pattern)
415{
416 RE *r;
417
418 for (r = pattern; r; r = r->n) {
419 if (*r->k == 0 && (r->search || *s == 0))
420 return true;
421 if (!*s ||
422 (r->search ? (s = strstr(s, r->k)) == NULL : strncmp(s, r->k, r->len) != 0))
423 return false;
424 s += r->len;
425 }
426 return true;
427}
428
429static void
430re_destroy(RE *p)
431{
432 if (p) {
433 GDKfree(p->k);
434 GDKfree(p->w);
435 do {
436 RE *n = p->n;
437
438 GDKfree(p);
439 p = n;
440 } while (p);
441 }
442}
443
444/* Create a linked list of RE structures. Depending on the caseignore
445 * flag, the w (if true) or the k (if false) field is used. These
446 * fields in the first structure are allocated, whereas in all
447 * subsequent structures the fields point into the allocated buffer of
448 * the first. */
449static RE *
450re_create(const char *pat, bool caseignore, uint32_t esc)
451{
452 RE *r = (RE*)GDKmalloc(sizeof(RE)), *n = r;
453 bool escaped = false;
454
455 if (r == NULL)
456 return NULL;
457 *r = (struct RE) {.search = false};
458
459 while (esc != '%' && *pat == '%') {
460 pat++; /* skip % */
461 r->search = true;
462 }
463 if (caseignore) {
464 uint32_t *wp;
465 uint32_t *wq;
466 wp = utf8stoucs(pat);
467 if (wp == NULL) {
468 GDKfree(r);
469 return NULL;
470 }
471 r->w = wp;
472 wq = wp;
473 while (*wp) {
474 if (escaped) {
475 *wq++ = *wp;
476 escaped = false;
477 } else if (*wp == esc) {
478 escaped = true;
479 } else if (*wp == '%') {
480 n->len = (size_t) (wq - r->w);
481 while (wp[1] == '%')
482 wp++;
483 if (wp[1]) {
484 n = n->n = GDKmalloc(sizeof(RE));
485 if (n == NULL)
486 goto bailout;
487 *n = (struct RE) {.search = true, .w = wp + 1};
488 }
489 *wq++ = 0;
490 } else {
491 *wq++ = *wp;
492 }
493 wp++;
494 }
495 } else {
496 char *p, *q;
497 if ((p = GDKstrdup(pat)) == NULL) {
498 GDKfree(r);
499 return NULL;
500 }
501 r->k = p;
502 q = p;
503 while (*p) {
504 if (escaped) {
505 *q++ = *p;
506 escaped = false;
507 } else if ((unsigned char) *p == esc) {
508 escaped = true;
509 } else if (*p == '%') {
510 n->len = (size_t) (q - r->k);
511 while (p[1] == '%')
512 p++;
513 if (p[1]) {
514 n = n->n = GDKmalloc(sizeof(RE));
515 if (n == NULL)
516 goto bailout;
517 *n = (struct RE) {.search = true, .k = p + 1};
518 }
519 *q++ = 0;
520 } else {
521 *q++ = *p;
522 }
523 p++;
524 }
525 }
526 return r;
527 bailout:
528 re_destroy(r);
529 return NULL;
530}
531
#ifdef HAVE_LIBPCRE
/* Compile pattern with the PCRE library in UTF-8, multi-line mode,
 * case-insensitively if insensitive is set.  On success the compiled
 * pattern is stored in *res and MAL_SUCCEED is returned; on failure
 * an exception is returned that carries the PCRE error message and
 * the error position within the pattern. */
static str
pcre_compile_wrap(pcre **res, const char *pattern, bit insensitive)
{
	const char *err_p = NULL;
	int errpos = 0;
	int options = PCRE_UTF8 | PCRE_MULTILINE;
	pcre *compiled;

	if (insensitive)
		options |= PCRE_CASELESS;

	compiled = pcre_compile(pattern, options, &err_p, &errpos, NULL);
	if (compiled == NULL)
		throw(MAL, "pcre.compile", OPERATION_FAILED
		      " with\n'%s'\nat %d in\n'%s'.\n",
		      err_p, errpos, pattern);
	*res = compiled;
	return MAL_SUCCEED;
}
#endif
552
/* these two defines are copies from gdk_select.c */

/* scan select loop with candidates
 *
 * The macro relies on these names in the scope where it is expanded:
 * b, s and anti (only used for the debug message), ci (an initialised
 * candidate iterator), p, o, r, off, v, bi and bn.  For each of the
 * ci.ncand candidates, the next oid is fetched into o, the value at
 * position o - off is loaded into v, and o is appended to bn if TEST
 * holds.
 * NOTE(review): bunfastappTYPE presumably jumps to a label called
 * bunins_failed when appending fails (both functions that use this
 * macro define one) -- confirm against the GDK headers. */
#define candscanloop(TEST)						\
	do {								\
		ALGODEBUG fprintf(stderr,				\
			    "#BATselect(b=%s#"BUNFMT",s=%s,anti=%d): "	\
			    "scanselect %s\n", BATgetId(b), BATcount(b), \
			    s ? BATgetId(s) : "NULL", anti, #TEST);	\
		for (p = 0; p < ci.ncand; p++) {			\
			o = canditer_next(&ci);				\
			r = (BUN) (o - off);				\
			v = BUNtvar(bi, r);				\
			if (TEST)					\
				bunfastappTYPE(oid, bn, &o);		\
		}							\
	} while (0)

/* scan select loop without candidates
 *
 * The macro relies on these names in the scope where it is expanded:
 * b, s and anti (only used for the debug message), p, q, o, off, v,
 * bi and bn.  p is advanced up to (but excluding) q; for each p, the
 * value at position p - off is loaded into v, and p is appended to bn
 * as an oid if TEST holds.
 * NOTE(review): as above, bunfastappTYPE presumably needs a
 * bunins_failed label -- confirm. */
#define scanloop(TEST)							\
	do {								\
		ALGODEBUG fprintf(stderr,				\
			    "#BATselect(b=%s#"BUNFMT",s=%s,anti=%d): "	\
			    "scanselect %s\n", BATgetId(b), BATcount(b), \
			    s ? BATgetId(s) : "NULL", anti, #TEST);	\
		while (p < q) {						\
			v = BUNtvar(bi, p-off);				\
			if (TEST) {					\
				o = (oid) p;				\
				bunfastappTYPE(oid, bn, &o);		\
			}						\
			p++;						\
		}							\
	} while (0)
587
/* Select from the string BAT b, optionally restricted to the
 * candidate list s, the oids of all values that match (or, if anti is
 * set, do not match) the regular expression pat.  pat is handed
 * unchanged to pcre_compile, or to regcomp when the PCRE library is
 * not available; caseignore requests case-insensitive matching.
 *
 * On success, a new BAT with the selected oids is stored in *bnp and
 * MAL_SUCCEED is returned.  If the pattern cannot be compiled or
 * studied, or if allocating the result fails, an exception is
 * returned; if appending to the result fails, *bnp is in addition set
 * to NULL. */
static str
pcre_likeselect(BAT **bnp, BAT *b, BAT *s, const char *pat, bool caseignore, bool anti)
{
#ifdef HAVE_LIBPCRE
	int options = PCRE_UTF8 | PCRE_MULTILINE | PCRE_DOTALL;
	pcre *re;
	pcre_extra *pe;
	const char *error;
	int errpos;
	int ovector[9];
#else
	int options = REG_NEWLINE | REG_NOSUB | REG_EXTENDED;
	regex_t re;
	int errcode;
#endif
	BATiter bi = bat_iterator(b);
	BAT *bn;
	BUN p, q;
	oid o, off;
	const char *v;
	struct canditer ci;

	canditer_init(&ci, b, s);

	assert(ATOMstorage(b->ttype) == TYPE_str);

	if (caseignore) {
#ifdef HAVE_LIBPCRE
		options |= PCRE_CASELESS;
#else
		options |= REG_ICASE;
#endif
	}
#ifdef HAVE_LIBPCRE
	if ((re = pcre_compile(pat, options, &error, &errpos, NULL)) == NULL)
		throw(MAL, "pcre.likeselect",
		      OPERATION_FAILED ": compilation of pattern \"%s\" failed\n", pat);
	/* JIT compilation is only requested when the number of values
	 * to look at exceeds JIT_COMPILE_MIN */
	pe = pcre_study(re, (s ? BATcount(s) : BATcount(b)) > JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0, &error);
	if (error != NULL) {
		pcre_free(re);
		throw(MAL, "pcre.likeselect",
		      OPERATION_FAILED ": studying pattern \"%s\" failed\n", pat);
	}
#else
	if ((errcode = regcomp(&re, pat, options)) != 0) {
		throw(MAL, "pcre.likeselect",
		      OPERATION_FAILED ": compilation of pattern \"%s\" failed\n", pat);
	}
#endif
	/* the result can never hold more oids than there are candidates */
	bn = COLnew(0, TYPE_oid, ci.ncand, TRANSIENT);
	if (bn == NULL) {
#ifdef HAVE_LIBPCRE
		pcre_free_study(pe);
		pcre_free(re);
#else
		regfree(&re);
#endif
		throw(MAL, "pcre.likeselect", SQLSTATE(HY001) MAL_MALLOC_FAIL);
	}
	off = b->hseqbase;

	if (s && !BATtdense(s)) {
		/* non-dense candidate list: go through the candidates */
		BUN r;

/* BODY is the match test on the current value v; being a
 * preprocessor macro it is also available in the else branch below */
#ifdef HAVE_LIBPCRE
#define BODY     (pcre_exec(re, pe, v, (int) strlen(v), 0, 0, ovector, 9) >= 0)
#else
#define BODY     (regexec(&re, v, (size_t) 0, NULL, 0) != REG_NOMATCH)
#endif
		/* NOTE(review): *v != '\200' presumably skips nil
		 * strings, so that nil is selected in neither the
		 * normal nor the anti case -- confirm */
		if (anti)
			candscanloop(v && *v != '\200' && !BODY);
		else
			candscanloop(v && *v != '\200' && BODY);
	} else {
		/* dense or no candidate list: scan the oid range
		 * [p, q) */
		if (s) {
			assert(BATtdense(s));
			p = (BUN) s->tseqbase;
			q = p + BATcount(s);
			/* restrict the range to the oids present in b */
			if ((oid) p < b->hseqbase)
				p = b->hseqbase;
			if ((oid) q > b->hseqbase + BATcount(b))
				q = b->hseqbase + BATcount(b);
		} else {
			p = off;
			q = BUNlast(b) + off;
		}
		if (anti)
			scanloop(v && *v != '\200' && !BODY);
		else
			scanloop(v && *v != '\200' && BODY);
	}
#ifdef HAVE_LIBPCRE
	pcre_free_study(pe);
	pcre_free(re);
#else
	regfree(&re);
#endif
	BATsetcount(bn, BATcount(bn)); /* set some properties */
	bn->theap.dirty |= BATcount(bn) > 0;
	/* oids were appended in the order in which they were visited */
	bn->tsorted = true;
	bn->trevsorted = bn->batCount <= 1;
	bn->tkey = true;
	bn->tseqbase = bn->batCount == 0 ? 0 : bn->batCount == 1 ? * (oid *) Tloc(bn, 0) : oid_nil;
	*bnp = bn;
	return MAL_SUCCEED;

  /* NOTE(review): presumably the jump target of bunfastappTYPE inside
   * the scan loop macros -- confirm */
  bunins_failed:
	BBPreclaim(bn);
#ifdef HAVE_LIBPCRE
	pcre_free_study(pe);
	pcre_free(re);
#else
	regfree(&re);
#endif
	*bnp = NULL;
	throw(MAL, "pcre.likeselect", OPERATION_FAILED);
}
705
706static str
707re_likeselect(BAT **bnp, BAT *b, BAT *s, const char *pat, bool caseignore, bool anti, bool use_strcmp, uint32_t esc)
708{
709 BATiter bi = bat_iterator(b);
710 BAT *bn;
711 BUN p, q;
712 oid o, off;
713 const char *v;
714 RE *re = NULL;
715
716 assert(ATOMstorage(b->ttype) == TYPE_str);
717
718 bn = COLnew(0, TYPE_oid, s ? BATcount(s) : BATcount(b), TRANSIENT);
719 if (bn == NULL)
720 throw(MAL, "pcre.likeselect", SQLSTATE(HY001) MAL_MALLOC_FAIL);
721 off = b->hseqbase;
722
723 if (!use_strcmp) {
724 re = re_create(pat, caseignore, esc);
725 if (!re)
726 throw(MAL, "pcre.likeselect", SQLSTATE(HY001) MAL_MALLOC_FAIL);
727 }
728 if (s && !BATtdense(s)) {
729 struct canditer ci;
730 BUN r;
731
732 canditer_init(&ci, b, s);
733
734 if (use_strcmp) {
735 if (caseignore) {
736 uint32_t *wpat;
737 wpat = utf8stoucs(pat);
738 if (wpat == NULL)
739 throw(MAL, "pcre.likeselect", SQLSTATE(HY001) MAL_MALLOC_FAIL);
740 if (anti)
741 candscanloop(v && *v != '\200' &&
742 mywstrcasecmp(v, wpat) != 0);
743 else
744 candscanloop(v && *v != '\200' &&
745 mywstrcasecmp(v, wpat) == 0);
746 GDKfree(wpat);
747 } else {
748 if (anti)
749 candscanloop(v && *v != '\200' &&
750 strcmp(v, pat) != 0);
751 else
752 candscanloop(v && *v != '\200' &&
753 strcmp(v, pat) == 0);
754 }
755 } else {
756 if (caseignore) {
757 if (anti)
758 candscanloop(v && *v != '\200' &&
759 re_match_ignore(v, re) == 0);
760 else
761 candscanloop(v && *v != '\200' &&
762 re_match_ignore(v, re));
763 } else {
764 if (anti)
765 candscanloop(v && *v != '\200' &&
766 re_match_no_ignore(v, re) == 0);
767 else
768 candscanloop(v && *v != '\200' &&
769 re_match_no_ignore(v, re));
770 }
771 }
772 } else {
773 if (s) {
774 assert(BATtdense(s));
775 p = (BUN) s->tseqbase;
776 q = p + BATcount(s);
777 if ((oid) p < b->hseqbase)
778 p = b->hseqbase;
779 if ((oid) q > b->hseqbase + BATcount(b))
780 q = b->hseqbase + BATcount(b);
781 } else {
782 p = off;
783 q = BUNlast(b) + off;
784 }
785 if (use_strcmp) {
786 if (caseignore) {
787 uint32_t *wpat;
788 wpat = utf8stoucs(pat);
789 if (wpat == NULL)
790 throw(MAL, "pcre.likeselect", SQLSTATE(HY001) MAL_MALLOC_FAIL);
791 if (anti)
792 scanloop(v && *v != '\200' &&
793 mywstrcasecmp(v, wpat) != 0);
794 else
795 scanloop(v && *v != '\200' &&
796 mywstrcasecmp(v, wpat) == 0);
797 GDKfree(wpat);
798 } else {
799 if (anti)
800 scanloop(v && *v != '\200' &&
801 strcmp(v, pat) != 0);
802 else
803 scanloop(v && *v != '\200' &&
804 strcmp(v, pat) == 0);
805 }
806 } else {
807 if (caseignore) {
808 if (anti)
809 scanloop(v && *v != '\200' &&
810 re_match_ignore(v, re) == 0);
811 else
812 scanloop(v && *v != '\200' &&
813 re_match_ignore(v, re));
814 } else {
815 if (anti)
816 scanloop(v && *v != '\200' &&
817 re_match_no_ignore(v, re) == 0);
818 else
819 scanloop(v && *v != '\200' &&
820 re_match_no_ignore(v, re));
821 }
822 }
823 }
824 BATsetcount(bn, BATcount(bn)); /* set some properties */
825 bn->tsorted = true;
826 bn->trevsorted = bn->batCount <= 1;
827 bn->tkey = true;
828 bn->tseqbase = bn->batCount == 0 ? 0 : bn->batCount == 1 ? * (oid *) Tloc(bn, 0) : oid_nil;
829 *bnp = bn;
830 re_destroy(re);
831 return MAL_SUCCEED;
832
833 bunins_failed:
834 re_destroy(re);
835 BBPreclaim(bn);
836 *bnp = NULL;
837 throw(MAL, "pcre.likeselect", OPERATION_FAILED);
838}
839
/* maximum number of back references and quoted \ or $ in replacement string */
#define MAX_NR_REFS	20

/* A back reference (or a doubled $ or \) found in a replacement
 * string; filled in by parse_replacement and used by single_replace. */
struct backref {
	int idx;	/* number of the referenced group; INT_MAX for a
			 * doubled $ or \ */
	int start;	/* offset in the replacement string where the
			 * reference starts */
	int end;	/* offset just past the part of the replacement
			 * string that is to be left out of the output */
};
848
849#ifdef HAVE_LIBPCRE
850/* fill in parameter backrefs (length maxrefs) with information about
851 * back references in the replacement string; a back reference is a
852 * dollar or backslash followed by a number */
853static int
854parse_replacement(const char *replacement, int len_replacement,
855 struct backref *backrefs, int maxrefs)
856{
857 int nbackrefs = 0;
858
859 for (int i = 0; i < len_replacement && nbackrefs < maxrefs; i++) {
860 if (replacement[i] == '$' || replacement[i] == '\\') {
861 char *endptr;
862 backrefs[nbackrefs].idx = strtol(replacement + i + 1, &endptr, 10);
863 if (endptr > replacement + i + 1) {
864 int k = (int) (endptr - (replacement + i + 1));
865 backrefs[nbackrefs].start = i;
866 backrefs[nbackrefs].end = i + k + 1;
867 nbackrefs++;
868 } else if (replacement[i] == replacement[i + 1]) {
869 /* doubled $ or \, we must copy just one to the output */
870 backrefs[nbackrefs].idx = INT_MAX; /* impossible value > 0 */
871 backrefs[nbackrefs].start = i;
872 backrefs[nbackrefs].end = i + 1;
873 i++; /* don't look at second $ or \ again */
874 nbackrefs++;
875 }
876 /* else: $ or \ followed by something we don't recognize,
877 * so just leave it */
878 }
879 }
880 return nbackrefs;
881}
882
883static char *
884single_replace(pcre *pcre_code, pcre_extra *extra,
885 const char *origin_str, int len_origin_str,
886 int exec_options, int *ovector, int ovecsize,
887 const char *replacement, int len_replacement,
888 struct backref *backrefs, int nbackrefs,
889 bool global, char *result, int *max_result)
890{
891 int offset = 0;
892 int len_result = 0;
893 int addlen;
894 char *tmp;
895
896 do {
897 int j = pcre_exec(pcre_code, extra, origin_str, len_origin_str, offset,
898 exec_options, ovector, ovecsize);
899 if (j <= 0)
900 break;
901 addlen = ovector[0] - offset + (nbackrefs == 0 ? len_replacement : 0);
902 if (len_result + addlen >= *max_result) {
903 tmp = GDKrealloc(result, len_result + addlen + 1);
904 if (tmp == NULL) {
905 GDKfree(result);
906 return NULL;
907 }
908 result = tmp;
909 *max_result = len_result + addlen + 1;
910 }
911 if (ovector[0] > offset) {
912 strncpy(result + len_result, origin_str + offset,
913 ovector[0] - offset);
914 len_result += ovector[0] - offset;
915 }
916 if (nbackrefs == 0) {
917 strncpy(result + len_result, replacement, len_replacement);
918 len_result += len_replacement;
919 } else {
920 int prevend = 0;
921 for (int i = 0; i < nbackrefs; i++) {
922 int off, len;
923 if (backrefs[i].idx >= ovecsize / 3) {
924 /* out of bounds, replace with empty string */
925 off = 0;
926 len = 0;
927 } else {
928 off = ovector[backrefs[i].idx * 2];
929 len = ovector[backrefs[i].idx * 2 + 1] - off;
930 }
931 addlen = backrefs[i].start - prevend + len;
932 if (len_result + addlen >= *max_result) {
933 tmp = GDKrealloc(result, len_result + addlen + 1);
934 if (tmp == NULL) {
935 GDKfree(result);
936 return NULL;
937 }
938 result = tmp;
939 *max_result = len_result + addlen + 1;
940 }
941 if (backrefs[i].start > prevend) {
942 strncpy(result + len_result, replacement + prevend,
943 backrefs[i].start - prevend);
944 len_result += backrefs[i].start - prevend;
945 }
946 if (len > 0) {
947 strncpy(result + len_result, origin_str + off, len);
948 len_result += len;
949 }
950 prevend = backrefs[i].end;
951 }
952 /* copy rest of replacement string (after last backref) */
953 addlen = len_replacement - prevend;
954 if (addlen > 0) {
955 if (len_result + addlen >= *max_result) {
956 tmp = GDKrealloc(result, len_result + addlen + 1);
957 if (tmp == NULL) {
958 GDKfree(result);
959 return NULL;
960 }
961 result = tmp;
962 *max_result = len_result + addlen + 1;
963 }
964 strncpy(result + len_result, replacement + prevend, addlen);
965 len_result += addlen;
966 }
967 }
968 offset = ovector[1];
969 } while (offset < len_origin_str && global);
970 if (offset < len_origin_str) {
971 addlen = len_origin_str - offset;
972 if (len_result + addlen >= *max_result) {
973 tmp = GDKrealloc(result, len_result + addlen + 1);
974 if (tmp == NULL) {
975 GDKfree(result);
976 return NULL;
977 }
978 result = tmp;
979 *max_result = len_result + addlen + 1;
980 }
981 strncpy(result + len_result, origin_str + offset, addlen);
982 len_result += addlen;
983 }
984 /* null terminate string */
985 result[len_result] = '\0';
986 return result;
987}
988#endif
989
990static str
991pcre_replace(str *res, const char *origin_str, const char *pattern,
992 const char *replacement, const char *flags, bool global)
993{
994#ifdef HAVE_LIBPCRE
995 const char *err_p = NULL;
996 pcre *pcre_code = NULL;
997 pcre_extra *extra;
998 char *tmpres;
999 int max_result;
1000 int i, errpos = 0;
1001 int compile_options = PCRE_UTF8, exec_options = PCRE_NOTEMPTY;
1002 int *ovector, ovecsize;
1003 int len_origin_str = (int) strlen(origin_str);
1004 int len_replacement = (int) strlen(replacement);
1005 struct backref backrefs[MAX_NR_REFS];
1006 int nbackrefs = 0;
1007
1008 while (*flags) {
1009 switch (*flags) {
1010 case 'e':
1011 exec_options &= ~PCRE_NOTEMPTY;
1012 break;
1013 case 'i':
1014 compile_options |= PCRE_CASELESS;
1015 break;
1016 case 'm':
1017 compile_options |= PCRE_MULTILINE;
1018 break;
1019 case 's':
1020 compile_options |= PCRE_DOTALL;
1021 break;
1022 case 'x':
1023 compile_options |= PCRE_EXTENDED;
1024 break;
1025 default:
1026 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
1027 ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n",
1028 *flags);
1029 }
1030 flags++;
1031 }
1032
1033 if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) {
1034 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
1035 OPERATION_FAILED ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n",
1036 pattern, errpos, err_p);
1037 }
1038
1039 /* Since the compiled pattern is going to be used several times, it is
1040 * worth spending more time analyzing it in order to speed up the time
1041 * taken for matching.
1042 */
1043 extra = pcre_study(pcre_code, 0, &err_p);
1044 if (err_p != NULL) {
1045 pcre_free(pcre_code);
1046 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
1047 OPERATION_FAILED ": pcre study of pattern (%s) failed with '%s'.\n",
1048 pattern, err_p);
1049 }
1050 pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i);
1051 ovecsize = (i + 1) * 3;
1052 if ((ovector = (int *) GDKmalloc(sizeof(int) * ovecsize)) == NULL) {
1053 pcre_free_study(extra);
1054 pcre_free(pcre_code);
1055 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
1056 SQLSTATE(HY001) MAL_MALLOC_FAIL);
1057 }
1058
1059 /* identify back references in the replacement string */
1060 nbackrefs = parse_replacement(replacement, len_replacement,
1061 backrefs, MAX_NR_REFS);
1062
1063 max_result = len_origin_str + 1;
1064 tmpres = GDKmalloc(max_result);
1065 if (tmpres == NULL) {
1066 GDKfree(ovector);
1067 pcre_free_study(extra);
1068 pcre_free(pcre_code);
1069 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
1070 SQLSTATE(HY001) MAL_MALLOC_FAIL);
1071 }
1072
1073 tmpres = single_replace(pcre_code, extra, origin_str, len_origin_str,
1074 exec_options, ovector, ovecsize, replacement,
1075 len_replacement, backrefs, nbackrefs, global,
1076 tmpres, &max_result);
1077 GDKfree(ovector);
1078 pcre_free_study(extra);
1079 pcre_free(pcre_code);
1080 if (tmpres == NULL)
1081 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
1082 SQLSTATE(HY001) MAL_MALLOC_FAIL);
1083
1084 *res = tmpres;
1085 return MAL_SUCCEED;
1086#else
1087 (void) res;
1088 (void) origin_str;
1089 (void) pattern;
1090 (void) replacement;
1091 (void) flags;
1092 (void) global;
1093 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
1094 "Database was compiled without PCRE support.");
1095#endif
1096}
1097
1098static str
1099pcre_replace_bat(BAT **res, BAT *origin_strs, const char *pattern,
1100 const char *replacement, const char *flags, bool global)
1101{
1102#ifdef HAVE_LIBPCRE
1103 BATiter origin_strsi = bat_iterator(origin_strs);
1104 const char *err_p = NULL;
1105 char *tmpres;
1106 int i, errpos = 0;
1107 int compile_options = PCRE_UTF8, exec_options = PCRE_NOTEMPTY;
1108 pcre *pcre_code = NULL;
1109 pcre_extra *extra;
1110 BAT *tmpbat;
1111 BUN p, q;
1112 int *ovector, ovecsize;
1113 int len_replacement = (int) strlen(replacement);
1114 struct backref backrefs[MAX_NR_REFS];
1115 int nbackrefs = 0;
1116 const char *origin_str;
1117 int max_dest_size = 0;
1118
1119 while (*flags) {
1120 switch (*flags) {
1121 case 'e':
1122 exec_options &= ~PCRE_NOTEMPTY;
1123 break;
1124 case 'i':
1125 compile_options |= PCRE_CASELESS;
1126 break;
1127 case 'm':
1128 compile_options |= PCRE_MULTILINE;
1129 break;
1130 case 's':
1131 compile_options |= PCRE_DOTALL;
1132 break;
1133 case 'x':
1134 compile_options |= PCRE_EXTENDED;
1135 break;
1136 default:
1137 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
1138 ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n",
1139 *flags);
1140 }
1141 flags++;
1142 }
1143
1144 if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) {
1145 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
1146 OPERATION_FAILED
1147 ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n",
1148 pattern, errpos, err_p);
1149 }
1150
1151 /* Since the compiled pattern is going to be used several times,
1152 * it is worth spending more time analyzing it in order to speed
1153 * up the time taken for matching.
1154 */
1155 extra = pcre_study(pcre_code, BATcount(origin_strs) > JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0, &err_p);
1156 if (err_p != NULL) {
1157 pcre_free(pcre_code);
1158 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
1159 OPERATION_FAILED);
1160 }
1161 pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i);
1162 ovecsize = (i + 1) * 3;
1163 if ((ovector = (int *) GDKzalloc(sizeof(int) * ovecsize)) == NULL) {
1164 pcre_free_study(extra);
1165 pcre_free(pcre_code);
1166 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
1167 SQLSTATE(HY001) MAL_MALLOC_FAIL);
1168 }
1169
1170 /* identify back references in the replacement string */
1171 nbackrefs = parse_replacement(replacement, len_replacement,
1172 backrefs, MAX_NR_REFS);
1173
1174 tmpbat = COLnew(origin_strs->hseqbase, TYPE_str, BATcount(origin_strs), TRANSIENT);
1175
1176 /* the buffer for all destination strings is allocated only once,
1177 * and extended when needed */
1178 max_dest_size = len_replacement + 1;
1179 tmpres = GDKmalloc(max_dest_size);
1180 if (tmpbat == NULL || tmpres == NULL) {
1181 pcre_free_study(extra);
1182 pcre_free(pcre_code);
1183 GDKfree(ovector);
1184 BBPreclaim(tmpbat);
1185 GDKfree(tmpres);
1186 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
1187 SQLSTATE(HY001) MAL_MALLOC_FAIL);
1188 }
1189 BATloop(origin_strs, p, q) {
1190 origin_str = BUNtvar(origin_strsi, p);
1191 tmpres = single_replace(pcre_code, extra, origin_str,
1192 (int) strlen(origin_str), exec_options,
1193 ovector, ovecsize, replacement,
1194 len_replacement, backrefs, nbackrefs, global,
1195 tmpres, &max_dest_size);
1196 if (tmpres == NULL || BUNappend(tmpbat, tmpres, false) != GDK_SUCCEED) {
1197 pcre_free_study(extra);
1198 pcre_free(pcre_code);
1199 GDKfree(ovector);
1200 GDKfree(tmpres);
1201 BBPreclaim(tmpbat);
1202 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
1203 SQLSTATE(HY001) MAL_MALLOC_FAIL);
1204 }
1205 }
1206 pcre_free_study(extra);
1207 pcre_free(pcre_code);
1208 GDKfree(ovector);
1209 GDKfree(tmpres);
1210 *res = tmpbat;
1211 return MAL_SUCCEED;
1212#else
1213 (void) res;
1214 (void) origin_strs;
1215 (void) pattern;
1216 (void) replacement;
1217 (void) flags;
1218 (void) global;
1219 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
1220 "Database was compiled without PCRE support.");
1221#endif
1222}
1223
1224str
1225pcre_init(void *ret)
1226{
1227 (void) ret;
1228 return NULL;
1229}
1230
/*
 * Compile the regular expression pat with the options selected by
 * flags, match it once against val, and store the verdict in *ret.
 *
 * flags  option letters, handled one at a time:
 *          'i'  PCRE_CASELESS  / REG_ICASE
 *          'm'  PCRE_MULTILINE / REG_NEWLINE
 *          's'  PCRE_DOTALL (only recognised when built with PCRE; in
 *               the regex.h build it falls into the default case)
 *          'x'  PCRE_EXTENDED  / REG_EXTENDED
 *        any other letter raises an ILLEGAL_ARGUMENT exception.
 *
 * The flags are validated before val is inspected, so a bad flag is
 * reported even for a nil value.  A nil val yields *ret = FALSE without
 * compiling the pattern.
 *
 * Returns MAL_SUCCEED, or an exception when a flag is unsupported, the
 * pattern does not compile, or matching fails with a code other than
 * "no match".
 */
static str
pcre_match_with_flags(bit *ret, const char *val, const char *pat, const char *flags)
{
	int pos;	/* match result: >= 0 match, -1 no match, else error */
#ifdef HAVE_LIBPCRE
	const char *err_p = NULL;
	int errpos = 0;
	int options = PCRE_UTF8;
	pcre *re;
#else
	int options = REG_NOSUB;
	regex_t re;
	int errcode;
	int retval;
#endif

	/* translate the flag letters into library option bits */
	while (*flags) {
		switch (*flags) {
		case 'i':
#ifdef HAVE_LIBPCRE
			options |= PCRE_CASELESS;
#else
			options |= REG_ICASE;
#endif
			break;
		case 'm':
#ifdef HAVE_LIBPCRE
			options |= PCRE_MULTILINE;
#else
			options |= REG_NEWLINE;
#endif
			break;
#ifdef HAVE_LIBPCRE
		case 's':
			options |= PCRE_DOTALL;
			break;
#endif
		case 'x':
#ifdef HAVE_LIBPCRE
			options |= PCRE_EXTENDED;
#else
			options |= REG_EXTENDED;
#endif
			break;
		default:
			throw(MAL, "pcre.match", ILLEGAL_ARGUMENT
			      ": unsupported flag character '%c'\n", *flags);
		}
		flags++;
	}
	/* a nil value never matches */
	if (strcmp(val, str_nil) == 0) {
		*ret = FALSE;
		return MAL_SUCCEED;
	}

#ifdef HAVE_LIBPCRE
	if ((re = pcre_compile(pat, options, &err_p, &errpos, NULL)) == NULL)
#else
	if ((errcode = regcomp(&re, pat, options)) != 0)
#endif
	{
		throw(MAL, "pcre.match", OPERATION_FAILED
		      ": compilation of regular expression (%s) failed "
#ifdef HAVE_LIBPCRE
		      "at %d with '%s'", pat, errpos, err_p
#else
		      , pat
#endif
			);
	}
#ifdef HAVE_LIBPCRE
	/* no output vector is passed: only the return code is of interest */
	pos = pcre_exec(re, NULL, val, (int) strlen(val), 0, 0, NULL, 0);
	pcre_free(re);
#else
	retval = regexec(&re, val, (size_t) 0, NULL, 0);
	/* map the regexec result onto the convention used below:
	 * REG_NOMATCH -> -1, REG_ENOSYS -> -2, anything else -> 0 (match) */
	pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0);
	regfree(&re);
#endif
	if (pos >= 0)
		*ret = TRUE;
	else if (pos == -1)
		*ret = FALSE;
	else
		throw(MAL, "pcre.match", OPERATION_FAILED
		      ": matching of regular expression (%s) failed with %d",
		      pat, pos);
	return MAL_SUCCEED;
}
1319
#ifdef HAVE_LIBPCRE
/* special characters in PCRE that need to be escaped */
static const char *pcre_specials = ".+?*()[]{}|^$\\";
#else
/* special characters in POSIX basic regular expressions that need to
 * be escaped
 *
 * NOTE(review): several regcomp() calls in this file pass REG_EXTENDED,
 * under which "+?(){}|" are presumably special as well; confirm whether
 * this list should cover them. */
static const char *pcre_specials = ".*[]^$\\";
#endif
1328
/* change SQL LIKE pattern into PCRE pattern
 *
 * pat      the SQL LIKE pattern
 * esc_str  the ESCAPE string: at most one byte long; an empty string or
 *          a first byte of '\200' means that there is no escape
 *          character
 * r        receives a GDKmalloc'ed result that the caller must free:
 *          - the translated pattern, anchored as "^...$", when pat
 *            contains an escape character, a character listed in
 *            pcre_specials, '%' or '_';
 *          - a copy of str_nil when pat contains none of those.
 *
 * Translation: an unescaped '%' becomes ".*?" (a run of '%' collapses
 * into one unless '%' is itself the escape character), an unescaped '_'
 * becomes ".", and characters from pcre_specials get a backslash in
 * front of them.
 *
 * Returns MAL_SUCCEED or an exception (escape string too long, pat is
 * NULL, pat ends in a dangling escape character, or out of memory).
 * When the dangling-escape or the str_nil allocation error is raised,
 * *r has been set to NULL. */
static str
sql2pcre(str *r, const char *pat, const char *esc_str)
{
	int escaped = 0;	/* previous character was an unconsumed escape */
	int hasWildcard = 0;	/* pattern needs the regular expression engine */
	char *ppat;		/* write position in the output buffer */
	int esc = esc_str[0] == '\200' ? 0 : esc_str[0]; /* should change to utf8_convert() */
	int specials;		/* escape character is itself a special character */
	int c;

	if (strlen(esc_str) > 1)
		throw(MAL, "pcre.sql2pcre", SQLSTATE(22019) ILLEGAL_ARGUMENT ": ESCAPE string must have length 1");
	if (pat == NULL )
		throw(MAL, "pcre.sql2pcre", OPERATION_FAILED);
	/* each input byte produces at most three output bytes (".*?") */
	ppat = GDKmalloc(strlen(pat)*3+3 /* 3 = "^'the translated regexp'$0" */);
	if (ppat == NULL)
		throw(MAL, "pcre.sql2pcre", SQLSTATE(HY001) MAL_MALLOC_FAIL);

	*r = ppat;
	/* The escape character can be a char which is special in a PCRE
	 * expression.  If the user used the "+" char as escape and has "++"
	 * in their pattern, then replacing this with "+" is not correct and
	 * should be "\+" instead. */
	specials = (esc && strchr(pcre_specials, esc) != NULL);

	*ppat++ = '^';
	while ((c = *pat++) != 0) {
		if (c == esc) {
			if (escaped) {
				if (specials) { /* change ++ into \+ */
					*ppat++ = esc;
				} else { /* do not escape simple escape symbols */
					ppat[-1] = esc; /* overwrite backslash */
				}
				escaped = 0;
			} else {
				/* tentatively emit a backslash; the next
				 * character decides whether it stays */
				*ppat++ = '\\';
				escaped = 1;
			}
			hasWildcard = 1;
		} else if (strchr(pcre_specials, c) != NULL) {
			/* escape PCRE special chars, avoid double backslash if the
			 * user uses an invalid escape sequence */
			if (!escaped)
				*ppat++ = '\\';
			*ppat++ = c;
			hasWildcard = 1;
			escaped = 0;
		} else if (c == '%' && !escaped) {
			*ppat++ = '.';
			*ppat++ = '*';
			*ppat++ = '?';
			hasWildcard = 1;
			/* collapse multiple %, but only if it isn't the escape */
			if (esc != '%')
				while (*pat == '%')
					pat++;
		} else if (c == '_' && !escaped) {
			*ppat++ = '.';
			hasWildcard = 1;
		} else {
			/* ordinary character, or an escaped '%' / '_' */
			if (escaped) {
				ppat[-1] = c; /* overwrite backslash of invalid escape */
			} else {
				*ppat++ = c;
			}
			escaped = 0;
		}
	}
	/* no wildcard or escape character at end of string */
	if (!hasWildcard || escaped) {
		GDKfree(*r);
		*r = NULL;
		if (escaped)
			throw(MAL, "pcre.sql2pcre", OPERATION_FAILED);
		/* plain string: signal with str_nil that no pattern is needed */
		*r = GDKstrdup(str_nil);
		if (*r == NULL)
			throw(MAL, "pcre.sql2pcre", SQLSTATE(HY001) MAL_MALLOC_FAIL);
	} else {
		*ppat++ = '$';
		*ppat = 0;
	}
	return MAL_SUCCEED;
}
1414
#ifdef HAVE_LIBPCRE
/* change SQL PATINDEX pattern into PCRE pattern
 *
 * pat  the PATINDEX pattern
 * r    receives a GDKmalloc'ed, NUL-terminated pattern that the caller
 *      must free
 *
 * Characters listed in pcre_specials are prefixed with a backslash and
 * '_' becomes ".".  The first '%' in the pattern and a '%' that is the
 * last character produce no output; every other '%' becomes ".*".  No
 * anchors are added.
 *
 * Returns MAL_SUCCEED, or an exception when memory allocation fails. */
static str
pat2pcre(str *r, const char *pat)
{
	size_t len = strlen(pat);
	/* every input byte yields at most two output bytes; the extra
	 * bytes cover the terminating NUL */
	char *ppat = GDKmalloc(len*2+3);
	int start = 0;		/* number of '%' characters seen so far */

	if (ppat == NULL)
		throw(MAL, "pcre.patindex", SQLSTATE(HY001) MAL_MALLOC_FAIL);
	*r = ppat;
	while (*pat) {
		int c = *pat++;

		if (strchr(pcre_specials, c) != NULL) {
			*ppat++ = '\\';
			*ppat++ = c;
		} else if (c == '%') {
			/* NOTE(review): a first '%' that is not at the very
			 * start of the pattern is dropped as well; confirm
			 * that this is intended */
			if (start && *pat) {
				*ppat++ = '.';
				*ppat++ = '*';
			}
			start++;
		} else if (c == '_') {
			*ppat++ = '.';
		} else {
			*ppat++ = c;
		}
	}
	*ppat = 0;
	return MAL_SUCCEED;
}
#endif
1449
1450/*
1451 * @+ Wrapping
1452 */
1453#include "mal.h"
1454str
1455PCREreplace_wrap(str *res, const str *or, const str *pat, const str *repl, const str *flags)
1456{
1457 return pcre_replace(res, *or, *pat, *repl, *flags, true);
1458}
1459
1460str
1461PCREreplace_bat_wrap(bat *res, const bat *bid, const str *pat, const str *repl, const str *flags)
1462{
1463 BAT *b, *bn = NULL;
1464 str msg;
1465 if ((b = BATdescriptor(*bid)) == NULL)
1466 throw(MAL, "batpcre.replace", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1467
1468 msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, true);
1469 if (msg == MAL_SUCCEED) {
1470 *res = bn->batCacheid;
1471 BBPkeepref(*res);
1472 }
1473 BBPunfix(b->batCacheid);
1474 return msg;
1475}
1476
1477str
1478PCREreplacefirst_wrap(str *res, const str *or, const str *pat, const str *repl, const str *flags)
1479{
1480 return pcre_replace(res, *or, *pat, *repl, *flags, false);
1481}
1482
1483str
1484PCREreplacefirst_bat_wrap(bat *res, const bat *bid, const str *pat, const str *repl, const str *flags)
1485{
1486 BAT *b,*bn = NULL;
1487 str msg;
1488 if ((b = BATdescriptor(*bid)) == NULL)
1489 throw(MAL, "batpcre.replace_first", RUNTIME_OBJECT_MISSING);
1490
1491 msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, false);
1492 if (msg == MAL_SUCCEED) {
1493 *res = bn->batCacheid;
1494 BBPkeepref(*res);
1495 }
1496 BBPunfix(b->batCacheid);
1497 return msg;
1498}
1499
1500str
1501PCREmatch(bit *ret, const str *val, const str *pat)
1502{
1503 return pcre_match_with_flags(ret, *val, *pat,
1504#ifdef HAVE_LIBPCRE
1505 "s"
1506#else
1507 "x"
1508#endif
1509 );
1510}
1511
1512str
1513PCREimatch(bit *ret, const str *val, const str *pat)
1514{
1515 return pcre_match_with_flags(ret, *val, *pat, "i"
1516#ifndef HAVE_LIBPCRE
1517 "x"
1518#endif
1519 );
1520}
1521
1522str
1523PCREindex(int *res, const pcre *pattern, const str *s)
1524{
1525#ifdef HAVE_LIBPCRE
1526 int v[3];
1527
1528 v[0] = v[1] = *res = 0;
1529 if (pcre_exec(pattern, NULL, *s, (int) strlen(*s), 0, 0, v, 3) >= 0) {
1530 *res = v[1];
1531 }
1532 return MAL_SUCCEED;
1533#else
1534 (void) res;
1535 (void) pattern;
1536 (void) s;
1537 throw(MAL, "pcre.index", "Database was compiled without PCRE support.");
1538#endif
1539}
1540
1541
1542str
1543PCREpatindex(int *ret, const str *pat, const str *val)
1544{
1545#ifdef HAVE_LIBPCRE
1546 pcre *re = NULL;
1547 char *ppat = NULL, *msg;
1548
1549 if ((msg = pat2pcre(&ppat, *pat)) != MAL_SUCCEED)
1550 return msg;
1551 if ((msg = pcre_compile_wrap(&re, ppat, FALSE)) != MAL_SUCCEED) {
1552 GDKfree(ppat);
1553 return msg;
1554 }
1555 GDKfree(ppat);
1556 msg = PCREindex(ret, re, val);
1557 pcre_free(re);
1558 return msg;
1559#else
1560 (void) ret;
1561 (void) pat;
1562 (void) val;
1563 throw(MAL, "pcre.patindex", "Database was compiled without PCRE support.");
1564#endif
1565}
1566
1567str
1568PCREquote(str *ret, const str *val)
1569{
1570 char *p;
1571 const char *s = *val;
1572
1573 *ret = p = GDKmalloc(strlen(s) * 2 + 1); /* certainly long enough */
1574 if (p == NULL)
1575 throw(MAL, "pcre.quote", SQLSTATE(HY001) MAL_MALLOC_FAIL);
1576 /* quote all non-alphanumeric ASCII characters (i.e. leave
1577 non-ASCII and alphanumeric alone) */
1578 while (*s) {
1579 if (!((*s & 0x80) != 0 ||
1580 ('a' <= *s && *s <= 'z') ||
1581 ('A' <= *s && *s <= 'Z') ||
1582 isdigit((unsigned char) *s)))
1583 *p++ = '\\';
1584 *p++ = *s++;
1585 }
1586 *p = 0;
1587 return MAL_SUCCEED;
1588}
1589
1590
1591str
1592PCREsql2pcre(str *ret, const str *pat, const str *esc)
1593{
1594 return sql2pcre(ret, *pat, *esc);
1595}
1596
1597static str
1598PCRElike4(bit *ret, const str *s, const str *pat, const str *esc, const bit *isens)
1599{
1600 char *ppat = NULL;
1601 str r = sql2pcre(&ppat, *pat, *esc);
1602
1603 if (!r) {
1604 assert(ppat);
1605 if (strcmp(ppat, str_nil) == 0) {
1606 *ret = FALSE;
1607 if (*isens) {
1608 if (mystrcasecmp(*s, *pat) == 0)
1609 *ret = TRUE;
1610 } else {
1611 if (strcmp(*s, *pat) == 0)
1612 *ret = TRUE;
1613 }
1614 } else {
1615 if (*isens) {
1616 r = PCREimatch(ret, s, &ppat);
1617 } else {
1618 r = PCREmatch(ret, s, &ppat);
1619 }
1620 }
1621 }
1622 if (ppat)
1623 GDKfree(ppat);
1624 return r;
1625}
1626
1627str
1628PCRElike3(bit *ret, const str *s, const str *pat, const str *esc)
1629{
1630 bit no = FALSE;
1631
1632 return PCRElike4(ret, s, pat, esc, &no);
1633}
1634
1635str
1636PCRElike2(bit *ret, const str *s, const str *pat)
1637{
1638 char *esc = "";
1639
1640 return PCRElike3(ret, s, pat, &esc);
1641}
1642
1643str
1644PCREnotlike3(bit *ret, const str *s, const str *pat, const str *esc)
1645{
1646 str tmp;
1647 bit r;
1648
1649 rethrow("str.not_like", tmp, PCRElike3(&r, s, pat, esc));
1650 *ret = !r;
1651 return MAL_SUCCEED;
1652}
1653
1654str
1655PCREnotlike2(bit *ret, const str *s, const str *pat)
1656{
1657 str tmp;
1658 bit r;
1659
1660 rethrow("str.not_like", tmp, PCRElike2(&r, s, pat));
1661 *ret = !r;
1662 return MAL_SUCCEED;
1663}
1664
1665str
1666PCREilike3(bit *ret, const str *s, const str *pat, const str *esc)
1667{
1668 bit yes = TRUE;
1669
1670 return PCRElike4(ret, s, pat, esc, &yes);
1671}
1672
1673str
1674PCREilike2(bit *ret, const str *s, const str *pat)
1675{
1676 char *esc = "\\";
1677
1678 return PCREilike3(ret, s, pat, &esc);
1679}
1680
1681str
1682PCREnotilike3(bit *ret, const str *s, const str *pat, const str *esc)
1683{
1684 str tmp;
1685 bit r;
1686
1687 rethrow("str.not_ilike", tmp, PCREilike3(&r, s, pat, esc));
1688 *ret = !r;
1689 return MAL_SUCCEED;
1690}
1691
1692str
1693PCREnotilike2(bit *ret, const str *s, const str *pat)
1694{
1695 str tmp;
1696 bit r;
1697
1698 rethrow("str.not_ilike", tmp, PCREilike2(&r, s, pat));
1699 *ret = !r;
1700 return MAL_SUCCEED;
1701}
1702
1703static str
1704BATPCRElike3(bat *ret, const bat *bid, const str *pat, const str *esc, const bit *isens, const bit *not)
1705{
1706 char *ppat = NULL;
1707 str res = sql2pcre(&ppat, *pat, *esc);
1708
1709 if (res == MAL_SUCCEED) {
1710 BAT *strs = BATdescriptor(*bid);
1711 BATiter strsi;
1712 BAT *r;
1713 bit *br;
1714 BUN p, q, i = 0;
1715
1716 if (strs == NULL) {
1717 GDKfree(ppat);
1718 throw(MAL, "batstr.like", OPERATION_FAILED);
1719 }
1720
1721 r = COLnew(strs->hseqbase, TYPE_bit, BATcount(strs), TRANSIENT);
1722 if (r==NULL) {
1723 GDKfree(ppat);
1724 BBPunfix(strs->batCacheid);
1725 throw(MAL, "pcre.like3", SQLSTATE(HY001) MAL_MALLOC_FAIL);
1726 }
1727 br = (bit*)Tloc(r, 0);
1728 strsi = bat_iterator(strs);
1729
1730 if (strcmp(ppat, str_nil) == 0) {
1731 BATloop(strs, p, q) {
1732 const char *s = (str)BUNtvar(strsi, p);
1733
1734 if (strcmp(s, *pat) == 0)
1735 br[i] = TRUE;
1736 else
1737 br[i] = FALSE;
1738 if (*not)
1739 br[i] = !br[i];
1740 i++;
1741 }
1742 } else {
1743 int pos;
1744#ifdef HAVE_LIBPCRE
1745 const char *err_p = NULL;
1746 int errpos = 0;
1747 int options = PCRE_UTF8 | PCRE_DOTALL;
1748 pcre *re;
1749#else
1750 regex_t re;
1751 int options = REG_NEWLINE | REG_NOSUB | REG_EXTENDED;
1752 int errcode;
1753#endif
1754
1755 if (*isens) {
1756#ifdef HAVE_LIBPCRE
1757 options |= PCRE_CASELESS;
1758#else
1759 options |= REG_ICASE;
1760#endif
1761 }
1762 if (
1763#ifdef HAVE_LIBPCRE
1764 (re = pcre_compile(ppat, options, &err_p, &errpos, NULL)) == NULL
1765#else
1766 (errcode = regcomp(&re, ppat, options)) != 0
1767#endif
1768 ) {
1769 BBPunfix(strs->batCacheid);
1770 BBPunfix(r->batCacheid);
1771 res = createException(MAL, "pcre.match", OPERATION_FAILED
1772 ": compilation of regular expression (%s) failed"
1773#ifdef HAVE_LIBPCRE
1774 " at %d with '%s'", ppat, errpos, err_p
1775#else
1776 , ppat
1777#endif
1778 );
1779 GDKfree(ppat);
1780 return res;
1781 }
1782
1783 BATloop(strs, p, q) {
1784 const char *s = (str)BUNtvar(strsi, p);
1785
1786 if (*s == '\200') {
1787 br[i] = bit_nil;
1788 r->tnonil = false;
1789 r->tnil = true;
1790 } else {
1791#ifdef HAVE_LIBPCRE
1792 pos = pcre_exec(re, NULL, s, (int) strlen(s), 0, 0, NULL, 0);
1793#else
1794 int retval = regexec(&re, s, (size_t) 0, NULL, 0);
1795 pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0);
1796#endif
1797 if (pos >= 0)
1798 br[i] = *not? FALSE:TRUE;
1799 else if (pos == -1)
1800 br[i] = *not? TRUE: FALSE;
1801 else {
1802 BBPunfix(strs->batCacheid);
1803 BBPunfix(r->batCacheid);
1804 res = createException(MAL, "pcre.match", OPERATION_FAILED
1805 ": matching of regular expression (%s) failed with %d", ppat, pos);
1806 GDKfree(ppat);
1807 return res;
1808 }
1809 }
1810 i++;
1811 }
1812#ifdef HAVE_LIBPCRE
1813 pcre_free(re);
1814#else
1815 regfree(&re);
1816#endif
1817 }
1818 BATsetcount(r, i);
1819 r->tsorted = false;
1820 r->trevsorted = false;
1821 BATkey(r, false);
1822
1823 BBPkeepref(*ret = r->batCacheid);
1824 BBPunfix(strs->batCacheid);
1825 GDKfree(ppat);
1826 }
1827 return res;
1828}
1829
1830str
1831BATPCRElike(bat *ret, const bat *bid, const str *pat, const str *esc)
1832{
1833 bit no = FALSE;
1834
1835 return BATPCRElike3(ret, bid, pat, esc, &no, &no);
1836}
1837
1838str
1839BATPCRElike2(bat *ret, const bat *bid, const str *pat)
1840{
1841 char *esc = "\\";
1842
1843 return BATPCRElike(ret, bid, pat, &esc);
1844}
1845
1846str
1847BATPCREnotlike(bat *ret, const bat *bid, const str *pat, const str *esc)
1848{
1849 bit no = FALSE;
1850 bit yes = TRUE;
1851
1852 return BATPCRElike3(ret, bid, pat, esc, &no, &yes);
1853}
1854
1855str
1856BATPCREnotlike2(bat *ret, const bat *bid, const str *pat)
1857{
1858 char *esc = "\\";
1859
1860 return BATPCREnotlike(ret, bid, pat, &esc);
1861}
1862
1863str
1864BATPCREilike(bat *ret, const bat *bid, const str *pat, const str *esc)
1865{
1866 bit yes = TRUE;
1867 bit no = FALSE;
1868
1869 return BATPCRElike3(ret, bid, pat, esc, &yes, &no);
1870}
1871
1872str
1873BATPCREilike2(bat *ret, const bat *bid, const str *pat)
1874{
1875 char *esc = "\\";
1876
1877 return BATPCREilike(ret, bid, pat, &esc);
1878}
1879
1880str
1881BATPCREnotilike(bat *ret, const bat *bid, const str *pat, const str *esc)
1882{
1883 bit yes = TRUE;
1884
1885 return BATPCRElike3(ret, bid, pat, esc, &yes, &yes);
1886}
1887
1888str
1889BATPCREnotilike2(bat *ret, const bat *bid, const str *pat)
1890{
1891 char *esc = "\\";
1892
1893 return BATPCREnotilike(ret, bid, pat, &esc);
1894}
1895
1896str
1897PCRElikeselect2(bat *ret, const bat *bid, const bat *sid, const str *pat, const str *esc, const bit *caseignore, const bit *anti)
1898{
1899 BAT *b, *s = NULL, *bn = NULL;
1900 str res;
1901 char *ppat = NULL;
1902 bool use_re = false;
1903 bool use_strcmp = false;
1904
1905 if ((b = BATdescriptor(*bid)) == NULL) {
1906 throw(MAL, "algebra.likeselect", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1907 }
1908 if (sid && !is_bat_nil(*sid) && *sid && (s = BATdescriptor(*sid)) == NULL) {
1909 BBPunfix(b->batCacheid);
1910 throw(MAL, "algebra.likeselect", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1911 }
1912
1913 /* no escape, try if a simple list of keywords works */
1914 if (is_strcmpable(*pat, *esc)) {
1915 use_re = true;
1916 use_strcmp = true;
1917 } else if (re_simple(*pat, **esc == '\200' ? 0 : (unsigned char) **esc)) {
1918 use_re = true;
1919 } else {
1920 res = sql2pcre(&ppat, *pat, *esc);
1921 if (res != MAL_SUCCEED) {
1922 BBPunfix(b->batCacheid);
1923 if (s)
1924 BBPunfix(s->batCacheid);
1925 return res;
1926 }
1927 if (strcmp(ppat, str_nil) == 0) {
1928 GDKfree(ppat);
1929 ppat = NULL;
1930 if (*caseignore) {
1931 ppat = GDKmalloc(strlen(*pat) + 3);
1932 if (ppat == NULL) {
1933 BBPunfix(b->batCacheid);
1934 if (s)
1935 BBPunfix(s->batCacheid);
1936 throw(MAL, "algebra.likeselect", SQLSTATE(HY001) MAL_MALLOC_FAIL);
1937 }
1938 ppat[0] = '^';
1939 strcpy(ppat + 1, *pat);
1940 strcat(ppat, "$");
1941 }
1942 }
1943 }
1944
1945 if (use_re) {
1946 res = re_likeselect(&bn, b, s, *pat, (bool) *caseignore, (bool) *anti, use_strcmp, **esc == '\200' ? 0 : (unsigned char) **esc);
1947 } else if (ppat == NULL) {
1948 /* no pattern and no special characters: can use normal select */
1949 bn = BATselect(b, s, *pat, NULL, true, true, *anti);
1950 if (bn == NULL)
1951 res = createException(MAL, "algebra.likeselect", GDK_EXCEPTION);
1952 else
1953 res = MAL_SUCCEED;
1954 } else {
1955 res = pcre_likeselect(&bn, b, s, ppat, (bool) *caseignore, (bool) *anti);
1956 }
1957 BBPunfix(b->batCacheid);
1958 if (s)
1959 BBPunfix(s->batCacheid);
1960 GDKfree(ppat);
1961 if (res != MAL_SUCCEED)
1962 return res;
1963 assert(bn);
1964 *ret = bn->batCacheid;
1965 BBPkeepref(bn->batCacheid);
1966 return MAL_SUCCEED;
1967}
1968
1969str
1970PCRElikeselect1(bat *ret, const bat *bid, const bat *cid, const str *pat, const str *esc, const bit *anti)
1971{
1972 const bit f = TRUE;
1973 return PCRElikeselect2(ret, bid, cid, pat, esc, &f, anti);
1974}
1975
1976str
1977PCRElikeselect3(bat *ret, const bat *bid, const bat *sid, const str *pat, const str *esc, const bit *anti)
1978{
1979 const bit f = FALSE;
1980 return PCRElikeselect2(ret, bid, sid, pat, esc, &f, anti);
1981}
1982
1983str
1984PCRElikeselect4(bat *ret, const bat *bid, const bat *cid, const str *pat, const bit *anti)
1985{
1986 const bit f = TRUE;
1987 const str esc ="";
1988 return PCRElikeselect2(ret, bid, cid, pat, &esc, &f, anti);
1989}
1990
1991str
1992PCRElikeselect5(bat *ret, const bat *bid, const bat *sid, const str *pat, const bit *anti)
1993{
1994 const bit f = FALSE;
1995 const str esc ="";
1996 return PCRElikeselect2(ret, bid, sid, pat, &esc, &f, anti);
1997}
1998
/* APPEND stores oid o in the next free slot of b's heap and bumps
 * b->batCount; it does not check the capacity, so the caller has to
 * extend the BAT beforehand. */
#define APPEND(b, o)	(((oid *) b->theap.base)[b->batCount++] = (o))
/* VALUE yields the string at position x of BAT s; it relies on the
 * local variables <s>vars, <s>vals and <s>width set up in pcrejoin(). */
#define VALUE(s, x)	(s##vars + VarHeapVal(s##vals, (x), s##width))

/*
 * Nested-loop LIKE join.  Every non-nil string of r (restricted by
 * candidate list sr, if given) is used as a SQL LIKE pattern and is
 * matched against every non-nil string of l (restricted by sl, if
 * given).  For each match the oid of the l value is appended to r1 and
 * the oid of the r value to r2.
 *
 * esc         escape string for the patterns (may be NULL; a first byte
 *             of '\200' counts as "no escape character")
 * caseignore  when set, matching ignores case
 *
 * Per pattern one of three matchers is prepared:
 *  - an RE from re_create() when re_simple() accepts the pattern;
 *  - a compiled regular expression (pcre or regex.h) made from the
 *    sql2pcre() translation, or from "^pattern$" when the pattern needs
 *    no translation but case is to be ignored;
 *  - plain strcmp() when the pattern needs no translation and case
 *    matters.
 *
 * The sortedness, key and denseness properties of r1 and r2 are kept up
 * to date while appending.
 *
 * Returns MAL_SUCCEED or an exception; on failure all matcher resources
 * are released (r1 and r2 are left to the caller).
 */
static char *
pcrejoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr,
	 const char *esc, bool caseignore)
{
	struct canditer lci, rci;
	const char *lvals, *rvals;
	const char *lvars, *rvars;
	int lwidth, rwidth;
	const char *vl, *vr;
	oid lastl = 0;		/* last value inserted into r1 */
	BUN nl;
	BUN newcap;
	oid lo, ro;
	int rskipped = 0;	/* whether we skipped values in r */
	char *msg = MAL_SUCCEED;
	RE *re = NULL;
	char *pcrepat = NULL;
#ifdef HAVE_LIBPCRE
	pcre *pcrere = NULL;
	pcre_extra *pcreex = NULL;
	const char *err_p = NULL;
	int errpos;
	int pcreopt = PCRE_UTF8 | PCRE_MULTILINE;
	/* only request JIT compilation when there are enough left-hand
	 * values to match against */
	int pcrestopt = (sl ? BATcount(sl) : BATcount(l)) > JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0;
#else
	int pcrere = 0;		/* whether regex currently holds a compiled pattern */
	regex_t regex;
	int options = REG_NEWLINE | REG_NOSUB | REG_EXTENDED;
	int errcode = -1;
#endif


	if (caseignore)
#ifdef HAVE_LIBPCRE
		pcreopt |= PCRE_CASELESS;
#else
		options |= REG_ICASE;
#endif

	ALGODEBUG fprintf(stderr, "#pcrejoin(l=%s#" BUNFMT "[%s]%s%s,"
			  "r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s,"
			  "sr=%s#" BUNFMT "%s%s)\n",
			  BATgetId(l), BATcount(l), ATOMname(l->ttype),
			  l->tsorted ? "-sorted" : "",
			  l->trevsorted ? "-revsorted" : "",
			  BATgetId(r), BATcount(r), ATOMname(r->ttype),
			  r->tsorted ? "-sorted" : "",
			  r->trevsorted ? "-revsorted" : "",
			  sl ? BATgetId(sl) : "NULL", sl ? BATcount(sl) : 0,
			  sl && sl->tsorted ? "-sorted" : "",
			  sl && sl->trevsorted ? "-revsorted" : "",
			  sr ? BATgetId(sr) : "NULL", sr ? BATcount(sr) : 0,
			  sr && sr->tsorted ? "-sorted" : "",
			  sr && sr->trevsorted ? "-revsorted" : "");

	assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
	assert(ATOMtype(l->ttype) == TYPE_str);
	assert(sl == NULL || sl->tsorted);
	assert(sr == NULL || sr->tsorted);

	canditer_init(&lci, l, sl);
	canditer_init(&rci, r, sr);

	/* set up the variables that the VALUE macro needs */
	lvals = (const char *) Tloc(l, 0);
	rvals = (const char *) Tloc(r, 0);
	assert(r->tvarsized && r->ttype);
	lvars = l->tvheap->base;
	rvars = r->tvheap->base;
	lwidth = l->twidth;
	rwidth = r->twidth;

	/* start out optimistic; the properties are withdrawn below as soon
	 * as an appended value contradicts them */
	r1->tkey = true;
	r1->tsorted = true;
	r1->trevsorted = true;
	r2->tkey = true;
	r2->tsorted = true;
	r2->trevsorted = true;

	/* nested loop implementation for PCRE join */
	for (BUN ri = 0; ri < rci.ncand; ri++) {
		ro = canditer_next(&rci);
		vr = VALUE(r, ro - r->hseqbase);
		if (strcmp(vr, str_nil) == 0)
			continue;
		/* prepare the matcher for this pattern */
		if (re_simple(vr, esc && *esc != '\200' ? (unsigned char) *esc : 0)) {
			re = re_create(vr, caseignore, esc && *esc != '\200' ? (unsigned char) *esc : 0);
			if (re == NULL) {
				msg = createException(MAL, "pcre.join", SQLSTATE(HY001) MAL_MALLOC_FAIL);
				goto bailout;
			}
		} else {
			assert(pcrepat == NULL);
			msg = sql2pcre(&pcrepat, vr, esc);
			if (msg != MAL_SUCCEED)
				goto bailout;
			if (strcmp(pcrepat, str_nil) == 0) {
				/* pattern needs no translation */
				GDKfree(pcrepat);
				if (caseignore) {
					pcrepat = GDKmalloc(strlen(vr) + 3);
					if (pcrepat == NULL) {
						msg = createException(MAL, "pcre.join", SQLSTATE(HY001) MAL_MALLOC_FAIL);
						goto bailout;
					}
					sprintf(pcrepat, "^%s$", vr);
				} else {
					/* a simple strcmp suffices */
					pcrepat = NULL;
				}
			}
			if (pcrepat) {
#ifdef HAVE_LIBPCRE
				pcrere = pcre_compile(pcrepat, pcreopt, &err_p, &errpos, NULL);
				if (pcrere == NULL) {
					msg = createException(MAL, "pcre.join", OPERATION_FAILED
							      ": pcre compile of pattern (%s) "
							      "failed at %d with '%s'",
							      pcrepat, errpos, err_p);
					goto bailout;
				}
				pcreex = pcre_study(pcrere, pcrestopt, &err_p);
				if (err_p != NULL) {
					msg = createException(MAL, "pcre.join", OPERATION_FAILED
							      ": pcre study of pattern (%s) "
							      "failed with '%s'", pcrepat, err_p);
					goto bailout;
				}
#else
				if ((errcode = regcomp(&regex, pcrepat, options)) != 0) {
					msg = createException(MAL, "pcre.join", OPERATION_FAILED
							      ": pcre compile of pattern (%s)",
							      pcrepat);
					goto bailout;
				}
				pcrere = 1;
#endif
				/* the pattern text is not needed once compiled */
				GDKfree(pcrepat);
				pcrepat = NULL;
			}
		}
		nl = 0;		/* number of matches for this pattern */
		canditer_reset(&lci);
		for (BUN li = 0; li < lci.ncand; li++) {
			lo = canditer_next(&lci);
			vl = VALUE(l, lo - l->hseqbase);
			if (strcmp(vl, str_nil) == 0)
				continue;
			/* skip to the next value unless vl matches */
			if (re) {
				if (caseignore) {
					if (!re_match_ignore(vl, re))
						continue;
				} else {
					if (!re_match_no_ignore(vl, re))
						continue;
				}
			} else if (pcrere) {
#ifdef HAVE_LIBPCRE
				if (pcre_exec(pcrere, pcreex, vl, (int) strlen(vl), 0, 0, NULL, 0) < 0)
					continue;
#else
				int retval = regexec(&regex, vl, (size_t) 0, NULL, 0);
				if (retval == REG_NOMATCH || retval == REG_ENOSYS)
					continue;
#endif
			} else {
				if (strcmp(vl, vr) != 0)
					continue;
			}
			/* grow both result BATs in step when r1 is full */
			if (BUNlast(r1) == BATcapacity(r1)) {
				newcap = BATgrows(r1);
				BATsetcount(r1, BATcount(r1));
				BATsetcount(r2, BATcount(r2));
				if (BATextend(r1, newcap) != GDK_SUCCEED ||
				    BATextend(r2, newcap) != GDK_SUCCEED) {
					msg = createException(MAL, "pcre.join", SQLSTATE(HY001) MAL_MALLOC_FAIL);
					goto bailout;
				}
				assert(BATcapacity(r1) == BATcapacity(r2));
			}
			/* update the properties in the light of the value
			 * that is about to be appended */
			if (BATcount(r1) > 0) {
				if (lastl + 1 != lo)
					r1->tseqbase = oid_nil;
				if (nl == 0) {
					/* first match for a new pattern */
					r2->trevsorted = false;
					if (lastl > lo) {
						r1->tsorted = false;
						r1->tkey = false;
					} else if (lastl < lo) {
						r1->trevsorted = false;
					} else {
						r1->tkey = false;
					}
				}
			}
			APPEND(r1, lo);
			APPEND(r2, ro);
			lastl = lo;
			nl++;
		}
		/* done with this pattern: release its matcher */
		if (re) {
			re_destroy(re);
			re = NULL;
		}
		if (pcrere) {
#ifdef HAVE_LIBPCRE
			pcre_free_study(pcreex);
			pcre_free(pcrere);
			pcrere = NULL;
			pcreex = NULL;
#else
			regfree(&regex);
			pcrere = 0;
#endif
		}
		if (nl > 1) {
			/* the same r oid was appended more than once */
			r2->tkey = false;
			r2->tseqbase = oid_nil;
			r1->trevsorted = false;
		} else if (nl == 0) {
			rskipped = BATcount(r2) > 0;
		} else if (rskipped) {
			r2->tseqbase = oid_nil;
		}
	}
	assert(BATcount(r1) == BATcount(r2));
	/* also set other bits of heap to correct value to indicate size */
	BATsetcount(r1, BATcount(r1));
	BATsetcount(r2, BATcount(r2));
	if (BATcount(r1) > 0) {
		if (BATtdense(r1))
			r1->tseqbase = ((oid *) r1->theap.base)[0];
		if (BATtdense(r2))
			r2->tseqbase = ((oid *) r2->theap.base)[0];
	} else {
		r1->tseqbase = r2->tseqbase = 0;
	}
	ALGODEBUG fprintf(stderr, "#pcrejoin(l=%s,r=%s)=(%s#"BUNFMT"%s%s,%s#"BUNFMT"%s%s\n",
			  BATgetId(l), BATgetId(r),
			  BATgetId(r1), BATcount(r1),
			  r1->tsorted ? "-sorted" : "",
			  r1->trevsorted ? "-revsorted" : "",
			  BATgetId(r2), BATcount(r2),
			  r2->tsorted ? "-sorted" : "",
			  r2->trevsorted ? "-revsorted" : "");
	return MAL_SUCCEED;

  bailout:
	/* release whatever matcher state is still alive */
	if (re)
		re_destroy(re);
	if (pcrepat)
		GDKfree(pcrepat);
#ifdef HAVE_LIBPCRE
	if (pcreex)
		pcre_free_study(pcreex);
	if (pcrere)
		pcre_free(pcrere);
#else
	if (pcrere)
		regfree(&regex);
#endif

	assert(msg != MAL_SUCCEED);
	return msg;
}
2265
2266static str
2267PCREjoin(bat *r1, bat *r2, bat lid, bat rid, bat slid, bat srid,
2268 const char *esc, bool caseignore)
2269{
2270 BAT *left = NULL, *right = NULL, *candleft = NULL, *candright = NULL;
2271 BAT *result1 = NULL, *result2 = NULL;
2272 char *msg = MAL_SUCCEED;
2273
2274 if ((left = BATdescriptor(lid)) == NULL)
2275 goto fail;
2276 if ((right = BATdescriptor(rid)) == NULL)
2277 goto fail;
2278 if (!is_bat_nil(slid) && (candleft = BATdescriptor(slid)) == NULL)
2279 goto fail;
2280 if (!is_bat_nil(srid) && (candright = BATdescriptor(srid)) == NULL)
2281 goto fail;
2282 result1 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT);
2283 result2 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT);
2284 if (result1 == NULL || result2 == NULL) {
2285 msg = createException(MAL, "pcre.join", SQLSTATE(HY001) MAL_MALLOC_FAIL);
2286 goto fail;
2287 }
2288 result1->tnil = false;
2289 result1->tnonil = true;
2290 result1->tkey = true;
2291 result1->tsorted = true;
2292 result1->trevsorted = true;
2293 result1->tseqbase = 0;
2294 result2->tnil = false;
2295 result2->tnonil = true;
2296 result2->tkey = true;
2297 result2->tsorted = true;
2298 result2->trevsorted = true;
2299 result2->tseqbase = 0;
2300 msg = pcrejoin(result1, result2, left, right, candleft, candright,
2301 esc, caseignore);
2302 if (msg)
2303 goto fail;
2304 *r1 = result1->batCacheid;
2305 *r2 = result2->batCacheid;
2306 BBPkeepref(*r1);
2307 BBPkeepref(*r2);
2308 BBPunfix(left->batCacheid);
2309 BBPunfix(right->batCacheid);
2310 if (candleft)
2311 BBPunfix(candleft->batCacheid);
2312 if (candright)
2313 BBPunfix(candright->batCacheid);
2314 return MAL_SUCCEED;
2315
2316 fail:
2317 if (left)
2318 BBPunfix(left->batCacheid);
2319 if (right)
2320 BBPunfix(right->batCacheid);
2321 if (candleft)
2322 BBPunfix(candleft->batCacheid);
2323 if (candright)
2324 BBPunfix(candright->batCacheid);
2325 if (result1)
2326 BBPunfix(result1->batCacheid);
2327 if (result2)
2328 BBPunfix(result2->batCacheid);
2329 if (msg)
2330 return msg;
2331 throw(MAL, "pcre.join", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
2332}
2333
2334str
2335LIKEjoin(bat *r1, bat *r2, const bat *lid, const bat *rid, const str *esc, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate)
2336{
2337 (void) nil_matches;
2338 (void) estimate;
2339 return PCREjoin(r1, r2, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0, *esc, 0);
2340}
2341
2342str
2343LIKEjoin1(bat *r1, bat *r2, const bat *lid, const bat *rid, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate)
2344{
2345 (void) nil_matches;
2346 (void) estimate;
2347 return PCREjoin(r1, r2, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0, "", 0);
2348}
2349
2350str
2351ILIKEjoin(bat *r1, bat *r2, const bat *lid, const bat *rid, const str *esc, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate)
2352{
2353 (void) nil_matches;
2354 (void) estimate;
2355 return PCREjoin(r1, r2, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0, *esc, 1);
2356}
2357
2358str
2359ILIKEjoin1(bat *r1, bat *r2, const bat *lid, const bat *rid, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate)
2360{
2361 (void) nil_matches;
2362 (void) estimate;
2363 return PCREjoin(r1, r2, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0, "", 1);
2364}
2365