1 | /* |
2 | * This Source Code Form is subject to the terms of the Mozilla Public |
3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
5 | * |
6 | * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V. |
7 | */ |
8 | |
9 | /* |
10 | * N. Nes |
11 | * PCRE library interface |
12 | * The PCRE library is a set of functions that implement regular |
13 | * expression pattern matching using the same syntax and semantics as Perl, |
14 | * with just a few differences. The current implementation of PCRE |
15 | * (release 4.x) corresponds approximately with Perl 5.8, including support |
16 | * for UTF-8 encoded strings. However, this support has to be |
17 | * explicitly enabled; it is not the default. |
18 | * |
19 | * ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre |
20 | */ |
21 | #include "monetdb_config.h" |
22 | #include <string.h> |
23 | |
24 | #include "mal.h" |
25 | #include "mal_exception.h" |
26 | |
27 | #include <wchar.h> |
28 | #include <wctype.h> |
29 | |
30 | #ifdef HAVE_LIBPCRE |
31 | #include <pcre.h> |
32 | #ifndef PCRE_STUDY_JIT_COMPILE |
33 | /* old library version on e.g. EPEL 6 */ |
34 | #define pcre_free_study(x) pcre_free(x) |
35 | #define PCRE_STUDY_JIT_COMPILE 0 |
36 | #endif |
37 | #define JIT_COMPILE_MIN 1024 /* when to try JIT compilation of patterns */ |
38 | |
39 | #else |
40 | |
41 | #include <regex.h> |
42 | |
43 | typedef regex_t pcre; |
44 | #endif |
45 | |
46 | mal_export str pcre_init(void *ret); |
47 | |
48 | mal_export str PCREquote(str *r, const str *v); |
49 | mal_export str PCREmatch(bit *ret, const str *val, const str *pat); |
50 | mal_export str PCREimatch(bit *ret, const str *val, const str *pat); |
51 | mal_export str PCREindex(int *ret, const pcre *pat, const str *val); |
52 | mal_export str PCREpatindex(int *ret, const str *pat, const str *val); |
53 | mal_export str PCREreplace_wrap(str *res, const str *or, const str *pat, const str *repl, const str *flags); |
54 | mal_export str PCREreplace_bat_wrap(bat *res, const bat *or, const str *pat, const str *repl, const str *flags); |
55 | mal_export str PCREreplacefirst_wrap(str *res, const str *or, const str *pat, const str *repl, const str *flags); |
56 | mal_export str PCREreplacefirst_bat_wrap(bat *res, const bat *or, const str *pat, const str *repl, const str *flags); |
57 | mal_export str PCREsql2pcre(str *ret, const str *pat, const str *esc); |
58 | |
59 | mal_export str PCRElike3(bit *ret, const str *s, const str *pat, const str *esc); |
60 | mal_export str PCRElike2(bit *ret, const str *s, const str *pat); |
61 | mal_export str PCREnotlike3(bit *ret, const str *s, const str *pat, const str *esc); |
62 | mal_export str PCREnotlike2(bit *ret, const str *s, const str *pat); |
63 | mal_export str BATPCRElike(bat *ret, const bat *b, const str *pat, const str *esc); |
64 | mal_export str BATPCRElike2(bat *ret, const bat *b, const str *pat); |
65 | mal_export str BATPCREnotlike(bat *ret, const bat *b, const str *pat, const str *esc); |
66 | mal_export str BATPCREnotlike2(bat *ret, const bat *b, const str *pat); |
67 | mal_export str PCREilike3(bit *ret, const str *s, const str *pat, const str *esc); |
68 | mal_export str PCREilike2(bit *ret, const str *s, const str *pat); |
69 | mal_export str PCREnotilike3(bit *ret, const str *s, const str *pat, const str *esc); |
70 | mal_export str PCREnotilike2(bit *ret, const str *s, const str *pat); |
71 | mal_export str BATPCREilike(bat *ret, const bat *b, const str *pat, const str *esc); |
72 | mal_export str BATPCREilike2(bat *ret, const bat *b, const str *pat); |
73 | mal_export str BATPCREnotilike(bat *ret, const bat *b, const str *pat, const str *esc); |
74 | mal_export str BATPCREnotilike2(bat *ret, const bat *b, const str *pat); |
75 | |
76 | mal_export str PCRElikeselect2(bat *ret, const bat *bid, const bat *sid, const str *pat, const str *esc, const bit *caseignore, const bit *anti); |
77 | mal_export str PCRElikeselect1(bat *ret, const bat *bid, const bat *cid, const str *pat, const str *esc, const bit *anti); |
78 | mal_export str PCRElikeselect3(bat *ret, const bat *bid, const bat *sid, const str *pat, const str *esc, const bit *anti); |
79 | mal_export str PCRElikeselect4(bat *ret, const bat *bid, const bat *cid, const str *pat, const bit *anti); |
80 | mal_export str PCRElikeselect5(bat *ret, const bat *bid, const bat *sid, const str *pat, const bit *anti); |
81 | |
82 | mal_export str LIKEjoin(bat *r1, bat *r2, const bat *lid, const bat *rid, const str *esc, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate); |
83 | mal_export str LIKEjoin1(bat *r1, bat *r2, const bat *lid, const bat *rid, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate); |
84 | mal_export str ILIKEjoin(bat *r1, bat *r2, const bat *lid, const bat *rid, const str *esc, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate); |
85 | mal_export str ILIKEjoin1(bat *r1, bat *r2, const bat *lid, const bat *rid, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate); |
86 | |
/* current implementation assumes simple %keyword% [keyw%]*
 *
 * One node describes one literal segment of a LIKE pattern; the
 * segments are separated by unescaped `%' in the pattern and chained
 * through n (see re_create). */
typedef struct RE {
	char *k;		/* segment as a byte string; used for case
				 * sensitive matching (strstr/strncmp) */
	uint32_t *w;		/* segment as a 0-terminated code point
				 * string; used for case insensitive
				 * matching */
	bool search;		/* true: the segment may start anywhere in
				 * the remaining input; false: it must match
				 * at the current position */
	size_t len;		/* length recorded by re_create for this
				 * segment; added to the input position
				 * after the segment matched */
	struct RE *n;		/* next segment, NULL at the end of the list */
} RE;
95 | |
96 | /* We cannot use strcasecmp and strncasecmp since they work byte for |
97 | * byte and don't deal with multibyte encodings (such as UTF-8). |
98 | * |
99 | * We implement our own conversion from UTF-8 encoding to Unicode code |
100 | * points which we store in uint32_t. The reason for this is, |
101 | * functions like mbsrtowcs are locale-dependent (so we need a UTF-8 |
102 | * locale to use them), and on Windows, wchar_t is only 2 bytes and |
103 | * therefore cannot hold all Unicode code points. We do use functions |
104 | * such as towlower to convert a Unicode code point to its lower-case |
105 | * equivalent, but again on Windows, if the code point doesn't fit in |
106 | * 2 bytes, we skip this conversion and compare the unconverted code |
107 | * points. |
108 | * |
109 | * Note, towlower is also locale-dependent, but we don't need a UTF-8 |
110 | * locale in order to use it. */ |
111 | |
/* helper function to convert a UTF-8 multibyte character to a wide
 * character
 *
 * Decodes the single UTF-8 sequence that starts at src and stores the
 * resulting code point in *dest.
 *
 * Returns the number of bytes consumed (1 to 4), 0 when src points at
 * the terminating NUL byte (in which case *dest is set to 0), or
 * (size_t) -1 when the bytes do not form a valid sequence (in which
 * case *dest is left untouched).  Overlong encodings, values above
 * U+10FFFF and the surrogate range U+D800..U+DFFF are invalid.
 *
 * Continuation bytes are tested strictly left to right, so the
 * function never reads past the NUL terminator of src. */
static size_t
utfc8touc(uint32_t *restrict dest, const char *restrict src)
{
	/* work on unsigned bytes so that masking and shifting do not
	 * depend on the signedness of plain char */
	const unsigned char *u = (const unsigned char *) src;
	uint32_t c;

	if (u[0] < 0x80) {
		/* ASCII, including the terminating NUL */
		*dest = u[0];
		return u[0] != 0;
	}
	if ((u[0] & 0xE0) == 0xC0) {
		/* two bytes; lead bytes 0xC0 and 0xC1 would be overlong */
		if ((u[1] & 0xC0) != 0x80 || (u[0] & 0x1E) == 0)
			return (size_t) -1;
		*dest = (uint32_t) (u[0] & 0x1F) << 6
			| (uint32_t) (u[1] & 0x3F);
		return 2;
	}
	if ((u[0] & 0xF0) == 0xE0) {
		if ((u[1] & 0xC0) != 0x80 || (u[2] & 0xC0) != 0x80)
			return (size_t) -1;
		c = (uint32_t) (u[0] & 0x0F) << 12
			| (uint32_t) (u[1] & 0x3F) << 6
			| (uint32_t) (u[2] & 0x3F);
		/* reject overlong forms and UTF-16 surrogates, neither
		 * of which may occur in well-formed UTF-8 */
		if (c < 0x800 || (c & 0xF800) == 0xD800)
			return (size_t) -1;
		*dest = c;
		return 3;
	}
	if ((u[0] & 0xF8) == 0xF0) {
		if ((u[1] & 0xC0) != 0x80
		    || (u[2] & 0xC0) != 0x80
		    || (u[3] & 0xC0) != 0x80)
			return (size_t) -1;
		c = (uint32_t) (u[0] & 0x07) << 18
			| (uint32_t) (u[1] & 0x3F) << 12
			| (uint32_t) (u[2] & 0x3F) << 6
			| (uint32_t) (u[3] & 0x3F);
		/* reject overlong forms and values beyond Unicode */
		if (c < 0x10000 || c > 0x10FFFF)
			return (size_t) -1;
		*dest = c;
		return 4;
	}
	/* stray continuation byte or invalid lead byte */
	return (size_t) -1;
}
152 | |
/* helper function to convert a UTF-8 string to a wide character
 * string, the wide character string is allocated
 *
 * Returns a 0-terminated array of code points allocated with
 * GDKmalloc (to be released with GDKfree), or NULL when src is not
 * well-formed UTF-8 or when the allocation fails.  Overlong
 * encodings, values above U+10FFFF and the surrogate range
 * U+D800..U+DFFF are treated as malformed. */
static uint32_t *
utf8stoucs(const char *src)
{
	/* unsigned bytes keep the bit tests independent of the
	 * signedness of plain char */
	const unsigned char *u = (const unsigned char *) src;
	uint32_t *dest;
	size_t ncp = 0;		/* number of code points in src */
	size_t j = 0;		/* byte position in src */

	/* count how many uint32_t's we need, while also checking for
	 * correctness of the input; the tests go left to right so we
	 * never look beyond the terminating NUL */
	while (u[j]) {
		if (u[j] < 0x80) {
			j += 1;
		} else if ((u[j] & 0xE0) == 0xC0
			   && (u[j + 1] & 0xC0) == 0x80
			   && (u[j] & 0x1E) != 0) {
			j += 2;
		} else if ((u[j] & 0xF0) == 0xE0
			   && (u[j + 1] & 0xC0) == 0x80
			   && (u[j + 2] & 0xC0) == 0x80
			   /* not overlong */
			   && ((u[j] & 0x0F) != 0
			       || (u[j + 1] & 0x20) != 0)
			   /* not a surrogate (0xED 0xA0..0xBF xx) */
			   && !(u[j] == 0xED && (u[j + 1] & 0x20) != 0)) {
			j += 3;
		} else if ((u[j] & 0xF8) == 0xF0
			   && (u[j + 1] & 0xC0) == 0x80
			   && (u[j + 2] & 0xC0) == 0x80
			   && (u[j + 3] & 0xC0) == 0x80) {
			uint32_t c = (uint32_t) (u[j] & 0x07) << 18
				| (uint32_t) (u[j + 1] & 0x3F) << 12
				| (uint32_t) (u[j + 2] & 0x3F) << 6
				| (uint32_t) (u[j + 3] & 0x3F);
			if (c < 0x10000 || c > 0x10FFFF)
				return NULL;
			j += 4;
		} else {
			return NULL;
		}
		ncp++;
	}
	dest = GDKmalloc((ncp + 1) * sizeof(uint32_t));
	if (dest == NULL)
		return NULL;
	/* go through the source string again, this time we can skip
	 * the correctness tests; iterating over the counted code
	 * points guarantees termination */
	j = 0;
	for (size_t i = 0; i < ncp; i++) {
		if (u[j] < 0x80) {
			dest[i] = u[j];
			j += 1;
		} else if ((u[j] & 0xE0) == 0xC0) {
			dest[i] = (uint32_t) (u[j] & 0x1F) << 6
				| (uint32_t) (u[j + 1] & 0x3F);
			j += 2;
		} else if ((u[j] & 0xF0) == 0xE0) {
			dest[i] = (uint32_t) (u[j] & 0x0F) << 12
				| (uint32_t) (u[j + 1] & 0x3F) << 6
				| (uint32_t) (u[j + 2] & 0x3F);
			j += 3;
		} else {
			/* validated above: must be a 4-byte sequence */
			dest[i] = (uint32_t) (u[j] & 0x07) << 18
				| (uint32_t) (u[j + 1] & 0x3F) << 12
				| (uint32_t) (u[j + 2] & 0x3F) << 6
				| (uint32_t) (u[j + 3] & 0x3F);
			j += 4;
		}
	}
	dest[ncp] = 0;
	return dest;
}
225 | |
/* Return the number of code points in the 0-terminated code point
 * string ucs, not counting the terminator. */
static size_t
myucslen(const uint32_t *ucs)
{
	const uint32_t *end = ucs;

	/* walk to the terminator and measure the distance */
	while (*end != 0)
		end++;
	return (size_t) (end - ucs);
}
235 | |
/* Case insensitive comparison of the UTF-8 string s1 with at most n2
 * code points of the code point string s2.
 *
 * Returns 0 when the compared parts are equal, a negative value when
 * s1 sorts before s2 and a positive value when it sorts after.  A
 * byte sequence in s1 that cannot be decoded is treated like the end
 * of s1. */
static int
mywstrncasecmp(const char *restrict s1, const uint32_t *restrict s2, size_t n2)
{
	uint32_t c1;

	while (n2 > 0) {
		size_t nn1 = utfc8touc(&c1, s1);
		/* end of s1 (or undecodable input): equal only if s2
		 * ends here as well */
		if (nn1 == 0 || nn1 == (size_t) -1)
			return -(*s2 != 0);
		if (*s2 == 0)
			return 1;
#if SIZEOF_WCHAR_T == 2
		/* code points that do not fit in wchar_t cannot be
		 * passed to towlower, compare them unconverted */
		if (c1 > 0xFFFF || *s2 > 0xFFFF) {
			if (c1 != *s2)
				return (int) (c1 - *s2);
		} else
#endif
		{
			wint_t l1 = towlower((wint_t) c1);
			wint_t l2 = towlower((wint_t) *s2);

			if (l1 != l2)
				return (int) (l1 - l2);
		}
		s1 += nn1;
		n2--;
		s2++;
	}
	return 0;
}
263 | |
/* Case insensitive comparison of two UTF-8 strings.
 *
 * Returns 0 when the strings are equal, a negative value when s1
 * sorts before s2 and a positive value when it sorts after.  A byte
 * sequence that cannot be decoded is treated like the end of the
 * string it occurs in. */
static int
mystrcasecmp(const char *s1, const char *s2)
{
	uint32_t c1, c2;

	for (;;) {
		size_t nn1 = utfc8touc(&c1, s1);
		size_t nn2 = utfc8touc(&c2, s2);
		bool end1 = nn1 == 0 || nn1 == (size_t) -1;
		bool end2 = nn2 == 0 || nn2 == (size_t) -1;

		if (end1)
			return end2 ? 0 : -1;
		if (end2)
			return 1;
#if SIZEOF_WCHAR_T == 2
		/* code points that do not fit in wchar_t cannot be
		 * passed to towlower, compare them unconverted */
		if (c1 > 0xFFFF || c2 > 0xFFFF) {
			if (c1 != c2)
				return (int) (c1 - c2);
		} else
#endif
		{
			wint_t l1 = towlower((wint_t) c1);
			wint_t l2 = towlower((wint_t) c2);

			if (l1 != l2)
				return (int) (l1 - l2);
		}
		s1 += nn1;
		s2 += nn2;
	}
}
291 | |
/* Case insensitive comparison of the UTF-8 string s1 with the
 * 0-terminated code point string s2.
 *
 * Returns 0 when the strings are equal, a negative value when s1
 * sorts before s2 and a positive value when it sorts after.  A byte
 * sequence in s1 that cannot be decoded is treated like the end of
 * s1. */
static int
mywstrcasecmp(const char *restrict s1, const uint32_t *restrict s2)
{
	uint32_t c1;

	for (;;) {
		size_t nn1 = utfc8touc(&c1, s1);
		/* end of s1 (or undecodable input): equal only if s2
		 * ends here as well */
		if (nn1 == 0 || nn1 == (size_t) -1)
			return -(*s2 != 0);
		if (*s2 == 0)
			return 1;
#if SIZEOF_WCHAR_T == 2
		/* code points that do not fit in wchar_t cannot be
		 * passed to towlower, compare them unconverted */
		if (c1 > 0xFFFF || *s2 > 0xFFFF) {
			if (c1 != *s2)
				return (int) (c1 - *s2);
		} else
#endif
		{
			wint_t l1 = towlower((wint_t) c1);
			wint_t l2 = towlower((wint_t) *s2);

			if (l1 != l2)
				return (int) (l1 - l2);
		}
		s1 += nn1;
		s2++;
	}
}
317 | |
/* Case insensitive search for the code point string wneedle in the
 * UTF-8 string haystack.
 *
 * Returns a pointer to the first byte of the first occurrence, the
 * haystack itself when the needle is empty, or NULL when the needle
 * does not occur or the haystack contains a byte sequence that cannot
 * be decoded. */
static const char *
mywstrcasestr(const char *restrict haystack, const uint32_t *restrict wneedle)
{
	size_t nlen = myucslen(wneedle);

	if (nlen == 0)
		return haystack;

	while (*haystack) {
		size_t i;
		size_t h = 0;		/* byte offset into haystack */
		size_t step = 0;	/* byte length of first character */

		for (i = 0; i < nlen; i++) {
			uint32_t c;
			size_t j = utfc8touc(&c, haystack + h);

			/* haystack exhausted before the needle was:
			 * no later position can match either */
			if (j == 0 || j == (size_t) -1)
				return NULL;
			if (i == 0)
				step = j;
#if SIZEOF_WCHAR_T == 2
			/* code points that do not fit in wchar_t
			 * cannot be passed to towlower */
			if (c > 0xFFFF || wneedle[i] > 0xFFFF) {
				if (c != wneedle[i])
					break;
			} else
#endif
			if (towlower((wint_t) c) != towlower((wint_t) wneedle[i]))
				break;
			h += j;
		}
		if (i == nlen)
			return haystack;
		/* retry one whole character further on */
		haystack += step;
	}
	return NULL;
}
357 | |
/* Tell whether pat is a "simple" LIKE pattern: it must not contain
 * an unescaped `_' (single character match) and its last character
 * must be an unescaped `%' (any sequence match).  A character that
 * follows the escape character esc is taken literally.  A NULL or
 * empty pattern is not simple. */
static bool
re_simple(const char *pat, unsigned char esc)
{
	bool after_escape = false;	/* previous char was the escape */
	bool ends_in_percent = false;	/* last char seen was a wildcard % */
	const char *c;

	if (pat == NULL)
		return false;

	c = pat;
	/* a leading % is always a wildcard */
	if (*c == '%') {
		ends_in_percent = true;
		c++;
	}
	for (; *c != '\0'; c++) {
		if (after_escape) {
			/* escaped character: literal, whatever it is */
			after_escape = false;
			ends_in_percent = false;
			continue;
		}
		if ((unsigned char) *c == esc) {
			after_escape = true;
			ends_in_percent = false;
			continue;
		}
		if (*c == '_')
			return false;
		ends_in_percent = (*c == '%');
	}
	return ends_in_percent;
}
388 | |
389 | static bool |
390 | is_strcmpable(const char *pat, const char *esc) |
391 | { |
392 | if (pat[strcspn(pat, "%_" )]) |
393 | return false; |
394 | return strlen(esc) == 0 || strcmp(esc, str_nil) == 0 || strstr(pat, esc) == NULL; |
395 | } |
396 | |
397 | static bool |
398 | re_match_ignore(const char *s, RE *pattern) |
399 | { |
400 | RE *r; |
401 | |
402 | for (r = pattern; r; r = r->n) { |
403 | if (*r->w == 0 && (r->search || *s == 0)) |
404 | return true; |
405 | if (!*s || |
406 | (r->search ? (s = mywstrcasestr(s, r->w)) == NULL : mywstrncasecmp(s, r->w, r->len) != 0)) |
407 | return false; |
408 | s += r->len; |
409 | } |
410 | return true; |
411 | } |
412 | |
413 | static bool |
414 | re_match_no_ignore(const char *s, RE *pattern) |
415 | { |
416 | RE *r; |
417 | |
418 | for (r = pattern; r; r = r->n) { |
419 | if (*r->k == 0 && (r->search || *s == 0)) |
420 | return true; |
421 | if (!*s || |
422 | (r->search ? (s = strstr(s, r->k)) == NULL : strncmp(s, r->k, r->len) != 0)) |
423 | return false; |
424 | s += r->len; |
425 | } |
426 | return true; |
427 | } |
428 | |
429 | static void |
430 | re_destroy(RE *p) |
431 | { |
432 | if (p) { |
433 | GDKfree(p->k); |
434 | GDKfree(p->w); |
435 | do { |
436 | RE *n = p->n; |
437 | |
438 | GDKfree(p); |
439 | p = n; |
440 | } while (p); |
441 | } |
442 | } |
443 | |
444 | /* Create a linked list of RE structures. Depending on the caseignore |
445 | * flag, the w (if true) or the k (if false) field is used. These |
446 | * fields in the first structure are allocated, whereas in all |
447 | * subsequent structures the fields point into the allocated buffer of |
448 | * the first. */ |
449 | static RE * |
450 | re_create(const char *pat, bool caseignore, uint32_t esc) |
451 | { |
452 | RE *r = (RE*)GDKmalloc(sizeof(RE)), *n = r; |
453 | bool escaped = false; |
454 | |
455 | if (r == NULL) |
456 | return NULL; |
457 | *r = (struct RE) {.search = false}; |
458 | |
459 | while (esc != '%' && *pat == '%') { |
460 | pat++; /* skip % */ |
461 | r->search = true; |
462 | } |
463 | if (caseignore) { |
464 | uint32_t *wp; |
465 | uint32_t *wq; |
466 | wp = utf8stoucs(pat); |
467 | if (wp == NULL) { |
468 | GDKfree(r); |
469 | return NULL; |
470 | } |
471 | r->w = wp; |
472 | wq = wp; |
473 | while (*wp) { |
474 | if (escaped) { |
475 | *wq++ = *wp; |
476 | escaped = false; |
477 | } else if (*wp == esc) { |
478 | escaped = true; |
479 | } else if (*wp == '%') { |
480 | n->len = (size_t) (wq - r->w); |
481 | while (wp[1] == '%') |
482 | wp++; |
483 | if (wp[1]) { |
484 | n = n->n = GDKmalloc(sizeof(RE)); |
485 | if (n == NULL) |
486 | goto bailout; |
487 | *n = (struct RE) {.search = true, .w = wp + 1}; |
488 | } |
489 | *wq++ = 0; |
490 | } else { |
491 | *wq++ = *wp; |
492 | } |
493 | wp++; |
494 | } |
495 | } else { |
496 | char *p, *q; |
497 | if ((p = GDKstrdup(pat)) == NULL) { |
498 | GDKfree(r); |
499 | return NULL; |
500 | } |
501 | r->k = p; |
502 | q = p; |
503 | while (*p) { |
504 | if (escaped) { |
505 | *q++ = *p; |
506 | escaped = false; |
507 | } else if ((unsigned char) *p == esc) { |
508 | escaped = true; |
509 | } else if (*p == '%') { |
510 | n->len = (size_t) (q - r->k); |
511 | while (p[1] == '%') |
512 | p++; |
513 | if (p[1]) { |
514 | n = n->n = GDKmalloc(sizeof(RE)); |
515 | if (n == NULL) |
516 | goto bailout; |
517 | *n = (struct RE) {.search = true, .k = p + 1}; |
518 | } |
519 | *q++ = 0; |
520 | } else { |
521 | *q++ = *p; |
522 | } |
523 | p++; |
524 | } |
525 | } |
526 | return r; |
527 | bailout: |
528 | re_destroy(r); |
529 | return NULL; |
530 | } |
531 | |
532 | #ifdef HAVE_LIBPCRE |
533 | static str |
534 | pcre_compile_wrap(pcre **res, const char *pattern, bit insensitive) |
535 | { |
536 | pcre *r; |
537 | const char *err_p = NULL; |
538 | int errpos = 0; |
539 | int options = PCRE_UTF8 | PCRE_MULTILINE; |
540 | if (insensitive) |
541 | options |= PCRE_CASELESS; |
542 | |
543 | if ((r = pcre_compile(pattern, options, &err_p, &errpos, NULL)) == NULL) { |
544 | throw(MAL, "pcre.compile" , OPERATION_FAILED |
545 | " with\n'%s'\nat %d in\n'%s'.\n" , |
546 | err_p, errpos, pattern); |
547 | } |
548 | *res = r; |
549 | return MAL_SUCCEED; |
550 | } |
551 | #endif |
552 | |
553 | /* these two defines are copies from gdk_select.c */ |
554 | |
/* scan select loop with candidates
 *
 * Visits the ci.ncand candidates of the candidate iterator ci.  For
 * each candidate oid o the position r = o - off is computed, the
 * value at that position is fetched into v, and o is appended to bn
 * when TEST holds.  TEST is an expression that may refer to v.
 *
 * The macro relies on these names being in scope at the point of
 * use: b, s, anti (only for the debug message), bi, bn, ci, off, o,
 * p, r and v.
 * NOTE(review): bunfastappTYPE presumably jumps to a label named
 * bunins_failed when appending fails (both users define that label)
 * -- confirm against its definition. */
#define candscanloop(TEST)						\
	do {								\
		ALGODEBUG fprintf(stderr,				\
			    "#BATselect(b=%s#"BUNFMT",s=%s,anti=%d): "	\
			    "scanselect %s\n", BATgetId(b), BATcount(b), \
			    s ? BATgetId(s) : "NULL", anti, #TEST);	\
		for (p = 0; p < ci.ncand; p++) {			\
			o = canditer_next(&ci);				\
			r = (BUN) (o - off);				\
			v = BUNtvar(bi, r);				\
			if (TEST)					\
				bunfastappTYPE(oid, bn, &o);		\
		}							\
	} while (0)
570 | |
/* scan select loop without candidates
 *
 * Visits the oids p, p+1, ..., q-1 (p and q must have been set by the
 * caller).  For each one the value at position p - off is fetched
 * into v, and the oid is appended to bn when TEST holds.  TEST is an
 * expression that may refer to v.
 *
 * The macro relies on these names being in scope at the point of
 * use: b, s, anti (only for the debug message), bi, bn, off, o, p, q
 * and v.
 * NOTE(review): bunfastappTYPE presumably jumps to a label named
 * bunins_failed when appending fails (both users define that label)
 * -- confirm against its definition. */
#define scanloop(TEST)							\
	do {								\
		ALGODEBUG fprintf(stderr,				\
			    "#BATselect(b=%s#"BUNFMT",s=%s,anti=%d): "	\
			    "scanselect %s\n", BATgetId(b), BATcount(b), \
			    s ? BATgetId(s) : "NULL", anti, #TEST);	\
		while (p < q) {						\
			v = BUNtvar(bi, p-off);				\
			if (TEST) {					\
				o = (oid) p;				\
				bunfastappTYPE(oid, bn, &o);		\
			}						\
			p++;						\
		}							\
	} while (0)
587 | |
/* Select from the string BAT b, optionally restricted to the
 * candidate list s, the oids of the values that match the regular
 * expression pat (or, when anti is set, that do not match it).
 * Matching is case insensitive when caseignore is set.  The regular
 * expression is handled by PCRE when HAVE_LIBPCRE is defined and by
 * the POSIX regex functions otherwise.
 *
 * On success a new oid BAT is stored in *bnp and MAL_SUCCEED is
 * returned.  When the pattern cannot be compiled or studied, or when
 * the result BAT cannot be allocated, an exception is thrown; when
 * appending to the result fails, *bnp is set to NULL as well. */
static str
pcre_likeselect(BAT **bnp, BAT *b, BAT *s, const char *pat, bool caseignore, bool anti)
{
#ifdef HAVE_LIBPCRE
	int options = PCRE_UTF8 | PCRE_MULTILINE | PCRE_DOTALL;
	pcre *re;
	pcre_extra *pe;
	const char *error;
	int errpos;
	int ovector[9];
#else
	int options = REG_NEWLINE | REG_NOSUB | REG_EXTENDED;
	regex_t re;
	int errcode;
#endif
	BATiter bi = bat_iterator(b);
	BAT *bn;
	BUN p, q;
	oid o, off;
	const char *v;
	struct canditer ci;

	canditer_init(&ci, b, s);

	assert(ATOMstorage(b->ttype) == TYPE_str);

	if (caseignore) {
#ifdef HAVE_LIBPCRE
		options |= PCRE_CASELESS;
#else
		options |= REG_ICASE;
#endif
	}
#ifdef HAVE_LIBPCRE
	if ((re = pcre_compile(pat, options, &error, &errpos, NULL)) == NULL)
		throw(MAL, "pcre.likeselect",
			  OPERATION_FAILED ": compilation of pattern \"%s\" failed\n", pat);
	/* only ask for JIT compilation when there are more than
	 * JIT_COMPILE_MIN values to scan */
	pe = pcre_study(re, (s ? BATcount(s) : BATcount(b)) > JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0, &error);
	if (error != NULL) {
		pcre_free(re);
		throw(MAL, "pcre.likeselect",
			  OPERATION_FAILED ": studying pattern \"%s\" failed\n", pat);
	}
#else
	if ((errcode = regcomp(&re, pat, options)) != 0) {
		throw(MAL, "pcre.likeselect",
			  OPERATION_FAILED ": compilation of pattern \"%s\" failed\n", pat);
	}
#endif
	/* the result can never hold more than the candidates */
	bn = COLnew(0, TYPE_oid, ci.ncand, TRANSIENT);
	if (bn == NULL) {
#ifdef HAVE_LIBPCRE
		pcre_free_study(pe);
		pcre_free(re);
#else
		regfree(&re);
#endif
		throw(MAL, "pcre.likeselect", SQLSTATE(HY001) MAL_MALLOC_FAIL);
	}
	off = b->hseqbase;

	if (s && !BATtdense(s)) {
		/* non-dense candidate list: go through the candidates
		 * one by one */
		BUN r;

/* BODY is the actual match test on the current value v; it is also
 * used by the dense branch below */
#ifdef HAVE_LIBPCRE
#define BODY     (pcre_exec(re, pe, v, (int) strlen(v), 0, 0, ovector, 9) >= 0)
#else
#define BODY     (regexec(&re, v, (size_t) 0, NULL, 0) != REG_NOMATCH)
#endif
		/* values starting with byte '\200' are never selected,
		 * also not with anti
		 * NOTE(review): presumably '\200' marks the nil string
		 * -- confirm */
		if (anti)
			candscanloop(v && *v != '\200' && !BODY);
		else
			candscanloop(v && *v != '\200' && BODY);
	} else {
		/* no candidates or a dense candidate list: scan the oid
		 * range [p, q) */
		if (s) {
			assert(BATtdense(s));
			p = (BUN) s->tseqbase;
			q = p + BATcount(s);
			/* clip the candidate range to the range of b */
			if ((oid) p < b->hseqbase)
				p = b->hseqbase;
			if ((oid) q > b->hseqbase + BATcount(b))
				q = b->hseqbase + BATcount(b);
		} else {
			p = off;
			q = BUNlast(b) + off;
		}
		if (anti)
			scanloop(v && *v != '\200' && !BODY);
		else
			scanloop(v && *v != '\200' && BODY);
	}
#ifdef HAVE_LIBPCRE
	pcre_free_study(pe);
	pcre_free(re);
#else
	regfree(&re);
#endif
	BATsetcount(bn, BATcount(bn)); /* set some properties */
	bn->theap.dirty |= BATcount(bn) > 0;
	/* oids were appended in scan order */
	bn->tsorted = true;
	bn->trevsorted = bn->batCount <= 1;
	bn->tkey = true;
	bn->tseqbase = bn->batCount == 0 ? 0 : bn->batCount == 1 ? * (oid *) Tloc(bn, 0) : oid_nil;
	*bnp = bn;
	return MAL_SUCCEED;

  /* NOTE(review): presumably reached through bunfastappTYPE inside
   * the scan loops when appending fails -- confirm */
  bunins_failed:
	BBPreclaim(bn);
#ifdef HAVE_LIBPCRE
	pcre_free_study(pe);
	pcre_free(re);
#else
	regfree(&re);
#endif
	*bnp = NULL;
	throw(MAL, "pcre.likeselect", OPERATION_FAILED);
}
705 | |
706 | static str |
707 | re_likeselect(BAT **bnp, BAT *b, BAT *s, const char *pat, bool caseignore, bool anti, bool use_strcmp, uint32_t esc) |
708 | { |
709 | BATiter bi = bat_iterator(b); |
710 | BAT *bn; |
711 | BUN p, q; |
712 | oid o, off; |
713 | const char *v; |
714 | RE *re = NULL; |
715 | |
716 | assert(ATOMstorage(b->ttype) == TYPE_str); |
717 | |
718 | bn = COLnew(0, TYPE_oid, s ? BATcount(s) : BATcount(b), TRANSIENT); |
719 | if (bn == NULL) |
720 | throw(MAL, "pcre.likeselect" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
721 | off = b->hseqbase; |
722 | |
723 | if (!use_strcmp) { |
724 | re = re_create(pat, caseignore, esc); |
725 | if (!re) |
726 | throw(MAL, "pcre.likeselect" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
727 | } |
728 | if (s && !BATtdense(s)) { |
729 | struct canditer ci; |
730 | BUN r; |
731 | |
732 | canditer_init(&ci, b, s); |
733 | |
734 | if (use_strcmp) { |
735 | if (caseignore) { |
736 | uint32_t *wpat; |
737 | wpat = utf8stoucs(pat); |
738 | if (wpat == NULL) |
739 | throw(MAL, "pcre.likeselect" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
740 | if (anti) |
741 | candscanloop(v && *v != '\200' && |
742 | mywstrcasecmp(v, wpat) != 0); |
743 | else |
744 | candscanloop(v && *v != '\200' && |
745 | mywstrcasecmp(v, wpat) == 0); |
746 | GDKfree(wpat); |
747 | } else { |
748 | if (anti) |
749 | candscanloop(v && *v != '\200' && |
750 | strcmp(v, pat) != 0); |
751 | else |
752 | candscanloop(v && *v != '\200' && |
753 | strcmp(v, pat) == 0); |
754 | } |
755 | } else { |
756 | if (caseignore) { |
757 | if (anti) |
758 | candscanloop(v && *v != '\200' && |
759 | re_match_ignore(v, re) == 0); |
760 | else |
761 | candscanloop(v && *v != '\200' && |
762 | re_match_ignore(v, re)); |
763 | } else { |
764 | if (anti) |
765 | candscanloop(v && *v != '\200' && |
766 | re_match_no_ignore(v, re) == 0); |
767 | else |
768 | candscanloop(v && *v != '\200' && |
769 | re_match_no_ignore(v, re)); |
770 | } |
771 | } |
772 | } else { |
773 | if (s) { |
774 | assert(BATtdense(s)); |
775 | p = (BUN) s->tseqbase; |
776 | q = p + BATcount(s); |
777 | if ((oid) p < b->hseqbase) |
778 | p = b->hseqbase; |
779 | if ((oid) q > b->hseqbase + BATcount(b)) |
780 | q = b->hseqbase + BATcount(b); |
781 | } else { |
782 | p = off; |
783 | q = BUNlast(b) + off; |
784 | } |
785 | if (use_strcmp) { |
786 | if (caseignore) { |
787 | uint32_t *wpat; |
788 | wpat = utf8stoucs(pat); |
789 | if (wpat == NULL) |
790 | throw(MAL, "pcre.likeselect" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
791 | if (anti) |
792 | scanloop(v && *v != '\200' && |
793 | mywstrcasecmp(v, wpat) != 0); |
794 | else |
795 | scanloop(v && *v != '\200' && |
796 | mywstrcasecmp(v, wpat) == 0); |
797 | GDKfree(wpat); |
798 | } else { |
799 | if (anti) |
800 | scanloop(v && *v != '\200' && |
801 | strcmp(v, pat) != 0); |
802 | else |
803 | scanloop(v && *v != '\200' && |
804 | strcmp(v, pat) == 0); |
805 | } |
806 | } else { |
807 | if (caseignore) { |
808 | if (anti) |
809 | scanloop(v && *v != '\200' && |
810 | re_match_ignore(v, re) == 0); |
811 | else |
812 | scanloop(v && *v != '\200' && |
813 | re_match_ignore(v, re)); |
814 | } else { |
815 | if (anti) |
816 | scanloop(v && *v != '\200' && |
817 | re_match_no_ignore(v, re) == 0); |
818 | else |
819 | scanloop(v && *v != '\200' && |
820 | re_match_no_ignore(v, re)); |
821 | } |
822 | } |
823 | } |
824 | BATsetcount(bn, BATcount(bn)); /* set some properties */ |
825 | bn->tsorted = true; |
826 | bn->trevsorted = bn->batCount <= 1; |
827 | bn->tkey = true; |
828 | bn->tseqbase = bn->batCount == 0 ? 0 : bn->batCount == 1 ? * (oid *) Tloc(bn, 0) : oid_nil; |
829 | *bnp = bn; |
830 | re_destroy(re); |
831 | return MAL_SUCCEED; |
832 | |
833 | bunins_failed: |
834 | re_destroy(re); |
835 | BBPreclaim(bn); |
836 | *bnp = NULL; |
837 | throw(MAL, "pcre.likeselect" , OPERATION_FAILED); |
838 | } |
839 | |
/* maximum number of back references and quoted \ or $ in replacement string */
#define MAX_NR_REFS	20

/* One special item found in a replacement string by
 * parse_replacement: either a back reference ($N or \N) or a doubled
 * $ or \. */
struct backref {
	int idx;		/* number of the referenced group, or INT_MAX
				 * for a doubled $ or \ */
	int start;		/* offset in the replacement string of the
				 * introducing $ or \ */
	int end;		/* offset just past the item: past the last
				 * digit of a reference, or start + 1 for a
				 * doubled $ or \ */
};
848 | |
849 | #ifdef HAVE_LIBPCRE |
850 | /* fill in parameter backrefs (length maxrefs) with information about |
851 | * back references in the replacement string; a back reference is a |
852 | * dollar or backslash followed by a number */ |
853 | static int |
854 | parse_replacement(const char *replacement, int len_replacement, |
855 | struct backref *backrefs, int maxrefs) |
856 | { |
857 | int nbackrefs = 0; |
858 | |
859 | for (int i = 0; i < len_replacement && nbackrefs < maxrefs; i++) { |
860 | if (replacement[i] == '$' || replacement[i] == '\\') { |
861 | char *endptr; |
862 | backrefs[nbackrefs].idx = strtol(replacement + i + 1, &endptr, 10); |
863 | if (endptr > replacement + i + 1) { |
864 | int k = (int) (endptr - (replacement + i + 1)); |
865 | backrefs[nbackrefs].start = i; |
866 | backrefs[nbackrefs].end = i + k + 1; |
867 | nbackrefs++; |
868 | } else if (replacement[i] == replacement[i + 1]) { |
869 | /* doubled $ or \, we must copy just one to the output */ |
870 | backrefs[nbackrefs].idx = INT_MAX; /* impossible value > 0 */ |
871 | backrefs[nbackrefs].start = i; |
872 | backrefs[nbackrefs].end = i + 1; |
873 | i++; /* don't look at second $ or \ again */ |
874 | nbackrefs++; |
875 | } |
876 | /* else: $ or \ followed by something we don't recognize, |
877 | * so just leave it */ |
878 | } |
879 | } |
880 | return nbackrefs; |
881 | } |
882 | |
883 | static char * |
884 | single_replace(pcre *pcre_code, pcre_extra *, |
885 | const char *origin_str, int len_origin_str, |
886 | int exec_options, int *ovector, int ovecsize, |
887 | const char *replacement, int len_replacement, |
888 | struct backref *backrefs, int nbackrefs, |
889 | bool global, char *result, int *max_result) |
890 | { |
891 | int offset = 0; |
892 | int len_result = 0; |
893 | int addlen; |
894 | char *tmp; |
895 | |
896 | do { |
897 | int j = pcre_exec(pcre_code, extra, origin_str, len_origin_str, offset, |
898 | exec_options, ovector, ovecsize); |
899 | if (j <= 0) |
900 | break; |
901 | addlen = ovector[0] - offset + (nbackrefs == 0 ? len_replacement : 0); |
902 | if (len_result + addlen >= *max_result) { |
903 | tmp = GDKrealloc(result, len_result + addlen + 1); |
904 | if (tmp == NULL) { |
905 | GDKfree(result); |
906 | return NULL; |
907 | } |
908 | result = tmp; |
909 | *max_result = len_result + addlen + 1; |
910 | } |
911 | if (ovector[0] > offset) { |
912 | strncpy(result + len_result, origin_str + offset, |
913 | ovector[0] - offset); |
914 | len_result += ovector[0] - offset; |
915 | } |
916 | if (nbackrefs == 0) { |
917 | strncpy(result + len_result, replacement, len_replacement); |
918 | len_result += len_replacement; |
919 | } else { |
920 | int prevend = 0; |
921 | for (int i = 0; i < nbackrefs; i++) { |
922 | int off, len; |
923 | if (backrefs[i].idx >= ovecsize / 3) { |
924 | /* out of bounds, replace with empty string */ |
925 | off = 0; |
926 | len = 0; |
927 | } else { |
928 | off = ovector[backrefs[i].idx * 2]; |
929 | len = ovector[backrefs[i].idx * 2 + 1] - off; |
930 | } |
931 | addlen = backrefs[i].start - prevend + len; |
932 | if (len_result + addlen >= *max_result) { |
933 | tmp = GDKrealloc(result, len_result + addlen + 1); |
934 | if (tmp == NULL) { |
935 | GDKfree(result); |
936 | return NULL; |
937 | } |
938 | result = tmp; |
939 | *max_result = len_result + addlen + 1; |
940 | } |
941 | if (backrefs[i].start > prevend) { |
942 | strncpy(result + len_result, replacement + prevend, |
943 | backrefs[i].start - prevend); |
944 | len_result += backrefs[i].start - prevend; |
945 | } |
946 | if (len > 0) { |
947 | strncpy(result + len_result, origin_str + off, len); |
948 | len_result += len; |
949 | } |
950 | prevend = backrefs[i].end; |
951 | } |
952 | /* copy rest of replacement string (after last backref) */ |
953 | addlen = len_replacement - prevend; |
954 | if (addlen > 0) { |
955 | if (len_result + addlen >= *max_result) { |
956 | tmp = GDKrealloc(result, len_result + addlen + 1); |
957 | if (tmp == NULL) { |
958 | GDKfree(result); |
959 | return NULL; |
960 | } |
961 | result = tmp; |
962 | *max_result = len_result + addlen + 1; |
963 | } |
964 | strncpy(result + len_result, replacement + prevend, addlen); |
965 | len_result += addlen; |
966 | } |
967 | } |
968 | offset = ovector[1]; |
969 | } while (offset < len_origin_str && global); |
970 | if (offset < len_origin_str) { |
971 | addlen = len_origin_str - offset; |
972 | if (len_result + addlen >= *max_result) { |
973 | tmp = GDKrealloc(result, len_result + addlen + 1); |
974 | if (tmp == NULL) { |
975 | GDKfree(result); |
976 | return NULL; |
977 | } |
978 | result = tmp; |
979 | *max_result = len_result + addlen + 1; |
980 | } |
981 | strncpy(result + len_result, origin_str + offset, addlen); |
982 | len_result += addlen; |
983 | } |
984 | /* null terminate string */ |
985 | result[len_result] = '\0'; |
986 | return result; |
987 | } |
988 | #endif |
989 | |
990 | static str |
991 | pcre_replace(str *res, const char *origin_str, const char *pattern, |
992 | const char *replacement, const char *flags, bool global) |
993 | { |
994 | #ifdef HAVE_LIBPCRE |
995 | const char *err_p = NULL; |
996 | pcre *pcre_code = NULL; |
997 | pcre_extra *; |
998 | char *tmpres; |
999 | int max_result; |
1000 | int i, errpos = 0; |
1001 | int compile_options = PCRE_UTF8, exec_options = PCRE_NOTEMPTY; |
1002 | int *ovector, ovecsize; |
1003 | int len_origin_str = (int) strlen(origin_str); |
1004 | int len_replacement = (int) strlen(replacement); |
1005 | struct backref backrefs[MAX_NR_REFS]; |
1006 | int nbackrefs = 0; |
1007 | |
1008 | while (*flags) { |
1009 | switch (*flags) { |
1010 | case 'e': |
1011 | exec_options &= ~PCRE_NOTEMPTY; |
1012 | break; |
1013 | case 'i': |
1014 | compile_options |= PCRE_CASELESS; |
1015 | break; |
1016 | case 'm': |
1017 | compile_options |= PCRE_MULTILINE; |
1018 | break; |
1019 | case 's': |
1020 | compile_options |= PCRE_DOTALL; |
1021 | break; |
1022 | case 'x': |
1023 | compile_options |= PCRE_EXTENDED; |
1024 | break; |
1025 | default: |
1026 | throw(MAL, global ? "pcre.replace" : "pcre.replace_first" , |
1027 | ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n" , |
1028 | *flags); |
1029 | } |
1030 | flags++; |
1031 | } |
1032 | |
1033 | if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) { |
1034 | throw(MAL, global ? "pcre.replace" : "pcre.replace_first" , |
1035 | OPERATION_FAILED ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n" , |
1036 | pattern, errpos, err_p); |
1037 | } |
1038 | |
1039 | /* Since the compiled pattern is going to be used several times, it is |
1040 | * worth spending more time analyzing it in order to speed up the time |
1041 | * taken for matching. |
1042 | */ |
1043 | extra = pcre_study(pcre_code, 0, &err_p); |
1044 | if (err_p != NULL) { |
1045 | pcre_free(pcre_code); |
1046 | throw(MAL, global ? "pcre.replace" : "pcre.replace_first" , |
1047 | OPERATION_FAILED ": pcre study of pattern (%s) failed with '%s'.\n" , |
1048 | pattern, err_p); |
1049 | } |
1050 | pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i); |
1051 | ovecsize = (i + 1) * 3; |
1052 | if ((ovector = (int *) GDKmalloc(sizeof(int) * ovecsize)) == NULL) { |
1053 | pcre_free_study(extra); |
1054 | pcre_free(pcre_code); |
1055 | throw(MAL, global ? "pcre.replace" : "pcre.replace_first" , |
1056 | SQLSTATE(HY001) MAL_MALLOC_FAIL); |
1057 | } |
1058 | |
1059 | /* identify back references in the replacement string */ |
1060 | nbackrefs = parse_replacement(replacement, len_replacement, |
1061 | backrefs, MAX_NR_REFS); |
1062 | |
1063 | max_result = len_origin_str + 1; |
1064 | tmpres = GDKmalloc(max_result); |
1065 | if (tmpres == NULL) { |
1066 | GDKfree(ovector); |
1067 | pcre_free_study(extra); |
1068 | pcre_free(pcre_code); |
1069 | throw(MAL, global ? "pcre.replace" : "pcre.replace_first" , |
1070 | SQLSTATE(HY001) MAL_MALLOC_FAIL); |
1071 | } |
1072 | |
1073 | tmpres = single_replace(pcre_code, extra, origin_str, len_origin_str, |
1074 | exec_options, ovector, ovecsize, replacement, |
1075 | len_replacement, backrefs, nbackrefs, global, |
1076 | tmpres, &max_result); |
1077 | GDKfree(ovector); |
1078 | pcre_free_study(extra); |
1079 | pcre_free(pcre_code); |
1080 | if (tmpres == NULL) |
1081 | throw(MAL, global ? "pcre.replace" : "pcre.replace_first" , |
1082 | SQLSTATE(HY001) MAL_MALLOC_FAIL); |
1083 | |
1084 | *res = tmpres; |
1085 | return MAL_SUCCEED; |
1086 | #else |
1087 | (void) res; |
1088 | (void) origin_str; |
1089 | (void) pattern; |
1090 | (void) replacement; |
1091 | (void) flags; |
1092 | (void) global; |
1093 | throw(MAL, global ? "pcre.replace" : "pcre.replace_first" , |
1094 | "Database was compiled without PCRE support." ); |
1095 | #endif |
1096 | } |
1097 | |
1098 | static str |
1099 | pcre_replace_bat(BAT **res, BAT *origin_strs, const char *pattern, |
1100 | const char *replacement, const char *flags, bool global) |
1101 | { |
1102 | #ifdef HAVE_LIBPCRE |
1103 | BATiter origin_strsi = bat_iterator(origin_strs); |
1104 | const char *err_p = NULL; |
1105 | char *tmpres; |
1106 | int i, errpos = 0; |
1107 | int compile_options = PCRE_UTF8, exec_options = PCRE_NOTEMPTY; |
1108 | pcre *pcre_code = NULL; |
1109 | pcre_extra *; |
1110 | BAT *tmpbat; |
1111 | BUN p, q; |
1112 | int *ovector, ovecsize; |
1113 | int len_replacement = (int) strlen(replacement); |
1114 | struct backref backrefs[MAX_NR_REFS]; |
1115 | int nbackrefs = 0; |
1116 | const char *origin_str; |
1117 | int max_dest_size = 0; |
1118 | |
1119 | while (*flags) { |
1120 | switch (*flags) { |
1121 | case 'e': |
1122 | exec_options &= ~PCRE_NOTEMPTY; |
1123 | break; |
1124 | case 'i': |
1125 | compile_options |= PCRE_CASELESS; |
1126 | break; |
1127 | case 'm': |
1128 | compile_options |= PCRE_MULTILINE; |
1129 | break; |
1130 | case 's': |
1131 | compile_options |= PCRE_DOTALL; |
1132 | break; |
1133 | case 'x': |
1134 | compile_options |= PCRE_EXTENDED; |
1135 | break; |
1136 | default: |
1137 | throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first" , |
1138 | ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n" , |
1139 | *flags); |
1140 | } |
1141 | flags++; |
1142 | } |
1143 | |
1144 | if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) { |
1145 | throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first" , |
1146 | OPERATION_FAILED |
1147 | ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n" , |
1148 | pattern, errpos, err_p); |
1149 | } |
1150 | |
1151 | /* Since the compiled pattern is going to be used several times, |
1152 | * it is worth spending more time analyzing it in order to speed |
1153 | * up the time taken for matching. |
1154 | */ |
1155 | extra = pcre_study(pcre_code, BATcount(origin_strs) > JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0, &err_p); |
1156 | if (err_p != NULL) { |
1157 | pcre_free(pcre_code); |
1158 | throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first" , |
1159 | OPERATION_FAILED); |
1160 | } |
1161 | pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i); |
1162 | ovecsize = (i + 1) * 3; |
1163 | if ((ovector = (int *) GDKzalloc(sizeof(int) * ovecsize)) == NULL) { |
1164 | pcre_free_study(extra); |
1165 | pcre_free(pcre_code); |
1166 | throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first" , |
1167 | SQLSTATE(HY001) MAL_MALLOC_FAIL); |
1168 | } |
1169 | |
1170 | /* identify back references in the replacement string */ |
1171 | nbackrefs = parse_replacement(replacement, len_replacement, |
1172 | backrefs, MAX_NR_REFS); |
1173 | |
1174 | tmpbat = COLnew(origin_strs->hseqbase, TYPE_str, BATcount(origin_strs), TRANSIENT); |
1175 | |
1176 | /* the buffer for all destination strings is allocated only once, |
1177 | * and extended when needed */ |
1178 | max_dest_size = len_replacement + 1; |
1179 | tmpres = GDKmalloc(max_dest_size); |
1180 | if (tmpbat == NULL || tmpres == NULL) { |
1181 | pcre_free_study(extra); |
1182 | pcre_free(pcre_code); |
1183 | GDKfree(ovector); |
1184 | BBPreclaim(tmpbat); |
1185 | GDKfree(tmpres); |
1186 | throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first" , |
1187 | SQLSTATE(HY001) MAL_MALLOC_FAIL); |
1188 | } |
1189 | BATloop(origin_strs, p, q) { |
1190 | origin_str = BUNtvar(origin_strsi, p); |
1191 | tmpres = single_replace(pcre_code, extra, origin_str, |
1192 | (int) strlen(origin_str), exec_options, |
1193 | ovector, ovecsize, replacement, |
1194 | len_replacement, backrefs, nbackrefs, global, |
1195 | tmpres, &max_dest_size); |
1196 | if (tmpres == NULL || BUNappend(tmpbat, tmpres, false) != GDK_SUCCEED) { |
1197 | pcre_free_study(extra); |
1198 | pcre_free(pcre_code); |
1199 | GDKfree(ovector); |
1200 | GDKfree(tmpres); |
1201 | BBPreclaim(tmpbat); |
1202 | throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first" , |
1203 | SQLSTATE(HY001) MAL_MALLOC_FAIL); |
1204 | } |
1205 | } |
1206 | pcre_free_study(extra); |
1207 | pcre_free(pcre_code); |
1208 | GDKfree(ovector); |
1209 | GDKfree(tmpres); |
1210 | *res = tmpbat; |
1211 | return MAL_SUCCEED; |
1212 | #else |
1213 | (void) res; |
1214 | (void) origin_strs; |
1215 | (void) pattern; |
1216 | (void) replacement; |
1217 | (void) flags; |
1218 | (void) global; |
1219 | throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first" , |
1220 | "Database was compiled without PCRE support." ); |
1221 | #endif |
1222 | } |
1223 | |
1224 | str |
1225 | pcre_init(void *ret) |
1226 | { |
1227 | (void) ret; |
1228 | return NULL; |
1229 | } |
1230 | |
1231 | static str |
1232 | pcre_match_with_flags(bit *ret, const char *val, const char *pat, const char *flags) |
1233 | { |
1234 | int pos; |
1235 | #ifdef HAVE_LIBPCRE |
1236 | const char *err_p = NULL; |
1237 | int errpos = 0; |
1238 | int options = PCRE_UTF8; |
1239 | pcre *re; |
1240 | #else |
1241 | int options = REG_NOSUB; |
1242 | regex_t re; |
1243 | int errcode; |
1244 | int retval; |
1245 | #endif |
1246 | |
1247 | while (*flags) { |
1248 | switch (*flags) { |
1249 | case 'i': |
1250 | #ifdef HAVE_LIBPCRE |
1251 | options |= PCRE_CASELESS; |
1252 | #else |
1253 | options |= REG_ICASE; |
1254 | #endif |
1255 | break; |
1256 | case 'm': |
1257 | #ifdef HAVE_LIBPCRE |
1258 | options |= PCRE_MULTILINE; |
1259 | #else |
1260 | options |= REG_NEWLINE; |
1261 | #endif |
1262 | break; |
1263 | #ifdef HAVE_LIBPCRE |
1264 | case 's': |
1265 | options |= PCRE_DOTALL; |
1266 | break; |
1267 | #endif |
1268 | case 'x': |
1269 | #ifdef HAVE_LIBPCRE |
1270 | options |= PCRE_EXTENDED; |
1271 | #else |
1272 | options |= REG_EXTENDED; |
1273 | #endif |
1274 | break; |
1275 | default: |
1276 | throw(MAL, "pcre.match" , ILLEGAL_ARGUMENT |
1277 | ": unsupported flag character '%c'\n" , *flags); |
1278 | } |
1279 | flags++; |
1280 | } |
1281 | if (strcmp(val, str_nil) == 0) { |
1282 | *ret = FALSE; |
1283 | return MAL_SUCCEED; |
1284 | } |
1285 | |
1286 | #ifdef HAVE_LIBPCRE |
1287 | if ((re = pcre_compile(pat, options, &err_p, &errpos, NULL)) == NULL) |
1288 | #else |
1289 | if ((errcode = regcomp(&re, pat, options)) != 0) |
1290 | #endif |
1291 | { |
1292 | throw(MAL, "pcre.match" , OPERATION_FAILED |
1293 | ": compilation of regular expression (%s) failed " |
1294 | #ifdef HAVE_LIBPCRE |
1295 | "at %d with '%s'" , pat, errpos, err_p |
1296 | #else |
1297 | , pat |
1298 | #endif |
1299 | ); |
1300 | } |
1301 | #ifdef HAVE_LIBPCRE |
1302 | pos = pcre_exec(re, NULL, val, (int) strlen(val), 0, 0, NULL, 0); |
1303 | pcre_free(re); |
1304 | #else |
1305 | retval = regexec(&re, val, (size_t) 0, NULL, 0); |
1306 | pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0); |
1307 | regfree(&re); |
1308 | #endif |
1309 | if (pos >= 0) |
1310 | *ret = TRUE; |
1311 | else if (pos == -1) |
1312 | *ret = FALSE; |
1313 | else |
1314 | throw(MAL, "pcre.match" , OPERATION_FAILED |
1315 | ": matching of regular expression (%s) failed with %d" , |
1316 | pat, pos); |
1317 | return MAL_SUCCEED; |
1318 | } |
1319 | |
#ifdef HAVE_LIBPCRE
/* special characters in PCRE that need to be escaped; the translators
 * below put a backslash in front of any pattern character found in
 * this set */
static const char *pcre_specials = ".+?*()[]{}|^$\\";
#else
/* special characters in POSIX basic regular expressions that need to
 * be escaped.
 * NOTE(review): the non-PCRE callers visible in this file compile
 * their patterns with REG_EXTENDED, where + ? ( ) { } | are special as
 * well -- confirm whether this set should cover them. */
static const char *pcre_specials = ".*[]^$\\";
#endif
1328 | |
/* change SQL LIKE pattern into PCRE pattern
 *
 * The translation is anchored with '^' and '$'; an unescaped '%'
 * becomes ".*?" (runs of '%' are collapsed unless '%' is the escape
 * character), an unescaped '_' becomes ".", and characters listed in
 * pcre_specials are prefixed with a backslash.
 *
 * esc_str holds the escape character; it must not be longer than one
 * byte, and an empty string or a first byte of '\200' means "no
 * escape character".
 *
 * On success *r is a GDKmalloc'ed string: either the translated
 * pattern, or a copy of str_nil when the pattern contained nothing
 * that needs a regular expression (no wildcard, special or escape
 * character).  An exception is returned when esc_str is too long, pat
 * is NULL, memory is exhausted, or the pattern ends in an escape
 * character; in that last case *r is set to NULL.
 */
static str
sql2pcre(str *r, const char *pat, const char *esc_str)
{
	int escaped = 0;	/* previous character was the escape character */
	int hasWildcard = 0;	/* something was seen that requires a regex */
	char *ppat;
	int esc = esc_str[0] == '\200' ? 0 : esc_str[0]; /* should change to utf8_convert() */
	int specials;
	int c;

	if (strlen(esc_str) > 1)
		throw(MAL, "pcre.sql2pcre", SQLSTATE(22019) ILLEGAL_ARGUMENT ": ESCAPE string must have length 1");
	if (pat == NULL )
		throw(MAL, "pcre.sql2pcre", OPERATION_FAILED);
	/* every input byte expands to at most three output bytes ('%'
	 * becomes ".*?") */
	ppat = GDKmalloc(strlen(pat)*3+3 /* 3 = "^'the translated regexp'$0" */);
	if (ppat == NULL)
		throw(MAL, "pcre.sql2pcre", SQLSTATE(HY001) MAL_MALLOC_FAIL);

	*r = ppat;
	/* The escape character can be a char which is special in a PCRE
	 * expression.  If the user used the "+" char as escape and has "++"
	 * in their pattern, then replacing this with "+" is not correct and
	 * should be "\+" instead. */
	specials = (esc && strchr(pcre_specials, esc) != NULL);

	*ppat++ = '^';
	while ((c = *pat++) != 0) {
		if (c == esc) {
			if (escaped) {
				if (specials) { /* change ++ into \+ */
					*ppat++ = esc;
				} else { /* do not escape simple escape symbols */
					ppat[-1] = esc; /* overwrite backslash */
				}
				escaped = 0;
			} else {
				/* tentatively emit a backslash; the branches
				 * handling the next character either keep it or
				 * overwrite it */
				*ppat++ = '\\';
				escaped = 1;
			}
			hasWildcard = 1;
		} else if (strchr(pcre_specials, c) != NULL) {
			/* escape PCRE special chars, avoid double backslash if the
			 * user uses an invalid escape sequence */
			if (!escaped)
				*ppat++ = '\\';
			*ppat++ = c;
			hasWildcard = 1;
			escaped = 0;
		} else if (c == '%' && !escaped) {
			*ppat++ = '.';
			*ppat++ = '*';
			*ppat++ = '?';
			hasWildcard = 1;
			/* collapse multiple %, but only if it isn't the escape */
			if (esc != '%')
				while (*pat == '%')
					pat++;
		} else if (c == '_' && !escaped) {
			*ppat++ = '.';
			hasWildcard = 1;
		} else {
			/* ordinary character, or an escaped '%' / '_' that
			 * must be taken literally */
			if (escaped) {
				ppat[-1] = c; /* overwrite backslash of invalid escape */
			} else {
				*ppat++ = c;
			}
			escaped = 0;
		}
	}
	/* no wildcard or escape character at end of string */
	if (!hasWildcard || escaped) {
		GDKfree(*r);
		*r = NULL;
		if (escaped)
			throw(MAL, "pcre.sql2pcre", OPERATION_FAILED);
		/* signal "plain string comparison suffices" with a nil pattern */
		*r = GDKstrdup(str_nil);
		if (*r == NULL)
			throw(MAL, "pcre.sql2pcre", SQLSTATE(HY001) MAL_MALLOC_FAIL);
	} else {
		*ppat++ = '$';
		*ppat = 0;
	}
	return MAL_SUCCEED;
}
1414 | |
1415 | #ifdef HAVE_LIBPCRE |
1416 | /* change SQL PATINDEX pattern into PCRE pattern */ |
1417 | static str |
1418 | pat2pcre(str *r, const char *pat) |
1419 | { |
1420 | size_t len = strlen(pat); |
1421 | char *ppat = GDKmalloc(len*2+3 /* 3 = "^'the translated regexp'$0" */); |
1422 | int start = 0; |
1423 | |
1424 | if (ppat == NULL) |
1425 | throw(MAL, "pcre.sql2pcre" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
1426 | *r = ppat; |
1427 | while (*pat) { |
1428 | int c = *pat++; |
1429 | |
1430 | if (strchr(pcre_specials, c) != NULL) { |
1431 | *ppat++ = '\\'; |
1432 | *ppat++ = c; |
1433 | } else if (c == '%') { |
1434 | if (start && *pat) { |
1435 | *ppat++ = '.'; |
1436 | *ppat++ = '*'; |
1437 | } |
1438 | start++; |
1439 | } else if (c == '_') { |
1440 | *ppat++ = '.'; |
1441 | } else { |
1442 | *ppat++ = c; |
1443 | } |
1444 | } |
1445 | *ppat = 0; |
1446 | return MAL_SUCCEED; |
1447 | } |
1448 | #endif |
1449 | |
1450 | /* |
1451 | * @+ Wrapping |
1452 | */ |
1453 | #include "mal.h" |
1454 | str |
1455 | PCREreplace_wrap(str *res, const str *or, const str *pat, const str *repl, const str *flags) |
1456 | { |
1457 | return pcre_replace(res, *or, *pat, *repl, *flags, true); |
1458 | } |
1459 | |
1460 | str |
1461 | PCREreplace_bat_wrap(bat *res, const bat *bid, const str *pat, const str *repl, const str *flags) |
1462 | { |
1463 | BAT *b, *bn = NULL; |
1464 | str msg; |
1465 | if ((b = BATdescriptor(*bid)) == NULL) |
1466 | throw(MAL, "batpcre.replace" , SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); |
1467 | |
1468 | msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, true); |
1469 | if (msg == MAL_SUCCEED) { |
1470 | *res = bn->batCacheid; |
1471 | BBPkeepref(*res); |
1472 | } |
1473 | BBPunfix(b->batCacheid); |
1474 | return msg; |
1475 | } |
1476 | |
1477 | str |
1478 | PCREreplacefirst_wrap(str *res, const str *or, const str *pat, const str *repl, const str *flags) |
1479 | { |
1480 | return pcre_replace(res, *or, *pat, *repl, *flags, false); |
1481 | } |
1482 | |
1483 | str |
1484 | PCREreplacefirst_bat_wrap(bat *res, const bat *bid, const str *pat, const str *repl, const str *flags) |
1485 | { |
1486 | BAT *b,*bn = NULL; |
1487 | str msg; |
1488 | if ((b = BATdescriptor(*bid)) == NULL) |
1489 | throw(MAL, "batpcre.replace_first" , RUNTIME_OBJECT_MISSING); |
1490 | |
1491 | msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, false); |
1492 | if (msg == MAL_SUCCEED) { |
1493 | *res = bn->batCacheid; |
1494 | BBPkeepref(*res); |
1495 | } |
1496 | BBPunfix(b->batCacheid); |
1497 | return msg; |
1498 | } |
1499 | |
1500 | str |
1501 | PCREmatch(bit *ret, const str *val, const str *pat) |
1502 | { |
1503 | return pcre_match_with_flags(ret, *val, *pat, |
1504 | #ifdef HAVE_LIBPCRE |
1505 | "s" |
1506 | #else |
1507 | "x" |
1508 | #endif |
1509 | ); |
1510 | } |
1511 | |
1512 | str |
1513 | PCREimatch(bit *ret, const str *val, const str *pat) |
1514 | { |
1515 | return pcre_match_with_flags(ret, *val, *pat, "i" |
1516 | #ifndef HAVE_LIBPCRE |
1517 | "x" |
1518 | #endif |
1519 | ); |
1520 | } |
1521 | |
1522 | str |
1523 | PCREindex(int *res, const pcre *pattern, const str *s) |
1524 | { |
1525 | #ifdef HAVE_LIBPCRE |
1526 | int v[3]; |
1527 | |
1528 | v[0] = v[1] = *res = 0; |
1529 | if (pcre_exec(pattern, NULL, *s, (int) strlen(*s), 0, 0, v, 3) >= 0) { |
1530 | *res = v[1]; |
1531 | } |
1532 | return MAL_SUCCEED; |
1533 | #else |
1534 | (void) res; |
1535 | (void) pattern; |
1536 | (void) s; |
1537 | throw(MAL, "pcre.index" , "Database was compiled without PCRE support." ); |
1538 | #endif |
1539 | } |
1540 | |
1541 | |
1542 | str |
1543 | PCREpatindex(int *ret, const str *pat, const str *val) |
1544 | { |
1545 | #ifdef HAVE_LIBPCRE |
1546 | pcre *re = NULL; |
1547 | char *ppat = NULL, *msg; |
1548 | |
1549 | if ((msg = pat2pcre(&ppat, *pat)) != MAL_SUCCEED) |
1550 | return msg; |
1551 | if ((msg = pcre_compile_wrap(&re, ppat, FALSE)) != MAL_SUCCEED) { |
1552 | GDKfree(ppat); |
1553 | return msg; |
1554 | } |
1555 | GDKfree(ppat); |
1556 | msg = PCREindex(ret, re, val); |
1557 | pcre_free(re); |
1558 | return msg; |
1559 | #else |
1560 | (void) ret; |
1561 | (void) pat; |
1562 | (void) val; |
1563 | throw(MAL, "pcre.patindex" , "Database was compiled without PCRE support." ); |
1564 | #endif |
1565 | } |
1566 | |
1567 | str |
1568 | PCREquote(str *ret, const str *val) |
1569 | { |
1570 | char *p; |
1571 | const char *s = *val; |
1572 | |
1573 | *ret = p = GDKmalloc(strlen(s) * 2 + 1); /* certainly long enough */ |
1574 | if (p == NULL) |
1575 | throw(MAL, "pcre.quote" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
1576 | /* quote all non-alphanumeric ASCII characters (i.e. leave |
1577 | non-ASCII and alphanumeric alone) */ |
1578 | while (*s) { |
1579 | if (!((*s & 0x80) != 0 || |
1580 | ('a' <= *s && *s <= 'z') || |
1581 | ('A' <= *s && *s <= 'Z') || |
1582 | isdigit((unsigned char) *s))) |
1583 | *p++ = '\\'; |
1584 | *p++ = *s++; |
1585 | } |
1586 | *p = 0; |
1587 | return MAL_SUCCEED; |
1588 | } |
1589 | |
1590 | |
1591 | str |
1592 | PCREsql2pcre(str *ret, const str *pat, const str *esc) |
1593 | { |
1594 | return sql2pcre(ret, *pat, *esc); |
1595 | } |
1596 | |
1597 | static str |
1598 | PCRElike4(bit *ret, const str *s, const str *pat, const str *esc, const bit *isens) |
1599 | { |
1600 | char *ppat = NULL; |
1601 | str r = sql2pcre(&ppat, *pat, *esc); |
1602 | |
1603 | if (!r) { |
1604 | assert(ppat); |
1605 | if (strcmp(ppat, str_nil) == 0) { |
1606 | *ret = FALSE; |
1607 | if (*isens) { |
1608 | if (mystrcasecmp(*s, *pat) == 0) |
1609 | *ret = TRUE; |
1610 | } else { |
1611 | if (strcmp(*s, *pat) == 0) |
1612 | *ret = TRUE; |
1613 | } |
1614 | } else { |
1615 | if (*isens) { |
1616 | r = PCREimatch(ret, s, &ppat); |
1617 | } else { |
1618 | r = PCREmatch(ret, s, &ppat); |
1619 | } |
1620 | } |
1621 | } |
1622 | if (ppat) |
1623 | GDKfree(ppat); |
1624 | return r; |
1625 | } |
1626 | |
1627 | str |
1628 | PCRElike3(bit *ret, const str *s, const str *pat, const str *esc) |
1629 | { |
1630 | bit no = FALSE; |
1631 | |
1632 | return PCRElike4(ret, s, pat, esc, &no); |
1633 | } |
1634 | |
1635 | str |
1636 | PCRElike2(bit *ret, const str *s, const str *pat) |
1637 | { |
1638 | char *esc = "" ; |
1639 | |
1640 | return PCRElike3(ret, s, pat, &esc); |
1641 | } |
1642 | |
1643 | str |
1644 | PCREnotlike3(bit *ret, const str *s, const str *pat, const str *esc) |
1645 | { |
1646 | str tmp; |
1647 | bit r; |
1648 | |
1649 | rethrow("str.not_like" , tmp, PCRElike3(&r, s, pat, esc)); |
1650 | *ret = !r; |
1651 | return MAL_SUCCEED; |
1652 | } |
1653 | |
1654 | str |
1655 | PCREnotlike2(bit *ret, const str *s, const str *pat) |
1656 | { |
1657 | str tmp; |
1658 | bit r; |
1659 | |
1660 | rethrow("str.not_like" , tmp, PCRElike2(&r, s, pat)); |
1661 | *ret = !r; |
1662 | return MAL_SUCCEED; |
1663 | } |
1664 | |
1665 | str |
1666 | PCREilike3(bit *ret, const str *s, const str *pat, const str *esc) |
1667 | { |
1668 | bit yes = TRUE; |
1669 | |
1670 | return PCRElike4(ret, s, pat, esc, &yes); |
1671 | } |
1672 | |
1673 | str |
1674 | PCREilike2(bit *ret, const str *s, const str *pat) |
1675 | { |
1676 | char *esc = "\\" ; |
1677 | |
1678 | return PCREilike3(ret, s, pat, &esc); |
1679 | } |
1680 | |
1681 | str |
1682 | PCREnotilike3(bit *ret, const str *s, const str *pat, const str *esc) |
1683 | { |
1684 | str tmp; |
1685 | bit r; |
1686 | |
1687 | rethrow("str.not_ilike" , tmp, PCREilike3(&r, s, pat, esc)); |
1688 | *ret = !r; |
1689 | return MAL_SUCCEED; |
1690 | } |
1691 | |
1692 | str |
1693 | PCREnotilike2(bit *ret, const str *s, const str *pat) |
1694 | { |
1695 | str tmp; |
1696 | bit r; |
1697 | |
1698 | rethrow("str.not_ilike" , tmp, PCREilike2(&r, s, pat)); |
1699 | *ret = !r; |
1700 | return MAL_SUCCEED; |
1701 | } |
1702 | |
1703 | static str |
1704 | BATPCRElike3(bat *ret, const bat *bid, const str *pat, const str *esc, const bit *isens, const bit *not) |
1705 | { |
1706 | char *ppat = NULL; |
1707 | str res = sql2pcre(&ppat, *pat, *esc); |
1708 | |
1709 | if (res == MAL_SUCCEED) { |
1710 | BAT *strs = BATdescriptor(*bid); |
1711 | BATiter strsi; |
1712 | BAT *r; |
1713 | bit *br; |
1714 | BUN p, q, i = 0; |
1715 | |
1716 | if (strs == NULL) { |
1717 | GDKfree(ppat); |
1718 | throw(MAL, "batstr.like" , OPERATION_FAILED); |
1719 | } |
1720 | |
1721 | r = COLnew(strs->hseqbase, TYPE_bit, BATcount(strs), TRANSIENT); |
1722 | if (r==NULL) { |
1723 | GDKfree(ppat); |
1724 | BBPunfix(strs->batCacheid); |
1725 | throw(MAL, "pcre.like3" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
1726 | } |
1727 | br = (bit*)Tloc(r, 0); |
1728 | strsi = bat_iterator(strs); |
1729 | |
1730 | if (strcmp(ppat, str_nil) == 0) { |
1731 | BATloop(strs, p, q) { |
1732 | const char *s = (str)BUNtvar(strsi, p); |
1733 | |
1734 | if (strcmp(s, *pat) == 0) |
1735 | br[i] = TRUE; |
1736 | else |
1737 | br[i] = FALSE; |
1738 | if (*not) |
1739 | br[i] = !br[i]; |
1740 | i++; |
1741 | } |
1742 | } else { |
1743 | int pos; |
1744 | #ifdef HAVE_LIBPCRE |
1745 | const char *err_p = NULL; |
1746 | int errpos = 0; |
1747 | int options = PCRE_UTF8 | PCRE_DOTALL; |
1748 | pcre *re; |
1749 | #else |
1750 | regex_t re; |
1751 | int options = REG_NEWLINE | REG_NOSUB | REG_EXTENDED; |
1752 | int errcode; |
1753 | #endif |
1754 | |
1755 | if (*isens) { |
1756 | #ifdef HAVE_LIBPCRE |
1757 | options |= PCRE_CASELESS; |
1758 | #else |
1759 | options |= REG_ICASE; |
1760 | #endif |
1761 | } |
1762 | if ( |
1763 | #ifdef HAVE_LIBPCRE |
1764 | (re = pcre_compile(ppat, options, &err_p, &errpos, NULL)) == NULL |
1765 | #else |
1766 | (errcode = regcomp(&re, ppat, options)) != 0 |
1767 | #endif |
1768 | ) { |
1769 | BBPunfix(strs->batCacheid); |
1770 | BBPunfix(r->batCacheid); |
1771 | res = createException(MAL, "pcre.match" , OPERATION_FAILED |
1772 | ": compilation of regular expression (%s) failed" |
1773 | #ifdef HAVE_LIBPCRE |
1774 | " at %d with '%s'" , ppat, errpos, err_p |
1775 | #else |
1776 | , ppat |
1777 | #endif |
1778 | ); |
1779 | GDKfree(ppat); |
1780 | return res; |
1781 | } |
1782 | |
1783 | BATloop(strs, p, q) { |
1784 | const char *s = (str)BUNtvar(strsi, p); |
1785 | |
1786 | if (*s == '\200') { |
1787 | br[i] = bit_nil; |
1788 | r->tnonil = false; |
1789 | r->tnil = true; |
1790 | } else { |
1791 | #ifdef HAVE_LIBPCRE |
1792 | pos = pcre_exec(re, NULL, s, (int) strlen(s), 0, 0, NULL, 0); |
1793 | #else |
1794 | int retval = regexec(&re, s, (size_t) 0, NULL, 0); |
1795 | pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0); |
1796 | #endif |
1797 | if (pos >= 0) |
1798 | br[i] = *not? FALSE:TRUE; |
1799 | else if (pos == -1) |
1800 | br[i] = *not? TRUE: FALSE; |
1801 | else { |
1802 | BBPunfix(strs->batCacheid); |
1803 | BBPunfix(r->batCacheid); |
1804 | res = createException(MAL, "pcre.match" , OPERATION_FAILED |
1805 | ": matching of regular expression (%s) failed with %d" , ppat, pos); |
1806 | GDKfree(ppat); |
1807 | return res; |
1808 | } |
1809 | } |
1810 | i++; |
1811 | } |
1812 | #ifdef HAVE_LIBPCRE |
1813 | pcre_free(re); |
1814 | #else |
1815 | regfree(&re); |
1816 | #endif |
1817 | } |
1818 | BATsetcount(r, i); |
1819 | r->tsorted = false; |
1820 | r->trevsorted = false; |
1821 | BATkey(r, false); |
1822 | |
1823 | BBPkeepref(*ret = r->batCacheid); |
1824 | BBPunfix(strs->batCacheid); |
1825 | GDKfree(ppat); |
1826 | } |
1827 | return res; |
1828 | } |
1829 | |
1830 | str |
1831 | BATPCRElike(bat *ret, const bat *bid, const str *pat, const str *esc) |
1832 | { |
1833 | bit no = FALSE; |
1834 | |
1835 | return BATPCRElike3(ret, bid, pat, esc, &no, &no); |
1836 | } |
1837 | |
1838 | str |
1839 | BATPCRElike2(bat *ret, const bat *bid, const str *pat) |
1840 | { |
1841 | char *esc = "\\" ; |
1842 | |
1843 | return BATPCRElike(ret, bid, pat, &esc); |
1844 | } |
1845 | |
1846 | str |
1847 | BATPCREnotlike(bat *ret, const bat *bid, const str *pat, const str *esc) |
1848 | { |
1849 | bit no = FALSE; |
1850 | bit yes = TRUE; |
1851 | |
1852 | return BATPCRElike3(ret, bid, pat, esc, &no, &yes); |
1853 | } |
1854 | |
1855 | str |
1856 | BATPCREnotlike2(bat *ret, const bat *bid, const str *pat) |
1857 | { |
1858 | char *esc = "\\" ; |
1859 | |
1860 | return BATPCREnotlike(ret, bid, pat, &esc); |
1861 | } |
1862 | |
1863 | str |
1864 | BATPCREilike(bat *ret, const bat *bid, const str *pat, const str *esc) |
1865 | { |
1866 | bit yes = TRUE; |
1867 | bit no = FALSE; |
1868 | |
1869 | return BATPCRElike3(ret, bid, pat, esc, &yes, &no); |
1870 | } |
1871 | |
1872 | str |
1873 | BATPCREilike2(bat *ret, const bat *bid, const str *pat) |
1874 | { |
1875 | char *esc = "\\" ; |
1876 | |
1877 | return BATPCREilike(ret, bid, pat, &esc); |
1878 | } |
1879 | |
1880 | str |
1881 | BATPCREnotilike(bat *ret, const bat *bid, const str *pat, const str *esc) |
1882 | { |
1883 | bit yes = TRUE; |
1884 | |
1885 | return BATPCRElike3(ret, bid, pat, esc, &yes, &yes); |
1886 | } |
1887 | |
1888 | str |
1889 | BATPCREnotilike2(bat *ret, const bat *bid, const str *pat) |
1890 | { |
1891 | char *esc = "\\" ; |
1892 | |
1893 | return BATPCREnotilike(ret, bid, pat, &esc); |
1894 | } |
1895 | |
1896 | str |
1897 | PCRElikeselect2(bat *ret, const bat *bid, const bat *sid, const str *pat, const str *esc, const bit *caseignore, const bit *anti) |
1898 | { |
1899 | BAT *b, *s = NULL, *bn = NULL; |
1900 | str res; |
1901 | char *ppat = NULL; |
1902 | bool use_re = false; |
1903 | bool use_strcmp = false; |
1904 | |
1905 | if ((b = BATdescriptor(*bid)) == NULL) { |
1906 | throw(MAL, "algebra.likeselect" , SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); |
1907 | } |
1908 | if (sid && !is_bat_nil(*sid) && *sid && (s = BATdescriptor(*sid)) == NULL) { |
1909 | BBPunfix(b->batCacheid); |
1910 | throw(MAL, "algebra.likeselect" , SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); |
1911 | } |
1912 | |
1913 | /* no escape, try if a simple list of keywords works */ |
1914 | if (is_strcmpable(*pat, *esc)) { |
1915 | use_re = true; |
1916 | use_strcmp = true; |
1917 | } else if (re_simple(*pat, **esc == '\200' ? 0 : (unsigned char) **esc)) { |
1918 | use_re = true; |
1919 | } else { |
1920 | res = sql2pcre(&ppat, *pat, *esc); |
1921 | if (res != MAL_SUCCEED) { |
1922 | BBPunfix(b->batCacheid); |
1923 | if (s) |
1924 | BBPunfix(s->batCacheid); |
1925 | return res; |
1926 | } |
1927 | if (strcmp(ppat, str_nil) == 0) { |
1928 | GDKfree(ppat); |
1929 | ppat = NULL; |
1930 | if (*caseignore) { |
1931 | ppat = GDKmalloc(strlen(*pat) + 3); |
1932 | if (ppat == NULL) { |
1933 | BBPunfix(b->batCacheid); |
1934 | if (s) |
1935 | BBPunfix(s->batCacheid); |
1936 | throw(MAL, "algebra.likeselect" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
1937 | } |
1938 | ppat[0] = '^'; |
1939 | strcpy(ppat + 1, *pat); |
1940 | strcat(ppat, "$" ); |
1941 | } |
1942 | } |
1943 | } |
1944 | |
1945 | if (use_re) { |
1946 | res = re_likeselect(&bn, b, s, *pat, (bool) *caseignore, (bool) *anti, use_strcmp, **esc == '\200' ? 0 : (unsigned char) **esc); |
1947 | } else if (ppat == NULL) { |
1948 | /* no pattern and no special characters: can use normal select */ |
1949 | bn = BATselect(b, s, *pat, NULL, true, true, *anti); |
1950 | if (bn == NULL) |
1951 | res = createException(MAL, "algebra.likeselect" , GDK_EXCEPTION); |
1952 | else |
1953 | res = MAL_SUCCEED; |
1954 | } else { |
1955 | res = pcre_likeselect(&bn, b, s, ppat, (bool) *caseignore, (bool) *anti); |
1956 | } |
1957 | BBPunfix(b->batCacheid); |
1958 | if (s) |
1959 | BBPunfix(s->batCacheid); |
1960 | GDKfree(ppat); |
1961 | if (res != MAL_SUCCEED) |
1962 | return res; |
1963 | assert(bn); |
1964 | *ret = bn->batCacheid; |
1965 | BBPkeepref(bn->batCacheid); |
1966 | return MAL_SUCCEED; |
1967 | } |
1968 | |
1969 | str |
1970 | PCRElikeselect1(bat *ret, const bat *bid, const bat *cid, const str *pat, const str *esc, const bit *anti) |
1971 | { |
1972 | const bit f = TRUE; |
1973 | return PCRElikeselect2(ret, bid, cid, pat, esc, &f, anti); |
1974 | } |
1975 | |
1976 | str |
1977 | PCRElikeselect3(bat *ret, const bat *bid, const bat *sid, const str *pat, const str *esc, const bit *anti) |
1978 | { |
1979 | const bit f = FALSE; |
1980 | return PCRElikeselect2(ret, bid, sid, pat, esc, &f, anti); |
1981 | } |
1982 | |
1983 | str |
1984 | PCRElikeselect4(bat *ret, const bat *bid, const bat *cid, const str *pat, const bit *anti) |
1985 | { |
1986 | const bit f = TRUE; |
1987 | const str esc ="" ; |
1988 | return PCRElikeselect2(ret, bid, cid, pat, &esc, &f, anti); |
1989 | } |
1990 | |
1991 | str |
1992 | PCRElikeselect5(bat *ret, const bat *bid, const bat *sid, const str *pat, const bit *anti) |
1993 | { |
1994 | const bit f = FALSE; |
1995 | const str esc ="" ; |
1996 | return PCRElikeselect2(ret, bid, sid, pat, &esc, &f, anti); |
1997 | } |
1998 | |
/* store oid `o' at index b->batCount of the array at b->theap.base and
 * increment b->batCount; no capacity check is done here.  `b' is
 * evaluated twice and is not parenthesised, so pass a plain
 * identifier. */
#define APPEND(b, o)		(((oid *) b->theap.base)[b->batCount++] = (o))
/* address inside s##vars at the offset VarHeapVal() yields for entry
 * `x'; `s' is token-pasted, so VALUE(l, i) uses the locals lvars,
 * lvals and lwidth.
 * NOTE(review): presumably this is the string value at position x --
 * confirm against VarHeapVal. */
#define VALUE(s, x)		(s##vars + VarHeapVal(s##vals, (x), s##width))
2001 | |
2002 | static char * |
2003 | pcrejoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, |
2004 | const char *esc, bool caseignore) |
2005 | { |
2006 | struct canditer lci, rci; |
2007 | const char *lvals, *rvals; |
2008 | const char *lvars, *rvars; |
2009 | int lwidth, rwidth; |
2010 | const char *vl, *vr; |
2011 | oid lastl = 0; /* last value inserted into r1 */ |
2012 | BUN nl; |
2013 | BUN newcap; |
2014 | oid lo, ro; |
2015 | int rskipped = 0; /* whether we skipped values in r */ |
2016 | char *msg = MAL_SUCCEED; |
2017 | RE *re = NULL; |
2018 | char *pcrepat = NULL; |
2019 | #ifdef HAVE_LIBPCRE |
2020 | pcre *pcrere = NULL; |
2021 | pcre_extra *pcreex = NULL; |
2022 | const char *err_p = NULL; |
2023 | int errpos; |
2024 | int pcreopt = PCRE_UTF8 | PCRE_MULTILINE; |
2025 | int pcrestopt = (sl ? BATcount(sl) : BATcount(l)) > JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0; |
2026 | #else |
2027 | int pcrere = 0; |
2028 | regex_t regex; |
2029 | int options = REG_NEWLINE | REG_NOSUB | REG_EXTENDED; |
2030 | int errcode = -1; |
2031 | #endif |
2032 | |
2033 | |
2034 | if (caseignore) |
2035 | #ifdef HAVE_LIBPCRE |
2036 | pcreopt |= PCRE_CASELESS; |
2037 | #else |
2038 | options |= REG_ICASE; |
2039 | #endif |
2040 | |
2041 | ALGODEBUG fprintf(stderr, "#pcrejoin(l=%s#" BUNFMT "[%s]%s%s," |
2042 | "r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s," |
2043 | "sr=%s#" BUNFMT "%s%s)\n" , |
2044 | BATgetId(l), BATcount(l), ATOMname(l->ttype), |
2045 | l->tsorted ? "-sorted" : "" , |
2046 | l->trevsorted ? "-revsorted" : "" , |
2047 | BATgetId(r), BATcount(r), ATOMname(r->ttype), |
2048 | r->tsorted ? "-sorted" : "" , |
2049 | r->trevsorted ? "-revsorted" : "" , |
2050 | sl ? BATgetId(sl) : "NULL" , sl ? BATcount(sl) : 0, |
2051 | sl && sl->tsorted ? "-sorted" : "" , |
2052 | sl && sl->trevsorted ? "-revsorted" : "" , |
2053 | sr ? BATgetId(sr) : "NULL" , sr ? BATcount(sr) : 0, |
2054 | sr && sr->tsorted ? "-sorted" : "" , |
2055 | sr && sr->trevsorted ? "-revsorted" : "" ); |
2056 | |
2057 | assert(ATOMtype(l->ttype) == ATOMtype(r->ttype)); |
2058 | assert(ATOMtype(l->ttype) == TYPE_str); |
2059 | assert(sl == NULL || sl->tsorted); |
2060 | assert(sr == NULL || sr->tsorted); |
2061 | |
2062 | canditer_init(&lci, l, sl); |
2063 | canditer_init(&rci, r, sr); |
2064 | |
2065 | lvals = (const char *) Tloc(l, 0); |
2066 | rvals = (const char *) Tloc(r, 0); |
2067 | assert(r->tvarsized && r->ttype); |
2068 | lvars = l->tvheap->base; |
2069 | rvars = r->tvheap->base; |
2070 | lwidth = l->twidth; |
2071 | rwidth = r->twidth; |
2072 | |
2073 | r1->tkey = true; |
2074 | r1->tsorted = true; |
2075 | r1->trevsorted = true; |
2076 | r2->tkey = true; |
2077 | r2->tsorted = true; |
2078 | r2->trevsorted = true; |
2079 | |
2080 | /* nested loop implementation for PCRE join */ |
2081 | for (BUN ri = 0; ri < rci.ncand; ri++) { |
2082 | ro = canditer_next(&rci); |
2083 | vr = VALUE(r, ro - r->hseqbase); |
2084 | if (strcmp(vr, str_nil) == 0) |
2085 | continue; |
2086 | if (re_simple(vr, esc && *esc != '\200' ? (unsigned char) *esc : 0)) { |
2087 | re = re_create(vr, caseignore, esc && *esc != '\200' ? (unsigned char) *esc : 0); |
2088 | if (re == NULL) { |
2089 | msg = createException(MAL, "pcre.join" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
2090 | goto bailout; |
2091 | } |
2092 | } else { |
2093 | assert(pcrepat == NULL); |
2094 | msg = sql2pcre(&pcrepat, vr, esc); |
2095 | if (msg != MAL_SUCCEED) |
2096 | goto bailout; |
2097 | if (strcmp(pcrepat, str_nil) == 0) { |
2098 | GDKfree(pcrepat); |
2099 | if (caseignore) { |
2100 | pcrepat = GDKmalloc(strlen(vr) + 3); |
2101 | if (pcrepat == NULL) { |
2102 | msg = createException(MAL, "pcre.join" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
2103 | goto bailout; |
2104 | } |
2105 | sprintf(pcrepat, "^%s$" , vr); |
2106 | } else { |
2107 | /* a simple strcmp suffices */ |
2108 | pcrepat = NULL; |
2109 | } |
2110 | } |
2111 | if (pcrepat) { |
2112 | #ifdef HAVE_LIBPCRE |
2113 | pcrere = pcre_compile(pcrepat, pcreopt, &err_p, &errpos, NULL); |
2114 | if (pcrere == NULL) { |
2115 | msg = createException(MAL, "pcre.join" , OPERATION_FAILED |
2116 | ": pcre compile of pattern (%s) " |
2117 | "failed at %d with '%s'" , |
2118 | pcrepat, errpos, err_p); |
2119 | goto bailout; |
2120 | } |
2121 | pcreex = pcre_study(pcrere, pcrestopt, &err_p); |
2122 | if (err_p != NULL) { |
2123 | msg = createException(MAL, "pcre.join" , OPERATION_FAILED |
2124 | ": pcre study of pattern (%s) " |
2125 | "failed with '%s'" , pcrepat, err_p); |
2126 | goto bailout; |
2127 | } |
2128 | #else |
2129 | if ((errcode = regcomp(®ex, pcrepat, options)) != 0) { |
2130 | msg = createException(MAL, "pcre.join" , OPERATION_FAILED |
2131 | ": pcre compile of pattern (%s)" , |
2132 | pcrepat); |
2133 | goto bailout; |
2134 | } |
2135 | pcrere = 1; |
2136 | #endif |
2137 | GDKfree(pcrepat); |
2138 | pcrepat = NULL; |
2139 | } |
2140 | } |
2141 | nl = 0; |
2142 | canditer_reset(&lci); |
2143 | for (BUN li = 0; li < lci.ncand; li++) { |
2144 | lo = canditer_next(&lci); |
2145 | vl = VALUE(l, lo - l->hseqbase); |
2146 | if (strcmp(vl, str_nil) == 0) |
2147 | continue; |
2148 | if (re) { |
2149 | if (caseignore) { |
2150 | if (!re_match_ignore(vl, re)) |
2151 | continue; |
2152 | } else { |
2153 | if (!re_match_no_ignore(vl, re)) |
2154 | continue; |
2155 | } |
2156 | } else if (pcrere) { |
2157 | #ifdef HAVE_LIBPCRE |
2158 | if (pcre_exec(pcrere, pcreex, vl, (int) strlen(vl), 0, 0, NULL, 0) < 0) |
2159 | continue; |
2160 | #else |
2161 | int retval = regexec(®ex, vl, (size_t) 0, NULL, 0); |
2162 | if (retval == REG_NOMATCH || retval == REG_ENOSYS) |
2163 | continue; |
2164 | #endif |
2165 | } else { |
2166 | if (strcmp(vl, vr) != 0) |
2167 | continue; |
2168 | } |
2169 | if (BUNlast(r1) == BATcapacity(r1)) { |
2170 | newcap = BATgrows(r1); |
2171 | BATsetcount(r1, BATcount(r1)); |
2172 | BATsetcount(r2, BATcount(r2)); |
2173 | if (BATextend(r1, newcap) != GDK_SUCCEED || |
2174 | BATextend(r2, newcap) != GDK_SUCCEED) { |
2175 | msg = createException(MAL, "pcre.join" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
2176 | goto bailout; |
2177 | } |
2178 | assert(BATcapacity(r1) == BATcapacity(r2)); |
2179 | } |
2180 | if (BATcount(r1) > 0) { |
2181 | if (lastl + 1 != lo) |
2182 | r1->tseqbase = oid_nil; |
2183 | if (nl == 0) { |
2184 | r2->trevsorted = false; |
2185 | if (lastl > lo) { |
2186 | r1->tsorted = false; |
2187 | r1->tkey = false; |
2188 | } else if (lastl < lo) { |
2189 | r1->trevsorted = false; |
2190 | } else { |
2191 | r1->tkey = false; |
2192 | } |
2193 | } |
2194 | } |
2195 | APPEND(r1, lo); |
2196 | APPEND(r2, ro); |
2197 | lastl = lo; |
2198 | nl++; |
2199 | } |
2200 | if (re) { |
2201 | re_destroy(re); |
2202 | re = NULL; |
2203 | } |
2204 | if (pcrere) { |
2205 | #ifdef HAVE_LIBPCRE |
2206 | pcre_free_study(pcreex); |
2207 | pcre_free(pcrere); |
2208 | pcrere = NULL; |
2209 | pcreex = NULL; |
2210 | #else |
2211 | regfree(®ex); |
2212 | pcrere = 0; |
2213 | #endif |
2214 | } |
2215 | if (nl > 1) { |
2216 | r2->tkey = false; |
2217 | r2->tseqbase = oid_nil; |
2218 | r1->trevsorted = false; |
2219 | } else if (nl == 0) { |
2220 | rskipped = BATcount(r2) > 0; |
2221 | } else if (rskipped) { |
2222 | r2->tseqbase = oid_nil; |
2223 | } |
2224 | } |
2225 | assert(BATcount(r1) == BATcount(r2)); |
2226 | /* also set other bits of heap to correct value to indicate size */ |
2227 | BATsetcount(r1, BATcount(r1)); |
2228 | BATsetcount(r2, BATcount(r2)); |
2229 | if (BATcount(r1) > 0) { |
2230 | if (BATtdense(r1)) |
2231 | r1->tseqbase = ((oid *) r1->theap.base)[0]; |
2232 | if (BATtdense(r2)) |
2233 | r2->tseqbase = ((oid *) r2->theap.base)[0]; |
2234 | } else { |
2235 | r1->tseqbase = r2->tseqbase = 0; |
2236 | } |
2237 | ALGODEBUG fprintf(stderr, "#pcrejoin(l=%s,r=%s)=(%s#" BUNFMT"%s%s,%s#" BUNFMT"%s%s\n" , |
2238 | BATgetId(l), BATgetId(r), |
2239 | BATgetId(r1), BATcount(r1), |
2240 | r1->tsorted ? "-sorted" : "" , |
2241 | r1->trevsorted ? "-revsorted" : "" , |
2242 | BATgetId(r2), BATcount(r2), |
2243 | r2->tsorted ? "-sorted" : "" , |
2244 | r2->trevsorted ? "-revsorted" : "" ); |
2245 | return MAL_SUCCEED; |
2246 | |
2247 | bailout: |
2248 | if (re) |
2249 | re_destroy(re); |
2250 | if (pcrepat) |
2251 | GDKfree(pcrepat); |
2252 | #ifdef HAVE_LIBPCRE |
2253 | if (pcreex) |
2254 | pcre_free_study(pcreex); |
2255 | if (pcrere) |
2256 | pcre_free(pcrere); |
2257 | #else |
2258 | if (pcrere) |
2259 | regfree(®ex); |
2260 | #endif |
2261 | |
2262 | assert(msg != MAL_SUCCEED); |
2263 | return msg; |
2264 | } |
2265 | |
2266 | static str |
2267 | PCREjoin(bat *r1, bat *r2, bat lid, bat rid, bat slid, bat srid, |
2268 | const char *esc, bool caseignore) |
2269 | { |
2270 | BAT *left = NULL, *right = NULL, *candleft = NULL, *candright = NULL; |
2271 | BAT *result1 = NULL, *result2 = NULL; |
2272 | char *msg = MAL_SUCCEED; |
2273 | |
2274 | if ((left = BATdescriptor(lid)) == NULL) |
2275 | goto fail; |
2276 | if ((right = BATdescriptor(rid)) == NULL) |
2277 | goto fail; |
2278 | if (!is_bat_nil(slid) && (candleft = BATdescriptor(slid)) == NULL) |
2279 | goto fail; |
2280 | if (!is_bat_nil(srid) && (candright = BATdescriptor(srid)) == NULL) |
2281 | goto fail; |
2282 | result1 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT); |
2283 | result2 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT); |
2284 | if (result1 == NULL || result2 == NULL) { |
2285 | msg = createException(MAL, "pcre.join" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
2286 | goto fail; |
2287 | } |
2288 | result1->tnil = false; |
2289 | result1->tnonil = true; |
2290 | result1->tkey = true; |
2291 | result1->tsorted = true; |
2292 | result1->trevsorted = true; |
2293 | result1->tseqbase = 0; |
2294 | result2->tnil = false; |
2295 | result2->tnonil = true; |
2296 | result2->tkey = true; |
2297 | result2->tsorted = true; |
2298 | result2->trevsorted = true; |
2299 | result2->tseqbase = 0; |
2300 | msg = pcrejoin(result1, result2, left, right, candleft, candright, |
2301 | esc, caseignore); |
2302 | if (msg) |
2303 | goto fail; |
2304 | *r1 = result1->batCacheid; |
2305 | *r2 = result2->batCacheid; |
2306 | BBPkeepref(*r1); |
2307 | BBPkeepref(*r2); |
2308 | BBPunfix(left->batCacheid); |
2309 | BBPunfix(right->batCacheid); |
2310 | if (candleft) |
2311 | BBPunfix(candleft->batCacheid); |
2312 | if (candright) |
2313 | BBPunfix(candright->batCacheid); |
2314 | return MAL_SUCCEED; |
2315 | |
2316 | fail: |
2317 | if (left) |
2318 | BBPunfix(left->batCacheid); |
2319 | if (right) |
2320 | BBPunfix(right->batCacheid); |
2321 | if (candleft) |
2322 | BBPunfix(candleft->batCacheid); |
2323 | if (candright) |
2324 | BBPunfix(candright->batCacheid); |
2325 | if (result1) |
2326 | BBPunfix(result1->batCacheid); |
2327 | if (result2) |
2328 | BBPunfix(result2->batCacheid); |
2329 | if (msg) |
2330 | return msg; |
2331 | throw(MAL, "pcre.join" , SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); |
2332 | } |
2333 | |
2334 | str |
2335 | LIKEjoin(bat *r1, bat *r2, const bat *lid, const bat *rid, const str *esc, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate) |
2336 | { |
2337 | (void) nil_matches; |
2338 | (void) estimate; |
2339 | return PCREjoin(r1, r2, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0, *esc, 0); |
2340 | } |
2341 | |
2342 | str |
2343 | LIKEjoin1(bat *r1, bat *r2, const bat *lid, const bat *rid, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate) |
2344 | { |
2345 | (void) nil_matches; |
2346 | (void) estimate; |
2347 | return PCREjoin(r1, r2, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0, "" , 0); |
2348 | } |
2349 | |
2350 | str |
2351 | ILIKEjoin(bat *r1, bat *r2, const bat *lid, const bat *rid, const str *esc, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate) |
2352 | { |
2353 | (void) nil_matches; |
2354 | (void) estimate; |
2355 | return PCREjoin(r1, r2, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0, *esc, 1); |
2356 | } |
2357 | |
2358 | str |
2359 | ILIKEjoin1(bat *r1, bat *r2, const bat *lid, const bat *rid, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate) |
2360 | { |
2361 | (void) nil_matches; |
2362 | (void) estimate; |
2363 | return PCREjoin(r1, r2, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0, "" , 1); |
2364 | } |
2365 | |