1/*
2 * This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
5 *
6 * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V.
7 */
8
9#include "monetdb_config.h"
10#include "gdk.h"
11#include "gdk_private.h"
12#include "gdk_cand.h"
13
14/* String Atom Implementation
15 *
16 * Strings are stored in two parts. The first part is the normal tail
17 * heap which contains a list of offsets. The second part is the
18 * theap which contains the actual strings. The offsets in the tail
19 * heap (a.k.a. offset heap) point into the theap (a.k.a. string
20 * heap). Strings are NULL-terminated and are stored without any
21 * escape sequences. Strings are encoded using the UTF-8 encoding
22 * of Unicode. This means that individual "characters" (really,
23 * Unicode code points) can be between one and four bytes long.
24 *
25 * Because in many typical situations there are lots of duplicated
26 * string values that are being stored in a table, but also in many
27 * (other) typical situations there are very few duplicated string
28 * values stored, a scheme has been introduced to cater to both
29 * situations.
30 *
31 * When the string heap is "small" (defined as less than 64KiB), the
32 * string heap is fully duplicate eliminated. When the string heap
33 * grows beyond this size, the heap is not kept free of duplicate
34 * strings, but there is then a heuristic that tries to limit the
35 * number of duplicates.
36 *
37 * This is done by having a fixed sized hash table at the start of the
38 * string heap, and allocating space for collision lists in the first
39 * 64KiB of the string heap. After the first 64KiB no extra space is
40 * allocated for lists, so hash collisions cannot be resolved.
41 */
42
43/* some of these macros are duplicates from gdk_atoms.c */
44#define num08(x) ((x) >= '0' && (x) <= '7')
45#define base08(x) ((x) - '0')
46#define mult08(x) ((x) << 3)
47
48#define num16(x) isxdigit((unsigned char) (x))
49#define base16(x) (((x) >= 'a' && (x) <= 'f') ? ((x) - 'a' + 10) : ((x) >= 'A' && (x) <= 'F') ? ((x) - 'A' + 10) : (x) - '0')
50#define mult16(x) ((x) << 4)
51
52#define atommem(size) \
53 do { \
54 if (*dst == NULL || *len < (size)) { \
55 GDKfree(*dst); \
56 *len = (size); \
57 *dst = GDKmalloc(*len); \
58 if (*dst == NULL) { \
59 *len = 0; \
60 return -1; \
61 } \
62 } \
63 } while (0)
64
65const char str_nil[2] = { '\200', 0 };
66
67int
68strNil(const char *s)
69{
70 return GDK_STRNIL(s);
71}
72
73size_t
74strLen(const char *s)
75{
76 return GDK_STRLEN(s);
77}
78
79int
80strCmp(const char *l, const char *r)
81{
82 return GDK_STRCMP(l, r);
83}
84
85int
86strCmpNoNil(const unsigned char *l, const unsigned char *r)
87{
88 while (*l == *r) {
89 if (*l == 0)
90 return 0;
91 l++;
92 r++;
93 }
94 return (*l < *r) ? -1 : 1;
95}
96
97void
98strHeap(Heap *d, size_t cap)
99{
100 size_t size;
101
102 cap = MAX(cap, BATTINY);
103 size = GDK_STRHASHTABLE * sizeof(stridx_t) + MIN(GDK_ELIMLIMIT, cap * GDK_VARALIGN);
104 if (HEAPalloc(d, size, 1) == GDK_SUCCEED) {
105 d->free = GDK_STRHASHTABLE * sizeof(stridx_t);
106 d->dirty = true;
107 memset(d->base, 0, d->free);
108 d->hashash = false;
109#ifndef NDEBUG
110 /* fill should solve initialization problems within valgrind */
111 memset(d->base + d->free, 0, d->size - d->free);
112#endif
113 }
114}
115
116
117BUN
118strHash(const char *s)
119{
120 BUN res;
121
122 GDK_STRHASH(s, res);
123 return res;
124}
125
126void
127strCleanHash(Heap *h, bool rebuild)
128{
129 stridx_t newhash[GDK_STRHASHTABLE];
130 size_t pad, pos;
131 const size_t extralen = h->hashash ? EXTRALEN : 0;
132 BUN off, strhash;
133 const char *s;
134
135 (void) rebuild;
136 if (!h->cleanhash)
137 return;
138 /* rebuild hash table for double elimination
139 *
140 * If appending strings to the BAT was aborted, if the heap
141 * was memory mapped, the hash in the string heap may well be
142 * incorrect. Therefore we don't trust it when we read in a
143 * string heap and we rebuild the complete table (it is small,
144 * so this won't take any time at all).
145 * Note that we will only do this the first time the heap is
146 * loaded, and only for heaps that existed when the server was
147 * started. */
148 memset(newhash, 0, sizeof(newhash));
149 pos = GDK_STRHASHSIZE;
150 while (pos < h->free && pos < GDK_ELIMLIMIT) {
151 pad = GDK_VARALIGN - (pos & (GDK_VARALIGN - 1));
152 if (pad < sizeof(stridx_t))
153 pad += GDK_VARALIGN;
154 pos += pad + extralen;
155 s = h->base + pos;
156 if (h->hashash)
157 strhash = ((const BUN *) s)[-1];
158 else
159 GDK_STRHASH(s, strhash);
160 off = strhash & GDK_STRHASHMASK;
161 newhash[off] = (stridx_t) (pos - extralen - sizeof(stridx_t));
162 pos += GDK_STRLEN(s);
163 }
164 /* only set dirty flag if the hash table actually changed */
165 if (memcmp(newhash, h->base, sizeof(newhash)) != 0) {
166 memcpy(h->base, newhash, sizeof(newhash));
167 if (h->storage == STORE_MMAP) {
168 if (!(GDKdebug & NOSYNCMASK))
169 (void) MT_msync(h->base, GDK_STRHASHSIZE);
170 } else
171 h->dirty = true;
172 }
173#ifndef NDEBUG
174 if (GDK_ELIMDOUBLES(h)) {
175 pos = GDK_STRHASHSIZE;
176 while (pos < h->free) {
177 pad = GDK_VARALIGN - (pos & (GDK_VARALIGN - 1));
178 if (pad < sizeof(stridx_t))
179 pad += GDK_VARALIGN;
180 pos += pad + extralen;
181 s = h->base + pos;
182 assert(strLocate(h, s) != 0);
183 pos += GDK_STRLEN(s);
184 }
185 }
186#endif
187 h->cleanhash = false;
188}
189
190/*
191 * The strPut routine. The routine strLocate can be used to identify
192 * the location of a string in the heap if it exists. Otherwise it
193 * returns zero.
194 */
195var_t
196strLocate(Heap *h, const char *v)
197{
198 stridx_t *ref, *next;
199 const size_t extralen = h->hashash ? EXTRALEN : 0;
200
201 /* search hash-table, if double-elimination is still in place */
202 BUN off;
203 GDK_STRHASH(v, off);
204 off &= GDK_STRHASHMASK;
205
206 /* should only use strLocate iff fully double eliminated */
207 assert(GDK_ELIMBASE(h->free) == 0);
208
209 /* search the linked list */
210 for (ref = ((stridx_t *) h->base) + off; *ref; ref = next) {
211 next = (stridx_t *) (h->base + *ref);
212 if (GDK_STRCMP(v, (str) (next + 1) + extralen) == 0)
213 return (var_t) ((sizeof(stridx_t) + *ref + extralen)); /* found */
214 }
215 return 0;
216}
217
218var_t
219strPut(Heap *h, var_t *dst, const char *v)
220{
221 size_t elimbase = GDK_ELIMBASE(h->free);
222 size_t pad;
223 size_t pos, len = GDK_STRLEN(v);
224 const size_t extralen = h->hashash ? EXTRALEN : 0;
225 stridx_t *bucket;
226 BUN off, strhash;
227
228 GDK_STRHASH(v, off);
229 strhash = off;
230 off &= GDK_STRHASHMASK;
231 bucket = ((stridx_t *) h->base) + off;
232
233 if (*bucket) {
234 /* the hash list is not empty */
235 if (*bucket < GDK_ELIMLIMIT) {
236 /* small string heap (<64KiB) -- fully double
237 * eliminated: search the linked list */
238 const stridx_t *ref = bucket;
239
240 do {
241 pos = *ref + sizeof(stridx_t) + extralen;
242 if (GDK_STRCMP(v, h->base + pos) == 0) {
243 /* found */
244 return *dst = (var_t) pos;
245 }
246 ref = (stridx_t *) (h->base + *ref);
247 } while (*ref);
248 } else {
249 /* large string heap (>=64KiB) -- there is no
250 * linked list, so only look at single
251 * entry */
252 pos = *bucket + extralen;
253 if (GDK_STRCMP(v, h->base + pos) == 0) {
254 /* already in heap: reuse */
255 return *dst = (var_t) pos;
256 }
257 }
258 }
259 /* the string was not found in the heap, we need to enter it */
260
261 if (v[0] != '\200' || v[1] != '\0') {
262 /* check that string is correctly encoded UTF-8; there
263 * was no need to do this earlier: if the string was
264 * found above, it must have gone through here in the
265 * past */
266 int nutf8 = 0;
267 int m = 0;
268 for (size_t i = 0; v[i]; i++) {
269 if (nutf8 > 0) {
270 if ((v[i] & 0xC0) != 0x80 ||
271 (m != 0 && (v[i] & m) == 0)) {
272 badutf8:
273 GDKerror("strPut: incorrectly encoded UTF-8");
274 return 0;
275 }
276 m = 0;
277 nutf8--;
278 } else if ((v[i] & 0xE0) == 0xC0) {
279 nutf8 = 1;
280 if ((v[i] & 0x1E) == 0)
281 goto badutf8;
282 } else if ((v[i] & 0xF0) == 0xE0) {
283 nutf8 = 2;
284 if ((v[i] & 0x0F) == 0)
285 m = 0x20;
286 } else if ((v[i] & 0xF8) == 0xF0) {
287 nutf8 = 3;
288 if ((v[i] & 0x07) == 0)
289 m = 0x30;
290 } else if ((v[i] & 0x80) != 0) {
291 goto badutf8;
292 }
293 }
294 }
295
296 pad = GDK_VARALIGN - (h->free & (GDK_VARALIGN - 1));
297 if (elimbase == 0) { /* i.e. h->free < GDK_ELIMLIMIT */
298 if (pad < sizeof(stridx_t)) {
299 /* make room for hash link */
300 pad += GDK_VARALIGN;
301 }
302 } else if (extralen == 0) { /* i.e., h->hashash == FALSE */
303 /* no VARSHIFT and no string hash value stored => no
304 * padding/alignment needed */
305 pad = 0;
306 } else {
307 /* pad to align on VARALIGN for VARSHIFT and/or string
308 * hash value */
309 pad &= (GDK_VARALIGN - 1);
310 }
311
312 /* check heap for space (limited to a certain maximum after
313 * which nils are inserted) */
314 if (h->free + pad + len + extralen >= h->size) {
315 size_t newsize = MAX(h->size, 4096);
316
317 /* double the heap size until we have enough space */
318 do {
319 if (newsize < 4 * 1024 * 1024)
320 newsize <<= 1;
321 else
322 newsize += 4 * 1024 * 1024;
323 } while (newsize <= h->free + pad + len + extralen);
324
325 assert(newsize);
326
327 if (h->free + pad + len + extralen >= (size_t) VAR_MAX) {
328 GDKerror("strPut: string heaps gets larger than %zuGiB.\n", (size_t) VAR_MAX >> 30);
329 return 0;
330 }
331 HEAPDEBUG fprintf(stderr, "#HEAPextend in strPut %s %zu %zu\n", h->filename, h->size, newsize);
332 if (HEAPextend(h, newsize, true) != GDK_SUCCEED) {
333 return 0;
334 }
335#ifndef NDEBUG
336 /* fill should solve initialization problems within
337 * valgrind */
338 memset(h->base + h->free, 0, h->size - h->free);
339#endif
340
341 /* make bucket point into the new heap */
342 bucket = ((stridx_t *) h->base) + off;
343 }
344
345 /* insert string */
346 pos = h->free + pad + extralen;
347 *dst = (var_t) pos;
348 memcpy(h->base + pos, v, len);
349 if (h->hashash) {
350 ((BUN *) (h->base + pos))[-1] = strhash;
351#if EXTRALEN > SIZEOF_BUN
352 ((BUN *) (h->base + pos))[-2] = (BUN) len;
353#endif
354 }
355 h->free += pad + len + extralen;
356 h->dirty = true;
357
358 /* maintain hash table */
359 pos -= extralen;
360 if (elimbase == 0) { /* small string heap: link the next pointer */
361 /* the stridx_t next pointer directly precedes the
362 * string and optional (depending on hashash) hash
363 * value */
364 pos -= sizeof(stridx_t);
365 *(stridx_t *) (h->base + pos) = *bucket;
366 }
367 *bucket = (stridx_t) pos; /* set bucket to the new string */
368
369 return *dst;
370}
371
372/*
373 * Convert an "" separated string to a GDK string value, checking that
374 * the input is correct UTF-8.
375 */
376
377/*
378 UTF-8 encoding is as follows:
379U-00000000 - U-0000007F: 0xxxxxxx
380U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
381U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
382U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
383U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
384U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
385*/
386/* To be correctly coded UTF-8, the sequence should be the shortest
387 * possible encoding of the value being encoded. This means that for
388 * an encoding of length n+1 (1 <= n <= 5), at least one of the bits
389 * in utf8chkmsk[n] should be non-zero (else the encoding could be
390 * shorter). */
391static int utf8chkmsk[] = {
392 0x0000007f,
393 0x00000780,
394 0x0000f800,
395 0x001f0000,
396 0x03e00000,
397 0x7c000000,
398};
399
400ssize_t
401GDKstrFromStr(unsigned char *restrict dst, const unsigned char *restrict src, ssize_t len)
402{
403 unsigned char *p = dst;
404 const unsigned char *cur = src, *end = src + len;
405 bool escaped = false;
406 int mask = 0, n, c, utf8char = 0;
407
408 if (len >= 2 && strcmp((const char *) src, str_nil) == 0) {
409 strcpy((char *) dst, str_nil);
410 return 1;
411 }
412
413 /* copy it in, while performing the correct escapes */
414 /* n is the number of follow-on bytes left in a multi-byte
415 * UTF-8 sequence */
416 for (cur = src, n = 0; cur < end || escaped; cur++) {
417 /* first convert any \ escapes and store value in c */
418 if (escaped) {
419 switch (*cur) {
420 case '0':
421 case '1':
422 case '2':
423 case '3':
424 case '4':
425 case '5':
426 case '6':
427 case '7':
428 /* \ with up to three octal digits */
429 c = base08(*cur);
430 if (num08(cur[1])) {
431 cur++;
432 c = mult08(c) + base08(*cur);
433 if (num08(cur[1])) {
434 if (c > 037) {
435 /* octal
436 * escape
437 * sequence
438 * out or
439 * range */
440 GDKerror("not an octal number\n");
441 return -1;
442 }
443 cur++;
444 c = mult08(c) + base08(*cur);
445 assert(c >= 0 && c <= 0377);
446 }
447 }
448 break;
449 case 'x':
450 /* \x with one or two hexadecimal digits */
451 if (num16(cur[1])) {
452 cur++;
453 c = base16(*cur);
454 if (num16(cur[1])) {
455 cur++;
456 c = mult16(c) + base16(*cur);
457 }
458 } else
459 c = 'x';
460 break;
461 case 'u':
462 case 'U':
463 /* \u with four hexadecimal digits or
464 * \U with eight hexadecimal digits */
465 if (n > 0) {
466 /* not when in the middle of a
467 * UTF-8 sequence */
468 goto notutf8;
469 }
470 c = 0;
471 for (n = *cur == 'U' ? 8 : 4; n > 0; n--) {
472 cur++;
473 if (!num16(*cur)) {
474 GDKerror("not a Unicode code point escape\n");
475 return -1;
476 }
477 c = c << 4 | base16(*cur);
478 }
479 /* n == 0 now */
480 if (c == 0 || c > 0x10FFFF ||
481 (c & 0xFFF800) == 0xD800) {
482 GDKerror("illegal Unicode code point\n");
483 return -1;
484 }
485 if (c < 0x80) {
486 *p++ = (unsigned char) c;
487 } else {
488 if (c < 0x800) {
489 *p++ = 0xC0 | (c >> 6);
490 } else {
491 if (c < 0x10000) {
492 *p++ = 0xE0 | (c >> 12);
493 } else {
494 *p++ = 0xF0 | (c >> 18);
495 *p++ = 0x80 | ((c >> 12) & 0x3F);
496 }
497 *p++ = 0x80 | ((c >> 6) & 0x3F);
498 }
499 *p++ = 0x80 | (c & 0x3F);
500 }
501 escaped = false;
502 continue;
503 case 'a':
504 c = '\a';
505 break;
506 case 'b':
507 c = '\b';
508 break;
509 case 'f':
510 c = '\f';
511 break;
512 case 'n':
513 c = '\n';
514 break;
515 case 'r':
516 c = '\r';
517 break;
518 case 't':
519 c = '\t';
520 break;
521 case '\0':
522 c = '\\';
523 break;
524 case '\'':
525 case '\\':
526 /* \' and \\ can be handled by the
527 * default case */
528 default:
529 /* unrecognized \ escape, just copy
530 * the backslashed character */
531 c = *cur;
532 break;
533 }
534 escaped = false;
535 } else if ((c = *cur) == '\\') {
536 escaped = true;
537 continue;
538#if 0
539 } else if (c == quote && cur[1] == quote) {
540 assert(c != 0);
541 if (n > 0)
542 goto notutf8;
543 *p++ = quote;
544 cur++;
545 continue;
546#endif
547 }
548
549 if (n > 0) {
550 /* we're still expecting follow-up bytes in a
551 * UTF-8 sequence */
552 if ((c & 0xC0) != 0x80) {
553 /* incorrect UTF-8 sequence: byte is
554 * not 10xxxxxx */
555 goto notutf8;
556 }
557 utf8char = (utf8char << 6) | (c & 0x3F);
558 n--;
559 if (n == 0) {
560 /* this was the last byte in the sequence */
561 if ((utf8char & mask) == 0) {
562 /* incorrect UTF-8 sequence:
563 * not shortest possible */
564 goto notutf8;
565 }
566 if (utf8char > 0x10FFFF) {
567 /* incorrect UTF-8 sequence:
568 * value too large */
569 goto notutf8;
570 }
571 if ((utf8char & 0x1FFF800) == 0xD800) {
572 /* incorrect UTF-8 sequence:
573 * low or high surrogate
574 * encoded as UTF-8 */
575 goto notutf8;
576 }
577 }
578 } else if (c >= 0x80) {
579 int m;
580
581 /* start of multi-byte UTF-8 character */
582 for (n = 0, m = 0x40; c & m; n++, m >>= 1)
583 ;
584 /* n now is number of 10xxxxxx bytes that
585 * should follow */
586 if (n == 0 || n >= 4) {
587 /* incorrect UTF-8 sequence */
588 /* n==0: c == 10xxxxxx */
589 /* n>=4: c == 11111xxx */
590 goto notutf8;
591 }
592 mask = utf8chkmsk[n];
593 /* collect the Unicode code point in utf8char */
594 utf8char = c & ~(0xFFC0 >> n); /* remove non-x bits */
595 }
596 *p++ = c;
597 }
598 if (n > 0) {
599 /* incomplete UTF-8 sequence */
600 goto notutf8;
601 }
602 *p++ = 0;
603 return len;
604 notutf8:
605 GDKerror("not a proper UTF-8 sequence\n");
606 return -1;
607}
608
609ssize_t
610strFromStr(const char *restrict src, size_t *restrict len, char **restrict dst, bool external)
611{
612 const char *cur = src, *start = NULL;
613 size_t l = 1;
614 bool escaped = false;
615
616 if (!external) {
617 size_t sz = strLen(src);
618 atommem(sz);
619 return (ssize_t) strcpy_len(*dst, src, sz);
620 }
621
622 if (GDK_STRNIL(src)) {
623 atommem(2);
624 strcpy(*dst, str_nil);
625 return 1;
626 }
627
628 while (GDKisspace(*cur))
629 cur++;
630 if (*cur != '"') {
631 if (strncmp(cur, "nil", 3) == 0) {
632 atommem(2);
633 strcpy(*dst, str_nil);
634 return (ssize_t) (cur - src) + 3;
635 }
636 GDKerror("not a quoted string\n");
637 return -1;
638 }
639
640 /* scout the string to find out its length and whether it was
641 * properly quoted */
642 for (start = ++cur; *cur != '"' || escaped; cur++) {
643 if (*cur == 0) {
644 GDKerror("no closing quotes\n");
645 return -1;
646 } else if (*cur == '\\' && !escaped) {
647 escaped = true;
648 } else {
649 escaped = false;
650 l++;
651 }
652 }
653
654 /* alloc new memory */
655 if (*dst == NULL || *len < l) {
656 GDKfree(*dst);
657 *dst = GDKmalloc(*len = l);
658 if (*dst == NULL) {
659 *len = 0;
660 return -1;
661 }
662 }
663
664 return GDKstrFromStr((unsigned char *) *dst,
665 (const unsigned char *) start,
666 (ssize_t) (cur - start));
667}
668
669/*
670 * Convert a GDK string value to something printable.
671 */
672/* all but control characters (in range 0 to 31) and DEL */
673#ifdef ASCII_CHR
674/* ASCII printable characters */
675#define printable_chr(ch) (' ' <= (ch) && (ch) <= '~')
676#else
677/* everything except ASCII control characters */
678#define printable_chr(ch) ((' ' <= (ch) && (ch) <= '~') || ((ch) & 0x80) != 0)
679#endif
680
681size_t
682escapedStrlen(const char *restrict src, const char *sep1, const char *sep2, int quote)
683{
684 size_t end, sz = 0;
685 size_t sep1len, sep2len;
686
687 sep1len = sep1 ? strlen(sep1) : 0;
688 sep2len = sep2 ? strlen(sep2) : 0;
689 for (end = 0; src[end]; end++)
690 if (src[end] == '\\'
691 || src[end] == quote
692 || (sep1len && strncmp(src + end, sep1, sep1len) == 0)
693 || (sep2len && strncmp(src + end, sep2, sep2len) == 0)) {
694 sz += 2;
695#ifndef ASCII_CHR
696 } else if (src[end] == (char) '\302' &&
697 0200 <= ((int) src[end + 1] & 0377) &&
698 ((int) src[end + 1] & 0377) <= 0237) {
699 /* Unicode control character (code point range
700 * U-00000080 through U-0000009F encoded in
701 * UTF-8 */
702 /* for the first one of the two UTF-8 bytes we
703 * count a width of 7 and for the second one
704 * 1, together that's 8, i.e. the width of two
705 * backslash-escaped octal coded characters */
706 sz += 7;
707#endif
708 } else if (!printable_chr(src[end])) {
709 sz += 4;
710 } else {
711 sz++;
712 }
713 return sz;
714}
715
716size_t
717escapedStr(char *restrict dst, const char *restrict src, size_t dstlen, const char *sep1, const char *sep2, int quote)
718{
719 size_t cur = 0, l = 0;
720 size_t sep1len, sep2len;
721
722 sep1len = sep1 ? strlen(sep1) : 0;
723 sep2len = sep2 ? strlen(sep2) : 0;
724 for (; src[cur] && l < dstlen; cur++)
725 if (!printable_chr(src[cur])
726#ifndef ASCII_CHR
727 || (src[cur] == '\302'
728 && 0200 <= (src[cur + 1] & 0377)
729 && ((int) src[cur + 1] & 0377) <= 0237)
730 || (cur > 0
731 && src[cur - 1] == '\302'
732 && 0200 <= (src[cur] & 0377)
733 && (src[cur] & 0377) <= 0237)
734#endif
735 ) {
736 dst[l++] = '\\';
737 switch (src[cur]) {
738 case '\t':
739 dst[l++] = 't';
740 break;
741 case '\n':
742 dst[l++] = 'n';
743 break;
744 case '\r':
745 dst[l++] = 'r';
746 break;
747 case '\f':
748 dst[l++] = 'f';
749 break;
750 default:
751 snprintf(dst + l, dstlen - l, "%03o", (unsigned char) src[cur]);
752 l += 3;
753 break;
754 }
755 } else if (src[cur] == '\\'
756 || src[cur] == quote
757 || (sep1len && strncmp(src + cur, sep1, sep1len) == 0)
758 || (sep2len && strncmp(src + cur, sep2, sep2len) == 0)) {
759 dst[l++] = '\\';
760 dst[l++] = src[cur];
761 } else {
762 dst[l++] = src[cur];
763 }
764 assert(l < dstlen);
765 dst[l] = 0;
766 return l;
767}
768
769ssize_t
770strToStr(char **restrict dst, size_t *restrict len, const char *restrict src, bool external)
771{
772 size_t sz;
773
774 if (!external) {
775 sz = strLen(src);
776 atommem(sz);
777 return (ssize_t) strcpy_len(*dst, src, sz);
778 }
779 if (GDK_STRNIL(src)) {
780 atommem(4);
781 strcpy(*dst, "nil");
782 return 3;
783 } else {
784 ssize_t l = 0;
785 size_t sz = escapedStrlen(src, NULL, NULL, '"');
786
787 atommem(sz + 3);
788 l = (ssize_t) escapedStr((*dst) + 1, src, *len - 1, NULL, NULL, '"');
789 l++;
790 (*dst)[0] = (*dst)[l++] = '"';
791 (*dst)[l] = 0;
792 return l;
793 }
794}
795
796str
797strRead(str a, stream *s, size_t cnt)
798{
799 int len;
800
801 (void) cnt;
802 assert(cnt == 1);
803 if (mnstr_readInt(s, &len) != 1)
804 return NULL;
805 if ((a = GDKmalloc(len + 1)) == NULL)
806 return NULL;
807 if (len && mnstr_read(s, a, len, 1) != 1) {
808 GDKfree(a);
809 return NULL;
810 }
811 a[len] = 0;
812 return a;
813}
814
815gdk_return
816strWrite(const char *a, stream *s, size_t cnt)
817{
818 size_t len = strlen(a);
819
820 (void) cnt;
821 assert(cnt == 1);
822 if (mnstr_writeInt(s, (int) len) && mnstr_write(s, a, len, 1) == 1)
823 return GDK_SUCCEED;
824 else
825 return GDK_FAIL;
826}
827
828static gdk_return
829concat_strings(BAT **bnp, ValPtr pt, BAT *b, oid seqb,
830 BUN ngrp, struct canditer *restrict ci, BUN ncand,
831 const oid *restrict gids, oid min, oid max, bool skip_nils,
832 const char *separator, BUN *has_nils)
833{
834 oid gid;
835 BUN i, p, nils = 0;
836 size_t *lengths = NULL, separator_length = strlen(separator), next_length;
837 str *astrings = NULL, s;
838 BATiter bi;
839 BAT *bn = NULL;
840 gdk_return rres = GDK_SUCCEED;
841
842 /* exactly one of bnp and pt must be NULL, the other non-NULL */
843 assert((bnp == NULL) != (pt == NULL));
844 /* if pt not NULL, only a single group allowed */
845 assert(pt == NULL || ngrp == 1);
846 if (bnp) {
847 if ((bn = COLnew(min, TYPE_str, ngrp, TRANSIENT)) == NULL) {
848 rres = GDK_FAIL;
849 goto finish;
850 }
851 *bnp = bn;
852 }
853
854 bi = bat_iterator(b);
855
856 if (ngrp == 1) {
857 size_t offset = 0, single_length = 0;
858 bool empty = true;
859
860 for (i = 0; i < ncand; i++) {
861 p = canditer_next(ci) - seqb;
862 s = BUNtvar(bi, p);
863 if (GDK_STRNIL(s)) {
864 if (!skip_nils) {
865 nils = 1;
866 break;
867 }
868 } else {
869 single_length += strlen(s);
870 if (!empty)
871 single_length += separator_length;
872 empty = false;
873 }
874 }
875 canditer_reset(ci);
876
877 if (nils == 0) {
878 char *single_str;
879
880 if ((single_str = GDKmalloc(single_length + 1)) == NULL) {
881 return GDK_FAIL;
882 }
883 empty = true;
884 for (i = 0; i < ncand; i++) {
885 p = canditer_next(ci) - seqb;
886 s = BUNtvar(bi, p);
887 if (GDK_STRNIL(s))
888 continue;
889 if (!empty) {
890 memcpy(single_str + offset, separator, separator_length);
891 offset += separator_length;
892 }
893 next_length = strlen(s);
894 memcpy(single_str + offset, s, next_length);
895 offset += next_length;
896 empty = false;
897 }
898 single_str[offset] = '\0';
899 if (bn) {
900 if (BUNappend(bn, single_str, false) != GDK_SUCCEED) {
901 GDKfree(single_str);
902 return GDK_FAIL;
903 }
904 } else {
905 pt->len = offset + 1;
906 pt->val.sval = single_str;
907 single_str = NULL; /* don't free */
908 }
909 GDKfree(single_str);
910 } else if (bn) {
911 if (BUNappend(bn, str_nil, false) != GDK_SUCCEED) {
912 return GDK_FAIL;
913 }
914 } else {
915 if (VALinit(pt, TYPE_str, str_nil) == NULL) {
916 return GDK_FAIL;
917 }
918 }
919 return GDK_SUCCEED;
920 } else {
921 /* first used to calculated the total length of
922 * each group, then the the total offset */
923 lengths = GDKzalloc(ngrp * sizeof(*lengths));
924 astrings = GDKmalloc(ngrp * sizeof(str));
925 if (lengths == NULL || astrings == NULL) {
926 rres = GDK_FAIL;
927 goto finish;
928 }
929 /* at first, set astrings[i] to str_nil, then for each
930 * non-empty group (even if all strings in the group
931 * are empty), set to NULL */
932 for (i = 0; i < ngrp; i++)
933 astrings[i] = (char *) str_nil;
934 for (p = 0; p < ncand; p++) {
935 i = canditer_next(ci) - seqb;
936 if (gids[i] >= min && gids[i] <= max) {
937 gid = gids[i] - min;
938 if (lengths[gid] == (size_t) -1)
939 continue;
940 s = BUNtvar(bi, i);
941 if (!GDK_STRNIL(s)) {
942 lengths[gid] += strlen(s) + separator_length;
943 astrings[gid] = NULL;
944 } else if (!skip_nils) {
945 nils++;
946 lengths[gid] = (size_t) -1;
947 astrings[gid] = (char *) str_nil;
948 }
949 }
950 }
951 for (i = 0; i < ngrp; i++) {
952 if (astrings[i] == NULL) {
953 if ((astrings[i] = GDKmalloc(lengths[i] + 1 - separator_length)) == NULL) {
954 rres = GDK_FAIL;
955 goto finish;
956 }
957 astrings[i][0] = 0;
958 lengths[i] = 0;
959 } else
960 astrings[i] = NULL;
961 }
962 canditer_reset(ci);
963 for (p = 0; p < ncand; p++) {
964 i = canditer_next(ci) - seqb;
965 if (gids[i] >= min && gids[i] <= max) {
966 gid = gids[i] - min;
967 if (astrings[gid]) {
968 s = BUNtvar(bi, i);
969 if (GDK_STRNIL(s))
970 continue;
971 if (astrings[gid][lengths[gid]]) {
972 memcpy(astrings[gid] + lengths[gid], separator, separator_length);
973 lengths[gid] += separator_length;
974 }
975 next_length = strlen(s);
976 memcpy(astrings[gid] + lengths[gid], s, next_length);
977 lengths[gid] += next_length;
978 astrings[gid][lengths[gid]] = 1;
979 }
980 }
981 }
982 for (i = 0; i < ngrp; i++) {
983 if (astrings[i]) {
984 astrings[i][lengths[i]] = '\0';
985 if (BUNappend(bn, astrings[i], false) != GDK_SUCCEED) {
986 rres = GDK_FAIL;
987 goto finish;
988 }
989 } else if (BUNappend(bn, str_nil, false) != GDK_SUCCEED) {
990 rres = GDK_FAIL;
991 goto finish;
992 }
993 }
994 }
995
996 finish:
997 if (has_nils)
998 *has_nils = nils;
999 GDKfree(lengths);
1000 if (astrings) {
1001 for (i = 0; i < ngrp; i++) {
1002 if (astrings[i] != str_nil)
1003 GDKfree(astrings[i]);
1004 }
1005 GDKfree(astrings);
1006 }
1007 if (rres == GDK_FAIL)
1008 BBPreclaim(bn);
1009
1010 return rres;
1011}
1012
1013gdk_return
1014BATstr_group_concat(ValPtr res, BAT *b, BAT *s, bool skip_nils,
1015 bool abort_on_error, bool nil_if_empty,
1016 const char *separator)
1017{
1018 BUN ncand;
1019 struct canditer ci;
1020
1021 (void) abort_on_error;
1022 assert(separator);
1023 res->vtype = TYPE_str;
1024
1025 ncand = canditer_init(&ci, b, s);
1026
1027 if (ncand == 0 || GDK_STRNIL(separator)) {
1028 if (VALinit(res, TYPE_str, nil_if_empty ? str_nil : "") == NULL)
1029 return GDK_FAIL;
1030 return GDK_SUCCEED;
1031 }
1032
1033 return concat_strings(NULL, res, b, b->hseqbase,
1034 1, &ci, ncand, NULL, 0, 0, skip_nils,
1035 separator, NULL);
1036}
1037
1038BAT *
1039BATgroupstr_group_concat(BAT *b, BAT *g, BAT *e, BAT *s, bool skip_nils,
1040 bool abort_on_error, const char *separator)
1041{
1042 BAT *bn = NULL;
1043 oid min, max;
1044 BUN ngrp;
1045 BUN ncand;
1046 struct canditer ci;
1047 const char *err;
1048 BUN nils = 0;
1049 gdk_return res;
1050
1051 assert(separator);
1052 (void) skip_nils;
1053
1054 if ((err = BATgroupaggrinit(b, g, e, s, &min, &max, &ngrp,
1055 &ci, &ncand)) !=NULL) {
1056 GDKerror("BATgroupstr_group_concat: %s\n", err);
1057 return NULL;
1058 }
1059 if (g == NULL) {
1060 GDKerror("BATgroupstr_group_concat: b and g must be aligned\n");
1061 return NULL;
1062 }
1063
1064 if (ncand == 0 || ngrp == 0 || GDK_STRNIL(separator)) {
1065 /* trivial: no strings to concat, so return bat
1066 * aligned with g with nil in the tail */
1067 return BATconstant(ngrp == 0 ? 0 : min, TYPE_str, str_nil, ngrp, TRANSIENT);
1068 }
1069
1070 if (BATtdense(g) || (g->tkey && g->tnonil)) {
1071 /* trivial: singleton groups, so all results are equal
1072 * to the inputs (but possibly a different type) */
1073 return BATconvert(b, s, TYPE_str, abort_on_error);
1074 }
1075
1076 res = concat_strings(&bn, NULL, b, b->hseqbase,
1077 ngrp, &ci, ncand, (const oid *) Tloc(g, 0),
1078 min, max, skip_nils, separator, &nils);
1079 if (res != GDK_SUCCEED)
1080 return NULL;
1081
1082 return bn;
1083}
1084