gdk_string.c source code [MonetDB/gdk/gdk_string.c]

1	/*
2	* This Source Code Form is subject to the terms of the Mozilla Public
3	* License, v. 2.0. If a copy of the MPL was not distributed with this
4	* file, You can obtain one at http://mozilla.org/MPL/2.0/.
5	*
6	* Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V.
7	*/
8
9	#include "monetdb_config.h"
10	#include "gdk.h"
11	#include "gdk_private.h"
12	#include "gdk_cand.h"
13
14	/ String Atom Implementation*
15	*
16	* Strings are stored in two parts. The first part is the normal tail
17	* heap which contains a list of offsets. The second part is the
18	* theap which contains the actual strings. The offsets in the tail
19	* heap (a.k.a. offset heap) point into the theap (a.k.a. string
20	* heap). Strings are NULL-terminated and are stored without any
21	* escape sequences. Strings are encoded using the UTF-8 encoding
22	* of Unicode. This means that individual "characters" (really,
23	* Unicode code points) can be between one and four bytes long.
24	*
25	* Because in many typical situations there are lots of duplicated
26	* string values that are being stored in a table, but also in many
27	* (other) typical situations there are very few duplicated string
28	* values stored, a scheme has been introduced to cater to both
29	* situations.
30	*
31	* When the string heap is "small" (defined as less than 64KiB), the
32	* string heap is fully duplicate eliminated. When the string heap
33	* grows beyond this size, the heap is not kept free of duplicate
34	* strings, but there is then a heuristic that tries to limit the
35	* number of duplicates.
36	*
37	* This is done by having a fixed sized hash table at the start of the
38	* string heap, and allocating space for collision lists in the first
39	* 64KiB of the string heap. After the first 64KiB no extra space is
40	* allocated for lists, so hash collisions cannot be resolved.
41	*/
42
43	/ some of these macros are duplicates from gdk_atoms.c /
44	#define num08(x) ((x) >= '0' && (x) <= '7')
45	#define base08(x) ((x) - '0')
46	#define mult08(x) ((x) << 3)
47
48	#define num16(x) isxdigit((unsigned char) (x))
49	#define base16(x) (((x) >= 'a' && (x) <= 'f') ? ((x) - 'a' + 10) : ((x) >= 'A' && (x) <= 'F') ? ((x) - 'A' + 10) : (x) - '0')
50	#define mult16(x) ((x) << 4)
51
52	#define atommem(size) \
53	do { \
54	if (dst == NULL \|\| len < (size)) { \
55	GDKfree(*dst); \
56	*len = (size); \
57	dst = GDKmalloc(len); \
58	if (*dst == NULL) { \
59	*len = 0; \
60	return -1; \
61	} \
62	} \
63	} while (0)
64
65	const char str_nil[`2`] = { `'\200'`, `0` };
66
67	int
68	strNil(const char *s)
69	{
70	return GDK_STRNIL(s);
71	}
72
73	size_t
74	strLen(const char *s)
75	{
76	return GDK_STRLEN(s);
77	}
78
79	int
80	strCmp(const char l, const* char *r)
81	{
82	return GDK_STRCMP(l, r);
83	}
84
85	int
86	strCmpNoNil(const unsigned char l, const* unsigned char *r)
87	{
88	while (l == r) {
89	if (*l == `0`)
90	return `0`;
91	l++;
92	r++;
93	}
94	return (l < r) ? -`1` : `1`;
95	}
96
97	void
98	strHeap(Heap *d, size_t cap)
99	{
100	size_t size;
101
102	cap = MAX(cap, BATTINY);
103	size = GDK_STRHASHTABLE * sizeof(stridx_t) + MIN(GDK_ELIMLIMIT, cap * GDK_VARALIGN);
104	if (HEAPalloc(d, size, `1`) == GDK_SUCCEED) {
105	d->free = GDK_STRHASHTABLE * sizeof(stridx_t);
106	d->dirty = true;
107	memset(d->base, `0`, d->free);
108	d->hashash = false;
109	#ifndef NDEBUG
110	/ fill should solve initialization problems within valgrind /
111	memset(d->base + d->free, `0`, d->size - d->free);
112	#endif
113	}
114	}
115
116
117	BUN
118	strHash(const char *s)
119	{
120	BUN res;
121
122	GDK_STRHASH(s, res);
123	return res;
124	}
125
126	void
127	strCleanHash(Heap *h, bool rebuild)
128	{
129	stridx_t newhash[GDK_STRHASHTABLE];
130	size_t pad, pos;
131	const size_t extralen = h->hashash ? EXTRALEN : `0`;
132	BUN off, strhash;
133	const char *s;
134
135	(void) rebuild;
136	if (!h->cleanhash)
137	return;
138	/ rebuild hash table for double elimination*
139	*
140	* If appending strings to the BAT was aborted, if the heap
141	* was memory mapped, the hash in the string heap may well be
142	* incorrect. Therefore we don't trust it when we read in a
143	* string heap and we rebuild the complete table (it is small,
144	* so this won't take any time at all).
145	* Note that we will only do this the first time the heap is
146	* loaded, and only for heaps that existed when the server was
147	* started. */
148	memset(newhash, `0`, sizeof(newhash));
149	pos = GDK_STRHASHSIZE;
150	while (pos < h->free && pos < GDK_ELIMLIMIT) {
151	pad = GDK_VARALIGN - (pos & (GDK_VARALIGN - `1`));
152	if (pad < sizeof(stridx_t))
153	pad += GDK_VARALIGN;
154	pos += pad + extralen;
155	s = h->base + pos;
156	if (h->hashash)
157	strhash = ((const BUN *) s)[-`1`];
158	else
159	GDK_STRHASH(s, strhash);
160	off = strhash & GDK_STRHASHMASK;
161	newhash[off] = (stridx_t) (pos - extralen - sizeof(stridx_t));
162	pos += GDK_STRLEN(s);
163	}
164	/ only set dirty flag if the hash table actually changed /
165	if (memcmp(newhash, h->base, sizeof(newhash)) != `0`) {
166	memcpy(h->base, newhash, sizeof(newhash));
167	if (h->storage == STORE_MMAP) {
168	if (!(GDKdebug & NOSYNCMASK))
169	(void) MT_msync(h->base, GDK_STRHASHSIZE);
170	} else
171	h->dirty = true;
172	}
173	#ifndef NDEBUG
174	if (GDK_ELIMDOUBLES(h)) {
175	pos = GDK_STRHASHSIZE;
176	while (pos < h->free) {
177	pad = GDK_VARALIGN - (pos & (GDK_VARALIGN - `1`));
178	if (pad < sizeof(stridx_t))
179	pad += GDK_VARALIGN;
180	pos += pad + extralen;
181	s = h->base + pos;
182	assert(strLocate(h, s) != `0`);
183	pos += GDK_STRLEN(s);
184	}
185	}
186	#endif
187	h->cleanhash = false;
188	}
189
190	/*
191	* The strPut routine. The routine strLocate can be used to identify
192	* the location of a string in the heap if it exists. Otherwise it
193	* returns zero.
194	*/
195	var_t
196	strLocate(Heap h, const* char *v)
197	{
198	stridx_t ref, next;
199	const size_t extralen = h->hashash ? EXTRALEN : `0`;
200
201	/ search hash-table, if double-elimination is still in place /
202	BUN off;
203	GDK_STRHASH(v, off);
204	off &= GDK_STRHASHMASK;
205
206	/ should only use strLocate iff fully double eliminated /
207	assert(GDK_ELIMBASE(h->free) == `0`);
208
209	/ search the linked list /
210	for (ref = ((stridx_t ) h->base) + off; ref; ref = next) {
211	next = (stridx_t ) (h->base + ref);
212	if (GDK_STRCMP(v, (str) (next + `1`) + extralen) == `0`)
213	return (var_t) ((sizeof(stridx_t) + ref + extralen)); /* found /
214	}
215	return `0`;
216	}
217
218	var_t
219	strPut(Heap h, var_t dst, const char *v)
220	{
221	size_t elimbase = GDK_ELIMBASE(h->free);
222	size_t pad;
223	size_t pos, len = GDK_STRLEN(v);
224	const size_t extralen = h->hashash ? EXTRALEN : `0`;
225	stridx_t *bucket;
226	BUN off, strhash;
227
228	GDK_STRHASH(v, off);
229	strhash = off;
230	off &= GDK_STRHASHMASK;
231	bucket = ((stridx_t *) h->base) + off;
232
233	if (*bucket) {
234	/ the hash list is not empty /
235	if (*bucket < GDK_ELIMLIMIT) {
236	/ small string heap (<64KiB) -- fully double*
237	* eliminated: search the linked list */
238	const stridx_t *ref = bucket;
239
240	do {
241	pos = ref + sizeof*(stridx_t) + extralen;
242	if (GDK_STRCMP(v, h->base + pos) == `0`) {
243	/ found /
244	return *dst = (var_t) pos;
245	}
246	ref = (stridx_t ) (h->base + ref);
247	} while (*ref);
248	} else {
249	/ large string heap (>=64KiB) -- there is no*
250	* linked list, so only look at single
251	* entry */
252	pos = *bucket + extralen;
253	if (GDK_STRCMP(v, h->base + pos) == `0`) {
254	/ already in heap: reuse /
255	return *dst = (var_t) pos;
256	}
257	}
258	}
259	/ the string was not found in the heap, we need to enter it /
260
261	if (v[`0`] != `'\200'` \|\| v[`1`] != `'\0'`) {
262	/ check that string is correctly encoded UTF-8; there*
263	* was no need to do this earlier: if the string was
264	* found above, it must have gone through here in the
265	* past */
266	int nutf8 = `0`;
267	int m = `0`;
268	for (size_t i = `0`; v[i]; i++) {
269	if (nutf8 > `0`) {
270	if ((v[i] & `0xC0`) != `0x80` \|\|
271	(m != `0` && (v[i] & m) == `0`)) {
272	badutf8:
273	GDKerror("strPut: incorrectly encoded UTF-8");
274	return `0`;
275	}
276	m = `0`;
277	nutf8--;
278	} else if ((v[i] & `0xE0`) == `0xC0`) {
279	nutf8 = `1`;
280	if ((v[i] & `0x1E`) == `0`)
281	goto badutf8;
282	} else if ((v[i] & `0xF0`) == `0xE0`) {
283	nutf8 = `2`;
284	if ((v[i] & `0x0F`) == `0`)
285	m = `0x20`;
286	} else if ((v[i] & `0xF8`) == `0xF0`) {
287	nutf8 = `3`;
288	if ((v[i] & `0x07`) == `0`)
289	m = `0x30`;
290	} else if ((v[i] & `0x80`) != `0`) {
291	goto badutf8;
292	}
293	}
294	}
295
296	pad = GDK_VARALIGN - (h->free & (GDK_VARALIGN - `1`));
297	if (elimbase == `0`) { / i.e. h->free < GDK_ELIMLIMIT /
298	if (pad < sizeof(stridx_t)) {
299	/ make room for hash link /
300	pad += GDK_VARALIGN;
301	}
302	} else if (extralen == `0`) { / i.e., h->hashash == FALSE /
303	/ no VARSHIFT and no string hash value stored => no*
304	* padding/alignment needed */
305	pad = `0`;
306	} else {
307	/ pad to align on VARALIGN for VARSHIFT and/or string*
308	* hash value */
309	pad &= (GDK_VARALIGN - `1`);
310	}
311
312	/ check heap for space (limited to a certain maximum after*
313	* which nils are inserted) */
314	if (h->free + pad + len + extralen >= h->size) {
315	size_t newsize = MAX(h->size, `4096`);
316
317	/ double the heap size until we have enough space /
318	do {
319	if (newsize < `4` * `1024` * `1024`)
320	newsize <<= `1`;
321	else
322	newsize += `4` * `1024` * `1024`;
323	} while (newsize <= h->free + pad + len + extralen);
324
325	assert(newsize);
326
327	if (h->free + pad + len + extralen >= (size_t) VAR_MAX) {
328	GDKerror("strPut: string heaps gets larger than %zuGiB.\n", (size_t) VAR_MAX >> `30`);
329	return `0`;
330	}
331	HEAPDEBUG fprintf(stderr, "#HEAPextend in strPut %s %zu %zu\n", h->filename, h->size, newsize);
332	if (HEAPextend(h, newsize, true) != GDK_SUCCEED) {
333	return `0`;
334	}
335	#ifndef NDEBUG
336	/ fill should solve initialization problems within*
337	* valgrind */
338	memset(h->base + h->free, `0`, h->size - h->free);
339	#endif
340
341	/ make bucket point into the new heap /
342	bucket = ((stridx_t *) h->base) + off;
343	}
344
345	/ insert string /
346	pos = h->free + pad + extralen;
347	*dst = (var_t) pos;
348	memcpy(h->base + pos, v, len);
349	if (h->hashash) {
350	((BUN *) (h->base + pos))[-`1`] = strhash;
351	#if EXTRALEN > SIZEOF_BUN
352	((BUN *) (h->base + pos))[-`2`] = (BUN) len;
353	#endif
354	}
355	h->free += pad + len + extralen;
356	h->dirty = true;
357
358	/ maintain hash table /
359	pos -= extralen;
360	if (elimbase == `0`) { / small string heap: link the next pointer /
361	/ the stridx_t next pointer directly precedes the*
362	* string and optional (depending on hashash) hash
363	* value */
364	pos -= sizeof(stridx_t);
365	(stridx_t ) (h->base + pos) = *bucket;
366	}
367	bucket = (stridx_t) pos; /* set bucket to the new string /
368
369	return *dst;
370	}
371
372	/*
373	* Convert an "" separated string to a GDK string value, checking that
374	* the input is correct UTF-8.
375	*/
376
377	/*
378	UTF-8 encoding is as follows:
379	U-00000000 - U-0000007F: 0xxxxxxx
380	U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
381	U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
382	U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
383	U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
384	U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
385	*/
386	/ To be correctly coded UTF-8, the sequence should be the shortest*
387	* possible encoding of the value being encoded. This means that for
388	* an encoding of length n+1 (1 <= n <= 5), at least one of the bits
389	* in utf8chkmsk[n] should be non-zero (else the encoding could be
390	* shorter). */
391	static int utf8chkmsk[] = {
392	`0x0000007f`,
393	`0x00000780`,
394	`0x0000f800`,
395	`0x001f0000`,
396	`0x03e00000`,
397	`0x7c000000`,
398	};
399
400	ssize_t
401	GDKstrFromStr(unsigned char *restrict dst, const unsigned char *restrict src, ssize_t len)
402	{
403	unsigned char *p = dst;
404	const unsigned char cur = src, end = src + len;
405	bool escaped = false;
406	int mask = `0`, n, c, utf8char = `0`;
407
408	if (len >= `2` && strcmp((const char *) src, str_nil) == `0`) {
409	strcpy((char *) dst, str_nil);
410	return `1`;
411	}
412
413	/ copy it in, while performing the correct escapes /
414	/ n is the number of follow-on bytes left in a multi-byte*
415	* UTF-8 sequence */
416	for (cur = src, n = `0`; cur < end \|\| escaped; cur++) {
417	/ first convert any \ escapes and store value in c /
418	if (escaped) {
419	switch (*cur) {
420	case `'0'`:
421	case `'1'`:
422	case `'2'`:
423	case `'3'`:
424	case `'4'`:
425	case `'5'`:
426	case `'6'`:
427	case `'7'`:
428	/ \ with up to three octal digits /
429	c = base08(*cur);
430	if (num08(cur[`1`])) {
431	cur++;
432	c = mult08(c) + base08(*cur);
433	if (num08(cur[`1`])) {
434	if (c > `037`) {
435	/ octal*
436	* escape
437	* sequence
438	* out or
439	* range */
440	GDKerror("not an octal number\n");
441	return -`1`;
442	}
443	cur++;
444	c = mult08(c) + base08(*cur);
445	assert(c >= `0` && c <= `0377`);
446	}
447	}
448	break;
449	case `'x'`:
450	/ \x with one or two hexadecimal digits /
451	if (num16(cur[`1`])) {
452	cur++;
453	c = base16(*cur);
454	if (num16(cur[`1`])) {
455	cur++;
456	c = mult16(c) + base16(*cur);
457	}
458	} else
459	c = `'x'`;
460	break;
461	case `'u'`:
462	case `'U'`:
463	/ \u with four hexadecimal digits or*
464	* \U with eight hexadecimal digits */
465	if (n > `0`) {
466	/ not when in the middle of a*
467	* UTF-8 sequence */
468	goto notutf8;
469	}
470	c = `0`;
471	for (n = *cur == `'U'` ? `8` : `4`; n > `0`; n--) {
472	cur++;
473	if (!num16(*cur)) {
474	GDKerror("not a Unicode code point escape\n");
475	return -`1`;
476	}
477	c = c << `4` \| base16(*cur);
478	}
479	/ n == 0 now /
480	if (c == `0` \|\| c > `0x10FFFF` \|\|
481	(c & `0xFFF800`) == `0xD800`) {
482	GDKerror("illegal Unicode code point\n");
483	return -`1`;
484	}
485	if (c < `0x80`) {
486	p++ = (unsigned* char) c;
487	} else {
488	if (c < `0x800`) {
489	*p++ = `0xC0` \| (c >> `6`);
490	} else {
491	if (c < `0x10000`) {
492	*p++ = `0xE0` \| (c >> `12`);
493	} else {
494	*p++ = `0xF0` \| (c >> `18`);
495	*p++ = `0x80` \| ((c >> `12`) & `0x3F`);
496	}
497	*p++ = `0x80` \| ((c >> `6`) & `0x3F`);
498	}
499	*p++ = `0x80` \| (c & `0x3F`);
500	}
501	escaped = false;
502	continue;
503	case `'a'`:
504	c = `'\a'`;
505	break;
506	case `'b'`:
507	c = `'\b'`;
508	break;
509	case `'f'`:
510	c = `'\f'`;
511	break;
512	case `'n'`:
513	c = `'\n'`;
514	break;
515	case `'r'`:
516	c = `'\r'`;
517	break;
518	case `'t'`:
519	c = `'\t'`;
520	break;
521	case `'\0'`:
522	c = `'\\'`;
523	break;
524	case `'\''`:
525	case `'\\'`:
526	/ \' and \\ can be handled by the*
527	* default case */
528	default:
529	/ unrecognized \ escape, just copy*
530	* the backslashed character */
531	c = *cur;
532	break;
533	}
534	escaped = false;
535	} else if ((c = *cur) == `'\\'`) {
536	escaped = true;
537	continue;
538	#if 0
539	} else if (c == quote && cur[`1`] == quote) {
540	assert(c != `0`);
541	if (n > `0`)
542	goto notutf8;
543	*p++ = quote;
544	cur++;
545	continue;
546	#endif
547	}
548
549	if (n > `0`) {
550	/ we're still expecting follow-up bytes in a*
551	* UTF-8 sequence */
552	if ((c & `0xC0`) != `0x80`) {
553	/ incorrect UTF-8 sequence: byte is*
554	* not 10xxxxxx */
555	goto notutf8;
556	}
557	utf8char = (utf8char << `6`) \| (c & `0x3F`);
558	n--;
559	if (n == `0`) {
560	/ this was the last byte in the sequence /
561	if ((utf8char & mask) == `0`) {
562	/ incorrect UTF-8 sequence:*
563	* not shortest possible */
564	goto notutf8;
565	}
566	if (utf8char > `0x10FFFF`) {
567	/ incorrect UTF-8 sequence:*
568	* value too large */
569	goto notutf8;
570	}
571	if ((utf8char & `0x1FFF800`) == `0xD800`) {
572	/ incorrect UTF-8 sequence:*
573	* low or high surrogate
574	* encoded as UTF-8 */
575	goto notutf8;
576	}
577	}
578	} else if (c >= `0x80`) {
579	int m;
580
581	/ start of multi-byte UTF-8 character /
582	for (n = `0`, m = `0x40`; c & m; n++, m >>= `1`)
583	;
584	/ n now is number of 10xxxxxx bytes that*
585	* should follow */
586	if (n == `0` \|\| n >= `4`) {
587	/ incorrect UTF-8 sequence /
588	/ n==0: c == 10xxxxxx /
589	/ n>=4: c == 11111xxx /
590	goto notutf8;
591	}
592	mask = utf8chkmsk[n];
593	/ collect the Unicode code point in utf8char /
594	utf8char = c & ~(`0xFFC0` >> n); / remove non-x bits /
595	}
596	*p++ = c;
597	}
598	if (n > `0`) {
599	/ incomplete UTF-8 sequence /
600	goto notutf8;
601	}
602	*p++ = `0`;
603	return len;
604	notutf8:
605	GDKerror("not a proper UTF-8 sequence\n");
606	return -`1`;
607	}
608
609	ssize_t
610	strFromStr(const char *restrict src, size_t *restrict len, char **restrict dst, bool external)
611	{
612	const char cur = src, start = NULL;
613	size_t l = `1`;
614	bool escaped = false;
615
616	if (!external) {
617	size_t sz = strLen(src);
618	atommem(sz);
619	return (ssize_t) strcpy_len(*dst, src, sz);
620	}
621
622	if (GDK_STRNIL(src)) {
623	atommem(`2`);
624	strcpy(*dst, str_nil);
625	return `1`;
626	}
627
628	while (GDKisspace(*cur))
629	cur++;
630	if (*cur != `'"'`) {
631	if (strncmp(cur, "nil", `3`) == `0`) {
632	atommem(`2`);
633	strcpy(*dst, str_nil);
634	return (ssize_t) (cur - src) + `3`;
635	}
636	GDKerror("not a quoted string\n");
637	return -`1`;
638	}
639
640	/ scout the string to find out its length and whether it was*
641	* properly quoted */
642	for (start = ++cur; *cur != `'"'` \|\| escaped; cur++) {
643	if (*cur == `0`) {
644	GDKerror("no closing quotes\n");
645	return -`1`;
646	} else if (*cur == `'\\'` && !escaped) {
647	escaped = true;
648	} else {
649	escaped = false;
650	l++;
651	}
652	}
653
654	/ alloc new memory /
655	if (dst == NULL \|\| len < l) {
656	GDKfree(*dst);
657	dst = GDKmalloc(len = l);
658	if (*dst == NULL) {
659	*len = `0`;
660	return -`1`;
661	}
662	}
663
664	return GDKstrFromStr((unsigned char ) dst,
665	(const unsigned char *) start,
666	(ssize_t) (cur - start));
667	}
668
669	/*
670	* Convert a GDK string value to something printable.
671	*/
672	/ all but control characters (in range 0 to 31) and DEL /
673	#ifdef ASCII_CHR
674	/ ASCII printable characters /
675	#define printable_chr(ch) (' ' <= (ch) && (ch) <= '~')
676	#else
677	/ everything except ASCII control characters /
678	#define printable_chr(ch) ((' ' <= (ch) && (ch) <= '~') \|\| ((ch) & 0x80) != 0)
679	#endif
680
681	size_t
682	escapedStrlen(const char *restrict src, const char sep1, const* char sep2, int* quote)
683	{
684	size_t end, sz = `0`;
685	size_t sep1len, sep2len;
686
687	sep1len = sep1 ? strlen(sep1) : `0`;
688	sep2len = sep2 ? strlen(sep2) : `0`;
689	for (end = `0`; src[end]; end++)
690	if (src[end] == `'\\'`
691	\|\| src[end] == quote
692	\|\| (sep1len && strncmp(src + end, sep1, sep1len) == `0`)
693	\|\| (sep2len && strncmp(src + end, sep2, sep2len) == `0`)) {
694	sz += `2`;
695	#ifndef ASCII_CHR
696	} else if (src[end] == (char) `'\302'` &&
697	`0200` <= ((int) src[end + `1`] & `0377`) &&
698	((int) src[end + `1`] & `0377`) <= `0237`) {
699	/ Unicode control character (code point range*
700	* U-00000080 through U-0000009F encoded in
701	* UTF-8 */
702	/ for the first one of the two UTF-8 bytes we*
703	* count a width of 7 and for the second one
704	* 1, together that's 8, i.e. the width of two
705	* backslash-escaped octal coded characters */
706	sz += `7`;
707	#endif
708	} else if (!printable_chr(src[end])) {
709	sz += `4`;
710	} else {
711	sz++;
712	}
713	return sz;
714	}
715
716	size_t
717	escapedStr(char *restrict dst, const char *restrict src, size_t dstlen, const char sep1, const* char sep2, int* quote)
718	{
719	size_t cur = `0`, l = `0`;
720	size_t sep1len, sep2len;
721
722	sep1len = sep1 ? strlen(sep1) : `0`;
723	sep2len = sep2 ? strlen(sep2) : `0`;
724	for (; src[cur] && l < dstlen; cur++)
725	if (!printable_chr(src[cur])
726	#ifndef ASCII_CHR
727	\|\| (src[cur] == `'\302'`
728	&& `0200` <= (src[cur + `1`] & `0377`)
729	&& ((int) src[cur + `1`] & `0377`) <= `0237`)
730	\|\| (cur > `0`
731	&& src[cur - `1`] == `'\302'`
732	&& `0200` <= (src[cur] & `0377`)
733	&& (src[cur] & `0377`) <= `0237`)
734	#endif
735	) {
736	dst[l++] = `'\\'`;
737	switch (src[cur]) {
738	case `'\t'`:
739	dst[l++] = `'t'`;
740	break;
741	case `'\n'`:
742	dst[l++] = `'n'`;
743	break;
744	case `'\r'`:
745	dst[l++] = `'r'`;
746	break;
747	case `'\f'`:
748	dst[l++] = `'f'`;
749	break;
750	default:
751	snprintf(dst + l, dstlen - l, "%03o", (unsigned char) src[cur]);
752	l += `3`;
753	break;
754	}
755	} else if (src[cur] == `'\\'`
756	\|\| src[cur] == quote
757	\|\| (sep1len && strncmp(src + cur, sep1, sep1len) == `0`)
758	\|\| (sep2len && strncmp(src + cur, sep2, sep2len) == `0`)) {
759	dst[l++] = `'\\'`;
760	dst[l++] = src[cur];
761	} else {
762	dst[l++] = src[cur];
763	}
764	assert(l < dstlen);
765	dst[l] = `0`;
766	return l;
767	}
768
769	ssize_t
770	strToStr(char **restrict dst, size_t *restrict len, const char *restrict src, bool external)
771	{
772	size_t sz;
773
774	if (!external) {
775	sz = strLen(src);
776	atommem(sz);
777	return (ssize_t) strcpy_len(*dst, src, sz);
778	}
779	if (GDK_STRNIL(src)) {
780	atommem(`4`);
781	strcpy(*dst, "nil");
782	return `3`;
783	} else {
784	ssize_t l = `0`;
785	size_t sz = escapedStrlen(src, NULL, NULL, `'"'`);
786
787	atommem(sz + `3`);
788	l = (ssize_t) escapedStr((dst) + `1`, src, len - `1`, NULL, NULL, `'"'`);
789	l++;
790	(dst)[`0`] = (dst)[l++] = `'"'`;
791	(*dst)[l] = `0`;
792	return l;
793	}
794	}
795
796	str
797	strRead(str a, stream *s, size_t cnt)
798	{
799	int len;
800
801	(void) cnt;
802	assert(cnt == `1`);
803	if (mnstr_readInt(s, &len) != `1`)
804	return NULL;
805	if ((a = GDKmalloc(len + `1`)) == NULL)
806	return NULL;
807	if (len && mnstr_read(s, a, len, `1`) != `1`) {
808	GDKfree(a);
809	return NULL;
810	}
811	a[len] = `0`;
812	return a;
813	}
814
815	gdk_return
816	strWrite(const char a, stream s, size_t cnt)
817	{
818	size_t len = strlen(a);
819
820	(void) cnt;
821	assert(cnt == `1`);
822	if (mnstr_writeInt(s, (int) len) && mnstr_write(s, a, len, `1`) == `1`)
823	return GDK_SUCCEED;
824	else
825	return GDK_FAIL;
826	}
827
828	static gdk_return
829	concat_strings(BAT *bnp, ValPtr pt, BAT b, oid seqb,
830	BUN ngrp, struct canditer *restrict ci, BUN ncand,
831	const oid *restrict gids, oid min, oid max, bool skip_nils,
832	const char separator, BUN has_nils)
833	{
834	oid gid;
835	BUN i, p, nils = `0`;
836	size_t *lengths = NULL, separator_length = strlen(separator), next_length;
837	str *astrings = NULL, s;
838	BATiter bi;
839	BAT *bn = NULL;
840	gdk_return rres = GDK_SUCCEED;
841
842	/ exactly one of bnp and pt must be NULL, the other non-NULL /
843	assert((bnp == NULL) != (pt == NULL));
844	/ if pt not NULL, only a single group allowed /
845	assert(pt == NULL \|\| ngrp == `1`);
846	if (bnp) {
847	if ((bn = COLnew(min, TYPE_str, ngrp, TRANSIENT)) == NULL) {
848	rres = GDK_FAIL;
849	goto finish;
850	}
851	*bnp = bn;
852	}
853
854	bi = bat_iterator(b);
855
856	if (ngrp == `1`) {
857	size_t offset = `0`, single_length = `0`;
858	bool empty = true;
859
860	for (i = `0`; i < ncand; i++) {
861	p = canditer_next(ci) - seqb;
862	s = BUNtvar(bi, p);
863	if (GDK_STRNIL(s)) {
864	if (!skip_nils) {
865	nils = `1`;
866	break;
867	}
868	} else {
869	single_length += strlen(s);
870	if (!empty)
871	single_length += separator_length;
872	empty = false;
873	}
874	}
875	canditer_reset(ci);
876
877	if (nils == `0`) {
878	char *single_str;
879
880	if ((single_str = GDKmalloc(single_length + `1`)) == NULL) {
881	return GDK_FAIL;
882	}
883	empty = true;
884	for (i = `0`; i < ncand; i++) {
885	p = canditer_next(ci) - seqb;
886	s = BUNtvar(bi, p);
887	if (GDK_STRNIL(s))
888	continue;
889	if (!empty) {
890	memcpy(single_str + offset, separator, separator_length);
891	offset += separator_length;
892	}
893	next_length = strlen(s);
894	memcpy(single_str + offset, s, next_length);
895	offset += next_length;
896	empty = false;
897	}
898	single_str[offset] = `'\0'`;
899	if (bn) {
900	if (BUNappend(bn, single_str, false) != GDK_SUCCEED) {
901	GDKfree(single_str);
902	return GDK_FAIL;
903	}
904	} else {
905	pt->len = offset + `1`;
906	pt->val.sval = single_str;
907	single_str = NULL; / don't free /
908	}
909	GDKfree(single_str);
910	} else if (bn) {
911	if (BUNappend(bn, str_nil, false) != GDK_SUCCEED) {
912	return GDK_FAIL;
913	}
914	} else {
915	if (VALinit(pt, TYPE_str, str_nil) == NULL) {
916	return GDK_FAIL;
917	}
918	}
919	return GDK_SUCCEED;
920	} else {
921	/ first used to calculated the total length of*
922	* each group, then the the total offset */
923	lengths = GDKzalloc(ngrp * sizeof(*lengths));
924	astrings = GDKmalloc(ngrp * sizeof(str));
925	if (lengths == NULL \|\| astrings == NULL) {
926	rres = GDK_FAIL;
927	goto finish;
928	}
929	/ at first, set astrings[i] to str_nil, then for each*
930	* non-empty group (even if all strings in the group
931	* are empty), set to NULL */
932	for (i = `0`; i < ngrp; i++)
933	astrings[i] = (char *) str_nil;
934	for (p = `0`; p < ncand; p++) {
935	i = canditer_next(ci) - seqb;
936	if (gids[i] >= min && gids[i] <= max) {
937	gid = gids[i] - min;
938	if (lengths[gid] == (size_t) -`1`)
939	continue;
940	s = BUNtvar(bi, i);
941	if (!GDK_STRNIL(s)) {
942	lengths[gid] += strlen(s) + separator_length;
943	astrings[gid] = NULL;
944	} else if (!skip_nils) {
945	nils++;
946	lengths[gid] = (size_t) -`1`;
947	astrings[gid] = (char *) str_nil;
948	}
949	}
950	}
951	for (i = `0`; i < ngrp; i++) {
952	if (astrings[i] == NULL) {
953	if ((astrings[i] = GDKmalloc(lengths[i] + `1` - separator_length)) == NULL) {
954	rres = GDK_FAIL;
955	goto finish;
956	}
957	astrings[i][`0`] = `0`;
958	lengths[i] = `0`;
959	} else
960	astrings[i] = NULL;
961	}
962	canditer_reset(ci);
963	for (p = `0`; p < ncand; p++) {
964	i = canditer_next(ci) - seqb;
965	if (gids[i] >= min && gids[i] <= max) {
966	gid = gids[i] - min;
967	if (astrings[gid]) {
968	s = BUNtvar(bi, i);
969	if (GDK_STRNIL(s))
970	continue;
971	if (astrings[gid][lengths[gid]]) {
972	memcpy(astrings[gid] + lengths[gid], separator, separator_length);
973	lengths[gid] += separator_length;
974	}
975	next_length = strlen(s);
976	memcpy(astrings[gid] + lengths[gid], s, next_length);
977	lengths[gid] += next_length;
978	astrings[gid][lengths[gid]] = `1`;
979	}
980	}
981	}
982	for (i = `0`; i < ngrp; i++) {
983	if (astrings[i]) {
984	astrings[i][lengths[i]] = `'\0'`;
985	if (BUNappend(bn, astrings[i], false) != GDK_SUCCEED) {
986	rres = GDK_FAIL;
987	goto finish;
988	}
989	} else if (BUNappend(bn, str_nil, false) != GDK_SUCCEED) {
990	rres = GDK_FAIL;
991	goto finish;
992	}
993	}
994	}
995
996	finish:
997	if (has_nils)
998	*has_nils = nils;
999	GDKfree(lengths);
1000	if (astrings) {
1001	for (i = `0`; i < ngrp; i++) {
1002	if (astrings[i] != str_nil)
1003	GDKfree(astrings[i]);
1004	}
1005	GDKfree(astrings);
1006	}
1007	if (rres == GDK_FAIL)
1008	BBPreclaim(bn);
1009
1010	return rres;
1011	}
1012
1013	gdk_return
1014	BATstr_group_concat(ValPtr res, BAT b, BAT s, bool skip_nils,
1015	bool abort_on_error, bool nil_if_empty,
1016	const char *separator)
1017	{
1018	BUN ncand;
1019	struct canditer ci;
1020
1021	(void) abort_on_error;
1022	assert(separator);
1023	res->vtype = TYPE_str;
1024
1025	ncand = canditer_init(&ci, b, s);
1026
1027	if (ncand == `0` \|\| GDK_STRNIL(separator)) {
1028	if (VALinit(res, TYPE_str, nil_if_empty ? str_nil : "") == NULL)
1029	return GDK_FAIL;
1030	return GDK_SUCCEED;
1031	}
1032
1033	return concat_strings(NULL, res, b, b->hseqbase,
1034	`1`, &ci, ncand, NULL, `0`, `0`, skip_nils,
1035	separator, NULL);
1036	}
1037
1038	BAT *
1039	BATgroupstr_group_concat(BAT b, BAT g, BAT e, BAT s, bool skip_nils,
1040	bool abort_on_error, const char *separator)
1041	{
1042	BAT *bn = NULL;
1043	oid min, max;
1044	BUN ngrp;
1045	BUN ncand;
1046	struct canditer ci;
1047	const char *err;
1048	BUN nils = `0`;
1049	gdk_return res;
1050
1051	assert(separator);
1052	(void) skip_nils;
1053
1054	if ((err = BATgroupaggrinit(b, g, e, s, &min, &max, &ngrp,
1055	&ci, &ncand)) !=NULL) {
1056	GDKerror("BATgroupstr_group_concat: %s\n", err);
1057	return NULL;
1058	}
1059	if (g == NULL) {
1060	GDKerror("BATgroupstr_group_concat: b and g must be aligned\n");
1061	return NULL;
1062	}
1063
1064	if (ncand == `0` \|\| ngrp == `0` \|\| GDK_STRNIL(separator)) {
1065	/ trivial: no strings to concat, so return bat*
1066	* aligned with g with nil in the tail */
1067	return BATconstant(ngrp == `0` ? `0` : min, TYPE_str, str_nil, ngrp, TRANSIENT);
1068	}
1069
1070	if (BATtdense(g) \|\| (g->tkey && g->tnonil)) {
1071	/ trivial: singleton groups, so all results are equal*
1072	* to the inputs (but possibly a different type) */
1073	return BATconvert(b, s, TYPE_str, abort_on_error);
1074	}
1075
1076	res = concat_strings(&bn, NULL, b, b->hseqbase,
1077	ngrp, &ci, ncand, (const oid *) Tloc(g, `0`),
1078	min, max, skip_nils, separator, &nils);
1079	if (res != GDK_SUCCEED)
1080	return NULL;
1081
1082	return bn;
1083	}
1084

Browse the source code of MonetDB/gdk/gdk_string.c