1/*
2 * This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
5 *
6 * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V.
7 */
8
9/*
10 * (c) M. L. Kersten, P. Boncz, S. Manegold, N. Nes, K.S. Mullender
11 * Common BAT Operations
12 * We factor out all possible overhead by inlining code. This
13 * includes the macros BUNhead and BUNtail, which do a test to see
14 * whether the atom resides in the buns or in a variable storage
15 * heap.
16 */
17#include "monetdb_config.h"
18#include "gdk.h"
19#include "gdk_private.h"
20
21gdk_return
22unshare_string_heap(BAT *b)
23{
24 assert(b->batCacheid > 0);
25 if (b->ttype == TYPE_str &&
26 b->tvheap->parentid != b->batCacheid) {
27 Heap *h = GDKzalloc(sizeof(Heap));
28 if (h == NULL)
29 return GDK_FAIL;
30 h->parentid = b->batCacheid;
31 h->farmid = BBPselectfarm(b->batRole, TYPE_str, varheap);
32 strconcat_len(h->filename, sizeof(h->filename),
33 BBP_physical(b->batCacheid), ".theap", NULL);
34 if (HEAPcopy(h, b->tvheap) != GDK_SUCCEED) {
35 HEAPfree(h, true);
36 GDKfree(h);
37 return GDK_FAIL;
38 }
39 BBPunshare(b->tvheap->parentid);
40 b->tvheap = h;
41 }
42 return GDK_SUCCEED;
43}
44
45/* We try to be clever when appending one string bat to another.
46 * First of all, we try to actually share the string heap so that we
47 * don't need an extra copy, and if that can't be done, we see whether
48 * it makes sense to just quickly copy the whole string heap instead
49 * of inserting individual strings. See the comments in the code for
50 * more information. */
51static gdk_return
52insert_string_bat(BAT *b, BAT *n, BAT *s, bool force)
53{
54 BATiter ni; /* iterator */
55 size_t toff = ~(size_t) 0; /* tail offset */
56 BUN p, r; /* loop variables */
57 const void *tp; /* tail value pointer */
58 unsigned char tbv; /* tail value-as-bte */
59 unsigned short tsv; /* tail value-as-sht */
60#if SIZEOF_VAR_T == 8
61 unsigned int tiv; /* tail value-as-int */
62#endif
63 var_t v; /* value */
64 size_t off; /* offset within n's string heap */
65 struct canditer ci;
66 BUN cnt;
67
68 assert(b->ttype == TYPE_str);
69 /* only transient bats can use some other bat's string heap */
70 assert(b->batRole == TRANSIENT || b->tvheap->parentid == b->batCacheid);
71 if (n->batCount == 0 || (s && s->batCount == 0))
72 return GDK_SUCCEED;
73 ni = bat_iterator(n);
74 tp = NULL;
75 cnt = canditer_init(&ci, n, s);
76 if (cnt == 0)
77 return GDK_SUCCEED;
78 if ((!GDK_ELIMDOUBLES(b->tvheap) || b->batCount == 0) &&
79 !GDK_ELIMDOUBLES(n->tvheap) &&
80 b->tvheap->hashash == n->tvheap->hashash) {
81 if (b->batRole == TRANSIENT || b->tvheap == n->tvheap) {
82 /* If b is in the transient farm (i.e. b will
83 * never become persistent), we try some
84 * clever tricks to avoid copying:
85 * - if b is empty, we just let it share the
86 * string heap with n;
87 * - otherwise, if b's string heap and n's
88 * string heap are the same (i.e. shared),
89 * we leave it that way (this includes the
90 * case that b is persistent and n shares
91 * its string heap with b);
92 * - otherwise, if b shares its string heap
93 * with some other bat, we materialize it
94 * and we will have to copy strings.
95 */
96 bat bid = b->batCacheid;
97
98 /* if candidates are not dense, there is no
99 * wholesale copying of n's offset heap, but
100 * we may still be able to share the string
101 * heap */
102 if (b->batCount == 0 &&
103 b->tvheap != n->tvheap &&
104 ci.tpe == cand_dense) {
105 if (b->tvheap->parentid != bid) {
106 BBPunshare(b->tvheap->parentid);
107 } else {
108 HEAPfree(b->tvheap, true);
109 GDKfree(b->tvheap);
110 }
111 BBPshare(n->tvheap->parentid);
112 b->tvheap = n->tvheap;
113 b->batDirtydesc = true;
114 toff = 0;
115 } else if (b->tvheap->parentid == n->tvheap->parentid &&
116 ci.tpe == cand_dense) {
117 toff = 0;
118 } else if (b->tvheap->parentid != bid &&
119 unshare_string_heap(b) != GDK_SUCCEED) {
120 return GDK_FAIL;
121 }
122 }
123 if (toff == ~(size_t) 0 && cnt > 1024) {
124 /* If b and n aren't sharing their string
125 * heaps, we try to determine whether to copy
126 * n's whole string heap to the end of b's, or
127 * whether we will insert each string from n
128 * individually. We do this by testing a
129 * sample of n's strings and extrapolating
130 * from that sample whether n uses a
131 * significant part of its string heap for its
132 * strings (i.e. whether there are many unused
133 * strings in n's string heap). If n doesn't
134 * have many strings in the first place, we
135 * skip this and just insert them all
136 * individually. We also check whether a
137 * significant number of n's strings happen to
138 * have the same offset in b. In the latter
139 * case we also want to insert strings
140 * individually, but reusing the string in b's
141 * string heap. */
142 int match = 0, i;
143 size_t len = b->tvheap->hashash ? 1024 * EXTRALEN : 0;
144 for (i = 0; i < 1024; i++) {
145 p = (BUN) (((double) rand() / RAND_MAX) * (cnt - 1));
146 p = canditer_idx(&ci, p) - n->hseqbase;
147 off = BUNtvaroff(ni, p);
148 if (off < b->tvheap->free &&
149 strcmp(b->tvheap->base + off, n->tvheap->base + off) == 0 &&
150 (!b->tvheap->hashash ||
151 ((BUN *) (b->tvheap->base + off))[-1] == (n->tvheap->hashash ? ((BUN *) (n->tvheap->base + off))[-1] : strHash(n->tvheap->base + off))))
152 match++;
153 len += (strlen(n->tvheap->base + off) + 8) & ~7;
154 }
155 if (match < 768 && (size_t) (BATcount(n) * (double) len / 1024) >= n->tvheap->free / 2) {
156 /* append string heaps */
157 toff = b->batCount == 0 ? 0 : b->tvheap->free;
158 /* make sure we get alignment right */
159 toff = (toff + GDK_VARALIGN - 1) & ~(GDK_VARALIGN - 1);
160 /* if in "force" mode, the heap may be
161 * shared when memory mapped */
162 if (HEAPextend(b->tvheap, toff + n->tvheap->size, force) != GDK_SUCCEED) {
163 toff = ~(size_t) 0;
164 goto bunins_failed;
165 }
166 memcpy(b->tvheap->base + toff, n->tvheap->base, n->tvheap->free);
167 b->tvheap->free = toff + n->tvheap->free;
168 if (toff > 0) {
169 /* flush double-elimination
170 * hash table */
171 memset(b->tvheap->base, 0,
172 GDK_STRHASHSIZE);
173 }
174 }
175 }
176 if (toff != ~(size_t) 0) {
177 /* we only have to copy the offsets from n to
178 * b, possibly with an offset (if toff != 0),
179 * so set up some variables and set up b's
180 * tail so that it looks like it's a fixed
181 * size column. Of course, we must make sure
182 * first that the width of b's offset heap can
183 * accommodate all values. */
184 if (b->twidth < SIZEOF_VAR_T &&
185 ((size_t) 1 << 8 * b->twidth) <= (b->twidth <= 2 ? b->tvheap->size - GDK_VAROFFSET : b->tvheap->size)) {
186 /* offsets aren't going to fit, so
187 * widen offset heap */
188 if (GDKupgradevarheap(b, (var_t) b->tvheap->size, false, force) != GDK_SUCCEED) {
189 toff = ~(size_t) 0;
190 goto bunins_failed;
191 }
192 }
193 }
194 } else if (unshare_string_heap(b) != GDK_SUCCEED)
195 return GDK_FAIL;
196 if (toff == 0 && n->twidth == b->twidth && ci.tpe == cand_dense) {
197 /* we don't need to do any translation of offset
198 * values, so we can use fast memcpy */
199 memcpy(Tloc(b, BUNlast(b)), Tloc(n, ci.seq - n->hseqbase), cnt << n->tshift);
200 BATsetcount(b, BATcount(b) + cnt);
201 } else if (toff != ~(size_t) 0) {
202 /* we don't need to insert any actual strings since we
203 * have already made sure that they are all in b's
204 * string heap at known locations (namely the offset
205 * in n added to toff), so insert offsets from n after
206 * adding toff into b */
207 /* note the use of the "restrict" qualifier here: all
208 * four pointers below point to the same value, but
209 * only one of them will actually be used, hence we
210 * still obey the rule for restrict-qualified
211 * pointers */
212 const unsigned char *restrict tbp = (const unsigned char *) Tloc(n, 0);
213 const unsigned short *restrict tsp = (const unsigned short *) Tloc(n, 0);
214#if SIZEOF_VAR_T == 8
215 const unsigned int *restrict tip = (const unsigned int *) Tloc(n, 0);
216#endif
217 const var_t *restrict tvp = (const var_t *) Tloc(n, 0);
218
219 switch (b->twidth) {
220 case 1:
221 b->ttype = TYPE_bte;
222 tp = &tbv;
223 break;
224 case 2:
225 b->ttype = TYPE_sht;
226 tp = &tsv;
227 break;
228#if SIZEOF_VAR_T == 8
229 case 4:
230 b->ttype = TYPE_int;
231 tp = &tiv;
232 break;
233 case 8:
234 b->ttype = TYPE_lng;
235 tp = &v;
236 break;
237#else
238 case 4:
239 b->ttype = TYPE_int;
240 tp = &v;
241 break;
242#endif
243 default:
244 assert(0);
245 }
246 b->tvarsized = false;
247 while (cnt > 0) {
248 cnt--;
249 p = canditer_next(&ci) - n->hseqbase;
250 switch (n->twidth) {
251 case 1:
252 v = (var_t) tbp[p] + GDK_VAROFFSET;
253 break;
254 case 2:
255 v = (var_t) tsp[p] + GDK_VAROFFSET;
256 break;
257#if SIZEOF_VAR_T == 8
258 case 4:
259 v = (var_t) tip[p];
260 break;
261#endif
262 default:
263 v = tvp[p];
264 break;
265 }
266 v = (var_t) ((size_t) v + toff);
267 assert(v >= GDK_VAROFFSET);
268 assert((size_t) v < b->tvheap->free);
269 if (BUNlast(b) >= BATcapacity(b)) {
270 if (BATcount(b) == BUN_MAX) {
271 GDKerror("bunfastapp: too many elements to accomodate (" BUNFMT ")\n", BUN_MAX);
272 goto bunins_failed;
273 }
274 if (BATextend(b, BATgrows(b)) != GDK_SUCCEED)
275 goto bunins_failed;
276 }
277 switch (b->twidth) {
278 case 1:
279 assert(v - GDK_VAROFFSET < ((var_t) 1 << 8));
280 ((uint8_t *) b->theap.base)[b->batCount++] = (uint8_t) (v - GDK_VAROFFSET);
281 b->theap.free += 1;
282 break;
283 case 2:
284 assert(v - GDK_VAROFFSET < ((var_t) 1 << 16));
285 ((uint16_t *) b->theap.base)[b->batCount++] = (uint16_t) (v - GDK_VAROFFSET);
286 b->theap.free += 2;
287 break;
288#if SIZEOF_VAR_T == 8
289 case 4:
290 assert(v < ((var_t) 1 << 32));
291 ((uint32_t *) b->theap.base)[b->batCount++] = (uint32_t) v;
292 b->theap.free += 4;
293 break;
294#endif
295 default:
296 ((var_t *) b->theap.base)[b->batCount++] = v;
297 b->theap.free += sizeof(var_t);
298 break;
299 }
300 b->theap.dirty = true;
301 }
302 b->tvarsized = true;
303 b->ttype = TYPE_str;
304 } else if (b->tvheap->free < n->tvheap->free / 2 ||
305 GDK_ELIMDOUBLES(b->tvheap)) {
306 /* if b's string heap is much smaller than n's string
307 * heap, don't bother checking whether n's string
308 * values occur in b's string heap; also, if b is
309 * (still) fully double eliminated, we must continue
310 * to use the double elimination mechanism */
311 r = BUNlast(b);
312 oid hseq = n->hseqbase;
313 while (cnt > 0) {
314 cnt--;
315 p = canditer_next(&ci) - hseq;
316 tp = BUNtvar(ni, p);
317 bunfastappVAR(b, tp);
318 HASHins(b, r, tp);
319 r++;
320 }
321 } else {
322 /* Insert values from n individually into b; however,
323 * we check whether there is a string in b's string
324 * heap at the same offset as the string is in n's
325 * string heap (in case b's string heap is a copy of
326 * n's). If this is the case, we just copy the
327 * offset, otherwise we insert normally. */
328 r = BUNlast(b);
329 while (cnt > 0) {
330 cnt--;
331 p = canditer_next(&ci) - n->hseqbase;
332 off = BUNtvaroff(ni, p); /* the offset */
333 tp = n->tvheap->base + off; /* the string */
334 if (off < b->tvheap->free &&
335 strcmp(b->tvheap->base + off, tp) == 0 &&
336 (!b->tvheap->hashash ||
337 ((BUN *) (b->tvheap->base + off))[-1] == (n->tvheap->hashash ? ((BUN *) tp)[-1] : strHash(tp)))) {
338 /* we found the string at the same
339 * offset in b's string heap as it was
340 * in n's string heap, so we don't
341 * have to insert a new string into b:
342 * we can just copy the offset */
343 v = (var_t) off;
344 if (b->twidth < SIZEOF_VAR_T &&
345 ((size_t) 1 << 8 * b->twidth) <= (b->twidth <= 2 ? v - GDK_VAROFFSET : v)) {
346 /* offset isn't going to fit,
347 * so widen offset heap */
348 if (GDKupgradevarheap(b, v, false, force) != GDK_SUCCEED) {
349 goto bunins_failed;
350 }
351 }
352 switch (b->twidth) {
353 case 1:
354 assert(v - GDK_VAROFFSET < ((var_t) 1 << 8));
355 *(unsigned char *)Tloc(b, BUNlast(b)) = (unsigned char) (v - GDK_VAROFFSET);
356 b->theap.free += 1;
357 break;
358 case 2:
359 assert(v - GDK_VAROFFSET < ((var_t) 1 << 16));
360 *(unsigned short *)Tloc(b, BUNlast(b)) = (unsigned short) (v - GDK_VAROFFSET);
361 b->theap.free += 2;
362 break;
363#if SIZEOF_VAR_T == 8
364 case 4:
365 assert(v < ((var_t) 1 << 32));
366 *(unsigned int *)Tloc(b, BUNlast(b)) = (unsigned int) v;
367 b->theap.free += 4;
368 break;
369#endif
370 default:
371 *(var_t *)Tloc(b, BUNlast(b)) = v;
372 b->theap.free += SIZEOF_VAR_T;
373 break;
374 }
375 b->batCount++;
376 } else {
377 bunfastappVAR(b, tp);
378 }
379 HASHins(b, r, tp);
380 r++;
381 }
382 }
383 b->theap.dirty = true;
384 return GDK_SUCCEED;
385 bunins_failed:
386 b->tvarsized = true;
387 b->ttype = TYPE_str;
388 return GDK_FAIL;
389}
390
391static gdk_return
392append_varsized_bat(BAT *b, BAT *n, BAT *s)
393{
394 BATiter ni;
395 struct canditer ci;
396 BUN cnt, r;
397 oid hseq = n->hseqbase;
398
399 /* only transient bats can use some other bat's vheap */
400 assert(b->batRole == TRANSIENT || b->tvheap->parentid == b->batCacheid);
401 /* make sure the bats use var_t */
402 assert(b->twidth == n->twidth);
403 assert(b->twidth == SIZEOF_VAR_T);
404 if (n->batCount == 0 || (s && s->batCount == 0))
405 return GDK_SUCCEED;
406 cnt = canditer_init(&ci, n, s);
407 if (cnt == 0)
408 return GDK_SUCCEED;
409 if (BATcount(b) == 0 &&
410 b->batRole == TRANSIENT &&
411 n->batRestricted == BAT_READ &&
412 b->tvheap != n->tvheap) {
413 /* if b is still empty, in the transient farm, and n
414 * is read-only, we replace b's vheap with a reference
415 * to n's */
416 if (b->tvheap->parentid != b->batCacheid) {
417 BBPunshare(b->tvheap->parentid);
418 } else {
419 HEAPfree(b->tvheap, true);
420 GDKfree(b->tvheap);
421 }
422 BBPshare(n->tvheap->parentid);
423 b->tvheap = n->tvheap;
424 b->batDirtydesc = true;
425 }
426 if (b->tvheap == n->tvheap) {
427 /* if b and n use the same vheap, we only need to copy
428 * the offsets from n to b */
429 HASHdestroy(b); /* not maintaining, so destroy it */
430 if (ci.tpe == cand_dense) {
431 /* fast memcpy since we copy a consecutive
432 * chunk of memory */
433 memcpy(Tloc(b, BUNlast(b)),
434 Tloc(n, ci.seq - hseq),
435 cnt << b->tshift);
436 } else {
437 var_t *restrict dst = (var_t *) Tloc(b, BUNlast(b));
438 const var_t *restrict src = (const var_t *) Tloc(n, 0);
439 while (cnt > 0) {
440 cnt--;
441 *dst++ = src[canditer_next(&ci) - hseq];
442 }
443 }
444 b->theap.dirty = true;
445 BATsetcount(b, BATcount(b) + ci.ncand);
446 return GDK_SUCCEED;
447 }
448 /* b and n do not share their vheap, so we need to copy data */
449 if (b->tvheap->parentid != b->batCacheid) {
450 /* if b shares its vheap with some other bat, unshare it */
451 Heap *h = GDKzalloc(sizeof(Heap));
452 if (h == NULL)
453 return GDK_FAIL;
454 h->parentid = b->batCacheid;
455 h->farmid = BBPselectfarm(b->batRole, b->ttype, varheap);
456 strconcat_len(h->filename, sizeof(h->filename),
457 BBP_physical(b->batCacheid), ".theap", NULL);
458 if (HEAPcopy(h, b->tvheap) != GDK_SUCCEED) {
459 HEAPfree(h, true);
460 GDKfree(h);
461 return GDK_FAIL;
462 }
463 BBPunshare(b->tvheap->parentid);
464 b->tvheap = h;
465 }
466 /* copy data from n to b */
467 ni = bat_iterator(n);
468 r = BUNlast(b);
469 while (cnt > 0) {
470 cnt--;
471 BUN p = canditer_next(&ci) - hseq;
472 const void *t = BUNtvar(ni, p);
473 bunfastapp_nocheckVAR(b, r, t, Tsize(b));
474 HASHins(b, r, t);
475 r++;
476 }
477 b->theap.dirty = true;
478 return GDK_SUCCEED;
479
480 bunins_failed:
481 if (b->tunique)
482 BBPunfix(s->batCacheid);
483 return GDK_FAIL;
484}
485
486/* Append the contents of BAT n (subject to the optional candidate
487 * list s) to BAT b. If b is empty, b will get the seqbase of s if it
488 * was passed in, and else the seqbase of n. */
489gdk_return
490BATappend(BAT *b, BAT *n, BAT *s, bool force)
491{
492 struct canditer ci;
493 BUN cnt;
494 BUN r;
495 PROPrec *prop, *nprop;
496 oid hseq = n->hseqbase;
497
498 if (b == NULL || n == NULL || (cnt = BATcount(n)) == 0) {
499 return GDK_SUCCEED;
500 }
501 assert(b->batCacheid > 0);
502 assert(b->theap.parentid == 0);
503
504 ALIGNapp(b, "BATappend", force, GDK_FAIL);
505
506 if (ATOMstorage(ATOMtype(b->ttype)) != ATOMstorage(ATOMtype(n->ttype))) {
507 GDKerror("Incompatible operands.\n");
508 return GDK_FAIL;
509 }
510 CHECKDEBUG {
511 if (BATttype(b) != BATttype(n) &&
512 ATOMtype(b->ttype) != ATOMtype(n->ttype)) {
513 fprintf(stderr,"#Interpreting %s as %s.\n",
514 ATOMname(BATttype(n)), ATOMname(BATttype(b)));
515 }
516 }
517
518 if (b->tunique) {
519 /* if b has the unique bit set, only insert values
520 * from n that don't already occur in b, and make sure
521 * we don't insert any duplicates either; we do this
522 * by calculating a subset of n that complies with
523 * this */
524 BAT *d;
525
526 d = BATdiff(n, b, s, NULL, true, false, BUN_NONE);
527 if (d == NULL)
528 return GDK_FAIL;
529 s = BATunique(n, d);
530 BBPunfix(d->batCacheid);
531 if (s == NULL)
532 return GDK_FAIL;
533 if (BATcount(s) == 0) {
534 /* no new values in subset of n */
535 BBPunfix(s->batCacheid);
536 return GDK_SUCCEED;
537 }
538 }
539
540 cnt = canditer_init(&ci, n, s);
541 if (cnt == 0) {
542 assert(!b->tunique);
543 return GDK_SUCCEED;
544 }
545
546 if (BUNlast(b) + cnt > BUN_MAX) {
547 if (b->tunique)
548 BBPunfix(s->batCacheid);
549 GDKerror("BATappend: combined BATs too large\n");
550 return GDK_FAIL;
551 }
552
553 if (b->hseqbase + BATcount(b) + cnt >= GDK_oid_max) {
554 if (b->tunique)
555 BBPunfix(s->batCacheid);
556 GDKerror("BATappend: overflow of head value\n");
557 return GDK_FAIL;
558 }
559
560 b->batDirtydesc = true;
561
562 IMPSdestroy(b); /* imprints do not support updates yet */
563 OIDXdestroy(b);
564 if ((prop = BATgetprop(b, GDK_MAX_VALUE)) != NULL) {
565 if ((nprop = BATgetprop(n, GDK_MAX_VALUE)) != NULL) {
566 if (ATOMcmp(b->ttype, VALptr(&prop->v), VALptr(&nprop->v)) < 0) {
567 if (s == NULL)
568 BATsetprop(b, GDK_MAX_VALUE, b->ttype, VALptr(&nprop->v));
569 else
570 BATrmprop(b, GDK_MAX_VALUE);
571 }
572 } else {
573 BATrmprop(b, GDK_MAX_VALUE);
574 }
575 }
576 if ((prop = BATgetprop(b, GDK_MIN_VALUE)) != NULL) {
577 if ((nprop = BATgetprop(n, GDK_MIN_VALUE)) != NULL) {
578 if (ATOMcmp(b->ttype, VALptr(&prop->v), VALptr(&nprop->v)) > 0) {
579 if (s == NULL)
580 BATsetprop(b, GDK_MIN_VALUE, b->ttype, VALptr(&nprop->v));
581 else
582 BATrmprop(b, GDK_MIN_VALUE);
583 }
584 } else {
585 BATrmprop(b, GDK_MIN_VALUE);
586 }
587 }
588#if 0 /* enable if we have more properties than just min/max */
589 do {
590 for (prop = b->tprops; prop; prop = prop->next)
591 if (prop->id != GDK_MAX_VALUE &&
592 prop->id != GDK_MIN_VALUE &&
593 prop->id != GDK_HASH_MASK) {
594 BATrmprop(b, prop->id);
595 break;
596 }
597 } while (prop);
598#endif
599 if (b->thash == (Hash *) 1 || BATcount(b) == 0 ||
600 (b->thash && ((size_t *) b->thash->heap.base)[0] & (1 << 24))) {
601 /* don't bother first loading the hash to then change
602 * it, or updating the hash if we replace the heap,
603 * also, we cannot maintain persistent hashes */
604 HASHdestroy(b);
605 }
606
607 if (b->ttype == TYPE_void) {
608 /* b does not have storage, keep it that way if we can */
609 HASHdestroy(b); /* we're not maintaining the hash here */
610 if (BATtdense(n) && ci.tpe == cand_dense &&
611 (BATcount(b) == 0 ||
612 (BATtdense(b) &&
613 b->tseqbase + BATcount(b) == n->tseqbase + ci.seq - hseq))) {
614 /* n is also dense and consecutive with b */
615 if (BATcount(b) == 0)
616 BATtseqbase(b, n->tseqbase + ci.seq - hseq);
617 BATsetcount(b, BATcount(b) + cnt);
618 if (b->tunique)
619 BBPunfix(s->batCacheid);
620 return GDK_SUCCEED;
621 }
622 if ((BATcount(b) == 0 || is_oid_nil(b->tseqbase)) &&
623 n->ttype == TYPE_void && is_oid_nil(n->tseqbase)) {
624 /* both b and n are void/nil */
625 BATtseqbase(b, oid_nil);
626 BATsetcount(b, BATcount(b) + cnt);
627 if (b->tunique)
628 BBPunfix(s->batCacheid);
629 return GDK_SUCCEED;
630 }
631 /* we need to materialize b; allocate enough capacity */
632 b->batCapacity = BATcount(b) + cnt;
633 if (BATmaterialize(b) != GDK_SUCCEED)
634 goto bunins_failed;
635 } else if (cnt > BATcapacity(b) - BATcount(b)) {
636 /* if needed space exceeds a normal growth extend just
637 * with what's needed */
638 BUN ncap = BATcount(b) + cnt;
639 BUN grows = BATgrows(b);
640
641 if (ncap > grows)
642 grows = ncap;
643 if (BATextend(b, grows) != GDK_SUCCEED)
644 goto bunins_failed;
645 }
646
647 /* if growing too much, remove the hash, else we maintain it */
648 MT_lock_set(&b->batIdxLock);
649 if (b->thash == (Hash *) 1 ||
650 (b->thash != NULL &&
651 (2 * b->thash->mask) < (BATcount(b) + cnt))) {
652 MT_lock_unset(&b->batIdxLock);
653 HASHdestroy(b);
654 } else {
655 MT_lock_unset(&b->batIdxLock);
656 }
657
658 r = BUNlast(b);
659
660 if (BATcount(b) == 0) {
661 b->tsorted = n->tsorted;
662 b->trevsorted = n->trevsorted;
663 b->tseqbase = oid_nil;
664 b->tnonil = n->tnonil;
665 b->tnil = n->tnil && cnt == BATcount(n);
666 b->tseqbase = oid_nil;
667 if (ci.tpe == cand_dense) {
668 b->tnosorted = ci.seq - hseq <= n->tnosorted && n->tnosorted < ci.seq + cnt - hseq ? n->tnosorted + hseq - ci.seq : 0;
669 b->tnorevsorted = ci.seq - hseq <= n->tnorevsorted && n->tnorevsorted < ci.seq + cnt - hseq ? n->tnorevsorted + hseq - ci.seq : 0;
670 if (BATtdense(n)) {
671 b->tseqbase = n->tseqbase + ci.seq - hseq;
672 }
673 } else {
674 b->tnosorted = 0;
675 b->tnorevsorted = 0;
676 }
677 /* if tunique, uniqueness is guaranteed above */
678 b->tkey = n->tkey | b->tunique;
679 if (!b->tunique && cnt == BATcount(n)) {
680 b->tnokey[0] = n->tnokey[0];
681 b->tnokey[1] = n->tnokey[1];
682 } else {
683 b->tnokey[0] = b->tnokey[1] = 0;
684 }
685 } else {
686 BUN last = r - 1;
687 BATiter ni = bat_iterator(n);
688 BATiter bi = bat_iterator(b);
689 int xx = ATOMcmp(b->ttype,
690 BUNtail(ni, ci.seq - hseq),
691 BUNtail(bi, last));
692 if (BATtordered(b) && (!BATtordered(n) || xx < 0)) {
693 b->tsorted = false;
694 b->tnosorted = 0;
695 b->tseqbase = oid_nil;
696 }
697 if (BATtrevordered(b) &&
698 (!BATtrevordered(n) || xx > 0)) {
699 b->trevsorted = false;
700 b->tnorevsorted = 0;
701 }
702 if (!b->tunique && /* uniqueness is guaranteed above */
703 b->tkey &&
704 (!(BATtordered(b) || BATtrevordered(b)) ||
705 !n->tkey || xx == 0)) {
706 BATkey(b, false);
707 }
708 if (b->ttype != TYPE_void && b->tsorted && BATtdense(b) &&
709 (!BATtdense(n) ||
710 ci.tpe != cand_dense ||
711 1 + *(oid *) BUNtloc(bi, last) != BUNtoid(n, ci.seq - hseq))) {
712 b->tseqbase = oid_nil;
713 }
714 b->tnonil &= n->tnonil;
715 b->tnil |= n->tnil && cnt == BATcount(n);
716 }
717 if (b->ttype == TYPE_str) {
718 if (insert_string_bat(b, n, s, force) != GDK_SUCCEED) {
719 if (b->tunique)
720 BBPunfix(s->batCacheid);
721 return GDK_FAIL;
722 }
723 } else if (ATOMvarsized(b->ttype)) {
724 if (append_varsized_bat(b, n, s) != GDK_SUCCEED) {
725 if (b->tunique)
726 BBPunfix(s->batCacheid);
727 return GDK_FAIL;
728 }
729 } else {
730 if (BATatoms[b->ttype].atomFix == NULL &&
731 b->ttype != TYPE_void &&
732 n->ttype != TYPE_void &&
733 ci.tpe == cand_dense) {
734 /* use fast memcpy if we can, but then we
735 * can't maintain the hash */
736 HASHdestroy(b);
737 memcpy(Tloc(b, BUNlast(b)),
738 Tloc(n, ci.seq - hseq),
739 cnt * Tsize(n));
740 BATsetcount(b, BATcount(b) + cnt);
741 } else {
742 BATiter ni = bat_iterator(n);
743
744 while (cnt > 0) {
745 cnt--;
746 BUN p = canditer_next(&ci) - hseq;
747 const void *t = BUNtail(ni, p);
748 bunfastapp_nocheck(b, r, t, Tsize(b));
749 HASHins(b, r, t);
750 r++;
751 }
752 }
753 b->theap.dirty = true;
754 }
755 if (b->tunique)
756 BBPunfix(s->batCacheid);
757 return GDK_SUCCEED;
758 bunins_failed:
759 if (b->tunique)
760 BBPunfix(s->batCacheid);
761 return GDK_FAIL;
762}
763
764gdk_return
765BATdel(BAT *b, BAT *d)
766{
767 int (*unfix) (const void *) = BATatoms[b->ttype].atomUnfix;
768 void (*atmdel) (Heap *, var_t *) = BATatoms[b->ttype].atomDel;
769 BATiter bi = bat_iterator(b);
770
771 assert(ATOMtype(d->ttype) == TYPE_oid);
772 assert(d->tsorted);
773 assert(d->tkey);
774 if (BATcount(d) == 0)
775 return GDK_SUCCEED;
776 if (BATtdense(d)) {
777 oid o = d->tseqbase;
778 BUN c = BATcount(d);
779
780 if (o + c <= b->hseqbase)
781 return GDK_SUCCEED;
782 if (o < b->hseqbase) {
783 c -= b->hseqbase - o;
784 o = b->hseqbase;
785 }
786 if (o - b->hseqbase < b->batInserted) {
787 GDKerror("BATdelete: cannot delete committed values\n");
788 return GDK_FAIL;
789 }
790 if (o + c > b->hseqbase + BATcount(b))
791 c = b->hseqbase + BATcount(b) - o;
792 if (c == 0)
793 return GDK_SUCCEED;
794 if (unfix || atmdel) {
795 BUN p = o - b->hseqbase;
796 BUN q = p + c;
797 while (p < q) {
798 if (unfix)
799 (*unfix)(BUNtail(bi, p));
800 if (atmdel)
801 (*atmdel)(b->tvheap, (var_t *) BUNtloc(bi, p));
802 p++;
803 }
804 }
805 if (BATtdense(b) && BATmaterialize(b) != GDK_SUCCEED)
806 return GDK_FAIL;
807 if (o + c < b->hseqbase + BATcount(b)) {
808 memmove(Tloc(b, o - b->hseqbase),
809 Tloc(b, o + c - b->hseqbase),
810 Tsize(b) * (BATcount(b) - (o + c - b->hseqbase)));
811 }
812 b->batCount -= c;
813 } else {
814 const oid *o = (const oid *) Tloc(d, 0);
815 const oid *s;
816 BUN c = BATcount(d);
817 BUN nd = 0;
818 char *p;
819
820 if (o[c - 1] <= b->hseqbase)
821 return GDK_SUCCEED;
822 while (*o < b->hseqbase) {
823 o++;
824 c--;
825 }
826 if (*o - b->hseqbase < b->batInserted) {
827 GDKerror("BATdelete: cannot delete committed values\n");
828 return GDK_FAIL;
829 }
830 if (BATtdense(b) && BATmaterialize(b) != GDK_SUCCEED)
831 return GDK_FAIL;
832 s = o;
833 p = Tloc(b, *o - b->hseqbase);
834 while (c > 0 && *o < b->hseqbase + BATcount(b)) {
835 size_t n;
836 if (unfix)
837 (*unfix)(BUNtail(bi, *o - b->hseqbase));
838 if (atmdel)
839 (*atmdel)(b->tvheap, (var_t *) BUNtloc(bi, *o - b->hseqbase));
840 o++;
841 c--;
842 nd++;
843 if (c == 0 || *o - b->hseqbase >= BATcount(b))
844 n = b->hseqbase + BATcount(b) - o[-1] - 1;
845 else if ((oid) (o - s) < *o - *s)
846 n = o[0] - o[-1] - 1;
847 else
848 n = 0;
849 if (n > 0) {
850 n *= Tsize(b);
851 memmove(p,
852 Tloc(b, o[-1] + 1 - b->hseqbase),
853 n);
854 p += n;
855 s = o;
856 }
857 }
858 b->batCount -= nd;
859 }
860 if (b->batCount <= 1) {
861 /* some trivial properties */
862 b->tkey = true;
863 b->tsorted = b->trevsorted = true;
864 if (b->batCount == 0) {
865 b->tnil = false;
866 b->tnonil = true;
867 }
868 }
869 /* not sure about these anymore */
870 b->tnosorted = b->tnorevsorted = 0;
871 b->tnokey[0] = b->tnokey[1] = 0;
872 PROPdestroy(b);
873
874 return GDK_SUCCEED;
875}
876
877/*
878 * The last in this series is a BATreplace, which replaces all the
879 * buns mentioned.
880 */
881gdk_return
882BATreplace(BAT *b, BAT *p, BAT *n, bool force)
883{
884 if (b == NULL || p == NULL || n == NULL || BATcount(n) == 0) {
885 return GDK_SUCCEED;
886 }
887 if (void_replace_bat(b, p, n, force) != GDK_SUCCEED)
888 return GDK_FAIL;
889 return GDK_SUCCEED;
890}
891
892
893/*
894 * BAT Selections
895 * The BAT selectors are among the most heavily used operators.
896 * Their efficient implementation is therefore mandatory.
897 *
898 * BAT slice
899 * This function returns a horizontal slice from a BAT. It optimizes
900 * execution by avoiding to copy when the BAT is memory mapped (in
901 * this case, an independent submap is created) or else when it is
902 * read-only, then a VIEW bat is created as a result.
903 *
904 * If a new copy has to be created, this function takes care to
905 * preserve void-columns (in this case, the seqbase has to be
906 * recomputed in the result).
907 *
908 * NOTE new semantics, the selected range is excluding the high value.
909 */
910BAT *
911BATslice(BAT *b, BUN l, BUN h)
912{
913 BUN low = l;
914 BAT *bn;
915 BATiter bni, bi = bat_iterator(b);
916 oid foid; /* first oid value if oid column */
917
918 BATcheck(b, "BATslice", NULL);
919 if (h > BATcount(b))
920 h = BATcount(b);
921 if (h < l)
922 h = l;
923
924 if (l > BUN_MAX || h > BUN_MAX) {
925 GDKerror("BATslice: boundary out of range\n");
926 return NULL;
927 }
928
929 if (b->ttype == TYPE_void && b->tvheap != NULL) {
930 /* slicing a candidate list with exceptions */
931 struct canditer ci;
932 canditer_init(&ci, NULL, b);
933 return canditer_slice(&ci, l, h);
934 }
935 /* If the source BAT is readonly, then we can obtain a VIEW
936 * that just reuses the memory of the source. */
937 if (b->batRestricted == BAT_READ &&
938 (!VIEWtparent(b) ||
939 BBP_cache(VIEWtparent(b))->batRestricted == BAT_READ)) {
940 bn = VIEWcreate(b->hseqbase + low, b);
941 if (bn == NULL)
942 return NULL;
943 VIEWbounds(b, bn, l, h);
944 } else {
945 /* create a new BAT and put everything into it */
946 BUN p = l;
947 BUN q = h;
948
949 bn = COLnew((oid) (b->hseqbase + low), BATtdense(b) ? TYPE_void : b->ttype, h - l, TRANSIENT);
950 if (bn == NULL)
951 return NULL;
952
953 if (bn->ttype == TYPE_void ||
954 (!bn->tvarsized &&
955 BATatoms[bn->ttype].atomPut == NULL &&
956 BATatoms[bn->ttype].atomFix == NULL)) {
957 if (bn->ttype) {
958 memcpy(Tloc(bn, 0), Tloc(b, p),
959 (q - p) * Tsize(bn));
960 bn->theap.dirty = true;
961 }
962 BATsetcount(bn, h - l);
963 } else {
964 for (; p < q; p++) {
965 bunfastapp(bn, BUNtail(bi, p));
966 }
967 }
968 bn->theap.dirty = true;
969 bn->tsorted = b->tsorted;
970 bn->trevsorted = b->trevsorted;
971 bn->tkey = b->tkey;
972 bn->tnonil = b->tnonil;
973 if (b->tnosorted > l && b->tnosorted < h)
974 bn->tnosorted = b->tnosorted - l;
975 else
976 bn->tnosorted = 0;
977 if (b->tnorevsorted > l && b->tnorevsorted < h)
978 bn->tnorevsorted = b->tnorevsorted - l;
979 else
980 bn->tnorevsorted = 0;
981 if (b->tnokey[0] >= l && b->tnokey[0] < h &&
982 b->tnokey[1] >= l && b->tnokey[1] < h &&
983 b->tnokey[0] != b->tnokey[1]) {
984 bn->tnokey[0] = b->tnokey[0] - l;
985 bn->tnokey[1] = b->tnokey[1] - l;
986 } else {
987 bn->tnokey[0] = bn->tnokey[1] = 0;
988 }
989 }
990 bn->tnonil = b->tnonil || bn->batCount == 0;
991 bn->tnil = false; /* we just don't know */
992 bn->tnosorted = 0;
993 bn->tnokey[0] = bn->tnokey[1] = 0;
994 bni = bat_iterator(bn);
995 if (BATtdense(b)) {
996 BATtseqbase(bn, (oid) (b->tseqbase + low));
997 } else if (bn->ttype == TYPE_oid) {
998 if (BATcount(bn) == 0) {
999 BATtseqbase(bn, 0);
1000 } else if (!is_oid_nil((foid = *(oid *) BUNtloc(bni, 0))) &&
1001 (BATcount(bn) == 1 ||
1002 (bn->tkey &&
1003 bn->tsorted &&
1004 foid + BATcount(bn) - 1 == *(oid *) BUNtloc(bni, BUNlast(bn) - 1)))) {
1005 BATtseqbase(bn, foid);
1006 }
1007 }
1008 if (bn->batCount <= 1) {
1009 bn->tsorted = ATOMlinear(b->ttype);
1010 bn->trevsorted = ATOMlinear(b->ttype);
1011 BATkey(bn, true);
1012 } else {
1013 bn->tsorted = b->tsorted;
1014 bn->trevsorted = b->trevsorted;
1015 BATkey(bn, BATtkey(b));
1016 }
1017 ALGODEBUG fprintf(stderr,
1018 "#BATslice(" ALGOBATFMT "," BUNFMT "," BUNFMT ")"
1019 "=" ALGOBATFMT "\n",
1020 ALGOBATPAR(b), l, h, ALGOBATPAR(bn));
1021 return bn;
1022 bunins_failed:
1023 BBPreclaim(bn);
1024 return NULL;
1025}
1026
1027/* Return whether the BAT has all unique values or not. It we don't
1028 * know, invest in a proper check and record the results in the bat
1029 * descriptor. */
1030bool
1031BATkeyed(BAT *b)
1032{
1033 lng t0 = GDKusec();
1034 BATiter bi = bat_iterator(b);
1035 int (*cmpf)(const void *, const void *) = ATOMcompare(b->ttype);
1036 BUN p, q, hb;
1037 Hash *hs = NULL;
1038
1039 if (b->ttype == TYPE_void)
1040 return BATtdense(b) || BATcount(b) <= 1;
1041 if (BATcount(b) <= 1)
1042 return true;
1043 if (b->twidth < SIZEOF_BUN &&
1044 BATcount(b) > (BUN) 1 << (8 * b->twidth)) {
1045 /* more rows than possible bit combinations in the atom */
1046 assert(!b->tkey);
1047 return false;
1048 }
1049
1050 b->batDirtydesc = true;
1051 if (!b->tkey && b->tnokey[0] == 0 && b->tnokey[1] == 0) {
1052 if (b->tsorted || b->trevsorted) {
1053 const void *prev = BUNtail(bi, 0);
1054 const void *cur;
1055 for (q = BUNlast(b), p = 1; p < q; p++) {
1056 cur = BUNtail(bi, p);
1057 if ((*cmpf)(prev, cur) == 0) {
1058 b->tnokey[0] = p - 1;
1059 b->tnokey[1] = p;
1060 ALGODEBUG fprintf(stderr, "#BATkeyed: fixed nokey(" BUNFMT "," BUNFMT ") for %s#" BUNFMT " (" LLFMT " usec)\n", p - 1, p, BATgetId(b), BATcount(b), GDKusec() - t0);
1061 goto doreturn;
1062 }
1063 prev = cur;
1064 }
1065 /* we completed the scan: no duplicates */
1066 b->tkey = true;
1067 } else if (BATcheckhash(b) ||
1068 (!b->batTransient &&
1069 BAThash(b) == GDK_SUCCEED) ||
1070 (VIEWtparent(b) != 0 &&
1071 BATcheckhash(BBPdescriptor(VIEWtparent(b))))) {
1072 /* we already have a hash table on b, or b is
1073 * persistent and we could create a hash
1074 * table, or b is a view on a bat that already
1075 * has a hash table */
1076 BUN lo = 0;
1077
1078 hs = b->thash;
1079 if (hs == NULL && VIEWtparent(b) != 0) {
1080 BAT *b2 = BBPdescriptor(VIEWtparent(b));
1081 lo = (BUN) ((b->theap.base - b2->theap.base) >> b->tshift);
1082 hs = b2->thash;
1083 }
1084 for (q = BUNlast(b), p = 0; p < q; p++) {
1085 const void *v = BUNtail(bi, p);
1086 for (hb = HASHgetlink(hs, p + lo);
1087 hb != HASHnil(hs) && hb >= lo;
1088 hb = HASHgetlink(hs, hb)) {
1089 assert(hb < p + lo);
1090 if ((*cmpf)(v, BUNtail(bi, hb - lo)) == 0) {
1091 b->tnokey[0] = hb - lo;
1092 b->tnokey[1] = p;
1093 ALGODEBUG fprintf(stderr, "#BATkeyed: fixed nokey(" BUNFMT "," BUNFMT ") for %s#" BUNFMT " (" LLFMT " usec)\n", hb - lo, p, BATgetId(b), BATcount(b), GDKusec() - t0);
1094 goto doreturn;
1095 }
1096 }
1097 }
1098 /* we completed the scan: no duplicates */
1099 b->tkey = true;
1100 } else {
1101 const char *nme;
1102 BUN prb;
1103 BUN mask;
1104 int len;
1105
1106 GDKclrerr(); /* not interested in BAThash errors */
1107 nme = BBP_physical(b->batCacheid);
1108 if (ATOMbasetype(b->ttype) == TYPE_bte) {
1109 mask = (BUN) 1 << 8;
1110 cmpf = NULL; /* no compare needed, "hash" is perfect */
1111 } else if (ATOMbasetype(b->ttype) == TYPE_sht) {
1112 mask = (BUN) 1 << 16;
1113 cmpf = NULL; /* no compare needed, "hash" is perfect */
1114 } else {
1115 mask = HASHmask(b->batCount);
1116 if (mask < ((BUN) 1 << 16))
1117 mask = (BUN) 1 << 16;
1118 }
1119 if ((hs = GDKzalloc(sizeof(Hash))) == NULL)
1120 goto doreturn;
1121 len = snprintf(hs->heap.filename, sizeof(hs->heap.filename), "%s.hash%d", nme, THRgettid());
1122 if (len == -1 || len >= (int) sizeof(hs->heap.filename) ||
1123 HASHnew(hs, b->ttype, BUNlast(b), mask, BUN_NONE) != GDK_SUCCEED) {
1124 GDKfree(hs);
1125 /* err on the side of caution: not keyed */
1126 goto doreturn;
1127 }
1128 for (q = BUNlast(b), p = 0; p < q; p++) {
1129 const void *v = BUNtail(bi, p);
1130 prb = HASHprobe(hs, v);
1131 for (hb = HASHget(hs, prb);
1132 hb != HASHnil(hs);
1133 hb = HASHgetlink(hs, hb)) {
1134 if (cmpf == NULL ||
1135 (*cmpf)(v, BUNtail(bi, hb)) == 0) {
1136 b->tnokey[0] = hb;
1137 b->tnokey[1] = p;
1138 ALGODEBUG fprintf(stderr, "#BATkeyed: fixed nokey(" BUNFMT "," BUNFMT ") for %s#" BUNFMT " (" LLFMT " usec)\n", hb, p, BATgetId(b), BATcount(b), GDKusec() - t0);
1139 goto doreturn_free;
1140 }
1141 }
1142 /* enter into hash table */
1143 HASHputlink(hs, p, HASHget(hs, prb));
1144 HASHput(hs, prb, p);
1145 }
1146 doreturn_free:
1147 HEAPfree(&hs->heap, true);
1148 GDKfree(hs);
1149 if (p == q) {
1150 /* we completed the complete scan: no
1151 * duplicates */
1152 b->tkey = true;
1153 }
1154 }
1155 }
1156 doreturn:
1157 return b->tkey;
1158}
1159
1160/* Return whether the BAT is ordered or not. If we don't know, invest
1161 * in a scan and record the results in the bat descriptor. If during
1162 * the scan we happen to find evidence that the BAT is not reverse
1163 * sorted, we record the location. */
1164bool
1165BATordered(BAT *b)
1166{
1167 lng t0 = 0;
1168
1169 ALGODEBUG t0 = GDKusec();
1170
1171 if (b->ttype == TYPE_void)
1172 return true;
1173 /* In order that multiple threads don't scan the same BAT at
1174 * the same time (happens a lot with mitosis/mergetable), we
1175 * use a lock. We reuse the hash lock for this, not because
1176 * this scanning interferes with hashes, but because it's
1177 * there, and not so likely to be used at the same time. */
1178 MT_lock_set(&b->batIdxLock);
1179 if (!b->tsorted && b->tnosorted == 0) {
1180 BATiter bi = bat_iterator(b);
1181 int (*cmpf)(const void *, const void *) = ATOMcompare(b->ttype);
1182 BUN p, q;
1183 b->batDirtydesc = true;
1184 switch (ATOMbasetype(b->ttype)) {
1185 case TYPE_int: {
1186 const int *iptr = (const int *) Tloc(b, 0);
1187 for (q = BUNlast(b), p = 1; p < q; p++) {
1188 if (iptr[p - 1] > iptr[p]) {
1189 b->tnosorted = p;
1190 ALGODEBUG fprintf(stderr, "#BATordered: fixed nosorted(" BUNFMT ") for %s#" BUNFMT " (" LLFMT " usec)\n", p, BATgetId(b), BATcount(b), GDKusec() - t0);
1191 goto doreturn;
1192 } else if (!b->trevsorted &&
1193 b->tnorevsorted == 0 &&
1194 iptr[p - 1] < iptr[p]) {
1195 b->tnorevsorted = p;
1196 ALGODEBUG fprintf(stderr, "#BATordered: fixed norevsorted(" BUNFMT ") for %s#" BUNFMT "\n", p, BATgetId(b), BATcount(b));
1197 }
1198 }
1199 break;
1200 }
1201 case TYPE_lng: {
1202 const lng *lptr = (const lng *) Tloc(b, 0);
1203 for (q = BUNlast(b), p = 1; p < q; p++) {
1204 if (lptr[p - 1] > lptr[p]) {
1205 b->tnosorted = p;
1206 ALGODEBUG fprintf(stderr, "#BATordered: fixed nosorted(" BUNFMT ") for %s#" BUNFMT " (" LLFMT " usec)\n", p, BATgetId(b), BATcount(b), GDKusec() - t0);
1207 goto doreturn;
1208 } else if (!b->trevsorted &&
1209 b->tnorevsorted == 0 &&
1210 lptr[p - 1] < lptr[p]) {
1211 b->tnorevsorted = p;
1212 ALGODEBUG fprintf(stderr, "#BATordered: fixed norevsorted(" BUNFMT ") for %s#" BUNFMT "\n", p, BATgetId(b), BATcount(b));
1213 }
1214 }
1215 break;
1216 }
1217 default:
1218 for (q = BUNlast(b), p = 1; p < q; p++) {
1219 int c;
1220 if ((c = cmpf(BUNtail(bi, p - 1), BUNtail(bi, p))) > 0) {
1221 b->tnosorted = p;
1222 ALGODEBUG fprintf(stderr, "#BATordered: fixed nosorted(" BUNFMT ") for %s#" BUNFMT " (" LLFMT " usec)\n", p, BATgetId(b), BATcount(b), GDKusec() - t0);
1223 goto doreturn;
1224 } else if (!b->trevsorted &&
1225 b->tnorevsorted == 0 &&
1226 c < 0) {
1227 b->tnorevsorted = p;
1228 ALGODEBUG fprintf(stderr, "#BATordered: fixed norevsorted(" BUNFMT ") for %s#" BUNFMT "\n", p, BATgetId(b), BATcount(b));
1229 }
1230 }
1231 break;
1232 }
1233 /* we only get here if we completed the scan; note
1234 * that if we didn't record evidence about *reverse*
1235 * sortedness, we know that the BAT is also reverse
1236 * sorted */
1237 b->tsorted = true;
1238 ALGODEBUG fprintf(stderr, "#BATordered: fixed sorted for %s#" BUNFMT " (" LLFMT " usec)\n", BATgetId(b), BATcount(b), GDKusec() - t0);
1239 if (!b->trevsorted && b->tnorevsorted == 0) {
1240 b->trevsorted = true;
1241 ALGODEBUG fprintf(stderr, "#BATordered: fixed revsorted for %s#" BUNFMT "\n", BATgetId(b), BATcount(b));
1242 }
1243 }
1244 doreturn:
1245 MT_lock_unset(&b->batIdxLock);
1246 return b->tsorted;
1247}
1248
1249/* Return whether the BAT is reverse ordered or not. If we don't
1250 * know, invest in a scan and record the results in the bat
1251 * descriptor. */
1252bool
1253BATordered_rev(BAT *b)
1254{
1255 lng t0 = 0;
1256
1257 ALGODEBUG t0 = GDKusec();
1258
1259 if (b == NULL)
1260 return false;
1261 if (BATcount(b) <= 1)
1262 return true;
1263 if (b->ttype == TYPE_void)
1264 return is_oid_nil(b->tseqbase);
1265 if (BATtdense(b))
1266 return false;
1267 MT_lock_set(&b->batIdxLock);
1268 if (!b->trevsorted && b->tnorevsorted == 0) {
1269 BATiter bi = bat_iterator(b);
1270 int (*cmpf)(const void *, const void *) = ATOMcompare(b->ttype);
1271 BUN p, q;
1272 b->batDirtydesc = true;
1273 for (q = BUNlast(b), p = 1; p < q; p++) {
1274 if (cmpf(BUNtail(bi, p - 1), BUNtail(bi, p)) < 0) {
1275 b->tnorevsorted = p;
1276 ALGODEBUG fprintf(stderr, "#BATordered_rev: fixed norevsorted(" BUNFMT ") for %s#" BUNFMT " (" LLFMT " usec)\n", p, BATgetId(b), BATcount(b), GDKusec() - t0);
1277 goto doreturn;
1278 }
1279 }
1280 b->trevsorted = true;
1281 ALGODEBUG fprintf(stderr, "#BATordered_rev: fixed revsorted for %s#" BUNFMT " (" LLFMT " usec)\n", BATgetId(b), BATcount(b), GDKusec() - t0);
1282 }
1283 doreturn:
1284 MT_lock_unset(&b->batIdxLock);
1285 return b->trevsorted;
1286}
1287
1288/* figure out which sort function is to be called
1289 * stable sort can produce an error (not enough memory available),
1290 * "quick" sort does not produce errors */
1291static gdk_return
1292do_sort(void *restrict h, void *restrict t, const void *restrict base,
1293 size_t n, int hs, int ts, int tpe, bool reverse, bool nilslast,
1294 bool stable)
1295{
1296 if (n <= 1) /* trivially sorted */
1297 return GDK_SUCCEED;
1298 if (stable) {
1299 if (reverse)
1300 return GDKssort_rev(h, t, base, n, hs, ts, tpe);
1301 else
1302 return GDKssort(h, t, base, n, hs, ts, tpe);
1303 } else {
1304 GDKqsort(h, t, base, n, hs, ts, tpe, reverse, nilslast);
1305 }
1306 return GDK_SUCCEED;
1307}
1308
1309/* Sort the bat b according to both o and g. The stable and reverse
1310 * parameters indicate whether the sort should be stable or descending
1311 * respectively. The parameter b is required, o and g are optional
1312 * (i.e., they may be NULL).
1313 *
1314 * A sorted copy is returned through the sorted parameter, the new
1315 * ordering is returned through the order parameter, group information
1316 * is returned through the groups parameter. All three output
1317 * parameters may be NULL. If they're all NULL, this function does
1318 * nothing.
1319 *
1320 * If o is specified, it is used to first rearrange b according to the
1321 * order specified in o, after which b is sorted taking g into
1322 * account.
1323 *
1324 * If g is specified, it indicates groups which should be individually
1325 * ordered. Each row of consecutive equal values in g indicates a
1326 * group which is sorted according to stable and reverse. g is used
1327 * after the order in b was rearranged according to o.
1328 *
1329 * The outputs order and groups can be used in subsequent calls to
1330 * this function. This can be used if multiple BATs need to be sorted
1331 * together. The BATs should then be sorted in order of significance,
1332 * and each following call should use the original unordered BAT plus
1333 * the order and groups bat from the previous call. In this case, the
1334 * sorted BATs are not of much use, so the sorted output parameter
1335 * does not need to be specified.
1336 * Apart from error checking and maintaining reference counts, sorting
1337 * three columns (col1, col2, col3) could look like this with the
1338 * sorted results in (col1s, col2s, col3s):
1339 * BATsort(&col1s, &ord1, &grp1, col1, NULL, NULL, false, false, false);
1340 * BATsort(&col2s, &ord2, &grp2, col2, ord1, grp1, false, false, false);
1341 * BATsort(&col3s, NULL, NULL, col3, ord2, grp2, false, false, false);
1342 * Note that the "reverse" parameter can be different for each call.
1343 */
1344gdk_return
1345BATsort(BAT **sorted, BAT **order, BAT **groups,
1346 BAT *b, BAT *o, BAT *g, bool reverse, bool nilslast, bool stable)
1347{
1348 BAT *bn = NULL, *on = NULL, *gn = NULL, *pb = NULL;
1349 oid *restrict grps, *restrict ords, prev;
1350 BUN p, q, r;
1351 lng t0 = 0;
1352 bool mkorderidx, orderidxlock = false;
1353
1354 ALGODEBUG t0 = GDKusec();
1355
1356 /* we haven't implemented NILs as largest value for stable
1357 * sort, so NILs come first for ascending and last for
1358 * descending */
1359 assert(!stable || reverse == nilslast);
1360
1361 if (b == NULL) {
1362 GDKerror("BATsort: b must exist\n");
1363 return GDK_FAIL;
1364 }
1365 if (stable && reverse != nilslast) {
1366 GDKerror("BATsort: stable sort cannot have "
1367 "reverse != nilslast\n");
1368 return GDK_FAIL;
1369 }
1370 if (!ATOMlinear(b->ttype)) {
1371 GDKerror("BATsort: type %s cannot be sorted\n",
1372 ATOMname(b->ttype));
1373 return GDK_FAIL;
1374 }
1375 if (b->ttype == TYPE_void) {
1376 if (!b->tsorted) {
1377 b->tsorted = true;
1378 b->batDirtydesc = true;
1379 }
1380 if (b->trevsorted != is_oid_nil(b->tseqbase) || b->batCount <= 1) {
1381 b->trevsorted = !b->trevsorted;
1382 b->batDirtydesc = true;
1383 }
1384 if (b->tkey != !is_oid_nil(b->tseqbase)) {
1385 b->tkey = !b->tkey;
1386 b->batDirtydesc = true;
1387 }
1388 } else if (b->batCount <= 1) {
1389 if (!b->tsorted || !b->trevsorted) {
1390 b->tsorted = b->trevsorted = true;
1391 b->batDirtydesc = true;
1392 }
1393 }
1394 if (o != NULL &&
1395 (ATOMtype(o->ttype) != TYPE_oid || /* oid tail */
1396 BATcount(o) != BATcount(b) || /* same size as b */
1397 (o->ttype == TYPE_void && /* no nil tail */
1398 BATcount(o) != 0 &&
1399 is_oid_nil(o->tseqbase)))) {
1400 GDKerror("BATsort: o must have type oid and same size as b\n");
1401 return GDK_FAIL;
1402 }
1403 if (g != NULL &&
1404 (ATOMtype(g->ttype) != TYPE_oid || /* oid tail */
1405 !g->tsorted || /* sorted */
1406 BATcount(o) != BATcount(b) || /* same size as b */
1407 (g->ttype == TYPE_void && /* no nil tail */
1408 BATcount(g) != 0 &&
1409 is_oid_nil(g->tseqbase)))) {
1410 GDKerror("BATsort: g must have type oid, sorted on the tail, "
1411 "and same size as b\n");
1412 return GDK_FAIL;
1413 }
1414 if (sorted == NULL && order == NULL) {
1415 /* no place to put result, so we're done quickly */
1416 GDKerror("BATsort: no place to put the result.\n");
1417 return GDK_FAIL;
1418 }
1419 if (g == NULL && !stable) {
1420 /* pre-ordering doesn't make sense if we're not
1421 * subsorting and the sort is not stable */
1422 o = NULL;
1423 }
1424 if (b->tnonil) {
1425 /* if there are no nils, placement of nils doesn't
1426 * matter, so set nilslast such that ordered bits can
1427 * be used */
1428 nilslast = reverse;
1429 }
1430 if (BATcount(b) <= 1 ||
1431 (reverse == nilslast &&
1432 (reverse ? BATtrevordered(b) : BATtordered(b)) &&
1433 o == NULL && g == NULL &&
1434 (groups == NULL || BATtkey(b) ||
1435 (reverse ? BATtordered(b) : BATtrevordered(b))))) {
1436 /* trivially (sub)sorted, and either we don't need to
1437 * return group information, or we can trivially
1438 * deduce the groups */
1439 if (sorted) {
1440 bn = COLcopy(b, b->ttype, false, TRANSIENT);
1441 if (bn == NULL)
1442 goto error;
1443 *sorted = bn;
1444 }
1445 if (order) {
1446 on = BATdense(b->hseqbase, b->hseqbase, BATcount(b));
1447 if (on == NULL)
1448 goto error;
1449 *order = on;
1450 }
1451 if (groups) {
1452 if (BATtkey(b)) {
1453 /* singleton groups */
1454 gn = BATdense(0, 0, BATcount(b));
1455 if (gn == NULL)
1456 goto error;
1457 } else {
1458 /* single group */
1459 const oid *o = 0;
1460 assert(BATcount(b) == 1 ||
1461 (BATtordered(b) && BATtrevordered(b)));
1462 gn = BATconstant(0, TYPE_oid, &o, BATcount(b), TRANSIENT);
1463 if (gn == NULL)
1464 goto error;
1465 }
1466 *groups = gn;
1467 }
1468 ALGODEBUG fprintf(stderr, "#BATsort(b=" ALGOBATFMT ",o="
1469 ALGOOPTBATFMT ",g=" ALGOOPTBATFMT
1470 ",reverse=%d,nilslast=%d,stable=%d) = ("
1471 ALGOOPTBATFMT "," ALGOOPTBATFMT ","
1472 ALGOOPTBATFMT ") -- trivial (" LLFMT
1473 " usec)\n",
1474 ALGOBATPAR(b), ALGOOPTBATPAR(o),
1475 ALGOOPTBATPAR(g), reverse, nilslast, stable,
1476 ALGOOPTBATPAR(bn), ALGOOPTBATPAR(gn),
1477 ALGOOPTBATPAR(on), GDKusec() - t0);
1478 return GDK_SUCCEED;
1479 }
1480 if (VIEWtparent(b)) {
1481 pb = BBPdescriptor(VIEWtparent(b));
1482 if (b->theap.base != pb->theap.base ||
1483 BATcount(b) != BATcount(pb) ||
1484 b->hseqbase != pb->hseqbase ||
1485 BATatoms[b->ttype].atomCmp != BATatoms[pb->ttype].atomCmp)
1486 pb = NULL;
1487 } else {
1488 pb = b;
1489 }
1490 /* when we will create an order index if it doesn't already exist */
1491 mkorderidx = (g == NULL && !reverse && !nilslast && pb != NULL && (order || !pb->batTransient));
1492 if (g == NULL && !reverse && !nilslast &&
1493 pb != NULL && !BATcheckorderidx(pb)) {
1494 MT_lock_set(&pb->batIdxLock);
1495 if (pb->torderidx == NULL) {
1496 /* no index created while waiting for lock */
1497 if (mkorderidx) /* keep lock when going to create */
1498 orderidxlock = true;
1499 } else {
1500 /* no need to create an index: it already exists */
1501 mkorderidx = false;
1502 }
1503 if (!orderidxlock)
1504 MT_lock_unset(&pb->batIdxLock);
1505 } else {
1506 mkorderidx = false;
1507 }
1508 if (g == NULL && o == NULL && !reverse && !nilslast &&
1509 pb != NULL && pb->torderidx != NULL &&
1510 /* if we want a stable sort, the order index must be
1511 * stable, if we don't want stable, we don't care */
1512 (!stable || ((oid *) pb->torderidx->base)[2])) {
1513 /* there is an order index that we can use */
1514 on = COLnew(pb->hseqbase, TYPE_oid, BATcount(pb), TRANSIENT);
1515 if (on == NULL)
1516 goto error;
1517 memcpy(Tloc(on, 0), (oid *) pb->torderidx->base + ORDERIDXOFF, BATcount(pb) * sizeof(oid));
1518 BATsetcount(on, BATcount(b));
1519 on->tkey = true;
1520 on->tnil = false;
1521 on->tnonil = true;
1522 on->tsorted = on->trevsorted = false;
1523 on->tseqbase = oid_nil;
1524 if (sorted || groups) {
1525 bn = BATproject(on, b);
1526 if (bn == NULL)
1527 goto error;
1528 bn->tsorted = true;
1529 if (groups) {
1530 if (BATgroup_internal(groups, NULL, NULL, bn, NULL, g, NULL, NULL, true) != GDK_SUCCEED)
1531 goto error;
1532 if (sorted &&
1533 (*groups)->tkey &&
1534 g == NULL) {
1535 /* if new groups bat is key
1536 * and since there is no input
1537 * groups bat, we know the
1538 * result bat is key */
1539 bn->tkey = true;
1540 }
1541 }
1542 if (sorted)
1543 *sorted = bn;
1544 else {
1545 BBPunfix(bn->batCacheid);
1546 bn = NULL;
1547 }
1548 }
1549 if (order)
1550 *order = on;
1551 else {
1552 BBPunfix(on->batCacheid);
1553 on = NULL;
1554 }
1555 ALGODEBUG fprintf(stderr, "#BATsort(b=" ALGOBATFMT ",o="
1556 ALGOOPTBATFMT ",g=" ALGOOPTBATFMT
1557 ",reverse=%d,nilslast=%d,stable=%d) = ("
1558 ALGOOPTBATFMT "," ALGOOPTBATFMT ","
1559 ALGOOPTBATFMT ") -- orderidx (" LLFMT
1560 " usec)\n",
1561 ALGOBATPAR(b), ALGOOPTBATPAR(o),
1562 ALGOOPTBATPAR(g), reverse, nilslast, stable,
1563 ALGOOPTBATPAR(bn), ALGOOPTBATPAR(gn),
1564 ALGOOPTBATPAR(on), GDKusec() - t0);
1565 return GDK_SUCCEED;
1566 }
1567 if (o) {
1568 bn = BATproject(o, b);
1569 if (bn == NULL)
1570 goto error;
1571 if (bn->ttype == TYPE_void || isVIEW(bn)) {
1572 BAT *b2 = COLcopy(bn, ATOMtype(bn->ttype), true, TRANSIENT);
1573 BBPunfix(bn->batCacheid);
1574 bn = b2;
1575 }
1576 pb = NULL;
1577 } else {
1578 bn = COLcopy(b, b->ttype, true, TRANSIENT);
1579 }
1580 if (bn == NULL)
1581 goto error;
1582 if (order) {
1583 /* prepare order bat */
1584 if (o) {
1585 /* make copy of input so that we can refine it;
1586 * copy can be read-only if we take the shortcut
1587 * below in the case g is "key" */
1588 on = COLcopy(o, TYPE_oid,
1589 g == NULL ||
1590 !(g->tkey || g->ttype == TYPE_void),
1591 TRANSIENT);
1592 if (on == NULL)
1593 goto error;
1594 BAThseqbase(on, b->hseqbase);
1595 } else {
1596 /* create new order */
1597 on = COLnew(b->hseqbase, TYPE_oid, BATcount(bn), TRANSIENT);
1598 if (on == NULL)
1599 goto error;
1600 ords = (oid *) Tloc(on, 0);
1601 for (p = 0, q = BATcount(bn); p < q; p++)
1602 ords[p] = p + b->hseqbase;
1603 BATsetcount(on, BATcount(bn));
1604 on->tkey = true;
1605 on->tnil = false;
1606 on->tnonil = true;
1607 }
1608 /* COLcopy above can create TYPE_void */
1609 if (on->ttype != TYPE_void) {
1610 on->tsorted = on->trevsorted = false; /* it won't be sorted */
1611 on->tseqbase = oid_nil; /* and hence not dense */
1612 on->tnosorted = on->tnorevsorted = 0;
1613 }
1614 *order = on;
1615 ords = (oid *) Tloc(on, 0);
1616 } else {
1617 ords = NULL;
1618 }
1619 if (g) {
1620 if (g->tkey || g->ttype == TYPE_void) {
1621 /* if g is "key", all groups are size 1, so no
1622 * subsorting needed */
1623 if (sorted) {
1624 *sorted = bn;
1625 } else {
1626 BBPunfix(bn->batCacheid);
1627 bn = NULL;
1628 }
1629 if (order) {
1630 *order = on;
1631 if (o) {
1632 /* we can inherit sortedness
1633 * after all */
1634 on->tsorted = o->tsorted;
1635 on->trevsorted = o->trevsorted;
1636 if (o->tnosorted)
1637 on->tnosorted = o->tnosorted;
1638 if (o->tnorevsorted)
1639 on->tnorevsorted = o->tnorevsorted;
1640 } else {
1641 /* we didn't rearrange, so
1642 * still sorted */
1643 on->tsorted = true;
1644 on->trevsorted = false;
1645 }
1646 if (BATcount(on) <= 1) {
1647 on->tsorted = true;
1648 on->trevsorted = true;
1649 }
1650 }
1651 if (groups) {
1652 gn = COLcopy(g, g->ttype, false, TRANSIENT);
1653 if (gn == NULL)
1654 goto error;
1655 *groups = gn;
1656 }
1657 ALGODEBUG fprintf(stderr, "#BATsort(b=" ALGOBATFMT
1658 ",o=" ALGOOPTBATFMT ",g=" ALGOBATFMT
1659 ",reverse=%d,nilslast=%d,stable=%d"
1660 ") = (" ALGOOPTBATFMT ","
1661 ALGOOPTBATFMT "," ALGOOPTBATFMT
1662 ") -- key group (" LLFMT " usec)\n",
1663 ALGOBATPAR(b), ALGOOPTBATPAR(o),
1664 ALGOBATPAR(g), reverse, nilslast,
1665 stable, ALGOOPTBATPAR(bn),
1666 ALGOOPTBATPAR(gn), ALGOOPTBATPAR(on),
1667 GDKusec() - t0);
1668 return GDK_SUCCEED;
1669 }
1670 assert(g->ttype == TYPE_oid);
1671 grps = (oid *) Tloc(g, 0);
1672 prev = grps[0];
1673 if (BATmaterialize(bn) != GDK_SUCCEED)
1674 goto error;
1675 for (r = 0, p = 1, q = BATcount(g); p < q; p++) {
1676 if (grps[p] != prev) {
1677 /* sub sort [r,p) */
1678 if (do_sort(Tloc(bn, r),
1679 ords ? ords + r : NULL,
1680 bn->tvheap ? bn->tvheap->base : NULL,
1681 p - r, Tsize(bn), ords ? sizeof(oid) : 0,
1682 bn->ttype, reverse, nilslast, stable) != GDK_SUCCEED)
1683 goto error;
1684 r = p;
1685 prev = grps[p];
1686 }
1687 }
1688 /* sub sort [r,q) */
1689 if (do_sort(Tloc(bn, r),
1690 ords ? ords + r : NULL,
1691 bn->tvheap ? bn->tvheap->base : NULL,
1692 p - r, Tsize(bn), ords ? sizeof(oid) : 0,
1693 bn->ttype, reverse, nilslast, stable) != GDK_SUCCEED)
1694 goto error;
1695 /* if single group (r==0) the result is (rev)sorted,
1696 * otherwise (maybe) not */
1697 bn->tsorted = r == 0 && !reverse && !nilslast;
1698 bn->trevsorted = r == 0 && reverse && nilslast;
1699 } else {
1700 Heap *m = NULL;
1701 /* only invest in creating an order index if the BAT
1702 * is persistent */
1703 if (mkorderidx) {
1704 assert(orderidxlock);
1705 if ((m = createOIDXheap(pb, stable)) != NULL &&
1706 ords == NULL) {
1707 ords = (oid *) m->base + ORDERIDXOFF;
1708 if (o && o->ttype != TYPE_void)
1709 memcpy(ords, Tloc(o, 0), BATcount(o) * sizeof(oid));
1710 else if (o)
1711 for (p = 0, q = BATcount(o); p < q; p++)
1712 ords[p] = p + o->tseqbase;
1713 else
1714 for (p = 0, q = BATcount(b); p < q; p++)
1715 ords[p] = p + b->hseqbase;
1716 }
1717 }
1718 if ((reverse != nilslast ||
1719 (reverse ? !bn->trevsorted : !bn->tsorted)) &&
1720 (BATmaterialize(bn) != GDK_SUCCEED ||
1721 do_sort(Tloc(bn, 0),
1722 ords,
1723 bn->tvheap ? bn->tvheap->base : NULL,
1724 BATcount(bn), Tsize(bn), ords ? sizeof(oid) : 0,
1725 bn->ttype, reverse, nilslast, stable) != GDK_SUCCEED)) {
1726 if (m != NULL) {
1727 HEAPfree(m, true);
1728 GDKfree(m);
1729 }
1730 if (orderidxlock)
1731 MT_lock_unset(&pb->batIdxLock);
1732 goto error;
1733 }
1734 bn->tsorted = !reverse && !nilslast;
1735 bn->trevsorted = reverse && nilslast;
1736 if (m != NULL) {
1737 assert(orderidxlock);
1738 if (pb->torderidx == NULL) {
1739 pb->batDirtydesc = true;
1740 if (ords != (oid *) m->base + ORDERIDXOFF) {
1741 memcpy((oid *) m->base + ORDERIDXOFF,
1742 ords,
1743 BATcount(pb) * sizeof(oid));
1744 }
1745 pb->torderidx = m;
1746 persistOIDX(pb);
1747 } else {
1748 HEAPfree(m, true);
1749 GDKfree(m);
1750 }
1751 }
1752 }
1753 if (orderidxlock)
1754 MT_lock_unset(&pb->batIdxLock);
1755 bn->theap.dirty = true;
1756 bn->tnosorted = 0;
1757 bn->tnorevsorted = 0;
1758 bn->tnokey[0] = bn->tnokey[1] = 0;
1759 if (groups) {
1760 if (BATgroup_internal(groups, NULL, NULL, bn, NULL, g, NULL, NULL, true) != GDK_SUCCEED)
1761 goto error;
1762 if ((*groups)->tkey &&
1763 (g == NULL || (g->tsorted && g->trevsorted))) {
1764 /* if new groups bat is key and the input
1765 * group bat has a single value (both sorted
1766 * and revsorted), we know the result bat is
1767 * key */
1768 bn->tkey = true;
1769 }
1770 }
1771
1772 if (sorted)
1773 *sorted = bn;
1774 else {
1775 BBPunfix(bn->batCacheid);
1776 bn = NULL;
1777 }
1778
1779 ALGODEBUG fprintf(stderr, "#BATsort(b=" ALGOBATFMT ",o=" ALGOOPTBATFMT
1780 ",g=" ALGOOPTBATFMT ",reverse=%d,nilslast=%d,"
1781 "stable=%d) = (" ALGOOPTBATFMT "," ALGOOPTBATFMT ","
1782 ALGOOPTBATFMT ") -- %ssort (" LLFMT " usec)\n",
1783 ALGOBATPAR(b), ALGOOPTBATPAR(o), ALGOOPTBATPAR(g),
1784 reverse, nilslast, stable, ALGOOPTBATPAR(bn),
1785 ALGOOPTBATPAR(gn), ALGOOPTBATPAR(on),
1786 g ? "grouped " : "", GDKusec() - t0);
1787 return GDK_SUCCEED;
1788
1789 error:
1790 if (bn)
1791 BBPunfix(bn->batCacheid);
1792 BBPreclaim(on);
1793 if (sorted)
1794 *sorted = NULL;
1795 if (order)
1796 *order = NULL;
1797 if (groups)
1798 *groups = NULL;
1799 return GDK_FAIL;
1800}
1801
1802/* return a new BAT of length n with seqbase hseq, and the constant v
1803 * in the tail */
1804BAT *
1805BATconstant(oid hseq, int tailtype, const void *v, BUN n, role_t role)
1806{
1807 BAT *bn;
1808 void *restrict p;
1809 BUN i;
1810 lng t0 = 0;
1811
1812 ALGODEBUG t0 = GDKusec();
1813
1814 if (v == NULL)
1815 return NULL;
1816 bn = COLnew(hseq, tailtype, n, role);
1817 if (bn != NULL && n > 0) {
1818 p = Tloc(bn, 0);
1819 switch (ATOMstorage(tailtype)) {
1820 case TYPE_void:
1821 v = &oid_nil;
1822 BATtseqbase(bn, oid_nil);
1823 break;
1824 case TYPE_bte:
1825 for (i = 0; i < n; i++)
1826 ((bte *) p)[i] = *(bte *) v;
1827 break;
1828 case TYPE_sht:
1829 for (i = 0; i < n; i++)
1830 ((sht *) p)[i] = *(sht *) v;
1831 break;
1832 case TYPE_int:
1833 case TYPE_flt:
1834 assert(sizeof(int) == sizeof(flt));
1835 for (i = 0; i < n; i++)
1836 ((int *) p)[i] = *(int *) v;
1837 break;
1838 case TYPE_lng:
1839 case TYPE_dbl:
1840 assert(sizeof(lng) == sizeof(dbl));
1841 for (i = 0; i < n; i++)
1842 ((lng *) p)[i] = *(lng *) v;
1843 break;
1844#ifdef HAVE_HGE
1845 case TYPE_hge:
1846 for (i = 0; i < n; i++)
1847 ((hge *) p)[i] = *(hge *) v;
1848 break;
1849#endif
1850 default:
1851 for (i = 0, n += i; i < n; i++)
1852 tfastins_nocheck(bn, i, v, Tsize(bn));
1853 break;
1854 }
1855 bn->theap.dirty = true;
1856 bn->tnil = n >= 1 && (*ATOMcompare(tailtype))(v, ATOMnilptr(tailtype)) == 0;
1857 BATsetcount(bn, n);
1858 bn->tsorted = true;
1859 bn->trevsorted = true;
1860 bn->tnonil = !bn->tnil;
1861 bn->tkey = BATcount(bn) <= 1;
1862 }
1863 ALGODEBUG fprintf(stderr, "#%s: %s()=" ALGOOPTBATFMT
1864 " (" LLFMT "usec)\n",
1865 MT_thread_getname(), __func__,
1866 ALGOOPTBATPAR(bn), GDKusec() - t0);
1867 return bn;
1868
1869 bunins_failed:
1870 BBPreclaim(bn);
1871 return NULL;
1872}
1873
1874/*
1875 * BAT Aggregates
1876 *
1877 * We retain the size() and card() aggregate results in the column
1878 * descriptor. We would like to have such functionality in an
1879 * extensible way for many aggregates, for DD (1) we do not want to
1880 * change the binary BAT format on disk and (2) aggr and size are the
1881 * most relevant aggregates.
1882 *
1883 * It is all hacked into the aggr[3] records; three adjacent integers
1884 * that were left over in the column record. We refer to these as if
1885 * it where an int aggr[3] array. The below routines set and retrieve
1886 * the aggregate values from the tail of the BAT, as many
1887 * aggregate-manipulating BAT functions work on tail.
1888 *
1889 * The rules are as follows: aggr[0] contains the alignment ID of the
1890 * column (if set i.e. nonzero). Hence, if this value is nonzero and
1891 * equal to b->talign, the precomputed aggregate values in
1892 * aggr[GDK_AGGR_SIZE] and aggr[GDK_AGGR_CARD] hold. However, only one
1893 * of them may be set at the time. This is encoded by the value
1894 * int_nil, which cannot occur in these two aggregates.
1895 *
1896 * This was now extended to record the property whether we know there
1897 * is a nil value present by mis-using the highest bits of both
1898 * GDK_AGGR_SIZE and GDK_AGGR_CARD.
1899 */
1900
1901void
1902PROPdestroy(BAT *b)
1903{
1904 PROPrec *p = b->tprops;
1905 PROPrec *n;
1906
1907 b->tprops = NULL;
1908 while (p) {
1909 n = p->next;
1910 VALclear(&p->v);
1911 GDKfree(p);
1912 p = n;
1913 }
1914}
1915
1916PROPrec *
1917BATgetprop_nolock(BAT *b, enum prop_t idx)
1918{
1919 PROPrec *p;
1920
1921 p = b->tprops;
1922 while (p && p->id != idx)
1923 p = p->next;
1924 return p;
1925}
1926
1927static void
1928BATrmprop_nolock(BAT *b, enum prop_t idx)
1929{
1930 PROPrec *prop = b->tprops, *prev = NULL;
1931
1932 while (prop) {
1933 if (prop->id == idx) {
1934 if (prev)
1935 prev->next = prop->next;
1936 else
1937 b->tprops = prop->next;
1938 VALclear(&prop->v);
1939 GDKfree(prop);
1940 return;
1941 }
1942 prev = prop;
1943 prop = prop->next;
1944 }
1945}
1946
1947void
1948BATsetprop_nolock(BAT *b, enum prop_t idx, int type, const void *v)
1949{
1950 PROPrec *p;
1951
1952 p = b->tprops;
1953 while (p && p->id != idx)
1954 p = p->next;
1955 if (p == NULL) {
1956 if ((p = GDKmalloc(sizeof(PROPrec))) == NULL) {
1957 /* properties are hints, so if we can't create
1958 * one we ignore the error */
1959 GDKclrerr();
1960 return;
1961 }
1962 p->id = idx;
1963 p->next = b->tprops;
1964 p->v.vtype = 0;
1965 b->tprops = p;
1966 } else {
1967 VALclear(&p->v);
1968 }
1969 if (VALinit(&p->v, type, v) == NULL) {
1970 /* failed to initialize, so remove property */
1971 BATrmprop_nolock(b, idx);
1972 GDKclrerr();
1973 }
1974 b->batDirtydesc = true;
1975}
1976
1977PROPrec *
1978BATgetprop(BAT *b, enum prop_t idx)
1979{
1980 PROPrec *p;
1981
1982 MT_lock_set(&b->batIdxLock);
1983 p = BATgetprop_nolock(b, idx);
1984 MT_lock_unset(&b->batIdxLock);
1985 return p;
1986}
1987
1988void
1989BATsetprop(BAT *b, enum prop_t idx, int type, const void *v)
1990{
1991 MT_lock_set(&b->batIdxLock);
1992 BATsetprop_nolock(b, idx, type, v);
1993 MT_lock_unset(&b->batIdxLock);
1994}
1995
1996void
1997BATrmprop(BAT *b, enum prop_t idx)
1998{
1999 MT_lock_set(&b->batIdxLock);
2000 BATrmprop_nolock(b, idx);
2001 MT_lock_unset(&b->batIdxLock);
2002}
2003
2004
2005/*
2006 * The BATcount_no_nil function counts all BUN in a BAT that have a
2007 * non-nil tail value.
2008 */
2009BUN
2010BATcount_no_nil(BAT *b)
2011{
2012 BUN cnt = 0;
2013 BUN i, n;
2014 const void *restrict p, *restrict nil;
2015 const char *restrict base;
2016 int t;
2017 int (*cmp)(const void *, const void *);
2018
2019 BATcheck(b, "BATcnt", 0);
2020 n = BATcount(b);
2021 if (b->tnonil)
2022 return n;
2023 p = Tloc(b, 0);
2024 t = ATOMbasetype(b->ttype);
2025 switch (t) {
2026 case TYPE_void:
2027 cnt = n * BATtdense(b);
2028 break;
2029 case TYPE_bte:
2030 for (i = 0; i < n; i++)
2031 cnt += !is_bte_nil(((const bte *) p)[i]);
2032 break;
2033 case TYPE_sht:
2034 for (i = 0; i < n; i++)
2035 cnt += !is_sht_nil(((const sht *) p)[i]);
2036 break;
2037 case TYPE_int:
2038 for (i = 0; i < n; i++)
2039 cnt += !is_int_nil(((const int *) p)[i]);
2040 break;
2041 case TYPE_lng:
2042 for (i = 0; i < n; i++)
2043 cnt += !is_lng_nil(((const lng *) p)[i]);
2044 break;
2045#ifdef HAVE_HGE
2046 case TYPE_hge:
2047 for (i = 0; i < n; i++)
2048 cnt += !is_hge_nil(((const hge *) p)[i]);
2049 break;
2050#endif
2051 case TYPE_flt:
2052 for (i = 0; i < n; i++)
2053 cnt += !is_flt_nil(((const flt *) p)[i]);
2054 break;
2055 case TYPE_dbl:
2056 for (i = 0; i < n; i++)
2057 cnt += !is_dbl_nil(((const dbl *) p)[i]);
2058 break;
2059 case TYPE_str:
2060 base = b->tvheap->base;
2061 switch (b->twidth) {
2062 case 1:
2063 for (i = 0; i < n; i++)
2064 cnt += base[(var_t) ((const unsigned char *) p)[i] + GDK_VAROFFSET] != '\200';
2065 break;
2066 case 2:
2067 for (i = 0; i < n; i++)
2068 cnt += base[(var_t) ((const unsigned short *) p)[i] + GDK_VAROFFSET] != '\200';
2069 break;
2070#if SIZEOF_VAR_T != SIZEOF_INT
2071 case 4:
2072 for (i = 0; i < n; i++)
2073 cnt += base[(var_t) ((const unsigned int *) p)[i]] != '\200';
2074 break;
2075#endif
2076 default:
2077 for (i = 0; i < n; i++)
2078 cnt += base[((const var_t *) p)[i]] != '\200';
2079 break;
2080 }
2081 break;
2082 default:
2083 nil = ATOMnilptr(t);
2084 cmp = ATOMcompare(t);
2085 if (b->tvarsized) {
2086 base = b->tvheap->base;
2087 for (i = 0; i < n; i++)
2088 cnt += (*cmp)(nil, base + ((const var_t *) p)[i]) != 0;
2089 } else {
2090 for (i = 0, n += i; i < n; i++)
2091 cnt += (*cmp)(Tloc(b, i), nil) != 0;
2092 }
2093 break;
2094 }
2095 if (cnt == BATcount(b)) {
2096 /* we learned something */
2097 b->tnonil = true;
2098 assert(!b->tnil);
2099 b->tnil = false;
2100 }
2101 return cnt;
2102}
2103