| 1 | /* |
| 2 | * This Source Code Form is subject to the terms of the Mozilla Public |
| 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
| 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| 5 | * |
| 6 | * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V. |
| 7 | */ |
| 8 | |
| 9 | #include "monetdb_config.h" |
| 10 | #include "gdk.h" |
| 11 | #include "gdk_private.h" |
| 12 | #include "gdk_calc_private.h" |
| 13 | |
| 14 | #define VALUE(x) (vars ? vars + VarHeapVal(vals, (x), width) : vals + (x) * width) |
| 15 | /* BATunique returns a bat that indicates the unique tail values of |
| 16 | * the input bat. This is essentially the same output as the |
| 17 | * "extents" output of BATgroup. The difference is that BATunique |
| 18 | * does not return the grouping bat. |
| 19 | * |
| 20 | * The inputs must be dense-headed, the first input is the bat from |
| 21 | * which unique rows are selected, the second input is a list of |
| 22 | * candidates. |
| 23 | */ |
| 24 | BAT * |
| 25 | BATunique(BAT *b, BAT *s) |
| 26 | { |
| 27 | BAT *bn; |
| 28 | BUN cnt; |
| 29 | const void *v; |
| 30 | const char *vals; |
| 31 | const char *vars; |
| 32 | int width; |
| 33 | oid i, o; |
| 34 | uint16_t *seen = NULL; |
| 35 | const char *nme; |
| 36 | Hash *hs = NULL; |
| 37 | BUN hb; |
| 38 | BATiter bi; |
| 39 | int (*cmp)(const void *, const void *); |
| 40 | bat parent; |
| 41 | struct canditer ci; |
| 42 | |
| 43 | BATcheck(b, "BATunique" , NULL); |
| 44 | cnt = canditer_init(&ci, b, s); |
| 45 | |
| 46 | if (b->tkey || cnt <= 1 || BATtdense(b)) { |
| 47 | /* trivial: already unique */ |
| 48 | bn = canditer_slice(&ci, 0, ci.ncand); |
| 49 | ALGODEBUG fprintf(stderr, "#BATunique(b=" ALGOBATFMT |
| 50 | ",s=" ALGOOPTBATFMT ")=" ALGOOPTBATFMT |
| 51 | ": trivial case: " |
| 52 | "already unique, slice candidates\n" , |
| 53 | ALGOBATPAR(b), ALGOOPTBATPAR(s), |
| 54 | ALGOOPTBATPAR(bn)); |
| 55 | return bn; |
| 56 | } |
| 57 | |
| 58 | if ((BATordered(b) && BATordered_rev(b)) || |
| 59 | (b->ttype == TYPE_void && is_oid_nil(b->tseqbase))) { |
| 60 | /* trivial: all values are the same */ |
| 61 | bn = BATdense(0, ci.seq, 1); |
| 62 | ALGODEBUG fprintf(stderr, "#BATunique(b=" ALGOBATFMT ",s=" |
| 63 | ALGOOPTBATFMT ")=" ALGOOPTBATFMT |
| 64 | ": trivial case: all equal\n" , |
| 65 | ALGOBATPAR(b), ALGOOPTBATPAR(s), |
| 66 | ALGOOPTBATPAR(bn)); |
| 67 | return bn; |
| 68 | } |
| 69 | |
| 70 | assert(b->ttype != TYPE_void); |
| 71 | |
| 72 | bn = COLnew(0, TYPE_oid, 1024, TRANSIENT); |
| 73 | if (bn == NULL) |
| 74 | return NULL; |
| 75 | vals = Tloc(b, 0); |
| 76 | if (b->tvarsized && b->ttype) |
| 77 | vars = b->tvheap->base; |
| 78 | else |
| 79 | vars = NULL; |
| 80 | width = Tsize(b); |
| 81 | cmp = ATOMcompare(b->ttype); |
| 82 | bi = bat_iterator(b); |
| 83 | |
| 84 | if (BATordered(b) || BATordered_rev(b)) { |
| 85 | const void *prev = NULL; |
| 86 | |
| 87 | ALGODEBUG fprintf(stderr, "#BATunique(b=" ALGOBATFMT ",s=" |
| 88 | ALGOOPTBATFMT "): (reverse) sorted\n" , |
| 89 | ALGOBATPAR(b), ALGOOPTBATPAR(s)); |
| 90 | for (i = 0; i < ci.ncand; i++) { |
| 91 | o = canditer_next(&ci); |
| 92 | v = VALUE(o - b->hseqbase); |
| 93 | if (prev == NULL || (*cmp)(v, prev) != 0) { |
| 94 | bunfastappTYPE(oid, bn, &o); |
| 95 | } |
| 96 | prev = v; |
| 97 | } |
| 98 | } else if (ATOMbasetype(b->ttype) == TYPE_bte) { |
| 99 | unsigned char val; |
| 100 | |
| 101 | ALGODEBUG fprintf(stderr, "#BATunique(b=" ALGOBATFMT ",s=" |
| 102 | ALGOOPTBATFMT "): byte sized atoms\n" , |
| 103 | ALGOBATPAR(b), ALGOOPTBATPAR(s)); |
| 104 | assert(vars == NULL); |
| 105 | seen = GDKzalloc((256 / 16) * sizeof(seen[0])); |
| 106 | if (seen == NULL) |
| 107 | goto bunins_failed; |
| 108 | for (i = 0; i < ci.ncand; i++) { |
| 109 | o = canditer_next(&ci); |
| 110 | val = ((const unsigned char *) vals)[o - b->hseqbase]; |
| 111 | if (!(seen[val >> 4] & (1U << (val & 0xF)))) { |
| 112 | seen[val >> 4] |= 1U << (val & 0xF); |
| 113 | bunfastappTYPE(oid, bn, &o); |
| 114 | if (bn->batCount == 256) { |
| 115 | /* there cannot be more than |
| 116 | * 256 distinct values */ |
| 117 | break; |
| 118 | } |
| 119 | } |
| 120 | } |
| 121 | GDKfree(seen); |
| 122 | seen = NULL; |
| 123 | } else if (ATOMbasetype(b->ttype) == TYPE_sht) { |
| 124 | unsigned short val; |
| 125 | |
| 126 | ALGODEBUG fprintf(stderr, "#BATunique(b=" ALGOBATFMT ",s=" |
| 127 | ALGOOPTBATFMT "): short sized atoms\n" , |
| 128 | ALGOBATPAR(b), ALGOOPTBATPAR(s)); |
| 129 | assert(vars == NULL); |
| 130 | seen = GDKzalloc((65536 / 16) * sizeof(seen[0])); |
| 131 | if (seen == NULL) |
| 132 | goto bunins_failed; |
| 133 | for (i = 0; i < ci.ncand; i++) { |
| 134 | o = canditer_next(&ci); |
| 135 | val = ((const unsigned short *) vals)[o - b->hseqbase]; |
| 136 | if (!(seen[val >> 4] & (1U << (val & 0xF)))) { |
| 137 | seen[val >> 4] |= 1U << (val & 0xF); |
| 138 | bunfastappTYPE(oid, bn, &o); |
| 139 | if (bn->batCount == 65536) { |
| 140 | /* there cannot be more than |
| 141 | * 65536 distinct values */ |
| 142 | break; |
| 143 | } |
| 144 | } |
| 145 | } |
| 146 | GDKfree(seen); |
| 147 | seen = NULL; |
| 148 | } else if (BATcheckhash(b) || |
| 149 | (!b->batTransient && |
| 150 | BAThash(b) == GDK_SUCCEED) || |
| 151 | ((parent = VIEWtparent(b)) != 0 && |
| 152 | BATcheckhash(BBPdescriptor(parent)))) { |
| 153 | BUN lo; |
| 154 | oid seq; |
| 155 | |
| 156 | /* we already have a hash table on b, or b is |
| 157 | * persistent and we could create a hash table, or b |
| 158 | * is a view on a bat that already has a hash table */ |
| 159 | ALGODEBUG fprintf(stderr, "#BATunique(b=" ALGOBATFMT ",s=" |
| 160 | ALGOOPTBATFMT "): use existing hash\n" , |
| 161 | ALGOBATPAR(b), ALGOOPTBATPAR(s)); |
| 162 | seq = b->hseqbase; |
| 163 | if (b->thash == NULL && (parent = VIEWtparent(b)) != 0) { |
| 164 | BAT *b2 = BBPdescriptor(parent); |
| 165 | lo = (BUN) ((b->theap.base - b2->theap.base) >> b->tshift); |
| 166 | b = b2; |
| 167 | bi = bat_iterator(b); |
| 168 | } else { |
| 169 | lo = 0; |
| 170 | } |
| 171 | hs = b->thash; |
| 172 | for (i = 0; i < ci.ncand; i++) { |
| 173 | BUN p; |
| 174 | |
| 175 | o = canditer_next(&ci); |
| 176 | p = o - seq; |
| 177 | v = VALUE(p); |
| 178 | for (hb = HASHgetlink(hs, p + lo); |
| 179 | hb != HASHnil(hs) && hb >= lo; |
| 180 | hb = HASHgetlink(hs, hb)) { |
| 181 | assert(hb < p + lo); |
| 182 | if (cmp(v, BUNtail(bi, hb)) == 0 && |
| 183 | canditer_search(&ci, |
| 184 | hb - lo + seq, |
| 185 | false) != BUN_NONE) { |
| 186 | /* we've seen this value |
| 187 | * before */ |
| 188 | break; |
| 189 | } |
| 190 | } |
| 191 | if (hb == HASHnil(hs) || hb < lo) { |
| 192 | bunfastappTYPE(oid, bn, &o); |
| 193 | } |
| 194 | } |
| 195 | } else { |
| 196 | BUN prb; |
| 197 | BUN p; |
| 198 | BUN mask; |
| 199 | int len; |
| 200 | |
| 201 | GDKclrerr(); /* not interested in BAThash errors */ |
| 202 | ALGODEBUG fprintf(stderr, "#BATunique(b=" ALGOBATFMT ",s=" |
| 203 | ALGOOPTBATFMT "): create partial hash\n" , |
| 204 | ALGOBATPAR(b), ALGOOPTBATPAR(s)); |
| 205 | nme = BBP_physical(b->batCacheid); |
| 206 | if (ATOMbasetype(b->ttype) == TYPE_bte) { |
| 207 | mask = (BUN) 1 << 8; |
| 208 | cmp = NULL; /* no compare needed, "hash" is perfect */ |
| 209 | } else if (ATOMbasetype(b->ttype) == TYPE_sht) { |
| 210 | mask = (BUN) 1 << 16; |
| 211 | cmp = NULL; /* no compare needed, "hash" is perfect */ |
| 212 | } else { |
| 213 | if (s) |
| 214 | mask = HASHmask(s->batCount); |
| 215 | else |
| 216 | mask = HASHmask(b->batCount); |
| 217 | if (mask < ((BUN) 1 << 16)) |
| 218 | mask = (BUN) 1 << 16; |
| 219 | } |
| 220 | if ((hs = GDKzalloc(sizeof(Hash))) == NULL) { |
| 221 | GDKerror("BATunique: cannot allocate hash table\n" ); |
| 222 | goto bunins_failed; |
| 223 | } |
| 224 | len = snprintf(hs->heap.filename, sizeof(hs->heap.filename), "%s.hash%d" , nme, THRgettid()); |
| 225 | if (len == -1 || len >= (int) sizeof(hs->heap.filename) || |
| 226 | HASHnew(hs, b->ttype, BUNlast(b), mask, BUN_NONE) != GDK_SUCCEED) { |
| 227 | GDKfree(hs); |
| 228 | hs = NULL; |
| 229 | GDKerror("BATunique: cannot allocate hash table\n" ); |
| 230 | goto bunins_failed; |
| 231 | } |
| 232 | for (i = 0; i < ci.ncand; i++) { |
| 233 | o = canditer_next(&ci); |
| 234 | v = VALUE(o - b->hseqbase); |
| 235 | prb = HASHprobe(hs, v); |
| 236 | for (hb = HASHget(hs, prb); |
| 237 | hb != HASHnil(hs); |
| 238 | hb = HASHgetlink(hs, hb)) { |
| 239 | if (cmp == NULL || cmp(v, BUNtail(bi, hb)) == 0) |
| 240 | break; |
| 241 | } |
| 242 | if (hb == HASHnil(hs)) { |
| 243 | p = o - b->hseqbase; |
| 244 | bunfastappTYPE(oid, bn, &o); |
| 245 | /* enter into hash table */ |
| 246 | HASHputlink(hs, p, HASHget(hs, prb)); |
| 247 | HASHput(hs, prb, p); |
| 248 | } |
| 249 | } |
| 250 | HEAPfree(&hs->heap, true); |
| 251 | GDKfree(hs); |
| 252 | } |
| 253 | |
| 254 | bn->theap.dirty = true; |
| 255 | bn->tsorted = true; |
| 256 | bn->trevsorted = BATcount(bn) <= 1; |
| 257 | bn->tkey = true; |
| 258 | bn->tnil = false; |
| 259 | bn->tnonil = true; |
| 260 | if (BATcount(bn) == BATcount(b)) { |
| 261 | /* it turns out all values are distinct */ |
| 262 | assert(b->tnokey[0] == 0); |
| 263 | assert(b->tnokey[1] == 0); |
| 264 | b->tkey = true; |
| 265 | b->batDirtydesc = true; |
| 266 | } |
| 267 | bn = virtualize(bn); |
| 268 | ALGODEBUG fprintf(stderr, "#BATunique(b=" ALGOBATFMT "," |
| 269 | "s=" ALGOOPTBATFMT ")=" |
| 270 | ALGOBATFMT "\n" , |
| 271 | ALGOBATPAR(b), ALGOOPTBATPAR(s), |
| 272 | ALGOBATPAR(bn)); |
| 273 | return bn; |
| 274 | |
| 275 | bunins_failed: |
| 276 | if (seen) |
| 277 | GDKfree(seen); |
| 278 | if (hs != NULL && hs != b->thash) { |
| 279 | HEAPfree(&hs->heap, true); |
| 280 | GDKfree(hs); |
| 281 | } |
| 282 | BBPreclaim(bn); |
| 283 | return NULL; |
| 284 | } |
| 285 | |