1/*
2 * This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
5 *
6 * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V.
7 */
8
9#include "monetdb_config.h"
10#include "gdk.h"
11#include "gdk_private.h"
12#include "gdk_calc_private.h"
13
14#define VALUE(x) (vars ? vars + VarHeapVal(vals, (x), width) : vals + (x) * width)
15/* BATunique returns a bat that indicates the unique tail values of
16 * the input bat. This is essentially the same output as the
17 * "extents" output of BATgroup. The difference is that BATunique
18 * does not return the grouping bat.
19 *
20 * The inputs must be dense-headed, the first input is the bat from
21 * which unique rows are selected, the second input is a list of
22 * candidates.
23 */
24BAT *
25BATunique(BAT *b, BAT *s)
26{
27 BAT *bn;
28 BUN cnt;
29 const void *v;
30 const char *vals;
31 const char *vars;
32 int width;
33 oid i, o;
34 uint16_t *seen = NULL;
35 const char *nme;
36 Hash *hs = NULL;
37 BUN hb;
38 BATiter bi;
39 int (*cmp)(const void *, const void *);
40 bat parent;
41 struct canditer ci;
42
43 BATcheck(b, "BATunique", NULL);
44 cnt = canditer_init(&ci, b, s);
45
46 if (b->tkey || cnt <= 1 || BATtdense(b)) {
47 /* trivial: already unique */
48 bn = canditer_slice(&ci, 0, ci.ncand);
49 ALGODEBUG fprintf(stderr, "#BATunique(b=" ALGOBATFMT
50 ",s=" ALGOOPTBATFMT ")=" ALGOOPTBATFMT
51 ": trivial case: "
52 "already unique, slice candidates\n",
53 ALGOBATPAR(b), ALGOOPTBATPAR(s),
54 ALGOOPTBATPAR(bn));
55 return bn;
56 }
57
58 if ((BATordered(b) && BATordered_rev(b)) ||
59 (b->ttype == TYPE_void && is_oid_nil(b->tseqbase))) {
60 /* trivial: all values are the same */
61 bn = BATdense(0, ci.seq, 1);
62 ALGODEBUG fprintf(stderr, "#BATunique(b=" ALGOBATFMT ",s="
63 ALGOOPTBATFMT ")=" ALGOOPTBATFMT
64 ": trivial case: all equal\n",
65 ALGOBATPAR(b), ALGOOPTBATPAR(s),
66 ALGOOPTBATPAR(bn));
67 return bn;
68 }
69
70 assert(b->ttype != TYPE_void);
71
72 bn = COLnew(0, TYPE_oid, 1024, TRANSIENT);
73 if (bn == NULL)
74 return NULL;
75 vals = Tloc(b, 0);
76 if (b->tvarsized && b->ttype)
77 vars = b->tvheap->base;
78 else
79 vars = NULL;
80 width = Tsize(b);
81 cmp = ATOMcompare(b->ttype);
82 bi = bat_iterator(b);
83
84 if (BATordered(b) || BATordered_rev(b)) {
85 const void *prev = NULL;
86
87 ALGODEBUG fprintf(stderr, "#BATunique(b=" ALGOBATFMT ",s="
88 ALGOOPTBATFMT "): (reverse) sorted\n",
89 ALGOBATPAR(b), ALGOOPTBATPAR(s));
90 for (i = 0; i < ci.ncand; i++) {
91 o = canditer_next(&ci);
92 v = VALUE(o - b->hseqbase);
93 if (prev == NULL || (*cmp)(v, prev) != 0) {
94 bunfastappTYPE(oid, bn, &o);
95 }
96 prev = v;
97 }
98 } else if (ATOMbasetype(b->ttype) == TYPE_bte) {
99 unsigned char val;
100
101 ALGODEBUG fprintf(stderr, "#BATunique(b=" ALGOBATFMT ",s="
102 ALGOOPTBATFMT "): byte sized atoms\n",
103 ALGOBATPAR(b), ALGOOPTBATPAR(s));
104 assert(vars == NULL);
105 seen = GDKzalloc((256 / 16) * sizeof(seen[0]));
106 if (seen == NULL)
107 goto bunins_failed;
108 for (i = 0; i < ci.ncand; i++) {
109 o = canditer_next(&ci);
110 val = ((const unsigned char *) vals)[o - b->hseqbase];
111 if (!(seen[val >> 4] & (1U << (val & 0xF)))) {
112 seen[val >> 4] |= 1U << (val & 0xF);
113 bunfastappTYPE(oid, bn, &o);
114 if (bn->batCount == 256) {
115 /* there cannot be more than
116 * 256 distinct values */
117 break;
118 }
119 }
120 }
121 GDKfree(seen);
122 seen = NULL;
123 } else if (ATOMbasetype(b->ttype) == TYPE_sht) {
124 unsigned short val;
125
126 ALGODEBUG fprintf(stderr, "#BATunique(b=" ALGOBATFMT ",s="
127 ALGOOPTBATFMT "): short sized atoms\n",
128 ALGOBATPAR(b), ALGOOPTBATPAR(s));
129 assert(vars == NULL);
130 seen = GDKzalloc((65536 / 16) * sizeof(seen[0]));
131 if (seen == NULL)
132 goto bunins_failed;
133 for (i = 0; i < ci.ncand; i++) {
134 o = canditer_next(&ci);
135 val = ((const unsigned short *) vals)[o - b->hseqbase];
136 if (!(seen[val >> 4] & (1U << (val & 0xF)))) {
137 seen[val >> 4] |= 1U << (val & 0xF);
138 bunfastappTYPE(oid, bn, &o);
139 if (bn->batCount == 65536) {
140 /* there cannot be more than
141 * 65536 distinct values */
142 break;
143 }
144 }
145 }
146 GDKfree(seen);
147 seen = NULL;
148 } else if (BATcheckhash(b) ||
149 (!b->batTransient &&
150 BAThash(b) == GDK_SUCCEED) ||
151 ((parent = VIEWtparent(b)) != 0 &&
152 BATcheckhash(BBPdescriptor(parent)))) {
153 BUN lo;
154 oid seq;
155
156 /* we already have a hash table on b, or b is
157 * persistent and we could create a hash table, or b
158 * is a view on a bat that already has a hash table */
159 ALGODEBUG fprintf(stderr, "#BATunique(b=" ALGOBATFMT ",s="
160 ALGOOPTBATFMT "): use existing hash\n",
161 ALGOBATPAR(b), ALGOOPTBATPAR(s));
162 seq = b->hseqbase;
163 if (b->thash == NULL && (parent = VIEWtparent(b)) != 0) {
164 BAT *b2 = BBPdescriptor(parent);
165 lo = (BUN) ((b->theap.base - b2->theap.base) >> b->tshift);
166 b = b2;
167 bi = bat_iterator(b);
168 } else {
169 lo = 0;
170 }
171 hs = b->thash;
172 for (i = 0; i < ci.ncand; i++) {
173 BUN p;
174
175 o = canditer_next(&ci);
176 p = o - seq;
177 v = VALUE(p);
178 for (hb = HASHgetlink(hs, p + lo);
179 hb != HASHnil(hs) && hb >= lo;
180 hb = HASHgetlink(hs, hb)) {
181 assert(hb < p + lo);
182 if (cmp(v, BUNtail(bi, hb)) == 0 &&
183 canditer_search(&ci,
184 hb - lo + seq,
185 false) != BUN_NONE) {
186 /* we've seen this value
187 * before */
188 break;
189 }
190 }
191 if (hb == HASHnil(hs) || hb < lo) {
192 bunfastappTYPE(oid, bn, &o);
193 }
194 }
195 } else {
196 BUN prb;
197 BUN p;
198 BUN mask;
199 int len;
200
201 GDKclrerr(); /* not interested in BAThash errors */
202 ALGODEBUG fprintf(stderr, "#BATunique(b=" ALGOBATFMT ",s="
203 ALGOOPTBATFMT "): create partial hash\n",
204 ALGOBATPAR(b), ALGOOPTBATPAR(s));
205 nme = BBP_physical(b->batCacheid);
206 if (ATOMbasetype(b->ttype) == TYPE_bte) {
207 mask = (BUN) 1 << 8;
208 cmp = NULL; /* no compare needed, "hash" is perfect */
209 } else if (ATOMbasetype(b->ttype) == TYPE_sht) {
210 mask = (BUN) 1 << 16;
211 cmp = NULL; /* no compare needed, "hash" is perfect */
212 } else {
213 if (s)
214 mask = HASHmask(s->batCount);
215 else
216 mask = HASHmask(b->batCount);
217 if (mask < ((BUN) 1 << 16))
218 mask = (BUN) 1 << 16;
219 }
220 if ((hs = GDKzalloc(sizeof(Hash))) == NULL) {
221 GDKerror("BATunique: cannot allocate hash table\n");
222 goto bunins_failed;
223 }
224 len = snprintf(hs->heap.filename, sizeof(hs->heap.filename), "%s.hash%d", nme, THRgettid());
225 if (len == -1 || len >= (int) sizeof(hs->heap.filename) ||
226 HASHnew(hs, b->ttype, BUNlast(b), mask, BUN_NONE) != GDK_SUCCEED) {
227 GDKfree(hs);
228 hs = NULL;
229 GDKerror("BATunique: cannot allocate hash table\n");
230 goto bunins_failed;
231 }
232 for (i = 0; i < ci.ncand; i++) {
233 o = canditer_next(&ci);
234 v = VALUE(o - b->hseqbase);
235 prb = HASHprobe(hs, v);
236 for (hb = HASHget(hs, prb);
237 hb != HASHnil(hs);
238 hb = HASHgetlink(hs, hb)) {
239 if (cmp == NULL || cmp(v, BUNtail(bi, hb)) == 0)
240 break;
241 }
242 if (hb == HASHnil(hs)) {
243 p = o - b->hseqbase;
244 bunfastappTYPE(oid, bn, &o);
245 /* enter into hash table */
246 HASHputlink(hs, p, HASHget(hs, prb));
247 HASHput(hs, prb, p);
248 }
249 }
250 HEAPfree(&hs->heap, true);
251 GDKfree(hs);
252 }
253
254 bn->theap.dirty = true;
255 bn->tsorted = true;
256 bn->trevsorted = BATcount(bn) <= 1;
257 bn->tkey = true;
258 bn->tnil = false;
259 bn->tnonil = true;
260 if (BATcount(bn) == BATcount(b)) {
261 /* it turns out all values are distinct */
262 assert(b->tnokey[0] == 0);
263 assert(b->tnokey[1] == 0);
264 b->tkey = true;
265 b->batDirtydesc = true;
266 }
267 bn = virtualize(bn);
268 ALGODEBUG fprintf(stderr, "#BATunique(b=" ALGOBATFMT ","
269 "s=" ALGOOPTBATFMT ")="
270 ALGOBATFMT "\n",
271 ALGOBATPAR(b), ALGOOPTBATPAR(s),
272 ALGOBATPAR(bn));
273 return bn;
274
275 bunins_failed:
276 if (seen)
277 GDKfree(seen);
278 if (hs != NULL && hs != b->thash) {
279 HEAPfree(&hs->heap, true);
280 GDKfree(hs);
281 }
282 BBPreclaim(bn);
283 return NULL;
284}
285