1 | /* |
2 | * This Source Code Form is subject to the terms of the Mozilla Public |
3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
5 | * |
6 | * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V. |
7 | */ |
8 | |
9 | #include "monetdb_config.h" |
10 | #include "gdk.h" |
11 | #include "gdk_private.h" |
12 | #include "gdk_calc_private.h" |
13 | |
14 | #define VALUE(x) (vars ? vars + VarHeapVal(vals, (x), width) : vals + (x) * width) |
15 | /* BATunique returns a bat that indicates the unique tail values of |
16 | * the input bat. This is essentially the same output as the |
17 | * "extents" output of BATgroup. The difference is that BATunique |
18 | * does not return the grouping bat. |
19 | * |
20 | * The inputs must be dense-headed, the first input is the bat from |
21 | * which unique rows are selected, the second input is a list of |
22 | * candidates. |
23 | */ |
24 | BAT * |
25 | BATunique(BAT *b, BAT *s) |
26 | { |
27 | BAT *bn; |
28 | BUN cnt; |
29 | const void *v; |
30 | const char *vals; |
31 | const char *vars; |
32 | int width; |
33 | oid i, o; |
34 | uint16_t *seen = NULL; |
35 | const char *nme; |
36 | Hash *hs = NULL; |
37 | BUN hb; |
38 | BATiter bi; |
39 | int (*cmp)(const void *, const void *); |
40 | bat parent; |
41 | struct canditer ci; |
42 | |
43 | BATcheck(b, "BATunique" , NULL); |
44 | cnt = canditer_init(&ci, b, s); |
45 | |
46 | if (b->tkey || cnt <= 1 || BATtdense(b)) { |
47 | /* trivial: already unique */ |
48 | bn = canditer_slice(&ci, 0, ci.ncand); |
49 | ALGODEBUG fprintf(stderr, "#BATunique(b=" ALGOBATFMT |
50 | ",s=" ALGOOPTBATFMT ")=" ALGOOPTBATFMT |
51 | ": trivial case: " |
52 | "already unique, slice candidates\n" , |
53 | ALGOBATPAR(b), ALGOOPTBATPAR(s), |
54 | ALGOOPTBATPAR(bn)); |
55 | return bn; |
56 | } |
57 | |
58 | if ((BATordered(b) && BATordered_rev(b)) || |
59 | (b->ttype == TYPE_void && is_oid_nil(b->tseqbase))) { |
60 | /* trivial: all values are the same */ |
61 | bn = BATdense(0, ci.seq, 1); |
62 | ALGODEBUG fprintf(stderr, "#BATunique(b=" ALGOBATFMT ",s=" |
63 | ALGOOPTBATFMT ")=" ALGOOPTBATFMT |
64 | ": trivial case: all equal\n" , |
65 | ALGOBATPAR(b), ALGOOPTBATPAR(s), |
66 | ALGOOPTBATPAR(bn)); |
67 | return bn; |
68 | } |
69 | |
70 | assert(b->ttype != TYPE_void); |
71 | |
72 | bn = COLnew(0, TYPE_oid, 1024, TRANSIENT); |
73 | if (bn == NULL) |
74 | return NULL; |
75 | vals = Tloc(b, 0); |
76 | if (b->tvarsized && b->ttype) |
77 | vars = b->tvheap->base; |
78 | else |
79 | vars = NULL; |
80 | width = Tsize(b); |
81 | cmp = ATOMcompare(b->ttype); |
82 | bi = bat_iterator(b); |
83 | |
84 | if (BATordered(b) || BATordered_rev(b)) { |
85 | const void *prev = NULL; |
86 | |
87 | ALGODEBUG fprintf(stderr, "#BATunique(b=" ALGOBATFMT ",s=" |
88 | ALGOOPTBATFMT "): (reverse) sorted\n" , |
89 | ALGOBATPAR(b), ALGOOPTBATPAR(s)); |
90 | for (i = 0; i < ci.ncand; i++) { |
91 | o = canditer_next(&ci); |
92 | v = VALUE(o - b->hseqbase); |
93 | if (prev == NULL || (*cmp)(v, prev) != 0) { |
94 | bunfastappTYPE(oid, bn, &o); |
95 | } |
96 | prev = v; |
97 | } |
98 | } else if (ATOMbasetype(b->ttype) == TYPE_bte) { |
99 | unsigned char val; |
100 | |
101 | ALGODEBUG fprintf(stderr, "#BATunique(b=" ALGOBATFMT ",s=" |
102 | ALGOOPTBATFMT "): byte sized atoms\n" , |
103 | ALGOBATPAR(b), ALGOOPTBATPAR(s)); |
104 | assert(vars == NULL); |
105 | seen = GDKzalloc((256 / 16) * sizeof(seen[0])); |
106 | if (seen == NULL) |
107 | goto bunins_failed; |
108 | for (i = 0; i < ci.ncand; i++) { |
109 | o = canditer_next(&ci); |
110 | val = ((const unsigned char *) vals)[o - b->hseqbase]; |
111 | if (!(seen[val >> 4] & (1U << (val & 0xF)))) { |
112 | seen[val >> 4] |= 1U << (val & 0xF); |
113 | bunfastappTYPE(oid, bn, &o); |
114 | if (bn->batCount == 256) { |
115 | /* there cannot be more than |
116 | * 256 distinct values */ |
117 | break; |
118 | } |
119 | } |
120 | } |
121 | GDKfree(seen); |
122 | seen = NULL; |
123 | } else if (ATOMbasetype(b->ttype) == TYPE_sht) { |
124 | unsigned short val; |
125 | |
126 | ALGODEBUG fprintf(stderr, "#BATunique(b=" ALGOBATFMT ",s=" |
127 | ALGOOPTBATFMT "): short sized atoms\n" , |
128 | ALGOBATPAR(b), ALGOOPTBATPAR(s)); |
129 | assert(vars == NULL); |
130 | seen = GDKzalloc((65536 / 16) * sizeof(seen[0])); |
131 | if (seen == NULL) |
132 | goto bunins_failed; |
133 | for (i = 0; i < ci.ncand; i++) { |
134 | o = canditer_next(&ci); |
135 | val = ((const unsigned short *) vals)[o - b->hseqbase]; |
136 | if (!(seen[val >> 4] & (1U << (val & 0xF)))) { |
137 | seen[val >> 4] |= 1U << (val & 0xF); |
138 | bunfastappTYPE(oid, bn, &o); |
139 | if (bn->batCount == 65536) { |
140 | /* there cannot be more than |
141 | * 65536 distinct values */ |
142 | break; |
143 | } |
144 | } |
145 | } |
146 | GDKfree(seen); |
147 | seen = NULL; |
148 | } else if (BATcheckhash(b) || |
149 | (!b->batTransient && |
150 | BAThash(b) == GDK_SUCCEED) || |
151 | ((parent = VIEWtparent(b)) != 0 && |
152 | BATcheckhash(BBPdescriptor(parent)))) { |
153 | BUN lo; |
154 | oid seq; |
155 | |
156 | /* we already have a hash table on b, or b is |
157 | * persistent and we could create a hash table, or b |
158 | * is a view on a bat that already has a hash table */ |
159 | ALGODEBUG fprintf(stderr, "#BATunique(b=" ALGOBATFMT ",s=" |
160 | ALGOOPTBATFMT "): use existing hash\n" , |
161 | ALGOBATPAR(b), ALGOOPTBATPAR(s)); |
162 | seq = b->hseqbase; |
163 | if (b->thash == NULL && (parent = VIEWtparent(b)) != 0) { |
164 | BAT *b2 = BBPdescriptor(parent); |
165 | lo = (BUN) ((b->theap.base - b2->theap.base) >> b->tshift); |
166 | b = b2; |
167 | bi = bat_iterator(b); |
168 | } else { |
169 | lo = 0; |
170 | } |
171 | hs = b->thash; |
172 | for (i = 0; i < ci.ncand; i++) { |
173 | BUN p; |
174 | |
175 | o = canditer_next(&ci); |
176 | p = o - seq; |
177 | v = VALUE(p); |
178 | for (hb = HASHgetlink(hs, p + lo); |
179 | hb != HASHnil(hs) && hb >= lo; |
180 | hb = HASHgetlink(hs, hb)) { |
181 | assert(hb < p + lo); |
182 | if (cmp(v, BUNtail(bi, hb)) == 0 && |
183 | canditer_search(&ci, |
184 | hb - lo + seq, |
185 | false) != BUN_NONE) { |
186 | /* we've seen this value |
187 | * before */ |
188 | break; |
189 | } |
190 | } |
191 | if (hb == HASHnil(hs) || hb < lo) { |
192 | bunfastappTYPE(oid, bn, &o); |
193 | } |
194 | } |
195 | } else { |
196 | BUN prb; |
197 | BUN p; |
198 | BUN mask; |
199 | int len; |
200 | |
201 | GDKclrerr(); /* not interested in BAThash errors */ |
202 | ALGODEBUG fprintf(stderr, "#BATunique(b=" ALGOBATFMT ",s=" |
203 | ALGOOPTBATFMT "): create partial hash\n" , |
204 | ALGOBATPAR(b), ALGOOPTBATPAR(s)); |
205 | nme = BBP_physical(b->batCacheid); |
206 | if (ATOMbasetype(b->ttype) == TYPE_bte) { |
207 | mask = (BUN) 1 << 8; |
208 | cmp = NULL; /* no compare needed, "hash" is perfect */ |
209 | } else if (ATOMbasetype(b->ttype) == TYPE_sht) { |
210 | mask = (BUN) 1 << 16; |
211 | cmp = NULL; /* no compare needed, "hash" is perfect */ |
212 | } else { |
213 | if (s) |
214 | mask = HASHmask(s->batCount); |
215 | else |
216 | mask = HASHmask(b->batCount); |
217 | if (mask < ((BUN) 1 << 16)) |
218 | mask = (BUN) 1 << 16; |
219 | } |
220 | if ((hs = GDKzalloc(sizeof(Hash))) == NULL) { |
221 | GDKerror("BATunique: cannot allocate hash table\n" ); |
222 | goto bunins_failed; |
223 | } |
224 | len = snprintf(hs->heap.filename, sizeof(hs->heap.filename), "%s.hash%d" , nme, THRgettid()); |
225 | if (len == -1 || len >= (int) sizeof(hs->heap.filename) || |
226 | HASHnew(hs, b->ttype, BUNlast(b), mask, BUN_NONE) != GDK_SUCCEED) { |
227 | GDKfree(hs); |
228 | hs = NULL; |
229 | GDKerror("BATunique: cannot allocate hash table\n" ); |
230 | goto bunins_failed; |
231 | } |
232 | for (i = 0; i < ci.ncand; i++) { |
233 | o = canditer_next(&ci); |
234 | v = VALUE(o - b->hseqbase); |
235 | prb = HASHprobe(hs, v); |
236 | for (hb = HASHget(hs, prb); |
237 | hb != HASHnil(hs); |
238 | hb = HASHgetlink(hs, hb)) { |
239 | if (cmp == NULL || cmp(v, BUNtail(bi, hb)) == 0) |
240 | break; |
241 | } |
242 | if (hb == HASHnil(hs)) { |
243 | p = o - b->hseqbase; |
244 | bunfastappTYPE(oid, bn, &o); |
245 | /* enter into hash table */ |
246 | HASHputlink(hs, p, HASHget(hs, prb)); |
247 | HASHput(hs, prb, p); |
248 | } |
249 | } |
250 | HEAPfree(&hs->heap, true); |
251 | GDKfree(hs); |
252 | } |
253 | |
254 | bn->theap.dirty = true; |
255 | bn->tsorted = true; |
256 | bn->trevsorted = BATcount(bn) <= 1; |
257 | bn->tkey = true; |
258 | bn->tnil = false; |
259 | bn->tnonil = true; |
260 | if (BATcount(bn) == BATcount(b)) { |
261 | /* it turns out all values are distinct */ |
262 | assert(b->tnokey[0] == 0); |
263 | assert(b->tnokey[1] == 0); |
264 | b->tkey = true; |
265 | b->batDirtydesc = true; |
266 | } |
267 | bn = virtualize(bn); |
268 | ALGODEBUG fprintf(stderr, "#BATunique(b=" ALGOBATFMT "," |
269 | "s=" ALGOOPTBATFMT ")=" |
270 | ALGOBATFMT "\n" , |
271 | ALGOBATPAR(b), ALGOOPTBATPAR(s), |
272 | ALGOBATPAR(bn)); |
273 | return bn; |
274 | |
275 | bunins_failed: |
276 | if (seen) |
277 | GDKfree(seen); |
278 | if (hs != NULL && hs != b->thash) { |
279 | HEAPfree(&hs->heap, true); |
280 | GDKfree(hs); |
281 | } |
282 | BBPreclaim(bn); |
283 | return NULL; |
284 | } |
285 | |