1 | /* |
2 | * This Source Code Form is subject to the terms of the Mozilla Public |
3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
5 | * |
6 | * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V. |
7 | */ |
8 | |
9 | /* |
10 | * (c) Martin Kersten |
11 | * Multicolumn group-by support |
12 | * The group-by support module is meant to simplify code analysis and |
13 | * speedup the kernel on multi-attribute grouping routines. |
14 | * |
15 | * The target is to support SQL-like multicolumngroup_by operations, which are lists of |
16 | * attributes and a group aggregate function. |
17 | * Each group can be represented with an oid into the n-ary table. |
18 | * Consider the query "select count(*), max(A) from R group by A, B,C." whose code |
19 | * snippet in MAL would become something like: |
20 | * |
21 | * @verbatim |
22 | * _1:bat[:int] := sql.bind("sys","r","a",0); |
23 | * _2:bat[:str] := sql.bind("sys","r","b",0); |
24 | * _3:bat[:date] := sql.bind("sys","r","c",0); |
25 | * ... |
26 | * _9 := algebra.select(_1,0,100); |
27 | * .. |
28 | * (grp_4:bat[:lng], gid:bat[:oid]) := groupby.count(_9,_2); |
29 | * (grp_5:bat[:lng], gid:bat[:oid]) := groupby.max(_9,_2,_3); |
30 | * @end verbatim |
31 | * |
32 | * The id() function merely becomes the old-fashioned oid-based group identification list. |
33 | * This way related values can be obtained from the attribute columns. It can be the input |
34 | * for the count() function, which saves some re-computation. |
35 | * |
36 | * Aside the group ids, we also provide options to return the value based aggregate table |
37 | * to ease development of parallel plans. |
38 | * |
39 | * The implementation is optimized for a limited number of groups. The default is |
40 | * to fall back on the old code sequences. |
41 | * |
42 | */ |
43 | #include "monetdb_config.h" |
44 | #include "groupby.h" |
45 | #include "group.h" |
46 | |
47 | /* |
48 | * The implementation is based on a two-phase process. In phase 1, we estimate |
49 | * the number of groups to deal with using column independence. |
50 | * The grouping is performed in parallel over slices of the tables. |
51 | * The final pieces are glued together. |
52 | */ |
53 | typedef struct{ |
54 | bat *bid; /* input bats */ |
55 | BAT *candidate; /* list */ |
56 | BAT **cols; |
57 | BUN *unique; /* number of different values */ |
58 | int last; |
59 | BUN size; |
60 | } AGGRtask; |
61 | |
62 | static AGGRtask* |
63 | GROUPcollect( Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci){ |
64 | AGGRtask *a; |
65 | int i; |
66 | BAT *b, *bs, *bh = NULL; |
67 | BUN sample; |
68 | |
69 | (void) mb; |
70 | (void) cntxt; |
71 | a= (AGGRtask *) GDKzalloc(sizeof(*a)); |
72 | if ( a == NULL) |
73 | return NULL; |
74 | a->bid = (bat*) GDKzalloc(pci->argc * sizeof(bat)); |
75 | a->cols = (BAT**) GDKzalloc(pci->argc * sizeof(BAT*)); |
76 | a->unique = (BUN *) GDKzalloc(pci->argc * sizeof(BUN)); |
77 | if ( a->cols == NULL || a->bid == NULL || a->unique == NULL){ |
78 | if(a->cols) GDKfree(a->cols); |
79 | if(a->bid) GDKfree(a->bid); |
80 | if(a->unique) GDKfree(a->unique); |
81 | GDKfree(a); |
82 | return NULL; |
83 | } |
84 | for ( i= pci->retc; i< pci->argc; i++, a->last++) { |
85 | a->bid[a->last] = *getArgReference_bat(stk,pci,i); |
86 | b = a->cols[a->last]= BATdescriptor(a->bid[a->last]); |
87 | if ( a->cols[a->last] == NULL){ |
88 | for(a->last--; a->last>=0; a->last--) |
89 | BBPunfix(a->cols[a->last]->batCacheid); |
90 | GDKfree(a->cols); |
91 | GDKfree(a->bid); |
92 | GDKfree(a->unique); |
93 | GDKfree(a); |
94 | return NULL; |
95 | } |
96 | sample = BATcount(b) < 1000 ? BATcount(b): 1000; |
97 | bs = BATsample( b, sample); |
98 | if (bs) { |
99 | bh = BATunique(b, bs); |
100 | if (bh) { |
101 | a->unique[a->last] = BATcount(bh); |
102 | BBPunfix(bh->batCacheid); |
103 | } |
104 | BBPunfix(bs->batCacheid); |
105 | } |
106 | if ( b->tsorted) |
107 | a->unique[a->last] = 1000; /* sorting helps grouping */ |
108 | a->size = BATcount(b); |
109 | } |
110 | |
111 | #ifdef _DEBUG_GROUPBY_ |
112 | for(i=0; i<a->last; i++) |
113 | fprintf(stderr,"#group %d unique " BUNFMT "\n" , i, a->unique[i]); |
114 | #endif |
115 | return a; |
116 | } |
117 | |
118 | static void |
119 | GROUPcollectSort(AGGRtask *a, int start, int finish) |
120 | { |
121 | int i,j,k; |
122 | BAT *b; |
123 | BUN sample; |
124 | |
125 | /* sort the columns by decreasing unique */ |
126 | for (i = start; i< finish; i++) |
127 | for( j = i+1; j<finish; j++) |
128 | if ( a->unique[i] < a->unique[j]){ |
129 | k =a->bid[i]; |
130 | a->bid[i] = a->bid[j]; |
131 | a->bid[j] = k; |
132 | |
133 | b= a->cols[i]; |
134 | a->cols[i] = a->cols[j]; |
135 | a->cols[j] = b; |
136 | |
137 | sample = a->unique[i]; |
138 | a->unique[i] = a->unique[j]; |
139 | a->unique[j] = sample; |
140 | } |
141 | } |
142 | |
143 | static void |
144 | GROUPdelete(AGGRtask *a){ |
145 | for(a->last--; a->last>=0; a->last--){ |
146 | BBPunfix(a->cols[a->last]->batCacheid); |
147 | } |
148 | GDKfree(a->bid); |
149 | GDKfree(a->cols); |
150 | GDKfree(a->unique); |
151 | GDKfree(a); |
152 | } |
153 | |
154 | /* |
155 | * The groups optimizer takes a grouping sequence and attempts to |
156 | * minimize the intermediate result. The choice depends on a good |
157 | * estimate of intermediate results using properties. |
158 | */ |
159 | |
160 | str |
161 | GROUPmulticolumngroup(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) |
162 | { |
163 | bat *grp = getArgReference_bat(stk, pci, 0); |
164 | bat *ext = getArgReference_bat(stk, pci, 1); |
165 | bat *hist = getArgReference_bat(stk, pci, 2); |
166 | int i, j; |
167 | bat oldgrp, oldext, oldhist; |
168 | str msg = MAL_SUCCEED; |
169 | BAT *b; |
170 | BUN count = 0; |
171 | AGGRtask *aggr; |
172 | |
173 | aggr = GROUPcollect(cntxt, mb, stk, pci); |
174 | if( aggr == NULL) |
175 | throw(MAL,"group.multicolumn" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
176 | GROUPcollectSort(aggr, 0, aggr->last); |
177 | |
178 | /* (grp,ext,hist) := group.group(..) */ |
179 | /* use the old pattern to perform the incremental grouping */ |
180 | *grp = 0; |
181 | *ext = 0; |
182 | *hist = 0; |
183 | msg = GRPgroup1(grp, ext, hist, &aggr->bid[0]); |
184 | i = 1; |
185 | if (msg == MAL_SUCCEED && aggr->last > 1) |
186 | do { |
187 | /* early break when there are as many groups as entries */ |
188 | b = BATdescriptor(*hist); |
189 | if (b) { |
190 | j = BATcount(b) == count; |
191 | BBPunfix(*hist); |
192 | if (j) |
193 | break; |
194 | } |
195 | |
196 | /* (grp,ext,hist) := group.subgroup(arg,grp,ext,hist) */ |
197 | oldgrp = *grp; |
198 | oldext = *ext; |
199 | oldhist = *hist; |
200 | *grp = 0; |
201 | *ext = 0; |
202 | *hist = 0; |
203 | msg = GRPsubgroup5(grp, ext, hist, &aggr->bid[i], NULL, &oldgrp, &oldext, &oldhist); |
204 | BBPrelease(oldgrp); |
205 | BBPrelease(oldext); |
206 | BBPrelease(oldhist); |
207 | } while (msg == MAL_SUCCEED && ++i < aggr->last); |
208 | GROUPdelete(aggr); |
209 | return msg; |
210 | } |
211 | |