1/*
2 * This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
5 *
6 * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V.
7 */
8
9/*
10 * (c) Martin Kersten
11 * Multicolumn group-by support
12 * The group-by support module is meant to simplify code analysis and
13 * speedup the kernel on multi-attribute grouping routines.
14 *
15 * The target is to support SQL-like multicolumngroup_by operations, which are lists of
16 * attributes and a group aggregate function.
17 * Each group can be represented with an oid into the n-ary table.
18 * Consider the query "select count(*), max(A) from R group by A, B,C." whose code
19 * snippet in MAL would become something like:
20 *
21 * @verbatim
22 * _1:bat[:int] := sql.bind("sys","r","a",0);
23 * _2:bat[:str] := sql.bind("sys","r","b",0);
24 * _3:bat[:date] := sql.bind("sys","r","c",0);
25 * ...
26 * _9 := algebra.select(_1,0,100);
27 * ..
28 * (grp_4:bat[:lng], gid:bat[:oid]) := groupby.count(_9,_2);
29 * (grp_5:bat[:lng], gid:bat[:oid]) := groupby.max(_9,_2,_3);
30 * @end verbatim
31 *
32 * The id() function merely becomes the old-fashioned oid-based group identification list.
33 * This way related values can be obtained from the attribute columns. It can be the input
34 * for the count() function, which saves some re-computation.
35 *
36 * Aside the group ids, we also provide options to return the value based aggregate table
37 * to ease development of parallel plans.
38 *
39 * The implementation is optimized for a limited number of groups. The default is
40 * to fall back on the old code sequences.
41 *
42 */
43#include "monetdb_config.h"
44#include "groupby.h"
45#include "group.h"
46
47/*
48 * The implementation is based on a two-phase process. In phase 1, we estimate
49 * the number of groups to deal with using column independence.
50 * The grouping is performed in parallel over slices of the tables.
51 * The final pieces are glued together.
52 */
53typedef struct{
54 bat *bid; /* input bats */
55 BAT *candidate; /* list */
56 BAT **cols;
57 BUN *unique; /* number of different values */
58 int last;
59 BUN size;
60} AGGRtask;
61
62static AGGRtask*
63GROUPcollect( Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci){
64 AGGRtask *a;
65 int i;
66 BAT *b, *bs, *bh = NULL;
67 BUN sample;
68
69 (void) mb;
70 (void) cntxt;
71 a= (AGGRtask *) GDKzalloc(sizeof(*a));
72 if ( a == NULL)
73 return NULL;
74 a->bid = (bat*) GDKzalloc(pci->argc * sizeof(bat));
75 a->cols = (BAT**) GDKzalloc(pci->argc * sizeof(BAT*));
76 a->unique = (BUN *) GDKzalloc(pci->argc * sizeof(BUN));
77 if ( a->cols == NULL || a->bid == NULL || a->unique == NULL){
78 if(a->cols) GDKfree(a->cols);
79 if(a->bid) GDKfree(a->bid);
80 if(a->unique) GDKfree(a->unique);
81 GDKfree(a);
82 return NULL;
83 }
84 for ( i= pci->retc; i< pci->argc; i++, a->last++) {
85 a->bid[a->last] = *getArgReference_bat(stk,pci,i);
86 b = a->cols[a->last]= BATdescriptor(a->bid[a->last]);
87 if ( a->cols[a->last] == NULL){
88 for(a->last--; a->last>=0; a->last--)
89 BBPunfix(a->cols[a->last]->batCacheid);
90 GDKfree(a->cols);
91 GDKfree(a->bid);
92 GDKfree(a->unique);
93 GDKfree(a);
94 return NULL;
95 }
96 sample = BATcount(b) < 1000 ? BATcount(b): 1000;
97 bs = BATsample( b, sample);
98 if (bs) {
99 bh = BATunique(b, bs);
100 if (bh) {
101 a->unique[a->last] = BATcount(bh);
102 BBPunfix(bh->batCacheid);
103 }
104 BBPunfix(bs->batCacheid);
105 }
106 if ( b->tsorted)
107 a->unique[a->last] = 1000; /* sorting helps grouping */
108 a->size = BATcount(b);
109 }
110
111#ifdef _DEBUG_GROUPBY_
112 for(i=0; i<a->last; i++)
113 fprintf(stderr,"#group %d unique "BUNFMT "\n", i, a->unique[i]);
114#endif
115 return a;
116}
117
118static void
119GROUPcollectSort(AGGRtask *a, int start, int finish)
120{
121 int i,j,k;
122 BAT *b;
123 BUN sample;
124
125 /* sort the columns by decreasing unique */
126 for (i = start; i< finish; i++)
127 for( j = i+1; j<finish; j++)
128 if ( a->unique[i] < a->unique[j]){
129 k =a->bid[i];
130 a->bid[i] = a->bid[j];
131 a->bid[j] = k;
132
133 b= a->cols[i];
134 a->cols[i] = a->cols[j];
135 a->cols[j] = b;
136
137 sample = a->unique[i];
138 a->unique[i] = a->unique[j];
139 a->unique[j] = sample;
140 }
141}
142
143static void
144GROUPdelete(AGGRtask *a){
145 for(a->last--; a->last>=0; a->last--){
146 BBPunfix(a->cols[a->last]->batCacheid);
147 }
148 GDKfree(a->bid);
149 GDKfree(a->cols);
150 GDKfree(a->unique);
151 GDKfree(a);
152}
153
154/*
155 * The groups optimizer takes a grouping sequence and attempts to
156 * minimize the intermediate result. The choice depends on a good
157 * estimate of intermediate results using properties.
158 */
159
160str
161GROUPmulticolumngroup(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
162{
163 bat *grp = getArgReference_bat(stk, pci, 0);
164 bat *ext = getArgReference_bat(stk, pci, 1);
165 bat *hist = getArgReference_bat(stk, pci, 2);
166 int i, j;
167 bat oldgrp, oldext, oldhist;
168 str msg = MAL_SUCCEED;
169 BAT *b;
170 BUN count = 0;
171 AGGRtask *aggr;
172
173 aggr = GROUPcollect(cntxt, mb, stk, pci);
174 if( aggr == NULL)
175 throw(MAL,"group.multicolumn", SQLSTATE(HY001) MAL_MALLOC_FAIL);
176 GROUPcollectSort(aggr, 0, aggr->last);
177
178 /* (grp,ext,hist) := group.group(..) */
179 /* use the old pattern to perform the incremental grouping */
180 *grp = 0;
181 *ext = 0;
182 *hist = 0;
183 msg = GRPgroup1(grp, ext, hist, &aggr->bid[0]);
184 i = 1;
185 if (msg == MAL_SUCCEED && aggr->last > 1)
186 do {
187 /* early break when there are as many groups as entries */
188 b = BATdescriptor(*hist);
189 if (b) {
190 j = BATcount(b) == count;
191 BBPunfix(*hist);
192 if (j)
193 break;
194 }
195
196 /* (grp,ext,hist) := group.subgroup(arg,grp,ext,hist) */
197 oldgrp = *grp;
198 oldext = *ext;
199 oldhist = *hist;
200 *grp = 0;
201 *ext = 0;
202 *hist = 0;
203 msg = GRPsubgroup5(grp, ext, hist, &aggr->bid[i], NULL, &oldgrp, &oldext, &oldhist);
204 BBPrelease(oldgrp);
205 BBPrelease(oldext);
206 BBPrelease(oldhist);
207 } while (msg == MAL_SUCCEED && ++i < aggr->last);
208 GROUPdelete(aggr);
209 return msg;
210}
211