| 1 | /* |
| 2 | * This Source Code Form is subject to the terms of the Mozilla Public |
| 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
| 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| 5 | * |
| 6 | * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V. |
| 7 | */ |
| 8 | |
| 9 | /* |
| 10 | * (c) Martin Kersten |
| 11 | * Multicolumn group-by support |
| 12 | * The group-by support module is meant to simplify code analysis and |
| 13 | * speedup the kernel on multi-attribute grouping routines. |
| 14 | * |
| 15 | * The target is to support SQL-like multicolumngroup_by operations, which are lists of |
| 16 | * attributes and a group aggregate function. |
| 17 | * Each group can be represented with an oid into the n-ary table. |
| 18 | * Consider the query "select count(*), max(A) from R group by A, B,C." whose code |
| 19 | * snippet in MAL would become something like: |
| 20 | * |
| 21 | * @verbatim |
| 22 | * _1:bat[:int] := sql.bind("sys","r","a",0); |
| 23 | * _2:bat[:str] := sql.bind("sys","r","b",0); |
| 24 | * _3:bat[:date] := sql.bind("sys","r","c",0); |
| 25 | * ... |
| 26 | * _9 := algebra.select(_1,0,100); |
| 27 | * .. |
| 28 | * (grp_4:bat[:lng], gid:bat[:oid]) := groupby.count(_9,_2); |
| 29 | * (grp_5:bat[:lng], gid:bat[:oid]) := groupby.max(_9,_2,_3); |
| 30 | * @end verbatim |
| 31 | * |
| 32 | * The id() function merely becomes the old-fashioned oid-based group identification list. |
| 33 | * This way related values can be obtained from the attribute columns. It can be the input |
| 34 | * for the count() function, which saves some re-computation. |
| 35 | * |
| 36 | * Aside the group ids, we also provide options to return the value based aggregate table |
| 37 | * to ease development of parallel plans. |
| 38 | * |
| 39 | * The implementation is optimized for a limited number of groups. The default is |
| 40 | * to fall back on the old code sequences. |
| 41 | * |
| 42 | */ |
| 43 | #include "monetdb_config.h" |
| 44 | #include "groupby.h" |
| 45 | #include "group.h" |
| 46 | |
| 47 | /* |
| 48 | * The implementation is based on a two-phase process. In phase 1, we estimate |
| 49 | * the number of groups to deal with using column independence. |
| 50 | * The grouping is performed in parallel over slices of the tables. |
| 51 | * The final pieces are glued together. |
| 52 | */ |
| 53 | typedef struct{ |
| 54 | bat *bid; /* input bats */ |
| 55 | BAT *candidate; /* list */ |
| 56 | BAT **cols; |
| 57 | BUN *unique; /* number of different values */ |
| 58 | int last; |
| 59 | BUN size; |
| 60 | } AGGRtask; |
| 61 | |
| 62 | static AGGRtask* |
| 63 | GROUPcollect( Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci){ |
| 64 | AGGRtask *a; |
| 65 | int i; |
| 66 | BAT *b, *bs, *bh = NULL; |
| 67 | BUN sample; |
| 68 | |
| 69 | (void) mb; |
| 70 | (void) cntxt; |
| 71 | a= (AGGRtask *) GDKzalloc(sizeof(*a)); |
| 72 | if ( a == NULL) |
| 73 | return NULL; |
| 74 | a->bid = (bat*) GDKzalloc(pci->argc * sizeof(bat)); |
| 75 | a->cols = (BAT**) GDKzalloc(pci->argc * sizeof(BAT*)); |
| 76 | a->unique = (BUN *) GDKzalloc(pci->argc * sizeof(BUN)); |
| 77 | if ( a->cols == NULL || a->bid == NULL || a->unique == NULL){ |
| 78 | if(a->cols) GDKfree(a->cols); |
| 79 | if(a->bid) GDKfree(a->bid); |
| 80 | if(a->unique) GDKfree(a->unique); |
| 81 | GDKfree(a); |
| 82 | return NULL; |
| 83 | } |
| 84 | for ( i= pci->retc; i< pci->argc; i++, a->last++) { |
| 85 | a->bid[a->last] = *getArgReference_bat(stk,pci,i); |
| 86 | b = a->cols[a->last]= BATdescriptor(a->bid[a->last]); |
| 87 | if ( a->cols[a->last] == NULL){ |
| 88 | for(a->last--; a->last>=0; a->last--) |
| 89 | BBPunfix(a->cols[a->last]->batCacheid); |
| 90 | GDKfree(a->cols); |
| 91 | GDKfree(a->bid); |
| 92 | GDKfree(a->unique); |
| 93 | GDKfree(a); |
| 94 | return NULL; |
| 95 | } |
| 96 | sample = BATcount(b) < 1000 ? BATcount(b): 1000; |
| 97 | bs = BATsample( b, sample); |
| 98 | if (bs) { |
| 99 | bh = BATunique(b, bs); |
| 100 | if (bh) { |
| 101 | a->unique[a->last] = BATcount(bh); |
| 102 | BBPunfix(bh->batCacheid); |
| 103 | } |
| 104 | BBPunfix(bs->batCacheid); |
| 105 | } |
| 106 | if ( b->tsorted) |
| 107 | a->unique[a->last] = 1000; /* sorting helps grouping */ |
| 108 | a->size = BATcount(b); |
| 109 | } |
| 110 | |
| 111 | #ifdef _DEBUG_GROUPBY_ |
| 112 | for(i=0; i<a->last; i++) |
| 113 | fprintf(stderr,"#group %d unique " BUNFMT "\n" , i, a->unique[i]); |
| 114 | #endif |
| 115 | return a; |
| 116 | } |
| 117 | |
| 118 | static void |
| 119 | GROUPcollectSort(AGGRtask *a, int start, int finish) |
| 120 | { |
| 121 | int i,j,k; |
| 122 | BAT *b; |
| 123 | BUN sample; |
| 124 | |
| 125 | /* sort the columns by decreasing unique */ |
| 126 | for (i = start; i< finish; i++) |
| 127 | for( j = i+1; j<finish; j++) |
| 128 | if ( a->unique[i] < a->unique[j]){ |
| 129 | k =a->bid[i]; |
| 130 | a->bid[i] = a->bid[j]; |
| 131 | a->bid[j] = k; |
| 132 | |
| 133 | b= a->cols[i]; |
| 134 | a->cols[i] = a->cols[j]; |
| 135 | a->cols[j] = b; |
| 136 | |
| 137 | sample = a->unique[i]; |
| 138 | a->unique[i] = a->unique[j]; |
| 139 | a->unique[j] = sample; |
| 140 | } |
| 141 | } |
| 142 | |
| 143 | static void |
| 144 | GROUPdelete(AGGRtask *a){ |
| 145 | for(a->last--; a->last>=0; a->last--){ |
| 146 | BBPunfix(a->cols[a->last]->batCacheid); |
| 147 | } |
| 148 | GDKfree(a->bid); |
| 149 | GDKfree(a->cols); |
| 150 | GDKfree(a->unique); |
| 151 | GDKfree(a); |
| 152 | } |
| 153 | |
| 154 | /* |
| 155 | * The groups optimizer takes a grouping sequence and attempts to |
| 156 | * minimize the intermediate result. The choice depends on a good |
| 157 | * estimate of intermediate results using properties. |
| 158 | */ |
| 159 | |
| 160 | str |
| 161 | GROUPmulticolumngroup(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) |
| 162 | { |
| 163 | bat *grp = getArgReference_bat(stk, pci, 0); |
| 164 | bat *ext = getArgReference_bat(stk, pci, 1); |
| 165 | bat *hist = getArgReference_bat(stk, pci, 2); |
| 166 | int i, j; |
| 167 | bat oldgrp, oldext, oldhist; |
| 168 | str msg = MAL_SUCCEED; |
| 169 | BAT *b; |
| 170 | BUN count = 0; |
| 171 | AGGRtask *aggr; |
| 172 | |
| 173 | aggr = GROUPcollect(cntxt, mb, stk, pci); |
| 174 | if( aggr == NULL) |
| 175 | throw(MAL,"group.multicolumn" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 176 | GROUPcollectSort(aggr, 0, aggr->last); |
| 177 | |
| 178 | /* (grp,ext,hist) := group.group(..) */ |
| 179 | /* use the old pattern to perform the incremental grouping */ |
| 180 | *grp = 0; |
| 181 | *ext = 0; |
| 182 | *hist = 0; |
| 183 | msg = GRPgroup1(grp, ext, hist, &aggr->bid[0]); |
| 184 | i = 1; |
| 185 | if (msg == MAL_SUCCEED && aggr->last > 1) |
| 186 | do { |
| 187 | /* early break when there are as many groups as entries */ |
| 188 | b = BATdescriptor(*hist); |
| 189 | if (b) { |
| 190 | j = BATcount(b) == count; |
| 191 | BBPunfix(*hist); |
| 192 | if (j) |
| 193 | break; |
| 194 | } |
| 195 | |
| 196 | /* (grp,ext,hist) := group.subgroup(arg,grp,ext,hist) */ |
| 197 | oldgrp = *grp; |
| 198 | oldext = *ext; |
| 199 | oldhist = *hist; |
| 200 | *grp = 0; |
| 201 | *ext = 0; |
| 202 | *hist = 0; |
| 203 | msg = GRPsubgroup5(grp, ext, hist, &aggr->bid[i], NULL, &oldgrp, &oldext, &oldhist); |
| 204 | BBPrelease(oldgrp); |
| 205 | BBPrelease(oldext); |
| 206 | BBPrelease(oldhist); |
| 207 | } while (msg == MAL_SUCCEED && ++i < aggr->last); |
| 208 | GROUPdelete(aggr); |
| 209 | return msg; |
| 210 | } |
| 211 | |