Line data Source code
1 : /*
2 : * This Source Code Form is subject to the terms of the Mozilla Public
3 : * License, v. 2.0. If a copy of the MPL was not distributed with this
4 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
5 : *
6 : * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V.
7 : */
8 :
9 : /*
10 : * (c) Martin Kersten
11 : * Multicolumn group-by support
12 : * The group-by support module is meant to simplify code analysis and
13 : * speedup the kernel on multi-attribute grouping routines.
14 : *
15 : * The target is to support SQL-like multicolumngroup_by operations, which are lists of
16 : * attributes and a group aggregate function.
17 : * Each group can be represented with an oid into the n-ary table.
18 : * Consider the query "select count(*), max(A) from R group by A, B,C." whose code
19 : * snippet in MAL would become something like:
20 : *
21 : * @verbatim
22 : * _1:bat[:int] := sql.bind("sys","r","a",0);
23 : * _2:bat[:str] := sql.bind("sys","r","b",0);
24 : * _3:bat[:date] := sql.bind("sys","r","c",0);
25 : * ...
26 : * _9 := algebra.select(_1,0,100);
27 : * ..
28 : * (grp_4:bat[:lng], gid:bat[:oid]) := groupby.count(_9,_2);
29 : * (grp_5:bat[:lng], gid:bat[:oid]) := groupby.max(_9,_2,_3);
30 : * @end verbatim
31 : *
32 : * The id() function merely becomes the old-fashioned oid-based group identification list.
33 : * This way related values can be obtained from the attribute columns. It can be the input
34 : * for the count() function, which saves some re-computation.
35 : *
36 : * Aside the group ids, we also provide options to return the value based aggregate table
37 : * to ease development of parallel plans.
38 : *
39 : * The implementation is optimized for a limited number of groups. The default is
40 : * to fall back on the old code sequences.
41 : *
42 : */
43 : #include "monetdb_config.h"
44 : #include "mal.h"
45 : #include "mal_interpreter.h"
46 : #include "group.h"
47 : #include "mal_exception.h"
48 :
49 : /*
50 : * The implementation is based on a two-phase process. In phase 1, we estimate
51 : * the number of groups to deal with using column independence.
52 : * The grouping is performed in parallel over slices of the tables.
53 : * The final pieces are glued together.
54 : */
55 : typedef struct{
56 : bat *bid; /* input bats */
57 : BAT *candidate; /* list */
58 : BAT **cols;
59 : BUN *unique; /* number of different values */
60 : int last;
61 : BUN size;
62 : } AGGRtask;
63 :
64 : static AGGRtask*
65 10 : GROUPcollect( Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci){
66 : AGGRtask *a;
67 : int i;
68 : BAT *b, *bs, *bh = NULL;
69 : BUN sample;
70 :
71 : (void) mb;
72 : (void) cntxt;
73 10 : a= (AGGRtask *) GDKzalloc(sizeof(*a));
74 10 : if ( a == NULL)
75 : return NULL;
76 10 : a->bid = (bat*) GDKzalloc(pci->argc * sizeof(bat));
77 10 : a->cols = (BAT**) GDKzalloc(pci->argc * sizeof(BAT*));
78 10 : a->unique = (BUN *) GDKzalloc(pci->argc * sizeof(BUN));
79 10 : if ( a->cols == NULL || a->bid == NULL || a->unique == NULL){
80 0 : if(a->cols) GDKfree(a->cols);
81 0 : if(a->bid) GDKfree(a->bid);
82 0 : if(a->unique) GDKfree(a->unique);
83 0 : GDKfree(a);
84 0 : return NULL;
85 : }
86 31 : for ( i= pci->retc; i< pci->argc; i++, a->last++) {
87 21 : a->bid[a->last] = *getArgReference_bat(stk,pci,i);
88 21 : b = a->cols[a->last]= BATdescriptor(a->bid[a->last]);
89 21 : if ( a->cols[a->last] == NULL){
90 0 : for(a->last--; a->last>=0; a->last--)
91 0 : BBPunfix(a->cols[a->last]->batCacheid);
92 0 : GDKfree(a->cols);
93 0 : GDKfree(a->bid);
94 0 : GDKfree(a->unique);
95 0 : GDKfree(a);
96 0 : return NULL;
97 : }
98 21 : sample = BATcount(b) < 1000 ? BATcount(b): 1000;
99 21 : bs = BATsample( b, sample);
100 21 : if (bs) {
101 21 : bh = BATunique(b, bs);
102 21 : if (bh) {
103 21 : a->unique[a->last] = BATcount(bh);
104 21 : BBPunfix(bh->batCacheid);
105 : }
106 21 : BBPunfix(bs->batCacheid);
107 : }
108 21 : if ( b->tsorted)
109 11 : a->unique[a->last] = 1000; /* sorting helps grouping */
110 21 : a->size = BATcount(b);
111 : }
112 :
113 : return a;
114 : }
115 :
116 : static void
117 10 : GROUPcollectSort(AGGRtask *a, int start, int finish)
118 : {
119 : int i,j,k;
120 : BAT *b;
121 : BUN sample;
122 :
123 : /* sort the columns by decreasing unique */
124 31 : for (i = start; i< finish; i++)
125 36 : for( j = i+1; j<finish; j++)
126 15 : if ( a->unique[i] < a->unique[j]){
127 0 : k =a->bid[i];
128 0 : a->bid[i] = a->bid[j];
129 0 : a->bid[j] = k;
130 :
131 0 : b= a->cols[i];
132 0 : a->cols[i] = a->cols[j];
133 0 : a->cols[j] = b;
134 :
135 : sample = a->unique[i];
136 0 : a->unique[i] = a->unique[j];
137 0 : a->unique[j] = sample;
138 : }
139 10 : }
140 :
141 : static void
142 10 : GROUPdelete(AGGRtask *a){
143 31 : for(a->last--; a->last>=0; a->last--){
144 21 : BBPunfix(a->cols[a->last]->batCacheid);
145 : }
146 10 : GDKfree(a->bid);
147 10 : GDKfree(a->cols);
148 10 : GDKfree(a->unique);
149 10 : GDKfree(a);
150 10 : }
151 :
152 : /*
153 : * The groups optimizer takes a grouping sequence and attempts to
154 : * minimize the intermediate result. The choice depends on a good
155 : * estimate of intermediate results using properties.
156 : */
157 :
158 : static str
159 10 : GROUPmulticolumngroup(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
160 : {
161 10 : bat *grp = getArgReference_bat(stk, pci, 0);
162 10 : bat *ext = getArgReference_bat(stk, pci, 1);
163 10 : bat *hist = getArgReference_bat(stk, pci, 2);
164 : int i, j;
165 : bat oldgrp, oldext, oldhist;
166 : str msg = MAL_SUCCEED;
167 : BAT *b;
168 : BUN count = 0;
169 : AGGRtask *aggr;
170 :
171 10 : aggr = GROUPcollect(cntxt, mb, stk, pci);
172 10 : if( aggr == NULL)
173 0 : throw(MAL,"group.multicolumn", SQLSTATE(HY013) MAL_MALLOC_FAIL);
174 10 : GROUPcollectSort(aggr, 0, aggr->last);
175 :
176 : /* (grp,ext,hist) := group.group(..) */
177 : /* use the old pattern to perform the incremental grouping */
178 10 : *grp = 0;
179 10 : *ext = 0;
180 10 : *hist = 0;
181 10 : msg = GRPgroup1(grp, ext, hist, &aggr->bid[0]);
182 : i = 1;
183 10 : if (msg == MAL_SUCCEED && aggr->last > 1)
184 : do {
185 : /* early break when there are as many groups as entries */
186 11 : b = BBPquickdesc(*hist);
187 11 : if (b) {
188 11 : j = BATcount(b) == count;
189 11 : if (j)
190 : break;
191 : }
192 :
193 : /* (grp,ext,hist) := group.subgroup(arg,grp,ext,hist) */
194 11 : oldgrp = *grp;
195 11 : oldext = *ext;
196 11 : oldhist = *hist;
197 11 : *grp = 0;
198 11 : *ext = 0;
199 11 : *hist = 0;
200 11 : msg = GRPsubgroup5(grp, ext, hist, &aggr->bid[i], NULL, &oldgrp, &oldext, &oldhist);
201 11 : BBPrelease(oldgrp);
202 11 : BBPrelease(oldext);
203 11 : BBPrelease(oldhist);
204 11 : } while (msg == MAL_SUCCEED && ++i < aggr->last);
205 10 : GROUPdelete(aggr);
206 10 : return msg;
207 : }
208 :
209 : #include "mel.h"
210 : mel_func groupby_init_funcs[] = {
211 : pattern("group", "multicolumn", GROUPmulticolumngroup, false, "Derivation of a group index over multiple columns.", args(3,4, batarg("ref",oid),batarg("grp",oid),batargany("hist",0),batvarargany("b",0))),
212 : { .imp=NULL }
213 : };
214 : #include "mal_import.h"
215 : #ifdef _MSC_VER
216 : #undef read
217 : #pragma section(".CRT$XCU",read)
218 : #endif
219 257 : LIB_STARTUP_FUNC(init_groupby_mal)
220 257 : { mal_module("groupby", NULL, groupby_init_funcs); }
|