LCOV - code coverage report
Current view: top level - monetdb5/modules/mal - groupby.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 45 54 83.3 %
Date: 2021-01-13 20:07:21 Functions: 4 4 100.0 %

          Line data    Source code
       1             : /*
       2             :  * This Source Code Form is subject to the terms of the Mozilla Public
       3             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       4             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       5             :  *
       6             :  * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V.
       7             :  */
       8             : 
       9             : /*
      10             :  * (c) Martin Kersten
      11             :  * Multicolumn group-by support
      12             :  * The group-by support module is meant to simplify code analysis and
      13             :  * speedup the kernel on multi-attribute grouping routines.
      14             :  *
      15             :  * The target is to support SQL-like multicolumngroup_by operations, which are lists of
      16             :  * attributes and a group aggregate function.
      17             :  * Each group can be represented with an oid into the n-ary table.
      18             :  * Consider the query "select count(*), max(A) from R group by A, B,C." whose code
      19             :  * snippet in MAL would become something like:
      20             :  *
      21             :  * @verbatim
      22             :  * _1:bat[:int]  := sql.bind("sys","r","a",0);
      23             :  * _2:bat[:str]  := sql.bind("sys","r","b",0);
      24             :  * _3:bat[:date]  := sql.bind("sys","r","c",0);
      25             :  * ...
      26             :  * _9 := algebra.select(_1,0,100);
      27             :  * ..
      28             :  * (grp_4:bat[:lng], gid:bat[:oid]) := groupby.count(_9,_2);
      29             :  * (grp_5:bat[:lng], gid:bat[:oid]) := groupby.max(_9,_2,_3);
      30             :  * @end verbatim
      31             :  *
      32             :  * The id() function merely becomes the old-fashioned oid-based group identification list.
      33             :  * This way related values can be obtained from the attribute columns. It can be the input
      34             :  * for the count() function, which saves some re-computation.
      35             :  *
      36             :  * Aside the group ids, we also provide options to return the value based aggregate table
      37             :  * to ease development of parallel plans.
      38             :  *
      39             :  * The implementation is optimized for a limited number of groups. The default is
      40             :  * to fall back on the old code sequences.
      41             :  *
      42             :  */
      43             : #include "monetdb_config.h"
      44             : #include "mal.h"
      45             : #include "mal_interpreter.h"
      46             : #include "group.h"
      47             : 
      48             : /*
      49             :  * The implementation is based on a two-phase process. In phase 1, we estimate
      50             :  * the number of groups to deal with using column independence.
      51             :  * The grouping is performed in parallel over slices of the tables.
      52             :  * The final pieces are glued together.
      53             :  */
      54             : typedef struct{
      55             :         bat *bid;       /* input bats */
      56             :         BAT *candidate; /* list */
      57             :         BAT **cols;
      58             :         BUN *unique; /* number of different values */
      59             :         int last;
      60             :         BUN size;
      61             : } AGGRtask;
      62             : 
      63             : static AGGRtask*
      64             : GROUPcollect( Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci){
      65             :         AGGRtask *a;
      66             :         int i;
      67             :         BAT *b, *bs, *bh = NULL;
      68             :         BUN sample;
      69             : 
      70             :         (void) mb;
      71             :         (void) cntxt;
      72             :         a= (AGGRtask *) GDKzalloc(sizeof(*a));
      73             :         if ( a == NULL)
      74             :                 return NULL;
      75             :         a->bid = (bat*) GDKzalloc(pci->argc * sizeof(bat));
      76             :         a->cols = (BAT**) GDKzalloc(pci->argc * sizeof(BAT*));
      77             :         a->unique = (BUN *) GDKzalloc(pci->argc * sizeof(BUN));
      78             :         if ( a->cols == NULL || a->bid == NULL || a->unique == NULL){
      79             :                 if(a->cols) GDKfree(a->cols);
      80             :                 if(a->bid) GDKfree(a->bid);
      81             :                 if(a->unique) GDKfree(a->unique);
      82             :                 GDKfree(a);
      83             :                 return NULL;
      84             :         }
      85             :         for ( i= pci->retc; i< pci->argc; i++, a->last++) {
      86             :                 a->bid[a->last] = *getArgReference_bat(stk,pci,i);
      87             :                 b = a->cols[a->last]= BATdescriptor(a->bid[a->last]);
      88             :                 if ( a->cols[a->last] == NULL){
      89             :                         for(a->last--; a->last>=0; a->last--)
      90             :                                 BBPunfix(a->cols[a->last]->batCacheid);
      91             :                         GDKfree(a->cols);
      92             :                         GDKfree(a->bid);
      93             :                         GDKfree(a->unique);
      94             :                         GDKfree(a);
      95             :                         return NULL;
      96             :                 }
      97             :                 sample = BATcount(b) < 1000 ? BATcount(b): 1000;
      98             :                 bs = BATsample( b, sample);
      99             :                 if (bs) {
     100             :                         bh = BATunique(b, bs);
     101             :                         if (bh) {
     102             :                                 a->unique[a->last] = BATcount(bh);
     103             :                                 BBPunfix(bh->batCacheid);
     104             :                         }
     105             :                         BBPunfix(bs->batCacheid);
     106             :                 }
     107             :                 if ( b->tsorted)
     108             :                         a->unique[a->last] = 1000; /* sorting helps grouping */
     109             :                 a->size = BATcount(b);
     110             :         }
     111             : 
     112             :         return a;
     113             : }
     114             : 
     115             : static void
     116          10 : GROUPcollectSort(AGGRtask *a, int start, int finish)
     117             : {
     118             :         int i,j,k;
     119             :         BAT *b;
     120             :         BUN sample;
     121             : 
     122             :         /* sort the columns by decreasing unique */
     123          31 :         for (i = start; i< finish; i++)
     124          36 :         for( j = i+1; j<finish; j++)
     125          15 :         if ( a->unique[i] < a->unique[j]){
     126           0 :                 k =a->bid[i];
     127           0 :                 a->bid[i] = a->bid[j];
     128           0 :                 a->bid[j] = k;
     129             : 
     130           0 :                 b= a->cols[i];
     131           0 :                 a->cols[i] = a->cols[j];
     132           0 :                 a->cols[j] = b;
     133             : 
     134             :                 sample = a->unique[i];
     135           0 :                 a->unique[i] = a->unique[j];
     136           0 :                 a->unique[j] = sample;
     137             :         }
     138          10 : }
     139             : 
     140             : static void
     141          10 : GROUPdelete(AGGRtask *a){
     142          31 :         for(a->last--; a->last>=0; a->last--){
     143          21 :                 BBPunfix(a->cols[a->last]->batCacheid);
     144             :         }
     145          10 :         GDKfree(a->bid);
     146          10 :         GDKfree(a->cols);
     147          10 :         GDKfree(a->unique);
     148          10 :         GDKfree(a);
     149          10 : }
     150             : 
     151             : /*
     152             :  * The groups optimizer takes a grouping sequence and attempts to
     153             :  * minimize the intermediate result.  The choice depends on a good
     154             :  * estimate of intermediate results using properties.
     155             :  */
     156             : 
     157             : static str
     158          10 : GROUPmulticolumngroup(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
     159             : {
     160          10 :         bat *grp = getArgReference_bat(stk, pci, 0);
     161          10 :         bat *ext = getArgReference_bat(stk, pci, 1);
     162          10 :         bat *hist = getArgReference_bat(stk, pci, 2);
     163             :         int i, j;
     164             :         bat oldgrp, oldext, oldhist;
     165             :         str msg = MAL_SUCCEED;
     166             :         BAT *b;
     167             :         BUN count = 0;
     168             :         AGGRtask *aggr;
     169             : 
     170          10 :         aggr = GROUPcollect(cntxt, mb, stk, pci);
     171          10 :         if( aggr == NULL)
     172           0 :                 throw(MAL,"group.multicolumn", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     173          10 :         GROUPcollectSort(aggr, 0, aggr->last);
     174             : 
     175             :         /* (grp,ext,hist) := group.group(..) */
     176             :         /* use the old pattern to perform the incremental grouping */
     177          10 :         *grp = 0;
     178          10 :         *ext = 0;
     179          10 :         *hist = 0;
     180          10 :         msg = GRPgroup1(grp, ext, hist, &aggr->bid[0]);
     181             :         i = 1;
     182          10 :         if (msg == MAL_SUCCEED && aggr->last > 1)
     183             :                 do {
     184             :                         /* early break when there are as many groups as entries */
     185          11 :                         b = BATdescriptor(*hist);
     186          11 :                         if (b) {
     187          11 :                                 j = BATcount(b) == count;
     188          11 :                                 BBPunfix(*hist);
     189          11 :                                 if (j)
     190             :                                         break;
     191             :                         }
     192             : 
     193             :                         /* (grp,ext,hist) := group.subgroup(arg,grp,ext,hist) */
     194          11 :                         oldgrp = *grp;
     195          11 :                         oldext = *ext;
     196          11 :                         oldhist = *hist;
     197          11 :                         *grp = 0;
     198          11 :                         *ext = 0;
     199          11 :                         *hist = 0;
     200          11 :                         msg = GRPsubgroup5(grp, ext, hist, &aggr->bid[i], NULL, &oldgrp, &oldext, &oldhist);
     201          11 :                         BBPrelease(oldgrp);
     202          11 :                         BBPrelease(oldext);
     203          11 :                         BBPrelease(oldhist);
     204          11 :                 } while (msg == MAL_SUCCEED && ++i < aggr->last);
     205          10 :         GROUPdelete(aggr);
     206          10 :         return msg;
     207             : }
     208             : 
     209             : #include "mel.h"
     210             : mel_func groupby_init_funcs[] = {
     211             :  pattern("group", "multicolumn", GROUPmulticolumngroup, false, "Derivation of a group index over multiple columns.", args(3,4, batarg("ref",oid),batarg("grp",oid),batargany("hist",0),batvarargany("b",0))),
     212             :  { .imp=NULL }
     213             : };
     214             : #include "mal_import.h"
     215             : #ifdef _MSC_VER
     216             : #undef read
     217             : #pragma section(".CRT$XCU",read)
     218             : #endif
     219         255 : LIB_STARTUP_FUNC(init_groupby_mal)
     220         255 : { mal_module("groupby", NULL, groupby_init_funcs); }

Generated by: LCOV version 1.14