LCOV - code coverage report
Current view: top level - monetdb5/modules/mal - sample.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 22 24 91.7 %
Date: 2021-01-13 20:07:21 Functions: 2 2 100.0 %

          Line data    Source code
       1             : /*
       2             :  * This Source Code Form is subject to the terms of the Mozilla Public
       3             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       4             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       5             :  *
       6             :  * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V.
       7             :  */
       8             : 
       9             : /*
      10             :  * @a Lefteris Sidirourgos
      11             :  * @d 30/08/2011
      12             :  * @+ The sampling facilities
      13             :  *
      14             :  * In the context of the SciBORQ project, we introduce a number of sampling
      15             :  * techniques in the MonetDB software stack. Our goal is to provide methods
      16             :  * for performing sampling (uniform and weighted) over a) the result of a
      17             :  * query, b) the base tables, and c) the entire database schema. Sampling
      18             :  * can be performed during query execution, as well as during data loading in
      19             :  * the case of predefined sampling indexes. In addition to the sampling
      20             :  * methods, a number of query plan optimisations for sampling are introduced on
      21             :  * the SQL and MAL level.
      22             :  *
      23             :  * Besides the sampling methods, SciBORQ also aims at multi-layered bounded
      24             :  * query execution. That is, steering query execution over many layers of
      25             :  * samples with different sizes in order to achieve either strict error bounds
      26             :  * or limited execution time. For more details see the SciBORQ module.
      27             :  *
      28             :  * In the following, details are presented on the implementation and the usage
      29             :  * of each sampling method.
      30             :  */
      31             : 
      32             : #include "monetdb_config.h"
      33             : #include "gdk.h"
      34             : #include "mal_exception.h"
      35             : #include "mal_interpreter.h"
      36             : 
      37             : // TODO: Go through this documentation and update it with an explanation about seeds.
      38             : /*
      39             :  * @- Uniform Sampling.
      40             :  *
      41             :  * A new SQL operator has been added to support sampling the result of a query.
      42             :  * The syntax for sampling is:
      43             :  * SELECT ... FROM ... WHERE ... SAMPLE s
      44             :  *
      45             :  * where, if s is an integer greater than 1, it defines the number of rows to be
      46             :  * in the sample. If s is a double between [0.0,1.0] then it refers to the
      47             :  * percentage of the result to be sampled. That is if s=0.3 then the sample
      48             :  * will be 30% the size of the query result.
      49             :  *
      50             :  * SAMPLE is treated like LIMIT, ORDER BY, etc., which means that it can only
      51             :  * be in the outermost SELECT clause, i.e., SAMPLE cannot appear in a
      52             :  * subquery. However, if this is needed, then one may define a function, for
      53             :  * example
      54             :  *
      55             :  * CREATE FUNCTION mysample ()
      56             :  * RETURNS TABLE(col a,...)
      57             :  * BEGIN
      58             :  *    RETURN
      59             :  *      SELECT a,...
      60             :  *      FROM name_table
      61             :  *      SAMPLE 100;
      62             :  * end;
      63             :  *
      64             :  * and then use function mysample() for example to populate a new table with
      65             :  * the sample. E.g.,
      66             :  *
      67             :  * INSERT INTO sample_table (SELECT * FROM mysample());
      68             :  *
      69             :  */
      70             : 
      71             : static str
      72          18 : SAMPLEuniform(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) {
      73             :         /* Implements sample.subuniform: samples the BAT given as argument 1 and stores the sample's BAT id in result argument 0; cntxt is unused. */
      74             :         bat *r, *b;
      75             :         lng sample_size;
      76             :         unsigned seed;
      77             :         (void) cntxt;
      78             : 
      79             :         BAT *br, *bb;
      80             : 
      81          18 :         r = getArgReference_bat(stk, pci, 0);
      82          18 :         b = getArgReference_bat(stk, pci, 1);
      83             :         /* Obtain the input BAT; from here on every exit path calls BBPunfix on it. */
      84          18 :         if ((bb = BATdescriptor(*b)) == NULL) {
      85           0 :                 throw(MAL, "sample.subuniform", INTERNAL_BAT_ACCESS);
      86             :         }
      87             :         /* Argument 2 is either a fraction p (dbl), scaled by BATcount(bb), or an absolute sample size (lng). */
      88          18 :         if (getArgType(mb, pci, 2) == TYPE_dbl)
      89             :         {
      90           8 :                 dbl pr = *getArgReference_dbl(stk, pci, 2);
      91             :                 /* NOTE(review): if dbl is an IEEE double, a NaN pr fails both range tests and the == 0 test and reaches the (lng) conversion below - confirm whether nil/NaN must be rejected here. */
      92           8 :                 if ( pr < 0.0 || pr > 1.0 ) {
      93           1 :                         BBPunfix(bb->batCacheid);
      94           1 :                         throw(MAL, "sample.subuniform", ILLEGAL_ARGUMENT
      95             :                                         " p should be between 0 and 1.0" );
      96           7 :                 } else if (pr == 0) {/* special case: p of zero gives a sample size of zero without consulting the BAT count */
      97             :                         sample_size = 0;
      98             :                         // TODO: Add special case for pr == 1.0.
      99             :                 } else {
     100           6 :                         sample_size = (lng) (pr*(double)BATcount(bb));
     101             :                 }
     102             :         } else {
     103          10 :                 sample_size = *getArgReference_lng(stk, pci, 2);
     104             :         }
     105             :         /* With four arguments the last one is the seed. NOTE(review): sample_size is cast to BUN unchecked; presumably a negative lng would wrap to a huge count - confirm. */
     106          17 :         if (pci->argc == 4) {
     107          10 :                 seed = (unsigned) *getArgReference_int(stk, pci, 3);
     108          10 :                 br = BATsample_with_seed(bb, (BUN) sample_size, seed);
     109             :         }
     110             :         else {
     111           7 :                 br = BATsample(bb, (BUN) sample_size);
     112             :         }
     113             :         /* The input BAT is released before the sampling result is inspected. */
     114          17 :         BBPunfix(bb->batCacheid);
     115          17 :         if (br == NULL)
     116           0 :                 throw(MAL, "sample.subuniform", OPERATION_FAILED);
     117             :         /* Write the sample's BAT id into result argument 0 and report success. */
     118          17 :         BBPkeepref(*r = br->batCacheid);
     119          17 :         return MAL_SUCCEED;
     120             : }
     121             : 
     122             : #include "mel.h"
     123             : mel_func sample_init_funcs[] = { /* four signatures of sample.subuniform (lng size or dbl fraction, each with and without an int seed), all implemented by SAMPLEuniform */
     124             :  pattern("sample", "subuniform", SAMPLEuniform, false, "Returns the oids of a uniform sample of size s", args(1,3, batarg("",oid),batargany("b",0),arg("sample_size",lng))),
     125             :  pattern("sample", "subuniform", SAMPLEuniform, false, "Returns the oids of a uniform sample of size s and where the prg is seeded with sample_seed", args(1,4, batarg("",oid),batargany("b",0),arg("sample_size",lng),arg("sample_seed",int))),
     126             :  pattern("sample", "subuniform", SAMPLEuniform, false, "Returns the oids of a uniform sample of size = (p x count(b)), where 0 <= p <= 1.0", args(1,3, batarg("",oid),batargany("b",0),arg("p",dbl))),
     127             :  pattern("sample", "subuniform", SAMPLEuniform, false, "Returns the oids of a uniform sample of size = (p x count(b)), where 0 <= p <= 1.0 and where the prg is seeded with sample_seed", args(1,4, batarg("",oid),batargany("b",0),arg("p",dbl),arg("sample_seed",int))),
     128             :  { .imp=NULL } /* NOTE(review): presumably the end-of-table marker - confirm against mel.h */
     129             : };
     130             : #include "mal_import.h"
     131             : #ifdef _MSC_VER
     132             : #undef read
     133             : #pragma section(".CRT$XCU",read)
     134             : #endif
     135         255 : LIB_STARTUP_FUNC(init_sample_mal)
     136         255 : { mal_module("sample", NULL, sample_init_funcs); } /* passes the sample_init_funcs table to mal_module() under the module name "sample" */

Generated by: LCOV version 1.14