LCOV - code coverage report
Current view: top level - gdk - gdk_sample.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 85 87 97.7 %
Date: 2020-06-29 20:00:14 Functions: 6 6 100.0 %

          Line data    Source code
       1             : /*
       2             :  * This Source Code Form is subject to the terms of the Mozilla Public
       3             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       4             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       5             :  *
       6             :  * Copyright 1997 - July 2008 CWI, August 2008 - 2020 MonetDB B.V.
       7             :  */
       8             : 
       9             : /*
      10             :  * @a Lefteris Sidirourgos, Hannes Muehleisen
      11             :  * @* Low level sample facilities
      12             :  *
      13             :  * This sampling implementation generates a sorted set of OIDs by
      14             :  * calling the random number generator, and uses a binary tree to
      15             :  * eliminate duplicates.  The elements of the tree are then used to
      16             :  * create a sorted sample BAT.  This implementation has a logarithmic
      17             :  * complexity that only depends on the sample size.
      18             :  *
      19             :  * There is a pathological case when the sample size is almost the
      20             :  * size of the BAT.  Then, many collisions occur and performance
      21             :  * degrades. To catch this, we switch to antiset semantics when the
      22             :  * sample size is larger than half the BAT size. Then, we generate the
      23             :  * values that should be omitted from the sample.
      24             :  */
      25             : 
      26             : #include "monetdb_config.h"
      27             : #include "gdk.h"
      28             : #include "gdk_private.h"
      29             : #include "xoshiro256starstar.h"
      30             : 
      31             : /* this is a straightforward implementation of a binary tree */
      32             : struct oidtreenode {
      33             :         oid o;
      34             :         struct oidtreenode *left;
      35             :         struct oidtreenode *right;
      36             : };
      37             : 
      38             : static int
      39      411240 : OIDTreeMaybeInsert(struct oidtreenode *tree, oid o, BUN allocated)
      40             : {
      41      411240 :         struct oidtreenode **nodep;
      42             : 
      43      411240 :         if (allocated == 0) {
      44         207 :                 tree->left = tree->right = NULL;
      45         207 :                 tree->o = o;
      46         207 :                 return 1;
      47             :         }
      48             :         nodep = &tree;
      49     5483590 :         while (*nodep) {
      50     5090200 :                 if (o == (*nodep)->o)
      51             :                         return 0;
      52     5072560 :                 if (o < (*nodep)->o)
      53     2547060 :                         nodep = &(*nodep)->left;
      54             :                 else
      55     2525500 :                         nodep = &(*nodep)->right;
      56             :         }
      57      393394 :         *nodep = &tree[allocated];
      58      393394 :         tree[allocated].left = tree[allocated].right = NULL;
      59      393394 :         tree[allocated].o = o;
      60      393394 :         return 1;
      61             : }
      62             : 
      63             : /* inorder traversal, gives us a sorted BAT */
      64             : static void
      65      187826 : OIDTreeToBAT(struct oidtreenode *node, BAT *bat)
      66             : {
      67      375146 :         if (node->left != NULL)
      68      187636 :                 OIDTreeToBAT(node->left, bat);
      69      375146 :         ((oid *) bat->theap.base)[bat->batCount++] = node->o;
      70      375146 :         if (node->right != NULL )
      71             :                 OIDTreeToBAT(node->right, bat);
      72      187826 : }
      73             : 
      74             : /* Antiset traversal, give us all values but the ones in the tree */
      75             : static void
      76        9247 : OIDTreeToBATAntiset(struct oidtreenode *node, BAT *bat, oid start, oid stop)
      77             : {
      78       18455 :         oid noid;
      79             : 
      80       18455 :         if (node->left != NULL)
      81        9230 :                 OIDTreeToBATAntiset(node->left, bat, start, node->o);
      82             :         else
      83       22653 :                 for (noid = start; noid < node->o; noid++)
      84       13428 :                         ((oid *) bat->theap.base)[bat->batCount++] = noid;
      85             : 
      86       18455 :         if (node->right != NULL)
      87        9208 :                 OIDTreeToBATAntiset(node->right, bat, node->o + 1, stop);
      88             :         else
      89       22563 :                 for (noid = node->o+1; noid < stop; noid++)
      90       13316 :                         ((oid *) bat->theap.base)[bat->batCount++] = noid;
      91        9247 : }
      92             : 
      93             : static BAT *
      94         713 : do_batsample(BAT *b, BUN n, random_state_engine rse, MT_Lock *lock)
      95             : {
      96         713 :         BAT *bn;
      97         713 :         BUN cnt, slen;
      98         713 :         BUN rescnt;
      99         713 :         struct oidtreenode *tree = NULL;
     100             : 
     101         713 :         BATcheck(b, NULL);
     102         713 :         ERRORcheck(n > BUN_MAX, "sample size larger than BUN_MAX\n", NULL);
     103         713 :         cnt = BATcount(b);
     104             :         /* empty sample size */
     105         713 :         if (n == 0) {
     106           2 :                 bn = BATdense(0, 0, 0);
     107         711 :         } else if (cnt <= n) {
     108             :                 /* sample size is larger than the input BAT, return
     109             :                  * all oids */
     110         504 :                 bn = BATdense(0, b->hseqbase, cnt);
     111             :         } else {
     112         207 :                 oid minoid = b->hseqbase;
     113         207 :                 oid maxoid = b->hseqbase + cnt;
     114             : 
     115             :                 /* if someone samples more than half of our tree, we
     116             :                  * do the antiset */
     117         207 :                 bool antiset = n > cnt / 2;
     118         207 :                 slen = n;
     119         207 :                 if (antiset)
     120          17 :                         n = cnt - n;
     121             : 
     122         207 :                 tree = GDKmalloc(n * sizeof(struct oidtreenode));
     123         207 :                 if (tree == NULL) {
     124             :                         return NULL;
     125             :                 }
     126         207 :                 bn = COLnew(0, TYPE_oid, slen, TRANSIENT);
     127         207 :                 if (bn == NULL) {
     128           0 :                         GDKfree(tree);
     129           0 :                         return NULL;
     130             :                 }
     131             : 
     132             :                 /* while we do not have enough sample OIDs yet */
     133         207 :                 if (lock)       /* serialize access to random state engine */
     134         197 :                         MT_lock_set(lock);
     135      393808 :                 for (rescnt = 0; rescnt < n; rescnt++) {
     136      411240 :                         oid candoid;
     137      411240 :                         do {
     138      411240 :                                 candoid = minoid + next(rse) % cnt;
     139             :                                 /* if that candidate OID was already
     140             :                                  * generated, try again */
     141      411240 :                         } while (!OIDTreeMaybeInsert(tree, candoid, rescnt));
     142             :                 }
     143         207 :                 if (lock)
     144         197 :                         MT_lock_unset(lock);
     145         207 :                 if (!antiset) {
     146         190 :                         OIDTreeToBAT(tree, bn);
     147             :                 } else {
     148          17 :                         OIDTreeToBATAntiset(tree, bn, minoid, maxoid);
     149             :                 }
     150         207 :                 GDKfree(tree);
     151             : 
     152         207 :                 BATsetcount(bn, slen);
     153         207 :                 bn->trevsorted = bn->batCount <= 1;
     154         207 :                 bn->tsorted = true;
     155         207 :                 bn->tkey = true;
     156         207 :                 bn->tseqbase = bn->batCount == 0 ? 0 : bn->batCount == 1 ? *(oid *) Tloc(bn, 0) : oid_nil;
     157             :         }
     158         713 :         TRC_DEBUG(ALGO, "BATsample(" ALGOBATFMT "," BUNFMT ")="
     159             :                   ALGOOPTBATFMT "\n",
     160             :                   ALGOBATPAR(b), n, ALGOOPTBATPAR(bn));
     161             :         return bn;
     162             : }
     163             : 
     164             : /* BATsample implements sampling for BATs */
     165             : BAT *
     166          10 : BATsample_with_seed(BAT *b, BUN n, unsigned seed)
     167             : {
     168          10 :         random_state_engine rse;
     169             : 
     170          10 :         init_random_state_engine(rse, (uint64_t) seed);
     171             : 
     172          10 :         return do_batsample(b, n, rse, NULL);
     173             : }
     174             : 
     175             : BAT *
     176         703 : BATsample(BAT *b, BUN n)
     177             : {
     178         703 :         static random_state_engine rse;
     179         703 :         static MT_Lock rse_lock = MT_LOCK_INITIALIZER("rse_lock");
     180             : 
     181         703 :         MT_lock_set(&rse_lock);
     182         703 :         if (rse[0] == 0 && rse[1] == 0 && rse[2] == 0 && rse[3] == 0)
     183         130 :                 init_random_state_engine(rse, (uint64_t) GDKusec());
     184         703 :         MT_lock_unset(&rse_lock);
     185         703 :         return do_batsample(b, n, rse, &rse_lock);
     186             : }

Generated by: LCOV version 1.14