LCOV - code coverage report
Current view: top level - gdk - gdk_sample.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 83 85 97.6 %
Date: 2021-09-14 19:48:19 Functions: 6 6 100.0 %

          Line data    Source code
       1             : /*
       2             :  * This Source Code Form is subject to the terms of the Mozilla Public
       3             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       4             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       5             :  *
       6             :  * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V.
       7             :  */
       8             : 
       9             : /*
      10             :  * @a Lefteris Sidirourgos, Hannes Muehleisen
      11             :  * @* Low level sample facilities
      12             :  *
      13             :  * This sampling implementation generates a sorted set of OIDs by
      14             :  * calling the random number generator, and uses a binary tree to
      15             :  * eliminate duplicates.  The elements of the tree are then used to
      16             :  * create a sorted sample BAT.  This implementation has a logarithmic
      17             :  * complexity that only depends on the sample size.
      18             :  *
      19             :  * There is a pathological case when the sample size is almost the
      20             :  * size of the BAT.  Then, many collisions occur and performance
      21             :  * degrades. To catch this, we switch to antiset semantics when the
      22             :  * sample size is larger than half the BAT size. Then, we generate the
      23             :  * values that should be omitted from the sample.
      24             :  */
      25             : 
      26             : #include "monetdb_config.h"
      27             : #include "gdk.h"
      28             : #include "gdk_private.h"
      29             : #include "xoshiro256starstar.h"
      30             : 
      31             : /* this is a straightforward implementation of a binary tree */
      32             : struct oidtreenode {
      33             :         union {
      34             :                 struct {        /* use as a binary tree */
      35             :                         oid o;
      36             :                         struct oidtreenode *left;
      37             :                         struct oidtreenode *right;
      38             :                 };
      39             :                 uint64_t r;     /* temporary storage for random numbers */
      40             :         };
      41             : };
      42             : 
      43             : static int
      44     1770377 : OIDTreeMaybeInsert(struct oidtreenode *tree, oid o, BUN allocated)
      45             : {
      46             :         struct oidtreenode **nodep;
      47             : 
      48     1770377 :         if (allocated == 0) {
      49        2506 :                 tree->left = tree->right = NULL;
      50        2506 :                 tree->o = o;
      51        2506 :                 return 1;
      52             :         }
      53             :         nodep = &tree;
      54    19323129 :         while (*nodep) {
      55    17775144 :                 if (o == (*nodep)->o)
      56             :                         return 0;
      57    17555258 :                 if (o < (*nodep)->o)
      58     8762115 :                         nodep = &(*nodep)->left;
      59             :                 else
      60     8793143 :                         nodep = &(*nodep)->right;
      61             :         }
      62     1547985 :         *nodep = &tree[allocated];
      63     1547985 :         tree[allocated].left = tree[allocated].right = NULL;
      64     1547985 :         tree[allocated].o = o;
      65     1547985 :         return 1;
      66             : }
      67             : 
      68             : /* inorder traversal, gives us a sorted BAT */
      69             : static void
      70      588400 : OIDTreeToBAT(struct oidtreenode *node, BAT *bat)
      71             : {
      72     1173496 :         if (node->left != NULL)
      73      586552 :                 OIDTreeToBAT(node->left, bat);
      74     1174219 :         ((oid *) bat->theap->base)[bat->batCount++] = node->o;
      75     1174219 :         if (node->right != NULL )
      76             :                 OIDTreeToBAT(node->right, bat);
      77      589123 : }
      78             : 
      79             : /* Antiset traversal, give us all values but the ones in the tree */
      80             : static void
      81      266633 : OIDTreeToBATAntiset(struct oidtreenode *node, BAT *bat, oid start, oid stop)
      82             : {
      83             :         oid noid;
      84             : 
      85      532144 :         if (node->left != NULL)
      86      265765 :                 OIDTreeToBATAntiset(node->left, bat, start, node->o);
      87             :         else
      88      837854 :                 for (noid = start; noid < node->o; noid++)
      89      571475 :                         ((oid *) bat->theap->base)[bat->batCount++] = noid;
      90             : 
      91      531705 :         if (node->right != NULL)
      92      265511 :                 OIDTreeToBATAntiset(node->right, bat, node->o + 1, stop);
      93             :         else
      94      837119 :                 for (noid = node->o+1; noid < stop; noid++)
      95      570925 :                         ((oid *) bat->theap->base)[bat->batCount++] = noid;
      96      266194 : }
      97             : 
      98             : static BAT *
      99       14555 : do_batsample(BAT *b, BUN n, random_state_engine rse, MT_Lock *lock)
     100             : {
     101             :         BAT *bn;
     102             :         BUN cnt, slen;
     103             :         BUN rescnt;
     104             :         struct oidtreenode *tree = NULL;
     105             : 
     106       14555 :         BATcheck(b, NULL);
     107       14555 :         ERRORcheck(n > BUN_MAX, "sample size larger than BUN_MAX\n", NULL);
     108       14555 :         cnt = BATcount(b);
     109             :         /* empty sample size */
     110       14555 :         if (n == 0) {
     111           2 :                 bn = BATdense(0, 0, 0);
     112       14553 :         } else if (cnt <= n) {
     113             :                 /* sample size is larger than the input BAT, return
     114             :                  * all oids */
     115       12033 :                 bn = BATdense(0, b->hseqbase, cnt);
     116             :         } else {
     117        2520 :                 oid minoid = b->hseqbase;
     118        2520 :                 oid maxoid = b->hseqbase + cnt;
     119             : 
     120             :                 /* if someone samples more than half of our tree, we
     121             :                  * do the antiset */
     122        2520 :                 bool antiset = n > cnt / 2;
     123             :                 slen = n;
     124        2520 :                 if (antiset)
     125        1210 :                         n = cnt - n;
     126             : 
     127        2520 :                 tree = GDKmalloc(n * sizeof(struct oidtreenode));
     128        2548 :                 if (tree == NULL) {
     129             :                         return NULL;
     130             :                 }
     131        2548 :                 bn = COLnew(0, TYPE_oid, slen, TRANSIENT);
     132        2519 :                 if (bn == NULL) {
     133           0 :                         GDKfree(tree);
     134           0 :                         return NULL;
     135             :                 }
     136             : 
     137             :                 /* generate a list of random numbers; note we use the
     138             :                  * "tree" array, but we use the value from each location
     139             :                  * before it is overwritten by the use as part of the
     140             :                  * binary tree */
     141        2519 :                 if (lock) {
     142           1 :                         MT_lock_set(lock);
     143         301 :                         for (rescnt = 0; rescnt < n; rescnt++)
     144         300 :                                 tree[rescnt].r = next(rse);
     145           1 :                         MT_lock_unset(lock);
     146             :                 } else {
     147     1773611 :                         for (rescnt = 0; rescnt < n; rescnt++)
     148     1771099 :                                 tree[rescnt].r = next(rse);
     149             :                 }
     150             : 
     151             :                 /* while we do not have enough sample OIDs yet */
     152             :                 BUN rnd = 0;
     153     1551932 :                 for (rescnt = 0; rescnt < n; rescnt++) {
     154             :                         oid candoid;
     155             :                         do {
     156     1735242 :                                 if (rnd == n) {
     157             :                                         /* we ran out of random numbers,
     158             :                                          * so generate more */
     159        8895 :                                         if (lock)
     160           2 :                                                 MT_lock_set(lock);
     161      243535 :                                         for (rnd = rescnt; rnd < n; rnd++)
     162      234617 :                                                 tree[rnd].r = next(rse);
     163        8918 :                                         if (lock)
     164           2 :                                                 MT_lock_unset(lock);
     165             :                                         rnd = rescnt;
     166             :                                 }
     167     1735265 :                                 candoid = minoid + tree[rnd++].r % cnt;
     168             :                                 /* if that candidate OID was already
     169             :                                  * generated, try again */
     170     1735265 :                         } while (!OIDTreeMaybeInsert(tree, candoid, rescnt));
     171             :                 }
     172        2545 :                 if (!antiset) {
     173        1330 :                         OIDTreeToBAT(tree, bn);
     174             :                 } else {
     175        1215 :                         OIDTreeToBATAntiset(tree, bn, minoid, maxoid);
     176             :                 }
     177        2551 :                 GDKfree(tree);
     178             : 
     179        2553 :                 BATsetcount(bn, slen);
     180        2553 :                 bn->trevsorted = bn->batCount <= 1;
     181        2553 :                 bn->tsorted = true;
     182        2553 :                 bn->tkey = true;
     183        2553 :                 bn->tseqbase = bn->batCount == 0 ? 0 : bn->batCount == 1 ? *(oid *) Tloc(bn, 0) : oid_nil;
     184             :         }
     185       14535 :         TRC_DEBUG(ALGO, "BATsample(" ALGOBATFMT "," BUNFMT ")="
     186             :                   ALGOOPTBATFMT "\n",
     187             :                   ALGOBATPAR(b), n, ALGOOPTBATPAR(bn));
     188             :         return bn;
     189             : }
     190             : 
     191             : /* BATsample implements sampling for BATs */
     192             : BAT *
     193       14520 : BATsample_with_seed(BAT *b, BUN n, uint64_t seed)
     194             : {
     195             :         random_state_engine rse;
     196             : 
     197       14520 :         init_random_state_engine(rse, seed);
     198             : 
     199       14525 :         return do_batsample(b, n, rse, NULL);
     200             : }
     201             : 
     202             : static MT_Lock rse_lock = MT_LOCK_INITIALIZER(rse_lock);
     203             : BAT *
     204          28 : BATsample(BAT *b, BUN n)
     205             : {
     206             :         static random_state_engine rse;
     207             : 
     208          28 :         MT_lock_set(&rse_lock);
     209          28 :         if (rse[0] == 0 && rse[1] == 0 && rse[2] == 0 && rse[3] == 0)
     210           5 :                 init_random_state_engine(rse, (uint64_t) GDKusec());
     211          28 :         MT_lock_unset(&rse_lock);
     212          28 :         return do_batsample(b, n, rse, &rse_lock);
     213             : }

Generated by: LCOV version 1.14