LCOV - code coverage report
Current view: top level - gdk - gdk_hash.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 469 614 76.4 %
Date: 2020-06-29 20:00:14 Functions: 14 18 77.8 %

          Line data    Source code
       1             : /*
       2             :  * This Source Code Form is subject to the terms of the Mozilla Public
       3             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       4             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       5             :  *
       6             :  * Copyright 1997 - July 2008 CWI, August 2008 - 2020 MonetDB B.V.
       7             :  */
       8             : 
       9             : /*
      10             :  * - Hash Table Creation
      11             :  * The hash indexing scheme for BATs reserves a block of memory to
      12             :  * maintain the hash table and a collision list. A one-to-one mapping
      13             :  * exists between the BAT and the collision list using the BUN
      14             :  * index. NOTE: we alloc the link list as a parallel array to the BUN
      15             :  * array; hence the hash link array has the same size as
      16             :  * BATcapacity(b) (not BATcount(b)). This allows us in the BUN insert
      17             :  * and delete to assume that there is hash space iff there is BUN
      18             :  * space.
      19             :  *
      20             :  * The hash mask size is a power of two, so we can do bitwise AND on
      21             :  * the hash (integer) number to quickly find the head of the bucket
      22             :  * chain.  Clearly, the hash mask size is a crucial parameter. If we
      23             :  * know that the column is unique (tkey), we use direct hashing (mask
      24             :  * size ~= BATcount). Otherwise we dynamically determine the mask size
      25             :  * by starting out with mask size = BATcount/64 (just 1.5% of memory
      26             :  * storage overhead). Then we start building the hash table on the
      27             :  * first 25% of the BAT. As we aim for max-collisions list length of
      28             :  * 4, the list on 25% should not exceed length 1. So, if a small
      29             :  * number of collisssions occurs (mask/2) then we abandon the attempt
      30             :  * and restart with a mask that is 4 times larger. This converges
      31             :  * after three cycles to direct hashing.
      32             :  */
      33             : 
      34             : #include "monetdb_config.h"
      35             : #include "gdk.h"
      36             : #include "gdk_private.h"
      37             : 
      38             : static uint8_t
      39      335549 : HASHwidth(BUN hashsize)
      40             : {
      41      335549 :         if (hashsize <= (BUN) BUN2_NONE)
      42             :                 return BUN2;
      43             : #ifdef BUN8
      44         784 :         if (hashsize <= (BUN) BUN4_NONE)
      45         784 :                 return BUN4;
      46             :         return BUN8;
      47             : #else
      48             :         return BUN4;
      49             : #endif
      50             : }
      51             : 
      52             : static inline BUN
      53       34421 : hashmask(BUN m)
      54             : {
      55       34421 :         m |= m >> 1;
      56       34421 :         m |= m >> 2;
      57       34421 :         m |= m >> 4;
      58       34421 :         m |= m >> 8;
      59       34421 :         m |= m >> 16;
      60             : #if SIZEOF_BUN == 8
      61       34421 :         m |= m >> 32;
      62             : #endif
      63       34421 :         return m;
      64             : }
      65             : 
      66             : BUN
      67       89096 : HASHmask(BUN cnt)
      68             : {
      69       89096 :         BUN m = cnt;
      70             : 
      71             : #if 0
      72             :         /* find largest power of 2 smaller than or equal to cnt */
      73             :         m = hashmask(m);
      74             :         m -= m >> 1;
      75             : 
      76             :         /* if cnt is more than 1/3 into the gap between m and 2*m,
      77             :            double m */
      78             :         if (m + m - cnt < 2 * (cnt - m))
      79             :                 m += m;
      80             : #else
      81       89096 :         m = m * 8 / 7;
      82             : #endif
      83       74366 :         if (m < BATTINY)
      84       56499 :                 m = BATTINY;
      85       74722 :         return m;
      86             : }
      87             : 
      88             : static inline void
      89       92892 : HASHclear(Hash *h)
      90             : {
      91             :         /* since BUN2_NONE, BUN4_NONE, BUN8_NONE
      92             :          * are all equal to -1 (~0), i.e., have all bits set,
      93             :          * we can use a simple memset() to clear the Hash,
      94             :          * rather than iteratively assigning individual
      95             :          * BUNi_NONE values in a for-loop
      96             :          */
      97       92892 :         memset(h->Bckt, 0xFF, h->nbucket * h->width);
      98             : }
      99             : 
     100             : #define HASH_VERSION            4
     101             : /* this is only for the change of hash function of the UUID type; if
     102             :  * HASH_VERSION is increased again from 4, the code associated with
     103             :  * HASH_VERSION_NOUUID must be deleted */
     104             : #define HASH_VERSION_NOUUID     3
     105             : #define HASH_HEADER_SIZE        7       /* nr of size_t fields in header */
     106             : 
     107             : static void
     108       11964 : doHASHdestroy(BAT *b, Hash *hs)
     109             : {
     110       11964 :         if (hs == (Hash *) 1) {
     111         381 :                 GDKunlink(BBPselectfarm(b->batRole, b->ttype, hashheap),
     112             :                           BATDIR,
     113         381 :                           BBP_physical(b->batCacheid),
     114             :                           "thashl");
     115         381 :                 GDKunlink(BBPselectfarm(b->batRole, b->ttype, hashheap),
     116             :                           BATDIR,
     117         381 :                           BBP_physical(b->batCacheid),
     118             :                           "thashb");
     119       11583 :         } else if (hs) {
     120       11583 :                 bat p = VIEWtparent(b);
     121       11583 :                 BAT *hp = NULL;
     122             : 
     123       11583 :                 if (p)
     124         519 :                         hp = BBP_cache(p);
     125             : 
     126         519 :                 if (!hp || hs != hp->thash) {
     127       11583 :                         TRC_DEBUG(ACCELERATOR, ALGOBATFMT ": removing%s hash\n", ALGOBATPAR(b), *(size_t *) hs->heapbckt.base & (1 << 24) ? " persisted" : "");
     128       11583 :                         HEAPfree(&hs->heapbckt, true);
     129       11585 :                         HEAPfree(&hs->heaplink, true);
     130       11585 :                         GDKfree(hs);
     131             :                 }
     132             :         }
     133       11966 : }
     134             : 
     135             : gdk_return
     136       92892 : HASHnew(Hash *h, int tpe, BUN size, BUN mask, BUN count, bool bcktonly)
     137             : {
     138       92892 :         if (h->width == 0)
     139       78292 :                 h->width = HASHwidth(size);
     140             : 
     141       92892 :         if (!bcktonly) {
     142       77592 :                 if (HEAPalloc(&h->heaplink, size, h->width) != GDK_SUCCEED)
     143             :                         return GDK_FAIL;
     144       77594 :                 h->heaplink.free = size * h->width;
     145       77594 :                 h->Link = h->heaplink.base;
     146             :         }
     147       92894 :         if (HEAPalloc(&h->heapbckt, mask + HASH_HEADER_SIZE * SIZEOF_SIZE_T / h->width, h->width) != GDK_SUCCEED)
     148             :                 return GDK_FAIL;
     149       92892 :         h->heapbckt.free = mask * h->width + HASH_HEADER_SIZE * SIZEOF_SIZE_T;
     150       92892 :         h->nbucket = mask;
     151       92892 :         if (mask & (mask - 1)) {
     152       32434 :                 h->mask2 = hashmask(mask);
     153       32434 :                 h->mask1 = h->mask2 >> 1;
     154             :         } else {
     155             :                 /* mask is a power of two */
     156       60458 :                 h->mask1 = mask - 1;
     157       60458 :                 h->mask2 = h->mask1 << 1 | 1;
     158             :         }
     159       92892 :         switch (h->width) {
     160       92028 :         case BUN2:
     161       92028 :                 h->nil = (BUN) BUN2_NONE;
     162       92028 :                 break;
     163         864 :         case BUN4:
     164         864 :                 h->nil = (BUN) BUN4_NONE;
     165         864 :                 break;
     166             : #ifdef BUN8
     167           0 :         case BUN8:
     168           0 :                 h->nil = (BUN) BUN8_NONE;
     169           0 :                 break;
     170             : #endif
     171             :         default:
     172           0 :                 assert(0);
     173             :         }
     174       92892 :         h->Bckt = h->heapbckt.base + HASH_HEADER_SIZE * SIZEOF_SIZE_T;
     175       92892 :         h->type = tpe;
     176       92892 :         HASHclear(h);           /* zero the mask */
     177       92892 :         ((size_t *) h->heapbckt.base)[0] = (size_t) HASH_VERSION;
     178       92892 :         ((size_t *) h->heapbckt.base)[1] = (size_t) size;
     179       92892 :         ((size_t *) h->heapbckt.base)[2] = (size_t) h->nbucket;
     180       92892 :         ((size_t *) h->heapbckt.base)[3] = (size_t) h->width;
     181       92892 :         ((size_t *) h->heapbckt.base)[4] = (size_t) count;
     182       92892 :         ((size_t *) h->heapbckt.base)[5] = (size_t) h->nunique;
     183       92892 :         ((size_t *) h->heapbckt.base)[6] = (size_t) h->nheads;
     184       92892 :         TRC_DEBUG(ACCELERATOR,
     185             :                   "create hash(size " BUNFMT ", mask " BUNFMT ", width %d, total " BUNFMT " bytes);\n", size, mask, h->width, (size + mask) * h->width);
     186             :         return GDK_SUCCEED;
     187             : }
     188             : 
     189             : /* collect HASH statistics for analysis */
     190             : static void
     191           0 : HASHcollisions(BAT *b, Hash *h, const char *func)
     192             : {
     193           0 :         lng cnt, entries = 0, max = 0;
     194           0 :         double total = 0;
     195           0 :         BUN p, i, j, nil;
     196             : 
     197           0 :         if (b == 0 || h == 0)
     198             :                 return;
     199           0 :         nil = HASHnil(h);
     200           0 :         for (i = 0, j = h->nbucket; i < j; i++)
     201           0 :                 if ((p = HASHget(h, i)) != nil) {
     202           0 :                         entries++;
     203           0 :                         cnt = 0;
     204           0 :                         for (; p != nil; p = HASHgetlink(h, p))
     205           0 :                                 cnt++;
     206           0 :                         if (cnt > max)
     207             :                                 max = cnt;
     208           0 :                         total += cnt;
     209             :                 }
     210           0 :         TRC_DEBUG_ENDIF(ACCELERATOR,
     211             :                         "%s(" ALGOBATFMT "): statistics " BUNFMT ", "
     212             :                         "entries " LLFMT ", nunique " BUNFMT ", "
     213             :                         "nbucket " BUNFMT ", max " LLFMT ", avg %2.6f;\n",
     214             :                         func, ALGOBATPAR(b), BATcount(b), entries,
     215             :                         h->nunique, h->nbucket, max,
     216             :                         entries == 0 ? 0 : total / entries);
     217             : }
     218             : 
     219             : static gdk_return
     220           0 : HASHupgradehashheap(BAT *b)
     221             : {
     222           0 :         Hash *h = b->thash;
     223           0 :         int nwidth = h->width << 1;
     224           0 :         BUN i;
     225             : 
     226           0 :         assert(nwidth <= SIZEOF_BUN);
     227           0 :         assert((nwidth & (nwidth - 1)) == 0);
     228             : 
     229           0 :         if (HEAPextend(&h->heaplink, h->heaplink.size * nwidth / h->width, true) != GDK_SUCCEED ||
     230           0 :             HEAPextend(&h->heapbckt, (h->heapbckt.size - HASH_HEADER_SIZE * SIZEOF_SIZE_T) * nwidth / h->width + HASH_HEADER_SIZE * SIZEOF_SIZE_T, true) != GDK_SUCCEED) {
     231           0 :                 b->thash = NULL;
     232           0 :                 doHASHdestroy(b, h);
     233           0 :                 return GDK_FAIL;
     234             :         }
     235           0 :         h->Link = h->heaplink.base;
     236           0 :         h->Bckt = h->heapbckt.base + HASH_HEADER_SIZE * SIZEOF_SIZE_T;
     237           0 :         switch (nwidth) {
     238           0 :         case BUN4:
     239           0 :                 switch (h->width) {
     240           0 :                 case BUN2:
     241           0 :                         i = h->heaplink.free / h->width;
     242           0 :                         h->heaplink.free = i * nwidth;
     243           0 :                         while (i > 0) {
     244           0 :                                 i--;
     245           0 :                                 BUN2type v = ((BUN2type *) h->Link)[i];
     246           0 :                                 ((BUN4type *) h->Link)[i] = v == BUN2_NONE ? BUN4_NONE : v;
     247             :                         }
     248           0 :                         i = (h->heapbckt.free - HASH_HEADER_SIZE * SIZEOF_SIZE_T) / h->width;
     249           0 :                         h->heapbckt.free = HASH_HEADER_SIZE * SIZEOF_SIZE_T + i * nwidth;
     250           0 :                         while (i > 0) {
     251           0 :                                 i--;
     252           0 :                                 BUN2type v = ((BUN2type *) h->Bckt)[i];
     253           0 :                                 ((BUN4type *) h->Bckt)[i] = v == BUN2_NONE ? BUN4_NONE : v;
     254             :                         }
     255             :                         break;
     256             :                 }
     257           0 :                 h->nil = BUN4_NONE;
     258           0 :                 break;
     259             : #ifdef BUN8
     260           0 :         case BUN8:
     261           0 :                 switch (h->width) {
     262           0 :                 case BUN2:
     263           0 :                         i = h->heaplink.free / h->width;
     264           0 :                         h->heaplink.free = i * nwidth;
     265           0 :                         while (i > 0) {
     266           0 :                                 i--;
     267           0 :                                 BUN2type v = ((BUN2type *) h->Link)[i];
     268           0 :                                 ((BUN8type *) h->Link)[i] = v == BUN2_NONE ? BUN8_NONE : v;
     269             :                         }
     270           0 :                         i = (h->heapbckt.free - HASH_HEADER_SIZE * SIZEOF_SIZE_T) / h->width;
     271           0 :                         h->heapbckt.free = HASH_HEADER_SIZE * SIZEOF_SIZE_T + i * nwidth;
     272           0 :                         while (i > 0) {
     273           0 :                                 i--;
     274           0 :                                 BUN2type v = ((BUN2type *) h->Bckt)[i];
     275           0 :                                 ((BUN8type *) h->Bckt)[i] = v == BUN2_NONE ? BUN8_NONE : v;
     276             :                         }
     277             :                         break;
     278           0 :                 case BUN4:
     279           0 :                         i = h->heaplink.free / h->width;
     280           0 :                         h->heaplink.free = i * nwidth;
     281           0 :                         while (i > 0) {
     282           0 :                                 i--;
     283           0 :                                 BUN4type v = ((BUN4type *) h->Link)[i];
     284           0 :                                 ((BUN8type *) h->Link)[i] = v == BUN4_NONE ? BUN8_NONE : v;
     285             :                         }
     286           0 :                         i = (h->heapbckt.free - HASH_HEADER_SIZE * SIZEOF_SIZE_T) / h->width;
     287           0 :                         h->heapbckt.free = HASH_HEADER_SIZE * SIZEOF_SIZE_T + i * nwidth;
     288           0 :                         while (i > 0) {
     289           0 :                                 i--;
     290           0 :                                 BUN4type v = ((BUN4type *) h->Bckt)[i];
     291           0 :                                 ((BUN8type *) h->Bckt)[i] = v == BUN4_NONE ? BUN8_NONE : v;
     292             :                         }
     293             :                         break;
     294             :                 }
     295           0 :                 h->nil = BUN8_NONE;
     296           0 :                 break;
     297             : #endif
     298             :         }
     299           0 :         h->width = nwidth;
     300           0 :         return GDK_SUCCEED;
     301             : }
     302             : 
     303             : gdk_return
     304      243160 : HASHgrowbucket(BAT *b)
     305             : {
     306      243160 :         Hash *h = b->thash;
     307      243160 :         BUN nbucket;
     308      243160 :         BUN onbucket = h->nbucket;
     309      243160 :         lng t0 = 0;
     310             : 
     311      243160 :         TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
     312             : 
     313             :         /* only needed to fix hash tables built before this fix was
     314             :          * introduced */
     315      243160 :         if (h->nil <= h->mask2 && HASHupgradehashheap(b) != GDK_SUCCEED)
     316             :                 return GDK_FAIL;
     317             : 
     318      243160 :         h->heapbckt.dirty = true;
     319      243160 :         h->heaplink.dirty = true;
     320      243160 :         if (((size_t *) h->heapbckt.base)[0] & (size_t) 1 << 24) {
     321             :                 /* persistent hash: remove persistency */
     322         830 :                 ((size_t *) h->heapbckt.base)[0] &= ~((size_t) 1 << 24);
     323         830 :                 if (h->heapbckt.storage != STORE_MEM) {
     324           0 :                         if (!(GDKdebug & NOSYNCMASK) &&
     325           0 :                             MT_msync(h->heapbckt.base, SIZEOF_SIZE_T) < 0)
     326             :                                 return GDK_FAIL;
     327             :                 }
     328             :         }
     329      346106 :         while (h->nunique >= (nbucket = h->nbucket) * 7 / 8) {
     330      102946 :                 BUN new = h->nbucket;
     331      102946 :                 BUN old = new & h->mask1;
     332      102946 :                 BATiter bi = bat_iterator(b);
     333      102946 :                 BUN msk = h->mask1 + 1; /* == h->mask2 - h->mask1 */
     334             : 
     335      102946 :                 assert(h->heapbckt.free == nbucket * h->width + HASH_HEADER_SIZE * SIZEOF_SIZE_T);
     336      102946 :                 if (h->heapbckt.free + h->width > h->heapbckt.size) {
     337         509 :                         if (HEAPextend(&h->heapbckt,
     338             :                                        h->heapbckt.size + GDK_mmap_pagesize,
     339             :                                        true) != GDK_SUCCEED) {
     340           0 :                                 return GDK_FAIL;
     341             :                         }
     342         509 :                         h->Bckt = h->heapbckt.base + HASH_HEADER_SIZE * SIZEOF_SIZE_T;
     343             :                 }
     344      102946 :                 assert(h->heapbckt.free + h->width <= h->heapbckt.size);
     345      102946 :                 if (h->nbucket == h->mask2) {
     346         124 :                         h->mask1 = h->mask2;
     347         124 :                         h->mask2 = h->mask2 << 1 | 1;
     348         124 :                         if (h->nil == h->mask2) {
     349             :                                 /* time to widen the hash table */
     350           0 :                                 if (HASHupgradehashheap(b) != GDK_SUCCEED)
     351             :                                         return GDK_FAIL;
     352             :                         }
     353             :                 }
     354      102946 :                 h->nbucket++;
     355      102946 :                 h->heapbckt.free += h->width;
     356      102946 :                 BUN lold, lnew, hb;
     357      102946 :                 lold = lnew = HASHnil(h);
     358      205892 :                 if ((hb = HASHget(h, old)) != HASHnil(h)) {
     359       82693 :                         h->nheads--;
     360      145172 :                         do {
     361      145172 :                                 const void *v = BUNtail(bi, hb);
     362      145172 :                                 BUN hsh = ATOMhash(h->type, v);
     363      145172 :                                 assert((hsh & (msk - 1)) == old);
     364      145172 :                                 if (hsh & msk) {
     365             :                                         /* move to new list */
     366       63961 :                                         if (lnew == HASHnil(h)) {
     367       50166 :                                                 HASHput(h, new, hb);
     368       50166 :                                                 h->nheads++;
     369             :                                         } else
     370       13795 :                                                 HASHputlink(h, lnew, hb);
     371             :                                         lnew = hb;
     372             :                                 } else {
     373             :                                         /* move to old list */
     374       81211 :                                         if (lold == HASHnil(h)) {
     375       57288 :                                                 h->nheads++;
     376       57288 :                                                 HASHput(h, old, hb);
     377             :                                         } else
     378       23923 :                                                 HASHputlink(h, lold, hb);
     379             :                                         lold = hb;
     380             :                                 }
     381      145172 :                                 hb = HASHgetlink(h, hb);
     382      145172 :                         } while (hb != HASHnil(h));
     383             :                 }
     384      102946 :                 if (lnew == HASHnil(h))
     385       52780 :                         HASHput(h, new, HASHnil(h));
     386             :                 else
     387       50166 :                         HASHputlink(h, lnew, HASHnil(h));
     388      102946 :                 if (lold == HASHnil(h))
     389       45658 :                         HASHput(h, old, HASHnil(h));
     390             :                 else
     391       57288 :                         HASHputlink(h, lold, HASHnil(h));
     392      102946 :                 BATsetprop_nolock(b, GDK_HASH_BUCKETS, TYPE_oid,
     393      102946 :                                   &(oid){h->nbucket});
     394             :         }
     395      243160 :         TRC_DEBUG_IF(ACCELERATOR) if (h->nbucket > onbucket) {
     396           0 :                 TRC_DEBUG_ENDIF(ACCELERATOR, ALGOBATFMT " " BUNFMT
     397             :                         " -> " BUNFMT " buckets (" LLFMT " usec)\n",
     398             :                         ALGOBATPAR(b),
     399             :                         onbucket, h->nbucket, GDKusec() - t0);
     400           0 :                 HASHcollisions(b, h, __func__);
     401             :         }
     402             :         return GDK_SUCCEED;
     403             : }
     404             : 
     405             : /* Return TRUE if we have a hash on the tail, even if we need to read
     406             :  * one from disk.
     407             :  *
     408             :  * Note that the b->thash pointer can be NULL, meaning there is no
     409             :  * hash; (Hash *) 1, meaning there is no hash loaded, but it may exist
     410             :  * on disk; or a valid pointer to a loaded hash.  These values are
     411             :  * maintained here, in the HASHdestroy and HASHfree functions, and in
     412             :  * BBPdiskscan during initialization. */
     413             : bool
     414     5800710 : BATcheckhash(BAT *b)
     415             : {
     416     5800710 :         bool ret;
     417     5800710 :         lng t = 0;
     418             : 
     419             :         /* we don't need the lock just to read the value b->thash */
     420     5800710 :         if (b->thash == (Hash *) 1) {
     421             :                 /* but when we want to change it, we need the lock */
     422        3387 :                 TRC_DEBUG_IF(ACCELERATOR) t = GDKusec();
     423        3387 :                 MT_lock_set(&b->batIdxLock);
     424        3387 :                 TRC_DEBUG_IF(ACCELERATOR) t = GDKusec() - t;
     425             :                 /* if still 1 now that we have the lock, we can update */
     426        3387 :                 if (b->thash == (Hash *) 1) {
     427        3387 :                         Hash *h;
     428        3387 :                         int fd;
     429             : 
     430        3387 :                         assert(!GDKinmemory());
     431        3387 :                         b->thash = NULL;
     432        3387 :                         if ((h = GDKzalloc(sizeof(*h))) != NULL &&
     433        3387 :                             (h->heaplink.farmid = BBPselectfarm(b->batRole, b->ttype, hashheap)) >= 0 &&
     434        3387 :                             (h->heapbckt.farmid = BBPselectfarm(b->batRole, b->ttype, hashheap)) >= 0) {
     435        3387 :                                 const char *nme = BBP_physical(b->batCacheid);
     436        3387 :                                 strconcat_len(h->heaplink.filename,
     437             :                                               sizeof(h->heaplink.filename),
     438             :                                               nme, ".thashl", NULL);
     439        3387 :                                 strconcat_len(h->heapbckt.filename,
     440             :                                               sizeof(h->heapbckt.filename),
     441             :                                               nme, ".thashb", NULL);
     442             : 
     443             :                                 /* check whether a persisted hash can be found */
     444        3387 :                                 if ((fd = GDKfdlocate(h->heapbckt.farmid, nme, "rb+", "thashb")) >= 0) {
     445        3387 :                                         size_t hdata[HASH_HEADER_SIZE];
     446        3387 :                                         struct stat st;
     447             : 
     448        3387 :                                         if (read(fd, hdata, sizeof(hdata)) == sizeof(hdata) &&
     449        3387 :                                             (hdata[0] == (
     450             : #ifdef PERSISTENTHASH
     451             :                                                     ((size_t) 1 << 24) |
     452             : #endif
     453             :                                                     HASH_VERSION)
     454             : #ifdef HASH_VERSION_NOUUID
     455             :                                              /* if not uuid, also allow previous version */
     456         163 :                                              || (hdata[0] == (
     457             : #ifdef PERSISTENTHASH
     458             :                                                          ((size_t) 1 << 24) |
     459             : #endif
     460         163 :                                                          HASH_VERSION_NOUUID) &&
     461         163 :                                                  strcmp(ATOMname(b->ttype), "uuid") != 0)
     462             : #endif
     463        3387 :                                                     ) &&
     464        3387 :                                             hdata[1] > 0 &&
     465        3263 :                                             hdata[4] == (size_t) BATcount(b) &&
     466        3263 :                                             fstat(fd, &st) == 0 &&
     467        6526 :                                             st.st_size >= (off_t) (h->heapbckt.size = h->heapbckt.free = (h->nbucket = (BUN) hdata[2]) * (BUN) (h->width = (uint8_t) hdata[3]) + HASH_HEADER_SIZE * SIZEOF_SIZE_T) &&
     468        3263 :                                             close(fd) == 0 &&
     469        3263 :                                             (fd = GDKfdlocate(h->heaplink.farmid, nme, "rb+", "thashl")) >= 0 &&
     470        3263 :                                             fstat(fd, &st) == 0 &&
     471        3263 :                                             st.st_size > 0 &&
     472        6526 :                                             st.st_size >= (off_t) (h->heaplink.size = h->heaplink.free = hdata[1] * h->width) &&
     473        3263 :                                             HEAPload(&h->heaplink, nme, "thashl", false) == GDK_SUCCEED) {
     474        3263 :                                                 if (HEAPload(&h->heapbckt, nme, "thashb", false) == GDK_SUCCEED) {
     475        3263 :                                                         if (h->nbucket & (h->nbucket - 1)) {
     476        1987 :                                                                 h->mask2 = hashmask(h->nbucket);
     477        1987 :                                                                 h->mask1 = h->mask2 >> 1;
     478             :                                                         } else {
     479        1276 :                                                                 h->mask1 = h->nbucket - 1;
     480        1276 :                                                                 h->mask2 = h->mask1 << 1 | 1;
     481             :                                                         }
     482        3263 :                                                         h->nunique = hdata[5];
     483        3263 :                                                         h->nheads = hdata[6];
     484        3263 :                                                         h->type = ATOMtype(b->ttype);
     485        3263 :                                                         switch (h->width) {
     486        3258 :                                                         case BUN2:
     487        3258 :                                                                 h->nil = (BUN) BUN2_NONE;
     488        3258 :                                                                 break;
     489           5 :                                                         case BUN4:
     490           5 :                                                                 h->nil = (BUN) BUN4_NONE;
     491           5 :                                                                 break;
     492             : #ifdef BUN8
     493           0 :                                                         case BUN8:
     494           0 :                                                                 h->nil = (BUN) BUN8_NONE;
     495           0 :                                                                 break;
     496             : #endif
     497             :                                                         default:
     498           0 :                                                                 assert(0);
     499             :                                                         }
     500        3263 :                                                         if (h->nil > h->nbucket) {
     501        3261 :                                                                 close(fd);
     502        3261 :                                                                 h->Link = h->heaplink.base;
     503        3261 :                                                                 h->Bckt = h->heapbckt.base + HASH_HEADER_SIZE * SIZEOF_SIZE_T;
     504        3261 :                                                                 h->heaplink.parentid = b->batCacheid;
     505        3261 :                                                                 h->heapbckt.parentid = b->batCacheid;
     506        3261 :                                                                 h->heaplink.dirty = false;
     507        3261 :                                                                 h->heapbckt.dirty = false;
     508        3261 :                                                                 BATsetprop_nolock(
     509             :                                                                         b,
     510             :                                                                         GDK_HASH_BUCKETS,
     511             :                                                                         TYPE_oid,
     512        3261 :                                                                         &(oid){h->nbucket});
     513        3261 :                                                                 BATsetprop_nolock(
     514             :                                                                         b,
     515             :                                                                         GDK_NUNIQUE,
     516             :                                                                         TYPE_oid,
     517        3261 :                                                                         &(oid){h->nunique});
     518        3261 :                                                                 b->thash = h;
     519        3261 :                                                                 TRC_DEBUG(ACCELERATOR,
     520             :                                                                           ALGOBATFMT ": reusing persisted hash\n", ALGOBATPAR(b));
     521        3261 :                                                                 MT_lock_unset(&b->batIdxLock);
     522        3261 :                                                                 return true;
     523             :                                                         }
     524             :                                                         /* if h->nil==h->nbucket
     525             :                                                          * (was
     526             :                                                          * possible in
     527             :                                                          * previous
     528             :                                                          * iterations
     529             :                                                          * of the
     530             :                                                          * code), then
     531             :                                                          * we can't
     532             :                                                          * use the
     533             :                                                          * hash since
     534             :                                                          * we can't
     535             :                                                          * distinguish
     536             :                                                          * between
     537             :                                                          * end-of-list
     538             :                                                          * and a valid
     539             :                                                          * link */
     540           2 :                                                         HEAPfree(&h->heapbckt, false);
     541             :                                                 }
     542           2 :                                                 HEAPfree(&h->heaplink, false);
     543             :                                         }
     544         126 :                                         close(fd);
     545             :                                         /* unlink unusable file */
     546         126 :                                         GDKunlink(h->heaplink.farmid, BATDIR, nme, "thashl");
     547         126 :                                         GDKunlink(h->heapbckt.farmid, BATDIR, nme, "thashb");
     548             :                                 }
     549             :                         }
     550         126 :                         GDKfree(h);
     551         126 :                         GDKclrerr();    /* we're not currently interested in errors */
     552             :                 }
     553         126 :                 MT_lock_unset(&b->batIdxLock);
     554             :         }
     555     5797450 :         ret = b->thash != NULL;
     556     5797450 :         if (ret)
     557     3592490 :                 TRC_DEBUG(ACCELERATOR, ALGOBATFMT ": already has hash, waited " LLFMT " usec\n", ALGOBATPAR(b), t);
     558             :         return ret;
     559             : }
     560             : 
     561             : gdk_return
     562       12828 : BAThashsave(BAT *b, bool dosync)
     563             : {
     564       12828 :         int fd;
     565       12828 :         gdk_return rc = GDK_SUCCEED;
     566       12828 :         Hash *h;
     567       12828 :         lng t0 = 0;
     568             : 
     569       12828 :         TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
     570             : 
     571       12828 :         if ((h = b->thash) != NULL) {
     572       12826 :                 Heap *hp = &h->heapbckt;
     573             : 
     574             : #ifndef PERSISTENTHASH
     575             :                 /* no need to sync if not persistent */
     576             :                 dosync = false;
     577             : #endif
     578             : 
     579       12826 :                 rc = GDK_FAIL;
     580             :                 /* only persist if parent BAT hasn't changed in the
     581             :                  * mean time */
     582       12826 :                 ((size_t *) hp->base)[0] = (size_t) HASH_VERSION;
     583       12826 :                 ((size_t *) hp->base)[1] = (size_t) (h->heaplink.free / h->width);
     584       12826 :                 ((size_t *) hp->base)[2] = (size_t) h->nbucket;
     585       12826 :                 ((size_t *) hp->base)[3] = (size_t) h->width;
     586       12826 :                 ((size_t *) hp->base)[4] = (size_t) BATcount(b);
     587       12826 :                 ((size_t *) hp->base)[5] = (size_t) h->nunique;
     588       12826 :                 ((size_t *) hp->base)[6] = (size_t) h->nheads;
     589       18563 :                 if (!b->theap.dirty &&
     590       11474 :                     HEAPsave(&h->heaplink, h->heaplink.filename, NULL, dosync) == GDK_SUCCEED &&
     591        5737 :                     HEAPsave(hp, hp->filename, NULL, dosync) == GDK_SUCCEED) {
     592        5737 :                         h->heaplink.dirty = false;
     593        5737 :                         if (hp->storage == STORE_MEM) {
     594        5697 :                                 if ((fd = GDKfdlocate(hp->farmid, hp->filename, "rb+", NULL)) >= 0) {
     595        5697 :                                         ((size_t *) hp->base)[0] |= (size_t) 1 << 24;
     596        5697 :                                         if (write(fd, hp->base, SIZEOF_SIZE_T) >= 0) {
     597        5697 :                                                 rc = GDK_SUCCEED;
     598        5697 :                                                 if (dosync &&
     599        5694 :                                                     !(GDKdebug & NOSYNCMASK)) {
     600             : #if defined(NATIVE_WIN32)
     601             :                                                         _commit(fd);
     602             : #elif defined(HAVE_FDATASYNC)
     603           0 :                                                         fdatasync(fd);
     604             : #elif defined(HAVE_FSYNC)
     605             :                                                         fsync(fd);
     606             : #endif
     607             :                                                 }
     608        5697 :                                                 hp->dirty = false;
     609             :                                         } else {
     610           0 :                                                 perror("write hash");
     611             :                                         }
     612        5697 :                                         close(fd);
     613             :                                 }
     614             :                         } else {
     615          40 :                                 ((size_t *) hp->base)[0] |= (size_t) 1 << 24;
     616          40 :                                 if (dosync && !(GDKdebug & NOSYNCMASK) &&
     617           0 :                                     MT_msync(hp->base, SIZEOF_SIZE_T) < 0) {
     618           0 :                                         ((size_t *) hp->base)[0] &= ~((size_t) 1 << 24);
     619             :                                 } else {
     620          40 :                                         hp->dirty = false;
     621          40 :                                         rc = GDK_SUCCEED;
     622             :                                 }
     623             :                         }
     624        5737 :                         TRC_DEBUG(ACCELERATOR,
     625             :                                   ALGOBATFMT ": persisting hash %s%s (" LLFMT " usec)%s\n", ALGOBATPAR(b), hp->filename, dosync ? "" : " no sync", GDKusec() - t0, rc == GDK_SUCCEED ? "" : " failed");
     626             :                 }
     627             :         }
     628       12828 :         return rc;
     629             : }
     630             : 
     631             : #ifdef PERSISTENTHASH
     632             : static void
     633        2833 : BAThashsync(void *arg)
     634             : {
     635        2833 :         BAT *b = arg;
     636             : 
     637             :         /* we could check whether b->thash == NULL before getting the
     638             :          * lock, and only lock if it isn't; however, it's very
     639             :          * unlikely that that is the case, so we don't */
     640        2833 :         MT_lock_set(&b->batIdxLock);
     641        2833 :         BAThashsave(b, true);
     642        2833 :         MT_lock_unset(&b->batIdxLock);
     643        2833 :         BBPunfix(b->batCacheid);
     644        2833 : }
     645             : #endif
     646             : 
     647             : #define EQbte(a, b)     ((a) == (b))
     648             : #define EQsht(a, b)     ((a) == (b))
     649             : #define EQint(a, b)     ((a) == (b))
     650             : #define EQlng(a, b)     ((a) == (b))
     651             : #ifdef HAVE_HGE
     652             : #define EQhge(a, b)     ((a) == (b))
     653             : #endif
     654             : #define EQflt(a, b)     (is_flt_nil(a) ? is_flt_nil(b) : (a) == (b))
     655             : #define EQdbl(a, b)     (is_dbl_nil(a) ? is_dbl_nil(b) : (a) == (b))
     656             : 
     657             : #define starthash(TYPE)                                                 \
     658             :         do {                                                            \
     659             :                 const TYPE *restrict v = (const TYPE *) BUNtloc(bi, 0); \
     660             :                 for (; p < cnt1; p++) {                                      \
     661             :                         c = hash_##TYPE(h, v + o - b->hseqbase);     \
     662             :                         hget = HASHget(h, c);                           \
     663             :                         if (hget == hnil) {                             \
     664             :                                 if (h->nheads == maxslots)           \
     665             :                                         break; /* mask too full */      \
     666             :                                 h->nheads++;                         \
     667             :                                 h->nunique++;                                \
     668             :                         } else {                                        \
     669             :                                 for (hb = hget;                         \
     670             :                                      hb != hnil;                        \
     671             :                                      hb = HASHgetlink(h, hb)) {         \
     672             :                                         if (EQ##TYPE(v[o - b->hseqbase], v[hb])) \
     673             :                                                 break;                  \
     674             :                                 }                                       \
     675             :                                 h->nunique += hb == hnil;            \
     676             :                         }                                               \
     677             :                         HASHputlink(h, p, hget);                        \
     678             :                         HASHput(h, c, p);                               \
     679             :                         o = canditer_next(ci);                          \
     680             :                 }                                                       \
     681             :         } while (0)
     682             : #define finishhash(TYPE)                                                \
     683             :         do {                                                            \
     684             :                 const TYPE *restrict v = (const TYPE *) BUNtloc(bi, 0); \
     685             :                 for (; p < ci->ncand; p++) {                                      \
     686             :                         c = hash_##TYPE(h, v + o - b->hseqbase);     \
     687             :                         c = hash_##TYPE(h, v + o - b->hseqbase);     \
     688             :                         hget = HASHget(h, c);                           \
     689             :                         h->nheads += hget == hnil;                   \
     690             :                         for (hb = hget;                                 \
     691             :                              hb != hnil;                                \
     692             :                              hb = HASHgetlink(h, hb)) {                 \
     693             :                                 if (EQ##TYPE(v[o - b->hseqbase], v[hb])) \
     694             :                                         break;                          \
     695             :                         }                                               \
     696             :                         h->nunique += hb == hnil;                    \
     697             :                         HASHputlink(h, p, hget);                        \
     698             :                         HASHput(h, c, p);                               \
     699             :                         o = canditer_next(ci);                          \
     700             :                 }                                                       \
     701             :         } while (0)
     702             : 
     703             : /* Internal function to create a hash table for the given BAT b.
     704             :  * If a candidate list s is also given, the hash table is specific for
     705             :  * the combination of the two: only values from b that are referred to
     706             :  * by s are included in the hash table, so if a result is found when
     707             :  * searching the hash table, the result is a candidate. */
     708             : Hash *
     709       14767 : BAThash_impl(BAT *restrict b, struct canditer *restrict ci, const char *restrict ext)
     710             : {
     711       14767 :         lng t0 = 0;
     712       14767 :         unsigned int tpe = ATOMbasetype(b->ttype);
     713       14767 :         BUN cnt1;
     714       14767 :         BUN mask, maxmask = 0;
     715       14767 :         BUN p, c;
     716       14767 :         oid o;
     717       14767 :         BUN hnil, hget, hb;
     718       14767 :         Hash *h = NULL;
     719       14767 :         const char *nme = GDKinmemory() ? ":inmemory" : BBP_physical(b->batCacheid);
     720       14763 :         BATiter bi = bat_iterator(b);
     721       14763 :         PROPrec *prop;
     722       14763 :         bool hascand = ci->tpe != cand_dense || ci->ncand != BATcount(b);
     723             : 
     724       14763 :         assert(strcmp(ext, "thash") != 0 || !hascand);
     725             : 
     726       14763 :         TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
     727       14763 :         TRC_DEBUG(ACCELERATOR,
     728             :                   ALGOBATFMT ": create hash;\n", ALGOBATPAR(b));
     729       14763 :         if (b->ttype == TYPE_void) {
     730           0 :                 if (is_oid_nil(b->tseqbase)) {
     731           0 :                         TRC_DEBUG(ACCELERATOR,
     732             :                                   "cannot create hash-table on void-NIL column.\n");
     733           0 :                         GDKerror("no hash on void/nil column\n");
     734           0 :                         return NULL;
     735             :                 }
     736           0 :                 TRC_DEBUG(ACCELERATOR,
     737             :                           "creating hash-table on void column..\n");
     738           0 :                 assert(0);
     739             :                 tpe = TYPE_void;
     740             :         }
     741             : 
     742       14763 :         if ((h = GDKzalloc(sizeof(*h))) == NULL ||
     743       14759 :             (h->heaplink.farmid = BBPselectfarm(b->batRole, b->ttype, hashheap)) < 0 ||
     744       14759 :             (h->heapbckt.farmid = BBPselectfarm(b->batRole, b->ttype, hashheap)) < 0) {
     745           0 :                 GDKfree(h);
     746           0 :                 return NULL;
     747             :         }
     748       14758 :         h->width = HASHwidth(BATcapacity(b));
     749       14758 :         h->heaplink.dirty = true;
     750       14758 :         h->heapbckt.dirty = true;
     751       14758 :         strconcat_len(h->heaplink.filename, sizeof(h->heaplink.filename),
     752             :                       nme, ".", ext, "l", NULL);
     753       14767 :         strconcat_len(h->heapbckt.filename, sizeof(h->heapbckt.filename),
     754             :                       nme, ".", ext, "b", NULL);
     755       14769 :         if (HEAPalloc(&h->heaplink, hascand ? ci->ncand : BATcapacity(b),
     756       14769 :                       h->width) != GDK_SUCCEED) {
     757           0 :                 GDKfree(h);
     758           0 :                 return NULL;
     759             :         }
     760       14771 :         h->heaplink.free = ci->ncand * h->width;
     761       14771 :         h->Link = h->heaplink.base;
     762             : #ifndef NDEBUG
     763             :         /* clear unused part of Link array */
     764       14771 :         if (h->heaplink.size > h->heaplink.free)
     765       14147 :                 memset(h->heaplink.base + h->heaplink.free, 0,
     766             :                        h->heaplink.size - h->heaplink.free);
     767             : #endif
     768             : 
     769             :         /* determine hash mask size */
     770       14771 :         cnt1 = 0;
     771       14771 :         if (ATOMsize(tpe) == 1) {
     772             :                 /* perfect hash for one-byte sized atoms */
     773             :                 mask = (1 << 8);
     774       14764 :         } else if (ATOMsize(tpe) == 2) {
     775             :                 /* perfect hash for two-byte sized atoms */
     776             :                 mask = (1 << 16);
     777       14731 :         } else if (b->tkey || ci->ncand <= 4096) {
     778             :                 /* if key, or if small, don't bother dynamically
     779             :                  * adjusting the hash mask */
     780       14374 :                 mask = HASHmask(ci->ncand);
     781         357 :         } else if (!hascand && (prop = BATgetprop_nolock(b, GDK_NUNIQUE)) != NULL) {
     782           0 :                 assert(prop->v.vtype == TYPE_oid);
     783           0 :                 mask = prop->v.val.oval * 8 / 7;
     784         356 :         } else if (!hascand && (prop = BATgetprop_nolock(b, GDK_HASH_BUCKETS)) != NULL) {
     785           0 :                 assert(prop->v.vtype == TYPE_oid);
     786           0 :                 mask = prop->v.val.oval;
     787           0 :                 maxmask = HASHmask(ci->ncand);
     788           0 :                 if (mask > maxmask)
     789             :                         mask = maxmask;
     790             :         } else {
     791             :                 /* dynamic hash: we start with HASHmask(ci->ncand)/64, or,
     792             :                  * if ci->ncand large enough, HASHmask(ci->ncand)/256; if there
     793             :                  * are too many collisions we try HASHmask(ci->ncand)/64,
     794             :                  * HASHmask(ci->ncand)/16, HASHmask(ci->ncand)/4, and finally
     795             :                  * HASHmask(ci->ncand), but we might skip some of these if
     796             :                  * there are many distinct values.  */
     797         356 :                 maxmask = HASHmask(ci->ncand);
     798         356 :                 mask = maxmask >> 6;
     799         401 :                 while (mask > 4096)
     800          45 :                         mask >>= 2;
     801             :                 /* try out on first 25% of b */
     802         356 :                 cnt1 = ci->ncand >> 2;
     803             :         }
     804             : 
     805       14770 :         o = canditer_next(ci);  /* always one ahead */
     806         524 :         for (;;) {
     807       15291 :                 lng t1 = 0;
     808       15291 :                 TRC_DEBUG_IF(ACCELERATOR) t1 = GDKusec();
     809       15291 :                 BUN maxslots = (mask >> 3) - 1;   /* 1/8 full is too full */
     810             : 
     811       15291 :                 h->nheads = 0;
     812       15291 :                 h->nunique = 0;
     813       15291 :                 p = 0;
     814       15291 :                 HEAPfree(&h->heapbckt, true);
     815             :                 /* create the hash structures */
     816       30598 :                 if (HASHnew(h, ATOMtype(b->ttype), BATcapacity(b),
     817             :                             mask, ci->ncand, true) != GDK_SUCCEED) {
     818           0 :                         HEAPfree(&h->heaplink, true);
     819           0 :                         GDKfree(h);
     820           0 :                         return NULL;
     821             :                 }
     822             : 
     823       15296 :                 hnil = HASHnil(h);
     824             : 
     825       15296 :                 switch (tpe) {
     826           7 :                 case TYPE_bte:
     827           7 :                         starthash(bte);
     828             :                         break;
     829          33 :                 case TYPE_sht:
     830          33 :                         starthash(sht);
     831             :                         break;
     832           2 :                 case TYPE_flt:
     833           2 :                         starthash(flt);
     834             :                         break;
     835        7142 :                 case TYPE_int:
     836    14909800 :                         starthash(int);
     837             :                         break;
     838           8 :                 case TYPE_dbl:
     839           8 :                         starthash(dbl);
     840             :                         break;
     841        7078 :                 case TYPE_lng:
     842     7072340 :                         starthash(lng);
     843             :                         break;
     844             : #ifdef HAVE_HGE
     845          12 :                 case TYPE_hge:
     846          12 :                         starthash(hge);
     847             :                         break;
     848             : #endif
     849             :                 default:
     850      200966 :                         for (; p < cnt1; p++) {
     851      199983 :                                 const void *restrict v = BUNtail(bi, o - b->hseqbase);
     852      199983 :                                 c = hash_any(h, v);
     853      199983 :                                 hget = HASHget(h, c);
     854      199983 :                                 if (hget == hnil) {
     855       36584 :                                         if (h->nheads == maxslots)
     856             :                                                 break; /* mask too full */
     857       36553 :                                         h->nheads++;
     858       36553 :                                         h->nunique++;
     859             :                                 } else {
     860      172896 :                                         for (hb = hget;
     861      172896 :                                              hb != hnil;
     862      182393 :                                              hb = HASHgetlink(h, hb)) {
     863      170865 :                                                 if (ATOMcmp(h->type,
     864             :                                                             v,
     865             :                                                             BUNtail(bi, hb)) == 0)
     866             :                                                         break;
     867             :                                         }
     868      163399 :                                         h->nunique += hb == hnil;
     869             :                                 }
     870      199952 :                                 HASHputlink(h, p, hget);
     871      199952 :                                 HASHput(h, c, p);
     872      199952 :                                 o = canditer_next(ci);
     873             :                         }
     874             :                         break;
     875             :                 }
     876       15298 :                 TRC_DEBUG_IF(ACCELERATOR) if (p < cnt1)
     877           0 :                         TRC_DEBUG_ENDIF(ACCELERATOR,
     878             :                                         "%s: abort starthash with "
     879             :                                 "mask " BUNFMT " at " BUNFMT " after " LLFMT " usec\n", BATgetId(b), mask, p, GDKusec() - t1);
     880       15298 :                 if (p == cnt1 || mask == maxmask)
     881             :                         break;
     882         527 :                 mask <<= 2;
     883             :                 /* if we fill up the slots fast (p <= maxslots * 1.2)
     884             :                  * increase mask size a bit more quickly */
     885         527 :                 if (p == h->nunique) {
     886             :                         /* only unique values so far: grow really fast */
     887             :                         mask = maxmask;
     888             :                         cnt1 = 0;
     889         307 :                 } else if (mask < maxmask && p <= maxslots * 1.2)
     890          26 :                         mask <<= 2;
     891         527 :                 canditer_reset(ci);
     892         527 :                 o = canditer_next(ci);
     893             :         }
     894             : 
     895             :         /* finish the hashtable with the current mask */
     896       14771 :         switch (tpe) {
     897           7 :         case TYPE_bte:
     898          64 :                 finishhash(bte);
     899             :                 break;
     900          33 :         case TYPE_sht:
     901      488187 :                 finishhash(sht);
     902             :                 break;
     903        6748 :         case TYPE_int:
     904   156993000 :                 finishhash(int);
     905             :                 break;
     906           2 :         case TYPE_flt:
     907        2009 :                 finishhash(flt);
     908             :                 break;
     909           8 :         case TYPE_dbl:
     910           8 :                 finishhash(dbl);
     911             :                 break;
     912        6978 :         case TYPE_lng:
     913    57006900 :                 finishhash(lng);
     914             :                 break;
     915             : #ifdef HAVE_HGE
     916          12 :         case TYPE_hge:
     917         535 :                 finishhash(hge);
     918             :                 break;
     919             : #endif
     920             :         default:
     921      647611 :                 for (; p < ci->ncand; p++) {
     922      646628 :                         const void *restrict v = BUNtail(bi, o - b->hseqbase);
     923      646628 :                         c = hash_any(h, v);
     924      646641 :                         hget = HASHget(h, c);
     925      646641 :                         h->nheads += hget == hnil;
     926      646641 :                         for (hb = hget;
     927      783747 :                              hb != hnil;
     928      920853 :                              hb = HASHgetlink(h, hb)) {
     929      610842 :                                 if (ATOMcmp(h->type, v, BUNtail(bi, hb)) == 0)
     930             :                                         break;
     931             :                         }
     932      646630 :                         h->nunique += hb == hnil;
     933      646630 :                         HASHputlink(h, p, hget);
     934      646630 :                         HASHput(h, c, p);
     935      646630 :                         o = canditer_next(ci);
     936             :                 }
     937             :                 break;
     938             :         }
     939       14772 :         if (!hascand) {
     940       14718 :                 BATsetprop_nolock(b, GDK_HASH_BUCKETS, TYPE_oid, &(oid){h->nbucket});
     941       14716 :                 BATsetprop_nolock(b, GDK_NUNIQUE, TYPE_oid, &(oid){h->nunique});
     942             :         }
     943       14771 :         h->heapbckt.parentid = b->batCacheid;
     944       14771 :         h->heaplink.parentid = b->batCacheid;
     945             :         /* if the number of unique values is equal to the bat count,
     946             :          * all values are necessarily distinct */
     947       14771 :         if (h->nunique == BATcount(b) && !b->tkey) {
     948        7240 :                 b->tkey = true;
     949        7240 :                 b->batDirtydesc = true;
     950             :         }
     951       14771 :         TRC_DEBUG_IF(ACCELERATOR) {
     952           0 :                 TRC_DEBUG_ENDIF(ACCELERATOR,
     953             :                                 "hash construction " LLFMT " usec\n", GDKusec() - t0);
     954           0 :                 HASHcollisions(b, h, __func__);
     955             :         }
     956             :         return h;
     957             : }
     958             : 
     959             : gdk_return
     960     2185060 : BAThash(BAT *b)
     961             : {
     962     2185060 :         assert(b->batCacheid > 0);
     963     2185060 :         if (BATcheckhash(b)) {
     964             :                 return GDK_SUCCEED;
     965             :         }
     966       14892 :         MT_lock_set(&b->batIdxLock);
     967       14895 :         if (b->thash == NULL) {
     968       14717 :                 struct canditer ci;
     969       14717 :                 canditer_init(&ci, b, NULL);
     970       14713 :                 if ((b->thash = BAThash_impl(b, &ci, "thash")) == NULL) {
     971           0 :                         MT_lock_unset(&b->batIdxLock);
     972        2833 :                         return GDK_FAIL;
     973             :                 }
     974             : #ifdef PERSISTENTHASH
     975       14717 :                 if (BBP_status(b->batCacheid) & BBPEXISTING && !b->theap.dirty && !GDKinmemory()) {
     976        2833 :                         MT_Id tid;
     977        2833 :                         BBPfix(b->batCacheid);
     978        2833 :                         char name[MT_NAME_LEN];
     979        2833 :                         snprintf(name, sizeof(name), "hashsync%d", b->batCacheid);
     980        2833 :                         MT_lock_unset(&b->batIdxLock);
     981        2833 :                         if (MT_create_thread(&tid, BAThashsync, b,
     982             :                                              MT_THR_DETACHED,
     983             :                                              name) < 0) {
     984             :                                 /* couldn't start thread: clean up */
     985           0 :                                 BBPunfix(b->batCacheid);
     986             :                         }
     987        2833 :                         return GDK_SUCCEED;
     988             :                 } else
     989       11884 :                         TRC_DEBUG(ACCELERATOR,
     990             :                                         "NOT persisting hash %d\n", b->batCacheid);
     991             : #endif
     992             :         }
     993       12062 :         MT_lock_unset(&b->batIdxLock);
     994       12062 :         return GDK_SUCCEED;
     995             : }
     996             : 
     997             : /*
     998             :  * The entry on which a value hashes can be calculated with the
     999             :  * routine HASHprobe.
    1000             :  */
    1001             : BUN
    1002   262832000 : HASHprobe(const Hash *h, const void *v)
    1003             : {
    1004   317692000 :         switch (ATOMbasetype(h->type)) {
    1005        1522 :         case TYPE_bte:
    1006        1522 :                 return hash_bte(h, v);
    1007       33331 :         case TYPE_sht:
    1008       33331 :                 return hash_sht(h, v);
    1009    21670100 :         case TYPE_int:
    1010             :         case TYPE_flt:
    1011    21670100 :                 return hash_int(h, v);
    1012   236134000 :         case TYPE_dbl:
    1013             :         case TYPE_lng:
    1014   236134000 :                 return hash_lng(h, v);
    1015             : #ifdef HAVE_HGE
    1016       16805 :         case TYPE_hge:
    1017       16805 :                 return hash_hge(h, v);
    1018             : #endif
    1019     4976360 :         default:
    1020     4976360 :                 return hash_any(h, v);
    1021             :         }
    1022             : }
    1023             : 
    1024             : static void
    1025      243209 : HASHins_locked(BAT *b, BUN i, const void *v)
    1026             : {
    1027      243209 :         Hash *h = b->thash;
    1028      243209 :         if (h == NULL) {
    1029          11 :                 return;
    1030             :         }
    1031      243209 :         if (h == (Hash *) 1) {
    1032          11 :                 b->thash = NULL;
    1033          11 :                 doHASHdestroy(b, h);
    1034          11 :                 return;
    1035             :         }
    1036      243198 :         if (HASHwidth(i + 1) > h->width &&
    1037           0 :              HASHupgradehashheap(b) != GDK_SUCCEED) {
    1038             :                 return;
    1039             :         }
    1040      486358 :         if ((ATOMsize(b->ttype) > 2 &&
    1041      243160 :              HASHgrowbucket(b) != GDK_SUCCEED) ||
    1042      243926 :             ((i + 1) * h->width > h->heaplink.size &&
    1043         728 :              HEAPextend(&h->heaplink,
    1044         728 :                         i * h->width + GDK_mmap_pagesize,
    1045             :                         true) != GDK_SUCCEED)) {
    1046           0 :                 b->thash = NULL;
    1047           0 :                 HEAPfree(&h->heapbckt, true);
    1048           0 :                 HEAPfree(&h->heaplink, true);
    1049           0 :                 GDKfree(h);
    1050           0 :                 return;
    1051             :         }
    1052      243198 :         h->Link = h->heaplink.base;
    1053      243198 :         BUN c = HASHprobe(h, v);
    1054      243198 :         h->heaplink.free += h->width;
    1055      243198 :         BUN hb = HASHget(h, c);
    1056      243198 :         BUN hb2;
    1057      243198 :         BATiter bi = bat_iterator(b);
    1058      243198 :         for (hb2 = hb;
    1059      332494 :              hb2 != HASHnil(h);
    1060      421790 :              hb2 = HASHgetlink(h, hb2)) {
    1061      117685 :                 if (ATOMcmp(h->type,
    1062             :                             v,
    1063             :                             BUNtail(bi, hb2)) == 0)
    1064             :                         break;
    1065             :         }
    1066      243198 :         h->nheads += hb == HASHnil(h);
    1067      243198 :         h->nunique += hb2 == HASHnil(h);
    1068      243198 :         HASHputlink(h, i, hb);
    1069      243198 :         HASHput(h, c, i);
    1070      243198 :         h->heapbckt.dirty = true;
    1071      243198 :         h->heaplink.dirty = true;
    1072             : }
    1073             : 
    1074             : void
    1075      243209 : HASHins(BAT *b, BUN i, const void *v)
    1076             : {
    1077      243209 :         MT_lock_set(&b->batIdxLock);
    1078      243209 :         HASHins_locked(b, i, v);
    1079      243209 :         MT_lock_unset(&b->batIdxLock);
    1080      243209 : }
    1081             : 
    1082             : BUN
    1083           0 : HASHlist(Hash *h, BUN i)
    1084             : {
    1085           0 :         BUN c = 1;
    1086           0 :         BUN j = HASHget(h, i), nil = HASHnil(h);
    1087             : 
    1088           0 :         if (j == nil)
    1089             :                 return 1;
    1090           0 :         while ((j = HASHgetlink(h, i)) != nil) {
    1091           0 :                 c++;
    1092           0 :                 i = j;
    1093             :         }
    1094             :         return c;
    1095             : }
    1096             : 
    1097             : void
    1098     7019440 : HASHdestroy(BAT *b)
    1099             : {
    1100     7019440 :         if (b && b->thash) {
    1101       11955 :                 Hash *hs;
    1102       11955 :                 MT_lock_set(&b->batIdxLock);
    1103       11955 :                 hs = b->thash;
    1104       11955 :                 b->thash = NULL;
    1105       11955 :                 MT_lock_unset(&b->batIdxLock);
    1106       11955 :                 doHASHdestroy(b, hs);
    1107             :         }
    1108     7019440 : }
    1109             : 
    1110             : void
    1111     1172870 : HASHfree(BAT *b)
    1112             : {
    1113     1172870 :         if (b && b->thash) {
    1114       10978 :                 Hash *h;
    1115       10978 :                 MT_lock_set(&b->batIdxLock);
    1116       10978 :                 if ((h = b->thash) != NULL && h != (Hash *) 1) {
    1117        6370 :                         bool rmheap = GDKinmemory() || h->heaplink.dirty || h->heapbckt.dirty;
    1118        6370 :                         TRC_DEBUG(ACCELERATOR, ALGOBATFMT " free hash %s\n",
    1119             :                                   ALGOBATPAR(b),
    1120             :                                   rmheap ? "removing" : "keeping");
    1121             : 
    1122        6370 :                         b->thash = rmheap ? NULL : (Hash *) 1;
    1123        6370 :                         HEAPfree(&h->heapbckt, rmheap);
    1124        6370 :                         HEAPfree(&h->heaplink, rmheap);
    1125        6370 :                         GDKfree(h);
    1126             :                 }
    1127       10978 :                 MT_lock_unset(&b->batIdxLock);
    1128             :         }
    1129     1172870 : }
    1130             : 
    1131             : bool
    1132           0 : HASHgonebad(BAT *b, const void *v)
    1133             : {
    1134           0 :         Hash *h = b->thash;
    1135           0 :         BATiter bi = bat_iterator(b);
    1136           0 :         BUN cnt, hit;
    1137             : 
    1138           0 :         if (h == NULL)
    1139             :                 return true;    /* no hash is bad hash? */
    1140             : 
    1141           0 :         if (h->nbucket * 2 < BATcount(b)) {
    1142           0 :                 int (*cmp) (const void *, const void *) = ATOMcompare(b->ttype);
    1143           0 :                 BUN i = HASHget(h, (BUN) HASHprobe(h, v)), nil = HASHnil(h);
    1144           0 :                 for (cnt = hit = 1; i != nil; i = HASHgetlink(h, i), cnt++)
    1145           0 :                         hit += ((*cmp) (v, BUNtail(bi, (BUN) i)) == 0);
    1146             : 
    1147           0 :                 if (cnt / hit > 4)
    1148           0 :                         return true;    /* linked list too long */
    1149             : 
    1150             :                 /* in this case, linked lists are long but contain the
    1151             :                  * desired values such hash tables may be useful for
    1152             :                  * locating all duplicates */
    1153             :         }
    1154             :         return false;           /* a-ok */
    1155             : }

Generated by: LCOV version 1.14