LCOV - code coverage report
Current view: top level - gdk - gdk_unique.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 133 158 84.2 %
Date: 2021-10-13 02:24:04 Functions: 1 1 100.0 %

          Line data    Source code
       1             : /*
       2             :  * This Source Code Form is subject to the terms of the Mozilla Public
       3             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       4             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       5             :  *
       6             :  * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V.
       7             :  */
       8             : 
       9             : #include "monetdb_config.h"
      10             : #include "gdk.h"
      11             : #include "gdk_private.h"
      12             : #include "gdk_calc_private.h"
      13             : 
      14             : #define VALUE(x)        (vars ? vars + VarHeapVal(vals, (x), width) : vals + (x) * width)
      15             : /* BATunique returns a bat that indicates the unique tail values of
      16             :  * the input bat.  This is essentially the same output as the
      17             :  * "extents" output of BATgroup.  The difference is that BATunique
      18             :  * does not return the grouping bat.
      19             :  *
      20             :  * The first input is the bat from which unique rows are selected, the
      21             :  * second input is an optional candidate list.
      22             :  *
      23             :  * The return value is a candidate list.
      24             :  */
      25             : BAT *
      26         634 : BATunique(BAT *b, BAT *s)
      27             : {
      28             :         BAT *bn;
      29             :         BUN cnt;
      30             :         const void *v;
      31             :         const char *vals;
      32             :         const char *vars;
      33             :         int width;
      34             :         oid i, o;
      35             :         uint16_t *seen = NULL;
      36             :         const char *nme;
      37             :         Hash *hs = NULL;
      38             :         BUN hb;
      39             :         BATiter bi;
      40             :         int (*cmp)(const void *, const void *);
      41             :         struct canditer ci;
      42             :         const char *algomsg = "";
      43             :         lng t0 = 0;
      44             : 
      45             :         size_t counter = 0;
      46             :         lng timeoffset = 0;
      47         634 :         QryCtx *qry_ctx = MT_thread_get_qry_ctx();
      48         634 :         if (qry_ctx != NULL) {
      49         602 :                 timeoffset = (qry_ctx->starttime && qry_ctx->querytimeout) ? (qry_ctx->starttime + qry_ctx->querytimeout) : 0;
      50             :         }
      51             : 
      52         634 :         TRC_DEBUG_IF(ALGO) t0 = GDKusec();
      53             : 
      54         634 :         BATcheck(b, NULL);
      55         634 :         cnt = canditer_init(&ci, b, s);
      56             : 
      57         634 :         if (b->tkey || cnt <= 1 || BATtdense(b)) {
      58             :                 /* trivial: already unique */
      59         239 :                 bn = canditer_slice(&ci, 0, cnt);
      60         239 :                 TRC_DEBUG(ALGO, "b=" ALGOBATFMT
      61             :                           ",s=" ALGOOPTBATFMT " -> " ALGOOPTBATFMT
      62             :                           " (already unique, slice candidates -- "
      63             :                           LLFMT "usec)\n",
      64             :                           ALGOBATPAR(b), ALGOOPTBATPAR(s),
      65             :                           ALGOOPTBATPAR(bn), GDKusec() - t0);
      66         239 :                 return bn;
      67             :         }
      68             : 
      69         395 :         if ((BATordered(b) && BATordered_rev(b)) ||
      70         326 :             (b->ttype == TYPE_void && is_oid_nil(b->tseqbase))) {
      71             :                 /* trivial: all values are the same */
      72          69 :                 bn = BATdense(0, ci.seq, 1);
      73          69 :                 TRC_DEBUG(ALGO, "b=" ALGOBATFMT
      74             :                           ",s=" ALGOOPTBATFMT " -> " ALGOOPTBATFMT
      75             :                           " (all equal -- "
      76             :                           LLFMT "usec)\n",
      77             :                           ALGOBATPAR(b), ALGOOPTBATPAR(s),
      78             :                           ALGOOPTBATPAR(bn), GDKusec() - t0);
      79          69 :                 return bn;
      80             :         }
      81             : 
      82         326 :         assert(b->ttype != TYPE_void);
      83             : 
      84             :         BUN initsize = BUN_NONE;
      85         326 :         if (s == NULL) {
      86         260 :                 MT_rwlock_rdlock(&b->thashlock);
      87         260 :                 if (b->thash != NULL && b->thash != (Hash *) 1)
      88           2 :                         initsize = b->thash->nunique;
      89         260 :                 MT_rwlock_rdunlock(&b->thashlock);
      90         260 :                 if (initsize == BUN_NONE) {
      91         258 :                         MT_lock_set(&b->theaplock);
      92         258 :                         if (b->tunique_est != 0)
      93          15 :                                 initsize = (BUN) b->tunique_est;
      94         258 :                         MT_lock_unset(&b->theaplock);
      95             :                 }
      96             :         }
      97         260 :         if (initsize == BUN_NONE)
      98             :                 initsize = 1024;
      99         326 :         bn = COLnew(0, TYPE_oid, initsize, TRANSIENT);
     100         326 :         if (bn == NULL)
     101             :                 return NULL;
     102         326 :         bi = bat_iterator(b);
     103         326 :         vals = bi.base;
     104         326 :         if (b->tvarsized && b->ttype)
     105          89 :                 vars = bi.vh->base;
     106             :         else
     107             :                 vars = NULL;
     108         326 :         width = bi.width;
     109         326 :         cmp = ATOMcompare(b->ttype);
     110             : 
     111         326 :         if (BATordered(b) || BATordered_rev(b)) {
     112             :                 const void *prev = NULL;
     113             :                 algomsg = "unique: sorted";
     114      255492 :                 for (i = 0; i < cnt; i++) {
     115      255413 :                         GDK_CHECK_TIMEOUT(timeoffset, counter,
     116             :                                         GOTO_LABEL_TIMEOUT_HANDLER(bunins_failed));
     117      255413 :                         o = canditer_next(&ci);
     118      255413 :                         v = VALUE(o - b->hseqbase);
     119      255413 :                         if (prev == NULL || (*cmp)(v, prev) != 0) {
     120       39663 :                                 if (bunfastappTYPE(oid, bn, &o) != GDK_SUCCEED)
     121           0 :                                         goto bunins_failed;
     122             :                         }
     123             :                         prev = v;
     124             :                 }
     125         247 :         } else if (ATOMbasetype(b->ttype) == TYPE_bte) {
     126             :                 unsigned char val;
     127             : 
     128             :                 algomsg = "unique: byte-sized atoms";
     129          20 :                 assert(vars == NULL);
     130          20 :                 seen = GDKzalloc((256 / 16) * sizeof(seen[0]));
     131          20 :                 if (seen == NULL)
     132           0 :                         goto bunins_failed;
     133       20244 :                 for (i = 0; i < cnt; i++) {
     134       20224 :                         GDK_CHECK_TIMEOUT(timeoffset, counter,
     135             :                                         GOTO_LABEL_TIMEOUT_HANDLER(bunins_failed));
     136       20224 :                         o = canditer_next(&ci);
     137       20224 :                         val = ((const unsigned char *) vals)[o - b->hseqbase];
     138       20224 :                         if (!(seen[val >> 4] & (1U << (val & 0xF)))) {
     139          40 :                                 seen[val >> 4] |= 1U << (val & 0xF);
     140          40 :                                 if (bunfastappTYPE(oid, bn, &o) != GDK_SUCCEED)
     141           0 :                                         goto bunins_failed;
     142          40 :                                 if (bn->batCount == 256) {
     143             :                                         /* there cannot be more than
     144             :                                          * 256 distinct values */
     145             :                                         break;
     146             :                                 }
     147             :                         }
     148             :                 }
     149          20 :                 GDKfree(seen);
     150             :                 seen = NULL;
     151         227 :         } else if (ATOMbasetype(b->ttype) == TYPE_sht) {
     152             :                 unsigned short val;
     153             : 
     154             :                 algomsg = "unique: short-sized atoms";
     155           9 :                 assert(vars == NULL);
     156           9 :                 seen = GDKzalloc((65536 / 16) * sizeof(seen[0]));
     157           9 :                 if (seen == NULL)
     158           0 :                         goto bunins_failed;
     159        3421 :                 for (i = 0; i < cnt; i++) {
     160        3412 :                         GDK_CHECK_TIMEOUT(timeoffset, counter,
     161             :                                         GOTO_LABEL_TIMEOUT_HANDLER(bunins_failed));
     162        3412 :                         o = canditer_next(&ci);
     163        3412 :                         val = ((const unsigned short *) vals)[o - b->hseqbase];
     164        3412 :                         if (!(seen[val >> 4] & (1U << (val & 0xF)))) {
     165          26 :                                 seen[val >> 4] |= 1U << (val & 0xF);
     166          26 :                                 if (bunfastappTYPE(oid, bn, &o) != GDK_SUCCEED)
     167           0 :                                         goto bunins_failed;
     168          26 :                                 if (bn->batCount == 65536) {
     169             :                                         /* there cannot be more than
     170             :                                          * 65536 distinct values */
     171             :                                         break;
     172             :                                 }
     173             :                         }
     174             :                 }
     175           9 :                 GDKfree(seen);
     176             :                 seen = NULL;
     177         218 :         } else if (BATcheckhash(b) ||
     178         216 :                    (!b->batTransient &&
     179          78 :                     cnt == BATcount(b) &&
     180          80 :                     BAThash(b) == GDK_SUCCEED)) {
     181             :                 BUN lo = 0;
     182             :                 oid seq;
     183             : 
     184             :                 /* we already have a hash table on b, or b is
     185             :                  * persistent and we could create a hash table, or b
     186             :                  * is a view on a bat that already has a hash table */
     187             :                 algomsg = "unique: existing hash";
     188          41 :                 seq = b->hseqbase;
     189          41 :                 MT_rwlock_rdlock(&b->thashlock);
     190          41 :                 hs = b->thash;
     191          41 :                 if (hs == NULL) {
     192           0 :                         MT_rwlock_rdunlock(&b->thashlock);
     193           0 :                         goto lost_hash;
     194             :                 }
     195         306 :                 for (i = 0; i < cnt; i++) {
     196         265 :                         GDK_CHECK_TIMEOUT(timeoffset, counter,
     197             :                                         GOTO_LABEL_TIMEOUT_HANDLER(bunins_failed));
     198             :                         BUN p;
     199             : 
     200         265 :                         o = canditer_next(&ci);
     201         265 :                         p = o - seq;
     202         265 :                         v = VALUE(p);
     203         274 :                         for (hb = HASHgetlink(hs, p + lo);
     204             :                              hb != BUN_NONE && hb >= lo;
     205           9 :                              hb = HASHgetlink(hs, hb)) {
     206         168 :                                 assert(hb < p + lo);
     207         327 :                                 if (cmp(v, BUNtail(bi, hb)) == 0 &&
     208         159 :                                     canditer_contains(&ci, hb - lo + seq)) {
     209             :                                         /* we've seen this value
     210             :                                          * before */
     211             :                                         break;
     212             :                                 }
     213             :                         }
     214         265 :                         if (hb == BUN_NONE || hb < lo) {
     215         106 :                                 if (bunfastappTYPE(oid, bn, &o) != GDK_SUCCEED) {
     216           0 :                                         MT_rwlock_rdunlock(&b->thashlock);
     217             :                                         hs = NULL;
     218           0 :                                         goto bunins_failed;
     219             :                                 }
     220             :                         }
     221             :                 }
     222          41 :                 MT_rwlock_rdunlock(&b->thashlock);
     223             :         } else {
     224             :                 BUN prb;
     225             :                 BUN p;
     226             :                 BUN mask;
     227             : 
     228         177 :           lost_hash:
     229         177 :                 GDKclrerr();    /* not interested in BAThash errors */
     230             :                 algomsg = "unique: new partial hash";
     231         177 :                 nme = BBP_physical(b->batCacheid);
     232         177 :                 if (ATOMbasetype(b->ttype) == TYPE_bte) {
     233             :                         mask = (BUN) 1 << 8;
     234             :                         cmp = NULL; /* no compare needed, "hash" is perfect */
     235         177 :                 } else if (ATOMbasetype(b->ttype) == TYPE_sht) {
     236             :                         mask = (BUN) 1 << 16;
     237             :                         cmp = NULL; /* no compare needed, "hash" is perfect */
     238             :                 } else {
     239         177 :                         mask = HASHmask(cnt);
     240             :                         if (mask < ((BUN) 1 << 16))
     241             :                                 mask = (BUN) 1 << 16;
     242             :                 }
     243         177 :                 if ((hs = GDKzalloc(sizeof(Hash))) == NULL) {
     244           0 :                         GDKerror("cannot allocate hash table\n");
     245           0 :                         goto bunins_failed;
     246             :                 }
     247         177 :                 if ((hs->heaplink.farmid = BBPselectfarm(TRANSIENT, b->ttype, hashheap)) < 0 ||
     248         177 :                     (hs->heapbckt.farmid = BBPselectfarm(TRANSIENT, b->ttype, hashheap)) < 0 ||
     249         177 :                     snprintf(hs->heaplink.filename, sizeof(hs->heaplink.filename), "%s.thshunil%x", nme, (unsigned) THRgettid()) >= (int) sizeof(hs->heaplink.filename) ||
     250         354 :                     snprintf(hs->heapbckt.filename, sizeof(hs->heapbckt.filename), "%s.thshunib%x", nme, (unsigned) THRgettid()) >= (int) sizeof(hs->heapbckt.filename) ||
     251         177 :                     HASHnew(hs, b->ttype, BUNlast(b), mask, BUN_NONE, false) != GDK_SUCCEED) {
     252           0 :                         GDKfree(hs);
     253             :                         hs = NULL;
     254           0 :                         GDKerror("cannot allocate hash table\n");
     255           0 :                         goto bunins_failed;
     256             :                 }
     257      229141 :                 for (i = 0; i < cnt; i++) {
     258      228964 :                         GDK_CHECK_TIMEOUT(timeoffset, counter,
     259             :                                         GOTO_LABEL_TIMEOUT_HANDLER(bunins_failed));
     260      228964 :                         o = canditer_next(&ci);
     261      229011 :                         v = VALUE(o - b->hseqbase);
     262      229011 :                         prb = HASHprobe(hs, v);
     263      229644 :                         for (hb = HASHget(hs, prb);
     264             :                              hb != BUN_NONE;
     265         597 :                              hb = HASHgetlink(hs, hb)) {
     266      172202 :                                 if (cmp == NULL || cmp(v, BUNtail(bi, hb)) == 0)
     267             :                                         break;
     268             :                         }
     269      228941 :                         if (hb == BUN_NONE) {
     270       78368 :                                 p = o - b->hseqbase;
     271       78368 :                                 if (bunfastappTYPE(oid, bn, &o) != GDK_SUCCEED)
     272           0 :                                         goto bunins_failed;
     273             :                                 /* enter into hash table */
     274       78368 :                                 HASHputlink(hs, p, HASHget(hs, prb));
     275       78307 :                                 HASHput(hs, prb, p);
     276             :                         }
     277             :                 }
     278         177 :                 HEAPfree(&hs->heaplink, true);
     279         177 :                 HEAPfree(&hs->heapbckt, true);
     280         177 :                 GDKfree(hs);
     281             :         }
     282         326 :         bat_iterator_end(&bi);
     283             : 
     284         326 :         bn->theap->dirty = true;
     285         326 :         bn->tsorted = true;
     286         326 :         bn->trevsorted = BATcount(bn) <= 1;
     287         326 :         bn->tkey = true;
     288         326 :         bn->tnil = false;
     289         326 :         bn->tnonil = true;
     290         326 :         if (BATcount(bn) == BATcount(b)) {
     291             :                 /* it turns out all values are distinct */
     292         107 :                 assert(b->tnokey[0] == 0);
     293         107 :                 assert(b->tnokey[1] == 0);
     294         107 :                 b->tkey = true;
     295         107 :                 b->batDirtydesc = true;
     296             :         }
     297         326 :         bn = virtualize(bn);
     298         326 :         MT_thread_setalgorithm(algomsg);
     299         326 :         TRC_DEBUG(ALGO, "b=" ALGOBATFMT
     300             :                   ",s=" ALGOOPTBATFMT " -> " ALGOOPTBATFMT
     301             :                   " (%s -- " LLFMT "usec)\n",
     302             :                   ALGOBATPAR(b), ALGOOPTBATPAR(s),
     303             :                   ALGOOPTBATPAR(bn), algomsg, GDKusec() - t0);
     304             :         return bn;
     305             : 
     306           0 :   bunins_failed:
     307           0 :         bat_iterator_end(&bi);
     308           0 :         if (seen)
     309           0 :                 GDKfree(seen);
     310           0 :         if (hs != NULL) {
     311           0 :                 HEAPfree(&hs->heaplink, true);
     312           0 :                 HEAPfree(&hs->heapbckt, true);
     313           0 :                 GDKfree(hs);
     314             :         }
     315           0 :         BBPreclaim(bn);
     316           0 :         return NULL;
     317             : }

Generated by: LCOV version 1.14