LCOV - code coverage report
Current view: top level - gdk - gdk_unique.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 147 174 84.5 %
Date: 2020-06-29 20:00:14 Functions: 1 1 100.0 %

          Line data    Source code
       1             : /*
       2             :  * This Source Code Form is subject to the terms of the Mozilla Public
       3             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       4             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       5             :  *
       6             :  * Copyright 1997 - July 2008 CWI, August 2008 - 2020 MonetDB B.V.
       7             :  */
       8             : 
       9             : #include "monetdb_config.h"
      10             : #include "gdk.h"
      11             : #include "gdk_private.h"
      12             : #include "gdk_calc_private.h"
      13             : 
      14             : #define VALUE(x)        (vars ? vars + VarHeapVal(vals, (x), width) : vals + (x) * width)
      15             : /* BATunique returns a bat that indicates the unique tail values of
      16             :  * the input bat.  This is essentially the same output as the
      17             :  * "extents" output of BATgroup.  The difference is that BATunique
      18             :  * does not return the grouping bat.
      19             :  *
      20             :  * The first input is the bat from which unique rows are selected, the
      21             :  * second input is an optional candidate list.
      22             :  *
      23             :  * The return value is a candidate list.
      24             :  */
      25             : BAT *
      26        1034 : BATunique(BAT *b, BAT *s)
      27             : {
      28        1034 :         BAT *bn;
      29        1034 :         BUN cnt;
      30        1034 :         const void *v;
      31        1034 :         const char *vals;
      32        1034 :         const char *vars;
      33        1034 :         int width;
      34        1034 :         oid i, o;
      35        1034 :         uint16_t *seen = NULL;
      36        1034 :         const char *nme;
      37        1034 :         Hash *hs = NULL;
      38        1034 :         BUN hb;
      39        1034 :         BATiter bi;
      40        1034 :         int (*cmp)(const void *, const void *);
      41        1034 :         bat parent;
      42        1034 :         struct canditer ci;
      43        1034 :         PROPrec *prop;
      44        1034 :         const char *algomsg = "";
      45        1034 :         lng t0 = 0;
      46             : 
      47        1034 :         TRC_DEBUG_IF(ALGO) t0 = GDKusec();
      48             : 
      49        1034 :         BATcheck(b, NULL);
      50        1034 :         cnt = canditer_init(&ci, b, s);
      51             : 
      52        1034 :         if (b->tkey || cnt <= 1 || BATtdense(b)) {
      53             :                 /* trivial: already unique */
      54         137 :                 bn = canditer_slice(&ci, 0, ci.ncand);
      55         137 :                 TRC_DEBUG(ALGO, "b=" ALGOBATFMT
      56             :                           ",s=" ALGOOPTBATFMT " -> " ALGOOPTBATFMT
      57             :                           " (already unique, slice candidates -- "
      58             :                           LLFMT "usec)\n",
      59             :                           ALGOBATPAR(b), ALGOOPTBATPAR(s),
      60             :                           ALGOOPTBATPAR(bn), GDKusec() - t0);
      61         137 :                 return bn;
      62             :         }
      63             : 
      64         897 :         if ((BATordered(b) && BATordered_rev(b)) ||
      65         833 :             (b->ttype == TYPE_void && is_oid_nil(b->tseqbase))) {
      66             :                 /* trivial: all values are the same */
      67          64 :                 bn = BATdense(0, ci.seq, 1);
      68          64 :                 TRC_DEBUG(ALGO, "b=" ALGOBATFMT
      69             :                           ",s=" ALGOOPTBATFMT " -> " ALGOOPTBATFMT
      70             :                           " (all equal -- "
      71             :                           LLFMT "usec)\n",
      72             :                           ALGOBATPAR(b), ALGOOPTBATPAR(s),
      73             :                           ALGOOPTBATPAR(bn), GDKusec() - t0);
      74          64 :                 return bn;
      75             :         }
      76             : 
      77         833 :         assert(b->ttype != TYPE_void);
      78             : 
      79         833 :         if (s == NULL && (prop = BATgetprop(b, GDK_NUNIQUE)) != NULL)
      80           4 :                 bn = COLnew(0, TYPE_oid, prop->v.val.oval, TRANSIENT);
      81             :         else
      82         829 :                 bn = COLnew(0, TYPE_oid, 1024, TRANSIENT);
      83         833 :         if (bn == NULL)
      84             :                 return NULL;
      85         833 :         vals = Tloc(b, 0);
      86         833 :         if (b->tvarsized && b->ttype)
      87          65 :                 vars = b->tvheap->base;
      88             :         else
      89             :                 vars = NULL;
      90         833 :         width = Tsize(b);
      91         833 :         cmp = ATOMcompare(b->ttype);
      92         833 :         bi = bat_iterator(b);
      93             : 
      94         833 :         if (BATordered(b) || BATordered_rev(b)) {
      95             :                 const void *prev = NULL;
      96       40465 :                 algomsg = "sorted";
      97       40465 :                 for (i = 0; i < ci.ncand; i++) {
      98       40433 :                         o = canditer_next(&ci);
      99       40433 :                         v = VALUE(o - b->hseqbase);
     100       40433 :                         if (prev == NULL || (*cmp)(v, prev) != 0) {
     101       22491 :                                 if (bunfastappTYPE(oid, bn, &o) != GDK_SUCCEED)
     102           0 :                                         goto bunins_failed;
     103             :                         }
     104       40433 :                         prev = v;
     105             :                 }
     106         801 :         } else if (ATOMbasetype(b->ttype) == TYPE_bte) {
     107          11 :                 unsigned char val;
     108             : 
     109          11 :                 algomsg = "byte-sized atoms";
     110          11 :                 assert(vars == NULL);
     111          11 :                 seen = GDKzalloc((256 / 16) * sizeof(seen[0]));
     112          11 :                 if (seen == NULL)
     113           0 :                         goto bunins_failed;
     114       20654 :                 for (i = 0; i < ci.ncand; i++) {
     115       20643 :                         o = canditer_next(&ci);
     116       20643 :                         val = ((const unsigned char *) vals)[o - b->hseqbase];
     117       20643 :                         if (!(seen[val >> 4] & (1U << (val & 0xF)))) {
     118          25 :                                 seen[val >> 4] |= 1U << (val & 0xF);
     119          25 :                                 if (bunfastappTYPE(oid, bn, &o) != GDK_SUCCEED)
     120           0 :                                         goto bunins_failed;
     121          25 :                                 if (bn->batCount == 256) {
     122             :                                         /* there cannot be more than
     123             :                                          * 256 distinct values */
     124             :                                         break;
     125             :                                 }
     126             :                         }
     127             :                 }
     128          11 :                 GDKfree(seen);
     129          11 :                 seen = NULL;
     130         790 :         } else if (ATOMbasetype(b->ttype) == TYPE_sht) {
     131         182 :                 unsigned short val;
     132             : 
     133         182 :                 algomsg = "short-sized atoms";
     134         182 :                 assert(vars == NULL);
     135         182 :                 seen = GDKzalloc((65536 / 16) * sizeof(seen[0]));
     136         182 :                 if (seen == NULL)
     137           0 :                         goto bunins_failed;
     138       19245 :                 for (i = 0; i < ci.ncand; i++) {
     139       19063 :                         o = canditer_next(&ci);
     140       19063 :                         val = ((const unsigned short *) vals)[o - b->hseqbase];
     141       19063 :                         if (!(seen[val >> 4] & (1U << (val & 0xF)))) {
     142         614 :                                 seen[val >> 4] |= 1U << (val & 0xF);
     143         614 :                                 if (bunfastappTYPE(oid, bn, &o) != GDK_SUCCEED)
     144           0 :                                         goto bunins_failed;
     145         614 :                                 if (bn->batCount == 65536) {
     146             :                                         /* there cannot be more than
     147             :                                          * 65536 distinct values */
     148             :                                         break;
     149             :                                 }
     150             :                         }
     151             :                 }
     152         182 :                 GDKfree(seen);
     153         182 :                 seen = NULL;
     154         608 :         } else if (BATcheckhash(b) ||
     155         490 :                    (!b->batTransient &&
     156         681 :                     cnt == BATcount(b) &&
     157         251 :                     BAThash(b) == GDK_SUCCEED) ||
     158         239 :                    ((parent = VIEWtparent(b)) != 0 &&
     159           0 :                     BATcheckhash(BBPdescriptor(parent)))) {
     160         369 :                 BUN lo;
     161         369 :                 oid seq;
     162             : 
     163             :                 /* we already have a hash table on b, or b is
     164             :                  * persistent and we could create a hash table, or b
     165             :                  * is a view on a bat that already has a hash table */
     166         369 :                 algomsg = "existing hash";
     167         369 :                 seq = b->hseqbase;
     168         369 :                 if (b->thash == NULL && (parent = VIEWtparent(b)) != 0) {
     169           0 :                         BAT *b2 = BBPdescriptor(parent);
     170           0 :                         lo = (BUN) ((b->theap.base - b2->theap.base) >> b->tshift);
     171           0 :                         b = b2;
     172           0 :                         bi = bat_iterator(b);
     173             :                 } else {
     174             :                         lo = 0;
     175             :                 }
     176         369 :                 hs = b->thash;
     177      459638 :                 for (i = 0; i < ci.ncand; i++) {
     178      459269 :                         BUN p;
     179             : 
     180      459269 :                         o = canditer_next(&ci);
     181      459006 :                         p = o - seq;
     182      459006 :                         v = VALUE(p);
     183      459006 :                         for (hb = HASHgetlink(hs, p + lo);
     184      483896 :                              hb != HASHnil(hs) && hb >= lo;
     185      643754 :                              hb = HASHgetlink(hs, hb)) {
     186      483896 :                                 assert(hb < p + lo);
     187      915375 :                                 if (cmp(v, BUNtail(bi, hb)) == 0 &&
     188      430678 :                                     canditer_contains(&ci, hb - lo + seq)) {
     189             :                                         /* we've seen this value
     190             :                                          * before */
     191             :                                         break;
     192             :                                 }
     193             :                         }
     194      459269 :                         if (hb == HASHnil(hs) || hb < lo) {
     195       67490 :                                 if (bunfastappTYPE(oid, bn, &o) != GDK_SUCCEED)
     196           0 :                                         goto bunins_failed;
     197             :                         }
     198             :                 }
     199             :         } else {
     200         239 :                 BUN prb;
     201         239 :                 BUN p;
     202         239 :                 BUN mask;
     203             : 
     204         239 :                 GDKclrerr();    /* not interested in BAThash errors */
     205         239 :                 algomsg = "new partial hash";
     206         239 :                 nme = BBP_physical(b->batCacheid);
     207         239 :                 if (ATOMbasetype(b->ttype) == TYPE_bte) {
     208             :                         mask = (BUN) 1 << 8;
     209             :                         cmp = NULL; /* no compare needed, "hash" is perfect */
     210         239 :                 } else if (ATOMbasetype(b->ttype) == TYPE_sht) {
     211             :                         mask = (BUN) 1 << 16;
     212             :                         cmp = NULL; /* no compare needed, "hash" is perfect */
     213             :                 } else {
     214         239 :                         mask = HASHmask(cnt);
     215         239 :                         if (mask < ((BUN) 1 << 16))
     216             :                                 mask = (BUN) 1 << 16;
     217             :                 }
     218         239 :                 if ((hs = GDKzalloc(sizeof(Hash))) == NULL) {
     219           0 :                         GDKerror("cannot allocate hash table\n");
     220           0 :                         goto bunins_failed;
     221             :                 }
     222         239 :                 if (snprintf(hs->heaplink.filename, sizeof(hs->heaplink.filename), "%s.thshunil%x", nme, (unsigned) THRgettid()) >= (int) sizeof(hs->heaplink.filename) ||
     223         478 :                     snprintf(hs->heapbckt.filename, sizeof(hs->heapbckt.filename), "%s.thshunib%x", nme, (unsigned) THRgettid()) >= (int) sizeof(hs->heapbckt.filename) ||
     224         239 :                     HASHnew(hs, b->ttype, BUNlast(b), mask, BUN_NONE, false) != GDK_SUCCEED) {
     225           0 :                         GDKfree(hs);
     226           0 :                         hs = NULL;
     227           0 :                         GDKerror("cannot allocate hash table\n");
     228           0 :                         goto bunins_failed;
     229             :                 }
     230      365649 :                 for (i = 0; i < ci.ncand; i++) {
     231      365410 :                         o = canditer_next(&ci);
     232      365134 :                         v = VALUE(o - b->hseqbase);
     233      365134 :                         prb = HASHprobe(hs, v);
     234      364875 :                         for (hb = HASHget(hs, prb);
     235      366229 :                              hb != HASHnil(hs);
     236      367583 :                              hb = HASHgetlink(hs, hb)) {
     237      158555 :                                 if (cmp == NULL || cmp(v, BUNtail(bi, hb)) == 0)
     238             :                                         break;
     239             :                         }
     240      365410 :                         if (hb == HASHnil(hs)) {
     241      207634 :                                 p = o - b->hseqbase;
     242      207634 :                                 if (bunfastappTYPE(oid, bn, &o) != GDK_SUCCEED)
     243           0 :                                         goto bunins_failed;
     244             :                                 /* enter into hash table */
     245      415268 :                                 HASHputlink(hs, p, HASHget(hs, prb));
     246      573044 :                                 HASHput(hs, prb, p);
     247             :                         }
     248             :                 }
     249         239 :                 HEAPfree(&hs->heaplink, true);
     250         239 :                 HEAPfree(&hs->heapbckt, true);
     251         239 :                 GDKfree(hs);
     252             :         }
     253             : 
     254         833 :         bn->theap.dirty = true;
     255         833 :         bn->tsorted = true;
     256         833 :         bn->trevsorted = BATcount(bn) <= 1;
     257         833 :         bn->tkey = true;
     258         833 :         bn->tnil = false;
     259         833 :         bn->tnonil = true;
     260         833 :         if (BATcount(bn) == BATcount(b)) {
     261             :                 /* it turns out all values are distinct */
     262          57 :                 assert(b->tnokey[0] == 0);
     263          57 :                 assert(b->tnokey[1] == 0);
     264          57 :                 b->tkey = true;
     265          57 :                 b->batDirtydesc = true;
     266             :         }
     267         833 :         bn = virtualize(bn);
     268         833 :         TRC_DEBUG(ALGO, "b=" ALGOBATFMT
     269             :                   ",s=" ALGOOPTBATFMT " -> " ALGOOPTBATFMT
     270             :                   " (%s -- " LLFMT "usec)\n",
     271             :                   ALGOBATPAR(b), ALGOOPTBATPAR(s),
     272             :                   ALGOOPTBATPAR(bn), algomsg, GDKusec() - t0);
     273             :         return bn;
     274             : 
     275           0 :   bunins_failed:
     276           0 :         if (seen)
     277           0 :                 GDKfree(seen);
     278           0 :         if (hs != NULL && hs != b->thash) {
     279           0 :                 HEAPfree(&hs->heaplink, true);
     280           0 :                 HEAPfree(&hs->heapbckt, true);
     281           0 :                 GDKfree(hs);
     282             :         }
     283           0 :         BBPreclaim(bn);
     284           0 :         return NULL;
     285             : }

Generated by: LCOV version 1.14