LCOV - code coverage report
Current view: top level - gdk - gdk_string.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 478 625 76.5 %
Date: 2021-10-13 02:24:04 Functions: 15 15 100.0 %

          Line data    Source code
       1             : /*
       2             :  * This Source Code Form is subject to the terms of the Mozilla Public
       3             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       4             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       5             :  *
       6             :  * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V.
       7             :  */
       8             : 
       9             : #include "monetdb_config.h"
      10             : #include "gdk.h"
      11             : #include "gdk_private.h"
      12             : #include "gdk_cand.h"
      13             : 
      14             : /* String Atom Implementation
      15             :  *
      16             :  * Strings are stored in two parts.  The first part is the normal tail
      17             :  * heap which contains a list of offsets.  The second part is the
      18             :  * theap which contains the actual strings.  The offsets in the tail
      19             :  * heap (a.k.a. offset heap) point into the theap (a.k.a. string
      20             :  * heap).  Strings are NUL-terminated and are stored without any
      21             :  * escape sequences.  Strings are encoded using the UTF-8 encoding
      22             :  * of Unicode.  This means that individual "characters" (really,
      23             :  * Unicode code points) can be between one and four bytes long.
      24             :  *
      25             :  * Because in many typical situations there are lots of duplicated
      26             :  * string values that are being stored in a table, but also in many
      27             :  * (other) typical situations there are very few duplicated string
      28             :  * values stored, a scheme has been introduced to cater to both
      29             :  * situations.
      30             :  *
      31             :  * When the string heap is "small" (defined as less than 64KiB), the
      32             :  * string heap is fully duplicate eliminated.  When the string heap
      33             :  * grows beyond this size, the heap is not kept free of duplicate
      34             :  * strings, but there is then a heuristic that tries to limit the
      35             :  * number of duplicates.
      36             :  *
      37             :  * This is done by having a fixed sized hash table at the start of the
      38             :  * string heap, and allocating space for collision lists in the first
      39             :  * 64KiB of the string heap.  After the first 64KiB no extra space is
      40             :  * allocated for lists, so hash collisions cannot be resolved.
      41             :  */
      42             : 
      43             : /* some of these macros are duplicates from gdk_atoms.c */
      44             : #define num08(x)        ((x) >= '0' && (x) <= '7')
      45             : #define base08(x)       ((x) - '0')
      46             : #define mult08(x)       ((x) << 3)
      47             : 
      48             : #define num16(x)        isxdigit((unsigned char) (x))
      49             : #define base16(x)       (((x) >= 'a' && (x) <= 'f') ? ((x) - 'a' + 10) : ((x) >= 'A' && (x) <= 'F') ? ((x) - 'A' + 10) : (x) - '0')
      50             : #define mult16(x)       ((x) << 4)
      51             : 
      52             : #define atommem(size)                                   \
      53             :         do {                                            \
      54             :                 if (*dst == NULL || *len < (size)) { \
      55             :                         GDKfree(*dst);                  \
      56             :                         *len = (size);                  \
      57             :                         *dst = GDKmalloc(*len);         \
      58             :                         if (*dst == NULL) {             \
      59             :                                 *len = 0;               \
      60             :                                 return -1;              \
      61             :                         }                               \
      62             :                 }                                       \
      63             :         } while (0)
      64             : 
      65             : const char str_nil[2] = { '\200', 0 };
      66             : 
      67             : gdk_return
      68     1053130 : strHeap(Heap *d, size_t cap)
      69             : {
      70             :         size_t size;
      71             : 
      72     1053130 :         cap = MAX(cap, BATTINY);
      73     1053130 :         size = GDK_STRHASHTABLE * sizeof(stridx_t) + MIN(GDK_ELIMLIMIT, cap * GDK_VARALIGN);
      74     1053130 :         return HEAPalloc(d, size, 1, 1);
      75             : }
      76             : 
      77             : 
      78             : void
      79        3697 : strCleanHash(Heap *h, bool rebuild)
      80             : {
      81             :         stridx_t newhash[GDK_STRHASHTABLE];
      82             :         size_t pad, pos;
      83             :         BUN off, strhash;
      84             :         const char *s;
      85             : 
      86             :         (void) rebuild;
      87        3697 :         if (!h->cleanhash)
      88         610 :                 return;
      89             :         /* rebuild hash table for double elimination
      90             :          *
      91             :          * If appending strings to the BAT was aborted, if the heap
      92             :          * was memory mapped, the hash in the string heap may well be
      93             :          * incorrect.  Therefore we don't trust it when we read in a
      94             :          * string heap and we rebuild the complete table (it is small,
      95             :          * so this won't take any time at all).
      96             :          * Note that we will only do this the first time the heap is
      97             :          * loaded, and only for heaps that existed when the server was
      98             :          * started. */
      99        3087 :         memset(newhash, 0, sizeof(newhash));
     100             :         pos = GDK_STRHASHSIZE;
     101      243033 :         while (pos < h->free) {
     102      240052 :                 pad = GDK_VARALIGN - (pos & (GDK_VARALIGN - 1));
     103      240052 :                 if (pad < sizeof(stridx_t))
     104      203870 :                         pad += GDK_VARALIGN;
     105      240052 :                 pos += pad;
     106      240052 :                 if (pos >= GDK_ELIMLIMIT)
     107             :                         break;
     108      239946 :                 s = h->base + pos;
     109      239946 :                 strhash = strHash(s);
     110      239946 :                 off = strhash & GDK_STRHASHMASK;
     111      239946 :                 newhash[off] = (stridx_t) (pos - sizeof(stridx_t));
     112      239946 :                 pos += strlen(s) + 1;
     113             :         }
     114             :         /* only set dirty flag if the hash table actually changed */
     115        3087 :         if (memcmp(newhash, h->base, sizeof(newhash)) != 0) {
     116         386 :                 memcpy(h->base, newhash, sizeof(newhash));
     117         386 :                 if (h->storage == STORE_MMAP) {
     118          31 :                         if (!(GDKdebug & NOSYNCMASK))
     119           0 :                                 (void) MT_msync(h->base, GDK_STRHASHSIZE);
     120             :                 } else
     121         355 :                         h->dirty = true;
     122             :         }
     123             : #ifndef NDEBUG
     124        3087 :         if (GDK_ELIMDOUBLES(h)) {
     125             :                 pos = GDK_STRHASHSIZE;
     126      170002 :                 while (pos < h->free) {
     127      167024 :                         pad = GDK_VARALIGN - (pos & (GDK_VARALIGN - 1));
     128      167024 :                         if (pad < sizeof(stridx_t))
     129      136679 :                                 pad += GDK_VARALIGN;
     130      167024 :                         pos += pad;
     131      167024 :                         s = h->base + pos;
     132      167024 :                         assert(strLocate(h, s) != 0);
     133      167022 :                         pos += strlen(s) + 1;
     134             :                 }
     135             :         }
     136             : #endif
     137        3085 :         h->cleanhash = false;
     138             : }
     139             : 
     140             : /*
     141             :  * The strPut routine. The routine strLocate can be used to identify
     142             :  * the location of a string in the heap if it exists. Otherwise it
     143             :  * returns (var_t) -2 (-1 is reserved for error).
     144             :  */
     145             : var_t
     146      180814 : strLocate(Heap *h, const char *v)
     147             : {
     148             :         stridx_t *ref, *next;
     149             : 
     150             :         /* search hash-table, if double-elimination is still in place */
     151             :         BUN off;
     152      180814 :         if (h->free == 0) {
     153             :                 /* empty, so there are no strings */
     154             :                 return (var_t) -2;
     155             :         }
     156             : 
     157      180814 :         off = strHash(v);
     158      180814 :         off &= GDK_STRHASHMASK;
     159             : 
     160             :         /* should only use strLocate iff fully double eliminated */
     161      180814 :         assert(GDK_ELIMBASE(h->free) == 0);
     162             : 
     163             :         /* search the linked list */
     164      214986 :         for (ref = ((stridx_t *) h->base) + off; *ref; ref = next) {
     165      213585 :                 next = (stridx_t *) (h->base + *ref);
     166      213585 :                 if (strcmp(v, (str) (next + 1)) == 0)
     167      179413 :                         return (var_t) ((sizeof(stridx_t) + *ref));     /* found */
     168             :         }
     169             :         return (var_t) -2;
     170             : }
     171             : 
     172             : var_t
     173    75300333 : strPut(BAT *b, var_t *dst, const void *V)
     174             : {
     175             :         const char *v = V;
     176    75300333 :         Heap *h = b->tvheap;
     177             :         size_t pad;
     178    75300333 :         size_t pos, len = strlen(v) + 1;
     179             :         stridx_t *bucket;
     180             :         BUN off;
     181             : 
     182    75300333 :         if (h->free == 0) {
     183      219705 :                 if (h->size < GDK_STRHASHTABLE * sizeof(stridx_t) + BATTINY * GDK_VARALIGN) {
     184           0 :                         if (HEAPgrow(&b->theaplock, &b->tvheap, GDK_STRHASHTABLE * sizeof(stridx_t) + BATTINY * GDK_VARALIGN, true) != GDK_SUCCEED) {
     185             :                                 return (var_t) -1;
     186             :                         }
     187           0 :                         h = b->tvheap;
     188             :                 }
     189      219705 :                 h->free = GDK_STRHASHTABLE * sizeof(stridx_t);
     190      219705 :                 h->dirty = true;
     191             : #ifdef NDEBUG
     192             :                 memset(h->base, 0, h->free);
     193             : #else
     194             :                 /* fill should solve initialization problems within valgrind */
     195      219705 :                 memset(h->base, 0, h->size);
     196             : #endif
     197             :         }
     198             : 
     199    75300333 :         off = strHash(v);
     200    75300333 :         off &= GDK_STRHASHMASK;
     201    75300333 :         bucket = ((stridx_t *) h->base) + off;
     202             : 
     203    75300333 :         if (*bucket) {
     204             :                 /* the hash list is not empty */
     205    73004372 :                 if (*bucket < GDK_ELIMLIMIT) {
     206             :                         /* small string heap (<64KiB) -- fully double
     207             :                          * eliminated: search the linked list */
     208             :                         const stridx_t *ref = bucket;
     209             : 
     210             :                         do {
     211    40140308 :                                 pos = *ref + sizeof(stridx_t);
     212    40140308 :                                 if (strcmp(v, h->base + pos) == 0) {
     213             :                                         /* found */
     214    37106868 :                                         return *dst = (var_t) pos;
     215             :                                 }
     216     3033440 :                                 ref = (stridx_t *) (h->base + *ref);
     217     3033440 :                         } while (*ref);
     218             :                 } else {
     219             :                         /* large string heap (>=64KiB) -- there is no
     220             :                          * linked list, so only look at single
     221             :                          * entry */
     222             :                         pos = *bucket;
     223    34874586 :                         if (strcmp(v, h->base + pos) == 0) {
     224             :                                 /* already in heap: reuse */
     225      768647 :                                 return *dst = (var_t) pos;
     226             :                         }
     227             :                 }
     228             :         }
     229             :         /* the string was not found in the heap, we need to enter it */
     230             : 
     231             :         /* check that string is correctly encoded UTF-8; there was no
     232             :          * need to do this earlier: if the string was found above, it
     233             :          * must have gone through here in the past */
     234             : #ifndef NDEBUG
     235    37424818 :         if (!checkUTF8(v)) {
     236           0 :                 GDKerror("incorrectly encoded UTF-8\n");
     237           0 :                 return (var_t) -1;
     238             :         }
     239             : #endif
     240             : 
     241    37471440 :         pad = GDK_VARALIGN - (h->free & (GDK_VARALIGN - 1));
     242    37471440 :         if (GDK_ELIMBASE(h->free + pad) == 0) {      /* i.e. h->free+pad < GDK_ELIMLIMIT */
     243     2596268 :                 if (pad < sizeof(stridx_t)) {
     244             :                         /* make room for hash link */
     245     2103691 :                         pad += GDK_VARALIGN;
     246             :                 }
     247    34875172 :         } else if (GDK_ELIMBASE(h->free) != 0) {
     248             :                 /* no extra padding needed when no hash links needed
     249             :                  * (but only when padding doesn't cross duplicate
     250             :                  * elimination boundary) */
     251             :                 pad = 0;
     252             :         }
     253             : 
     254             :         /* check heap for space (limited to a certain maximum after
     255             :          * which an error is returned) */
     256    37471440 :         if (h->free + pad + len >= h->size) {
     257        8220 :                 size_t newsize = MAX(h->size, 4096);
     258             : 
     259             :                 /* double the heap size until we have enough space */
     260             :                 do {
     261        8253 :                         if (newsize < 4 * 1024 * 1024)
     262        7873 :                                 newsize <<= 1;
     263             :                         else
     264         380 :                                 newsize += 4 * 1024 * 1024;
     265        8253 :                 } while (newsize <= h->free + pad + len);
     266             : 
     267        8220 :                 assert(newsize);
     268             : 
     269        8220 :                 if (h->free + pad + len >= (size_t) VAR_MAX) {
     270           0 :                         GDKerror("string heap gets larger than %zuGiB.\n", (size_t) VAR_MAX >> 30);
     271           0 :                         return (var_t) -1;
     272             :                 }
     273        8220 :                 TRC_DEBUG(HEAP, "HEAPextend in strPut %s %zu %zu\n", h->filename, h->size, newsize);
     274        8220 :                 if (HEAPgrow(&b->theaplock, &b->tvheap, newsize, true) != GDK_SUCCEED) {
     275             :                         return (var_t) -1;
     276             :                 }
     277        8220 :                 h = b->tvheap;
     278             : 
     279             :                 /* make bucket point into the new heap */
     280        8220 :                 bucket = ((stridx_t *) h->base) + off;
     281             :         }
     282             : 
     283             :         /* insert string */
     284    37471440 :         pos = h->free + pad;
     285    37471440 :         *dst = (var_t) pos;
     286    37471440 :         if (pad > 0)
     287     2594527 :                 memset(h->base + h->free, 0, pad);
     288    37471440 :         memcpy(h->base + pos, v, len);
     289    37471440 :         h->free += pad + len;
     290    37471440 :         h->dirty = true;
     291             : 
     292             :         /* maintain hash table */
     293    37471440 :         if (GDK_ELIMBASE(pos) == 0) {   /* small string heap: link the next pointer */
     294             :                 /* the stridx_t next pointer directly precedes the
     295             :                  * string */
     296     2593308 :                 pos -= sizeof(stridx_t);
     297     2593308 :                 *(stridx_t *) (h->base + pos) = *bucket;
     298             :         }
     299    37471440 :         *bucket = (stridx_t) pos;       /* set bucket to the new string */
     300             : 
     301    37471440 :         return *dst;
     302             : }
     303             : 
     304             : /*
     305             :  * Convert a string containing backslash escape sequences to a GDK
     306             :  * string value, checking that the result is correct UTF-8.
     307             :  */
     308             : 
     309             : #ifdef __GNUC__
     310             : /* __builtin_expect returns its first argument; it is expected to be
     311             :  * equal to the second argument */
     312             : #define unlikely(expr)  __builtin_expect((expr) != 0, 0)
     313             : #define likely(expr)    __builtin_expect((expr) != 0, 1)
     314             : #else
     315             : #define unlikely(expr)  (expr)
     316             : #define likely(expr)    (expr)
     317             : #endif
     318             : 
     319             : ssize_t
     320   356581418 : GDKstrFromStr(unsigned char *restrict dst, const unsigned char *restrict src, ssize_t len)
     321             : {
     322             :         unsigned char *p = dst;
     323   356581418 :         const unsigned char *cur = src, *end = src + len;
     324             :         bool escaped = false;
     325             :         int mask = 0, n, c, utf8char = 0;
     326             : 
     327   547580293 :         if (len >= 2 && strNil((const char *) src)) {
     328           0 :                 strcpy((char *) dst, str_nil);
     329           0 :                 return 1;
     330             :         }
     331             : 
     332             :         /* copy it in, while performing the correct escapes */
     333             :         /* n is the number of follow-on bytes left in a multi-byte
     334             :          * UTF-8 sequence */
     335  1996090088 :         for (cur = src, n = 0; cur < end || escaped; cur++) {
     336             :                 /* first convert any \ escapes and store value in c */
     337  1639508671 :                 if (escaped) {
     338      550863 :                         switch (*cur) {
     339        3819 :                         case '0':
     340             :                         case '1':
     341             :                         case '2':
     342             :                         case '3':
     343             :                         case '4':
     344             :                         case '5':
     345             :                         case '6':
     346             :                         case '7':
     347             :                                 /* \ with up to three octal digits */
     348        3819 :                                 c = base08(*cur);
     349        3819 :                                 if (num08(cur[1])) {
     350        3819 :                                         cur++;
     351        3819 :                                         c = mult08(c) + base08(*cur);
     352        3819 :                                         if (num08(cur[1])) {
     353        3819 :                                                 if (unlikely(c > 037)) {
     354             :                                                         /* octal
     355             :                                                          * escape
     356             :                                                          * sequence
     357             :                                                          * out of
     358             :                                                          * range */
     359           1 :                                                         GDKerror("not an octal number\n");
     360           1 :                                                         return -1;
     361             :                                                 }
     362        3818 :                                                 cur++;
     363        3818 :                                                 c = mult08(c) + base08(*cur);
     364        3818 :                                                 assert(c >= 0 && c <= 0377);
     365             :                                         }
     366             :                                 }
     367             :                                 break;
     368           1 :                         case 'x':
     369             :                                 /* \x with one or two hexadecimal digits */
     370           1 :                                 if (num16(cur[1])) {
     371           1 :                                         cur++;
     372           1 :                                         c = base16(*cur);
     373           1 :                                         if (num16(cur[1])) {
     374           1 :                                                 cur++;
     375           1 :                                                 c = mult16(c) + base16(*cur);
     376             :                                         }
     377             :                                 } else
     378             :                                         c = 'x';
     379             :                                 break;
     380           0 :                         case 'u':
     381             :                         case 'U':
     382             :                                 /* \u with four hexadecimal digits or
     383             :                                  * \U with eight hexadecimal digits */
     384           0 :                                 if (unlikely(n > 0)) {
     385             :                                         /* not when in the middle of a
     386             :                                          * UTF-8 sequence */
     387           0 :                                         goto notutf8;
     388             :                                 }
     389             :                                 c = 0;
     390           0 :                                 for (n = *cur == 'U' ? 8 : 4; n > 0; n--) {
     391           0 :                                         cur++;
     392           0 :                                         if (unlikely(!num16(*cur))) {
     393           0 :                                                 GDKerror("not a Unicode code point escape\n");
     394           0 :                                                 return -1;
     395             :                                         }
     396           0 :                                         c = c << 4 | base16(*cur);
     397             :                                 }
     398             :                                 /* n == 0 now */
     399           0 :                                 if (unlikely(c == 0 || c > 0x10FFFF ||
     400             :                                              (c & 0xFFF800) == 0xD800)) {
     401           0 :                                         GDKerror("illegal Unicode code point\n");
     402           0 :                                         return -1;
     403             :                                 }
     404           0 :                                 if (c < 0x80) {
     405           0 :                                         *p++ = (unsigned char) c;
     406             :                                 } else {
     407           0 :                                         if (c < 0x800) {
     408           0 :                                                 *p++ = 0xC0 | (c >> 6);
     409             :                                         } else {
     410           0 :                                                 if (c < 0x10000) {
     411           0 :                                                         *p++ = 0xE0 | (c >> 12);
     412             :                                                 } else {
     413           0 :                                                         *p++ = 0xF0 | (c >> 18);
     414           0 :                                                         *p++ = 0x80 | ((c >> 12) & 0x3F);
     415             :                                                 }
     416           0 :                                                 *p++ = 0x80 | ((c >> 6) & 0x3F);
     417             :                                         }
     418           0 :                                         *p++ = 0x80 | (c & 0x3F);
     419             :                                 }
     420             :                                 escaped = false;
     421           0 :                                 continue;
     422             :                         case 'a':
     423             :                                 c = '\a';
     424             :                                 break;
     425           0 :                         case 'b':
     426             :                                 c = '\b';
     427           0 :                                 break;
     428           3 :                         case 'f':
     429             :                                 c = '\f';
     430           3 :                                 break;
     431        6499 :                         case 'n':
     432             :                                 c = '\n';
     433        6499 :                                 break;
     434          10 :                         case 'r':
     435             :                                 c = '\r';
     436          10 :                                 break;
     437        1985 :                         case 't':
     438             :                                 c = '\t';
     439        1985 :                                 break;
     440           0 :                         case '\0':
     441             :                                 c = '\\';
     442           0 :                                 break;
     443      538546 :                         case '\'':
     444             :                         case '\\':
     445             :                                 /* \' and \\ can be handled by the
     446             :                                  * default case */
     447             :                         default:
     448             :                                 /* unrecognized \ escape, just copy
     449             :                                  * the backslashed character */
     450             :                                 c = *cur;
     451      538546 :                                 break;
     452             :                         }
     453             :                         escaped = false;
     454  1638957808 :                 } else if ((c = *cur) == '\\') {
     455             :                         escaped = true;
     456      550863 :                         continue;
     457             : #if 0
     458             :                 } else if (c == quote && cur[1] == quote) {
     459             :                         assert(c != 0);
     460             :                         if (unlikely(n > 0))
     461             :                                 goto notutf8;
     462             :                         *p++ = quote;
     463             :                         cur++;
     464             :                         continue;
     465             : #endif
     466             :                 }
     467             : 
     468  1638957807 :                 if (n > 0) {
     469             :                         /* we're still expecting follow-up bytes in a
     470             :                          * UTF-8 sequence */
     471       51181 :                         if (unlikely((c & 0xC0) != 0x80)) {
     472             :                                 /* incorrect UTF-8 sequence: byte is
     473             :                                  * not 10xxxxxx */
     474           0 :                                 goto notutf8;
     475             :                         }
     476       51181 :                         utf8char = (utf8char << 6) | (c & 0x3F);
     477       51181 :                         n--;
     478       51181 :                         if (n == 0) {
     479             :                                 /* this was the last byte in the sequence */
     480       26414 :                                 if (unlikely((utf8char & mask) == 0)) {
     481             :                                         /* incorrect UTF-8 sequence:
     482             :                                          * not shortest possible */
     483           0 :                                         goto notutf8;
     484             :                                 }
     485       26414 :                                 if (unlikely(utf8char > 0x10FFFF)) {
     486             :                                         /* incorrect UTF-8 sequence:
     487             :                                          * value too large */
     488           0 :                                         goto notutf8;
     489             :                                 }
     490       26414 :                                 if (unlikely((utf8char & 0x1FFF800) == 0xD800)) {
     491             :                                         /* incorrect UTF-8 sequence:
     492             :                                          * low or high surrogate
     493             :                                          * encoded as UTF-8 */
     494           0 :                                         goto notutf8;
     495             :                                 }
     496             :                         }
     497  1638906626 :                 } else if ((c & 0x80) == 0) {
     498             :                         ;
     499       26414 :                 } else if ((c & 0xE0) == 0xC0) {
     500             :                         n = 1;
     501             :                         mask = 0x000780;
     502        1657 :                         utf8char = c & 0x1F;
     503       24757 :                 } else if ((c & 0xF0) == 0xE0) {
     504             :                         n = 2;
     505             :                         mask = 0x00F800;
     506       24747 :                         utf8char = c & 0x0F;
     507          10 :                 } else if ((c & 0xF8) == 0xF0) {
     508             :                         n = 3;
     509             :                         mask = 0x1F0000;
     510          10 :                         utf8char = c & 0x07;
     511             :                 } else {
     512             :                         /* incorrect UTF-8 sequence */
     513           0 :                         goto notutf8;
     514             :                 }
     515  1638957807 :                 *p++ = c;
     516             :         }
     517   356581417 :         if (unlikely(n > 0)) {
     518             :                 /* incomplete UTF-8 sequence */
     519           0 :                 goto notutf8;
     520             :         }
     521   356581417 :         *p++ = 0;
     522   356581417 :         return len;
     523           0 :   notutf8:
     524           0 :         GDKerror("not a proper UTF-8 sequence\n");
     525           0 :         return -1;
     526             : }
     527             : 
     528             : ssize_t
     529    23452847 : strFromStr(const char *restrict src, size_t *restrict len, char **restrict dst, bool external)
     530             : {
     531             :         const char *cur = src, *start = NULL;
     532             :         size_t l = 1;
     533             :         bool escaped = false;
     534             : 
     535    23452847 :         if (!external) {
     536    23452845 :                 size_t sz = strLen(src);
     537    23452845 :                 atommem(sz);
     538    23452846 :                 return (ssize_t) strcpy_len(*dst, src, sz);
     539             :         }
     540             : 
     541           2 :         if (strNil(src)) {
     542           0 :                 atommem(2);
     543           0 :                 strcpy(*dst, str_nil);
     544           0 :                 return 1;
     545             :         }
     546             : 
     547           2 :         while (GDKisspace(*cur))
     548           0 :                 cur++;
     549           2 :         if (*cur != '"') {
     550           0 :                 if (strncmp(cur, "nil", 3) == 0) {
     551           0 :                         atommem(2);
     552           0 :                         strcpy(*dst, str_nil);
     553           0 :                         return (ssize_t) (cur - src) + 3;
     554             :                 }
     555           0 :                 GDKerror("not a quoted string\n");
     556           0 :                 return -1;
     557             :         }
     558             : 
     559             :         /* scout the string to find out its length and whether it was
     560             :          * properly quoted */
     561          17 :         for (start = ++cur; *cur != '"' || escaped; cur++) {
     562          15 :                 if (*cur == 0) {
     563           0 :                         GDKerror("no closing quotes\n");
     564           0 :                         return -1;
     565          15 :                 } else if (*cur == '\\' && !escaped) {
     566             :                         escaped = true;
     567             :                 } else {
     568             :                         escaped = false;
     569          15 :                         l++;
     570             :                 }
     571             :         }
     572             : 
     573             :         /* alloc new memory */
     574           2 :         if (*dst == NULL || *len < l) {
     575           2 :                 GDKfree(*dst);
     576           2 :                 *dst = GDKmalloc(*len = l);
     577           2 :                 if (*dst == NULL) {
     578           0 :                         *len = 0;
     579           0 :                         return -1;
     580             :                 }
     581             :         }
     582             : 
     583           2 :         return GDKstrFromStr((unsigned char *) *dst,
     584             :                              (const unsigned char *) start,
     585             :                              (ssize_t) (cur - start));
     586             : }
     587             : 
     588             : /*
     589             :  * Convert a GDK string value to something printable.
     590             :  */
     591             : /* all but control characters (in range 0 to 31) and DEL */
     592             : #define printable_chr(ch)       ((' ' <= (ch) && (ch) <= '~') || ((ch) & 0x80) != 0)
     593             : 
     594             : size_t
     595     9618166 : escapedStrlen(const char *restrict src, const char *sep1, const char *sep2, int quote)
     596             : {
     597             :         size_t end, sz = 0;
     598             :         size_t sep1len, sep2len;
     599             : 
     600     9618166 :         sep1len = sep1 ? strlen(sep1) : 0;
     601     9618166 :         sep2len = sep2 ? strlen(sep2) : 0;
     602   346933575 :         for (end = 0; src[end]; end++)
     603   337315409 :                 if (src[end] == '\\'
     604   337306185 :                     || src[end] == quote
     605   337203796 :                     || (sep1len && strncmp(src + end, sep1, sep1len) == 0)
     606   337203779 :                     || (sep2len && strncmp(src + end, sep2, sep2len) == 0)) {
     607      111630 :                         sz += 2;
     608   337203779 :                 } else if (src[end] == (char) '\302' &&
     609           2 :                            0200 <= ((int) src[end + 1] & 0377) &&
     610           2 :                            ((int) src[end + 1] & 0377) <= 0237) {
     611             :                         /* Unicode control character (code point range
     612             :                          * U-00000080 through U-0000009F encoded in
     613             :                          * UTF-8 */
     614             :                         /* for the first one of the two UTF-8 bytes we
     615             :                          * count a width of 7 and for the second one
     616             :                          * 1, together that's 8, i.e. the width of two
     617             :                          * backslash-escaped octal coded characters */
     618           0 :                         sz += 7;
     619   337203779 :                 } else if (!printable_chr(src[end])) {
     620       34736 :                         sz += 4;
     621             :                 } else {
     622   337169043 :                         sz++;
     623             :                 }
     624     9618166 :         return sz;
     625             : }
     626             : 
     627             : size_t
     628     4812122 : escapedStr(char *restrict dst, const char *restrict src, size_t dstlen, const char *sep1, const char *sep2, int quote)
     629             : {
     630             :         size_t cur = 0, l = 0;
     631             :         size_t sep1len, sep2len;
     632             : 
     633     4812122 :         sep1len = sep1 ? strlen(sep1) : 0;
     634     4812122 :         sep2len = sep2 ? strlen(sep2) : 0;
     635   173554409 :         for (; src[cur] && l < dstlen; cur++)
     636   168742287 :                 if (!printable_chr(src[cur])
     637   168724355 :                     || (src[cur] == '\302'
     638           1 :                         && 0200 <= (src[cur + 1] & 0377)
     639           1 :                         && ((int) src[cur + 1] & 0377) <= 0237)
     640   168724355 :                     || (cur > 0
     641   163923394 :                         && src[cur - 1] == '\302'
     642           1 :                         && 0200 <= (src[cur] & 0377)
     643           1 :                         && (src[cur] & 0377) <= 0237)) {
     644       17932 :                         dst[l++] = '\\';
     645       17932 :                         switch (src[cur]) {
     646         802 :                         case '\t':
     647         802 :                                 dst[l++] = 't';
     648         802 :                                 break;
     649       17125 :                         case '\n':
     650       17125 :                                 dst[l++] = 'n';
     651       17125 :                                 break;
     652           3 :                         case '\r':
     653           3 :                                 dst[l++] = 'r';
     654           3 :                                 break;
     655           2 :                         case '\f':
     656           2 :                                 dst[l++] = 'f';
     657           2 :                                 break;
     658           0 :                         default:
     659           0 :                                 snprintf(dst + l, dstlen - l, "%03o", (unsigned char) src[cur]);
     660           0 :                                 l += 3;
     661           0 :                                 break;
     662             :                         }
     663   168724355 :                 } else if (src[cur] == '\\'
     664   168719383 :                            || src[cur] == quote
     665   168658625 :                            || (sep1len && strncmp(src + cur, sep1, sep1len) == 0)
     666   168658625 :                            || (sep2len && strncmp(src + cur, sep2, sep2len) == 0)) {
     667       65730 :                         dst[l++] = '\\';
     668       65730 :                         dst[l++] = src[cur];
     669             :                 } else {
     670   168658625 :                         dst[l++] = src[cur];
     671             :                 }
     672     4812122 :         assert(l < dstlen);
     673     4812122 :         dst[l] = 0;
     674     4812122 :         return l;
     675             : }
     676             : 
     677             : ssize_t
     678        6116 : strToStr(char **restrict dst, size_t *restrict len, const char *restrict src, bool external)
     679             : {
     680             :         size_t sz;
     681             : 
     682        6116 :         if (!external) {
     683         294 :                 sz = strLen(src);
     684         294 :                 atommem(sz);
     685         294 :                 return (ssize_t) strcpy_len(*dst, src, sz);
     686             :         }
     687        5822 :         if (strNil(src)) {
     688          59 :                 atommem(4);
     689          59 :                 strcpy(*dst, "nil");
     690          59 :                 return 3;
     691             :         } else {
     692             :                 ssize_t l = 0;
     693        5763 :                 size_t sz = escapedStrlen(src, NULL, NULL, '"');
     694             : 
     695        5763 :                 atommem(sz + 3);
     696        5763 :                 l = (ssize_t) escapedStr((*dst) + 1, src, *len - 1, NULL, NULL, '"');
     697        5763 :                 l++;
     698        5763 :                 (*dst)[0] = (*dst)[l++] = '"';
     699        5763 :                 (*dst)[l] = 0;
     700        5763 :                 return l;
     701             :         }
     702             : }
     703             : 
     704             : str
     705          85 : strRead(str a, size_t *dstlen, stream *s, size_t cnt)
     706             : {
     707             :         int len;
     708             : 
     709             :         (void) cnt;
     710          85 :         assert(cnt == 1);
     711          85 :         if (mnstr_readInt(s, &len) != 1 || len < 0)
     712             :                 return NULL;
     713          85 :         if (a == NULL || *dstlen < (size_t) len + 1) {
     714           0 :                 if ((a = GDKrealloc(a, len + 1)) == NULL)
     715             :                         return NULL;
     716           0 :                 *dstlen = len + 1;
     717             :         }
     718          85 :         if (len && mnstr_read(s, a, len, 1) != 1) {
     719           0 :                 GDKfree(a);
     720           0 :                 return NULL;
     721             :         }
     722          85 :         a[len] = 0;
     723          85 :         return a;
     724             : }
     725             : 
     726             : gdk_return
     727          85 : strWrite(const char *a, stream *s, size_t cnt)
     728             : {
     729          85 :         size_t len = strlen(a);
     730             : 
     731             :         (void) cnt;
     732          85 :         assert(cnt == 1);
     733          85 :         if (!checkUTF8(a)) {
     734           0 :                 GDKerror("incorrectly encoded UTF-8\n");
     735           0 :                 return GDK_FAIL;
     736             :         }
     737          85 :         if (mnstr_writeInt(s, (int) len) && mnstr_write(s, a, len, 1) == 1)
     738             :                 return GDK_SUCCEED;
     739             :         else
     740           0 :                 return GDK_FAIL;
     741             : }
     742             : 
     743             : static gdk_return
     744          70 : concat_strings(BAT **bnp, ValPtr pt, BAT *b, oid seqb,
     745             :                BUN ngrp, struct canditer *restrict ci, BUN ncand,
     746             :                const oid *restrict gids, oid min, oid max, bool skip_nils,
     747             :                BAT *sep, const char *restrict separator, BUN *has_nils)
     748             : {
     749             :         oid gid;
     750             :         BUN i, p, nils = 0;
     751             :         size_t *restrict lengths = NULL, *restrict lastseplength = NULL, separator_length = 0, next_length;
     752             :         str *restrict astrings = NULL, s, sl;
     753          70 :         BATiter bi, bis = (BATiter) {0};
     754             :         BAT *bn = NULL;
     755             :         gdk_return rres = GDK_SUCCEED;
     756             : 
     757             :         /* exactly one of bnp and pt must be NULL, the other non-NULL */
     758          70 :         assert((bnp == NULL) != (pt == NULL));
     759             :         /* if pt not NULL, only a single group allowed */
     760          70 :         assert(pt == NULL || ngrp == 1);
     761             : 
     762          70 :         bi = bat_iterator(b);
     763          70 :         if (sep)
     764          26 :                 bis = bat_iterator(sep);
     765             :         else
     766          44 :                 separator_length = strlen(separator);
     767             : 
     768          70 :         if (bnp) {
     769          29 :                 if ((bn = COLnew(min, TYPE_str, ngrp, TRANSIENT)) == NULL) {
     770             :                         rres = GDK_FAIL;
     771           0 :                         goto finish;
     772             :                 }
     773          29 :                 *bnp = bn;
     774             :         }
     775             : 
     776          70 :         if (ngrp == 1) {
     777             :                 size_t offset = 0, single_length = 0;
     778             :                 bool empty = true;
     779             : 
     780          47 :                 if (separator) {
     781         410 :                         for (i = 0; i < ncand; i++) {
     782         381 :                                 p = canditer_next(ci) - seqb;
     783         381 :                                 s = BUNtvar(bi, p);
     784         381 :                                 if (strNil(s)) {
     785          15 :                                         if (!skip_nils) {
     786             :                                                 nils = 1;
     787             :                                                 break;
     788             :                                         }
     789             :                                 } else {
     790         366 :                                         single_length += strlen(s);
     791         366 :                                         if (!empty)
     792         339 :                                                 single_length += separator_length;
     793             :                                         empty = false;
     794             :                                 }
     795             :                         }
     796             :                 } else { /* sep case */
     797          18 :                         assert(sep != NULL);
     798         168 :                         for (i = 0; i < ncand; i++) {
     799         150 :                                 p = canditer_next(ci) - seqb;
     800         150 :                                 s = BUNtvar(bi, p);
     801         150 :                                 sl = BUNtvar(bis, p);
     802         150 :                                 if (strNil(s)) {
     803           4 :                                         if (!skip_nils) {
     804             :                                                 nils = 1;
     805             :                                                 break;
     806             :                                         }
     807             :                                 } else {
     808         146 :                                         single_length += strlen(s);
     809         146 :                                         if (!empty) {
     810         128 :                                                 if (strNil(sl)) {
     811          23 :                                                         if (!skip_nils) {
     812             :                                                                 nils = 1;
     813             :                                                                 break;
     814             :                                                         }
     815             :                                                 } else
     816         105 :                                                         single_length += strlen(sl);
     817             :                                         }
     818             :                                         empty = false;
     819             :                                 }
     820             :                         }
     821             :                 }
     822          47 :                 canditer_reset(ci);
     823             : 
     824          47 :                 if (nils == 0 && !empty) {
     825             :                         char *single_str = NULL;
     826             : 
     827          45 :                         if ((single_str = GDKmalloc(single_length + 1)) == NULL) {
     828           0 :                                 bat_iterator_end(&bi);
     829           0 :                                 if (sep)
     830           0 :                                         bat_iterator_end(&bis);
     831           0 :                                 return GDK_FAIL;
     832             :                         }
     833             :                         empty = true;
     834          45 :                         if (separator) {
     835         395 :                                 for (i = 0; i < ncand; i++) {
     836         368 :                                         p = canditer_next(ci) - seqb;
     837         368 :                                         s = BUNtvar(bi, p);
     838         368 :                                         if (strNil(s))
     839           2 :                                                 continue;
     840         366 :                                         if (!empty) {
     841         339 :                                                 memcpy(single_str + offset, separator, separator_length);
     842         339 :                                                 offset += separator_length;
     843             :                                         }
     844         366 :                                         next_length = strlen(s);
     845         366 :                                         memcpy(single_str + offset, s, next_length);
     846         366 :                                         offset += next_length;
     847             :                                         empty = false;
     848             :                                 }
     849             :                         } else { /* sep case */
     850          18 :                                 assert(sep != NULL);
     851         168 :                                 for (i = 0; i < ncand; i++) {
     852         150 :                                         p = canditer_next(ci) - seqb;
     853         150 :                                         s = BUNtvar(bi, p);
     854         150 :                                         sl = BUNtvar(bis, p);
     855         150 :                                         if (strNil(s))
     856           4 :                                                 continue;
     857         274 :                                         if (!empty && !strNil(sl)) {
     858         105 :                                                 next_length = strlen(sl);
     859         105 :                                                 memcpy(single_str + offset, sl, next_length);
     860         105 :                                                 offset += next_length;
     861             :                                         }
     862         146 :                                         next_length = strlen(s);
     863         146 :                                         memcpy(single_str + offset, s, next_length);
     864         146 :                                         offset += next_length;
     865             :                                         empty = false;
     866             :                                 }
     867             :                         }
     868             : 
     869          45 :                         single_str[offset] = '\0';
     870          45 :                         if (bn) {
     871           6 :                                 if (BUNappend(bn, single_str, false) != GDK_SUCCEED) {
     872           0 :                                         GDKfree(single_str);
     873           0 :                                         bat_iterator_end(&bi);
     874           0 :                                         if (sep)
     875           0 :                                                 bat_iterator_end(&bis);
     876           0 :                                         return GDK_FAIL;
     877             :                                 }
     878             :                         } else {
     879          39 :                                 pt->len = offset + 1;
     880          39 :                                 pt->val.sval = single_str;
     881             :                                 single_str = NULL;      /* don't free */
     882             :                         }
     883          45 :                         GDKfree(single_str);
     884           2 :                 } else if (bn) {
     885           0 :                         if (BUNappend(bn, str_nil, false) != GDK_SUCCEED) {
     886           0 :                                 bat_iterator_end(&bi);
     887           0 :                                 if (sep)
     888           0 :                                         bat_iterator_end(&bis);
     889           0 :                                 return GDK_FAIL;
     890             :                         }
     891             :                 } else {
     892           2 :                         if (VALinit(pt, TYPE_str, str_nil) == NULL) {
     893           0 :                                 bat_iterator_end(&bi);
     894           0 :                                 if (sep)
     895           0 :                                         bat_iterator_end(&bis);
     896           0 :                                 return GDK_FAIL;
     897             :                         }
     898             :                 }
     899          47 :                 bat_iterator_end(&bi);
     900          47 :                 if (sep)
     901          18 :                         bat_iterator_end(&bis);
     902          47 :                 return GDK_SUCCEED;
     903             :         } else {
     904             :                 /* first used to calculate the total length of
     905             :                  * each group, then the total offset */
     906          23 :                 lengths = GDKzalloc(ngrp * sizeof(*lengths));
     907          23 :                 astrings = GDKmalloc(ngrp * sizeof(str));
     908          23 :                 if (sep)
     909           8 :                         lastseplength = GDKzalloc(ngrp * sizeof(*lastseplength));
     910          23 :                 if (lengths == NULL || astrings == NULL || (sep && lastseplength == NULL)) {
     911             :                         rres = GDK_FAIL;
     912           0 :                         goto finish;
     913             :                 }
     914             :                 /* at first, set astrings[i] to str_nil, then for each
     915             :                  * non-empty group (even if all strings in the group
     916             :                  * are empty), set to NULL */
     917         131 :                 for (i = 0; i < ngrp; i++)
     918         108 :                         astrings[i] = (char *) str_nil;
     919             : 
     920          23 :                 if (separator) {
     921         178 :                         for (p = 0; p < ncand; p++) {
     922         163 :                                 i = canditer_next(ci) - seqb;
     923         163 :                                 if (gids[i] >= min && gids[i] <= max) {
     924         163 :                                         gid = gids[i] - min;
     925         163 :                                         if (lengths[gid] == (size_t) -1)
     926           0 :                                                 continue;
     927         163 :                                         s = BUNtvar(bi, i);
     928         163 :                                         if (!strNil(s)) {
     929         155 :                                                 lengths[gid] += strlen(s) + separator_length;
     930         155 :                                                 astrings[gid] = NULL;
     931           8 :                                         } else if (!skip_nils) {
     932           0 :                                                 nils++;
     933           0 :                                                 lengths[gid] = (size_t) -1;
     934           0 :                                                 astrings[gid] = (char *) str_nil;
     935             :                                         }
     936             :                                 }
     937             :                         }
     938             :                 } else { /* sep case */
     939           8 :                         assert(sep != NULL);
     940         151 :                         for (p = 0; p < ncand; p++) {
     941         143 :                                 i = canditer_next(ci) - seqb;
     942         143 :                                 if (gids[i] >= min && gids[i] <= max) {
     943         143 :                                         gid = gids[i] - min;
     944         143 :                                         if (lengths[gid] == (size_t) -1)
     945           0 :                                                 continue;
     946         143 :                                         s = BUNtvar(bi, i);
     947         143 :                                         sl = BUNtvar(bis, i);
     948         143 :                                         if (!strNil(s)) {
     949         140 :                                                 lengths[gid] += strlen(s);
     950         140 :                                                 if (!strNil(sl)) {
     951         129 :                                                         next_length = strlen(sl);
     952         129 :                                                         lengths[gid] += next_length;
     953         129 :                                                         lastseplength[gid] = next_length;
     954             :                                                 } else
     955          11 :                                                         lastseplength[gid] = 0;
     956         140 :                                                 astrings[gid] = NULL;
     957           3 :                                         } else if (!skip_nils) {
     958           0 :                                                 nils++;
     959           0 :                                                 lengths[gid] = (size_t) -1;
     960           0 :                                                 lastseplength[gid] = 0;
     961           0 :                                                 astrings[gid] = (char *) str_nil;
     962             :                                         }
     963             :                                 }
     964             :                         }
     965             :                 }
     966             : 
     967          23 :                 if (separator) {
     968          69 :                         for (i = 0; i < ngrp; i++) {
     969          54 :                                 if (astrings[i] == NULL) {
     970          52 :                                         if ((astrings[i] = GDKmalloc(lengths[i] + 1 - separator_length)) == NULL) {
     971             :                                                 rres = GDK_FAIL;
     972           0 :                                                 goto finish;
     973             :                                         }
     974          52 :                                         astrings[i][0] = 0;
     975          52 :                                         lengths[i] = 0;
     976             :                                 } else
     977           2 :                                         astrings[i] = NULL;
     978             :                         }
     979             :                 } else { /* sep case */
     980           8 :                         assert(sep != NULL);
     981          62 :                         for (i = 0; i < ngrp; i++) {
     982          54 :                                 if (astrings[i] == NULL) {
     983          53 :                                         if ((astrings[i] = GDKmalloc(lengths[i] + 1 - lastseplength[i])) == NULL) {
     984             :                                                 rres = GDK_FAIL;
     985           0 :                                                 goto finish;
     986             :                                         }
     987          53 :                                         astrings[i][0] = 0;
     988          53 :                                         lengths[i] = 0;
     989             :                                 } else
     990           1 :                                         astrings[i] = NULL;
     991             :                         }
     992             :                 }
     993          23 :                 canditer_reset(ci);
     994             : 
     995          23 :                 if (separator) {
     996         178 :                         for (p = 0; p < ncand; p++) {
     997         163 :                                 i = canditer_next(ci) - seqb;
     998         163 :                                 if (gids[i] >= min && gids[i] <= max) {
     999         163 :                                         gid = gids[i] - min;
    1000         163 :                                         if (astrings[gid]) {
    1001         160 :                                                 s = BUNtvar(bi, i);
    1002         160 :                                                 if (strNil(s))
    1003           5 :                                                         continue;
    1004         155 :                                                 if (astrings[gid][lengths[gid]]) {
    1005         103 :                                                         memcpy(astrings[gid] + lengths[gid], separator, separator_length);
    1006         103 :                                                         lengths[gid] += separator_length;
    1007             :                                                 }
    1008         155 :                                                 next_length = strlen(s);
    1009         155 :                                                 memcpy(astrings[gid] + lengths[gid], s, next_length);
    1010         155 :                                                 lengths[gid] += next_length;
    1011         155 :                                                 astrings[gid][lengths[gid]] = 1;
    1012             :                                         }
    1013             :                                 }
    1014             :                         }
    1015             :                 } else { /* sep case */
    1016           8 :                         assert(sep != NULL);
    1017         151 :                         for (p = 0; p < ncand; p++) {
    1018         143 :                                 i = canditer_next(ci) - seqb;
    1019         143 :                                 if (gids[i] >= min && gids[i] <= max) {
    1020         143 :                                         gid = gids[i] - min;
    1021         143 :                                         if (astrings[gid]) {
    1022         142 :                                                 s = BUNtvar(bi, i);
    1023         142 :                                                 sl = BUNtvar(bis, i);
    1024         142 :                                                 if (strNil(s))
    1025           2 :                                                         continue;
    1026         227 :                                                 if (astrings[gid][lengths[gid]] && !strNil(sl)) {
    1027          79 :                                                         next_length = strlen(sl);
    1028          79 :                                                         memcpy(astrings[gid] + lengths[gid], sl, next_length);
    1029          79 :                                                         lengths[gid] += next_length;
    1030             :                                                 }
    1031         140 :                                                 next_length = strlen(s);
    1032         140 :                                                 memcpy(astrings[gid] + lengths[gid], s, next_length);
    1033         140 :                                                 lengths[gid] += next_length;
    1034         140 :                                                 astrings[gid][lengths[gid]] = 1;
    1035             :                                         }
    1036             :                                 }
    1037             :                         }
    1038             :                 }
    1039             : 
    1040         131 :                 for (i = 0; i < ngrp; i++) {
    1041         108 :                         if (astrings[i]) {
    1042         105 :                                 astrings[i][lengths[i]] = '\0';
    1043         105 :                                 if (BUNappend(bn, astrings[i], false) != GDK_SUCCEED) {
    1044             :                                         rres = GDK_FAIL;
    1045           0 :                                         goto finish;
    1046             :                                 }
    1047           3 :                         } else if (BUNappend(bn, str_nil, false) != GDK_SUCCEED) {
    1048             :                                 rres = GDK_FAIL;
    1049           0 :                                 goto finish;
    1050             :                         }
    1051             :                 }
    1052             :         }
    1053             : 
    1054          23 :   finish:
    1055          23 :         bat_iterator_end(&bi);
    1056          23 :         if (sep)
    1057           8 :                 bat_iterator_end(&bis);
    1058          23 :         if (has_nils)
    1059          23 :                 *has_nils = nils;
    1060          23 :         GDKfree(lengths);
    1061          23 :         GDKfree(lastseplength);
    1062          23 :         if (astrings) {
    1063         131 :                 for (i = 0; i < ngrp; i++) {
    1064         108 :                         if (astrings[i] != str_nil)
    1065         108 :                                 GDKfree(astrings[i]);
    1066             :                 }
    1067          23 :                 GDKfree(astrings);
    1068             :         }
    1069          23 :         if (rres != GDK_SUCCEED)
    1070           0 :                 BBPreclaim(bn);
    1071             : 
    1072             :         return rres;
    1073             : }
    1074             : 
    1075             : gdk_return
    1076          42 : BATstr_group_concat(ValPtr res, BAT *b, BAT *s, BAT *sep, bool skip_nils,
    1077             :                     bool abort_on_error, bool nil_if_empty, const char *restrict separator)
    1078             : {
    1079             :         BUN ncand;
    1080             :         struct canditer ci;
    1081             : 
    1082             :         (void) abort_on_error;
    1083          42 :         assert((separator && !sep) || (!separator && sep)); /* only one of them must be set */
    1084          42 :         res->vtype = TYPE_str;
    1085             : 
    1086          42 :         ncand = canditer_init(&ci, b, s);
    1087             : 
    1088          42 :         if (sep && BATcount(sep) == 1) { /* Only one element in sep */
    1089           0 :                 BATiter bi = bat_iterator(sep);
    1090           0 :                 separator = BUNtvar(bi, 0);
    1091           0 :                 bat_iterator_end(&bi);
    1092             :                 sep = NULL;
    1093             :         }
    1094             : 
    1095          68 :         if (ncand == 0 || (separator && strNil(separator))) {
    1096           1 :                 if (VALinit(res, TYPE_str, nil_if_empty ? str_nil : "") == NULL)
    1097             :                         return GDK_FAIL;
    1098           1 :                 return GDK_SUCCEED;
    1099             :         }
    1100             : 
    1101          41 :         return concat_strings(NULL, res, b, b->hseqbase, 1, &ci, ncand, NULL, 0, 0,
    1102             :                               skip_nils, sep, separator, NULL);
    1103             : }
    1104             : 
    1105             : BAT *
    1106          49 : BATgroupstr_group_concat(BAT *b, BAT *g, BAT *e, BAT *s, BAT *sep, bool skip_nils,
    1107             :                          bool abort_on_error, const char *restrict separator)
    1108             : {
    1109          49 :         BAT *bn = NULL;
    1110             :         oid min, max;
    1111          49 :         BUN ngrp, ncand, nils = 0;
    1112             :         struct canditer ci;
    1113             :         const char *err;
    1114             :         gdk_return res;
    1115             : 
    1116          49 :         assert((separator && !sep) || (!separator && sep)); /* only one of them must be set */
    1117             :         (void) skip_nils;
    1118             : 
    1119          49 :         if ((err = BATgroupaggrinit(b, g, e, s, &min, &max, &ngrp,
    1120             :                                     &ci, &ncand)) !=NULL) {
    1121           0 :                 GDKerror("%s\n", err);
    1122           0 :                 return NULL;
    1123             :         }
    1124          49 :         if (g == NULL) {
    1125           0 :                 GDKerror("b and g must be aligned\n");
    1126           0 :                 return NULL;
    1127             :         }
    1128             : 
    1129          49 :         if (sep && BATcount(sep) == 1) { /* Only one element in sep */
    1130           0 :                 BATiter bi = bat_iterator(sep);
    1131           0 :                 separator = BUNtvar(bi, 0);
    1132           0 :                 bat_iterator_end(&bi);
    1133             :                 sep = NULL;
    1134             :         }
    1135             : 
    1136          67 :         if (ncand == 0 || ngrp == 0 || (separator && strNil(separator))) {
    1137             :                 /* trivial: no strings to concat, so return bat
    1138             :                  * aligned with g with nil in the tail */
    1139           5 :                 return BATconstant(ngrp == 0 ? 0 : min, TYPE_str, str_nil, ngrp, TRANSIENT);
    1140             :         }
    1141             : 
    1142          44 :         if (BATtdense(g) || (g->tkey && g->tnonil)) {
    1143             :                 /* trivial: singleton groups, so all results are equal
    1144             :                  * to the inputs (but possibly a different type) */
    1145          15 :                 return BATconvert(b, s, TYPE_str, abort_on_error, 0, 0, 0);
    1146             :         }
    1147             : 
    1148          29 :         res = concat_strings(&bn, NULL, b, b->hseqbase, ngrp, &ci, ncand,
    1149          29 :                              (const oid *) Tloc(g, 0), min, max, skip_nils, sep,
    1150             :                              separator, &nils);
    1151          29 :         if (res != GDK_SUCCEED)
    1152             :                 return NULL;
    1153             : 
    1154          29 :         return bn;
    1155             : }
    1156             : 
    1157             : #define compute_next_single_str(START, END)                             \
    1158             :         do {                                                            \
    1159             :                 for (oid m = START; m < END; m++) {                  \
    1160             :                         sb = BUNtvar(bi, m);                            \
    1161             :                                                                         \
    1162             :                         if (separator) {                                \
    1163             :                                 if (!strNil(sb)) {                      \
    1164             :                                         next_group_length += strlen(sb); \
    1165             :                                         if (!empty)                     \
    1166             :                                                 next_group_length += separator_length; \
    1167             :                                         empty = false;                  \
    1168             :                                 }                                       \
    1169             :                         } else { /* sep case */                         \
    1170             :                                 assert(sep != NULL);                    \
    1171             :                                 sl = BUNtvar(sepi, m);                  \
    1172             :                                                                         \
    1173             :                                 if (!strNil(sb)) {                      \
    1174             :                                         next_group_length += strlen(sb); \
    1175             :                                         if (!empty && !strNil(sl))      \
    1176             :                                                 next_group_length += strlen(sl); \
    1177             :                                         empty = false;                  \
    1178             :                                 }                                       \
    1179             :                         }                                               \
    1180             :                 }                                                       \
    1181             :                 if (empty) {                                            \
    1182             :                         if (single_str == NULL) { /* reuse the same buffer, resize it when needed */ \
    1183             :                                 max_group_length = 1;                   \
    1184             :                                 if ((single_str = GDKmalloc(max_group_length + 1)) == NULL) \
    1185             :                                         goto allocation_error;          \
    1186             :                         } else if (1 > max_group_length) {           \
    1187             :                                 max_group_length = 1;                   \
    1188             :                                 if ((next_single_str = GDKrealloc(single_str, max_group_length + 1)) == NULL) \
    1189             :                                         goto allocation_error;          \
    1190             :                                 single_str = next_single_str;           \
    1191             :                         }                                               \
    1192             :                         strcpy(single_str, str_nil);                    \
    1193             :                         has_nils = true;                                \
    1194             :                 } else {                                                \
    1195             :                         empty = true;                                   \
    1196             :                         if (single_str == NULL) { /* reuse the same buffer, resize it when needed */ \
    1197             :                                 max_group_length = next_group_length;   \
    1198             :                                 if ((single_str = GDKmalloc(max_group_length + 1)) == NULL) \
    1199             :                                         goto allocation_error;          \
    1200             :                         } else if (next_group_length > max_group_length) { \
    1201             :                                 max_group_length = next_group_length;   \
    1202             :                                 if ((next_single_str = GDKrealloc(single_str, max_group_length + 1)) == NULL) \
    1203             :                                         goto allocation_error;          \
    1204             :                                 single_str = next_single_str;           \
    1205             :                         }                                               \
    1206             :                                                                         \
    1207             :                         for (oid m = START; m < END; m++) {          \
    1208             :                                 sb = BUNtvar(bi, m);                    \
    1209             :                                                                         \
    1210             :                                 if (separator) {                        \
    1211             :                                         if (strNil(sb))                 \
    1212             :                                                 continue;               \
    1213             :                                         if (!empty) {                   \
    1214             :                                                 memcpy(single_str + offset, separator, separator_length); \
    1215             :                                                 offset += separator_length; \
    1216             :                                         }                               \
    1217             :                                         next_length = strlen(sb);       \
    1218             :                                         memcpy(single_str + offset, sb, next_length); \
    1219             :                                         offset += next_length;          \
    1220             :                                         empty = false;                  \
    1221             :                                 } else { /* sep case */                 \
    1222             :                                         assert(sep != NULL);            \
    1223             :                                         sl = BUNtvar(sepi, m);          \
    1224             :                                                                         \
    1225             :                                         if (strNil(sb))                 \
    1226             :                                                 continue;               \
    1227             :                                         if (!empty && !strNil(sl)) {    \
    1228             :                                                 next_length = strlen(sl); \
    1229             :                                                 memcpy(single_str + offset, sl, next_length); \
    1230             :                                                 offset += next_length;  \
    1231             :                                         }                               \
    1232             :                                         next_length = strlen(sb);       \
    1233             :                                         memcpy(single_str + offset, sb, next_length); \
    1234             :                                         offset += next_length;          \
    1235             :                                         empty = false;                  \
    1236             :                                 }                                       \
    1237             :                         }                                               \
    1238             :                                                                         \
    1239             :                         single_str[offset] = '\0';                      \
    1240             :                 }                                                       \
    1241             : } while (0)
    1242             : 
    1243             : #define ANALYTICAL_STR_GROUP_CONCAT_UNBOUNDED_TILL_CURRENT_ROW          \
    1244             :         do {                                                            \
    1245             :                 size_t slice_length = 0;                                \
    1246             :                 next_group_length = next_length = offset = 0;           \
    1247             :                 empty = true;                                           \
    1248             :                 compute_next_single_str(k, i); /* compute the entire string then slice it starting from the beginning */ \
    1249             :                 empty = true;                                           \
    1250             :                 for (; k < i;) {                                     \
    1251             :                         str nsep, nstr;                                 \
    1252             :                         oid m = k;                                      \
    1253             :                         j = k;                                          \
    1254             :                         do {                                            \
    1255             :                                 k++;                                    \
    1256             :                         } while (k < i && !op[k]);                   \
    1257             :                         for (; j < k; j++) {                         \
    1258             :                                 nstr = BUNtvar(bi, j);                  \
    1259             :                                 if (!strNil(nstr)) {                    \
    1260             :                                         slice_length += strlen(nstr);   \
    1261             :                                         if (!empty) {                   \
    1262             :                                                 if (separator) {        \
    1263             :                                                         nsep = (str) separator; \
    1264             :                                                 } else { /* sep case */ \
    1265             :                                                         assert(sep != NULL); \
    1266             :                                                         nsep = BUNtvar(sepi, j); \
    1267             :                                                 }                       \
    1268             :                                                 if (!strNil(nsep))      \
    1269             :                                                         slice_length += strlen(nsep); \
    1270             :                                         }                               \
    1271             :                                         empty = false;                  \
    1272             :                                 }                                       \
    1273             :                         }                                               \
    1274             :                         if (empty) {                                    \
    1275             :                                 for (j = m; j < k; j++)                      \
    1276             :                                         if (tfastins_nocheckVAR(r, j, str_nil) != GDK_SUCCEED) \
    1277             :                                                 goto allocation_error;  \
    1278             :                                 has_nils = true;                        \
    1279             :                         } else {                                        \
    1280             :                                 char save = single_str[slice_length];   \
    1281             :                                 single_str[slice_length] = '\0';        \
    1282             :                                 for (j = m; j < k; j++)                      \
    1283             :                                         if (tfastins_nocheckVAR(r, j, single_str) != GDK_SUCCEED) \
    1284             :                                                 goto allocation_error;  \
    1285             :                                 single_str[slice_length] = save;        \
    1286             :                         }                                               \
    1287             :                 }                                                       \
    1288             :         } while (0)
    1289             : 
    1290             : #define ANALYTICAL_STR_GROUP_CONCAT_ALL_ROWS                            \
    1291             :         do {                                                            \
    1292             :                 next_group_length = next_length = offset = 0;           \
    1293             :                 empty = true;                                           \
    1294             :                 compute_next_single_str(k, i);                          \
    1295             :                 for (; k < i; k++)                                   \
    1296             :                         if (tfastins_nocheckVAR(r, k, single_str) != GDK_SUCCEED) \
    1297             :                                 goto allocation_error;                  \
    1298             :         } while (0)
    1299             : 
    1300             : #define ANALYTICAL_STR_GROUP_CONCAT_CURRENT_ROW                         \
    1301             :         do {                                                            \
    1302             :                 for (; k < i; k++) {                                 \
    1303             :                         str next = BUNtvar(bi, k);                      \
    1304             :                         if (tfastins_nocheckVAR(r, k, next) != GDK_SUCCEED) \
    1305             :                                 goto allocation_error;                  \
    1306             :                         has_nils |= strNil(next);                       \
    1307             :                 }                                                       \
    1308             :         } while (0)
    1309             : 
    1310             : #define ANALYTICAL_STR_GROUP_CONCAT_OTHERS                              \
    1311             :         do {                                                            \
    1312             :                 for (; k < i; k++) {                                 \
    1313             :                         next_group_length = next_length = offset = 0;   \
    1314             :                         empty = true;                                   \
    1315             :                         compute_next_single_str(start[k], end[k]);      \
    1316             :                         if (tfastins_nocheckVAR(r, k, single_str) != GDK_SUCCEED) \
    1317             :                                 goto allocation_error;                  \
    1318             :                 }                                                       \
    1319             :         } while (0)
    1320             : 
    1321             : #define ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(IMP)     \
    1322             :         do {                                            \
    1323             :                 if (p) {                                \
    1324             :                         for (; i < cnt; i++) {               \
    1325             :                                 if (np[i])              \
    1326             :                                         IMP;            \
    1327             :                         }                               \
    1328             :                 }                                       \
    1329             :                 i = cnt;                                \
    1330             :                 IMP;                                    \
    1331             :         } while (0)
    1332             : 
    1333             : gdk_return
    1334          50 : GDKanalytical_str_group_concat(BAT *r, BAT *p, BAT *o, BAT *b, BAT *sep, BAT *s, BAT *e, const char *restrict separator, int frame_type)
    1335             : {
    1336             :         bool has_nils = false, empty;
    1337          50 :         BATiter pi = bat_iterator(p);
    1338          50 :         BATiter oi = bat_iterator(o);
    1339          50 :         BATiter bi = bat_iterator(b);
    1340          50 :         BATiter sepi = bat_iterator(sep);
    1341          50 :         BATiter si = bat_iterator(s);
    1342          50 :         BATiter ei = bat_iterator(e);
    1343          50 :         oid i = 0, j = 0, k = 0, cnt = BATcount(b), *restrict start = si.base, *restrict end = ei.base;
    1344          50 :         bit *np = pi.base, *op = oi.base;
    1345             :         str sb, sl, single_str = NULL, next_single_str;
    1346             :         size_t separator_length = 0, next_group_length, max_group_length = 0, next_length, offset;
    1347             : 
    1348          50 :         assert((sep && !separator && BATcount(b) == BATcount(sep)) || (!sep && separator));
    1349          50 :         if (b->ttype != TYPE_str || r->ttype != TYPE_str || (sep && sep->ttype != TYPE_str)) {
    1350           0 :                 GDKerror("only string type is supported\n");
    1351           0 :                 bat_iterator_end(&pi);
    1352           0 :                 bat_iterator_end(&oi);
    1353           0 :                 bat_iterator_end(&bi);
    1354           0 :                 bat_iterator_end(&sepi);
    1355           0 :                 bat_iterator_end(&si);
    1356           0 :                 bat_iterator_end(&ei);
    1357           0 :                 return GDK_FAIL;
    1358             :         }
    1359          50 :         if (sep && BATcount(sep) == 1) { /* Only one element in sep */
    1360           0 :                 separator = BUNtvar(sepi, 0);
    1361             :                 sep = NULL;
    1362             :         }
    1363             : 
    1364          50 :         if (sep == NULL)
    1365          20 :                 separator_length = strlen(separator);
    1366             : 
    1367          50 :         if (cnt > 0) {
    1368          49 :                 switch (frame_type) {
    1369          26 :                 case 3: /* unbounded until current row */       {
    1370      164838 :                         ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(ANALYTICAL_STR_GROUP_CONCAT_UNBOUNDED_TILL_CURRENT_ROW);
    1371             :                 } break;
    1372           0 :                 case 4: /* current row until unbounded */
    1373           0 :                         goto notimplemented;
    1374          23 :                 case 5: /* all rows */  {
    1375         847 :                         ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(ANALYTICAL_STR_GROUP_CONCAT_ALL_ROWS);
    1376             :                 } break;
    1377           0 :                 case 6: /* current row */ {
    1378           0 :                         ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(ANALYTICAL_STR_GROUP_CONCAT_CURRENT_ROW);
    1379             :                 } break;
    1380           0 :                 default: {
    1381           0 :                         ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(ANALYTICAL_STR_GROUP_CONCAT_OTHERS);
    1382             :                 }
    1383             :                 }
    1384             :         }
    1385             : 
    1386          50 :         bat_iterator_end(&pi);
    1387          50 :         bat_iterator_end(&oi);
    1388          50 :         bat_iterator_end(&bi);
    1389          50 :         bat_iterator_end(&sepi);
    1390          50 :         bat_iterator_end(&si);
    1391          50 :         bat_iterator_end(&ei);
    1392          50 :         GDKfree(single_str);
    1393          50 :         BATsetcount(r, cnt);
    1394          50 :         r->tnonil = !has_nils;
    1395          50 :         r->tnil = has_nils;
    1396          50 :         return GDK_SUCCEED;
    1397           0 :   allocation_error:
    1398           0 :         bat_iterator_end(&pi);
    1399           0 :         bat_iterator_end(&oi);
    1400           0 :         bat_iterator_end(&bi);
    1401           0 :         bat_iterator_end(&sepi);
    1402           0 :         bat_iterator_end(&si);
    1403           0 :         bat_iterator_end(&ei);
    1404           0 :         GDKfree(single_str);
    1405           0 :         return GDK_FAIL;
    1406             :   notimplemented:
    1407           0 :         bat_iterator_end(&pi);
    1408           0 :         bat_iterator_end(&oi);
    1409           0 :         bat_iterator_end(&bi);
    1410           0 :         bat_iterator_end(&sepi);
    1411           0 :         bat_iterator_end(&si);
    1412           0 :         bat_iterator_end(&ei);
    1413           0 :         GDKerror("str_group_concat not yet implemented for current row until unbounded case\n");
    1414           0 :         return GDK_FAIL;
    1415             : }

Generated by: LCOV version 1.14