LCOV - code coverage report
Current view: top level - gdk - gdk_string.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 495 639 77.5 %
Date: 2021-09-14 19:48:19 Functions: 15 15 100.0 %

          Line data    Source code
       1             : /*
       2             :  * This Source Code Form is subject to the terms of the Mozilla Public
       3             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       4             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       5             :  *
       6             :  * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V.
       7             :  */
       8             : 
       9             : #include "monetdb_config.h"
      10             : #include "gdk.h"
      11             : #include "gdk_private.h"
      12             : #include "gdk_cand.h"
      13             : 
      14             : /* String Atom Implementation
      15             :  *
      16             :  * Strings are stored in two parts.  The first part is the normal tail
      17             :  * heap which contains a list of offsets.  The second part is the
      18             :  * theap which contains the actual strings.  The offsets in the tail
      19             :  * heap (a.k.a. offset heap) point into the theap (a.k.a. string
      20             :  * heap).  Strings are NULL-terminated and are stored without any
      21             :  * escape sequences.  Strings are encoded using the UTF-8 encoding
      22             :  * of Unicode.  This means that individual "characters" (really,
      23             :  * Unicode code points) can be between one and four bytes long.
      24             :  *
      25             :  * Because in many typical situations there are lots of duplicated
      26             :  * string values that are being stored in a table, but also in many
      27             :  * (other) typical situations there are very few duplicated string
      28             :  * values stored, a scheme has been introduced to cater to both
      29             :  * situations.
      30             :  *
      31             :  * When the string heap is "small" (defined as less than 64KiB), the
      32             :  * string heap is fully duplicate eliminated.  When the string heap
      33             :  * grows beyond this size, the heap is not kept free of duplicate
      34             :  * strings, but there is then a heuristic that tries to limit the
      35             :  * number of duplicates.
      36             :  *
      37             :  * This is done by having a fixed sized hash table at the start of the
      38             :  * string heap, and allocating space for collision lists in the first
      39             :  * 64KiB of the string heap.  After the first 64KiB no extra space is
      40             :  * allocated for lists, so hash collisions cannot be resolved.
      41             :  */
      42             : 
      43             : /* some of these macros are duplicates from gdk_atoms.c */
      44             : #define num08(x)        ((x) >= '0' && (x) <= '7')
      45             : #define base08(x)       ((x) - '0')
      46             : #define mult08(x)       ((x) << 3)
      47             : 
      48             : #define num16(x)        isxdigit((unsigned char) (x))
      49             : #define base16(x)       (((x) >= 'a' && (x) <= 'f') ? ((x) - 'a' + 10) : ((x) >= 'A' && (x) <= 'F') ? ((x) - 'A' + 10) : (x) - '0')
      50             : #define mult16(x)       ((x) << 4)
      51             : 
      52             : #define atommem(size)                                   \
      53             :         do {                                            \
      54             :                 if (*dst == NULL || *len < (size)) { \
      55             :                         GDKfree(*dst);                  \
      56             :                         *len = (size);                  \
      57             :                         *dst = GDKmalloc(*len);         \
      58             :                         if (*dst == NULL) {             \
      59             :                                 *len = 0;               \
      60             :                                 return -1;              \
      61             :                         }                               \
      62             :                 }                                       \
      63             :         } while (0)
      64             : 
      65             : const char str_nil[2] = { '\200', 0 };
      66             : 
      67             : void
      68     1583233 : strHeap(Heap *d, size_t cap)
      69             : {
      70             :         size_t size;
      71             : 
      72     1583233 :         cap = MAX(cap, BATTINY);
      73     1583233 :         size = GDK_STRHASHTABLE * sizeof(stridx_t) + MIN(GDK_ELIMLIMIT, cap * GDK_VARALIGN);
      74     1583233 :         if (HEAPalloc(d, size, 1, 1) != GDK_SUCCEED)
      75           0 :                 GDKerror("alloc failed");
      76     1593600 : }
      77             : 
      78             : 
      79             : void
      80        3476 : strCleanHash(Heap *h, bool rebuild)
      81             : {
      82             :         stridx_t newhash[GDK_STRHASHTABLE];
      83             :         size_t pad, pos;
      84        3476 :         const size_t extralen = h->hashash ? EXTRALEN : 0;
      85             :         BUN off, strhash;
      86             :         const char *s;
      87             : 
      88             :         (void) rebuild;
      89        3476 :         if (!h->cleanhash)
      90         298 :                 return;
      91             :         /* rebuild hash table for double elimination
      92             :          *
      93             :          * If appending strings to the BAT was aborted while the heap
      94             :          * was memory mapped, the hash in the string heap may well be
      95             :          * incorrect.  Therefore we don't trust it when we read in a
      96             :          * string heap and we rebuild the complete table (it is small,
      97             :          * so this won't take any time at all).
      98             :          * Note that we will only do this the first time the heap is
      99             :          * loaded, and only for heaps that existed when the server was
     100             :          * started. */
     101        3178 :         memset(newhash, 0, sizeof(newhash));
     102             :         pos = GDK_STRHASHSIZE;
     103      237810 :         while (pos < h->free) {
     104      234739 :                 pad = GDK_VARALIGN - (pos & (GDK_VARALIGN - 1));
     105      234739 :                 if (pad < sizeof(stridx_t))
     106      199600 :                         pad += GDK_VARALIGN;
     107      234739 :                 pos += pad;
     108      234739 :                 if (pos >= GDK_ELIMLIMIT)
     109             :                         break;
     110      234632 :                 pos += extralen;
     111      234632 :                 s = h->base + pos;
     112      234632 :                 if (h->hashash)
     113       31015 :                         strhash = ((const BUN *) s)[-1];
     114             :                 else
     115      203617 :                         strhash = strHash(s);
     116      234632 :                 off = strhash & GDK_STRHASHMASK;
     117      234632 :                 newhash[off] = (stridx_t) (pos - extralen - sizeof(stridx_t));
     118      234632 :                 pos += strlen(s) + 1;
     119             :         }
     120             :         /* only set dirty flag if the hash table actually changed */
     121        3178 :         if (memcmp(newhash, h->base, sizeof(newhash)) != 0) {
     122         350 :                 memcpy(h->base, newhash, sizeof(newhash));
     123         350 :                 if (h->storage == STORE_MMAP) {
     124          19 :                         if (!(GDKdebug & NOSYNCMASK))
     125           0 :                                 (void) MT_msync(h->base, GDK_STRHASHSIZE);
     126             :                 } else
     127         331 :                         h->dirty = true;
     128             :         }
     129             : #ifndef NDEBUG
     130        3178 :         if (GDK_ELIMDOUBLES(h)) {
     131             :                 pos = GDK_STRHASHSIZE;
     132      167824 :                 while (pos < h->free) {
     133      164756 :                         pad = GDK_VARALIGN - (pos & (GDK_VARALIGN - 1));
     134      164756 :                         if (pad < sizeof(stridx_t))
     135      135346 :                                 pad += GDK_VARALIGN;
     136      164756 :                         pos += pad + extralen;
     137      164756 :                         s = h->base + pos;
     138      164756 :                         assert(strLocate(h, s) != 0);
     139      164752 :                         pos += strlen(s) + 1;
     140             :                 }
     141             :         }
     142             : #endif
     143        3174 :         h->cleanhash = false;
     144             : }
     145             : 
     146             : /*
     147             :  * The strPut routine. The routine strLocate can be used to identify
     148             :  * the location of a string in the heap if it exists. Otherwise it
     149             :  * returns zero.
     150             :  */
     151             : var_t
     152      181394 : strLocate(Heap *h, const char *v)
     153             : {
     154             :         stridx_t *ref, *next;
     155      181394 :         const size_t extralen = h->hashash ? EXTRALEN : 0;
     156             : 
     157             :         /* search hash-table, if double-elimination is still in place */
     158             :         BUN off;
     159      181394 :         if (h->free == 0) {
     160             :                 /* empty, so there are no strings */
     161             :                 return 0;
     162             :         }
     163             : 
     164      181394 :         off = strHash(v);
     165      181394 :         off &= GDK_STRHASHMASK;
     166             : 
     167             :         /* strLocate should only be used if the heap is fully double eliminated */
     168      181394 :         assert(GDK_ELIMBASE(h->free) == 0);
     169             : 
     170             :         /* search the linked list */
     171      214173 :         for (ref = ((stridx_t *) h->base) + off; *ref; ref = next) {
     172      212249 :                 next = (stridx_t *) (h->base + *ref);
     173      212249 :                 if (strcmp(v, (str) (next + 1) + extralen) == 0)
     174      179470 :                         return (var_t) ((sizeof(stridx_t) + *ref + extralen));  /* found */
     175             :         }
     176             :         return 0;
     177             : }
     178             : 
     179             : #ifdef __GNUC__
     180             : /* __builtin_expect returns its first argument; it is expected to be
     181             :  * equal to the second argument */
     182             : #define unlikely(expr)  __builtin_expect((expr) != 0, 0)
     183             : #define likely(expr)    __builtin_expect((expr) != 0, 1)
     184             : #else
     185             : #define unlikely(expr)  (expr)
     186             : #define likely(expr)    (expr)
     187             : #endif
     188             : 
     189             : var_t
     190    72284871 : strPut(BAT *b, var_t *dst, const void *V)
     191             : {
     192             :         const char *v = V;
     193    72284871 :         Heap *h = b->tvheap;
     194             :         size_t pad;
     195    72284871 :         size_t pos, len = strlen(v) + 1;
     196    72284871 :         const size_t extralen = h->hashash ? EXTRALEN : 0;
     197             :         stridx_t *bucket;
     198             :         BUN off, strhash;
     199             : 
     200    72284871 :         if (h->free == 0) {
     201      217031 :                 if (h->size < GDK_STRHASHTABLE * sizeof(stridx_t) + BATTINY * GDK_VARALIGN) {
     202           0 :                         if (HEAPgrow(&b->theaplock, &b->tvheap, GDK_STRHASHTABLE * sizeof(stridx_t) + BATTINY * GDK_VARALIGN, true) != GDK_SUCCEED) {
     203             :                                 return 0;
     204             :                         }
     205           0 :                         h = b->tvheap;
     206             :                 }
     207      217031 :                 h->free = GDK_STRHASHTABLE * sizeof(stridx_t);
     208      217031 :                 h->dirty = true;
     209             : #ifdef NDEBUG
     210             :                 memset(h->base, 0, h->free);
     211             : #else
     212             :                 /* fill should solve initialization problems within valgrind */
     213      217031 :                 memset(h->base, 0, h->size);
     214             : #endif
     215      217031 :                 h->hashash = false;
     216             :         }
     217             : 
     218    72284871 :         off = strHash(v);
     219             :         strhash = off;
     220    72284871 :         off &= GDK_STRHASHMASK;
     221    72284871 :         bucket = ((stridx_t *) h->base) + off;
     222             : 
     223    72284871 :         if (*bucket) {
     224             :                 /* the hash list is not empty */
     225    69914019 :                 if (*bucket < GDK_ELIMLIMIT) {
     226             :                         /* small string heap (<64KiB) -- fully double
     227             :                          * eliminated: search the linked list */
     228             :                         const stridx_t *ref = bucket;
     229             : 
     230             :                         do {
     231    39463849 :                                 pos = *ref + sizeof(stridx_t) + extralen;
     232    39463849 :                                 if (strcmp(v, h->base + pos) == 0) {
     233             :                                         /* found */
     234    37030391 :                                         return *dst = (var_t) pos;
     235             :                                 }
     236     2433458 :                                 ref = (stridx_t *) (h->base + *ref);
     237     2433458 :                         } while (*ref);
     238             :                 } else {
     239             :                         /* large string heap (>=64KiB) -- there is no
     240             :                          * linked list, so only look at a single
     241             :                          * entry */
     242    32057157 :                         pos = *bucket + extralen;
     243    32057157 :                         if (strcmp(v, h->base + pos) == 0) {
     244             :                                 /* already in heap: reuse */
     245      766058 :                                 return *dst = (var_t) pos;
     246             :                         }
     247             :                 }
     248             :         }
     249             :         /* the string was not found in the heap, we need to enter it */
     250             : 
     251             :         /* check that string is correctly encoded UTF-8; there was no
     252             :          * need to do this earlier: if the string was found above, it
     253             :          * must have gone through here in the past */
     254             : #ifndef NDEBUG
     255    34488422 :         if (!checkUTF8(v)) {
     256           0 :                 GDKerror("incorrectly encoded UTF-8\n");
     257           0 :                 return 0;
     258             :         }
     259             : #endif
     260             : 
     261    35031291 :         pad = GDK_VARALIGN - (h->free & (GDK_VARALIGN - 1));
     262    35031291 :         if (GDK_ELIMBASE(h->free + pad) == 0) {      /* i.e. h->free+pad < GDK_ELIMLIMIT */
     263     2597263 :                 if (pad < sizeof(stridx_t)) {
     264             :                         /* make room for hash link */
     265     2078254 :                         pad += GDK_VARALIGN;
     266             :                 }
     267    32434028 :         } else if (GDK_ELIMBASE(h->free) != 0) {
     268             :                 /* no extra padding needed when no hash links needed
     269             :                  * (but only when padding doesn't cross duplicate
     270             :                  * elimination boundary) */
     271             :                 pad = 0;
     272             :         }
     273             : 
     274    35031291 :         pad += extralen;
     275             : 
     276             :         /* check heap for space (limited to a certain maximum, VAR_MAX,
     277             :          * after which an error is reported and 0 is returned) */
     278    35031291 :         if (h->free + pad + len >= h->size) {
     279        8712 :                 size_t newsize = MAX(h->size, 4096);
     280             : 
     281             :                 /* double the heap size until we have enough space */
     282             :                 do {
     283        8745 :                         if (newsize < 4 * 1024 * 1024)
     284        8349 :                                 newsize <<= 1;
     285             :                         else
     286         396 :                                 newsize += 4 * 1024 * 1024;
     287        8745 :                 } while (newsize <= h->free + pad + len);
     288             : 
     289        8712 :                 assert(newsize);
     290             : 
     291        8712 :                 if (h->free + pad + len >= (size_t) VAR_MAX) {
     292           0 :                         GDKerror("string heap gets larger than %zuGiB.\n", (size_t) VAR_MAX >> 30);
     293           0 :                         return 0;
     294             :                 }
     295        8712 :                 TRC_DEBUG(HEAP, "HEAPextend in strPut %s %zu %zu\n", h->filename, h->size, newsize);
     296        8712 :                 if (HEAPgrow(&b->theaplock, &b->tvheap, newsize, true) != GDK_SUCCEED) {
     297             :                         return 0;
     298             :                 }
     299        8713 :                 h = b->tvheap;
     300             : 
     301             :                 /* make bucket point into the new heap */
     302        8713 :                 bucket = ((stridx_t *) h->base) + off;
     303             :         }
     304             : 
     305             :         /* insert string */
     306    35031292 :         pos = h->free + pad;
     307    35031292 :         *dst = (var_t) pos;
     308    35031292 :         if (pad > 0)
     309     2591903 :                 memset(h->base + h->free, 0, pad);
     310    35031292 :         memcpy(h->base + pos, v, len);
     311    35031292 :         if (h->hashash) {
     312          36 :                 ((BUN *) (h->base + pos))[-1] = strhash;
     313             : #if EXTRALEN > SIZEOF_BUN
     314             :                 ((BUN *) (h->base + pos))[-2] = (BUN) len;
     315             : #endif
     316             :         }
     317    35031292 :         h->free += pad + len;
     318    35031292 :         h->dirty = true;
     319             : 
     320             :         /* maintain hash table */
     321    35031292 :         pos -= extralen;
     322    35031292 :         if (GDK_ELIMBASE(pos) == 0) {   /* small string heap: link the next pointer */
     323             :                 /* the stridx_t next pointer directly precedes the
     324             :                  * string and optional (depending on hashash) hash
     325             :                  * value */
     326     2588966 :                 pos -= sizeof(stridx_t);
     327     2588966 :                 *(stridx_t *) (h->base + pos) = *bucket;
     328             :         }
     329    35031292 :         *bucket = (stridx_t) pos;       /* set bucket to the new string */
     330             : 
     331    35031292 :         return *dst;
     332             : }
     333             : 
     334             : /*
     335             :  * Convert an "" separated string to a GDK string value, checking that
     336             :  * the input is correct UTF-8.
     337             :  */
     338             : 
     339             : ssize_t
     340   326736521 : GDKstrFromStr(unsigned char *restrict dst, const unsigned char *restrict src, ssize_t len)
     341             : {
     342             :         unsigned char *p = dst;
     343   326736521 :         const unsigned char *cur = src, *end = src + len;
     344             :         bool escaped = false;
     345             :         int mask = 0, n, c, utf8char = 0;
     346             : 
     347   494650805 :         if (len >= 2 && strNil((const char *) src)) {
     348           0 :                 strcpy((char *) dst, str_nil);
     349           0 :                 return 1;
     350             :         }
     351             : 
     352             :         /* copy it in, while performing the correct escapes */
     353             :         /* n is the number of follow-on bytes left in a multi-byte
     354             :          * UTF-8 sequence */
     355  1766092945 :         for (cur = src, n = 0; cur < end || escaped; cur++) {
     356             :                 /* first convert any \ escapes and store value in c */
     357  1439356425 :                 if (escaped) {
     358      553524 :                         switch (*cur) {
     359        3819 :                         case '0':
     360             :                         case '1':
     361             :                         case '2':
     362             :                         case '3':
     363             :                         case '4':
     364             :                         case '5':
     365             :                         case '6':
     366             :                         case '7':
     367             :                                 /* \ with up to three octal digits */
     368        3819 :                                 c = base08(*cur);
     369        3819 :                                 if (num08(cur[1])) {
     370        3819 :                                         cur++;
     371        3819 :                                         c = mult08(c) + base08(*cur);
     372        3819 :                                         if (num08(cur[1])) {
     373        3819 :                                                 if (unlikely(c > 037)) {
     374             :                                                         /* octal
     375             :                                                          * escape
     376             :                                                          * sequence
     377             :                                                          * out of
     378             :                                                          * range */
     379           1 :                                                         GDKerror("not an octal number\n");
     380           1 :                                                         return -1;
     381             :                                                 }
     382        3818 :                                                 cur++;
     383        3818 :                                                 c = mult08(c) + base08(*cur);
     384        3818 :                                                 assert(c >= 0 && c <= 0377);
     385             :                                         }
     386             :                                 }
     387             :                                 break;
     388           1 :                         case 'x':
     389             :                                 /* \x with one or two hexadecimal digits */
     390           1 :                                 if (num16(cur[1])) {
     391           1 :                                         cur++;
     392           1 :                                         c = base16(*cur);
     393           1 :                                         if (num16(cur[1])) {
     394           1 :                                                 cur++;
     395           1 :                                                 c = mult16(c) + base16(*cur);
     396             :                                         }
     397             :                                 } else
     398             :                                         c = 'x';
     399             :                                 break;
     400           0 :                         case 'u':
     401             :                         case 'U':
     402             :                                 /* \u with four hexadecimal digits or
     403             :                                  * \U with eight hexadecimal digits */
     404           0 :                                 if (unlikely(n > 0)) {
     405             :                                         /* not when in the middle of a
     406             :                                          * UTF-8 sequence */
     407           0 :                                         goto notutf8;
     408             :                                 }
     409             :                                 c = 0;
     410           0 :                                 for (n = *cur == 'U' ? 8 : 4; n > 0; n--) {
     411           0 :                                         cur++;
     412           0 :                                         if (unlikely(!num16(*cur))) {
     413           0 :                                                 GDKerror("not a Unicode code point escape\n");
     414           0 :                                                 return -1;
     415             :                                         }
     416           0 :                                         c = c << 4 | base16(*cur);
     417             :                                 }
     418             :                                 /* n == 0 now */
     419           0 :                                 if (unlikely(c == 0 || c > 0x10FFFF ||
     420             :                                              (c & 0xFFF800) == 0xD800)) {
     421           0 :                                         GDKerror("illegal Unicode code point\n");
     422           0 :                                         return -1;
     423             :                                 }
     424           0 :                                 if (c < 0x80) {
     425           0 :                                         *p++ = (unsigned char) c;
     426             :                                 } else {
     427           0 :                                         if (c < 0x800) {
     428           0 :                                                 *p++ = 0xC0 | (c >> 6);
     429             :                                         } else {
     430           0 :                                                 if (c < 0x10000) {
     431           0 :                                                         *p++ = 0xE0 | (c >> 12);
     432             :                                                 } else {
     433           0 :                                                         *p++ = 0xF0 | (c >> 18);
     434           0 :                                                         *p++ = 0x80 | ((c >> 12) & 0x3F);
     435             :                                                 }
     436           0 :                                                 *p++ = 0x80 | ((c >> 6) & 0x3F);
     437             :                                         }
     438           0 :                                         *p++ = 0x80 | (c & 0x3F);
     439             :                                 }
     440             :                                 escaped = false;
     441           0 :                                 continue;
     442             :                         case 'a':
     443             :                                 c = '\a';
     444             :                                 break;
     445           0 :                         case 'b':
     446             :                                 c = '\b';
     447           0 :                                 break;
     448           3 :                         case 'f':
     449             :                                 c = '\f';
     450           3 :                                 break;
     451        6471 :                         case 'n':
     452             :                                 c = '\n';
     453        6471 :                                 break;
     454          10 :                         case 'r':
     455             :                                 c = '\r';
     456          10 :                                 break;
     457        1986 :                         case 't':
     458             :                                 c = '\t';
     459        1986 :                                 break;
     460           0 :                         case '\0':
     461             :                                 c = '\\';
     462           0 :                                 break;
     463      541234 :                         case '\'':
     464             :                         case '\\':
     465             :                                 /* \' and \\ can be handled by the
     466             :                                  * default case */
     467             :                         default:
     468             :                                 /* unrecognized \ escape, just copy
     469             :                                  * the backslashed character */
     470             :                                 c = *cur;
     471      541234 :                                 break;
     472             :                         }
     473             :                         escaped = false;
     474  1438802901 :                 } else if ((c = *cur) == '\\') {
     475             :                         escaped = true;
     476      553524 :                         continue;
     477             : #if 0
     478             :                 } else if (c == quote && cur[1] == quote) {
     479             :                         assert(c != 0);
     480             :                         if (unlikely(n > 0))
     481             :                                 goto notutf8;
     482             :                         *p++ = quote;
     483             :                         cur++;
     484             :                         continue;
     485             : #endif
     486             :                 }
     487             : 
     488  1438802900 :                 if (n > 0) {
     489             :                         /* we're still expecting follow-up bytes in a
     490             :                          * UTF-8 sequence */
     491       51201 :                         if (unlikely((c & 0xC0) != 0x80)) {
     492             :                                 /* incorrect UTF-8 sequence: byte is
     493             :                                  * not 10xxxxxx */
     494           0 :                                 goto notutf8;
     495             :                         }
     496       51201 :                         utf8char = (utf8char << 6) | (c & 0x3F);
     497       51201 :                         n--;
     498       51201 :                         if (n == 0) {
     499             :                                 /* this was the last byte in the sequence */
     500       26434 :                                 if (unlikely((utf8char & mask) == 0)) {
     501             :                                         /* incorrect UTF-8 sequence:
     502             :                                          * not shortest possible */
     503           0 :                                         goto notutf8;
     504             :                                 }
     505       26434 :                                 if (unlikely(utf8char > 0x10FFFF)) {
     506             :                                         /* incorrect UTF-8 sequence:
     507             :                                          * value too large */
     508           0 :                                         goto notutf8;
     509             :                                 }
     510       26434 :                                 if (unlikely((utf8char & 0x1FFF800) == 0xD800)) {
     511             :                                         /* incorrect UTF-8 sequence:
     512             :                                          * low or high surrogate
     513             :                                          * encoded as UTF-8 */
     514           0 :                                         goto notutf8;
     515             :                                 }
     516             :                         }
     517  1438751699 :                 } else if ((c & 0x80) == 0) {
     518             :                         ;
     519       26434 :                 } else if ((c & 0xE0) == 0xC0) {
     520             :                         n = 1;
     521             :                         mask = 0x000780;
     522        1677 :                         utf8char = c & 0x1F;
     523       24757 :                 } else if ((c & 0xF0) == 0xE0) {
     524             :                         n = 2;
     525             :                         mask = 0x00F800;
     526       24747 :                         utf8char = c & 0x0F;
     527          10 :                 } else if ((c & 0xF8) == 0xF0) {
     528             :                         n = 3;
     529             :                         mask = 0x1F0000;
     530          10 :                         utf8char = c & 0x07;
     531             :                 } else {
     532             :                         /* incorrect UTF-8 sequence */
     533           0 :                         goto notutf8;
     534             :                 }
     535  1438802900 :                 *p++ = c;
     536             :         }
     537   326736520 :         if (unlikely(n > 0)) {
     538             :                 /* incomplete UTF-8 sequence */
     539           0 :                 goto notutf8;
     540             :         }
     541   326736520 :         *p++ = 0;
     542   326736520 :         return len;
     543           0 :   notutf8:
     544           0 :         GDKerror("not a proper UTF-8 sequence\n");
     545           0 :         return -1;
     546             : }
     547             : 
                       /*
                        * Convert the text in src to a string value stored in *dst.
                        *
                        * src       text to convert
                        * len       in/out: size in bytes of the buffer *dst points to
                        * dst       in/out: destination buffer; in the quoted case it is
                        *           replaced here when it is NULL or smaller than needed
                        * external  false: src is copied as is;
                        *           true: src is parsed as a double-quoted literal with
                        *           backslash escapes, or as the unquoted word nil
                        *
                        * Returns -1 after a GDKerror() call or a failed GDKmalloc().
                        * Otherwise the result is: for !external whatever strcpy_len()
                        * returns; for a nil src 1; for the unquoted word nil the number of
                        * source bytes up to and including that word; for a quoted literal
                        * whatever GDKstrFromStr() returns.
                        *
                        * NOTE(review): atommem() is defined outside this view; presumably
                        * it makes sure *dst can hold the given number of bytes and may
                        * return on failure -- confirm.
                        * NOTE(review): the nil test compares only the first three bytes, so
                        * any unquoted token that merely starts with nil is accepted.
                        * NOTE(review): the nil-word case reports source bytes consumed;
                        * whether GDKstrFromStr() reports the same notion for quoted input
                        * is not visible here -- confirm.
                        */
     548             : ssize_t
     549    22296827 : strFromStr(const char *restrict src, size_t *restrict len, char **restrict dst, bool external)
     550             : {
     551             :         const char *cur = src, *start = NULL;
                               /* number of bytes to allocate for the result; it starts at
                                * 1, presumably for the terminating NUL */
     552             :         size_t l = 1;
     553             :         bool escaped = false;
     554             : 
                               /* internal representation: plain copy, no parsing */
     555    22296827 :         if (!external) {
     556    22296825 :                 size_t sz = strLen(src);
     557    22296825 :                 atommem(sz);
     558    22296827 :                 return (ssize_t) strcpy_len(*dst, src, sz);
     559             :         }
     560             : 
                               /* a nil source value yields the nil string */
     561           2 :         if (strNil(src)) {
     562           0 :                 atommem(2);
     563           0 :                 strcpy(*dst, str_nil);
     564           0 :                 return 1;
     565             :         }
     566             : 
                               /* skip leading white space, then expect either an opening
                                * double quote or the word nil */
     567           2 :         while (GDKisspace(*cur))
     568           0 :                 cur++;
     569           2 :         if (*cur != '"') {
     570           0 :                 if (strncmp(cur, "nil", 3) == 0) {
     571           0 :                         atommem(2);
     572           0 :                         strcpy(*dst, str_nil);
     573           0 :                         return (ssize_t) (cur - src) + 3;
     574             :                 }
     575           0 :                 GDKerror("not a quoted string\n");
     576           0 :                 return -1;
     577             :         }
     578             : 
     579             :         /* scout the string to find out its length and whether it was
     580             :          * properly quoted */
                               /* start is the first byte after the opening quote; the loop
                                * stops at the first double quote that is not preceded by an
                                * unescaped backslash.  A backslash that opens an escape is
                                * not counted in l, every other byte is. */
     581          17 :         for (start = ++cur; *cur != '"' || escaped; cur++) {
     582          15 :                 if (*cur == 0) {
     583           0 :                         GDKerror("no closing quotes\n");
     584           0 :                         return -1;
     585          15 :                 } else if (*cur == '\\' && !escaped) {
     586             :                         escaped = true;
     587             :                 } else {
     588             :                         escaped = false;
     589          15 :                         l++;
     590             :                 }
     591             :         }
     592             : 
     593             :         /* (re)allocate *dst when it is missing or smaller than l bytes */
     594           2 :         if (*dst == NULL || *len < l) {
     595           2 :                 GDKfree(*dst);
     596           2 :                 *dst = GDKmalloc(*len = l);
     597           2 :                 if (*dst == NULL) {
     598           0 :                         *len = 0;
     599           0 :                         return -1;
     600             :                 }
     601             :         }
     602             : 
                               /* decode the bytes between the quotes into *dst */
     603           2 :         return GDKstrFromStr((unsigned char *) *dst,
     604             :                              (const unsigned char *) start,
     605             :                              (ssize_t) (cur - start));
     606             : }
     607             : 
     608             : /*
     609             :  * Convert a GDK string value to something printable.
     610             :  */
     611             : /* all but control characters (in range 0 to 31) and DEL; in addition
                        * every value with bit 0x80 set counts as printable, which covers
                        * the bytes of multi-byte UTF-8 sequences.  The argument is
                        * evaluated more than once, so it must be free of side effects. */
     612             : #define printable_chr(ch)       ((' ' <= (ch) && (ch) <= '~') || ((ch) & 0x80) != 0)
     613             : 
                       /*
                        * Compute how many bytes the escaped form of src takes, not counting
                        * a terminating NUL.
                        *
                        * src         NUL-terminated input
                        * sep1, sep2  optional (may be NULL) strings; a position at which
                        *             one of them starts is counted as two bytes
                        * quote       character that is counted as two bytes
                        *
                        * Per source byte the count is: 2 for a backslash, the quote
                        * character or the start of a separator; 7 for the lead byte of a
                        * UTF-8 encoded code point in U+0080 through U+009F; 4 for any other
                        * byte for which printable_chr() is false; 1 otherwise.
                        *
                        * NOTE(review): escapedStr() in this file writes only two bytes for
                        * tab, newline, carriage return and form feed while 4 are counted
                        * here, so the result looks like an upper bound rather than an
                        * exact size -- confirm that callers only need an upper bound.
                        */
     614             : size_t
     615     9622872 : escapedStrlen(const char *restrict src, const char *sep1, const char *sep2, int quote)
     616             : {
     617             :         size_t end, sz = 0;
     618             :         size_t sep1len, sep2len;
     619             : 
                               /* a missing separator gets length 0, which disables its
                                * strncmp() test below */
     620     9622872 :         sep1len = sep1 ? strlen(sep1) : 0;
     621     9622872 :         sep2len = sep2 ? strlen(sep2) : 0;
     622   346234387 :         for (end = 0; src[end]; end++)
     623   336611515 :                 if (src[end] == '\\'
     624   336602456 :                     || src[end] == quote
     625   336515203 :                     || (sep1len && strncmp(src + end, sep1, sep1len) == 0)
     626   336515186 :                     || (sep2len && strncmp(src + end, sep2, sep2len) == 0)) {
     627       96329 :                         sz += 2;
     628   336515186 :                 } else if (src[end] == (char) '\302' &&
     629           6 :                            0200 <= ((int) src[end + 1] & 0377) &&
     630           6 :                            ((int) src[end + 1] & 0377) <= 0237) {
     631             :                         /* Unicode control character (code point range
     632             :                          * U-00000080 through U-0000009F) encoded in
     633             :                          * UTF-8 */
     634             :                         /* for the first one of the two UTF-8 bytes we
     635             :                          * count a width of 7 and for the second one
     636             :                          * 1, together that's 8, i.e. the width of two
     637             :                          * backslash-escaped octal coded characters */
     638           0 :                         sz += 7;
     639   336515186 :                 } else if (!printable_chr(src[end])) {
     640       32435 :                         sz += 4;
     641             :                 } else {
     642   336482751 :                         sz++;
     643             :                 }
     644     9622872 :         return sz;
     645             : }
     646             : 
                       /*
                        * Write an escaped, NUL-terminated copy of src to dst and return the
                        * number of bytes written, not counting the terminating NUL.
                        *
                        * dst         output buffer
                        * src         NUL-terminated input
                        * dstlen      size of dst in bytes
                        * sep1, sep2  optional (may be NULL) strings; a byte at which one of
                        *             them starts gets a backslash prefix
                        * quote       character that gets a backslash prefix
                        *
                        * Escaping rules, in order of precedence:
                        * - a byte for which printable_chr() is false, and each of the two
                        *   bytes of a UTF-8 encoded code point in U+0080 through U+009F,
                        *   becomes \t, \n, \r, \f or a backslash plus three octal digits;
                        * - a backslash, the quote character and the start of a separator
                        *   are copied with a backslash in front;
                        * - every other byte is copied unchanged.
                        *
                        * NOTE(review): the loop tests l < dstlen only once per source byte,
                        * yet a single iteration stores up to four bytes and the terminator
                        * is stored after the loop.  Only the assert checks the final
                        * length, so with asserts compiled out an undersized dst would be
                        * written past its end.  Callers presumably size dst with
                        * escapedStrlen() plus one -- confirm.
                        */
     647             : size_t
     648     4829385 : escapedStr(char *restrict dst, const char *restrict src, size_t dstlen, const char *sep1, const char *sep2, int quote)
     649             : {
     650             :         size_t cur = 0, l = 0;
     651             :         size_t sep1len, sep2len;
     652             : 
                               /* a missing separator gets length 0, which disables its
                                * strncmp() test below */
     653     4829385 :         sep1len = sep1 ? strlen(sep1) : 0;
     654     4829385 :         sep2len = sep2 ? strlen(sep2) : 0;
                               /* cur indexes src, l indexes dst.  The first condition below
                                * selects bytes that need a control style escape: not
                                * printable, or the lead byte 0302 followed by a byte in
                                * 0200..0237, or such a trailing byte right after 0302. */
     655   173285201 :         for (; src[cur] && l < dstlen; cur++)
     656   168455816 :                 if (!printable_chr(src[cur])
     657   168438930 :                     || (src[cur] == '\302'
     658           3 :                         && 0200 <= (src[cur + 1] & 0377)
     659           3 :                         && ((int) src[cur + 1] & 0377) <= 0237)
     660   168438930 :                     || (cur > 0
     661   163625138 :                         && src[cur - 1] == '\302'
     662           3 :                         && 0200 <= (src[cur] & 0377)
     663           3 :                         && (src[cur] & 0377) <= 0237)) {
     664       16886 :                         dst[l++] = '\\';
     665       16886 :                         switch (src[cur]) {
     666        1012 :                         case '\t':
     667        1012 :                                 dst[l++] = 't';
     668        1012 :                                 break;
     669       15867 :                         case '\n':
     670       15867 :                                 dst[l++] = 'n';
     671       15867 :                                 break;
     672           4 :                         case '\r':
     673           4 :                                 dst[l++] = 'r';
     674           4 :                                 break;
     675           2 :                         case '\f':
     676           2 :                                 dst[l++] = 'f';
     677           2 :                                 break;
     678           1 :                         default:
                                               /* three octal digits; snprintf() is bounded
                                                * by the space left, but l advances by 3
                                                * regardless of truncation */
     679           1 :                                 snprintf(dst + l, dstlen - l, "%03o", (unsigned char) src[cur]);
     680           1 :                                 l += 3;
     681           1 :                                 break;
     682             :                         }
     683   168438930 :                 } else if (src[cur] == '\\'
     684   168434211 :                            || src[cur] == quote
     685   168388582 :                            || (sep1len && strncmp(src + cur, sep1, sep1len) == 0)
     686   168388582 :                            || (sep2len && strncmp(src + cur, sep2, sep2len) == 0)) {
     687       50348 :                         dst[l++] = '\\';
     688       50348 :                         dst[l++] = src[cur];
     689             :                 } else {
     690   168388582 :                         dst[l++] = src[cur];
     691             :                 }
                               /* room for the terminator must remain */
     692     4829385 :         assert(l < dstlen);
     693     4829385 :         dst[l] = 0;
     694     4829385 :         return l;
     695             : }
     696             : 
                       /*
                        * Convert the string value src to text stored in *dst.
                        *
                        * dst       in/out: destination buffer
                        * len       in/out: size in bytes of the buffer *dst points to
                        * src       string value to convert
                        * external  false: src is copied as is;
                        *           true: a nil value becomes the word nil, any other value
                        *           is escaped with escapedStr() and wrapped in double
                        *           quotes
                        *
                        * Returns whatever strcpy_len() returns for !external, 3 for nil,
                        * and otherwise the length of the quoted text without its
                        * terminating NUL.
                        *
                        * NOTE(review): atommem() is defined outside this view; presumably
                        * it makes sure *dst can hold the given number of bytes and may
                        * return on failure -- confirm.
                        */
     697             : ssize_t
     698       36818 : strToStr(char **restrict dst, size_t *restrict len, const char *restrict src, bool external)
     699             : {
     700             :         size_t sz;
     701             : 
     702       36818 :         if (!external) {
     703         294 :                 sz = strLen(src);
     704         294 :                 atommem(sz);
     705         294 :                 return (ssize_t) strcpy_len(*dst, src, sz);
     706             :         }
     707       36524 :         if (strNil(src)) {
     708         938 :                 atommem(4);
     709         938 :                 strcpy(*dst, "nil");
     710         938 :                 return 3;
     711             :         } else {
     712             :                 ssize_t l = 0;
                                       /* NOTE(review): this sz shadows the function level
                                        * sz declared above */
     713       35586 :                 size_t sz = escapedStrlen(src, NULL, NULL, '"');
     714             : 
                                       /* escaped text plus two quotes plus the terminator */
     715       35585 :                 atommem(sz + 3);
                                       /* escape into the buffer from offset 1 on, leaving
                                        * byte 0 free for the opening quote */
     716       35585 :                 l = (ssize_t) escapedStr((*dst) + 1, src, *len - 1, NULL, NULL, '"');
     717       35584 :                 l++;
                                       /* store the closing quote first, then the opening
                                        * one, and terminate */
     718       35584 :                 (*dst)[0] = (*dst)[l++] = '"';
     719       35584 :                 (*dst)[l] = 0;
     720       35584 :                 return l;
     721             :         }
     722             : }
     723             : 
                       /*
                        * Read one string from stream s: first an int holding the length,
                        * then that many bytes.  The result is NUL-terminated.
                        *
                        * a       buffer to read into, or NULL
                        * dstlen  in/out: size of a in bytes; updated when a is reallocated
                        * s       stream to read from
                        * cnt     must be 1; it is only checked by the assert
                        *
                        * Returns the buffer holding the string (a itself or its
                        * reallocation), or NULL on failure.
                        *
                        * NOTE(review): the three failure paths treat the buffer
                        * differently: a failed or negative length leaves a untouched, a
                        * failed GDKrealloc() only overwrites the local a, and a failed
                        * mnstr_read() frees a.  A caller that sees NULL cannot tell which
                        * one happened -- confirm the intended ownership rule.
                        * NOTE(review): len + 1 is evaluated in int in the GDKrealloc() call
                        * and in the *dstlen assignment, whereas the comparison casts len to
                        * size_t first; for the largest int value the int addition would
                        * overflow.
                        */
     724             : str
     725          85 : strRead(str a, size_t *dstlen, stream *s, size_t cnt)
     726             : {
     727             :         int len;
     728             : 
     729             :         (void) cnt;
     730          85 :         assert(cnt == 1);
                               /* the length prefix must be readable and not negative */
     731          85 :         if (mnstr_readInt(s, &len) != 1 || len < 0)
     732             :                 return NULL;
                               /* grow the buffer when len bytes plus the terminator do
                                * not fit */
     733          85 :         if (a == NULL || *dstlen < (size_t) len + 1) {
     734           0 :                 if ((a = GDKrealloc(a, len + 1)) == NULL)
     735             :                         return NULL;
     736           0 :                 *dstlen = len + 1;
     737             :         }
                               /* an empty string has no payload to read */
     738          85 :         if (len && mnstr_read(s, a, len, 1) != 1) {
     739           0 :                 GDKfree(a);
     740           0 :                 return NULL;
     741             :         }
     742          85 :         a[len] = 0;
     743          85 :         return a;
     744             : }
     745             : 
                       /*
                        * Write one string to stream s: first its length as an int, then
                        * its bytes without the terminating NUL.
                        *
                        * a    NUL-terminated string to write
                        * s    stream to write to
                        * cnt  must be 1; it is only checked by the assert
                        *
                        * Returns GDK_SUCCEED when both writes succeed.  Returns GDK_FAIL
                        * when checkUTF8() rejects the string (after a GDKerror() call) or
                        * when one of the writes fails.
                        *
                        * NOTE(review): the size_t length is narrowed to int for
                        * mnstr_writeInt(); a string longer than an int can express would
                        * get a wrong length prefix -- confirm such strings cannot occur.
                        */
     746             : gdk_return
     747          85 : strWrite(const char *a, stream *s, size_t cnt)
     748             : {
     749          85 :         size_t len = strlen(a);
     750             : 
     751             :         (void) cnt;
     752          85 :         assert(cnt == 1);
     753          85 :         if (!checkUTF8(a)) {
     754           0 :                 GDKerror("incorrectly encoded UTF-8\n");
     755           0 :                 return GDK_FAIL;
     756             :         }
                               /* the payload is written only if the length prefix was */
     757          85 :         if (mnstr_writeInt(s, (int) len) && mnstr_write(s, a, len, 1) == 1)
     758             :                 return GDK_SUCCEED;
     759             :         else
     760           0 :                 return GDK_FAIL;
     761             : }
     762             : 
     763             : static gdk_return
     764          70 : concat_strings(BAT **bnp, ValPtr pt, BAT *b, oid seqb,
     765             :                BUN ngrp, struct canditer *restrict ci, BUN ncand,
     766             :                const oid *restrict gids, oid min, oid max, bool skip_nils,
     767             :                BAT *sep, const char *restrict separator, BUN *has_nils)
     768             : {
     769             :         oid gid;
     770             :         BUN i, p, nils = 0;
     771             :         size_t *restrict lengths = NULL, *restrict lastseplength = NULL, separator_length = 0, next_length;
     772             :         str *restrict astrings = NULL, s, sl;
     773          70 :         BATiter bi, bis = (BATiter) {0};
     774             :         BAT *bn = NULL;
     775             :         gdk_return rres = GDK_SUCCEED;
     776             : 
     777             :         /* exactly one of bnp and pt must be NULL, the other non-NULL */
     778          70 :         assert((bnp == NULL) != (pt == NULL));
     779             :         /* if pt not NULL, only a single group allowed */
     780          70 :         assert(pt == NULL || ngrp == 1);
     781          70 :         if (bnp) {
     782          29 :                 if ((bn = COLnew(min, TYPE_str, ngrp, TRANSIENT)) == NULL) {
     783             :                         rres = GDK_FAIL;
     784           0 :                         goto finish;
     785             :                 }
     786          29 :                 *bnp = bn;
     787             :         }
     788             : 
     789          70 :         bi = bat_iterator(b);
     790          70 :         if (sep)
     791          26 :                 bis = bat_iterator(sep);
     792             :         else
     793          44 :                 separator_length = strlen(separator);
     794             : 
     795          70 :         if (ngrp == 1) {
     796             :                 size_t offset = 0, single_length = 0;
     797             :                 bool empty = true;
     798             : 
     799          47 :                 if (separator) {
     800         410 :                         for (i = 0; i < ncand; i++) {
     801         381 :                                 p = canditer_next(ci) - seqb;
     802         381 :                                 s = BUNtvar(bi, p);
     803         381 :                                 if (strNil(s)) {
     804          15 :                                         if (!skip_nils) {
     805             :                                                 nils = 1;
     806             :                                                 break;
     807             :                                         }
     808             :                                 } else {
     809         366 :                                         single_length += strlen(s);
     810         366 :                                         if (!empty)
     811         339 :                                                 single_length += separator_length;
     812             :                                         empty = false;
     813             :                                 }
     814             :                         }
     815             :                 } else { /* sep case */
     816          18 :                         assert(sep != NULL);
     817         168 :                         for (i = 0; i < ncand; i++) {
     818         150 :                                 p = canditer_next(ci) - seqb;
     819         150 :                                 s = BUNtvar(bi, p);
     820         150 :                                 sl = BUNtvar(bis, p);
     821         150 :                                 if (strNil(s)) {
     822           4 :                                         if (!skip_nils) {
     823             :                                                 nils = 1;
     824             :                                                 break;
     825             :                                         }
     826             :                                 } else {
     827         146 :                                         single_length += strlen(s);
     828         146 :                                         if (!empty) {
     829         128 :                                                 if (strNil(sl)) {
     830          23 :                                                         if (!skip_nils) {
     831             :                                                                 nils = 1;
     832             :                                                                 break;
     833             :                                                         }
     834             :                                                 } else
     835         105 :                                                         single_length += strlen(sl);
     836             :                                         }
     837             :                                         empty = false;
     838             :                                 }
     839             :                         }
     840             :                 }
     841          47 :                 canditer_reset(ci);
     842             : 
     843          47 :                 if (nils == 0 && !empty) {
     844             :                         char *single_str = NULL;
     845             : 
     846          45 :                         if ((single_str = GDKmalloc(single_length + 1)) == NULL) {
     847           0 :                                 bat_iterator_end(&bi);
     848           0 :                                 if (sep)
     849           0 :                                         bat_iterator_end(&bis);
     850           0 :                                 return GDK_FAIL;
     851             :                         }
     852             :                         empty = true;
     853          45 :                         if (separator) {
     854         395 :                                 for (i = 0; i < ncand; i++) {
     855         368 :                                         p = canditer_next(ci) - seqb;
     856         368 :                                         s = BUNtvar(bi, p);
     857         368 :                                         if (strNil(s))
     858           2 :                                                 continue;
     859         366 :                                         if (!empty) {
     860         339 :                                                 memcpy(single_str + offset, separator, separator_length);
     861         339 :                                                 offset += separator_length;
     862             :                                         }
     863         366 :                                         next_length = strlen(s);
     864         366 :                                         memcpy(single_str + offset, s, next_length);
     865         366 :                                         offset += next_length;
     866             :                                         empty = false;
     867             :                                 }
     868             :                         } else { /* sep case */
     869          18 :                                 assert(sep != NULL);
     870         168 :                                 for (i = 0; i < ncand; i++) {
     871         150 :                                         p = canditer_next(ci) - seqb;
     872         150 :                                         s = BUNtvar(bi, p);
     873         150 :                                         sl = BUNtvar(bis, p);
     874         150 :                                         if (strNil(s))
     875           4 :                                                 continue;
     876         274 :                                         if (!empty && !strNil(sl)) {
     877         105 :                                                 next_length = strlen(sl);
     878         105 :                                                 memcpy(single_str + offset, sl, next_length);
     879         105 :                                                 offset += next_length;
     880             :                                         }
     881         146 :                                         next_length = strlen(s);
     882         146 :                                         memcpy(single_str + offset, s, next_length);
     883         146 :                                         offset += next_length;
     884             :                                         empty = false;
     885             :                                 }
     886             :                         }
     887             : 
     888          45 :                         single_str[offset] = '\0';
     889          45 :                         if (bn) {
     890           6 :                                 if (BUNappend(bn, single_str, false) != GDK_SUCCEED) {
     891           0 :                                         GDKfree(single_str);
     892           0 :                                         bat_iterator_end(&bi);
     893           0 :                                         if (sep)
     894           0 :                                                 bat_iterator_end(&bis);
     895           0 :                                         return GDK_FAIL;
     896             :                                 }
     897             :                         } else {
     898          39 :                                 pt->len = offset + 1;
     899          39 :                                 pt->val.sval = single_str;
     900             :                                 single_str = NULL;      /* don't free */
     901             :                         }
     902          45 :                         GDKfree(single_str);
     903           2 :                 } else if (bn) {
     904           0 :                         if (BUNappend(bn, str_nil, false) != GDK_SUCCEED) {
     905           0 :                                 bat_iterator_end(&bi);
     906           0 :                                 if (sep)
     907           0 :                                         bat_iterator_end(&bis);
     908           0 :                                 return GDK_FAIL;
     909             :                         }
     910             :                 } else {
     911           2 :                         if (VALinit(pt, TYPE_str, str_nil) == NULL) {
     912           0 :                                 bat_iterator_end(&bi);
     913           0 :                                 if (sep)
     914           0 :                                         bat_iterator_end(&bis);
     915           0 :                                 return GDK_FAIL;
     916             :                         }
     917             :                 }
     918          47 :                 bat_iterator_end(&bi);
     919          47 :                 if (sep)
     920          18 :                         bat_iterator_end(&bis);
     921          47 :                 return GDK_SUCCEED;
     922             :         } else {
     923             :                 /* first used to calculate the total length of
     924             :                  * each group, then the total offset */
     925          23 :                 lengths = GDKzalloc(ngrp * sizeof(*lengths));
     926          23 :                 astrings = GDKmalloc(ngrp * sizeof(str));
     927          23 :                 if (sep)
     928           8 :                         lastseplength = GDKzalloc(ngrp * sizeof(*lastseplength));
     929          23 :                 if (lengths == NULL || astrings == NULL || (sep && lastseplength == NULL)) {
     930             :                         rres = GDK_FAIL;
     931           0 :                         goto finish;
     932             :                 }
     933             :                 /* at first, set astrings[i] to str_nil, then for each
     934             :                  * non-empty group (even if all strings in the group
     935             :                  * are empty), set to NULL */
     936         131 :                 for (i = 0; i < ngrp; i++)
     937         108 :                         astrings[i] = (char *) str_nil;
     938             : 
     939          23 :                 if (separator) {
     940         178 :                         for (p = 0; p < ncand; p++) {
     941         163 :                                 i = canditer_next(ci) - seqb;
     942         163 :                                 if (gids[i] >= min && gids[i] <= max) {
     943         163 :                                         gid = gids[i] - min;
     944         163 :                                         if (lengths[gid] == (size_t) -1)
     945           0 :                                                 continue;
     946         163 :                                         s = BUNtvar(bi, i);
     947         163 :                                         if (!strNil(s)) {
     948         155 :                                                 lengths[gid] += strlen(s) + separator_length;
     949         155 :                                                 astrings[gid] = NULL;
     950           8 :                                         } else if (!skip_nils) {
     951           0 :                                                 nils++;
     952           0 :                                                 lengths[gid] = (size_t) -1;
     953           0 :                                                 astrings[gid] = (char *) str_nil;
     954             :                                         }
     955             :                                 }
     956             :                         }
     957             :                 } else { /* sep case */
     958           8 :                         assert(sep != NULL);
     959         151 :                         for (p = 0; p < ncand; p++) {
     960         143 :                                 i = canditer_next(ci) - seqb;
     961         143 :                                 if (gids[i] >= min && gids[i] <= max) {
     962         143 :                                         gid = gids[i] - min;
     963         143 :                                         if (lengths[gid] == (size_t) -1)
     964           0 :                                                 continue;
     965         143 :                                         s = BUNtvar(bi, i);
     966         143 :                                         sl = BUNtvar(bis, i);
     967         143 :                                         if (!strNil(s)) {
     968         140 :                                                 lengths[gid] += strlen(s);
     969         140 :                                                 if (!strNil(sl)) {
     970         129 :                                                         next_length = strlen(sl);
     971         129 :                                                         lengths[gid] += next_length;
     972         129 :                                                         lastseplength[gid] = next_length;
     973             :                                                 } else
     974          11 :                                                         lastseplength[gid] = 0;
     975         140 :                                                 astrings[gid] = NULL;
     976           3 :                                         } else if (!skip_nils) {
     977           0 :                                                 nils++;
     978           0 :                                                 lengths[gid] = (size_t) -1;
     979           0 :                                                 lastseplength[gid] = 0;
     980           0 :                                                 astrings[gid] = (char *) str_nil;
     981             :                                         }
     982             :                                 }
     983             :                         }
     984             :                 }
     985             : 
     986          23 :                 if (separator) {
     987          69 :                         for (i = 0; i < ngrp; i++) {
     988          54 :                                 if (astrings[i] == NULL) {
     989          52 :                                         if ((astrings[i] = GDKmalloc(lengths[i] + 1 - separator_length)) == NULL) {
     990             :                                                 rres = GDK_FAIL;
     991           0 :                                                 goto finish;
     992             :                                         }
     993          52 :                                         astrings[i][0] = 0;
     994          52 :                                         lengths[i] = 0;
     995             :                                 } else
     996           2 :                                         astrings[i] = NULL;
     997             :                         }
     998             :                 } else { /* sep case */
     999           8 :                         assert(sep != NULL);
    1000          62 :                         for (i = 0; i < ngrp; i++) {
    1001          54 :                                 if (astrings[i] == NULL) {
    1002          53 :                                         if ((astrings[i] = GDKmalloc(lengths[i] + 1 - lastseplength[i])) == NULL) {
    1003             :                                                 rres = GDK_FAIL;
    1004           0 :                                                 goto finish;
    1005             :                                         }
    1006          53 :                                         astrings[i][0] = 0;
    1007          53 :                                         lengths[i] = 0;
    1008             :                                 } else
    1009           1 :                                         astrings[i] = NULL;
    1010             :                         }
    1011             :                 }
    1012          23 :                 canditer_reset(ci);
    1013             : 
    1014          23 :                 if (separator) {
    1015         178 :                         for (p = 0; p < ncand; p++) {
    1016         163 :                                 i = canditer_next(ci) - seqb;
    1017         163 :                                 if (gids[i] >= min && gids[i] <= max) {
    1018         163 :                                         gid = gids[i] - min;
    1019         163 :                                         if (astrings[gid]) {
    1020         160 :                                                 s = BUNtvar(bi, i);
    1021         160 :                                                 if (strNil(s))
    1022           5 :                                                         continue;
    1023         155 :                                                 if (astrings[gid][lengths[gid]]) {
    1024         103 :                                                         memcpy(astrings[gid] + lengths[gid], separator, separator_length);
    1025         103 :                                                         lengths[gid] += separator_length;
    1026             :                                                 }
    1027         155 :                                                 next_length = strlen(s);
    1028         155 :                                                 memcpy(astrings[gid] + lengths[gid], s, next_length);
    1029         155 :                                                 lengths[gid] += next_length;
    1030         155 :                                                 astrings[gid][lengths[gid]] = 1;
    1031             :                                         }
    1032             :                                 }
    1033             :                         }
    1034             :                 } else { /* sep case */
    1035           8 :                         assert(sep != NULL);
    1036         151 :                         for (p = 0; p < ncand; p++) {
    1037         143 :                                 i = canditer_next(ci) - seqb;
    1038         143 :                                 if (gids[i] >= min && gids[i] <= max) {
    1039         143 :                                         gid = gids[i] - min;
    1040         143 :                                         if (astrings[gid]) {
    1041         142 :                                                 s = BUNtvar(bi, i);
    1042         142 :                                                 sl = BUNtvar(bis, i);
    1043         142 :                                                 if (strNil(s))
    1044           2 :                                                         continue;
    1045         227 :                                                 if (astrings[gid][lengths[gid]] && !strNil(sl)) {
    1046          79 :                                                         next_length = strlen(sl);
    1047          79 :                                                         memcpy(astrings[gid] + lengths[gid], sl, next_length);
    1048          79 :                                                         lengths[gid] += next_length;
    1049             :                                                 }
    1050         140 :                                                 next_length = strlen(s);
    1051         140 :                                                 memcpy(astrings[gid] + lengths[gid], s, next_length);
    1052         140 :                                                 lengths[gid] += next_length;
    1053         140 :                                                 astrings[gid][lengths[gid]] = 1;
    1054             :                                         }
    1055             :                                 }
    1056             :                         }
    1057             :                 }
    1058             : 
    1059         131 :                 for (i = 0; i < ngrp; i++) {
    1060         108 :                         if (astrings[i]) {
    1061         105 :                                 astrings[i][lengths[i]] = '\0';
    1062         105 :                                 if (BUNappend(bn, astrings[i], false) != GDK_SUCCEED) {
    1063             :                                         rres = GDK_FAIL;
    1064           0 :                                         goto finish;
    1065             :                                 }
    1066           3 :                         } else if (BUNappend(bn, str_nil, false) != GDK_SUCCEED) {
    1067             :                                 rres = GDK_FAIL;
    1068           0 :                                 goto finish;
    1069             :                         }
    1070             :                 }
    1071             :         }
    1072             : 
    1073          23 :   finish:
    1074          23 :         bat_iterator_end(&bi);
    1075          23 :         if (sep)
    1076           8 :                 bat_iterator_end(&bis);
    1077          23 :         if (has_nils)
    1078          23 :                 *has_nils = nils;
    1079          23 :         GDKfree(lengths);
    1080          23 :         GDKfree(lastseplength);
    1081          23 :         if (astrings) {
    1082         131 :                 for (i = 0; i < ngrp; i++) {
    1083         108 :                         if (astrings[i] != str_nil)
    1084         108 :                                 GDKfree(astrings[i]);
    1085             :                 }
    1086          23 :                 GDKfree(astrings);
    1087             :         }
    1088          23 :         if (rres != GDK_SUCCEED)
    1089           0 :                 BBPreclaim(bn);
    1090             : 
    1091             :         return rres;
    1092             : }
    1093             : /* Aggregate the strings of b (restricted by candidate list s) into a
                     :  * single string value stored in res.
                     :  *
                     :  * Exactly one of sep (a BAT of per-row separators) and separator (a
                     :  * constant string) must be supplied; a sep BAT holding exactly one
                     :  * value is treated as a constant separator.  With no candidates, or
                     :  * with a nil constant separator, res is set to nil when nil_if_empty
                     :  * is true and to the empty string otherwise.  Every other input is
                     :  * handed to concat_strings as one single group (ngrp is 1 and no
                     :  * group ids are passed).  abort_on_error is accepted but not used.
                     :  *
                     :  * Returns GDK_FAIL when VALinit fails; otherwise GDK_SUCCEED or
                     :  * whatever concat_strings returns. */
    1094             : gdk_return
    1095          42 : BATstr_group_concat(ValPtr res, BAT *b, BAT *s, BAT *sep, bool skip_nils,
    1096             :                     bool abort_on_error, bool nil_if_empty, const char *restrict separator)
    1097             : {
    1098             :         BUN ncand;
    1099             :         struct canditer ci;
    1100             : 
    1101             :         (void) abort_on_error; /* parameter is part of the interface only */
    1102          42 :         assert((separator && !sep) || (!separator && sep)); /* only one of them must be set */
    1103          42 :         res->vtype = TYPE_str;
    1104             : 
    1105          42 :         ncand = canditer_init(&ci, b, s);
    1106             : 
    1107          42 :         if (sep && BATcount(sep) == 1) { /* Only one element in sep: use it as the constant separator */
    1108           0 :                 BATiter bi = bat_iterator(sep);
    1109           0 :                 separator = BUNtvar(bi, 0);
    1110           0 :                 bat_iterator_end(&bi); /* NOTE(review): separator is still read after this call; presumably the caller keeps sep alive and unchanged -- confirm */
    1111             :                 sep = NULL;
    1112             :         }
    1113             : 
    1114          68 :         if (ncand == 0 || (separator && strNil(separator))) { /* trivial result, no concatenation needed */
    1115           1 :                 if (VALinit(res, TYPE_str, nil_if_empty ? str_nil : "") == NULL)
    1116             :                         return GDK_FAIL;
    1117           1 :                 return GDK_SUCCEED;
    1118             :         }
    1119             : 
    1120          41 :         return concat_strings(NULL, res, b, b->hseqbase, 1, &ci, ncand, NULL, 0, 0,
    1121             :                               skip_nils, sep, separator, NULL);
    1122             : }
    1123             : /* Grouped string concatenation: return a new BAT with one string per
                     :  * group, or NULL on error.
                     :  *
                     :  * b, g, e and s are passed to BATgroupaggrinit, which fills in min,
                     :  * max, ngrp, the candidate iterator ci and ncand; an error message
                     :  * from it is reported through GDKerror.  g must not be NULL.
                     :  * Exactly one of sep (per-row separators) and separator (constant)
                     :  * must be supplied; a sep BAT holding exactly one value is treated
                     :  * as a constant separator.
                     :  *
                     :  * Two shortcuts avoid calling concat_strings: when there are no
                     :  * candidates, no groups, or the constant separator is nil, a
                     :  * constant BAT of ngrp nil strings is returned; when g is dense, or
                     :  * key and free of nils, the result is BATconvert of b to TYPE_str. */
    1124             : BAT *
    1125          49 : BATgroupstr_group_concat(BAT *b, BAT *g, BAT *e, BAT *s, BAT *sep, bool skip_nils,
    1126             :                          bool abort_on_error, const char *restrict separator)
    1127             : {
    1128          49 :         BAT *bn = NULL;
    1129             :         oid min, max;
    1130          49 :         BUN ngrp, ncand, nils = 0; /* nils is filled in by concat_strings but not read afterwards */
    1131             :         struct canditer ci;
    1132             :         const char *err;
    1133             :         gdk_return res;
    1134             : 
    1135          49 :         assert((separator && !sep) || (!separator && sep)); /* only one of them must be set */
    1136             :         (void) skip_nils; /* NOTE(review): redundant, skip_nils is passed to concat_strings below */
    1137             : 
    1138          49 :         if ((err = BATgroupaggrinit(b, g, e, s, &min, &max, &ngrp,
    1139             :                                     &ci, &ncand)) !=NULL) {
    1140           0 :                 GDKerror("%s\n", err);
    1141           0 :                 return NULL;
    1142             :         }
    1143          49 :         if (g == NULL) {
    1144           0 :                 GDKerror("b and g must be aligned\n");
    1145           0 :                 return NULL;
    1146             :         }
    1147             : 
    1148          49 :         if (sep && BATcount(sep) == 1) { /* Only one element in sep: use it as the constant separator */
    1149           0 :                 BATiter bi = bat_iterator(sep);
    1150           0 :                 separator = BUNtvar(bi, 0);
    1151           0 :                 bat_iterator_end(&bi); /* NOTE(review): separator is still read after this call; presumably the caller keeps sep alive and unchanged -- confirm */
    1152             :                 sep = NULL;
    1153             :         }
    1154             : 
    1155          67 :         if (ncand == 0 || ngrp == 0 || (separator && strNil(separator))) {
    1156             :                 /* trivial: no strings to concat, so return bat
    1157             :                  * aligned with g with nil in the tail */
    1158           5 :                 return BATconstant(ngrp == 0 ? 0 : min, TYPE_str, str_nil, ngrp, TRANSIENT);
    1159             :         }
    1160             : 
    1161          44 :         if (BATtdense(g) || (g->tkey && g->tnonil)) {
    1162             :                 /* trivial: singleton groups, so all results are equal
    1163             :                  * to the inputs (but possibly a different type) */
    1164          15 :                 return BATconvert(b, s, TYPE_str, abort_on_error, 0, 0, 0);
    1165             :         }
    1166             : 
    1167          29 :         res = concat_strings(&bn, NULL, b, b->hseqbase, ngrp, &ci, ncand,
    1168          29 :                              (const oid *) Tloc(g, 0), min, max, skip_nils, sep,
    1169             :                              separator, &nils);
    1170          29 :         if (res != GDK_SUCCEED)
    1171             :                 return NULL;
    1172             : 
    1173          29 :         return bn;
    1174             : }
    1175             : /* Build, in the reusable buffer single_str, the concatenation of the
                     :  * non-nil strings found at positions [START, END) of iterator bi.
                     :  *
                     :  * With a constant separator, separator is placed between
                     :  * consecutive non-nil strings; otherwise the separator is read from
                     :  * row m of sepi (the row being appended) and left out when it is
                     :  * nil.  No separator precedes the first appended string.
                     :  *
                     :  * The first loop sums the required length into next_group_length,
                     :  * the buffer is then allocated or grown (max_group_length records
                     :  * its capacity without the terminator), and the second loop copies
                     :  * the pieces at offset.  When the range holds no non-nil string,
                     :  * single_str is set to str_nil and has_nils becomes true.
                     :  *
                     :  * The macro reads and writes variables of the expanding function
                     :  * and jumps to its allocation_error label when allocation fails.
                     :  * The users visible below zero next_group_length, next_length and
                     :  * offset and set empty to true before expanding it.  START and END
                     :  * are expanded twice and are not parenthesized. */
    1176             : #define compute_next_single_str(START, END)                             \
    1177             :         do {                                                            \
    1178             :                 for (oid m = START; m < END; m++) {                  \
    1179             :                         sb = BUNtvar(bi, m);                            \
    1180             :                                                                         \
    1181             :                         if (separator) {                                \
    1182             :                                 if (!strNil(sb)) {                      \
    1183             :                                         next_group_length += strlen(sb); \
    1184             :                                         if (!empty)                     \
    1185             :                                                 next_group_length += separator_length; \
    1186             :                                         empty = false;                  \
    1187             :                                 }                                       \
    1188             :                         } else { /* sep case: separator taken from the same row of sepi */ \
    1189             :                                 assert(sep != NULL);                    \
    1190             :                                 sl = BUNtvar(sepi, m);                  \
    1191             :                                                                         \
    1192             :                                 if (!strNil(sb)) {                      \
    1193             :                                         next_group_length += strlen(sb); \
    1194             :                                         if (!empty && !strNil(sl))      \
    1195             :                                                 next_group_length += strlen(sl); \
    1196             :                                         empty = false;                  \
    1197             :                                 }                                       \
    1198             :                         }                                               \
    1199             :                 }                                                       \
    1200             :                 if (empty) { /* no non-nil string in the range: result is nil */ \
    1201             :                         if (single_str == NULL) { /* reuse the same buffer, resize it when needed */ \
    1202             :                                 max_group_length = 1;                   \
    1203             :                                 if ((single_str = GDKmalloc(max_group_length + 1)) == NULL) \
    1204             :                                         goto allocation_error;          \
    1205             :                         } else if (1 > max_group_length) {           \
    1206             :                                 max_group_length = 1;                   \
    1207             :                                 if ((next_single_str = GDKrealloc(single_str, max_group_length + 1)) == NULL) \
    1208             :                                         goto allocation_error;          \
    1209             :                                 single_str = next_single_str;           \
    1210             :                         }                                               \
    1211             :                         strcpy(single_str, str_nil); /* assumes str_nil needs at most 2 bytes -- TODO confirm */ \
    1212             :                         has_nils = true;                                \
    1213             :                 } else {                                                \
    1214             :                         empty = true; /* reset so the copy loop again skips the leading separator */ \
    1215             :                         if (single_str == NULL) { /* reuse the same buffer, resize it when needed */ \
    1216             :                                 max_group_length = next_group_length;   \
    1217             :                                 if ((single_str = GDKmalloc(max_group_length + 1)) == NULL) \
    1218             :                                         goto allocation_error;          \
    1219             :                         } else if (next_group_length > max_group_length) { \
    1220             :                                 max_group_length = next_group_length;   \
    1221             :                                 if ((next_single_str = GDKrealloc(single_str, max_group_length + 1)) == NULL) \
    1222             :                                         goto allocation_error;          \
    1223             :                                 single_str = next_single_str;           \
    1224             :                         }                                               \
    1225             :                                                                         \
    1226             :                         for (oid m = START; m < END; m++) {          \
    1227             :                                 sb = BUNtvar(bi, m);                    \
    1228             :                                                                         \
    1229             :                                 if (separator) {                        \
    1230             :                                         if (strNil(sb))                 \
    1231             :                                                 continue;               \
    1232             :                                         if (!empty) {                   \
    1233             :                                                 memcpy(single_str + offset, separator, separator_length); \
    1234             :                                                 offset += separator_length; \
    1235             :                                         }                               \
    1236             :                                         next_length = strlen(sb);       \
    1237             :                                         memcpy(single_str + offset, sb, next_length); \
    1238             :                                         offset += next_length;          \
    1239             :                                         empty = false;                  \
    1240             :                                 } else { /* sep case: separator taken from the same row of sepi */ \
    1241             :                                         assert(sep != NULL);            \
    1242             :                                         sl = BUNtvar(sepi, m);          \
    1243             :                                                                         \
    1244             :                                         if (strNil(sb))                 \
    1245             :                                                 continue;               \
    1246             :                                         if (!empty && !strNil(sl)) {    \
    1247             :                                                 next_length = strlen(sl); \
    1248             :                                                 memcpy(single_str + offset, sl, next_length); \
    1249             :                                                 offset += next_length;  \
    1250             :                                         }                               \
    1251             :                                         next_length = strlen(sb);       \
    1252             :                                         memcpy(single_str + offset, sb, next_length); \
    1253             :                                         offset += next_length;          \
    1254             :                                         empty = false;                  \
    1255             :                                 }                                       \
    1256             :                         }                                               \
    1257             :                                                                         \
    1258             :                         single_str[offset] = '\0';                      \
    1259             :                 }                                                       \
    1260             : } while (0)
    1261             : /* Frame "unbounded preceding till current row" for rows [k, i).
                     :  *
                     :  * The complete concatenation of the range is computed once into
                     :  * single_str.  The rows are then walked in runs, each run ending
                     :  * just before the next row whose op[] entry is set.  slice_length
                     :  * accumulates the length of the prefix covering everything up to
                     :  * the end of the current run, and that prefix is written to r for
                     :  * every row of the run by temporarily NUL-terminating single_str
                     :  * at slice_length.  Runs met before any non-nil string get str_nil
                     :  * and set has_nils.  On exit k equals i.
                     :  *
                     :  * NOTE(review): op[] is read unconditionally; presumably an order
                     :  * BAT is always supplied for this frame type -- confirm.
                     :  * NOTE(review): a nil constant separator is skipped here but counted
                     :  * by compute_next_single_str; presumably callers never pass one --
                     :  * confirm. */
    1262             : #define ANALYTICAL_STR_GROUP_CONCAT_UNBOUNDED_TILL_CURRENT_ROW          \
    1263             :         do {                                                            \
    1264             :                 size_t slice_length = 0;                                \
    1265             :                 next_group_length = next_length = offset = 0;           \
    1266             :                 empty = true;                                           \
    1267             :                 compute_next_single_str(k, i); /* compute the entire string then slice it starting from the beginning */ \
    1268             :                 empty = true;                                           \
    1269             :                 for (; k < i;) {                                     \
    1270             :                         str nsep, nstr;                                 \
    1271             :                         oid m = k;                                      \
    1272             :                         j = k;                                          \
    1273             :                         do {                                            \
    1274             :                                 k++;                                    \
    1275             :                         } while (k < i && !op[k]);                   \
    1276             :                         for (; j < k; j++) {                         \
    1277             :                                 nstr = BUNtvar(bi, j);                  \
    1278             :                                 if (!strNil(nstr)) {                    \
    1279             :                                         slice_length += strlen(nstr);   \
    1280             :                                         if (!empty) {                   \
    1281             :                                                 if (separator) {        \
    1282             :                                                         nsep = (str) separator; \
    1283             :                                                 } else { /* sep case */ \
    1284             :                                                         assert(sep != NULL); \
    1285             :                                                         nsep = BUNtvar(sepi, j); \
    1286             :                                                 }                       \
    1287             :                                                 if (!strNil(nsep))      \
    1288             :                                                         slice_length += strlen(nsep); \
    1289             :                                         }                               \
    1290             :                                         empty = false;                  \
    1291             :                                 }                                       \
    1292             :                         }                                               \
    1293             :                         if (empty) {                                    \
    1294             :                                 for (j = m; j < k; j++)                      \
    1295             :                                         if (tfastins_nocheckVAR(r, j, str_nil) != GDK_SUCCEED) \
    1296             :                                                 goto allocation_error;  \
    1297             :                                 has_nils = true;                        \
    1298             :                         } else {                                        \
    1299             :                                 char save = single_str[slice_length];   \
    1300             :                                 single_str[slice_length] = '\0';        \
    1301             :                                 for (j = m; j < k; j++)                      \
    1302             :                                         if (tfastins_nocheckVAR(r, j, single_str) != GDK_SUCCEED) \
    1303             :                                                 goto allocation_error;  \
    1304             :                                 single_str[slice_length] = save;        \
    1305             :                         }                                               \
    1306             :                 }                                                       \
    1307             :         } while (0)
    1308             : /* Frame "all rows" for rows [k, i): the concatenation of the whole
                     :  * range is computed once with compute_next_single_str and the same
                     :  * string is written to r for every row of the range.  On exit k
                     :  * equals i.  Uses variables and the allocation_error label of the
                     :  * expanding function. */
    1309             : #define ANALYTICAL_STR_GROUP_CONCAT_ALL_ROWS                            \
    1310             :         do {                                                            \
    1311             :                 next_group_length = next_length = offset = 0;           \
    1312             :                 empty = true;                                           \
    1313             :                 compute_next_single_str(k, i);                          \
    1314             :                 for (; k < i; k++)                                   \
    1315             :                         if (tfastins_nocheckVAR(r, k, single_str) != GDK_SUCCEED) \
    1316             :                                 goto allocation_error;                  \
    1317             :         } while (0)
    1318             : /* Frame "current row" for rows [k, i): each row's own string from bi
                     :  * is written to r unchanged, and has_nils is raised when that
                     :  * string is nil.  On exit k equals i.  Uses variables and the
                     :  * allocation_error label of the expanding function. */
    1319             : #define ANALYTICAL_STR_GROUP_CONCAT_CURRENT_ROW                         \
    1320             :         do {                                                            \
    1321             :                 for (; k < i; k++) {                                 \
    1322             :                         str next = BUNtvar(bi, k);                      \
    1323             :                         if (tfastins_nocheckVAR(r, k, next) != GDK_SUCCEED) \
    1324             :                                 goto allocation_error;                  \
    1325             :                         has_nils |= strNil(next);                       \
    1326             :                 }                                                       \
    1327             :         } while (0)
    1328             : /* Any other frame, for rows [k, i): for every row k the concatenation
                     :  * over the positions [start[k], end[k]) is recomputed with
                     :  * compute_next_single_str and written to r.  On exit k equals i.
                     :  * Uses variables and the allocation_error label of the expanding
                     :  * function. */
    1329             : #define ANALYTICAL_STR_GROUP_CONCAT_OTHERS                              \
    1330             :         do {                                                            \
    1331             :                 for (; k < i; k++) {                                 \
    1332             :                         next_group_length = next_length = offset = 0;   \
    1333             :                         empty = true;                                   \
    1334             :                         compute_next_single_str(start[k], end[k]);      \
    1335             :                         if (tfastins_nocheckVAR(r, k, single_str) != GDK_SUCCEED) \
    1336             :                                 goto allocation_error;                  \
    1337             :                 }                                                       \
    1338             :         } while (0)
    1339             : /* Drive one of the frame macros above (IMP) over all rows.
                     :  *
                     :  * When a partition BAT p is present, i scans the rows and IMP is
                     :  * expanded at every i whose np[] entry is set, so IMP handles the
                     :  * rows [k, i) collected so far.  Afterwards i is set to cnt and IMP
                     :  * runs once more for the remaining rows.  IMP is expanded twice.
                     :  *
                     :  * NOTE(review): if np[0] can be set, IMP runs on the empty range
                     :  * [0, 0) -- confirm this cannot happen or is harmless. */
    1340             : #define ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(IMP)     \
    1341             :         do {                                            \
    1342             :                 if (p) {                                \
    1343             :                         for (; i < cnt; i++) {               \
    1344             :                                 if (np[i])              \
    1345             :                                         IMP;            \
    1346             :                         }                               \
    1347             :                 }                                       \
    1348             :                 i = cnt;                                \
    1349             :                 IMP;                                    \
    1350             :         } while (0)
    1351             : 
    1352             : gdk_return
    1353          47 : GDKanalytical_str_group_concat(BAT *r, BAT *p, BAT *o, BAT *b, BAT *sep, BAT *s, BAT *e, const char *restrict separator, int frame_type)
    1354             : {
    1355             :         bool has_nils = false, empty;
    1356          47 :         BATiter pi = bat_iterator(p);
    1357          47 :         BATiter oi = bat_iterator(o);
    1358          47 :         BATiter bi = bat_iterator(b);
    1359          47 :         BATiter sepi = bat_iterator(sep);
    1360          47 :         BATiter si = bat_iterator(s);
    1361          47 :         BATiter ei = bat_iterator(e);
    1362          47 :         oid i = 0, j = 0, k = 0, cnt = BATcount(b), *restrict start = si.base, *restrict end = ei.base;
    1363          47 :         bit *np = pi.base, *op = oi.base;
    1364             :         str sb, sl, single_str = NULL, next_single_str;
    1365             :         size_t separator_length = 0, next_group_length, max_group_length = 0, next_length, offset;
    1366             : 
    1367          47 :         assert((sep && !separator && BATcount(b) == BATcount(sep)) || (!sep && separator));
    1368          47 :         if (b->ttype != TYPE_str || r->ttype != TYPE_str || (sep && sep->ttype != TYPE_str)) {
    1369           0 :                 GDKerror("only string type is supported\n");
    1370           0 :                 bat_iterator_end(&pi);
    1371           0 :                 bat_iterator_end(&oi);
    1372           0 :                 bat_iterator_end(&bi);
    1373           0 :                 bat_iterator_end(&sepi);
    1374           0 :                 bat_iterator_end(&si);
    1375           0 :                 bat_iterator_end(&ei);
    1376           0 :                 return GDK_FAIL;
    1377             :         }
    1378          47 :         if (sep && BATcount(sep) == 1) { /* Only one element in sep */
    1379           0 :                 separator = BUNtvar(sepi, 0);
    1380             :                 sep = NULL;
    1381             :         }
    1382             : 
    1383          47 :         if (sep == NULL)
    1384          17 :                 separator_length = strlen(separator);
    1385             : 
    1386          47 :         if (cnt > 0) {
    1387          46 :                 switch (frame_type) {
    1388          23 :                 case 3: /* unbounded until current row */       {
    1389      134749 :                         ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(ANALYTICAL_STR_GROUP_CONCAT_UNBOUNDED_TILL_CURRENT_ROW);
    1390             :                 } break;
    1391           0 :                 case 4: /* current row until unbounded */
    1392           0 :                         goto notimplemented;
    1393          23 :                 case 5: /* all rows */  {
    1394         843 :                         ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(ANALYTICAL_STR_GROUP_CONCAT_ALL_ROWS);
    1395             :                 } break;
    1396           0 :                 case 6: /* current row */ {
    1397           0 :                         ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(ANALYTICAL_STR_GROUP_CONCAT_CURRENT_ROW);
    1398             :                 } break;
    1399           0 :                 default: {
    1400           0 :                         ANALYTICAL_STR_GROUP_CONCAT_PARTITIONS(ANALYTICAL_STR_GROUP_CONCAT_OTHERS);
    1401             :                 }
    1402             :                 }
    1403             :         }
    1404             : 
    1405          47 :         bat_iterator_end(&pi);
    1406          47 :         bat_iterator_end(&oi);
    1407          47 :         bat_iterator_end(&bi);
    1408          47 :         bat_iterator_end(&sepi);
    1409          47 :         bat_iterator_end(&si);
    1410          47 :         bat_iterator_end(&ei);
    1411          47 :         GDKfree(single_str);
    1412          47 :         BATsetcount(r, cnt);
    1413          47 :         r->tnonil = !has_nils;
    1414          47 :         r->tnil = has_nils;
    1415          47 :         return GDK_SUCCEED;
    1416           0 :   allocation_error:
    1417           0 :         bat_iterator_end(&pi);
    1418           0 :         bat_iterator_end(&oi);
    1419           0 :         bat_iterator_end(&bi);
    1420           0 :         bat_iterator_end(&sepi);
    1421           0 :         bat_iterator_end(&si);
    1422           0 :         bat_iterator_end(&ei);
    1423           0 :         GDKfree(single_str);
    1424           0 :         return GDK_FAIL;
    1425             :   notimplemented:
    1426           0 :         bat_iterator_end(&pi);
    1427           0 :         bat_iterator_end(&oi);
    1428           0 :         bat_iterator_end(&bi);
    1429           0 :         bat_iterator_end(&sepi);
    1430           0 :         bat_iterator_end(&si);
    1431           0 :         bat_iterator_end(&ei);
    1432           0 :         GDKerror("str_group_concat not yet implemented for current row until unbounded case\n");
    1433           0 :         return GDK_FAIL;
    1434             : }

Generated by: LCOV version 1.14