LCOV - code coverage report
Current view: top level - monetdb5/modules/mal - pcre.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 805 1061 75.9 %
Date: 2021-10-13 02:24:04 Functions: 49 52 94.2 %

          Line data    Source code
       1             : /*
       2             :  * This Source Code Form is subject to the terms of the Mozilla Public
       3             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       4             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       5             :  *
       6             :  * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V.
       7             :  */
       8             : 
       9             : /*
      10             :  * N. Nes
      11             :  * PCRE library interface
      12             :  * The  PCRE library is a set of functions that implement regular
      13             :  * expression pattern matching using the same syntax  and  semantics  as  Perl,
      14             :  * with  just  a  few  differences.  The  current  implementation of PCRE
      15             :  * (release 4.x) corresponds approximately with Perl 5.8, including  support
      16             :  * for  UTF-8  encoded  strings.   However,  this support has to be
      17             :  * explicitly enabled; it is not the default.
      18             :  *
      19             :  * ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre
      20             :  */
      21             : #include "monetdb_config.h"
      22             : #include <string.h>
      23             : 
      24             : #include "mal.h"
      25             : #include "mal_client.h"
      26             : #include "mal_interpreter.h"
      27             : #include "mal_exception.h"
      28             : 
      29             : #include <wchar.h>
      30             : #include <wctype.h>
      31             : 
      32             : #ifdef HAVE_LIBPCRE
      33             : #include <pcre.h>
      34             : #ifndef PCRE_STUDY_JIT_COMPILE
      35             : /* old library version on e.g. EPEL 6 */
      36             : #define pcre_free_study(x)              pcre_free(x)
      37             : #define PCRE_STUDY_JIT_COMPILE  0
      38             : #endif
      39             : #define JIT_COMPILE_MIN 1024    /* when to try JIT compilation of patterns */
      40             : 
      41             : #else
      42             : 
      43             : #include <regex.h>
      44             : 
      45             : typedef regex_t pcre;
      46             : #endif
      47             : 
      48             : /* current implementation assumes simple %keyword% [keyw%]* */
      49             : struct RE {
      50             :         char *k;
      51             :         uint32_t *w;
      52             :         bool search:1,
      53             :                 atend:1;
      54             :         size_t len;
      55             :         struct RE *n;
      56             : };
      57             : 
      58             : /* We cannot use strcasecmp and strncasecmp since they work byte for
      59             :  * byte and don't deal with multibyte encodings (such as UTF-8).
      60             :  *
      61             :  * We implement our own conversion from UTF-8 encoding to Unicode code
      62             :  * points which we store in uint32_t.  The reason for this is,
      63             :  * functions like mbsrtowcs are locale-dependent (so we need a UTF-8
      64             :  * locale to use them), and on Windows, wchar_t is only 2 bytes and
      65             :  * therefore cannot hold all Unicode code points.  We do use functions
      66             :  * such as towlower to convert a Unicode code point to its lower-case
      67             :  * equivalent, but again on Windows, if the code point doesn't fit in
      68             :  * 2 bytes, we skip this conversion and compare the unconverted code
      69             :  * points.
      70             :  *
      71             :  * Note, towlower is also locale-dependent, but we don't need a UTF-8
      72             :  * locale in order to use it. */
      73             : 
      74             : /* helper function to convert a UTF-8 multibyte character to a wide
      75             :  * character */
      76             : static size_t
      77      542098 : utfc8touc(uint32_t *restrict dest, const char *restrict src)
      78             : {
      79      542098 :         if ((src[0] & 0x80) == 0) {
      80      542031 :                 *dest = src[0];
      81      542031 :                 return src[0] != 0;
      82          67 :         } else if ((src[0] & 0xE0) == 0xC0
      83          40 :                    && (src[1] & 0xC0) == 0x80
      84          40 :                    && (src[0] & 0x1E) != 0) {
      85          40 :                 *dest = (src[0] & 0x1F) << 6
      86          40 :                         | (src[1] & 0x3F);
      87          40 :                 return 2;
      88          27 :         } else if ((src[0] & 0xF0) == 0xE0
      89          27 :                    && (src[1] & 0xC0) == 0x80
      90          27 :                    && (src[2] & 0xC0) == 0x80
      91          27 :                    && ((src[0] & 0x0F) != 0
      92           0 :                        || (src[1] & 0x20) != 0)) {
      93          27 :                 *dest = (src[0] & 0x0F) << 12
      94          27 :                         | (src[1] & 0x3F) << 6
      95          27 :                         | (src[2] & 0x3F);
      96          27 :                 return 3;
      97           0 :         } else if ((src[0] & 0xF8) == 0xF0
      98           0 :                    && (src[1] & 0xC0) == 0x80
      99           0 :                    && (src[2] & 0xC0) == 0x80
     100           0 :                    && (src[3] & 0xC0) == 0x80) {
     101           0 :                 uint32_t c = (src[0] & 0x07) << 18
     102           0 :                         | (src[1] & 0x3F) << 12
     103           0 :                         | (src[2] & 0x3F) << 6
     104           0 :                         | (src[3] & 0x3F);
     105           0 :                 if (c < 0x10000
     106           0 :                     || c > 0x10FFFF
     107             :                     || (c & 0x1FF800) == 0x00D800)
     108             :                         return (size_t) -1;
     109           0 :                 *dest = c;
     110           0 :                 return 4;
     111             :         }
     112             :         return (size_t) -1;
     113             : }
     114             : 
     115             : /* helper function to convert a UTF-8 string to a wide character
     116             :  * string, the wide character string is allocated */
     117             : static uint32_t *
     118         309 : utf8stoucs(const char *src)
     119             : {
     120             :         uint32_t *dest;
     121             :         size_t i = 0;
     122             :         size_t j = 0;
     123             : 
     124             :         /* count how many uint32_t's we need, while also checking for
     125             :          * correctness of the input */
     126        2182 :         while (src[j]) {
     127        1873 :                 i++;
     128        1873 :                 if ((src[j+0] & 0x80) == 0) {
     129        1814 :                         j += 1;
     130          59 :                 } else if ((src[j+0] & 0xE0) == 0xC0
     131          24 :                            && (src[j+1] & 0xC0) == 0x80
     132          24 :                            && (src[j+0] & 0x1E) != 0) {
     133          24 :                         j += 2;
     134          35 :                 } else if ((src[j+0] & 0xF0) == 0xE0
     135          35 :                            && (src[j+1] & 0xC0) == 0x80
     136          35 :                            && (src[j+2] & 0xC0) == 0x80
     137          35 :                            && ((src[j+0] & 0x0F) != 0
     138           0 :                                || (src[j+1] & 0x20) != 0)) {
     139          35 :                         j += 3;
     140           0 :                 } else if ((src[j+0] & 0xF8) == 0xF0
     141           0 :                            && (src[j+1] & 0xC0) == 0x80
     142           0 :                            && (src[j+2] & 0xC0) == 0x80
     143           0 :                            && (src[j+3] & 0xC0) == 0x80) {
     144           0 :                         uint32_t c = (src[j+0] & 0x07) << 18
     145           0 :                                 | (src[j+1] & 0x3F) << 12
     146           0 :                                 | (src[j+2] & 0x3F) << 6
     147           0 :                                 | (src[j+3] & 0x3F);
     148           0 :                         if (c < 0x10000
     149           0 :                             || c > 0x10FFFF
     150             :                             || (c & 0x1FF800) == 0x00D800)
     151             :                                 return NULL;
     152           0 :                         j += 4;
     153             :                 } else {
     154             :                         return NULL;
     155             :                 }
     156             :         }
     157         309 :         dest = GDKmalloc((i + 1) * sizeof(uint32_t));
     158         309 :         if (dest == NULL)
     159             :                 return NULL;
     160             :         /* go through the source string again, this time we can skip
     161             :          * the correctness tests */
     162             :         i = j = 0;
     163        2198 :         while (src[j]) {
     164        1889 :                 if ((src[j+0] & 0x80) == 0) {
     165        1830 :                         dest[i++] = src[j+0];
     166        1830 :                         j += 1;
     167          59 :                 } else if ((src[j+0] & 0xE0) == 0xC0) {
     168          24 :                         dest[i++] = (src[j+0] & 0x1F) << 6
     169          24 :                                 | (src[j+1] & 0x3F);
     170          24 :                         j += 2;
     171          35 :                 } else if ((src[j+0] & 0xF0) == 0xE0) {
     172          35 :                         dest[i++] = (src[j+0] & 0x0F) << 12
     173          35 :                                 | (src[j+1] & 0x3F) << 6
     174          35 :                                 | (src[j+2] & 0x3F);
     175          35 :                         j += 3;
     176           0 :                 } else if ((src[j+0] & 0xF8) == 0xF0) {
     177           0 :                         dest[i++] = (src[j+0] & 0x07) << 18
     178           0 :                                 | (src[j+1] & 0x3F) << 12
     179           0 :                                 | (src[j+2] & 0x3F) << 6
     180           0 :                                 | (src[j+3] & 0x3F);
     181           0 :                         j += 4;
     182             :                 }
     183             :         }
     184         309 :         dest[i] = 0;
     185         309 :         return dest;
     186             : }
     187             : 
     188             : static size_t
     189             : myucslen(const uint32_t *ucs)
     190             : {
     191             :         size_t i = 0;
     192             : 
     193      219507 :         while (ucs[i])
     194      205642 :                 i++;
     195             :         return i;
     196             : }
     197             : 
     198             : static inline bool
     199          30 : mywstrncaseeq(const char *restrict s1, const uint32_t *restrict s2, size_t n2, bool atend)
     200             : {
     201             :         uint32_t c1;
     202             : 
     203          59 :         while (n2 > 0) {
     204          36 :                 size_t nn1 = utfc8touc(&c1, s1);
     205          36 :                 if (nn1 == 0 || nn1 == (size_t) -1)
     206           0 :                         return (*s2 == 0);
     207          36 :                 if (*s2 == 0)
     208             :                         return false;
     209          36 :                 if (nn1 == (size_t) -1 || nn1 == (size_t) -2)
     210             :                         return true;     /* actually an error that shouldn't happen */
     211             : #if SIZEOF_WCHAR_T == 2
     212             :                 if (c1 > 0xFFFF || *s2 > 0xFFFF) {
     213             :                         if (c1 != *s2)
     214             :                                 return false;
     215             :                 } else
     216             : #endif
     217          36 :                 if (towlower((wint_t) c1) != towlower((wint_t) *s2))
     218             :                         return false;
     219          29 :                 s1 += nn1;
     220          29 :                 n2--;
     221          29 :                 s2++;
     222             :         }
     223          23 :         return !atend || *s1 == 0;
     224             : }
     225             : 
     226             : static inline int
     227           4 : mystrcasecmp(const char *s1, const char *s2)
     228             : {
     229             :         uint32_t c1, c2;
     230             : 
     231           0 :         for (;;) {
     232           4 :                 size_t nn1 = utfc8touc(&c1, s1);
     233           4 :                 size_t nn2 = utfc8touc(&c2, s2);
     234           4 :                 if (nn1 == 0 || nn1 == (size_t) -1)
     235           1 :                         return -(nn2 != 0 && nn2 != (size_t) -1);
     236           3 :                 if (nn2 == 0 || nn2 == (size_t) -1)
     237             :                         return 1;
     238           2 :                 if (nn1 == (size_t) -1 || nn1 == (size_t) -2 ||
     239           2 :                         nn2 == (size_t) -1 || nn2 == (size_t) -2)
     240             :                         return 0;        /* actually an error that shouldn't happen */
     241             : #if SIZEOF_WCHAR_T == 2
     242             :                 if (c1 > 0xFFFF || c2 > 0xFFFF) {
     243             :                         if (c1 != c2)
     244             :                                 return c1 - c2;
     245             :                 } else
     246             : #endif
     247           2 :                 if (towlower((wint_t) c1) != towlower((wint_t) c2))
     248           2 :                         return towlower((wint_t) c1) - towlower((wint_t) c2);
     249           0 :                 s1 += nn1;
     250           0 :                 s2 += nn2;
     251             :         }
     252             : }
     253             : 
     254             : static inline int
     255         263 : mywstrcasecmp(const char *restrict s1, const uint32_t *restrict s2)
     256             : {
     257             :         uint32_t c1;
     258             : 
     259         668 :         for (;;) {
     260         931 :                 size_t nn1 = utfc8touc(&c1, s1);
     261         931 :                 if (nn1 == 0 || nn1 == (size_t) -1)
     262         165 :                         return -(*s2 != 0);
     263         766 :                 if (*s2 == 0)
     264             :                         return 1;
     265         764 :                 if (nn1 == (size_t) -1 || nn1 == (size_t) -2)
     266             :                         return 0;        /* actually an error that shouldn't happen */
     267             : #if SIZEOF_WCHAR_T == 2
     268             :                 if (c1 > 0xFFFF || *s2 > 0xFFFF) {
     269             :                         if (c1 != *s2)
     270             :                                 return c1 - *s2;
     271             :                 } else
     272             : #endif
     273         764 :                 if (towlower((wint_t) c1) != towlower((wint_t) *s2))
     274          96 :                         return towlower((wint_t) c1) - towlower((wint_t) *s2);
     275         668 :                 s1 += nn1;
     276         668 :                 s2++;
     277             :         }
     278             : }
     279             : 
     280             : static inline const char *
     281       13865 : mywstrcasestr(const char *restrict haystack, const uint32_t *restrict wneedle, bool atend)
     282             : {
     283             :         size_t nlen = myucslen(wneedle);
     284             : 
     285       13865 :         if (nlen == 0)
     286           0 :                 return atend ? haystack + strlen(haystack) : haystack;
     287             : 
     288             :         size_t hlen = strlen(haystack);
     289             : 
     290      473551 :         while (*haystack) {
     291             :                 size_t i;
     292             :                 size_t h;
     293             :                 size_t step = 0;
     294      553060 :                 for (i = h = 0; i < nlen; i++) {
     295             :                         uint32_t c;
     296      550591 :                         size_t j = utfc8touc(&c, haystack + h);
     297      550629 :                         if (j == 0 || j == (size_t) -1)
     298          12 :                                 return NULL;
     299      550617 :                         if (i == 0) {
     300             :                                 step = j;
     301             :                         }
     302             : #if SIZEOF_WCHAR_T == 2
     303             :                         if (c > 0xFFFF || wneedle[i] > 0xFFFF) {
     304             :                                 if (c != wneedle[i])
     305             :                                         break;
     306             :                         } else
     307             : #endif
     308      550617 :                         if (towlower((wint_t) c) != towlower((wint_t) wneedle[i]))
     309             :                                 break;
     310       90620 :                         h += j;
     311             :                 }
     312      462466 :                 if (i == nlen && (!atend || haystack[h] == 0))
     313        2780 :                         return haystack;
     314      459686 :                 haystack += step;
     315             :                 hlen -= step;
     316             :         }
     317             :         return NULL;
     318             : }
     319             : 
     320             : /* returns true if the pattern does not contain unescaped `_' (single
     321             :  * character match) and ends with unescaped `%' (any sequence
     322             :  * match) */
     323             : static inline bool
     324        1917 : re_simple(const char *pat, unsigned char esc)
     325             : {
     326             :         bool escaped = false;
     327             : 
     328        1917 :         if (pat == 0)
     329             :                 return false;
     330        1917 :         if (*pat == '%') {
     331        1255 :                 pat++;
     332             :         }
     333       14618 :         while (*pat) {
     334       12999 :                 if (escaped) {
     335             :                         escaped = false;
     336       12875 :                 } else if ((unsigned char) *pat == esc) {
     337             :                         escaped = true;
     338       12752 :                 } else if (*pat == '_') {
     339             :                         return false;
     340             :                 }
     341       12701 :                 pat++;
     342             :         }
     343             :         return true;
     344             : }
     345             : 
     346             : static inline bool
     347        2796 : re_is_pattern_properly_escaped(const char *pat, unsigned char esc)
     348             : {
     349             :         bool escaped = false;
     350             : 
     351        2796 :         if (pat == 0)
     352             :                 return true;
     353       22852 :         while (*pat) {
     354       20056 :                 if (escaped) {
     355             :                         escaped = false;
     356       19924 :                 } else if ((unsigned char) *pat == esc) {
     357             :                         escaped = true;
     358             :                 }
     359       20056 :                 pat++;
     360             :         }
     361        2796 :         return escaped ? false : true;
     362             : }
     363             : 
     364             : static inline bool
     365        2789 : is_strcmpable(const char *pat, const char *esc)
     366             : {
     367        2789 :         if (pat[strcspn(pat, "%_")])
     368             :                 return false;
     369         902 :         return strlen(esc) == 0 || strNil(esc) || strstr(pat, esc) == NULL;
     370             : }
     371             : 
     372             : static inline bool
     373       13981 : re_match_ignore(const char *restrict s, const struct RE *restrict pattern)
     374             : {
     375             :         const struct RE *r;
     376             : 
     377       16780 :         for (r = pattern; r; r = r->n) {
     378       14002 :                 if (*r->w == 0 && (r->search || *s == 0))
     379             :                         return true;
     380       27920 :                 if (!*s ||
     381             :                         (r->search
     382       13906 :                          ? (s = mywstrcasestr(s, r->w, r->atend)) == NULL
     383          30 :                          : !mywstrncaseeq(s, r->w, r->len, r->atend)))
     384       11215 :                         return false;
     385        2799 :                 s += r->len;
     386             :         }
     387             :         return true;
     388             : }
     389             : 
     390             : static inline bool
     391       39573 : re_match_no_ignore(const char *restrict s, const struct RE *restrict pattern)
     392             : {
     393             :         const struct RE *r;
     394             :         size_t l;
     395             : 
     396       43175 :         for (r = pattern; r; r = r->n) {
     397       40950 :                 if (*r->k == 0 && (r->search || *s == 0))
     398             :                         return true;
     399       41286 :                 if (!*s ||
     400             :                         (r->search
     401       40842 :                          ? (r->atend
     402       25377 :                                 ? (l = strlen(s)) < r->len || strcmp(s + l - r->len, r->k) != 0
     403       25008 :                                 : (s = strstr(s, r->k)) == NULL)
     404             :                          : (r->atend
     405       15465 :                                 ? strcmp(s, r->k) != 0
     406       15371 :                                 : strncmp(s, r->k, r->len) != 0)))
     407             :                         return false;
     408        3602 :                 s += r->len;
     409             :         }
     410             :         return true;
     411             : }
     412             : 
     413             : static void
     414        1620 : re_destroy(struct RE *p)
     415             : {
     416        1620 :         if (p) {
     417        1620 :                 GDKfree(p->k);
     418        1621 :                 GDKfree(p->w);
     419             :                 do {
     420        1715 :                         struct RE *n = p->n;
     421             : 
     422        1715 :                         GDKfree(p);
     423             :                         p = n;
     424        1714 :                 } while (p);
     425             :         }
     426        1619 : }
     427             : 
     428             : /* Create a linked list of RE structures.  Depending on the caseignore
     429             :  * flag, the w (if true) or the k (if false) field is used.  These
     430             :  * fields in the first structure are allocated, whereas in all
     431             :  * subsequent structures the fields point into the allocated buffer of
     432             :  * the first. */
     433             : static struct RE *
     434        1621 : re_create(const char *pat, bool caseignore, uint32_t esc)
     435             : {
     436        1621 :         struct RE *r = GDKmalloc(sizeof(struct RE)), *n = r;
     437             :         bool escaped = false;
     438             : 
     439        1618 :         if (r == NULL)
     440             :                 return NULL;
     441        1618 :         *r = (struct RE) {.atend = true};
     442             : 
     443        2721 :         while (esc != '%' && *pat == '%') {
     444        1103 :                 pat++; /* skip % */
     445        1103 :                 r->search = true;
     446             :         }
     447        1618 :         if (caseignore) {
     448             :                 uint32_t *wp;
     449             :                 uint32_t *wq;
     450          98 :                 wp = utf8stoucs(pat);
     451          98 :                 if (wp == NULL) {
     452           0 :                         GDKfree(r);
     453           0 :                         return NULL;
     454             :                 }
     455          98 :                 r->w = wp;
     456             :                 wq = wp;
     457         912 :                 while (*wp) {
     458         814 :                         if (escaped) {
     459           1 :                                 *wq++ = *wp;
     460           1 :                                 n->len++;
     461             :                                 escaped = false;
     462         813 :                         } else if (*wp == esc) {
     463             :                                 escaped = true;
     464         812 :                         } else if (*wp == '%') {
     465          93 :                                 n->atend = false;
     466          93 :                                 while (wp[1] == '%')
     467           0 :                                         wp++;
     468          93 :                                 if (wp[1]) {
     469          20 :                                         n = n->n = GDKmalloc(sizeof(struct RE));
     470          20 :                                         if (n == NULL)
     471           0 :                                                 goto bailout;
     472          20 :                                         *n = (struct RE) {.search = true, .atend = true, .w = wp + 1};
     473             :                                 }
     474          93 :                                 *wq++ = 0;
     475             :                         } else {
     476         719 :                                 *wq++ = *wp;
     477         719 :                                 n->len++;
     478             :                         }
     479         814 :                         wp++;
     480             :                 }
     481          98 :                 *wq = 0;
     482             :         } else {
     483             :                 char *p, *q;
     484        1520 :                 if ((p = GDKstrdup(pat)) == NULL) {
     485           0 :                         GDKfree(r);
     486           0 :                         return NULL;
     487             :                 }
     488        1523 :                 r->k = p;
     489             :                 q = p;
     490       13042 :                 while (*p) {
     491       11519 :                         if (escaped) {
     492         115 :                                 *q++ = *p;
     493         115 :                                 n->len++;
     494             :                                 escaped = false;
     495       11404 :                         } else if ((unsigned char) *p == esc) {
     496             :                                 escaped = true;
     497       11289 :                         } else if (*p == '%') {
     498        1371 :                                 n->atend = false;
     499        1371 :                                 while (p[1] == '%')
     500           0 :                                         p++;
     501        1371 :                                 if (p[1]) {
     502          75 :                                         n = n->n = GDKmalloc(sizeof(struct RE));
     503          75 :                                         if (n == NULL)
     504           0 :                                                 goto bailout;
     505          75 :                                         *n = (struct RE) {.search = true, .atend = true, .k = p + 1};
     506             :                                 }
     507        1371 :                                 *q++ = 0;
     508             :                         } else {
     509        9918 :                                 *q++ = *p;
     510        9918 :                                 n->len++;
     511             :                         }
     512       11519 :                         p++;
     513             :                 }
     514        1523 :                 *q = 0;
     515             :         }
     516             :         return r;
     517           0 :   bailout:
     518           0 :         re_destroy(r);
     519           0 :         return NULL;
     520             : }
     521             : 
     522             : #ifdef HAVE_LIBPCRE
     523             : static str
     524          24 : pcre_compile_wrap(pcre **res, const char *pattern, bit insensitive)
     525             : {
     526             :         pcre *r;
     527          24 :         const char *err_p = NULL;
     528          24 :         int errpos = 0;
     529             :         int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK | PCRE_MULTILINE;
     530          24 :         if (insensitive)
     531             :                 options |= PCRE_CASELESS;
     532             : 
     533          24 :         if ((r = pcre_compile(pattern, options, &err_p, &errpos, NULL)) == NULL) {
     534           0 :                 throw(MAL, "pcre.compile", OPERATION_FAILED
     535             :                           " with\n'%s'\nat %d in\n'%s'.\n",
     536             :                           err_p, errpos, pattern);
     537             :         }
     538          24 :         *res = r;
     539          24 :         return MAL_SUCCEED;
     540             : }
     541             : #endif
     542             : 
     543             : /* maximum number of back references and quoted \ or $ in replacement string */
     544             : #define MAX_NR_REFS             20
     545             : 
     546             : struct backref {
     547             :         int idx;
     548             :         int start;
     549             :         int end;
     550             : };
     551             : 
     552             : #ifdef HAVE_LIBPCRE
     553             : /* fill in parameter backrefs (length maxrefs) with information about
     554             :  * back references in the replacement string; a back reference is a
     555             :  * dollar or backslash followed by a number */
     556             : static int
     557         127 : parse_replacement(const char *replacement, int len_replacement,
     558             :                                   struct backref *backrefs, int maxrefs)
     559             : {
     560             :         int nbackrefs = 0;
     561             : 
     562         756 :         for (int i = 0; i < len_replacement && nbackrefs < maxrefs; i++) {
     563         629 :                 if (replacement[i] == '$' || replacement[i] == '\\') {
     564             :                         char *endptr;
     565           9 :                         backrefs[nbackrefs].idx = strtol(replacement + i + 1, &endptr, 10);
     566           9 :                         if (endptr > replacement + i + 1) {
     567           9 :                                 int k = (int) (endptr - (replacement + i + 1));
     568           9 :                                 backrefs[nbackrefs].start = i;
     569           9 :                                 backrefs[nbackrefs].end = i + k + 1;
     570           9 :                                 nbackrefs++;
     571           0 :                         } else if (replacement[i] == replacement[i + 1]) {
     572             :                                 /* doubled $ or \, we must copy just one to the output */
     573           0 :                                 backrefs[nbackrefs].idx = INT_MAX; /* impossible value > 0 */
     574           0 :                                 backrefs[nbackrefs].start = i;
     575           0 :                                 backrefs[nbackrefs].end = i + 1;
     576             :                                 i++;                    /* don't look at second $ or \ again */
     577           0 :                                 nbackrefs++;
     578             :                         }
     579             :                         /* else: $ or \ followed by something we don't recognize,
     580             :                          * so just leave it */
     581             :                 }
     582             :         }
     583         127 :         return nbackrefs;
     584             : }
     585             : 
     586             : static char *
     587       60980 : single_replace(pcre *pcre_code, pcre_extra *extra,
     588             :                            const char *origin_str, int len_origin_str,
     589             :                            int exec_options, int *ovector, int ovecsize,
     590             :                            const char *replacement, int len_replacement,
     591             :                            struct backref *backrefs, int nbackrefs,
     592             :                            bool global, char *result, int *max_result)
     593             : {
     594             :         int offset = 0;
     595             :         int len_result = 0;
     596             :         int addlen;
     597             :         char *tmp;
     598             : 
     599             :         do {
     600      172037 :                 int j = pcre_exec(pcre_code, extra, origin_str, len_origin_str, offset,
     601             :                                           exec_options, ovector, ovecsize);
     602      172056 :                 if (j <= 0)
     603             :                         break;
     604      113803 :                 addlen = ovector[0] - offset + (nbackrefs == 0 ? len_replacement : 0);
     605      113803 :                 if (len_result + addlen >= *max_result) {
     606        9700 :                         tmp = GDKrealloc(result, len_result + addlen + 1);
     607        9700 :                         if (tmp == NULL) {
     608           0 :                                 GDKfree(result);
     609           0 :                                 return NULL;
     610             :                         }
     611             :                         result = tmp;
     612        9700 :                         *max_result = len_result + addlen + 1;
     613             :                 }
     614      113803 :                 if (ovector[0] > offset) {
     615      110329 :                         strncpy(result + len_result, origin_str + offset,
     616      110329 :                                         ovector[0] - offset);
     617      110329 :                         len_result += ovector[0] - offset;
     618             :                 }
     619      113803 :                 if (nbackrefs == 0) {
     620      111057 :                         strncpy(result + len_result, replacement, len_replacement);
     621      111057 :                         len_result += len_replacement;
     622             :                 } else {
     623             :                         int prevend = 0;
     624        5492 :                         for (int i = 0; i < nbackrefs; i++) {
     625             :                                 int off, len;
     626        2746 :                                 if (backrefs[i].idx >= ovecsize / 3) {
     627             :                                         /* out of bounds, replace with empty string */
     628             :                                         off = 0;
     629             :                                         len = 0;
     630             :                                 } else {
     631        2746 :                                         off = ovector[backrefs[i].idx * 2];
     632        2746 :                                         len = ovector[backrefs[i].idx * 2 + 1] - off;
     633             :                                 }
     634        2746 :                                 addlen = backrefs[i].start - prevend + len;
     635        2746 :                                 if (len_result + addlen >= *max_result) {
     636          25 :                                         tmp = GDKrealloc(result, len_result + addlen + 1);
     637          25 :                                         if (tmp == NULL) {
     638           0 :                                                 GDKfree(result);
     639           0 :                                                 return NULL;
     640             :                                         }
     641             :                                         result = tmp;
     642          25 :                                         *max_result = len_result + addlen + 1;
     643             :                                 }
     644        2746 :                                 if (backrefs[i].start > prevend) {
     645           0 :                                         strncpy(result + len_result, replacement + prevend,
     646           0 :                                                         backrefs[i].start - prevend);
     647           0 :                                         len_result += backrefs[i].start - prevend;
     648             :                                 }
     649        2746 :                                 if (len > 0) {
     650        2746 :                                         strncpy(result + len_result, origin_str + off, len);
     651        2746 :                                         len_result += len;
     652             :                                 }
     653        2746 :                                 prevend = backrefs[i].end;
     654             :                         }
     655             :                         /* copy rest of replacement string (after last backref) */
     656        2746 :                         addlen = len_replacement - prevend;
     657        2746 :                         if (addlen > 0) {
     658           0 :                                 if (len_result + addlen >= *max_result) {
     659           0 :                                         tmp = GDKrealloc(result, len_result + addlen + 1);
     660           0 :                                         if (tmp == NULL) {
     661           0 :                                                 GDKfree(result);
     662           0 :                                                 return NULL;
     663             :                                         }
     664             :                                         result = tmp;
     665           0 :                                         *max_result = len_result + addlen + 1;
     666             :                                 }
     667           0 :                                 strncpy(result + len_result, replacement + prevend, addlen);
     668             :                                 len_result += addlen;
     669             :                         }
     670             :                 }
     671      113803 :                 offset = ovector[1];
     672      113803 :         } while (offset < len_origin_str && global);
     673       60999 :         if (offset < len_origin_str) {
     674       57896 :                 addlen = len_origin_str - offset;
     675       57896 :                 if (len_result + addlen >= *max_result) {
     676         518 :                         tmp = GDKrealloc(result, len_result + addlen + 1);
     677         518 :                         if (tmp == NULL) {
     678           0 :                                 GDKfree(result);
     679           0 :                                 return NULL;
     680             :                         }
     681             :                         result = tmp;
     682         518 :                         *max_result = len_result + addlen + 1;
     683             :                 }
     684       57896 :                 strncpy(result + len_result, origin_str + offset, addlen);
     685             :                 len_result += addlen;
     686             :         }
     687             :         /* null terminate string */
     688       60999 :         result[len_result] = '\0';
     689       60999 :         return result;
     690             : }
     691             : #endif
     692             : 
     693             : static str
     694          37 : pcre_replace(str *res, const char *origin_str, const char *pattern,
     695             :                          const char *replacement, const char *flags, bool global)
     696             : {
     697             : #ifdef HAVE_LIBPCRE
     698          37 :         const char *err_p = NULL;
     699             :         pcre *pcre_code = NULL;
     700             :         pcre_extra *extra;
     701             :         char *tmpres;
     702             :         int max_result;
     703          37 :         int i, errpos = 0;
     704             :         int compile_options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
     705             :         int exec_options = PCRE_NOTEMPTY | PCRE_NO_UTF8_CHECK;
     706             :         int *ovector, ovecsize;
     707          37 :         int len_origin_str = (int) strlen(origin_str);
     708          37 :         int len_replacement = (int) strlen(replacement);
     709             :         struct backref backrefs[MAX_NR_REFS];
     710             :         int nbackrefs = 0;
     711             : 
     712         185 :         while (*flags) {
     713         148 :                 switch (*flags) {
     714             :                 case 'e':
     715             :                         exec_options &= ~PCRE_NOTEMPTY;
     716             :                         break;
     717          37 :                 case 'i':
     718          37 :                         compile_options |= PCRE_CASELESS;
     719          37 :                         break;
     720          37 :                 case 'm':
     721          37 :                         compile_options |= PCRE_MULTILINE;
     722          37 :                         break;
     723          37 :                 case 's':
     724          37 :                         compile_options |= PCRE_DOTALL;
     725          37 :                         break;
     726          37 :                 case 'x':
     727          37 :                         compile_options |= PCRE_EXTENDED;
     728          37 :                         break;
     729           0 :                 default:
     730           0 :                         throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     731             :                                   ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n",
     732             :                                   *flags);
     733             :                 }
     734         148 :                 flags++;
     735             :         }
     736             : 
     737          37 :         if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) {
     738           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     739             :                           OPERATION_FAILED ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n",
     740             :                           pattern, errpos, err_p);
     741             :         }
     742             : 
     743             :         /* Since the compiled pattern is going to be used several times, it is
     744             :          * worth spending more time analyzing it in order to speed up the time
     745             :          * taken for matching.
     746             :          */
     747          37 :         extra = pcre_study(pcre_code, 0, &err_p);
     748          37 :         if (err_p != NULL) {
     749           0 :                 pcre_free(pcre_code);
     750           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     751             :                           OPERATION_FAILED ": pcre study of pattern (%s) failed with '%s'.\n",
     752             :                           pattern, err_p);
     753             :         }
     754          37 :         pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i);
     755          37 :         ovecsize = (i + 1) * 3;
     756          37 :         if ((ovector = (int *) GDKmalloc(sizeof(int) * ovecsize)) == NULL) {
     757           0 :                 pcre_free_study(extra);
     758           0 :                 pcre_free(pcre_code);
     759           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     760             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     761             :         }
     762             : 
     763             :         /* identify back references in the replacement string */
     764          37 :         nbackrefs = parse_replacement(replacement, len_replacement,
     765             :                                                                   backrefs, MAX_NR_REFS);
     766             : 
     767          37 :         max_result = len_origin_str + 1;
     768          37 :         tmpres = GDKmalloc(max_result);
     769          37 :         if (tmpres == NULL) {
     770           0 :                 GDKfree(ovector);
     771           0 :                 pcre_free_study(extra);
     772           0 :                 pcre_free(pcre_code);
     773           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     774             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     775             :         }
     776             : 
     777          37 :         tmpres = single_replace(pcre_code, extra, origin_str, len_origin_str,
     778             :                                                         exec_options, ovector, ovecsize, replacement,
     779             :                                                         len_replacement, backrefs, nbackrefs, global,
     780             :                                                         tmpres, &max_result);
     781          37 :         GDKfree(ovector);
     782          37 :         pcre_free_study(extra);
     783          37 :         pcre_free(pcre_code);
     784          37 :         if (tmpres == NULL)
     785           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     786             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     787             : 
     788          37 :         *res = tmpres;
     789          37 :         return MAL_SUCCEED;
     790             : #else
     791             :         (void) res;
     792             :         (void) origin_str;
     793             :         (void) pattern;
     794             :         (void) replacement;
     795             :         (void) flags;
     796             :         (void) global;
     797             :         throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     798             :                   "Database was compiled without PCRE support.");
     799             : #endif
     800             : }
     801             : 
     802             : static str
     803          90 : pcre_replace_bat(BAT **res, BAT *origin_strs, const char *pattern,
     804             :                                  const char *replacement, const char *flags, bool global)
     805             : {
     806             : #ifdef HAVE_LIBPCRE
     807          90 :         const char *err_p = NULL;
     808             :         char *tmpres;
     809          90 :         int i, errpos = 0;
     810             :         int compile_options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
     811             :         int exec_options = PCRE_NOTEMPTY | PCRE_NO_UTF8_CHECK;
     812             :         pcre *pcre_code = NULL;
     813             :         pcre_extra *extra;
     814             :         BAT *tmpbat;
     815             :         BUN p, q;
     816             :         int *ovector, ovecsize;
     817          90 :         int len_replacement = (int) strlen(replacement);
     818             :         struct backref backrefs[MAX_NR_REFS];
     819             :         int nbackrefs = 0;
     820             :         const char *origin_str;
     821          90 :         int max_dest_size = 0;
     822             : 
     823         126 :         while (*flags) {
     824          36 :                 switch (*flags) {
     825             :                 case 'e':
     826             :                         exec_options &= ~PCRE_NOTEMPTY;
     827             :                         break;
     828           9 :                 case 'i':
     829           9 :                         compile_options |= PCRE_CASELESS;
     830           9 :                         break;
     831          18 :                 case 'm':
     832          18 :                         compile_options |= PCRE_MULTILINE;
     833          18 :                         break;
     834           9 :                 case 's':
     835           9 :                         compile_options |= PCRE_DOTALL;
     836           9 :                         break;
     837           0 :                 case 'x':
     838           0 :                         compile_options |= PCRE_EXTENDED;
     839           0 :                         break;
     840           0 :                 default:
     841           0 :                         throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
     842             :                                   ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n",
     843             :                                   *flags);
     844             :                 }
     845          36 :                 flags++;
     846             :         }
     847             : 
     848          90 :         if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) {
     849           0 :                 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
     850             :                           OPERATION_FAILED
     851             :                           ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n",
     852             :                           pattern, errpos, err_p);
     853             :         }
     854             : 
     855             :         /* Since the compiled pattern is going to be used several times,
     856             :          * it is worth spending more time analyzing it in order to speed
     857             :          * up the time taken for matching.
     858             :          */
     859          90 :         extra = pcre_study(pcre_code, BATcount(origin_strs) > JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0, &err_p);
     860          90 :         if (err_p != NULL) {
     861           0 :                 pcre_free(pcre_code);
     862           0 :                 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
     863             :                           OPERATION_FAILED);
     864             :         }
     865          90 :         pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i);
     866          90 :         ovecsize = (i + 1) * 3;
     867          90 :         if ((ovector = (int *) GDKzalloc(sizeof(int) * ovecsize)) == NULL) {
     868           0 :                 pcre_free_study(extra);
     869           0 :                 pcre_free(pcre_code);
     870           0 :                 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
     871             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     872             :         }
     873             : 
     874             :         /* identify back references in the replacement string */
     875          90 :         nbackrefs = parse_replacement(replacement, len_replacement,
     876             :                                                                   backrefs, MAX_NR_REFS);
     877             : 
     878          90 :         tmpbat = COLnew(origin_strs->hseqbase, TYPE_str, BATcount(origin_strs), TRANSIENT);
     879             : 
     880             :         /* the buffer for all destination strings is allocated only once,
     881             :          * and extended when needed */
     882          90 :         max_dest_size = len_replacement + 1;
     883          90 :         tmpres = GDKmalloc(max_dest_size);
     884          90 :         if (tmpbat == NULL || tmpres == NULL) {
     885           0 :                 pcre_free_study(extra);
     886           0 :                 pcre_free(pcre_code);
     887           0 :                 GDKfree(ovector);
     888           0 :                 BBPreclaim(tmpbat);
     889           0 :                 GDKfree(tmpres);
     890           0 :                 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
     891             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     892             :         }
     893          90 :         BATiter origin_strsi = bat_iterator(origin_strs);
     894       60962 :         BATloop(origin_strs, p, q) {
     895       60872 :                 origin_str = BUNtvar(origin_strsi, p);
     896       60872 :                 tmpres = single_replace(pcre_code, extra, origin_str,
     897       60872 :                                                                 (int) strlen(origin_str), exec_options,
     898             :                                                                 ovector, ovecsize, replacement,
     899             :                                                                 len_replacement, backrefs, nbackrefs, global,
     900             :                                                                 tmpres, &max_dest_size);
     901       60961 :                 if (tmpres == NULL || BUNappend(tmpbat, tmpres, false) != GDK_SUCCEED) {
     902           0 :                         bat_iterator_end(&origin_strsi);
     903           0 :                         pcre_free_study(extra);
     904           0 :                         pcre_free(pcre_code);
     905           0 :                         GDKfree(ovector);
     906           0 :                         GDKfree(tmpres);
     907           0 :                         BBPreclaim(tmpbat);
     908           0 :                         throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
     909             :                                   SQLSTATE(HY013) MAL_MALLOC_FAIL);
     910             :                 }
     911             :         }
     912          90 :         bat_iterator_end(&origin_strsi);
     913          90 :         pcre_free_study(extra);
     914          90 :         pcre_free(pcre_code);
     915          90 :         GDKfree(ovector);
     916          90 :         GDKfree(tmpres);
     917          90 :         *res = tmpbat;
     918          90 :         return MAL_SUCCEED;
     919             : #else
     920             :         (void) res;
     921             :         (void) origin_strs;
     922             :         (void) pattern;
     923             :         (void) replacement;
     924             :         (void) flags;
     925             :         (void) global;
     926             :         throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
     927             :                   "Database was compiled without PCRE support.");
     928             : #endif
     929             : }
     930             : 
     931             : static str
     932         266 : pcre_init(void *ret)
     933             : {
     934             :         (void) ret;
     935         266 :         return NULL;
     936             : }
     937             : 
     938             : static str
     939         130 : pcre_match_with_flags(bit *ret, const char *val, const char *pat, const char *flags)
     940             : {
     941             :         int pos;
     942             : #ifdef HAVE_LIBPCRE
     943         130 :         const char *err_p = NULL;
     944         130 :         int errpos = 0;
     945             :         int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
     946             :         pcre *re;
     947             : #else
     948             :         int options = REG_NOSUB;
     949             :         regex_t re;
     950             :         int errcode;
     951             :         int retval;
     952             : #endif
     953             : 
     954         260 :         while (*flags) {
     955         130 :                 switch (*flags) {
     956           6 :                 case 'i':
     957             : #ifdef HAVE_LIBPCRE
     958           6 :                         options |= PCRE_CASELESS;
     959             : #else
     960             :                         options |= REG_ICASE;
     961             : #endif
     962           6 :                         break;
     963           0 :                 case 'm':
     964             : #ifdef HAVE_LIBPCRE
     965           0 :                         options |= PCRE_MULTILINE;
     966             : #else
     967             :                         options |= REG_NEWLINE;
     968             : #endif
     969           0 :                         break;
     970             : #ifdef HAVE_LIBPCRE
     971         124 :                 case 's':
     972         124 :                         options |= PCRE_DOTALL;
     973         124 :                         break;
     974             : #endif
     975           0 :                 case 'x':
     976             : #ifdef HAVE_LIBPCRE
     977           0 :                         options |= PCRE_EXTENDED;
     978             : #else
     979             :                         options |= REG_EXTENDED;
     980             : #endif
     981           0 :                         break;
     982           0 :                 default:
     983           0 :                         throw(MAL, "pcre.match", ILLEGAL_ARGUMENT
     984             :                                   ": unsupported flag character '%c'\n", *flags);
     985             :                 }
     986         130 :                 flags++;
     987             :         }
     988         130 :         if (strNil(val)) {
     989           0 :                 *ret = FALSE;
     990           0 :                 return MAL_SUCCEED;
     991             :         }
     992             : 
     993             : #ifdef HAVE_LIBPCRE
     994         130 :         if ((re = pcre_compile(pat, options, &err_p, &errpos, NULL)) == NULL)
     995             : #else
     996             :                 if ((errcode = regcomp(&re, pat, options)) != 0)
     997             : #endif
     998             :                         {
     999           0 :                                 throw(MAL, "pcre.match", OPERATION_FAILED
    1000             :                                           ": compilation of regular expression (%s) failed "
    1001             : #ifdef HAVE_LIBPCRE
    1002             :                                           "at %d with '%s'", pat, errpos, err_p
    1003             : #else
    1004             :                                           , pat
    1005             : #endif
    1006             :                                         );
    1007             :                         }
    1008             : #ifdef HAVE_LIBPCRE
    1009         130 :         pos = pcre_exec(re, NULL, val, (int) strlen(val), 0, PCRE_NO_UTF8_CHECK, NULL, 0);
    1010         130 :         pcre_free(re);
    1011             : #else
    1012             :         retval = regexec(&re, val, (size_t) 0, NULL, 0);
    1013             :         pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0);
    1014             :         regfree(&re);
    1015             : #endif
    1016         130 :         if (pos >= 0)
    1017          46 :                 *ret = TRUE;
    1018          84 :         else if (pos == -1)
    1019          84 :                 *ret = FALSE;
    1020             :         else
    1021           0 :                 throw(MAL, "pcre.match", OPERATION_FAILED
    1022             :                           ": matching of regular expression (%s) failed with %d",
    1023             :                           pat, pos);
    1024             :         return MAL_SUCCEED;
    1025             : }
    1026             : 
    1027             : #ifdef HAVE_LIBPCRE
    1028             : /* special characters in PCRE that need to be escaped */
    1029             : static const char *pcre_specials = ".+?*()[]{}|^$\\";
    1030             : #else
    1031             : /* special characters in POSIX basic regular expressions that need to
    1032             :  * be escaped */
    1033             : static const char *pcre_specials = ".*[]^$\\";
    1034             : #endif
    1035             : 
    1036             : /* change SQL LIKE pattern into PCRE pattern */
    1037             : static str
    1038         304 : sql2pcre(str *r, const char *pat, const char *esc_str)
    1039             : {
    1040             :         int escaped = 0;
    1041             :         int hasWildcard = 0;
    1042             :         char *ppat;
    1043         304 :         int esc = esc_str[0] == '\200' ? 0 : esc_str[0]; /* should change to utf8_convert() */
    1044             :         int specials;
    1045             :         int c;
    1046             : 
    1047         304 :         if (strlen(esc_str) > 1)
    1048           0 :                 throw(MAL, "pcre.sql2pcre", SQLSTATE(22019) ILLEGAL_ARGUMENT ": ESCAPE string must have length 1");
    1049         304 :         if (pat == NULL)
    1050           0 :                 throw(MAL, "pcre.sql2pcre", SQLSTATE(22019) ILLEGAL_ARGUMENT ": (I)LIKE pattern must not be NULL");
    1051         304 :         ppat = GDKmalloc(strlen(pat)*3+3 /* 3 = "^'the translated regexp'$0" */);
    1052         304 :         if (ppat == NULL)
    1053           0 :                 throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1054             : 
    1055         304 :         *r = ppat;
    1056             :         /* The escape character can be a char which is special in a PCRE
    1057             :          * expression.  If the user used the "+" char as escape and has "++"
    1058             :          * in their pattern, then replacing this with "+" is not correct and
    1059             :          * should be "\+" instead. */
    1060         304 :         specials = (esc && strchr(pcre_specials, esc) != NULL);
    1061             : 
    1062         304 :         *ppat++ = '^';
    1063        2068 :         while ((c = *pat++) != 0) {
    1064        1764 :                 if (c == esc) {
    1065          15 :                         if (escaped) {
    1066           1 :                                 if (specials) { /* change ++ into \+ */
    1067           1 :                                         *ppat++ = esc;
    1068             :                                 } else { /* do not escape simple escape symbols */
    1069           0 :                                         ppat[-1] = esc; /* overwrite backslash */
    1070             :                                 }
    1071             :                                 escaped = 0;
    1072             :                         } else {
    1073          14 :                                 *ppat++ = '\\';
    1074             :                                 escaped = 1;
    1075             :                         }
    1076             :                         hasWildcard = 1;
    1077        1749 :                 } else if (strchr(pcre_specials, c) != NULL) {
    1078             :                         /* escape PCRE special chars, avoid double backslash if the
    1079             :                          * user uses an invalid escape sequence */
    1080          28 :                         if (!escaped)
    1081          28 :                                 *ppat++ = '\\';
    1082          28 :                         *ppat++ = c;
    1083             :                         hasWildcard = 1;
    1084             :                         escaped = 0;
    1085        1721 :                 } else if (c == '%' && !escaped) {
    1086         310 :                         *ppat++ = '.';
    1087         310 :                         *ppat++ = '*';
    1088         310 :                         *ppat++ = '?';
    1089             :                         hasWildcard = 1;
    1090             :                         /* collapse multiple %, but only if it isn't the escape */
    1091         310 :                         if (esc != '%')
    1092         309 :                                 while (*pat == '%')
    1093           0 :                                         pat++;
    1094        1411 :                 } else if (c == '_' && !escaped) {
    1095         354 :                         *ppat++ = '.';
    1096             :                         hasWildcard = 1;
    1097             :                 } else {
    1098        1057 :                         if (escaped) {
    1099          13 :                                 ppat[-1] = c; /* overwrite backslash of invalid escape */
    1100             :                         } else {
    1101        1044 :                                 *ppat++ = c;
    1102             :                         }
    1103             :                         escaped = 0;
    1104             :                 }
    1105             :         }
    1106             :         /* no wildcard or escape character at end of string */
    1107         304 :         if (!hasWildcard || escaped) {
    1108           2 :                 GDKfree(*r);
    1109           1 :                 *r = NULL;
    1110           1 :                 if (escaped)
    1111           0 :                         throw(MAL, "pcre.sql2pcre", SQLSTATE(22019) ILLEGAL_ARGUMENT ": (I)LIKE pattern must not end with escape character");
    1112           1 :                 *r = GDKstrdup(str_nil);
    1113           1 :                 if (*r == NULL)
    1114           0 :                         throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1115             :         } else {
    1116         302 :                 *ppat++ = '$';
    1117         302 :                 *ppat = 0;
    1118             :         }
    1119             :         return MAL_SUCCEED;
    1120             : }
    1121             : 
    1122             : #ifdef HAVE_LIBPCRE
    1123             : /* change SQL PATINDEX pattern into PCRE pattern */
    1124             : static str
    1125          24 : pat2pcre(str *r, const char *pat)
    1126             : {
    1127          24 :         size_t len = strlen(pat);
    1128          24 :         char *ppat = GDKmalloc(len*2+3 /* 3 = "^'the translated regexp'$0" */);
    1129             :         int start = 0;
    1130             : 
    1131          24 :         if (ppat == NULL)
    1132           0 :                 throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1133          24 :         *r = ppat;
    1134          72 :         while (*pat) {
    1135          48 :                 int c = *pat++;
    1136             : 
    1137          48 :                 if (strchr(pcre_specials, c) != NULL) {
    1138          17 :                         *ppat++ = '\\';
    1139          17 :                         *ppat++ = c;
    1140          31 :                 } else if (c == '%') {
    1141           1 :                         if (start && *pat) {
    1142           0 :                                 *ppat++ = '.';
    1143           0 :                                 *ppat++ = '*';
    1144             :                         }
    1145           1 :                         start++;
    1146          30 :                 } else if (c == '_') {
    1147           0 :                         *ppat++ = '.';
    1148             :                 } else {
    1149          30 :                         *ppat++ = c;
    1150             :                 }
    1151             :         }
    1152          24 :         *ppat = 0;
    1153          24 :         return MAL_SUCCEED;
    1154             : }
    1155             : #endif
    1156             : 
    1157             : /*
    1158             :  * @+ Wrapping
    1159             :  */
    1160             : 
    1161             : static str
    1162          37 : PCREreplace_wrap(str *res, const str *or, const str *pat, const str *repl, const str *flags)
    1163             : {
    1164          37 :         return pcre_replace(res, *or, *pat, *repl, *flags, true);
    1165             : }
    1166             : 
    1167             : static str
    1168          90 : PCREreplace_bat_wrap(bat *res, const bat *bid, const str *pat, const str *repl, const str *flags)
    1169             : {
    1170          90 :         BAT *b, *bn = NULL;
    1171             :         str msg;
    1172          90 :         if ((b = BATdescriptor(*bid)) == NULL)
    1173           0 :                 throw(MAL, "batpcre.replace", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1174             : 
    1175          90 :         msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, true);
    1176          90 :         if (msg == MAL_SUCCEED) {
    1177          90 :                 *res = bn->batCacheid;
    1178          90 :                 BBPkeepref(*res);
    1179             :         }
    1180          90 :         BBPunfix(b->batCacheid);
    1181          90 :         return msg;
    1182             : }
    1183             : 
    1184             : static str
    1185           0 : PCREreplacefirst_bat_wrap(bat *res, const bat *bid, const str *pat, const str *repl, const str *flags)
    1186             : {
    1187           0 :         BAT *b,*bn = NULL;
    1188             :         str msg;
    1189           0 :         if ((b = BATdescriptor(*bid)) == NULL)
    1190           0 :                 throw(MAL, "batpcre.replace_first", RUNTIME_OBJECT_MISSING);
    1191             : 
    1192           0 :         msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, false);
    1193           0 :         if (msg == MAL_SUCCEED) {
    1194           0 :                 *res = bn->batCacheid;
    1195           0 :                 BBPkeepref(*res);
    1196             :         }
    1197           0 :         BBPunfix(b->batCacheid);
    1198           0 :         return msg;
    1199             : }
    1200             : 
    1201             : static str
    1202           4 : PCREmatch(bit *ret, const str *val, const str *pat)
    1203             : {
    1204         124 :         return pcre_match_with_flags(ret, *val, *pat,
    1205             : #ifdef HAVE_LIBPCRE
    1206             :                                                                  "s"
    1207             : #else
    1208             :                                                                  "x"
    1209             : #endif
    1210             :                 );
    1211             : }
    1212             : 
    1213             : static str
    1214           0 : PCREimatch(bit *ret, const str *val, const str *pat)
    1215             : {
    1216           6 :         return pcre_match_with_flags(ret, *val, *pat, "i"
    1217             : #ifndef HAVE_LIBPCRE
    1218             :                                                                  "x"
    1219             : #endif
    1220             :                 );
    1221             : }
    1222             : 
    1223             : static str
    1224          24 : PCREindex(int *res, const pcre *pattern, const str *s)
    1225             : {
    1226             : #ifdef HAVE_LIBPCRE
    1227             :         int v[3];
    1228             : 
    1229          24 :         v[0] = v[1] = *res = 0;
    1230          24 :         if (pcre_exec(pattern, NULL, *s, (int) strlen(*s), 0, PCRE_NO_UTF8_CHECK, v, 3) >= 0) {
    1231          22 :                 *res = v[1];
    1232             :         }
    1233          24 :         return MAL_SUCCEED;
    1234             : #else
    1235             :         (void) res;
    1236             :         (void) pattern;
    1237             :         (void) s;
    1238             :         throw(MAL, "pcre.index", "Database was compiled without PCRE support.");
    1239             : #endif
    1240             : }
    1241             : 
    1242             : static str
    1243          26 : PCREpatindex(int *ret, const str *pat, const str *val)
    1244             : {
    1245             : #ifdef HAVE_LIBPCRE
    1246          26 :         pcre *re = NULL;
    1247          26 :         char *ppat = NULL, *msg;
    1248             : 
    1249          77 :         if (strNil(*pat) || strNil(*val)) {
    1250           2 :                 *ret = int_nil;
    1251           2 :                 return MAL_SUCCEED;
    1252             :         }
    1253             : 
    1254          24 :         if ((msg = pat2pcre(&ppat, *pat)) != MAL_SUCCEED)
    1255             :                 return msg;
    1256          24 :         if ((msg = pcre_compile_wrap(&re, ppat, FALSE)) != MAL_SUCCEED) {
    1257           0 :                 GDKfree(ppat);
    1258           0 :                 return msg;
    1259             :         }
    1260          24 :         GDKfree(ppat);
    1261          24 :         msg = PCREindex(ret, re, val);
    1262          24 :         pcre_free(re);
    1263          24 :         return msg;
    1264             : #else
    1265             :         (void) ret;
    1266             :         (void) pat;
    1267             :         (void) val;
    1268             :         throw(MAL, "pcre.patindex", "Database was compiled without PCRE support.");
    1269             : #endif
    1270             : }
    1271             : 
    1272             : static str
    1273           0 : PCREquote(str *ret, const str *val)
    1274             : {
    1275             :         char *p;
    1276           0 :         const char *s = *val;
    1277             : 
    1278           0 :         *ret = p = GDKmalloc(strlen(s) * 2 + 1); /* certainly long enough */
    1279           0 :         if (p == NULL)
    1280           0 :                 throw(MAL, "pcre.quote", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1281             :         /* quote all non-alphanumeric ASCII characters (i.e. leave
    1282             :            non-ASCII and alphanumeric alone) */
    1283           0 :         while (*s) {
    1284           0 :                 if (!((*s & 0x80) != 0 ||
    1285           0 :                       ('a' <= *s && *s <= 'z') ||
    1286           0 :                       ('A' <= *s && *s <= 'Z') ||
    1287           0 :                       isdigit((unsigned char) *s)))
    1288           0 :                         *p++ = '\\';
    1289           0 :                 *p++ = *s++;
    1290             :         }
    1291           0 :         *p = 0;
    1292           0 :         return MAL_SUCCEED;
    1293             : }
    1294             : 
    1295             : static str
    1296           6 : PCREsql2pcre(str *ret, const str *pat, const str *esc)
    1297             : {
    1298           6 :         return sql2pcre(ret, *pat, *esc);
    1299             : }
    1300             : 
    1301             : static inline str
    1302        3269 : choose_like_path(char **ppat, bool *use_re, bool *use_strcmp, bool *empty, const str *pat, const str *esc)
    1303             : {
    1304             :         str res = MAL_SUCCEED;
    1305        3269 :         *use_re = false;
    1306        3269 :         *use_strcmp = false;
    1307        3269 :         *empty = false;
    1308             : 
    1309        9338 :         if (strNil(*pat) || strNil(*esc)) {
    1310         469 :                 *empty = true;
    1311             :         } else {
    1312        2800 :                 if (!re_is_pattern_properly_escaped(*pat, (unsigned char) **esc))
    1313           5 :                         throw(MAL, "pcre.sql2pcre", SQLSTATE(22019) ILLEGAL_ARGUMENT ": (I)LIKE pattern must not end with escape character");
    1314        2794 :                 if (is_strcmpable(*pat, *esc)) {
    1315         876 :                         *use_re = true;
    1316         876 :                         *use_strcmp = true;
    1317        1918 :                 } else if (re_simple(*pat, (unsigned char) **esc)) {
    1318        1617 :                         *use_re = true;
    1319             :                 } else {
    1320         298 :                         if ((res = sql2pcre(ppat, *pat, *esc)) != MAL_SUCCEED)
    1321             :                                 return res;
    1322         594 :                         if (strNil(*ppat)) {
    1323           0 :                                 GDKfree(*ppat);
    1324           0 :                                 *ppat = NULL;
    1325           0 :                                 *use_re = true;
    1326           0 :                                 *use_strcmp = true;
    1327             :                         }
    1328             :                 }
    1329             :         }
    1330             :         return res;
    1331             : }
    1332             : 
    1333             : static str
    1334         414 : PCRElike_imp(bit *ret, const str *s, const str *pat, const str *esc, const bit *isens)
    1335             : {
    1336             :         str res = MAL_SUCCEED;
    1337         414 :         char *ppat = NULL;
    1338         414 :         bool use_re = false, use_strcmp = false, empty = false;
    1339             :         struct RE *re = NULL;
    1340             : 
    1341         414 :         if ((res = choose_like_path(&ppat, &use_re, &use_strcmp, &empty, pat, esc)) != MAL_SUCCEED)
    1342             :                 return res;
    1343             : 
    1344         780 :         MT_thread_setalgorithm(empty ? "pcrelike: trivially empty" : use_strcmp ? "pcrelike: pattern matching using strcmp" :
    1345         371 :                                                    use_re ? "pcrelike: pattern matching using RE" : "pcrelike: pattern matching using pcre");
    1346             : 
    1347         818 :         if (strNil(*s) || empty) {
    1348          12 :                 *ret = bit_nil;
    1349         397 :         } else if (use_re) {
    1350         271 :                 if (use_strcmp) {
    1351          26 :                         *ret = *isens ? mystrcasecmp(*s, *pat) == 0 : strcmp(*s, *pat) == 0;
    1352             :                 } else {
    1353         245 :                         if (!(re = re_create(*pat, *isens, (unsigned char) **esc)))
    1354           0 :                                 res = createException(MAL, "pcre.like4", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1355             :                         else
    1356         245 :                                 *ret = *isens ? re_match_ignore(*s, re) : re_match_no_ignore(*s, re);
    1357             :                 }
    1358             :         } else {
    1359         126 :                 res = *isens ? PCREimatch(ret, s, &ppat) : PCREmatch(ret, s, &ppat);
    1360             :         }
    1361             : 
    1362         245 :         if (re)
    1363         245 :                 re_destroy(re);
    1364         409 :         GDKfree(ppat);
    1365         409 :         return res;
    1366             : }
    1367             : 
    1368             : static str
    1369         308 : PCRElike(bit *ret, const str *s, const str *pat, const str *esc, const bit *isens)
    1370             : {
    1371         308 :         return PCRElike_imp(ret, s, pat, esc, isens);
    1372             : }
    1373             : 
    1374             : static str
    1375         106 : PCREnotlike(bit *ret, const str *s, const str *pat, const str *esc, const bit *isens)
    1376             : {
    1377             :         str tmp;
    1378             :         bit r;
    1379             : 
    1380         106 :         rethrow("str.not_like", tmp, PCRElike(&r, s, pat, esc, isens));
    1381         102 :         *ret = r==bit_nil?bit_nil:!r;
    1382         102 :         return MAL_SUCCEED;
    1383             : }
    1384             : 
    1385             : static inline str
    1386        2221 : re_like_build(struct RE **re, uint32_t **wpat, const char *pat, bool caseignore, bool use_strcmp, uint32_t esc)
    1387             : {
    1388        2221 :         if (!use_strcmp) {
    1389        1375 :                 if (!(*re = re_create(pat, caseignore, esc)))
    1390           0 :                         return createException(MAL, "pcre.re_like_build", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1391         846 :         } else if (caseignore) {
    1392         211 :                 if (!(*wpat = utf8stoucs(pat)))
    1393           0 :                         return createException(MAL, "pcre.re_like_build", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1394             :         }
    1395             :         return MAL_SUCCEED;
    1396             : }
    1397             : 
    1398             : #define proj_scanloop(TEST)     \
    1399             :         do {    \
    1400             :                 if (*s == '\200') \
    1401             :                         return bit_nil; \
    1402             :                 else \
    1403             :                         return TEST; \
    1404             :         } while (0)
    1405             : 
    1406             : static inline bit
    1407        4403 : re_like_proj_apply(str s, struct RE *re, uint32_t *wpat, const char *pat, bool caseignore, bool anti, bool use_strcmp)
    1408             : {
    1409        4403 :         if (use_strcmp) {
    1410         633 :                 if (caseignore) {
    1411         174 :                         if (anti)
    1412         128 :                                 proj_scanloop(mywstrcasecmp(s, wpat) != 0);
    1413             :                         else
    1414          46 :                                 proj_scanloop(mywstrcasecmp(s, wpat) == 0);
    1415             :                 } else {
    1416         459 :                         if (anti)
    1417         298 :                                 proj_scanloop(strcmp(s, pat) != 0);
    1418             :                         else
    1419         161 :                                 proj_scanloop(strcmp(s, pat) == 0);
    1420             :                 }
    1421             :         } else {
    1422        3770 :                 if (caseignore) {
    1423          83 :                         if (anti)
    1424          83 :                                 proj_scanloop(!re_match_ignore(s, re));
    1425             :                         else
    1426           0 :                                 proj_scanloop(re_match_ignore(s, re));
    1427             :                 } else {
    1428        3687 :                         if (anti)
    1429           0 :                                 proj_scanloop(!re_match_no_ignore(s, re));
    1430             :                         else
    1431        3687 :                                 proj_scanloop(re_match_no_ignore(s, re));
    1432             :                 }
    1433             :         }
    1434             : }
    1435             : 
    1436             : static inline void
    1437        2383 : re_like_clean(struct RE **re, uint32_t **wpat)
    1438             : {
    1439        2383 :         if (*re) {
    1440        1376 :                 re_destroy(*re);
    1441        1372 :                 *re = NULL;
    1442             :         }
    1443        2379 :         if (*wpat) {
    1444         211 :                 GDKfree(*wpat);
    1445         211 :                 *wpat = NULL;
    1446             :         }
    1447        2379 : }
    1448             : 
    1449             : static inline str
    1450         172 : pcre_like_build(
    1451             : #ifdef HAVE_LIBPCRE
    1452             :         pcre **res,
    1453             :         pcre_extra **ex
    1454             : #else
    1455             :         regex_t *res,
    1456             :         void *ex
    1457             : #endif
    1458             : , const char *ppat, bool caseignore, BUN count)
    1459             : {
    1460             : #ifdef HAVE_LIBPCRE
    1461         172 :         const char *err_p = NULL;
    1462         172 :         int errpos = 0;
    1463             :         int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK | PCRE_MULTILINE | PCRE_DOTALL;
    1464         172 :         int pcrestopt = count > JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0;
    1465             : 
    1466         172 :         *res = NULL;
    1467         172 :         *ex = NULL;
    1468             : #else
    1469             :         int options = REG_NEWLINE | REG_NOSUB | REG_EXTENDED;
    1470             :         int errcode;
    1471             : 
    1472             :         *res = (regex_t) {0};
    1473             :         (void) count;
    1474             : #endif
    1475             : 
    1476         172 :         if (caseignore) {
    1477             : #ifdef HAVE_LIBPCRE
    1478             :                 options |= PCRE_CASELESS;
    1479             : #else
    1480             :                 options |= REG_ICASE;
    1481             : #endif
    1482             :         }
    1483         172 :         if (
    1484             : #ifdef HAVE_LIBPCRE
    1485         172 :                 (*res = pcre_compile(ppat, options, &err_p, &errpos, NULL)) == NULL
    1486             : #else
    1487             :                 (errcode = regcomp(res, ppat, options)) != 0
    1488             : #endif
    1489             :                 )
    1490           0 :                 return createException(MAL, "pcre.pcre_like_build", OPERATION_FAILED
    1491             :                                                                 ": compilation of regular expression (%s) failed"
    1492             : #ifdef HAVE_LIBPCRE
    1493             :                                                                 " at %d with '%s'", ppat, errpos, err_p
    1494             : #else
    1495             :                                                                 , ppat
    1496             : #endif
    1497             :                         );
    1498             : #ifdef HAVE_LIBPCRE
    1499         172 :         *ex = pcre_study(*res, pcrestopt, &err_p);
    1500         172 :         if (err_p != NULL)
    1501           0 :                 return createException(MAL, "pcre.pcre_like_build", OPERATION_FAILED
    1502             :                                                                 ": pcre study of pattern (%s) "
    1503             :                                                                 "failed with '%s'", ppat, err_p);
    1504             : #else
    1505             :         (void) ex;
    1506             : #endif
    1507             :         return MAL_SUCCEED;
    1508             : }
    1509             : 
    1510             : #define PCRE_LIKE_BODY(LOOP_BODY, RES1, RES2) \
    1511             :         do { \
    1512             :                 LOOP_BODY  \
    1513             :                 if (*s == '\200') \
    1514             :                         *ret = bit_nil; \
    1515             :                 else if (pos >= 0) \
    1516             :                         *ret = RES1; \
    1517             :                 else if (pos == -1) \
    1518             :                         *ret = RES2; \
    1519             :                 else \
    1520             :                         return createException(MAL, "pcre.match", OPERATION_FAILED ": matching of regular expression (%s) failed with %d", ppat, pos); \
    1521             :         } while(0)
    1522             : 
    1523             : static inline str
    1524        1095 : pcre_like_apply(bit *ret, str s,
    1525             : #ifdef HAVE_LIBPCRE
    1526             :         pcre *re, pcre_extra *ex
    1527             : #else
    1528             :         regex_t re, void *ex
    1529             : #endif
    1530             : , const char *ppat, bool anti)
    1531             : {
    1532             :         int pos;
    1533             : 
    1534             : #ifdef HAVE_LIBPCRE
    1535             : #define LOOP_BODY       \
    1536             :         pos = pcre_exec(re, ex, s, (int) strlen(s), 0, PCRE_NO_UTF8_CHECK, NULL, 0);
    1537             : #else
    1538             : #define LOOP_BODY       \
    1539             :         int retval = regexec(&re, s, (size_t) 0, NULL, 0); \
    1540             :         (void) ex; \
    1541             :         pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0);
    1542             : #endif
    1543             : 
    1544        1095 :         if (anti)
    1545           6 :                 PCRE_LIKE_BODY(LOOP_BODY, FALSE, TRUE);
    1546             :         else
    1547        1089 :                 PCRE_LIKE_BODY(LOOP_BODY, TRUE, FALSE);
    1548             : 
    1549             :         return MAL_SUCCEED;
    1550             : }
    1551             : 
    1552             : static inline void
    1553         650 : pcre_clean(
    1554             : #ifdef HAVE_LIBPCRE
    1555             :         pcre **re, pcre_extra **ex) {
    1556         650 :         if (*re)
    1557         172 :                 pcre_free(*re);
    1558         650 :         if (*ex)
    1559         172 :                 pcre_free_study(*ex);
    1560         650 :         *re = NULL;
    1561         650 :         *ex = NULL;
    1562             : #else
    1563             :         regex_t *re, void *ex) {
    1564             :         regfree(re);
    1565             :         *re = (regex_t) {0};
    1566             :         (void) ex;
    1567             : #endif
    1568         650 : }
    1569             : 
    1570             : static str
    1571         448 : BATPCRElike_imp(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci, const str *esc, const bit *isens, const bit *not)
    1572             : {
    1573         448 :         str msg = MAL_SUCCEED, input = NULL, pat = NULL;
    1574             :         BAT *b = NULL, *pbn = NULL, *bn = NULL;
    1575         448 :         char *ppat = NULL;
    1576         448 :         bool use_re = false, use_strcmp = false, empty = false, isensitive = (bool) *isens, anti = (bool) *not, has_nil = false,
    1577         451 :                  input_is_a_bat = isaBatType(getArgType(mb, pci, 1)), pattern_is_a_bat = isaBatType(getArgType(mb, pci, 2));
    1578         448 :         bat *r = getArgReference_bat(stk, pci, 0);
    1579             :         BUN q = 0;
    1580             :         bit *ret = NULL;
    1581             : #ifdef HAVE_LIBPCRE
    1582         448 :         pcre *re = NULL;
    1583         448 :         pcre_extra *ex = NULL;
    1584             : #else
    1585             :         regex_t re = (regex_t) {0};
    1586             :         void *ex = NULL;
    1587             : #endif
    1588         448 :         struct RE *re_simple = NULL;
    1589         448 :         uint32_t *wpat = NULL;
    1590         448 :         BATiter bi = (BATiter) {0}, pi;
    1591             : 
    1592             :         (void) cntxt;
    1593         448 :         if (input_is_a_bat) {
    1594         445 :                 bat *bid = getArgReference_bat(stk, pci, 1);
    1595         445 :                 if (!(b = BATdescriptor(*bid))) {
    1596           0 :                         msg = createException(MAL, "batalgebra.batpcrelike3", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1597           0 :                         goto bailout;
    1598             :                 }
    1599             :         }
    1600         448 :         if (pattern_is_a_bat) {
    1601          74 :                 bat *pb = getArgReference_bat(stk, pci, 2);
    1602          74 :                 if (!(pbn = BATdescriptor(*pb))) {
    1603           0 :                         msg = createException(MAL, "batalgebra.batpcrelike3", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1604           0 :                         goto bailout;
    1605             :                 }
    1606             :         }
    1607         448 :         assert((!b || ATOMstorage(b->ttype) == TYPE_str) && (!pbn || ATOMstorage(pbn->ttype) == TYPE_str));
    1608             : 
    1609         448 :         q = BATcount(b ? b : pbn);
    1610         448 :         if (!(bn = COLnew(b ? b->hseqbase : pbn->hseqbase, TYPE_bit, q, TRANSIENT))) {
    1611           0 :                 msg = createException(MAL, "batalgebra.batpcrelike3", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1612           0 :                 goto bailout;
    1613             :         }
    1614         448 :         ret = (bit*) Tloc(bn, 0);
    1615             : 
    1616         448 :         if (pattern_is_a_bat) {
    1617          74 :                 pi = bat_iterator(pbn);
    1618          74 :                 if (b)
    1619          71 :                         bi = bat_iterator(b);
    1620             :                 else
    1621           3 :                         input = *getArgReference_str(stk, pci, 1);
    1622             : 
    1623        1156 :                 for (BUN p = 0; p < q; p++) {
    1624        1082 :                         const str next_input = b ? BUNtail(bi, p) : input, np = BUNtail(pi, p);
    1625             : 
    1626        1082 :                         if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty, &np, esc)) != MAL_SUCCEED) {
    1627           0 :                                 bat_iterator_end(&pi);
    1628           0 :                                 if (b)
    1629           0 :                                         bat_iterator_end(&bi);
    1630           0 :                                 goto bailout;
    1631             :                         }
    1632             : 
    1633        1082 :                         if (use_re) {
    1634         622 :                                 if ((msg = re_like_build(&re_simple, &wpat, np, isensitive, use_strcmp, (unsigned char) **esc)) != MAL_SUCCEED) {
    1635           0 :                                         bat_iterator_end(&pi);
    1636           0 :                                         if (b)
    1637           0 :                                                 bat_iterator_end(&bi);
    1638           0 :                                         goto bailout;
    1639             :                                 }
    1640         622 :                                 ret[p] = re_like_proj_apply(next_input, re_simple, wpat, np, isensitive, anti, use_strcmp);
    1641         622 :                                 re_like_clean(&re_simple, &wpat);
    1642         460 :                         } else if (empty) {
    1643         454 :                                 ret[p] = bit_nil;
    1644             :                         } else {
    1645           6 :                                 if ((msg = pcre_like_build(&re, &ex, ppat, isensitive, 1)) != MAL_SUCCEED) {
    1646           0 :                                         bat_iterator_end(&pi);
    1647           0 :                                         if (b)
    1648           0 :                                                 bat_iterator_end(&bi);
    1649           0 :                                         goto bailout;
    1650             :                                 }
    1651           6 :                                 if ((msg = pcre_like_apply(&(ret[p]), next_input, re, ex, ppat, anti)) != MAL_SUCCEED) {
    1652           0 :                                         bat_iterator_end(&pi);
    1653           0 :                                         if (b)
    1654           0 :                                                 bat_iterator_end(&bi);
    1655           0 :                                         goto bailout;
    1656             :                                 }
    1657           6 :                                 pcre_clean(&re, &ex);
    1658             :                         }
    1659        1082 :                         has_nil |= is_bit_nil(ret[p]);
    1660        1082 :                         GDKfree(ppat);
    1661        1082 :                         ppat = NULL;
    1662             :                 }
    1663          74 :                 bat_iterator_end(&pi);
    1664          74 :                 if (b)
    1665          71 :                         bat_iterator_end(&bi);
    1666             :         } else {
    1667         374 :                 pat = *getArgReference_str(stk, pci, 2);
    1668         374 :                 if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty, &pat, esc)) != MAL_SUCCEED)
    1669           0 :                         goto bailout;
    1670             : 
    1671         374 :                 bi = bat_iterator(b);
    1672         740 :                 MT_thread_setalgorithm(empty ? "pcrelike: trivially empty" : use_strcmp ? "pcrelike: pattern matching using strcmp" :
    1673         366 :                                                            use_re ? "pcrelike: pattern matching using RE" : "pcrelike: pattern matching using pcre");
    1674             : 
    1675         374 :                 if (use_re) {
    1676         293 :                         if ((msg = re_like_build(&re_simple, &wpat, pat, isensitive, use_strcmp, (unsigned char) **esc)) != MAL_SUCCEED) {
    1677           0 :                                 bat_iterator_end(&bi);
    1678           0 :                                 goto bailout;
    1679             :                         }
    1680        4079 :                         for (BUN p = 0; p < q; p++) {
    1681        3786 :                                 const str s = BUNtail(bi, p);
    1682        3786 :                                 ret[p] = re_like_proj_apply(s, re_simple, wpat, pat, isensitive, anti, use_strcmp);
    1683        3786 :                                 has_nil |= is_bit_nil(ret[p]);
    1684             :                         }
    1685          81 :                 } else if (empty) {
    1686           0 :                         for (BUN p = 0; p < q; p++)
    1687           0 :                                 ret[p] = bit_nil;
    1688             :                         has_nil = true;
    1689             :                 } else {
    1690          81 :                         if ((msg = pcre_like_build(&re, &ex, ppat, isensitive, q)) != MAL_SUCCEED) {
    1691           0 :                                 bat_iterator_end(&bi);
    1692           0 :                                 goto bailout;
    1693             :                         }
    1694        1170 :                         for (BUN p = 0; p < q; p++) {
    1695        1089 :                                 const str s = BUNtail(bi, p);
    1696        1089 :                                 if ((msg = pcre_like_apply(&(ret[p]), s, re, ex, ppat, anti)) != MAL_SUCCEED) {
    1697           0 :                                         bat_iterator_end(&bi);
    1698           0 :                                         goto bailout;
    1699             :                                 }
    1700        1089 :                                 has_nil |= is_bit_nil(ret[p]);
    1701             :                         }
    1702             :                 }
    1703         374 :                 bat_iterator_end(&bi);
    1704             :         }
    1705             : 
    1706         448 : bailout:
    1707         448 :         GDKfree(ppat);
    1708         448 :         re_like_clean(&re_simple, &wpat);
    1709         448 :         pcre_clean(&re, &ex);
    1710         448 :         if (bn && !msg) {
    1711         448 :                 BATsetcount(bn, q);
    1712         448 :                 bn->tnil = has_nil;
    1713         448 :                 bn->tnonil = !has_nil;
    1714         448 :                 bn->tkey = BATcount(bn) <= 1;
    1715         448 :                 bn->tsorted = BATcount(bn) <= 1;
    1716         448 :                 bn->trevsorted = BATcount(bn) <= 1;
    1717         448 :                 BBPkeepref(*r = bn->batCacheid);
    1718           0 :         } else if (bn)
    1719           0 :                 BBPreclaim(bn);
    1720         448 :         if (b)
    1721         445 :                 BBPunfix(b->batCacheid);
    1722         448 :         if (pbn)
    1723          74 :                 BBPunfix(pbn->batCacheid);
    1724         448 :         return msg;
    1725             : }
    1726             : /* MAL entry point for the bulk (BAT) LIKE operator.
                        *
                        * Fetches the escape string from instruction argument 3 and a bit flag
                        * from argument 4 (named `ci`; presumably "case insensitive" -- confirm
                        * against BATPCRElike_imp), then delegates to BATPCRElike_imp with the
                        * final flag pointing at FALSE.  BATPCREnotlike is the same call with
                        * that flag set to TRUE.
                        *
                        * Returns whatever BATPCRElike_imp returns. */
    1727             : static str
    1728         417 : BATPCRElike(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
    1729             : {
    1730         417 :         const str *esc = getArgReference_str(stk, pci, 3);
    1731         417 :         const bit *ci = getArgReference_bit(stk, pci, 4);
    1732         417 :         bit no = FALSE;
    1733             : 
    1734         417 :         return BATPCRElike_imp(cntxt, mb, stk, pci, esc, ci, &no);
    1735             : }
    1736             : /* MAL entry point for the bulk (BAT) NOT LIKE operator.
                        *
                        * Identical to BATPCRElike except that the final flag handed to
                        * BATPCRElike_imp points at TRUE instead of FALSE.  The escape string
                        * comes from instruction argument 3 and the `ci` bit from argument 4.
                        *
                        * Returns whatever BATPCRElike_imp returns. */
    1737             : static str
    1738          31 : BATPCREnotlike(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
    1739             : {
    1740          31 :         const str *esc = getArgReference_str(stk, pci, 3);
    1741          31 :         const bit *ci = getArgReference_bit(stk, pci, 4);
    1742          31 :         bit yes = TRUE;
    1743             : 
    1744          31 :         return BATPCRElike_imp(cntxt, mb, stk, pci, esc, ci, &yes);
    1745             : }
    1746             : 
    1747             : /* scan select loop with or without candidates
                        *
                        * TEST is an expression over `v`, the string value of the current row;
                        * every row for which TEST is true has its oid appended to vals[].
                        *
                        * The macro is not self-contained.  It uses these names from the
                        * expansion site: b, s, bi, off, p, q, ncands, ci, vals, cnt, anti,
                        * timeoffset, counter and the label `bailout`.
                        *
                        * - No candidate list, or a dense one (!s || BATtdense(s)):
                        *   p runs up to q, the value is read at position p - off, and p itself
                        *   is stored as the resulting oid.
                        * - Otherwise: the loop runs until p reaches ncands, taking each oid
                        *   from canditer_next(ci); the value is read at position o - off.
                        *   p is used as a plain counter here, so it has to be 0 on entry
                        *   (PCRElikeselect leaves p at 0 in this case).
                        *
                        * Each iteration first runs GDK_CHECK_TIMEOUT, which is given
                        * GOTO_LABEL_TIMEOUT_HANDLER(bailout) as its handler.  TEST is also
                        * stringified (#TEST) for the debug trace. */
    1748             : #define pcrescanloop(TEST)              \
    1749             :         do {    \
    1750             :                 TRC_DEBUG(ALGO,                 \
    1751             :                                   "PCREselect(b=%s#"BUNFMT",anti=%d): "             \
    1752             :                                   "scanselect %s\n", BATgetId(b), BATcount(b),        \
    1753             :                                   anti, #TEST);         \
    1754             :                 if (!s || BATtdense(s)) {       \
    1755             :                         for (; p < q; p++) { \
    1756             :                 GDK_CHECK_TIMEOUT(timeoffset, counter,                                          \
    1757             :                         GOTO_LABEL_TIMEOUT_HANDLER(bailout));                           \
    1758             :                                 const char *restrict v = BUNtvar(bi, p - off);  \
    1759             :                                 if (TEST)       \
    1760             :                                         vals[cnt++] = p;        \
    1761             :                         }               \
    1762             :                 } else {                \
    1763             :                         for (; p < ncands; p++) {            \
    1764             :                 GDK_CHECK_TIMEOUT(timeoffset, counter,                                          \
    1765             :                         GOTO_LABEL_TIMEOUT_HANDLER(bailout));                           \
    1766             :                                 oid o = canditer_next(ci);              \
    1767             :                                 const char *restrict v = BUNtvar(bi, o - off);  \
    1768             :                                 if (TEST)       \
    1769             :                                         vals[cnt++] = o;        \
    1770             :                         }               \
    1771             :                 }               \
    1772             :         } while (0)
    1773             : /* Match expression plugged into pcrescanloop by pcre_likeselect.
                        * Uses `re`, `ex` and the current value `v` from the expansion site.
                        * With libpcre the row matches when pcre_exec returns >= 0 (UTF-8
                        * validation is skipped via PCRE_NO_UTF8_CHECK).  Without libpcre the
                        * row matches when regexec returns anything other than REG_NOMATCH,
                        * so a regexec error code is also counted as a match. */
    1774             : #ifdef HAVE_LIBPCRE
    1775             : #define PCRE_LIKESELECT_BODY (pcre_exec(re, ex, v, (int) strlen(v), 0, PCRE_NO_UTF8_CHECK, NULL, 0) >= 0)
    1776             : #else
    1777             : #define PCRE_LIKESELECT_BODY (regexec(&re, v, (size_t) 0, NULL, 0) != REG_NOMATCH)
    1778             : #endif
    1779             : /* LIKE selection through the compiled-regex path (libpcre, or regex_t
                        * when HAVE_LIBPCRE is not defined).
                        *
                        * bn         result BAT; matching oids are written straight into its
                        *            tail array (Tloc(bn, 0)) with no capacity check here, so
                        *            the caller must have sized it (PCRElikeselect allocates
                        *            ci.ncand entries)
                        * b          BAT holding the strings to test
                        * s, ci      candidate list and its iterator; s may be NULL
                        * p, q       scan range used when s is NULL or dense (see pcrescanloop)
                        * rcnt       out: number of oids written; assigned on every path,
                        *            including failure
                        * pat        pattern handed to pcre_like_build
                        * caseignore forwarded to pcre_like_build
                        * anti       select the rows that do NOT match
                        *
                        * Rows whose value pointer is NULL or whose first byte is '\200'
                        * (presumably the string nil marker -- confirm) are never selected,
                        * in either mode.
                        *
                        * Returns MAL_SUCCEED or the message from pcre_like_build.
                        * NOTE(review): the timeout check inside pcrescanloop jumps to
                        * `bailout`; whether that path also sets `msg` is not visible here. */
    1780             : static str
    1781          79 : pcre_likeselect(BAT *bn, BAT *b, BAT *s, struct canditer *ci, BUN p, BUN q, BUN *rcnt, const char *pat, bool caseignore, bool anti)
    1782             : {
    1783             : #ifdef HAVE_LIBPCRE
    1784          79 :         pcre *re = NULL;
    1785          79 :         pcre_extra *ex = NULL;
    1786             : #else
    1787             :         regex_t re = (regex_t) {0};
    1788             :         void *ex = NULL;
    1789             : #endif
    1790          79 :         BATiter bi = bat_iterator(b);
    1791          79 :         BUN cnt = 0, ncands = ci->ncand;
    1792          79 :         oid off = b->hseqbase, *restrict vals = Tloc(bn, 0);
    1793             :         str msg = MAL_SUCCEED;
    1794             : 
                               /* state consumed by GDK_CHECK_TIMEOUT inside pcrescanloop;
                                * timeoffset stays 0 unless both a start time and a query
                                * timeout are set */
    1795             :         size_t counter = 0;
    1796             :         lng timeoffset = 0;
    1797          79 :         QryCtx *qry_ctx = MT_thread_get_qry_ctx();
    1798          79 :         if (qry_ctx != NULL) {
    1799          79 :                 timeoffset = (qry_ctx->starttime && qry_ctx->querytimeout) ? (qry_ctx->starttime + qry_ctx->querytimeout) : 0;
    1800             :         }
    1801             : 
    1802          79 :         if ((msg = pcre_like_build(&re, &ex, pat, caseignore, ci->ncand)) != MAL_SUCCEED)
    1803           0 :                 goto bailout;
    1804             : 
    1805          79 :         if (anti)
    1806           2 :                 pcrescanloop(v && *v != '\200' && !PCRE_LIKESELECT_BODY);
    1807             :         else
    1808       32553 :                 pcrescanloop(v && *v != '\200' && PCRE_LIKESELECT_BODY);
    1809             : 
                       /* shared exit: reached on success, on build failure and from the
                        * timeout handler in pcrescanloop */
    1810           1 : bailout:
    1811          79 :         bat_iterator_end(&bi);
    1812          79 :         pcre_clean(&re, &ex);
    1813          79 :         *rcnt = cnt;
    1814          79 :         return msg;
    1815             : }
    1816             : /* LIKE selection through the lightweight matcher built by re_like_build
                        * (no PCRE involved).
                        *
                        * bn, b, s, ci, p, q, rcnt, anti have the same meaning as in
                        * pcre_likeselect: matching oids are written into bn's tail array and
                        * their number is stored in *rcnt on every path.
                        *
                        * pat        the LIKE pattern
                        * caseignore selects the case-insensitive comparison functions
                        * use_strcmp compare each value against the whole pattern instead of
                        *            running the matcher: mywstrcasecmp(v, wpat) when
                        *            caseignore is set, strcmp(v, pat) otherwise
                        * esc        escape character, forwarded to re_like_build
                        *
                        * Eight expansions of pcrescanloop follow, one for each combination of
                        * use_strcmp / caseignore / anti, so the per-row test contains no
                        * mode checks.  Rows whose value pointer is NULL or whose first
                        * byte is '\200' (presumably the string nil marker -- confirm) are
                        * never selected.
                        *
                        * Returns MAL_SUCCEED or the message from re_like_build. */
    1817             : static str
    1818        1196 : re_likeselect(BAT *bn, BAT *b, BAT *s, struct canditer *ci, BUN p, BUN q, BUN *rcnt, const char *pat, bool caseignore, bool anti, bool use_strcmp, uint32_t esc)
    1819             : {
    1820        1196 :         BATiter bi = bat_iterator(b);
    1821        1195 :         BUN cnt = 0, ncands = ci->ncand;
    1822        1195 :         oid off = b->hseqbase, *restrict vals = Tloc(bn, 0);
    1823        1195 :         struct RE *re = NULL;
    1824        1195 :         uint32_t *wpat = NULL;
    1825             :         str msg = MAL_SUCCEED;
    1826             : 
                               /* state consumed by GDK_CHECK_TIMEOUT inside pcrescanloop */
    1827             :         size_t counter = 0;
    1828             :         lng timeoffset = 0;
    1829        1195 :         QryCtx *qry_ctx = MT_thread_get_qry_ctx();
    1830        1194 :         if (qry_ctx != NULL) {
    1831        1194 :                 timeoffset = (qry_ctx->starttime && qry_ctx->querytimeout) ? (qry_ctx->starttime + qry_ctx->querytimeout) : 0;
    1832             :         }
    1833        1194 :         if ((msg = re_like_build(&re, &wpat, pat, caseignore, use_strcmp, esc)) != MAL_SUCCEED)
    1834           0 :                 goto bailout;
    1835             : 
    1836        1185 :         if (use_strcmp) {
    1837         116 :                 if (caseignore) {
    1838          22 :                         if (anti)
    1839          64 :                                 pcrescanloop(v && *v != '\200' && mywstrcasecmp(v, wpat) != 0);
    1840             :                         else
    1841          53 :                                 pcrescanloop(v && *v != '\200' && mywstrcasecmp(v, wpat) == 0);
    1842             :                 } else {
    1843          94 :                         if (anti)
    1844          49 :                                 pcrescanloop(v && *v != '\200' && strcmp(v, pat) != 0);
    1845             :                         else
    1846        1089 :                                 pcrescanloop(v && *v != '\200' && strcmp(v, pat) == 0);
    1847             :                 }
    1848             :         } else {
    1849        1069 :                 if (caseignore) {
    1850          68 :                         if (anti)
    1851           0 :                                 pcrescanloop(v && *v != '\200' && !re_match_ignore(v, re));
    1852             :                         else
    1853       13996 :                                 pcrescanloop(v && *v != '\200' && re_match_ignore(v, re));
    1854             :                 } else {
    1855        1001 :                         if (anti)
    1856       16935 :                                 pcrescanloop(v && *v != '\200' && !re_match_no_ignore(v, re));
    1857             :                         else
    1858       22870 :                                 pcrescanloop(v && *v != '\200' && re_match_no_ignore(v, re));
    1859             :                 }
    1860             :         }
    1861             : 
                       /* shared exit: reached on success, on build failure and from the
                        * timeout handler in pcrescanloop */
    1862           9 : bailout:
    1863        1189 :         bat_iterator_end(&bi);
    1864        1196 :         re_like_clean(&re, &wpat);
    1865        1193 :         *rcnt = cnt;
    1866        1193 :         return msg;
    1867             : }
    1868             : /* algebra.likeselect: return the oids of the rows of a string BAT that
                        * match (or, with *anti, do not match) a LIKE pattern.
                        *
                        * ret        out: id of the result BAT (TYPE_oid); only written on
                        *            success
                        * bid        id of the string BAT to scan
                        * sid        optional candidate list; may be NULL or point at bat_nil
                        * pat, esc   LIKE pattern and escape string; choose_like_path decides
                        *            from them which matcher to use; only the first byte of
                        *            *esc is forwarded to re_likeselect
                        * caseignore case-insensitive matching when set
                        * anti       select the non-matching rows when set
                        *
                        * Three paths, as reported through MT_thread_setalgorithm:
                        *  - `empty`: an empty dense BAT is returned without scanning;
                        *  - use_re:  re_likeselect (including its strcmp shortcut);
                        *  - else:    pcre_likeselect with the translated pattern ppat.
                        *
                        * Returns MAL_SUCCEED, or an exception when a BAT descriptor cannot be
                        * obtained, an allocation fails or a helper reports an error.  The BAT
                        * references taken here and ppat are released on every path; a result
                        * BAT that was allocated before a failure is reclaimed. */
    1869             : static str
    1870        1275 : PCRElikeselect(bat *ret, const bat *bid, const bat *sid, const str *pat, const str *esc, const bit *caseignore, const bit *anti)
    1871             : {
    1872             :         BAT *b, *s = NULL, *bn = NULL;
    1873             :         str msg = MAL_SUCCEED;
    1874        1275 :         char *ppat = NULL;
    1875        1275 :         bool use_re = false, use_strcmp = false, empty = false;
    1876             : 
    1877        1275 :         if ((b = BATdescriptor(*bid)) == NULL) {
    1878           0 :                 msg = createException(MAL, "algebra.likeselect", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1879           0 :                 goto bailout;
    1880             :         }
    1881        1275 :         if (sid && !is_bat_nil(*sid) && (s = BATdescriptor(*sid)) == NULL) {
    1882           0 :                 msg = createException(MAL, "algebra.likeselect", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1883           0 :                 goto bailout;
    1884             :         }
    1885             : 
    1886        1275 :         assert(ATOMstorage(b->ttype) == TYPE_str);
    1887        1275 :         if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty, pat, esc)) != MAL_SUCCEED)
    1888           0 :                 goto bailout;
    1889             : 
    1890        2426 :         MT_thread_setalgorithm(empty ? "pcrelike: trivially empty" : use_strcmp ? "pcrelike: pattern matching using strcmp" :
    1891        1156 :                                                    use_re ? "pcrelike: pattern matching using RE" : "pcrelike: pattern matching using pcre");
    1892             : 
    1893        1272 :         if (empty) {
    1894           0 :                 if (!(bn = BATdense(0, 0, 0)))
    1895           0 :                         msg = createException(MAL, "algebra.likeselect", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1896             :         } else {
    1897        1272 :                 BUN p = 0, q = 0, rcnt = 0;
    1898             :                 struct canditer ci;
    1899             : 
    1900        1272 :                 canditer_init(&ci, b, s);
                                       /* at most one oid per candidate can be produced */
    1901        1271 :                 if (!(bn = COLnew(0, TYPE_oid, ci.ncand, TRANSIENT))) {
    1902           0 :                         msg = createException(MAL, "algebra.likeselect", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1903           0 :                         goto bailout;
    1904             :                 }
    1905             : 
                                       /* Dense or absent candidates: scan the oid range [p, q),
                                        * clipped to the rows that exist in b.  With a non-dense
                                        * candidate list p stays 0 and is used by pcrescanloop as
                                        * a counter up to ci.ncand. */
    1906        1275 :                 if (!s || BATtdense(s)) {
    1907        1264 :                         if (s) {
    1908         749 :                                 assert(BATtdense(s));
    1909             :                                 p = (BUN) s->tseqbase;
    1910         749 :                                 q = p + BATcount(s);
    1911         749 :                                 if ((oid) p < b->hseqbase)
    1912             :                                         p = b->hseqbase;
    1913         749 :                                 if ((oid) q > b->hseqbase + BATcount(b))
    1914             :                                         q = b->hseqbase + BATcount(b);
    1915             :                         } else {
    1916         515 :                                 p = b->hseqbase;
    1917         515 :                                 q = BUNlast(b) + b->hseqbase;
    1918             :                         }
    1919             :                 }
    1920             : 
    1921        1275 :                 if (use_re) {
    1922        1196 :                         msg = re_likeselect(bn, b, s, &ci, p, q, &rcnt, *pat, (bool) *caseignore, (bool) *anti, use_strcmp, (unsigned char) **esc);
    1923             :                 } else {
    1924          79 :                         msg = pcre_likeselect(bn, b, s, &ci, p, q, &rcnt, ppat, (bool) *caseignore, (bool) *anti);
    1925             :                 }
    1926        1271 :                 if (!msg) { /* set some properties */
    1927        1269 :                         BATsetcount(bn, rcnt);
    1928        1269 :                         bn->tsorted = true;
    1929        1269 :                         bn->trevsorted = bn->batCount <= 1;
    1930        1269 :                         bn->tkey = true;
                                               /* A non-nil tseqbase declares the tail dense starting
                                                * at that oid.  A single hit is dense, but it starts at
                                                * the oid that was actually stored, which need not be
                                                * the first row of b.  Only when every row of b
                                                * was selected does the sequence start at b->hseqbase. */
    1931        1269 :                         bn->tseqbase = rcnt == 0 ? 0 : rcnt == 1 ? *(const oid *) Tloc(bn, 0) : rcnt == b->batCount ? b->hseqbase : oid_nil;
    1932             :                 }
    1933             :         }
    1934             : 
    1935        1271 : bailout:
    1936        1271 :         if (b)
    1937        1272 :                 BBPunfix(b->batCacheid);
    1938        1275 :         if (s)
    1939         760 :                 BBPunfix(s->batCacheid);
    1940        1275 :         GDKfree(ppat);
    1941        1275 :         if (bn && !msg)
    1942        1275 :                 BBPkeepref(*ret = bn->batCacheid);
    1943           0 :         else if (bn)
    1944           0 :                 BBPreclaim(bn);
    1945        1274 :         return msg;
    1946             : }
    1947             : /* Helpers for the join loop that follows.
                        * APPEND(b, o): store oid o at index b->batCount of b's tail heap and
                        *   increment batCount.  No capacity check is made, and `b` is expanded
                        *   unparenthesised and more than once, so pass a plain variable.
                        * VALUE(s, x): pointer to the string for position x.  `s` is pasted
                        *   into names, so VALUE(l, ..) uses lvars, lvals and li.width, and
                        *   VALUE(r, ..) uses rvars, rvals and ri.width. */
    1948             : #define APPEND(b, o)    (((oid *) b->theap->base)[b->batCount++] = (o))
    1949             : #define VALUE(s, x)             (s##vars + VarHeapVal(s##vals, (x), s##i.width))
    1950             : /* PCRE_EXEC runs the compiled pattern (pcrere / pcreex) against the
                        * left-hand string `vl` and stores the result in `retval`; all four
                        * names must be in scope at the expansion site.
                        * PCRE_EXEC_COND is true when that result means "no match": a
                        * negative pcre_exec return, or REG_NOMATCH / REG_ENOSYS from regexec. */
    1951             : #ifdef HAVE_LIBPCRE
    1952             : #define PCRE_EXEC \
    1953             :         do { \
    1954             :                 retval = pcre_exec(pcrere, pcreex, vl, (int) strlen(vl), 0, PCRE_NO_UTF8_CHECK, NULL, 0); \
    1955             :         } while (0)
    1956             : #define PCRE_EXEC_COND (retval < 0)
    1957             : #else
    1958             : #define PCRE_EXEC \
    1959             :         do { \
    1960             :                 retval = regexec(&pcrere, vl, (size_t) 0, NULL, 0); \
    1961             :         } while (0)
    1962             : #define PCRE_EXEC_COND (retval == REG_NOMATCH || retval == REG_ENOSYS)
    1963             : #endif
    1964             : 
    1965             : /* nested loop implementation for PCRE join
                        *
                        * Outer loop: every right-hand candidate; its string value vr is used
                        * as a LIKE pattern.  choose_like_path picks the matcher for that
                        * pattern and the matcher is built once per right-hand row.
                        * Inner loop: every left-hand candidate (lci is reset for each pattern);
                        * nil left values are skipped.
                        *
                        * The three macro arguments are SKIP conditions: when the one that
                        * applies to the chosen matcher evaluates to true, the pair is not
                        * emitted.
                        *   STRCMP    used when use_re && use_strcmp
                        *   RE_MATCH  used when use_re && !use_strcmp
                        *   PCRE_COND used on the PCRE path, evaluated after PCRE_EXEC has
                        *             set `retval`
                        *
                        * For every emitted pair lo is appended to r1 and, when r2 is not NULL,
                        * ro to r2.  Both BATs are grown together when r1 is full, and the
                        * sortedness / key / tseqbase properties of r1 and r2 are weakened as
                        * soon as the appended oids contradict them.
                        *
                        * Names taken from the expansion site: nrcand, nlcand, rci, lci, rbase,
                        * lbase, ro, lo, vr, vl, nl, lastl, newcap, rskipped, r1, r2, esc,
                        * caseignore, use_re, use_strcmp, empty, pcrepat, re, wpat, pcrere,
                        * pcreex, msg, timeoffset, counter, the label `bailout`, plus whatever
                        * VALUE(l, ..) and VALUE(r, ..) expand to.
                        *
                        * NOTE(review): the `goto bailout` exits leave re/wpat, pcrere/pcreex
                        * and possibly pcrepat allocated, so the code at `bailout` (not visible
                        * here) must release them.  pcrepat is only freed inside the loop on
                        * the PCRE path. */
    1966             : #define pcre_join_loop(STRCMP, RE_MATCH, PCRE_COND) \
    1967             :         do { \
    1968             :                 for (BUN ridx = 0; ridx < nrcand; ridx++) { \
    1969             :                         GDK_CHECK_TIMEOUT(timeoffset, counter, \
    1970             :                                         GOTO_LABEL_TIMEOUT_HANDLER(bailout)); \
    1971             :                         ro = canditer_next(&rci); \
    1972             :                         vr = VALUE(r, ro - rbase); \
    1973             :                         nl = 0; \
    1974             :                         use_re = use_strcmp = empty = false; \
    1975             :                         if ((msg = choose_like_path(&pcrepat, &use_re, &use_strcmp, &empty, (const str*)&vr, (const str*)&esc))) \
    1976             :                                 goto bailout; \
    1977             :                         if (!empty) { \
    1978             :                                 if (use_re) { \
    1979             :                                         if ((msg = re_like_build(&re, &wpat, vr, caseignore, use_strcmp, (unsigned char) *esc)) != MAL_SUCCEED) \
    1980             :                                                 goto bailout; \
    1981             :                                 } else if (pcrepat) { \
    1982             :                                         if ((msg = pcre_like_build(&pcrere, &pcreex, pcrepat, caseignore, nlcand)) != MAL_SUCCEED) \
    1983             :                                                 goto bailout; \
    1984             :                                         GDKfree(pcrepat); \
    1985             :                                         pcrepat = NULL; \
    1986             :                                 } \
    1987             :                                 canditer_reset(&lci); \
    1988             :                                 for (BUN lidx = 0; lidx < nlcand; lidx++) { \
    1989             :                                         lo = canditer_next(&lci); \
    1990             :                                         vl = VALUE(l, lo - lbase); \
    1991             :                                         if (strNil(vl)) { \
    1992             :                                                 continue; \
    1993             :                                         } else if (use_re) { \
    1994             :                                                 if (use_strcmp) { \
    1995             :                                                         if (STRCMP) \
    1996             :                                                                 continue; \
    1997             :                                                 } else { \
    1998             :                                                         assert(re); \
    1999             :                                                         if (RE_MATCH) \
    2000             :                                                                 continue; \
    2001             :                                                 } \
    2002             :                                         } else { \
    2003             :                                                 int retval; \
    2004             :                                                 PCRE_EXEC;  \
    2005             :                                                 if (PCRE_COND) \
    2006             :                                                         continue; \
    2007             :                                         } \
    2008             :                                         if (BUNlast(r1) == BATcapacity(r1)) { \
    2009             :                                                 newcap = BATgrows(r1); \
    2010             :                                                 BATsetcount(r1, BATcount(r1)); \
    2011             :                                                 if (r2) \
    2012             :                                                         BATsetcount(r2, BATcount(r2)); \
    2013             :                                                 if (BATextend(r1, newcap) != GDK_SUCCEED || (r2 && BATextend(r2, newcap) != GDK_SUCCEED)) { \
    2014             :                                                         msg = createException(MAL, "pcre.join", SQLSTATE(HY013) MAL_MALLOC_FAIL); \
    2015             :                                                         goto bailout; \
    2016             :                                                 } \
    2017             :                                                 assert(!r2 || BATcapacity(r1) == BATcapacity(r2)); \
    2018             :                                         } \
    2019             :                                         if (BATcount(r1) > 0) { \
    2020             :                                                 if (lastl + 1 != lo) \
    2021             :                                                         r1->tseqbase = oid_nil; \
    2022             :                                                 if (nl == 0) { \
    2023             :                                                         if (r2) \
    2024             :                                                                 r2->trevsorted = false; \
    2025             :                                                         if (lastl > lo) { \
    2026             :                                                                 r1->tsorted = false; \
    2027             :                                                                 r1->tkey = false; \
    2028             :                                                         } else if (lastl < lo) { \
    2029             :                                                                 r1->trevsorted = false; \
    2030             :                                                         } else { \
    2031             :                                                                 r1->tkey = false; \
    2032             :                                                         } \
    2033             :                                                 } \
    2034             :                                         } \
    2035             :                                         APPEND(r1, lo); \
    2036             :                                         if (r2) \
    2037             :                                                 APPEND(r2, ro); \
    2038             :                                         lastl = lo; \
    2039             :                                         nl++; \
    2040             :                                 } \
    2041             :                                 re_like_clean(&re, &wpat); \
    2042             :                                 pcre_clean(&pcrere, &pcreex); \
    2043             :                         } \
    2044             :                         if (r2) { \
    2045             :                                 if (nl > 1) { \
    2046             :                                         r2->tkey = false; \
    2047             :                                         r2->tseqbase = oid_nil; \
    2048             :                                         r1->trevsorted = false; \
    2049             :                                 } else if (nl == 0) { \
    2050             :                                         rskipped = BATcount(r2) > 0; \
    2051             :                                 } else if (rskipped) { \
    2052             :                                         r2->tseqbase = oid_nil; \
    2053             :                                 } \
    2054             :                         } else if (nl > 1) { \
    2055             :                                 r1->trevsorted = false; \
    2056             :                         } \
    2057             :                 } \
    2058             :         } while (0)
    2059             : 
    2060             : static char *
    2061          38 : pcrejoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, const char *esc, bit caseignore, bit anti)
    2062             : {
                               /*
                                * Worker for PCREjoin: produces the join result in r1 (and in r2
                                * when r2 is not NULL) for the string BATs l and r, with optional
                                * candidate lists sl and sr.  The matching loop itself is the
                                * pcre_join_loop macro defined above this function; the code here
                                * declares the state, selects the macro arguments from anti and
                                * caseignore, and finalizes the properties of the result BATs.
                                * Returns MAL_SUCCEED, or the message held in msg when control
                                * reaches the bailout label.
                                *
                                * NOTE(review): several locals below (rskipped, lastl, nl, newcap,
                                * lvals/rvals, lvars/rvars, vl/vr, counter, timeoffset, use_re, ...)
                                * are not referenced in the visible function body; presumably they
                                * are used inside pcre_join_loop -- confirm against the macro.
                                */
    2063             :         struct canditer lci, rci;
    2064             :         const char *lvals, *rvals, *lvars, *rvars, *vl, *vr;
    2065             :         int rskipped = 0;                       /* whether we skipped values in r */
    2066             :         oid lbase, rbase, lo, ro, lastl = 0;            /* last value inserted into r1 */
    2067             :         BUN nl, newcap, nlcand, nrcand;
    2068          38 :         char *pcrepat = NULL, *msg = MAL_SUCCEED;
    2069          38 :         struct RE *re = NULL;
    2070          38 :         bool use_re = false, use_strcmp = false, empty = false;
    2071          38 :         uint32_t *wpat = NULL;
    2072             : #ifdef HAVE_LIBPCRE
    2073          38 :         pcre *pcrere = NULL;
    2074          38 :         pcre_extra *pcreex = NULL;
    2075             : #else
    2076             :         regex_t pcrere = (regex_t) {0};
    2077             :         void *pcreex = NULL;
    2078             : #endif
    2079             : 
    2080             :         size_t counter = 0;
    2081             :         lng timeoffset = 0;
    2082          38 :         QryCtx *qry_ctx = MT_thread_get_qry_ctx();
                               /* timeoffset is starttime + querytimeout when both are non-zero,
                                * and 0 otherwise (also 0 when there is no query context) */
    2083          38 :         if (qry_ctx != NULL) {
    2084          38 :                 timeoffset = (qry_ctx->starttime && qry_ctx->querytimeout) ? (qry_ctx->starttime + qry_ctx->querytimeout) : 0;
    2085             :         }
    2086             : 
    2087          38 :         TRC_DEBUG(ALGO,
    2088             :                           "pcrejoin(l=%s#" BUNFMT "[%s]%s%s,"
    2089             :                           "r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s,"
    2090             :                           "sr=%s#" BUNFMT "%s%s)\n",
    2091             :                           BATgetId(l), BATcount(l), ATOMname(l->ttype),
    2092             :                           l->tsorted ? "-sorted" : "",
    2093             :                           l->trevsorted ? "-revsorted" : "",
    2094             :                           BATgetId(r), BATcount(r), ATOMname(r->ttype),
    2095             :                           r->tsorted ? "-sorted" : "",
    2096             :                           r->trevsorted ? "-revsorted" : "",
    2097             :                           sl ? BATgetId(sl) : "NULL", sl ? BATcount(sl) : 0,
    2098             :                           sl && sl->tsorted ? "-sorted" : "",
    2099             :                           sl && sl->trevsorted ? "-revsorted" : "",
    2100             :                           sr ? BATgetId(sr) : "NULL", sr ? BATcount(sr) : 0,
    2101             :                           sr && sr->tsorted ? "-sorted" : "",
    2102             :                           sr && sr->trevsorted ? "-revsorted" : "");
    2103             : 
                               /* both inputs must be string BATs */
    2104         114 :         assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
    2105          38 :         assert(ATOMtype(l->ttype) == TYPE_str);
    2106             : 
    2107          38 :         nlcand = canditer_init(&lci, l, sl);
    2108          38 :         nrcand = canditer_init(&rci, r, sr);
    2109             : 
                               /* the iterators opened here are closed again on both the normal
                                * exit path and the bailout path */
    2110          38 :         BATiter li = bat_iterator(l);
    2111          38 :         BATiter ri = bat_iterator(r);
    2112          38 :         lbase = l->hseqbase;
    2113          38 :         rbase = r->hseqbase;
    2114          38 :         lvals = (const char *) li.base;
    2115          38 :         rvals = (const char *) ri.base;
    2116          38 :         assert(r->tvarsized && r->ttype);
    2117          38 :         lvars = li.vh->base;
    2118          38 :         rvars = ri.vh->base;
    2119             : 
                               /* start with the most optimistic properties on the results; the
                                * visible tail of pcre_join_loop clears some of them (e.g.
                                * r1->trevsorted, r2->tseqbase) while results are produced */
    2120          38 :         r1->tkey = true;
    2121          38 :         r1->tsorted = true;
    2122          38 :         r1->trevsorted = true;
    2123          38 :         if (r2) {
    2124          22 :                 r2->tkey = true;
    2125          22 :                 r2->tsorted = true;
    2126          22 :                 r2->trevsorted = true;
    2127             :         }
    2128             : 
                               /*
                                * Four expansions of the join loop.  The three macro arguments are
                                * a plain string comparison, a struct RE based match and a
                                * PCRE_EXEC_COND expression; the anti variants pass the logical
                                * negation of what the non-anti variants pass.  With caseignore the
                                * comparison is mywstrcasecmp(vl, wpat) / re_match_ignore, otherwise
                                * strcmp(vl, vr) / re_match_no_ignore.
                                */
    2129          38 :         if (anti) {
    2130          20 :                 if (caseignore) {
    2131          81 :                         pcre_join_loop(mywstrcasecmp(vl, wpat) == 0, re_match_ignore(vl, re), !PCRE_EXEC_COND);
    2132             :                 } else {
    2133         242 :                         pcre_join_loop(strcmp(vl, vr) == 0, re_match_no_ignore(vl, re), !PCRE_EXEC_COND);
    2134             :                 }
    2135             :         } else {
    2136          18 :                 if (caseignore) {
    2137           4 :                         pcre_join_loop(mywstrcasecmp(vl, wpat) != 0, !re_match_ignore(vl, re), PCRE_EXEC_COND);
    2138             :                 } else {
    2139         127 :                         pcre_join_loop(strcmp(vl, vr) != 0, !re_match_no_ignore(vl, re), PCRE_EXEC_COND);
    2140             :                 }
    2141             :         }
    2142          38 :         bat_iterator_end(&li);
    2143          38 :         bat_iterator_end(&ri);
    2144             : 
    2145          38 :         assert(!r2 || BATcount(r1) == BATcount(r2));
    2146             :         /* also set other bits of heap to correct value to indicate size */
    2147          38 :         BATsetcount(r1, BATcount(r1));
    2148          38 :         if (r2)
    2149          22 :                 BATsetcount(r2, BATcount(r2));
                               /* a non-empty result that is still dense gets the first stored oid
                                * as its seqbase; an empty result gets seqbase 0 */
    2150          38 :         if (BATcount(r1) > 0) {
    2151          26 :                 if (BATtdense(r1))
    2152           7 :                         r1->tseqbase = ((oid *) r1->theap->base)[0];
    2153          26 :                 if (r2 && BATtdense(r2))
    2154          11 :                         r2->tseqbase = ((oid *) r2->theap->base)[0];
    2155             :         } else {
    2156          12 :                 r1->tseqbase = 0;
    2157          12 :                 if (r2)
    2158           6 :                         r2->tseqbase = 0;
    2159             :         }
    2160          38 :         if (r2)
    2161          22 :                 TRC_DEBUG(ALGO,
    2162             :                                 "pcrejoin(l=%s,r=%s)=(%s#"BUNFMT"%s%s,%s#"BUNFMT"%s%s\n",
    2163             :                                 BATgetId(l), BATgetId(r),
    2164             :                                 BATgetId(r1), BATcount(r1),
    2165             :                                 r1->tsorted ? "-sorted" : "",
    2166             :                                 r1->trevsorted ? "-revsorted" : "",
    2167             :                                 BATgetId(r2), BATcount(r2),
    2168             :                                 r2->tsorted ? "-sorted" : "",
    2169             :                                 r2->trevsorted ? "-revsorted" : "");
    2170             :         else
    2171          16 :                 TRC_DEBUG(ALGO,
    2172             :                         "pcrejoin(l=%s,r=%s)=(%s#"BUNFMT"%s%s\n",
    2173             :                         BATgetId(l), BATgetId(r),
    2174             :                         BATgetId(r1), BATcount(r1),
    2175             :                         r1->tsorted ? "-sorted" : "",
    2176             :                         r1->trevsorted ? "-revsorted" : "");
    2177             :         return MAL_SUCCEED;
    2178             : 
                       /* Error exit.  No goto to this label is visible in the function body;
                        * presumably it is the target of jumps inside pcre_join_loop -- confirm.
                        * Closes both iterators, releases the pattern state and returns msg. */
    2179           0 : bailout:
    2180           0 :         bat_iterator_end(&li);
    2181           0 :         bat_iterator_end(&ri);
    2182           0 :         GDKfree(pcrepat);
    2183           0 :         re_like_clean(&re, &wpat);
    2184           0 :         pcre_clean(&pcrere, &pcreex);
    2185           0 :         assert(msg != MAL_SUCCEED);
    2186             :         return msg;
    2187             : }
    2188             : 
    2189             : static str
    2190          38 : PCREjoin(bat *r1, bat *r2, bat lid, bat rid, bat slid, bat srid, bat elid, bat ciid, bit anti)
    2191             : {
                               /*
                                * BAT-id level wrapper around pcrejoin().
                                *
                                * lid/rid identify the two input BATs, elid a BAT holding the
                                * escape string and ciid a BAT holding the case-ignore flag (each
                                * must contain exactly one value); slid/srid are optional candidate
                                * lists and are skipped when they are bat_nil.  On success *r1 (and
                                * *r2 when r2 is not NULL) receive the ids of the result BATs and
                                * MAL_SUCCEED is returned.  On failure every BAT obtained so far is
                                * released and an exception string is returned.
                                */
    2192             :         BAT *left = NULL, *right = NULL, *escape = NULL, *caseignore = NULL, *candleft = NULL, *candright = NULL;
    2193             :         BAT *result1 = NULL, *result2 = NULL;
    2194             :         char *msg = MAL_SUCCEED, *esc = "";
    2195             :         bit ci;
    2196             : 
                               /* a failed BATdescriptor() jumps to fail with msg still unset, which
                                * makes the fail path throw RUNTIME_OBJECT_MISSING */
    2197          38 :         if ((left = BATdescriptor(lid)) == NULL)
    2198           0 :                 goto fail;
    2199          38 :         if ((right = BATdescriptor(rid)) == NULL)
    2200           0 :                 goto fail;
    2201          38 :         if ((escape = BATdescriptor(elid)) == NULL)
    2202           0 :                 goto fail;
    2203          38 :         if ((caseignore = BATdescriptor(ciid)) == NULL)
    2204           0 :                 goto fail;
    2205          38 :         if (!is_bat_nil(slid) && (candleft = BATdescriptor(slid)) == NULL)
    2206           0 :                 goto fail;
    2207          38 :         if (!is_bat_nil(srid) && (candright = BATdescriptor(srid)) == NULL)
    2208           0 :                 goto fail;
                               /* result BATs of oids, created with an initial capacity equal to
                                * the number of tuples in left; result2 only when requested */
    2209          38 :         result1 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT);
    2210          38 :         if (r2)
    2211          22 :                 result2 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT);
    2212          38 :         if (!result1 || (r2 && !result2)) {
    2213           0 :                 msg = createException(MAL, "pcre.join", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    2214           0 :                 goto fail;
    2215             :         }
                               /* initial properties of the (still empty) results */
    2216          38 :         result1->tnil = false;
    2217          38 :         result1->tnonil = true;
    2218          38 :         result1->tkey = true;
    2219          38 :         result1->tsorted = true;
    2220          38 :         result1->trevsorted = true;
    2221          38 :         result1->tseqbase = 0;
    2222          38 :         if (r2) {
    2223          22 :                 result2->tnil = false;
    2224          22 :                 result2->tnonil = true;
    2225          22 :                 result2->tkey = true;
    2226          22 :                 result2->tsorted = true;
    2227          22 :                 result2->trevsorted = true;
    2228          22 :                 result2->tseqbase = 0;
    2229             :         }
    2230          38 :         if (BATcount(escape) != 1) {
    2231           0 :                 msg = createException(MAL, "pcre.join", SQLSTATE(42000) "At the moment, only one value is allowed for the escape input at pcre join");
    2232           0 :                 goto fail;
    2233             :         }
    2234             :         BATiter bi;
                               /* NOTE(review): esc keeps pointing at the value obtained through
                                * the iterator after bat_iterator_end(); the escape BAT itself stays
                                * fixed until the BBPunfix further down -- confirm this is enough to
                                * keep the string valid during pcrejoin() */
    2235          38 :         bi = bat_iterator(escape);
    2236          38 :         esc = BUNtvar(bi, 0);
    2237          38 :         bat_iterator_end(&bi);
    2238          38 :         if (BATcount(caseignore) != 1) {
    2239           0 :                 msg = createException(MAL, "pcre.join", SQLSTATE(42000) "At the moment, only one value is allowed for the case ignore input at pcre join");
    2240           0 :                 goto fail;
    2241             :         }
                               /* the flag is copied by value, so it does not depend on the iterator */
    2242          38 :         bi = bat_iterator(caseignore);
    2243          38 :         ci = *(bit*)BUNtail(bi, 0);
    2244          38 :         bat_iterator_end(&bi);
    2245          38 :         msg = pcrejoin(result1, result2, left, right, candleft, candright, esc, ci, anti);
    2246          38 :         if (msg)
    2247           0 :                 goto fail;
                               /* publish the result ids to the caller */
    2248          38 :         *r1 = result1->batCacheid;
    2249          38 :         BBPkeepref(*r1);
    2250          38 :         if (r2) {
    2251          22 :                 *r2 = result2->batCacheid;
    2252          22 :                 BBPkeepref(*r2);
    2253             :         }
                               /* release the inputs; escape and caseignore are always non-NULL
                                * here because their BATdescriptor() results were checked above */
    2254          38 :         BBPunfix(left->batCacheid);
    2255          38 :         BBPunfix(right->batCacheid);
    2256             :         if (escape)
    2257          38 :                 BBPunfix(escape->batCacheid);
    2258             :         if (caseignore)
    2259          38 :                 BBPunfix(caseignore->batCacheid);
    2260          38 :         if (candleft)
    2261           0 :                 BBPunfix(candleft->batCacheid);
    2262          38 :         if (candright)
    2263           0 :                 BBPunfix(candright->batCacheid);
    2264             :         return MAL_SUCCEED;
    2265             : 
                         /* common error exit: release whatever was acquired, including the
                          * result BATs, then return the pending message or a generic
                          * "object missing" exception when no message was set */
    2266           0 :   fail:
    2267           0 :         if (left)
    2268           0 :                 BBPunfix(left->batCacheid);
    2269           0 :         if (right)
    2270           0 :                 BBPunfix(right->batCacheid);
    2271           0 :         if (escape)
    2272           0 :                 BBPunfix(escape->batCacheid);
    2273           0 :         if (caseignore)
    2274           0 :                 BBPunfix(caseignore->batCacheid);
    2275           0 :         if (candleft)
    2276           0 :                 BBPunfix(candleft->batCacheid);
    2277           0 :         if (candright)
    2278           0 :                 BBPunfix(candright->batCacheid);
    2279           0 :         if (result1)
    2280           0 :                 BBPunfix(result1->batCacheid);
    2281           0 :         if (result2)
    2282           0 :                 BBPunfix(result2->batCacheid);
    2283           0 :         if (msg)
    2284             :                 return msg;
    2285           0 :         throw(MAL, "pcre.join", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    2286             : }
    2287             : 
    2288             : static str
    2289          22 : LIKEjoin(bat *r1, bat *r2, const bat *lid, const bat *rid, const bat *elid, const bat *cid, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate, const bit *anti)
    2290             : {
                               /*
                                * Two-output entry point registered as algebra.likejoin: unpacks
                                * the pointer arguments and forwards to PCREjoin().  nil_matches
                                * and estimate are accepted but ignored.
                                *
                                * NOTE(review): a NULL slid/srid is mapped to 0, while PCREjoin()
                                * only skips a candidate list when the id is bat_nil; 0 would be
                                * handed to BATdescriptor().  Confirm whether callers can pass NULL
                                * and whether bat_nil was intended as the fallback.
                                */
    2291             :         (void) nil_matches;
    2292             :         (void) estimate;
    2293          22 :         return PCREjoin(r1, r2, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0, *elid, *cid, *anti);
    2294             : }
    2295             : 
    2296             : static str
    2297          16 : LIKEjoin1(bat *r1, const bat *lid, const bat *rid, const bat *elid, const bat *cid, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate, const bit *anti)
    2298             : {
                               /*
                                * Single-output entry point registered as algebra.likejoin: same as
                                * LIKEjoin but passes NULL for the second result, so PCREjoin()
                                * produces only r1.  nil_matches and estimate are ignored.
                                *
                                * NOTE(review): a NULL slid/srid is mapped to 0, while PCREjoin()
                                * only skips a candidate list when the id is bat_nil -- confirm the
                                * intended fallback value.
                                */
    2299             :         (void) nil_matches;
    2300             :         (void) estimate;
    2301          16 :         return PCREjoin(r1, NULL, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0, *elid, *cid, *anti);
    2302             : }
    2303             : 
    2304             : #include "mel.h"
    2305             : mel_atom pcre_init_atoms[] = {
                        /* atom table handed to mal_module() below: a single atom named "pcre",
                         * followed by a closing entry whose .cmp is NULL */
    2306             :  { .name="pcre", },  { .cmp=NULL }
    2307             : };
    2308             : mel_func pcre_init_funcs[] = {
                        /*
                         * Function table handed to mal_module() below.  Each entry gives the
                         * MAL module name, the function name, the C implementation, a flag,
                         * a help text and the argument list; the table is closed by an entry
                         * whose .imp is NULL.
                         *
                         * NOTE(review): "pcre"."replace_first" is bound to PCREreplace_wrap,
                         * the same implementation as "pcre"."replace", whereas the batpcre
                         * variant uses a dedicated PCREreplacefirst_bat_wrap -- confirm this
                         * is intended.
                         * NOTE(review): the likejoin help texts mention "a case sensitive
                         * match" and "LIKEjoin_esc" although the signatures take a caseignore
                         * argument and no such name appears in this table -- verify the text.
                         */
    2309             :  command("pcre", "index", PCREindex, false, "match a pattern, return matched position (or 0 when not found)", args(1,3, arg("",int),arg("pat",pcre),arg("s",str))),
    2310             :  command("pcre", "match", PCREmatch, false, "Perl Compatible Regular Expression pattern matching against a string", args(1,3, arg("",bit),arg("s",str),arg("pat",str))),
    2311             :  command("pcre", "imatch", PCREimatch, false, "Caseless Perl Compatible Regular Expression pattern matching against a string", args(1,3, arg("",bit),arg("s",str),arg("pat",str))),
    2312             :  command("pcre", "patindex", PCREpatindex, false, "Location of the first POSIX pattern matching against a string", args(1,3, arg("",int),arg("pat",str),arg("s",str))),
    2313             :  command("pcre", "replace", PCREreplace_wrap, false, "Replace _all_ matches of \"pattern\" in \"origin_str\" with \"replacement\".\nParameter \"flags\" accept these flags: 'i', 'm', 's', and 'x'.\n'e': if present, an empty string is considered to be a valid match\n'i': if present, the match operates in case-insensitive mode.\nOtherwise, in case-sensitive mode.\n'm': if present, the match operates in multi-line mode.\n's': if present, the match operates in \"dot-all\"\nThe specifications of the flags can be found in \"man pcreapi\"\nThe flag letters may be repeated.\nNo other letters than 'e', 'i', 'm', 's' and 'x' are allowed in \"flags\".\nReturns the replaced string, or if no matches found, the original string.", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
    2314             :  command("pcre", "replace_first", PCREreplace_wrap, false, "Replace _the first_ match of \"pattern\" in \"origin_str\" with \"replacement\".\nParameter \"flags\" accept these flags: 'i', 'm', 's', and 'x'.\n'e': if present, an empty string is considered to be a valid match\n'i': if present, the match operates in case-insensitive mode.\nOtherwise, in case-sensitive mode.\n'm': if present, the match operates in multi-line mode.\n's': if present, the match operates in \"dot-all\"\nThe specifications of the flags can be found in \"man pcreapi\"\nThe flag letters may be repeated.\nNo other letters than 'e', 'i', 'm', 's' and 'x' are allowed in \"flags\".\nReturns the replaced string, or if no matches found, the original string.", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
    2315             :  command("pcre", "pcre_quote", PCREquote, false, "Return a PCRE pattern string that matches the argument exactly.", args(1,2, arg("",str),arg("s",str))),
    2316             :  command("pcre", "sql2pcre", PCREsql2pcre, false, "Convert a SQL like pattern with the given escape character into a PCRE pattern.", args(1,3, arg("",str),arg("pat",str),arg("esc",str))),
    2317             :  command("pcre", "prelude", pcre_init, false, "Initialize pcre", args(1,1, arg("",void))),
    2318             :  command("str", "replace", PCREreplace_wrap, false, "", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
    2319             :  command("batpcre", "replace", PCREreplace_bat_wrap, false, "", args(1,5, batarg("",str),batarg("orig",str),arg("pat",str),arg("repl",str),arg("flag",str))),
    2320             :  command("batpcre", "replace_first", PCREreplacefirst_bat_wrap, false, "", args(1,5, batarg("",str),batarg("orig",str),arg("pat",str),arg("repl",str),arg("flag",str))),
    2321             :  command("algebra", "like", PCRElike, false, "", args(1,5, arg("",bit),arg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2322             :  command("algebra", "not_like", PCREnotlike, false, "", args(1,5, arg("",bit),arg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2323             :  pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),batarg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2324             :  pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),arg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2325             :  pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),batarg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2326             :  pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),batarg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2327             :  pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),arg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2328             :  pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),batarg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2329             :  command("algebra", "likeselect", PCRElikeselect, false, "Select all head values of the first input BAT for which the\ntail value is \"like\" the given (SQL-style) pattern and for\nwhich the head value occurs in the tail of the second input\nBAT.\nInput is a dense-headed BAT, output is a dense-headed BAT with in\nthe tail the head value of the input BAT for which the\nrelationship holds.  The output BAT is sorted on the tail value.", args(1,7, batarg("",oid),batarg("b",str),batarg("s",oid),arg("pat",str),arg("esc",str),arg("caseignore",bit),arg("anti",bit))),
    2330             :  command("algebra", "likejoin", LIKEjoin, false, "Join the string bat L with the pattern bat R\nwith optional candidate lists SL and SR using pattern escape string ESC\nand doing a case sensitive match.\nThe result is two aligned bats with oids of matching rows.", args(2,11, batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("esc",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
    2331             :  command("algebra", "likejoin", LIKEjoin1, false, "The same as LIKEjoin_esc, but only produce one output", args(1,10,batarg("",oid),batarg("l",str),batarg("r",str),batarg("esc",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng), arg("anti",bit))),
    2332             :  { .imp=NULL }
    2333             : };
    2334             : #include "mal_import.h"
    2335             : #ifdef _MSC_VER
    2336             : #undef read
    2337             : #pragma section(".CRT$XCU",read)
    2338             : #endif
    2339         259 : LIB_STARTUP_FUNC(init_pcre_mal)
                       /* Registers the atom and function tables above under the module name
                        * "pcre".  Presumably LIB_STARTUP_FUNC arranges for this to run when
                        * the library is loaded -- confirm against the macro's definition. */
    2340         259 : { mal_module("pcre", pcre_init_atoms, pcre_init_funcs); }

Generated by: LCOV version 1.14