LCOV - code coverage report
Current view: top level - monetdb5/modules/mal - pcre.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 761 1061 71.7 %
Date: 2021-10-27 03:06:47 Functions: 48 52 92.3 %

          Line data    Source code
       1             : /*
       2             :  * This Source Code Form is subject to the terms of the Mozilla Public
       3             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       4             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       5             :  *
       6             :  * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V.
       7             :  */
       8             : 
       9             : /*
      10             :  * N. Nes
      11             :  * PCRE library interface
      12             :  * The PCRE library is a set of functions that implement regular
      13             :  * expression pattern matching using the same syntax and semantics as Perl,
      14             :  * with just a few differences.  The current implementation of PCRE
      15             :  * (release 4.x) corresponds approximately with Perl 5.8, including support
      16             :  * for UTF-8 encoded strings.  However, this support has to be
      17             :  * explicitly enabled; it is not the default.
      18             :  *
      19             :  * ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre
      20             :  */
      21             : #include "monetdb_config.h"
      22             : #include <string.h>
      23             : 
      24             : #include "mal.h"
      25             : #include "mal_client.h"
      26             : #include "mal_interpreter.h"
      27             : #include "mal_exception.h"
      28             : 
      29             : #include <wchar.h>
      30             : #include <wctype.h>
      31             : 
      32             : #ifdef HAVE_LIBPCRE
      33             : #include <pcre.h>
      34             : #ifndef PCRE_STUDY_JIT_COMPILE
      35             : /* old library version on e.g. EPEL 6 */
      36             : #define pcre_free_study(x)              pcre_free(x)
      37             : #define PCRE_STUDY_JIT_COMPILE  0
      38             : #endif
      39             : #define JIT_COMPILE_MIN 1024    /* when to try JIT compilation of patterns */
      40             : 
      41             : #else
      42             : 
      43             : #include <regex.h>
      44             : 
      45             : typedef regex_t pcre;
      46             : #endif
      47             : 
      48             : /* current implementation assumes simple %keyword% [keyw%]* */
      49             : struct RE {
      50             :         char *k;
      51             :         uint32_t *w;
      52             :         bool search:1,
      53             :                 atend:1;
      54             :         size_t len;
      55             :         struct RE *n;
      56             : };
      57             : 
      58             : /* We cannot use strcasecmp and strncasecmp since they work byte for
      59             :  * byte and don't deal with multibyte encodings (such as UTF-8).
      60             :  *
      61             :  * We implement our own conversion from UTF-8 encoding to Unicode code
      62             :  * points which we store in uint32_t.  The reason for this is that
      63             :  * functions like mbsrtowcs are locale-dependent (so we need a UTF-8
      64             :  * locale to use them), and on Windows, wchar_t is only 2 bytes and
      65             :  * therefore cannot hold all Unicode code points.  We do use functions
      66             :  * such as towlower to convert a Unicode code point to its lower-case
      67             :  * equivalent, but again on Windows, if the code point doesn't fit in
      68             :  * 2 bytes, we skip this conversion and compare the unconverted code
      69             :  * points.
      70             :  *
      71             :  * Note, towlower is also locale-dependent, but we don't need a UTF-8
      72             :  * locale in order to use it. */
      73             : 
      74             : /* helper function to convert one UTF-8 multibyte character to a Unicode
      75             :  * code point; returns the bytes consumed (0 at NUL), or (size_t) -1 on a malformed sequence */
      76             : static size_t
      77      400933 : utfc8touc(uint32_t *restrict dest, const char *restrict src)
      78             : {
      79      400933 :         if ((src[0] & 0x80) == 0) {
      80      400893 :                 *dest = src[0];
      81      400893 :                 return src[0] != 0;
      82          40 :         } else if ((src[0] & 0xE0) == 0xC0
      83          40 :                    && (src[1] & 0xC0) == 0x80
      84          40 :                    && (src[0] & 0x1E) != 0) {
      85          40 :                 *dest = (src[0] & 0x1F) << 6
      86          40 :                         | (src[1] & 0x3F);
      87          40 :                 return 2;
      88           0 :         } else if ((src[0] & 0xF0) == 0xE0
      89           0 :                    && (src[1] & 0xC0) == 0x80
      90           0 :                    && (src[2] & 0xC0) == 0x80
      91           0 :                    && ((src[0] & 0x0F) != 0
      92           0 :                        || (src[1] & 0x20) != 0)) {
      93           0 :                 *dest = (src[0] & 0x0F) << 12
      94           0 :                         | (src[1] & 0x3F) << 6
      95           0 :                         | (src[2] & 0x3F);
      96           0 :                 return 3;
      97           0 :         } else if ((src[0] & 0xF8) == 0xF0
      98           0 :                    && (src[1] & 0xC0) == 0x80
      99           0 :                    && (src[2] & 0xC0) == 0x80
     100           0 :                    && (src[3] & 0xC0) == 0x80) {
     101           0 :                 uint32_t c = (src[0] & 0x07) << 18
     102           0 :                         | (src[1] & 0x3F) << 12
     103           0 :                         | (src[2] & 0x3F) << 6
     104           0 :                         | (src[3] & 0x3F);
     105           0 :                 if (c < 0x10000
     106           0 :                     || c > 0x10FFFF
     107             :                     || (c & 0x1FF800) == 0x00D800)
     108             :                         return (size_t) -1;
     109           0 :                 *dest = c;
     110           0 :                 return 4;
     111             :         }
     112             :         return (size_t) -1;
     113             : }
     114             : 
     115             : /* helper function to convert a UTF-8 string to a 0-terminated array of
     116             :  * code points allocated with GDKmalloc; returns NULL on bad input or allocation failure */
     117             : static uint32_t *
     118          91 : utf8stoucs(const char *src)
     119             : {
     120             :         uint32_t *dest;
     121             :         size_t i = 0;
     122             :         size_t j = 0;
     123             : 
     124             :         /* count how many uint32_t's we need, while also checking for
     125             :          * correctness of the input */
     126         762 :         while (src[j]) {
     127         671 :                 i++;
     128         671 :                 if ((src[j+0] & 0x80) == 0) {
     129         647 :                         j += 1;
     130          24 :                 } else if ((src[j+0] & 0xE0) == 0xC0
     131          24 :                            && (src[j+1] & 0xC0) == 0x80
     132          24 :                            && (src[j+0] & 0x1E) != 0) {
     133          24 :                         j += 2;
     134           0 :                 } else if ((src[j+0] & 0xF0) == 0xE0
     135           0 :                            && (src[j+1] & 0xC0) == 0x80
     136           0 :                            && (src[j+2] & 0xC0) == 0x80
     137           0 :                            && ((src[j+0] & 0x0F) != 0
     138           0 :                                || (src[j+1] & 0x20) != 0)) {
     139           0 :                         j += 3;
     140           0 :                 } else if ((src[j+0] & 0xF8) == 0xF0
     141           0 :                            && (src[j+1] & 0xC0) == 0x80
     142           0 :                            && (src[j+2] & 0xC0) == 0x80
     143           0 :                            && (src[j+3] & 0xC0) == 0x80) {
     144           0 :                         uint32_t c = (src[j+0] & 0x07) << 18
     145           0 :                                 | (src[j+1] & 0x3F) << 12
     146           0 :                                 | (src[j+2] & 0x3F) << 6
     147           0 :                                 | (src[j+3] & 0x3F);
     148           0 :                         if (c < 0x10000
     149           0 :                             || c > 0x10FFFF
     150             :                             || (c & 0x1FF800) == 0x00D800)
     151             :                                 return NULL;
     152           0 :                         j += 4;
     153             :                 } else {
     154             :                         return NULL;
     155             :                 }
     156             :         }
     157          91 :         dest = GDKmalloc((i + 1) * sizeof(uint32_t));
     158          91 :         if (dest == NULL)
     159             :                 return NULL;
     160             :         /* go through the source string again, this time we can skip
     161             :          * the correctness tests */
     162             :         i = j = 0;
     163         762 :         while (src[j]) {
     164         671 :                 if ((src[j+0] & 0x80) == 0) {
     165         647 :                         dest[i++] = src[j+0];
     166         647 :                         j += 1;
     167          24 :                 } else if ((src[j+0] & 0xE0) == 0xC0) {
     168          24 :                         dest[i++] = (src[j+0] & 0x1F) << 6
     169          24 :                                 | (src[j+1] & 0x3F);
     170          24 :                         j += 2;
     171           0 :                 } else if ((src[j+0] & 0xF0) == 0xE0) {
     172           0 :                         dest[i++] = (src[j+0] & 0x0F) << 12
     173           0 :                                 | (src[j+1] & 0x3F) << 6
     174           0 :                                 | (src[j+2] & 0x3F);
     175           0 :                         j += 3;
     176           0 :                 } else if ((src[j+0] & 0xF8) == 0xF0) {
     177           0 :                         dest[i++] = (src[j+0] & 0x07) << 18
     178           0 :                                 | (src[j+1] & 0x3F) << 12
     179           0 :                                 | (src[j+2] & 0x3F) << 6
     180           0 :                                 | (src[j+3] & 0x3F);
     181           0 :                         j += 4;
     182             :                 }
     183             :         }
     184          91 :         dest[i] = 0;
     185          91 :         return dest;
     186             : }
     187             : 
     188             : static size_t
     189             : myucslen(const uint32_t *ucs)
     190             : {
     191             :         size_t i = 0;
     192             : 
     193      171887 :         while (ucs[i])
     194      161074 :                 i++;
     195             :         return i;
     196             : }
     197             : 
     198             : static inline bool
     199          26 : mywstrncaseeq(const char *restrict s1, const uint32_t *restrict s2, size_t n2, bool atend)
     200             : {
     201             :         uint32_t c1;
     202             : 
     203          46 :         while (n2 > 0) {
     204          26 :                 size_t nn1 = utfc8touc(&c1, s1);
     205          26 :                 if (nn1 == 0 || nn1 == (size_t) -1)
     206           0 :                         return (*s2 == 0);
     207          26 :                 if (*s2 == 0)
     208             :                         return false;
     209          26 :                 if (nn1 == (size_t) -1 || nn1 == (size_t) -2)
     210             :                         return true;     /* actually an error that shouldn't happen */
     211             : #if SIZEOF_WCHAR_T == 2
     212             :                 if (c1 > 0xFFFF || *s2 > 0xFFFF) {
     213             :                         if (c1 != *s2)
     214             :                                 return false;
     215             :                 } else
     216             : #endif
     217          26 :                 if (towlower((wint_t) c1) != towlower((wint_t) *s2))
     218             :                         return false;
     219          20 :                 s1 += nn1;
     220          20 :                 n2--;
     221          20 :                 s2++;
     222             :         }
     223          20 :         return !atend || *s1 == 0;
     224             : }
     225             : 
     226             : static inline int
     227           0 : mystrcasecmp(const char *s1, const char *s2)
     228             : {
     229             :         uint32_t c1, c2;
     230             : 
     231           0 :         for (;;) {
     232           0 :                 size_t nn1 = utfc8touc(&c1, s1);
     233           0 :                 size_t nn2 = utfc8touc(&c2, s2);
     234           0 :                 if (nn1 == 0 || nn1 == (size_t) -1)
     235           0 :                         return -(nn2 != 0 && nn2 != (size_t) -1);
     236           0 :                 if (nn2 == 0 || nn2 == (size_t) -1)
     237             :                         return 1;
     238           0 :                 if (nn1 == (size_t) -1 || nn1 == (size_t) -2 ||
     239           0 :                         nn2 == (size_t) -1 || nn2 == (size_t) -2)
     240             :                         return 0;        /* actually an error that shouldn't happen */
     241             : #if SIZEOF_WCHAR_T == 2
     242             :                 if (c1 > 0xFFFF || c2 > 0xFFFF) {
     243             :                         if (c1 != c2)
     244             :                                 return c1 - c2;
     245             :                 } else
     246             : #endif
     247           0 :                 if (towlower((wint_t) c1) != towlower((wint_t) c2))
     248           0 :                         return towlower((wint_t) c1) - towlower((wint_t) c2);
     249           0 :                 s1 += nn1;
     250           0 :                 s2 += nn2;
     251             :         }
     252             : }
     253             : 
     254             : static inline int
     255          25 : mywstrcasecmp(const char *restrict s1, const uint32_t *restrict s2)
     256             : {
     257             :         uint32_t c1;
     258             : 
     259          22 :         for (;;) {
     260          47 :                 size_t nn1 = utfc8touc(&c1, s1);
     261          47 :                 if (nn1 == 0 || nn1 == (size_t) -1)
     262          11 :                         return -(*s2 != 0);
     263          36 :                 if (*s2 == 0)
     264             :                         return 1;
     265          36 :                 if (nn1 == (size_t) -1 || nn1 == (size_t) -2)
     266             :                         return 0;        /* actually an error that shouldn't happen */
     267             : #if SIZEOF_WCHAR_T == 2
     268             :                 if (c1 > 0xFFFF || *s2 > 0xFFFF) {
     269             :                         if (c1 != *s2)
     270             :                                 return c1 - *s2;
     271             :                 } else
     272             : #endif
     273          36 :                 if (towlower((wint_t) c1) != towlower((wint_t) *s2))
     274          14 :                         return towlower((wint_t) c1) - towlower((wint_t) *s2);
     275          22 :                 s1 += nn1;
     276          22 :                 s2++;
     277             :         }
     278             : }
     279             : 
     280             : static inline const char *
     281       10813 : mywstrcasestr(const char *restrict haystack, const uint32_t *restrict wneedle, bool atend)
     282             : {
     283             :         size_t nlen = myucslen(wneedle);
     284             : 
     285       10813 :         if (nlen == 0)
     286           0 :                 return atend ? haystack + strlen(haystack) : haystack;
     287             : 
     288             :         size_t hlen = strlen(haystack);
     289             : 
     290      351240 :         while (*haystack) {
     291             :                 size_t i;
     292             :                 size_t h;
     293             :                 size_t step = 0;
     294      414520 :                 for (i = h = 0; i < nlen; i++) {
     295             :                         uint32_t c;
     296      412270 :                         size_t j = utfc8touc(&c, haystack + h);
     297      412309 :                         if (j == 0 || j == (size_t) -1)
     298           0 :                                 return NULL;
     299      412309 :                         if (i == 0) {
     300             :                                 step = j;
     301             :                         }
     302             : #if SIZEOF_WCHAR_T == 2
     303             :                         if (c > 0xFFFF || wneedle[i] > 0xFFFF) {
     304             :                                 if (c != wneedle[i])
     305             :                                         break;
     306             :                         } else
     307             : #endif
     308      412309 :                         if (towlower((wint_t) c) != towlower((wint_t) wneedle[i]))
     309             :                                 break;
     310       71817 :                         h += j;
     311             :                 }
     312      342742 :                 if (i == nlen && (!atend || haystack[h] == 0))
     313        2315 :                         return haystack;
     314      340427 :                 haystack += step;
     315             :                 hlen -= step;
     316             :         }
     317             :         return NULL;
     318             : }
     319             : 
     320             : /* returns true if the pattern does not contain unescaped `_' (single
     321             :  * character match); a leading `%' is skipped and a NULL pattern gives
     322             :  * false.  Whether the pattern ends with `%' is not checked here. */
     323             : static inline bool
     324        1862 : re_simple(const char *pat, unsigned char esc)
     325             : {
     326             :         bool escaped = false;
     327             : 
     328        1862 :         if (pat == 0)
     329             :                 return false;
     330        1862 :         if (*pat == '%') {
     331        1222 :                 pat++;
     332             :         }
     333       14104 :         while (*pat) {
     334       12527 :                 if (escaped) {
     335             :                         escaped = false;
     336       12403 :                 } else if ((unsigned char) *pat == esc) {
     337             :                         escaped = true;
     338       12280 :                 } else if (*pat == '_') {
     339             :                         return false;
     340             :                 }
     341       12242 :                 pat++;
     342             :         }
     343             :         return true;
     344             : }
     345             : 
     346             : static inline bool
     347        2071 : re_is_pattern_properly_escaped(const char *pat, unsigned char esc)
     348             : {
     349             :         bool escaped = false;
     350             : 
     351        2071 :         if (pat == 0)
     352             :                 return true;
     353       17601 :         while (*pat) {
     354       15530 :                 if (escaped) {
     355             :                         escaped = false;
     356       15398 :                 } else if ((unsigned char) *pat == esc) {
     357             :                         escaped = true;
     358             :                 }
     359       15530 :                 pat++;
     360             :         }
     361        2071 :         return escaped ? false : true;
     362             : }
     363             : 
     364             : static inline bool
     365        2065 : is_strcmpable(const char *pat, const char *esc)
     366             : {
     367        2065 :         if (pat[strcspn(pat, "%_")])
     368             :                 return false;
     369         231 :         return strlen(esc) == 0 || strNil(esc) || strstr(pat, esc) == NULL;
     370             : }
     371             : 
     372             : static inline bool
     373       10890 : re_match_ignore(const char *restrict s, const struct RE *restrict pattern)
     374             : {
     375             :         const struct RE *r;
     376             : 
     377       13228 :         for (r = pattern; r; r = r->n) {
     378       10914 :                 if (*r->w == 0 && (r->search || *s == 0))
     379             :                         return true;
     380       21786 :                 if (!*s ||
     381             :                         (r->search
     382       10835 :                          ? (s = mywstrcasestr(s, r->w, r->atend)) == NULL
     383          26 :                          : !mywstrncaseeq(s, r->w, r->len, r->atend)))
     384        8613 :                         return false;
     385        2338 :                 s += r->len;
     386             :         }
     387             :         return true;
     388             : }
     389             : 
     390             : static inline bool
     391       37557 : re_match_no_ignore(const char *restrict s, const struct RE *restrict pattern)
     392             : {
     393             :         const struct RE *r;
     394             :         size_t l;
     395             : 
     396       41114 :         for (r = pattern; r; r = r->n) {
     397       38987 :                 if (*r->k == 0 && (r->search || *s == 0))
     398             :                         return true;
     399       39446 :                 if (!*s ||
     400             :                         (r->search
     401       38879 :                          ? (r->atend
     402       25497 :                                 ? (l = strlen(s)) < r->len || strcmp(s + l - r->len, r->k) != 0
     403       25005 :                                 : (s = strstr(s, r->k)) == NULL)
     404             :                          : (r->atend
     405       13382 :                                 ? strcmp(s, r->k) != 0
     406       13288 :                                 : strncmp(s, r->k, r->len) != 0)))
     407             :                         return false;
     408        3557 :                 s += r->len;
     409             :         }
     410             :         return true;
     411             : }
     412             : 
     413             : static void
     414        1575 : re_destroy(struct RE *p)
     415             : {
     416        1575 :         if (p) {
     417        1575 :                 GDKfree(p->k);
     418        1574 :                 GDKfree(p->w);
     419             :                 do {
     420        1661 :                         struct RE *n = p->n;
     421             : 
     422        1661 :                         GDKfree(p);
     423             :                         p = n;
     424        1665 :                 } while (p);
     425             :         }
     426        1575 : }
     427             : 
     428             : /* Create a linked list of RE structures.  Depending on the caseignore
     429             :  * flag, the w (if true) or the k (if false) field is used.  These
     430             :  * fields in the first structure are allocated, whereas in all
     431             :  * subsequent structures the fields point into the allocated buffer of
     432             :  * the first. */
     433             : static struct RE *
     434        1577 : re_create(const char *pat, bool caseignore, uint32_t esc)
     435             : {
     436        1577 :         struct RE *r = GDKmalloc(sizeof(struct RE)), *n = r;
     437             :         bool escaped = false;
     438             : 
     439        1575 :         if (r == NULL)
     440             :                 return NULL;
     441        1575 :         *r = (struct RE) {.atend = true};
     442             : 
     443        2641 :         while (esc != '%' && *pat == '%') {
     444        1066 :                 pat++; /* skip % */
     445        1066 :                 r->search = true;
     446             :         }
     447        1575 :         if (caseignore) {
     448             :                 uint32_t *wp;
     449             :                 uint32_t *wq;
     450          78 :                 wp = utf8stoucs(pat);
     451          78 :                 if (wp == NULL) {
     452           0 :                         GDKfree(r);
     453           0 :                         return NULL;
     454             :                 }
     455          78 :                 r->w = wp;
     456             :                 wq = wp;
     457         717 :                 while (*wp) {
     458         639 :                         if (escaped) {
     459           1 :                                 *wq++ = *wp;
     460           1 :                                 n->len++;
     461             :                                 escaped = false;
     462         638 :                         } else if (*wp == esc) {
     463             :                                 escaped = true;
     464         637 :                         } else if (*wp == '%') {
     465          82 :                                 n->atend = false;
     466          82 :                                 while (wp[1] == '%')
     467           0 :                                         wp++;
     468          82 :                                 if (wp[1]) {
     469          16 :                                         n = n->n = GDKmalloc(sizeof(struct RE));
     470          16 :                                         if (n == NULL)
     471           0 :                                                 goto bailout;
     472          16 :                                         *n = (struct RE) {.search = true, .atend = true, .w = wp + 1};
     473             :                                 }
     474          82 :                                 *wq++ = 0;
     475             :                         } else {
     476         555 :                                 *wq++ = *wp;
     477         555 :                                 n->len++;
     478             :                         }
     479         639 :                         wp++;
     480             :                 }
     481          78 :                 *wq = 0;
     482             :         } else {
     483             :                 char *p, *q;
     484        1497 :                 if ((p = GDKstrdup(pat)) == NULL) {
     485           0 :                         GDKfree(r);
     486           0 :                         return NULL;
     487             :                 }
     488        1499 :                 r->k = p;
     489             :                 q = p;
     490       12745 :                 while (*p) {
     491       11246 :                         if (escaped) {
     492         115 :                                 *q++ = *p;
     493         115 :                                 n->len++;
     494             :                                 escaped = false;
     495       11131 :                         } else if ((unsigned char) *p == esc) {
     496             :                                 escaped = true;
     497       11016 :                         } else if (*p == '%') {
     498        1348 :                                 n->atend = false;
     499        1348 :                                 while (p[1] == '%')
     500           0 :                                         p++;
     501        1348 :                                 if (p[1]) {
     502          74 :                                         n = n->n = GDKmalloc(sizeof(struct RE));
     503          74 :                                         if (n == NULL)
     504           0 :                                                 goto bailout;
     505          74 :                                         *n = (struct RE) {.search = true, .atend = true, .k = p + 1};
     506             :                                 }
     507        1348 :                                 *q++ = 0;
     508             :                         } else {
     509        9668 :                                 *q++ = *p;
     510        9668 :                                 n->len++;
     511             :                         }
     512       11246 :                         p++;
     513             :                 }
     514        1499 :                 *q = 0;
     515             :         }
     516             :         return r;
     517           0 :   bailout:
     518           0 :         re_destroy(r);
     519           0 :         return NULL;
     520             : }
     521             : 
     522             : #ifdef HAVE_LIBPCRE
     523             : static str
     524          22 : pcre_compile_wrap(pcre **res, const char *pattern, bit insensitive)
     525             : {
     526             :         pcre *r;
     527          22 :         const char *err_p = NULL;
     528          22 :         int errpos = 0;
     529             :         int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK | PCRE_MULTILINE;
     530          22 :         if (insensitive)
     531             :                 options |= PCRE_CASELESS;
     532             : 
     533          22 :         if ((r = pcre_compile(pattern, options, &err_p, &errpos, NULL)) == NULL) {
     534           0 :                 throw(MAL, "pcre.compile", OPERATION_FAILED
     535             :                           " with\n'%s'\nat %d in\n'%s'.\n",
     536             :                           err_p, errpos, pattern);
     537             :         }
     538          22 :         *res = r;
     539          22 :         return MAL_SUCCEED;
     540             : }
     541             : #endif
     542             : 
     543             : /* maximum number of back references and quoted \ or $ in replacement string */
     544             : #define MAX_NR_REFS             20
     545             : 
     546             : struct backref {
     547             :         int idx;
     548             :         int start;
     549             :         int end;
     550             : };
     551             : 
     552             : #ifdef HAVE_LIBPCRE
     553             : /* fill in parameter backrefs (length maxrefs) with information about
     554             :  * back references in the replacement string; a back reference is a
     555             :  * dollar or backslash followed by a number */
     556             : static int
     557         107 : parse_replacement(const char *replacement, int len_replacement,
     558             :                                   struct backref *backrefs, int maxrefs)
     559             : {
     560             :         int nbackrefs = 0;
     561             : 
     562         728 :         for (int i = 0; i < len_replacement && nbackrefs < maxrefs; i++) {
     563         621 :                 if (replacement[i] == '$' || replacement[i] == '\\') {
     564             :                         char *endptr;
     565           7 :                         backrefs[nbackrefs].idx = strtol(replacement + i + 1, &endptr, 10);
     566           7 :                         if (endptr > replacement + i + 1) {
     567           7 :                                 int k = (int) (endptr - (replacement + i + 1));
     568           7 :                                 backrefs[nbackrefs].start = i;
     569           7 :                                 backrefs[nbackrefs].end = i + k + 1;
     570           7 :                                 nbackrefs++;
     571           0 :                         } else if (replacement[i] == replacement[i + 1]) {
     572             :                                 /* doubled $ or \, we must copy just one to the output */
     573           0 :                                 backrefs[nbackrefs].idx = INT_MAX; /* impossible value > 0 */
     574           0 :                                 backrefs[nbackrefs].start = i;
     575           0 :                                 backrefs[nbackrefs].end = i + 1;
     576             :                                 i++;                    /* don't look at second $ or \ again */
     577           0 :                                 nbackrefs++;
     578             :                         }
     579             :                         /* else: $ or \ followed by something we don't recognize,
     580             :                          * so just leave it */
     581             :                 }
     582             :         }
     583         107 :         return nbackrefs;
     584             : }
     585             : 
     586             : static char *
     587       47622 : single_replace(pcre *pcre_code, pcre_extra *extra,
     588             :                            const char *origin_str, int len_origin_str,
     589             :                            int exec_options, int *ovector, int ovecsize,
     590             :                            const char *replacement, int len_replacement,
     591             :                            struct backref *backrefs, int nbackrefs,
     592             :                            bool global, char *result, int *max_result)
     593             : {
     594             :         int offset = 0;
     595             :         int len_result = 0;
     596             :         int addlen;
     597             :         char *tmp;
     598             : 
     599             :         do {
     600      134611 :                 int j = pcre_exec(pcre_code, extra, origin_str, len_origin_str, offset,
     601             :                                           exec_options, ovector, ovecsize);
     602      134617 :                 if (j <= 0)
     603             :                         break;
     604       89268 :                 addlen = ovector[0] - offset + (nbackrefs == 0 ? len_replacement : 0);
     605       89268 :                 if (len_result + addlen >= *max_result) {
     606        7545 :                         tmp = GDKrealloc(result, len_result + addlen + 1);
     607        7545 :                         if (tmp == NULL) {
     608           0 :                                 GDKfree(result);
     609           0 :                                 return NULL;
     610             :                         }
     611             :                         result = tmp;
     612        7545 :                         *max_result = len_result + addlen + 1;
     613             :                 }
     614       89268 :                 if (ovector[0] > offset) {
     615       86463 :                         strncpy(result + len_result, origin_str + offset,
     616       86463 :                                         ovector[0] - offset);
     617       86463 :                         len_result += ovector[0] - offset;
     618             :                 }
     619       89268 :                 if (nbackrefs == 0) {
     620       86989 :                         strncpy(result + len_result, replacement, len_replacement);
     621       86989 :                         len_result += len_replacement;
     622             :                 } else {
     623             :                         int prevend = 0;
     624        4558 :                         for (int i = 0; i < nbackrefs; i++) {
     625             :                                 int off, len;
     626        2279 :                                 if (backrefs[i].idx >= ovecsize / 3) {
     627             :                                         /* out of bounds, replace with empty string */
     628             :                                         off = 0;
     629             :                                         len = 0;
     630             :                                 } else {
     631        2279 :                                         off = ovector[backrefs[i].idx * 2];
     632        2279 :                                         len = ovector[backrefs[i].idx * 2 + 1] - off;
     633             :                                 }
     634        2279 :                                 addlen = backrefs[i].start - prevend + len;
     635        2279 :                                 if (len_result + addlen >= *max_result) {
     636          19 :                                         tmp = GDKrealloc(result, len_result + addlen + 1);
     637          19 :                                         if (tmp == NULL) {
     638           0 :                                                 GDKfree(result);
     639           0 :                                                 return NULL;
     640             :                                         }
     641             :                                         result = tmp;
     642          19 :                                         *max_result = len_result + addlen + 1;
     643             :                                 }
     644        2279 :                                 if (backrefs[i].start > prevend) {
     645           0 :                                         strncpy(result + len_result, replacement + prevend,
     646           0 :                                                         backrefs[i].start - prevend);
     647           0 :                                         len_result += backrefs[i].start - prevend;
     648             :                                 }
     649        2279 :                                 if (len > 0) {
     650        2279 :                                         strncpy(result + len_result, origin_str + off, len);
     651        2279 :                                         len_result += len;
     652             :                                 }
     653        2279 :                                 prevend = backrefs[i].end;
     654             :                         }
     655             :                         /* copy rest of replacement string (after last backref) */
     656        2279 :                         addlen = len_replacement - prevend;
     657        2279 :                         if (addlen > 0) {
     658           0 :                                 if (len_result + addlen >= *max_result) {
     659           0 :                                         tmp = GDKrealloc(result, len_result + addlen + 1);
     660           0 :                                         if (tmp == NULL) {
     661           0 :                                                 GDKfree(result);
     662           0 :                                                 return NULL;
     663             :                                         }
     664             :                                         result = tmp;
     665           0 :                                         *max_result = len_result + addlen + 1;
     666             :                                 }
     667           0 :                                 strncpy(result + len_result, replacement + prevend, addlen);
     668             :                                 len_result += addlen;
     669             :                         }
     670             :                 }
     671       89268 :                 offset = ovector[1];
     672       89268 :         } while (offset < len_origin_str && global);
     673       47628 :         if (offset < len_origin_str) {
     674       45071 :                 addlen = len_origin_str - offset;
     675       45071 :                 if (len_result + addlen >= *max_result) {
     676         400 :                         tmp = GDKrealloc(result, len_result + addlen + 1);
     677         400 :                         if (tmp == NULL) {
     678           0 :                                 GDKfree(result);
     679           0 :                                 return NULL;
     680             :                         }
     681             :                         result = tmp;
     682         400 :                         *max_result = len_result + addlen + 1;
     683             :                 }
     684       45071 :                 strncpy(result + len_result, origin_str + offset, addlen);
     685             :                 len_result += addlen;
     686             :         }
     687             :         /* null terminate string */
     688       47628 :         result[len_result] = '\0';
     689       47628 :         return result;
     690             : }
     691             : #endif
     692             : 
     693             : static str
     694          37 : pcre_replace(str *res, const char *origin_str, const char *pattern,
     695             :                          const char *replacement, const char *flags, bool global)
     696             : {
     697             : #ifdef HAVE_LIBPCRE
     698          37 :         const char *err_p = NULL;
     699             :         pcre *pcre_code = NULL;
     700             :         pcre_extra *extra;
     701             :         char *tmpres;
     702             :         int max_result;
     703          37 :         int i, errpos = 0;
     704             :         int compile_options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
     705             :         int exec_options = PCRE_NOTEMPTY | PCRE_NO_UTF8_CHECK;
     706             :         int *ovector, ovecsize;
     707          37 :         int len_origin_str = (int) strlen(origin_str);
     708          37 :         int len_replacement = (int) strlen(replacement);
     709             :         struct backref backrefs[MAX_NR_REFS];
     710             :         int nbackrefs = 0;
     711             : 
     712         185 :         while (*flags) {
     713         148 :                 switch (*flags) {
     714             :                 case 'e':
     715             :                         exec_options &= ~PCRE_NOTEMPTY;
     716             :                         break;
     717          37 :                 case 'i':
     718          37 :                         compile_options |= PCRE_CASELESS;
     719          37 :                         break;
     720          37 :                 case 'm':
     721          37 :                         compile_options |= PCRE_MULTILINE;
     722          37 :                         break;
     723          37 :                 case 's':
     724          37 :                         compile_options |= PCRE_DOTALL;
     725          37 :                         break;
     726          37 :                 case 'x':
     727          37 :                         compile_options |= PCRE_EXTENDED;
     728          37 :                         break;
     729           0 :                 default:
     730           0 :                         throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     731             :                                   ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n",
     732             :                                   *flags);
     733             :                 }
     734         148 :                 flags++;
     735             :         }
     736             : 
     737          37 :         if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) {
     738           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     739             :                           OPERATION_FAILED ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n",
     740             :                           pattern, errpos, err_p);
     741             :         }
     742             : 
     743             :         /* Since the compiled pattern is going to be used several times, it is
     744             :          * worth spending more time analyzing it in order to speed up the time
     745             :          * taken for matching.
     746             :          */
     747          37 :         extra = pcre_study(pcre_code, 0, &err_p);
     748          37 :         if (err_p != NULL) {
     749           0 :                 pcre_free(pcre_code);
     750           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     751             :                           OPERATION_FAILED ": pcre study of pattern (%s) failed with '%s'.\n",
     752             :                           pattern, err_p);
     753             :         }
     754          37 :         pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i);
     755          37 :         ovecsize = (i + 1) * 3;
     756          37 :         if ((ovector = (int *) GDKmalloc(sizeof(int) * ovecsize)) == NULL) {
     757           0 :                 pcre_free_study(extra);
     758           0 :                 pcre_free(pcre_code);
     759           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     760             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     761             :         }
     762             : 
     763             :         /* identify back references in the replacement string */
     764          37 :         nbackrefs = parse_replacement(replacement, len_replacement,
     765             :                                                                   backrefs, MAX_NR_REFS);
     766             : 
     767          37 :         max_result = len_origin_str + 1;
     768          37 :         tmpres = GDKmalloc(max_result);
     769          37 :         if (tmpres == NULL) {
     770           0 :                 GDKfree(ovector);
     771           0 :                 pcre_free_study(extra);
     772           0 :                 pcre_free(pcre_code);
     773           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     774             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     775             :         }
     776             : 
     777          37 :         tmpres = single_replace(pcre_code, extra, origin_str, len_origin_str,
     778             :                                                         exec_options, ovector, ovecsize, replacement,
     779             :                                                         len_replacement, backrefs, nbackrefs, global,
     780             :                                                         tmpres, &max_result);
     781          37 :         GDKfree(ovector);
     782          37 :         pcre_free_study(extra);
     783          37 :         pcre_free(pcre_code);
     784          37 :         if (tmpres == NULL)
     785           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     786             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     787             : 
     788          37 :         *res = tmpres;
     789          37 :         return MAL_SUCCEED;
     790             : #else
     791             :         (void) res;
     792             :         (void) origin_str;
     793             :         (void) pattern;
     794             :         (void) replacement;
     795             :         (void) flags;
     796             :         (void) global;
     797             :         throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     798             :                   "Database was compiled without PCRE support.");
     799             : #endif
     800             : }
     801             : 
     802             : static str
     803          70 : pcre_replace_bat(BAT **res, BAT *origin_strs, const char *pattern,
     804             :                                  const char *replacement, const char *flags, bool global)
     805             : {
     806             : #ifdef HAVE_LIBPCRE
     807          70 :         const char *err_p = NULL;
     808             :         char *tmpres;
     809          70 :         int i, errpos = 0;
     810             :         int compile_options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
     811             :         int exec_options = PCRE_NOTEMPTY | PCRE_NO_UTF8_CHECK;
     812             :         pcre *pcre_code = NULL;
     813             :         pcre_extra *extra;
     814             :         BAT *tmpbat;
     815             :         BUN p, q;
     816             :         int *ovector, ovecsize;
     817          70 :         int len_replacement = (int) strlen(replacement);
     818             :         struct backref backrefs[MAX_NR_REFS];
     819             :         int nbackrefs = 0;
     820             :         const char *origin_str;
     821          70 :         int max_dest_size = 0;
     822             : 
     823          98 :         while (*flags) {
     824          28 :                 switch (*flags) {
     825             :                 case 'e':
     826             :                         exec_options &= ~PCRE_NOTEMPTY;
     827             :                         break;
     828           7 :                 case 'i':
     829           7 :                         compile_options |= PCRE_CASELESS;
     830           7 :                         break;
     831          14 :                 case 'm':
     832          14 :                         compile_options |= PCRE_MULTILINE;
     833          14 :                         break;
     834           7 :                 case 's':
     835           7 :                         compile_options |= PCRE_DOTALL;
     836           7 :                         break;
     837           0 :                 case 'x':
     838           0 :                         compile_options |= PCRE_EXTENDED;
     839           0 :                         break;
     840           0 :                 default:
     841           0 :                         throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
     842             :                                   ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n",
     843             :                                   *flags);
     844             :                 }
     845          28 :                 flags++;
     846             :         }
     847             : 
     848          70 :         if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) {
     849           0 :                 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
     850             :                           OPERATION_FAILED
     851             :                           ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n",
     852             :                           pattern, errpos, err_p);
     853             :         }
     854             : 
     855             :         /* Since the compiled pattern is going to be used several times,
     856             :          * it is worth spending more time analyzing it in order to speed
     857             :          * up the time taken for matching.
     858             :          */
     859          70 :         extra = pcre_study(pcre_code, BATcount(origin_strs) > JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0, &err_p);
     860          70 :         if (err_p != NULL) {
     861           0 :                 pcre_free(pcre_code);
     862           0 :                 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
     863             :                           OPERATION_FAILED);
     864             :         }
     865          70 :         pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i);
     866          70 :         ovecsize = (i + 1) * 3;
     867          70 :         if ((ovector = (int *) GDKzalloc(sizeof(int) * ovecsize)) == NULL) {
     868           0 :                 pcre_free_study(extra);
     869           0 :                 pcre_free(pcre_code);
     870           0 :                 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
     871             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     872             :         }
     873             : 
     874             :         /* identify back references in the replacement string */
     875          70 :         nbackrefs = parse_replacement(replacement, len_replacement,
     876             :                                                                   backrefs, MAX_NR_REFS);
     877             : 
     878          70 :         tmpbat = COLnew(origin_strs->hseqbase, TYPE_str, BATcount(origin_strs), TRANSIENT);
     879             : 
     880             :         /* the buffer for all destination strings is allocated only once,
     881             :          * and extended when needed */
     882          70 :         max_dest_size = len_replacement + 1;
     883          70 :         tmpres = GDKmalloc(max_dest_size);
     884          70 :         if (tmpbat == NULL || tmpres == NULL) {
     885           0 :                 pcre_free_study(extra);
     886           0 :                 pcre_free(pcre_code);
     887           0 :                 GDKfree(ovector);
     888           0 :                 BBPreclaim(tmpbat);
     889           0 :                 GDKfree(tmpres);
     890           0 :                 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
     891             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     892             :         }
     893          70 :         BATiter origin_strsi = bat_iterator(origin_strs);
     894       47644 :         BATloop(origin_strs, p, q) {
     895       47574 :                 origin_str = BUNtvar(origin_strsi, p);
     896       47574 :                 tmpres = single_replace(pcre_code, extra, origin_str,
     897       47574 :                                                                 (int) strlen(origin_str), exec_options,
     898             :                                                                 ovector, ovecsize, replacement,
     899             :                                                                 len_replacement, backrefs, nbackrefs, global,
     900             :                                                                 tmpres, &max_dest_size);
     901       47586 :                 if (tmpres == NULL || BUNappend(tmpbat, tmpres, false) != GDK_SUCCEED) {
     902           0 :                         bat_iterator_end(&origin_strsi);
     903           0 :                         pcre_free_study(extra);
     904           0 :                         pcre_free(pcre_code);
     905           0 :                         GDKfree(ovector);
     906           0 :                         GDKfree(tmpres);
     907           0 :                         BBPreclaim(tmpbat);
     908           0 :                         throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
     909             :                                   SQLSTATE(HY013) MAL_MALLOC_FAIL);
     910             :                 }
     911             :         }
     912          70 :         bat_iterator_end(&origin_strsi);
     913          70 :         pcre_free_study(extra);
     914          70 :         pcre_free(pcre_code);
     915          70 :         GDKfree(ovector);
     916          70 :         GDKfree(tmpres);
     917          70 :         *res = tmpbat;
     918          70 :         return MAL_SUCCEED;
     919             : #else
     920             :         (void) res;
     921             :         (void) origin_strs;
     922             :         (void) pattern;
     923             :         (void) replacement;
     924             :         (void) flags;
     925             :         (void) global;
     926             :         throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
     927             :                   "Database was compiled without PCRE support.");
     928             : #endif
     929             : }
     930             : 
     931             : static str
     932         264 : pcre_init(void *ret)
     933             : {
     934             :         (void) ret;
     935         264 :         return NULL;
     936             : }
     937             : 
     938             : static str
     939         130 : pcre_match_with_flags(bit *ret, const char *val, const char *pat, const char *flags)
     940             : {
     941             :         int pos;
     942             : #ifdef HAVE_LIBPCRE
     943         130 :         const char *err_p = NULL;
     944         130 :         int errpos = 0;
     945             :         int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
     946             :         pcre *re;
     947             : #else
     948             :         int options = REG_NOSUB;
     949             :         regex_t re;
     950             :         int errcode;
     951             :         int retval;
     952             : #endif
     953             : 
     954         260 :         while (*flags) {
     955         130 :                 switch (*flags) {
     956           6 :                 case 'i':
     957             : #ifdef HAVE_LIBPCRE
     958           6 :                         options |= PCRE_CASELESS;
     959             : #else
     960             :                         options |= REG_ICASE;
     961             : #endif
     962           6 :                         break;
     963           0 :                 case 'm':
     964             : #ifdef HAVE_LIBPCRE
     965           0 :                         options |= PCRE_MULTILINE;
     966             : #else
     967             :                         options |= REG_NEWLINE;
     968             : #endif
     969           0 :                         break;
     970             : #ifdef HAVE_LIBPCRE
     971         124 :                 case 's':
     972         124 :                         options |= PCRE_DOTALL;
     973         124 :                         break;
     974             : #endif
     975           0 :                 case 'x':
     976             : #ifdef HAVE_LIBPCRE
     977           0 :                         options |= PCRE_EXTENDED;
     978             : #else
     979             :                         options |= REG_EXTENDED;
     980             : #endif
     981           0 :                         break;
     982           0 :                 default:
     983           0 :                         throw(MAL, "pcre.match", ILLEGAL_ARGUMENT
     984             :                                   ": unsupported flag character '%c'\n", *flags);
     985             :                 }
     986         130 :                 flags++;
     987             :         }
     988         130 :         if (strNil(val)) {
     989           0 :                 *ret = FALSE;
     990           0 :                 return MAL_SUCCEED;
     991             :         }
     992             : 
     993             : #ifdef HAVE_LIBPCRE
     994         130 :         if ((re = pcre_compile(pat, options, &err_p, &errpos, NULL)) == NULL)
     995             : #else
     996             :                 if ((errcode = regcomp(&re, pat, options)) != 0)
     997             : #endif
     998             :                         {
     999           0 :                                 throw(MAL, "pcre.match", OPERATION_FAILED
    1000             :                                           ": compilation of regular expression (%s) failed "
    1001             : #ifdef HAVE_LIBPCRE
    1002             :                                           "at %d with '%s'", pat, errpos, err_p
    1003             : #else
    1004             :                                           , pat
    1005             : #endif
    1006             :                                         );
    1007             :                         }
    1008             : #ifdef HAVE_LIBPCRE
    1009         130 :         pos = pcre_exec(re, NULL, val, (int) strlen(val), 0, PCRE_NO_UTF8_CHECK, NULL, 0);
    1010         130 :         pcre_free(re);
    1011             : #else
    1012             :         retval = regexec(&re, val, (size_t) 0, NULL, 0);
    1013             :         pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0);
    1014             :         regfree(&re);
    1015             : #endif
    1016         130 :         if (pos >= 0)
    1017          46 :                 *ret = TRUE;
    1018          84 :         else if (pos == -1)
    1019          84 :                 *ret = FALSE;
    1020             :         else
    1021           0 :                 throw(MAL, "pcre.match", OPERATION_FAILED
    1022             :                           ": matching of regular expression (%s) failed with %d",
    1023             :                           pat, pos);
    1024             :         return MAL_SUCCEED;
    1025             : }
    1026             : 
    1027             : #ifdef HAVE_LIBPCRE
    1028             : /* special characters in PCRE that need to be escaped */
    1029             : static const char *pcre_specials = ".+?*()[]{}|^$\\";
    1030             : #else
    1031             : /* special characters in POSIX basic regular expressions that need to
    1032             :  * be escaped */
    1033             : static const char *pcre_specials = ".*[]^$\\";
    1034             : #endif
    1035             : 
    1036             : /* change SQL LIKE pattern into PCRE pattern
                     :  *
                     :  * The translation is anchored with '^' and '$'.  An unescaped '%'
                     :  * becomes ".*?" (a run of '%' is collapsed unless '%' is itself the
                     :  * escape character), an unescaped '_' becomes ".", and characters
                     :  * listed in pcre_specials are emitted with a backslash in front.
                     :  * The output buffer is sized at three bytes per input byte (the '%'
                     :  * case) plus three for '^', '$' and the terminator.
                     :  *
                     :  * r:       receives an allocated string: the translated pattern, or
                     :  *          a copy of str_nil when the pattern contained no wildcard,
                     :  *          escape or special character; it is set to NULL when the
                     :  *          pattern ends in a dangling escape character
                     :  * pat:     the LIKE pattern; NULL is rejected with an exception
                     :  * esc_str: the escape character as a string of at most one byte; a
                     :  *          first byte of '\200' is treated as "no escape character"
                     :  *          (presumably the nil string -- confirm)
                     :  *
                     :  * Returns MAL_SUCCEED, or an exception for an escape string longer
                     :  * than one byte, a NULL pattern, a trailing escape character or a
                     :  * failed allocation. */
    1037             : static str
    1038         291 : sql2pcre(str *r, const char *pat, const char *esc_str)
    1039             : {
    1040             :         int escaped = 0; /* previous character was a not yet consumed escape */
    1041             :         int hasWildcard = 0; /* also set by escape and special characters */
    1042             :         char *ppat;
    1043         291 :         int esc = esc_str[0] == '\200' ? 0 : esc_str[0]; /* should change to utf8_convert() */
    1044             :         int specials;
    1045             :         int c;
    1046             : 
    1047         291 :         if (strlen(esc_str) > 1)
    1048           0 :                 throw(MAL, "pcre.sql2pcre", SQLSTATE(22019) ILLEGAL_ARGUMENT ": ESCAPE string must have length 1");
    1049         291 :         if (pat == NULL)
    1050           0 :                 throw(MAL, "pcre.sql2pcre", SQLSTATE(22019) ILLEGAL_ARGUMENT ": (I)LIKE pattern must not be NULL");
    1051         291 :         ppat = GDKmalloc(strlen(pat)*3+3 /* 3 = "^'the translated regexp'$0" */);
    1052         291 :         if (ppat == NULL)
    1053           0 :                 throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1054             : 
    1055         291 :         *r = ppat;
    1056             :         /* The escape character can be a char which is special in a PCRE
    1057             :          * expression.  If the user used the "+" char as escape and has "++"
    1058             :          * in their pattern, then replacing this with "+" is not correct and
    1059             :          * should be "\+" instead. */
    1060         291 :         specials = (esc && strchr(pcre_specials, esc) != NULL);
    1061             : 
    1062         291 :         *ppat++ = '^';
    1063        1993 :         while ((c = *pat++) != 0) {
    1064        1702 :                 if (c == esc) {
    1065          15 :                         if (escaped) {
    1066           1 :                                 if (specials) { /* change ++ into \+ */
    1067           1 :                                         *ppat++ = esc;
    1068             :                                 } else { /* do not escape simple escape symbols */
    1069           0 :                                         ppat[-1] = esc; /* overwrite backslash */
    1070             :                                 }
    1071             :                                 escaped = 0;
    1072             :                         } else {
    1073          14 :                                 *ppat++ = '\\'; /* tentative; later branches may overwrite it */
    1074             :                                 escaped = 1;
    1075             :                         }
    1076             :                         hasWildcard = 1;
    1077        1687 :                 } else if (strchr(pcre_specials, c) != NULL) {
    1078             :                         /* escape PCRE special chars, avoid double backslash if the
    1079             :                          * user uses an invalid escape sequence */
    1080          28 :                         if (!escaped)
    1081          28 :                                 *ppat++ = '\\';
    1082          28 :                         *ppat++ = c;
    1083             :                         hasWildcard = 1;
    1084             :                         escaped = 0;
    1085        1659 :                 } else if (c == '%' && !escaped) {
    1086         311 :                         *ppat++ = '.';
    1087         311 :                         *ppat++ = '*';
    1088         311 :                         *ppat++ = '?';
    1089             :                         hasWildcard = 1;
    1090             :                         /* collapse multiple %, but only if it isn't the escape */
    1091         311 :                         if (esc != '%')
    1092         311 :                                 while (*pat == '%')
    1093           0 :                                         pat++;
    1094        1348 :                 } else if (c == '_' && !escaped) {
    1095         342 :                         *ppat++ = '.';
    1096             :                         hasWildcard = 1;
    1097             :                 } else {
    1098        1006 :                         if (escaped) {
    1099          13 :                                 ppat[-1] = c; /* overwrite backslash of invalid escape */
    1100             :                         } else {
    1101         993 :                                 *ppat++ = c;
    1102             :                         }
    1103             :                         escaped = 0;
    1104             :                 }
    1105             :         }
    1106             :         /* no wildcard or escape character at end of string: the buffer
                     :          * is released in both cases; a trailing escape is an error,
                     :          * otherwise a copy of str_nil is handed back (choose_like_path()
                     :          * tests for it with strNil() and switches to plain comparison) */
    1107         291 :         if (!hasWildcard || escaped) {
    1108           1 :                 GDKfree(*r);
    1109           1 :                 *r = NULL;
    1110           1 :                 if (escaped)
    1111           0 :                         throw(MAL, "pcre.sql2pcre", SQLSTATE(22019) ILLEGAL_ARGUMENT ": (I)LIKE pattern must not end with escape character");
    1112           1 :                 *r = GDKstrdup(str_nil);
    1113           1 :                 if (*r == NULL)
    1114           0 :                         throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1115             :         } else {
    1116         290 :                 *ppat++ = '$';
    1117         290 :                 *ppat = 0;
    1118             :         }
    1119             :         return MAL_SUCCEED;
    1120             : }
    1121             : 
    1122             : #ifdef HAVE_LIBPCRE
    1123             : /* change SQL PATINDEX pattern into PCRE pattern
                     :  *
                     :  * Characters from pcre_specials are backslash-escaped and '_' becomes
                     :  * ".".  The first '%' in the pattern and a '%' that is the last
                     :  * character emit nothing; any other '%' becomes ".*".  The result is
                     :  * not anchored with '^' or '$'.
                     :  *
                     :  * r:   receives the GDKmalloc'ed translation
                     :  * pat: the PATINDEX pattern; it is handed to strlen() unchecked, so
                     :  *      it must not be NULL
                     :  *
                     :  * Returns MAL_SUCCEED, or an allocation-failure exception reported
                     :  * under this function's own name. */
    1124             : static str
    1125          22 : pat2pcre(str *r, const char *pat)
    1126             : {
    1127          22 :         size_t len = strlen(pat);
    1128          22 :         char *ppat = GDKmalloc(len*2+3 /* at most 2 output bytes per input byte; +3 covers the terminator */);
    1129             :         int start = 0; /* number of '%' seen so far */
    1130             : 
    1131          22 :         if (ppat == NULL)
    1132           0 :                 throw(MAL, "pcre.pat2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1133          22 :         *r = ppat;
    1134          44 :         while (*pat) {
    1135          22 :                 int c = *pat++;
    1136             : 
    1137          22 :                 if (strchr(pcre_specials, c) != NULL) {
    1138          14 :                         *ppat++ = '\\';
    1139          14 :                         *ppat++ = c;
    1140           8 :                 } else if (c == '%') {
    1141           0 :                         if (start && *pat) { /* neither the first '%' nor the final character */
    1142           0 :                                 *ppat++ = '.';
    1143           0 :                                 *ppat++ = '*';
    1144             :                         }
    1145           0 :                         start++;
    1146           8 :                 } else if (c == '_') {
    1147           0 :                         *ppat++ = '.';
    1148             :                 } else {
    1149           8 :                         *ppat++ = c;
    1150             :                 }
    1151             :         }
    1152          22 :         *ppat = 0;
    1153          22 :         return MAL_SUCCEED;
    1154             : }
    1155             : #endif
    1156             : 
    1157             : /*
    1158             :  * @+ Wrapping
    1159             :  */
    1160             : /* Scalar replace entry point: dereferences its pointer arguments and
                     :  * hands them to pcre_replace() with the final argument set to true;
                     :  * the result of that call is returned unchanged.
                     :  * NOTE(review): PCREreplacefirst_bat_wrap() passes false in the
                     :  * corresponding position of pcre_replace_bat(), so true presumably
                     :  * means "replace every match" -- confirm against pcre_replace(). */
    1161             : static str
    1162          37 : PCREreplace_wrap(str *res, const str *or, const str *pat, const str *repl, const str *flags)
    1163             : {
    1164          37 :         return pcre_replace(res, *or, *pat, *repl, *flags, true);
    1165             : }
    1166             : /* BAT replace entry point: obtains the input BAT for *bid with
                     :  * BATdescriptor(), runs pcre_replace_bat() over it with the final
                     :  * argument set to true and, on success, stores the result BAT's
                     :  * cache id in *res and calls BBPkeepref() on it.  BBPunfix() is
                     :  * called on the input BAT whether or not the replace succeeded.
                     :  *
                     :  * Returns the message from pcre_replace_bat(), or an exception when
                     :  * BATdescriptor() returns NULL (in which case *res is not written). */
    1167             : static str
    1168          70 : PCREreplace_bat_wrap(bat *res, const bat *bid, const str *pat, const str *repl, const str *flags)
    1169             : {
    1170          70 :         BAT *b, *bn = NULL;
    1171             :         str msg;
    1172          70 :         if ((b = BATdescriptor(*bid)) == NULL)
    1173           0 :                 throw(MAL, "batpcre.replace", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1174             : 
    1175          70 :         msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, true);
    1176          70 :         if (msg == MAL_SUCCEED) {
    1177          70 :                 *res = bn->batCacheid;
    1178          70 :                 BBPkeepref(*res);
    1179             :         }
    1180          70 :         BBPunfix(b->batCacheid);
    1181          70 :         return msg;
    1182             : }
    1183             : /* BAT replace-first entry point: obtains the input BAT for *bid with
                     :  * BATdescriptor(), runs pcre_replace_bat() over it with the final
                     :  * argument set to false and, on success, stores the result BAT's
                     :  * cache id in *res and calls BBPkeepref() on it.  BBPunfix() is
                     :  * called on the input BAT whether or not the replace succeeded.
                     :  *
                     :  * Returns the message from pcre_replace_bat(), or an exception
                     :  * carrying SQLSTATE HY002 when BATdescriptor() returns NULL (in
                     :  * which case *res is not written). */
    1184             : static str
    1185           0 : PCREreplacefirst_bat_wrap(bat *res, const bat *bid, const str *pat, const str *repl, const str *flags)
    1186             : {
    1187           0 :         BAT *b,*bn = NULL;
    1188             :         str msg;
    1189           0 :         if ((b = BATdescriptor(*bid)) == NULL)
    1190           0 :                 throw(MAL, "batpcre.replace_first", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1191             : 
    1192           0 :         msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, false);
    1193           0 :         if (msg == MAL_SUCCEED) {
    1194           0 :                 *res = bn->batCacheid;
    1195           0 :                 BBPkeepref(*res);
    1196             :         }
    1197           0 :         BBPunfix(b->batCacheid);
    1198           0 :         return msg;
    1199             : }
    1200             : /* Match *val against the regular expression *pat by calling
                     :  * pcre_match_with_flags(); the flag string is "s" when built with
                     :  * libpcre and "x" otherwise.  The outcome is written to *ret by the
                     :  * callee and its return value is passed through. */
    1201             : static str
    1202           4 : PCREmatch(bit *ret, const str *val, const str *pat)
    1203             : {
    1204         124 :         return pcre_match_with_flags(ret, *val, *pat,
    1205             : #ifdef HAVE_LIBPCRE
    1206             :                                                                  "s"
    1207             : #else
    1208             :                                                                  "x"
    1209             : #endif
    1210             :                 );
    1211             : }
    1212             : /* Variant of PCREmatch() that passes the flag string "i" to
                     :  * pcre_match_with_flags(); without libpcre the adjacent literal "x"
                     :  * is concatenated to it, giving "ix". */
    1213             : static str
    1214           0 : PCREimatch(bit *ret, const str *val, const str *pat)
    1215             : {
    1216           6 :         return pcre_match_with_flags(ret, *val, *pat, "i"
    1217             : #ifndef HAVE_LIBPCRE
    1218             :                                                                  "x"
    1219             : #endif
    1220             :                 );
    1221             : }
    1222             : /* Run the compiled pattern over the string *s with pcre_exec() and a
                     :  * three-element output vector.  *res is preset to 0 and, when
                     :  * pcre_exec() returns a non-negative value, overwritten with the
                     :  * second vector element (v[1]); per the PCRE documentation that is
                     :  * the offset just past the end of the match.  A negative return from
                     :  * pcre_exec() is not reported as an error: *res simply stays 0.
                     :  *
                     :  * Without libpcre the arguments are ignored and an exception is
                     :  * thrown. */
    1223             : static str
    1224          22 : PCREindex(int *res, const pcre *pattern, const str *s)
    1225             : {
    1226             : #ifdef HAVE_LIBPCRE
    1227             :         int v[3];
    1228             : 
    1229          22 :         v[0] = v[1] = *res = 0;
    1230          22 :         if (pcre_exec(pattern, NULL, *s, (int) strlen(*s), 0, PCRE_NO_UTF8_CHECK, v, 3) >= 0) {
    1231          22 :                 *res = v[1];
    1232             :         }
    1233          22 :         return MAL_SUCCEED;
    1234             : #else
    1235             :         (void) res;
    1236             :         (void) pattern;
    1237             :         (void) s;
    1238             :         throw(MAL, "pcre.index", "Database was compiled without PCRE support.");
    1239             : #endif
    1240             : }
    1241             : /* PATINDEX implementation: if either *pat or *val is nil, *ret is set
                     :  * to int_nil.  Otherwise the pattern is translated with pat2pcre(),
                     :  * compiled with pcre_compile_wrap() (third argument FALSE) and
                     :  * applied to *val through PCREindex(), whose result lands in *ret.
                     :  * The translated pattern is freed on both the success and the
                     :  * compile-failure path; the compiled pattern is freed after use.
                     :  *
                     :  * Without libpcre the arguments are ignored and an exception is
                     :  * thrown. */
    1242             : static str
    1243          22 : PCREpatindex(int *ret, const str *pat, const str *val)
    1244             : {
    1245             : #ifdef HAVE_LIBPCRE
    1246          22 :         pcre *re = NULL;
    1247          22 :         char *ppat = NULL, *msg;
    1248             : 
    1249          66 :         if (strNil(*pat) || strNil(*val)) {
    1250           0 :                 *ret = int_nil;
    1251           0 :                 return MAL_SUCCEED;
    1252             :         }
    1253             : 
    1254          22 :         if ((msg = pat2pcre(&ppat, *pat)) != MAL_SUCCEED)
    1255             :                 return msg;
    1256          22 :         if ((msg = pcre_compile_wrap(&re, ppat, FALSE)) != MAL_SUCCEED) {
    1257           0 :                 GDKfree(ppat);
    1258           0 :                 return msg;
    1259             :         }
    1260          22 :         GDKfree(ppat);
    1261          22 :         msg = PCREindex(ret, re, val);
    1262          22 :         pcre_free(re);
    1263          22 :         return msg;
    1264             : #else
    1265             :         (void) ret;
    1266             :         (void) pat;
    1267             :         (void) val;
    1268             :         throw(MAL, "pcre.patindex", "Database was compiled without PCRE support.");
    1269             : #endif
    1270             : }
    1271             : /* Copy *val into a newly GDKmalloc'ed string stored in *ret, putting
                     :  * a backslash in front of every byte that has its high bit clear and
                     :  * is neither an ASCII letter nor a digit.  The buffer holds twice the
                     :  * input length plus the terminator, which covers the case where
                     :  * every byte is quoted.  No nil check is performed on *val.
                     :  *
                     :  * Returns MAL_SUCCEED, or an exception when the allocation fails. */
    1272             : static str
    1273           0 : PCREquote(str *ret, const str *val)
    1274             : {
    1275             :         char *p;
    1276           0 :         const char *s = *val;
    1277             : 
    1278           0 :         *ret = p = GDKmalloc(strlen(s) * 2 + 1); /* certainly long enough */
    1279           0 :         if (p == NULL)
    1280           0 :                 throw(MAL, "pcre.quote", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1281             :         /* quote all non-alphanumeric ASCII characters (i.e. leave
    1282             :            non-ASCII and alphanumeric alone) */
    1283           0 :         while (*s) {
    1284           0 :                 if (!((*s & 0x80) != 0 ||
    1285           0 :                       ('a' <= *s && *s <= 'z') ||
    1286           0 :                       ('A' <= *s && *s <= 'Z') ||
    1287           0 :                       isdigit((unsigned char) *s)))
    1288           0 :                         *p++ = '\\';
    1289           0 :                 *p++ = *s++;
    1290             :         }
    1291           0 :         *p = 0;
    1292           0 :         return MAL_SUCCEED;
    1293             : }
    1294             : /* Pointer-argument wrapper around sql2pcre(): dereferences pat and
                     :  * esc and returns whatever sql2pcre() returns. */
    1295             : static str
    1296           6 : PCREsql2pcre(str *ret, const str *pat, const str *esc)
    1297             : {
    1298           6 :         return sql2pcre(ret, *pat, *esc);
    1299             : }
    1300             : /* Decide how a (I)LIKE pattern is going to be evaluated.  All three
                     :  * output flags are first reset to false, then:
                     :  *   - *pat or *esc nil: *empty = true;
                     :  *   - re_is_pattern_properly_escaped() fails: an exception is thrown;
                     :  *   - is_strcmpable(): *use_re = *use_strcmp = true;
                     :  *   - re_simple(): *use_re = true;
                     :  *   - otherwise sql2pcre() stores its translation in *ppat and all
                     :  *     flags stay false; should that translation turn out to be the
                     :  *     nil string it is freed, *ppat is set to NULL and the strcmp
                     :  *     flags are set instead.
                     :  *
                     :  * *ppat is only written in the last case.  Returns MAL_SUCCEED or the
                     :  * exception described above / propagated from sql2pcre(). */
    1301             : static inline str
    1302        2094 : choose_like_path(char **ppat, bool *use_re, bool *use_strcmp, bool *empty, const str *pat, const str *esc)
    1303             : {
    1304             :         str res = MAL_SUCCEED;
    1305        2094 :         *use_re = false;
    1306        2094 :         *use_strcmp = false;
    1307        2094 :         *empty = false;
    1308             : 
    1309        6260 :         if (strNil(*pat) || strNil(*esc)) {
    1310          22 :                 *empty = true;
    1311             :         } else {
    1312        2072 :                 if (!re_is_pattern_properly_escaped(*pat, (unsigned char) **esc))
    1313           5 :                         throw(MAL, "pcre.sql2pcre", SQLSTATE(22019) ILLEGAL_ARGUMENT ": (I)LIKE pattern must not end with escape character");
    1314        2067 :                 if (is_strcmpable(*pat, *esc)) {
    1315         205 :                         *use_re = true;
    1316         205 :                         *use_strcmp = true;
    1317        1862 :                 } else if (re_simple(*pat, (unsigned char) **esc)) {
    1318        1577 :                         *use_re = true;
    1319             :                 } else {
    1320         285 :                         if ((res = sql2pcre(ppat, *pat, *esc)) != MAL_SUCCEED)
    1321             :                                 return res;
    1322         570 :                         if (strNil(*ppat)) {
    1323           0 :                                 GDKfree(*ppat);
    1324           0 :                                 *ppat = NULL;
    1325           0 :                                 *use_re = true;
    1326           0 :                                 *use_strcmp = true;
    1327             :                         }
    1328             :                 }
    1329             :         }
    1330             :         return res;
    1331             : }
    1332             : /* Scalar (I)LIKE: evaluate string *s against LIKE pattern *pat with
                     :  * escape string *esc and store the outcome in *ret.
                     :  *
                     :  * choose_like_path() picks the strategy, which is also reported via
                     :  * MT_thread_setalgorithm().  *ret becomes bit_nil when *s is nil or
                     :  * the pattern/escape was nil.  Otherwise the comparison is done with
                     :  * strcmp()/mystrcasecmp(), with a struct RE built by re_create(), or
                     :  * with PCREmatch()/PCREimatch() on the translated pattern.  A
                     :  * non-zero *isens selects the case-ignoring function in each pair.
                     :  *
                     :  * The struct RE (if created) and the translated pattern are released
                     :  * before returning.  Returns MAL_SUCCEED, the exception from
                     :  * choose_like_path(), an allocation-failure exception when
                     :  * re_create() returns NULL, or the message from the PCRE matcher. */
    1333             : static str
    1334         398 : PCRElike_imp(bit *ret, const str *s, const str *pat, const str *esc, const bit *isens)
    1335             : {
    1336             :         str res = MAL_SUCCEED;
    1337         398 :         char *ppat = NULL;
    1338         398 :         bool use_re = false, use_strcmp = false, empty = false;
    1339             :         struct RE *re = NULL;
    1340             : 
    1341         398 :         if ((res = choose_like_path(&ppat, &use_re, &use_strcmp, &empty, pat, esc)) != MAL_SUCCEED)
    1342             :                 return res;
    1343             : 
    1344         763 :         MT_thread_setalgorithm(empty ? "pcrelike: trivially empty" : use_strcmp ? "pcrelike: pattern matching using strcmp" :
    1345         370 :                                                    use_re ? "pcrelike: pattern matching using RE" : "pcrelike: pattern matching using pcre");
    1346             : 
    1347         786 :         if (strNil(*s) || empty) {
    1348          12 :                 *ret = bit_nil;
    1349         381 :         } else if (use_re) {
    1350         255 :                 if (use_strcmp) {
    1351          11 :                         *ret = *isens ? mystrcasecmp(*s, *pat) == 0 : strcmp(*s, *pat) == 0;
    1352             :                 } else {
    1353         244 :                         if (!(re = re_create(*pat, *isens, (unsigned char) **esc)))
    1354           0 :                                 res = createException(MAL, "pcre.like4", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1355             :                         else
    1356         244 :                                 *ret = *isens ? re_match_ignore(*s, re) : re_match_no_ignore(*s, re);
    1357             :                 }
    1358             :         } else {
    1359         126 :                 res = *isens ? PCREimatch(ret, s, &ppat) : PCREmatch(ret, s, &ppat);
    1360             :         }
    1361             : 
    1362         244 :         if (re)
    1363         244 :                 re_destroy(re);
    1364         393 :         GDKfree(ppat);
    1365         393 :         return res;
    1366             : }
    1367             : /* LIKE entry point: forwards all arguments unchanged to
                     :  * PCRElike_imp() and returns its result. */
    1368             : static str
    1369         297 : PCRElike(bit *ret, const str *s, const str *pat, const str *esc, const bit *isens)
    1370             : {
    1371         297 :         return PCRElike_imp(ret, s, pat, esc, isens);
    1372             : }
    1373             : /* NOT LIKE entry point: evaluates PCRElike() into a local and stores
                     :  * its logical negation in *ret; a bit_nil result is passed through
                     :  * as bit_nil rather than negated.  The call is wrapped in the
                     :  * rethrow() macro with the name "str.not_like" -- presumably that
                     :  * returns early with a re-labelled exception when PCRElike() fails;
                     :  * confirm against the macro definition. */
    1374             : static str
    1375         101 : PCREnotlike(bit *ret, const str *s, const str *pat, const str *esc, const bit *isens)
    1376             : {
    1377             :         str tmp;
    1378             :         bit r;
    1379             : 
    1380         101 :         rethrow("str.not_like", tmp, PCRElike(&r, s, pat, esc, isens));
    1381          97 :         *ret = r==bit_nil?bit_nil:!r;
    1382          97 :         return MAL_SUCCEED;
    1383             : }
    1384             : /* Prepare the matcher state for the non-PCRE evaluation paths.
                     :  *   - use_strcmp false: *re = re_create(pat, caseignore, esc);
                     :  *   - use_strcmp true and caseignore true: *wpat = utf8stoucs(pat);
                     :  *   - use_strcmp true and caseignore false: nothing is built.
                     :  * The output pointer that is not assigned in a given case is left
                     :  * untouched.  Returns MAL_SUCCEED, or an allocation-failure exception
                     :  * when the helper called returns NULL. */
    1385             : static inline str
    1386        1523 : re_like_build(struct RE **re, uint32_t **wpat, const char *pat, bool caseignore, bool use_strcmp, uint32_t esc)
    1387             : {
    1388        1523 :         if (!use_strcmp) {
    1389        1333 :                 if (!(*re = re_create(pat, caseignore, esc)))
    1390           0 :                         return createException(MAL, "pcre.re_like_build", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1391         190 :         } else if (caseignore) {
    1392          13 :                 if (!(*wpat = utf8stoucs(pat)))
    1393           0 :                         return createException(MAL, "pcre.re_like_build", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1394             :         }
    1395             :         return MAL_SUCCEED;
    1396             : }
    1397             : /* proj_scanloop(TEST): expands to a statement that returns from the
                     :  * enclosing function -- bit_nil when the first byte of s is '\200',
                     :  * otherwise the value of TEST.  It relies on a variable named s being
                     :  * in scope at the point of use.  Despite the name, the only loop is
                     :  * the do { } while (0) wrapper. */
    1398             : #define proj_scanloop(TEST)     \
    1399             :         do {    \
    1400             :                 if (*s == '\200') \
    1401             :                         return bit_nil; \
    1402             :                 else \
    1403             :                         return TEST; \
    1404             :         } while (0)
    1405             : /* Evaluate one input string s against the state prepared by
                     :  * re_like_build() and return the result as a bit.
                     :  *   - use_strcmp: equality test with mywstrcasecmp(s, wpat) when
                     :  *     caseignore is set, else with strcmp(s, pat);
                     :  *   - otherwise: re_match_ignore(s, re) when caseignore is set, else
                     :  *     re_match_no_ignore(s, re).
                     :  * anti inverts the test.  Every branch ends in proj_scanloop(), which
                     :  * returns bit_nil instead when s starts with '\200', so control never
                     :  * reaches the end of the function body. */
    1406             : static inline bit
    1407        3424 : re_like_proj_apply(str s, struct RE *re, uint32_t *wpat, const char *pat, bool caseignore, bool anti, bool use_strcmp)
    1408             : {
    1409        3424 :         if (use_strcmp) {
    1410          26 :                 if (caseignore) {
    1411           2 :                         if (anti)
    1412           1 :                                 proj_scanloop(mywstrcasecmp(s, wpat) != 0);
    1413             :                         else
    1414           1 :                                 proj_scanloop(mywstrcasecmp(s, wpat) == 0);
    1415             :                 } else {
    1416          24 :                         if (anti)
    1417           1 :                                 proj_scanloop(strcmp(s, pat) != 0);
    1418             :                         else
    1419          23 :                                 proj_scanloop(strcmp(s, pat) == 0);
    1420             :                 }
    1421             :         } else {
    1422        3398 :                 if (caseignore) {
    1423           0 :                         if (anti)
    1424           0 :                                 proj_scanloop(!re_match_ignore(s, re));
    1425             :                         else
    1426           0 :                                 proj_scanloop(re_match_ignore(s, re));
    1427             :                 } else {
    1428        3398 :                         if (anti)
    1429           0 :                                 proj_scanloop(!re_match_no_ignore(s, re));
    1430             :                         else
    1431        3398 :                                 proj_scanloop(re_match_no_ignore(s, re));
    1432             :                 }
    1433             :         }
    1434             : }
    1435             : /* Release what re_like_build() may have created: a non-NULL *re is
                     :  * passed to re_destroy(), a non-NULL *wpat to GDKfree(); each pointer
                     :  * is reset to NULL afterwards, so calling this twice is harmless. */
    1436             : static inline void
    1437        1613 : re_like_clean(struct RE **re, uint32_t **wpat)
    1438             : {
    1439        1613 :         if (*re) {
    1440        1331 :                 re_destroy(*re);
    1441        1331 :                 *re = NULL;
    1442             :         }
    1443        1613 :         if (*wpat) {
    1444          13 :                 GDKfree(*wpat);
    1445          13 :                 *wpat = NULL;
    1446             :         }
    1447        1613 : }
    1448             : /* Compile the translated pattern ppat for use by pcre_like_apply().
                     :  *
                     :  * With libpcre: *res and *ex are reset to NULL, the pattern is
                     :  * compiled with PCRE_UTF8 | PCRE_NO_UTF8_CHECK | PCRE_MULTILINE |
                     :  * PCRE_DOTALL (plus PCRE_CASELESS when caseignore is set) and then
                     :  * studied; PCRE_STUDY_JIT_COMPILE is requested only when count is
                     :  * larger than JIT_COMPILE_MIN.
                     :  * Without libpcre: *res is zeroed and the pattern is compiled by
                     :  * regcomp() with REG_NEWLINE | REG_NOSUB | REG_EXTENDED (plus
                     :  * REG_ICASE when caseignore is set); count and ex are unused.
                     :  *
                     :  * Returns MAL_SUCCEED, or an exception describing the compile or
                     :  * study failure.
                     :  * NOTE(review): when pcre_study() reports an error, *res still holds
                     :  * the compiled pattern on return; presumably the caller releases it
                     :  * through pcre_clean() -- confirm.
                     :  * NOTE(review): in the regcomp() branch errcode is assigned but not
                     :  * included in the exception text. */
    1449             : static inline str
    1450         159 : pcre_like_build(
    1451             : #ifdef HAVE_LIBPCRE
    1452             :         pcre **res,
    1453             :         pcre_extra **ex
    1454             : #else
    1455             :         regex_t *res,
    1456             :         void *ex
    1457             : #endif
    1458             : , const char *ppat, bool caseignore, BUN count)
    1459             : {
    1460             : #ifdef HAVE_LIBPCRE
    1461         159 :         const char *err_p = NULL;
    1462         159 :         int errpos = 0;
    1463             :         int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK | PCRE_MULTILINE | PCRE_DOTALL;
    1464         159 :         int pcrestopt = count > JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0;
    1465             : 
    1466         159 :         *res = NULL;
    1467         159 :         *ex = NULL;
    1468             : #else
    1469             :         int options = REG_NEWLINE | REG_NOSUB | REG_EXTENDED;
    1470             :         int errcode;
    1471             : 
    1472             :         *res = (regex_t) {0};
    1473             :         (void) count;
    1474             : #endif
    1475             : 
    1476         159 :         if (caseignore) {
    1477             : #ifdef HAVE_LIBPCRE
    1478             :                 options |= PCRE_CASELESS;
    1479             : #else
    1480             :                 options |= REG_ICASE;
    1481             : #endif
    1482             :         }
    1483         159 :         if (
    1484             : #ifdef HAVE_LIBPCRE
    1485         159 :                 (*res = pcre_compile(ppat, options, &err_p, &errpos, NULL)) == NULL
    1486             : #else
    1487             :                 (errcode = regcomp(res, ppat, options)) != 0
    1488             : #endif
    1489             :                 )
    1490           0 :                 return createException(MAL, "pcre.pcre_like_build", OPERATION_FAILED
    1491             :                                                                 ": compilation of regular expression (%s) failed"
    1492             : #ifdef HAVE_LIBPCRE
    1493             :                                                                 " at %d with '%s'", ppat, errpos, err_p
    1494             : #else
    1495             :                                                                 , ppat
    1496             : #endif
    1497             :                         );
    1498             : #ifdef HAVE_LIBPCRE
    1499         159 :         *ex = pcre_study(*res, pcrestopt, &err_p);
    1500         159 :         if (err_p != NULL)
    1501           0 :                 return createException(MAL, "pcre.pcre_like_build", OPERATION_FAILED
    1502             :                                                                 ": pcre study of pattern (%s) "
    1503             :                                                                 "failed with '%s'", ppat, err_p);
    1504             : #else
    1505             :         (void) ex;
    1506             : #endif
    1507             :         return MAL_SUCCEED;
    1508             : }
    1509             : /* PCRE_LIKE_BODY(LOOP_BODY, RES1, RES2): first executes LOOP_BODY,
                     :  * which is expected to set pos, then stores into *ret:
                     :  *   - bit_nil when the first byte of s is '\200' (this check comes
                     :  *     after LOOP_BODY has already run),
                     :  *   - RES1 when pos >= 0,
                     :  *   - RES2 when pos == -1;
                     :  * any other value of pos makes the enclosing function return an
                     :  * exception.  The expansion relies on s, ret, pos and ppat being in
                     :  * scope at the point of use. */
    1510             : #define PCRE_LIKE_BODY(LOOP_BODY, RES1, RES2) \
    1511             :         do { \
    1512             :                 LOOP_BODY  \
    1513             :                 if (*s == '\200') \
    1514             :                         *ret = bit_nil; \
    1515             :                 else if (pos >= 0) \
    1516             :                         *ret = RES1; \
    1517             :                 else if (pos == -1) \
    1518             :                         *ret = RES2; \
    1519             :                 else \
    1520             :                         return createException(MAL, "pcre.match", OPERATION_FAILED ": matching of regular expression (%s) failed with %d", ppat, pos); \
    1521             :         } while(0)
    1522             : /* Match one string s against a pattern compiled by pcre_like_build()
                     :  * and store the outcome in *ret through PCRE_LIKE_BODY: TRUE on a
                     :  * match and FALSE on no match, swapped when anti is set, bit_nil
                     :  * when s starts with '\200'.  ppat is only used in the exception
                     :  * text.
                     :  *
                     :  * LOOP_BODY is #define'd inside this function and is not #undef'ed
                     :  * here.  With libpcre it sets pos to the return value of pcre_exec().
                     :  * Without libpcre it maps the return value of regexec(): REG_NOMATCH
                     :  * to -1, REG_ENOSYS to -2 and everything else to 0.
                     :  * NOTE(review): that mapping treats any other non-zero regexec()
                     :  * result as a match -- confirm that this is intended.
                     :  *
                     :  * Returns MAL_SUCCEED, or the exception raised by PCRE_LIKE_BODY. */
    1523             : static inline str
    1524        1091 : pcre_like_apply(bit *ret, str s,
    1525             : #ifdef HAVE_LIBPCRE
    1526             :         pcre *re, pcre_extra *ex
    1527             : #else
    1528             :         regex_t re, void *ex
    1529             : #endif
    1530             : , const char *ppat, bool anti)
    1531             : {
    1532             :         int pos;
    1533             : 
    1534             : #ifdef HAVE_LIBPCRE
    1535             : #define LOOP_BODY       \
    1536             :         pos = pcre_exec(re, ex, s, (int) strlen(s), 0, PCRE_NO_UTF8_CHECK, NULL, 0);
    1537             : #else
    1538             : #define LOOP_BODY       \
    1539             :         int retval = regexec(&re, s, (size_t) 0, NULL, 0); \
    1540             :         (void) ex; \
    1541             :         pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0);
    1542             : #endif
    1543             : 
    1544        1091 :         if (anti)
    1545           0 :                 PCRE_LIKE_BODY(LOOP_BODY, FALSE, TRUE);
    1546             :         else
    1547        1091 :                 PCRE_LIKE_BODY(LOOP_BODY, TRUE, FALSE);
    1548             : 
    1549             :         return MAL_SUCCEED;
    1550             : }
    1551             : /* Release the state produced by pcre_like_build().
                     :  * With libpcre: a non-NULL *re goes to pcre_free(), a non-NULL *ex to
                     :  * pcre_free_study(), and both pointers are reset to NULL.
                     :  * Without libpcre: regfree() is called on re unconditionally and *re
                     :  * is reset to all zeroes; ex is unused. */
    1552             : static inline void
    1553         519 : pcre_clean(
    1554             : #ifdef HAVE_LIBPCRE
    1555             :         pcre **re, pcre_extra **ex) {
    1556         519 :         if (*re)
    1557         159 :                 pcre_free(*re);
    1558         519 :         if (*ex)
    1559         159 :                 pcre_free_study(*ex);
    1560         519 :         *re = NULL;
    1561         519 :         *ex = NULL;
    1562             : #else
    1563             :         regex_t *re, void *ex) {
    1564             :         regfree(re);
    1565             :         *re = (regex_t) {0};
    1566             :         (void) ex;
    1567             : #endif
    1568         519 : }
    1569             : 
    1570             : static str
    1571         367 : BATPCRElike_imp(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci, const str *esc, const bit *isens, const bit *not)
    1572             : {
    1573         367 :         str msg = MAL_SUCCEED, input = NULL, pat = NULL;
    1574             :         BAT *b = NULL, *pbn = NULL, *bn = NULL;
    1575         367 :         char *ppat = NULL;
    1576         367 :         bool use_re = false, use_strcmp = false, empty = false, isensitive = (bool) *isens, anti = (bool) *not, has_nil = false,
    1577         367 :                  input_is_a_bat = isaBatType(getArgType(mb, pci, 1)), pattern_is_a_bat = isaBatType(getArgType(mb, pci, 2));
    1578         367 :         bat *r = getArgReference_bat(stk, pci, 0);
    1579             :         BUN q = 0;
    1580             :         bit *ret = NULL;
    1581             : #ifdef HAVE_LIBPCRE
    1582         367 :         pcre *re = NULL;
    1583         367 :         pcre_extra *ex = NULL;
    1584             : #else
    1585             :         regex_t re = (regex_t) {0};
    1586             :         void *ex = NULL;
    1587             : #endif
    1588         367 :         struct RE *re_simple = NULL;
    1589         367 :         uint32_t *wpat = NULL;
    1590         367 :         BATiter bi = (BATiter) {0}, pi;
    1591             : 
    1592             :         (void) cntxt;
    1593         367 :         if (input_is_a_bat) {
    1594         367 :                 bat *bid = getArgReference_bat(stk, pci, 1);
    1595         367 :                 if (!(b = BATdescriptor(*bid))) {
    1596           0 :                         msg = createException(MAL, "batalgebra.batpcrelike3", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1597           0 :                         goto bailout;
    1598             :                 }
    1599             :         }
    1600         367 :         if (pattern_is_a_bat) {
    1601          11 :                 bat *pb = getArgReference_bat(stk, pci, 2);
    1602          11 :                 if (!(pbn = BATdescriptor(*pb))) {
    1603           0 :                         msg = createException(MAL, "batalgebra.batpcrelike3", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1604           0 :                         goto bailout;
    1605             :                 }
    1606             :         }
    1607         367 :         assert((!b || ATOMstorage(b->ttype) == TYPE_str) && (!pbn || ATOMstorage(pbn->ttype) == TYPE_str));
    1608             : 
    1609         367 :         q = BATcount(b ? b : pbn);
    1610         367 :         if (!(bn = COLnew(b ? b->hseqbase : pbn->hseqbase, TYPE_bit, q, TRANSIENT))) {
    1611           0 :                 msg = createException(MAL, "batalgebra.batpcrelike3", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1612           0 :                 goto bailout;
    1613             :         }
    1614         367 :         ret = (bit*) Tloc(bn, 0);
    1615             : 
    1616         367 :         if (pattern_is_a_bat) {
    1617          11 :                 pi = bat_iterator(pbn);
    1618          11 :                 if (b)
    1619          11 :                         bi = bat_iterator(b);
    1620             :                 else
    1621           0 :                         input = *getArgReference_str(stk, pci, 1);
    1622             : 
    1623          39 :                 for (BUN p = 0; p < q; p++) {
    1624          28 :                         const str next_input = b ? BUNtail(bi, p) : input, np = BUNtail(pi, p);
    1625             : 
    1626          28 :                         if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty, &np, esc)) != MAL_SUCCEED) {
    1627           0 :                                 bat_iterator_end(&pi);
    1628           0 :                                 if (b)
    1629           0 :                                         bat_iterator_end(&bi);
    1630           0 :                                 goto bailout;
    1631             :                         }
    1632             : 
    1633          28 :                         if (use_re) {
    1634          24 :                                 if ((msg = re_like_build(&re_simple, &wpat, np, isensitive, use_strcmp, (unsigned char) **esc)) != MAL_SUCCEED) {
    1635           0 :                                         bat_iterator_end(&pi);
    1636           0 :                                         if (b)
    1637           0 :                                                 bat_iterator_end(&bi);
    1638           0 :                                         goto bailout;
    1639             :                                 }
    1640          24 :                                 ret[p] = re_like_proj_apply(next_input, re_simple, wpat, np, isensitive, anti, use_strcmp);
    1641          24 :                                 re_like_clean(&re_simple, &wpat);
    1642           4 :                         } else if (empty) {
    1643           4 :                                 ret[p] = bit_nil;
    1644             :                         } else {
    1645           0 :                                 if ((msg = pcre_like_build(&re, &ex, ppat, isensitive, 1)) != MAL_SUCCEED) {
    1646           0 :                                         bat_iterator_end(&pi);
    1647           0 :                                         if (b)
    1648           0 :                                                 bat_iterator_end(&bi);
    1649           0 :                                         goto bailout;
    1650             :                                 }
    1651           0 :                                 if ((msg = pcre_like_apply(&(ret[p]), next_input, re, ex, ppat, anti)) != MAL_SUCCEED) {
    1652           0 :                                         bat_iterator_end(&pi);
    1653           0 :                                         if (b)
    1654           0 :                                                 bat_iterator_end(&bi);
    1655           0 :                                         goto bailout;
    1656             :                                 }
    1657           0 :                                 pcre_clean(&re, &ex);
    1658             :                         }
    1659          28 :                         has_nil |= is_bit_nil(ret[p]);
    1660          28 :                         GDKfree(ppat);
    1661          28 :                         ppat = NULL;
    1662             :                 }
    1663          11 :                 bat_iterator_end(&pi);
    1664          11 :                 if (b)
    1665          11 :                         bat_iterator_end(&bi);
    1666             :         } else {
    1667         356 :                 pat = *getArgReference_str(stk, pci, 2);
    1668         356 :                 if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty, &pat, esc)) != MAL_SUCCEED)
    1669           0 :                         goto bailout;
    1670             : 
    1671         356 :                 bi = bat_iterator(b);
    1672         711 :                 MT_thread_setalgorithm(empty ? "pcrelike: trivially empty" : use_strcmp ? "pcrelike: pattern matching using strcmp" :
    1673         355 :                                                            use_re ? "pcrelike: pattern matching using RE" : "pcrelike: pattern matching using pcre");
    1674             : 
    1675         356 :                 if (use_re) {
    1676         275 :                         if ((msg = re_like_build(&re_simple, &wpat, pat, isensitive, use_strcmp, (unsigned char) **esc)) != MAL_SUCCEED) {
    1677           0 :                                 bat_iterator_end(&bi);
    1678           0 :                                 goto bailout;
    1679             :                         }
    1680        3673 :                         for (BUN p = 0; p < q; p++) {
    1681        3398 :                                 const str s = BUNtail(bi, p);
    1682        3398 :                                 ret[p] = re_like_proj_apply(s, re_simple, wpat, pat, isensitive, anti, use_strcmp);
    1683        3398 :                                 has_nil |= is_bit_nil(ret[p]);
    1684             :                         }
    1685          81 :                 } else if (empty) {
    1686           0 :                         for (BUN p = 0; p < q; p++)
    1687           0 :                                 ret[p] = bit_nil;
    1688             :                         has_nil = true;
    1689             :                 } else {
    1690          81 :                         if ((msg = pcre_like_build(&re, &ex, ppat, isensitive, q)) != MAL_SUCCEED) {
    1691           0 :                                 bat_iterator_end(&bi);
    1692           0 :                                 goto bailout;
    1693             :                         }
    1694        1171 :                         for (BUN p = 0; p < q; p++) {
    1695        1090 :                                 const str s = BUNtail(bi, p);
    1696        1090 :                                 if ((msg = pcre_like_apply(&(ret[p]), s, re, ex, ppat, anti)) != MAL_SUCCEED) {
    1697           0 :                                         bat_iterator_end(&bi);
    1698           0 :                                         goto bailout;
    1699             :                                 }
    1700        1090 :                                 has_nil |= is_bit_nil(ret[p]);
    1701             :                         }
    1702             :                 }
    1703         356 :                 bat_iterator_end(&bi);
    1704             :         }
    1705             : 
    1706         367 : bailout:
    1707         367 :         GDKfree(ppat);
    1708         367 :         re_like_clean(&re_simple, &wpat);
    1709         367 :         pcre_clean(&re, &ex);
    1710         367 :         if (bn && !msg) {
    1711         367 :                 BATsetcount(bn, q);
    1712         367 :                 bn->tnil = has_nil;
    1713         367 :                 bn->tnonil = !has_nil;
    1714         367 :                 bn->tkey = BATcount(bn) <= 1;
    1715         367 :                 bn->tsorted = BATcount(bn) <= 1;
    1716         367 :                 bn->trevsorted = BATcount(bn) <= 1;
    1717         367 :                 BBPkeepref(*r = bn->batCacheid);
    1718           0 :         } else if (bn)
    1719           0 :                 BBPreclaim(bn);
    1720         367 :         if (b)
    1721         367 :                 BBPunfix(b->batCacheid);
    1722         367 :         if (pbn)
    1723          11 :                 BBPunfix(pbn->batCacheid);
    1724         367 :         return msg;
    1725             : }
    1726             : /* MAL wrapper: reads arguments 3 and 4 from the stack and delegates to BATPCRElike_imp with a FALSE bit as last argument (presumably the negation/"anti" flag, i.e. plain LIKE -- confirm against BATPCRElike_imp). */
    1727             : static str
    1728         365 : BATPCRElike(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
    1729             : {
    1730         365 :         const str *esc = getArgReference_str(stk, pci, 3); /* argument 3: string, passed on as the escape argument */
    1731         365 :         const bit *ci = getArgReference_bit(stk, pci, 4); /* argument 4: bit flag; presumably "case insensitive" -- confirm */
    1732         365 :         bit no = FALSE;
    1733             : 
    1734         365 :         return BATPCRElike_imp(cntxt, mb, stk, pci, esc, ci, &no);
    1735             : }
    1736             : /* MAL wrapper: same as BATPCRElike but hands a TRUE bit as last argument to BATPCRElike_imp (presumably the negation/"anti" flag, i.e. NOT LIKE -- confirm against BATPCRElike_imp). */
    1737             : static str
    1738           2 : BATPCREnotlike(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
    1739             : {
    1740           2 :         const str *esc = getArgReference_str(stk, pci, 3); /* argument 3: string, passed on as the escape argument */
    1741           2 :         const bit *ci = getArgReference_bit(stk, pci, 4); /* argument 4: bit flag; presumably "case insensitive" -- confirm */
    1742           2 :         bit yes = TRUE;
    1743             : 
    1744           2 :         return BATPCRElike_imp(cntxt, mb, stk, pci, esc, ci, &yes);
    1745             : }
    1746             : 
    1747             : /* Scan select loop with or without candidates. TEST is an expression over the current string value `v`. Without a candidate BAT, or with a dense one, p runs up to q, the value is read at position p - off and p is stored in vals on success; otherwise ncands oids are fetched with canditer_next(ci), the value is read at o - off and o is stored. Expects b, s, anti, p, q, ncands, ci, bi, off, vals, cnt, timeoffset, counter and the label `bailout` (target of the timeout check) in the expanding scope. */
    1748             : #define pcrescanloop(TEST)              \
    1749             :         do {    \
    1750             :                 TRC_DEBUG(ALGO,                 \
    1751             :                                   "PCREselect(b=%s#"BUNFMT",anti=%d): "             \
    1752             :                                   "scanselect %s\n", BATgetId(b), BATcount(b),        \
    1753             :                                   anti, #TEST);         \
    1754             :                 if (!s || BATtdense(s)) {       \
    1755             :                         for (; p < q; p++) { \
    1756             :                 GDK_CHECK_TIMEOUT(timeoffset, counter,                                          \
    1757             :                         GOTO_LABEL_TIMEOUT_HANDLER(bailout));                           \
    1758             :                                 const char *restrict v = BUNtvar(bi, p - off);  \
    1759             :                                 if (TEST)       \
    1760             :                                         vals[cnt++] = p;        \
    1761             :                         }               \
    1762             :                 } else {                \
    1763             :                         for (; p < ncands; p++) {            \
    1764             :                 GDK_CHECK_TIMEOUT(timeoffset, counter,                                          \
    1765             :                         GOTO_LABEL_TIMEOUT_HANDLER(bailout));                           \
    1766             :                                 oid o = canditer_next(ci);              \
    1767             :                                 const char *restrict v = BUNtvar(bi, o - off);  \
    1768             :                                 if (TEST)       \
    1769             :                                         vals[cnt++] = o;        \
    1770             :                         }               \
    1771             :                 }               \
    1772             :         } while (0)
    1773             : /* PCRE_LIKESELECT_BODY: expression that is true when the compiled pattern (re, ex) accepts the string v; expects re, ex and v in the expanding scope. */
    1774             : #ifdef HAVE_LIBPCRE
    1775             : #define PCRE_LIKESELECT_BODY (pcre_exec(re, ex, v, (int) strlen(v), 0, PCRE_NO_UTF8_CHECK, NULL, 0) >= 0) /* any negative pcre_exec result counts as "no match" */
    1776             : #else /* no libpcre: POSIX regexec on a regex_t */
    1777             : #define PCRE_LIKESELECT_BODY (regexec(&re, v, (size_t) 0, NULL, 0) != REG_NOMATCH) /* every result other than REG_NOMATCH counts as a match */
    1778             : #endif
    1779             : /* Select using the pcre/regex matcher: compiles pat with pcre_like_build, scans b via pcrescanloop and writes into the tail of bn the oids whose value matches (or, with anti, does not match); *rcnt receives the number of oids written. Returns MAL_SUCCEED or an error message. */
    1780             : static str
    1781          77 : pcre_likeselect(BAT *bn, BAT *b, BAT *s, struct canditer *ci, BUN p, BUN q, BUN *rcnt, const char *pat, bool caseignore, bool anti)
    1782             : {
    1783             : #ifdef HAVE_LIBPCRE
    1784          77 :         pcre *re = NULL;
    1785          77 :         pcre_extra *ex = NULL;
    1786             : #else
    1787             :         regex_t re = (regex_t) {0};
    1788             :         void *ex = NULL;
    1789             : #endif
    1790          77 :         BATiter bi = bat_iterator(b);
    1791          78 :         BUN cnt = 0, ncands = ci->ncand;
    1792          78 :         oid off = b->hseqbase, *restrict vals = Tloc(bn, 0); /* off turns an oid of b into a position; vals is the output array */
    1793             :         str msg = MAL_SUCCEED;
    1794             : 
    1795             :         size_t counter = 0;
    1796             :         lng timeoffset = 0; /* deadline handed to GDK_CHECK_TIMEOUT; stays 0 when no start time or query timeout is set */
    1797          78 :         QryCtx *qry_ctx = MT_thread_get_qry_ctx();
    1798          78 :         if (qry_ctx != NULL) {
    1799          78 :                 timeoffset = (qry_ctx->starttime && qry_ctx->querytimeout) ? (qry_ctx->starttime + qry_ctx->querytimeout) : 0;
    1800             :         }
    1801             : 
    1802          78 :         if ((msg = pcre_like_build(&re, &ex, pat, caseignore, ci->ncand)) != MAL_SUCCEED)
    1803           0 :                 goto bailout;
    1804             : 
    1805             :         /* both tests skip NULL pointers and values starting with '\200' (presumably the nil string -- confirm) */
    1805          78 :         if (anti)
    1806           0 :                 pcrescanloop(v && *v != '\200' && !PCRE_LIKESELECT_BODY);
    1807             :         else
    1808       33400 :                 pcrescanloop(v && *v != '\200' && PCRE_LIKESELECT_BODY);
    1809             : 
    1810           1 : bailout:
    1811          78 :         bat_iterator_end(&bi);
    1812          78 :         pcre_clean(&re, &ex);
    1813          78 :         *rcnt = cnt; /* set on the error path as well */
    1814          78 :         return msg;
    1815             : }
    1816             : /* Select without pcre: builds the matcher with re_like_build and writes into the tail of bn the oids of the values of b that pass the test; *rcnt receives the number of oids written. With use_strcmp the whole value is compared (strcmp against pat, or mywstrcasecmp against wpat when caseignore); otherwise re_match_ignore / re_match_no_ignore is applied. anti inverts the test. Returns MAL_SUCCEED or an error message. */
    1817             : static str
    1818        1149 : re_likeselect(BAT *bn, BAT *b, BAT *s, struct canditer *ci, BUN p, BUN q, BUN *rcnt, const char *pat, bool caseignore, bool anti, bool use_strcmp, uint32_t esc)
    1819             : {
    1820        1149 :         BATiter bi = bat_iterator(b);
    1821        1150 :         BUN cnt = 0, ncands = ci->ncand;
    1822        1150 :         oid off = b->hseqbase, *restrict vals = Tloc(bn, 0); /* off turns an oid of b into a position; vals is the output array */
    1823        1150 :         struct RE *re = NULL;
    1824        1150 :         uint32_t *wpat = NULL;
    1825             :         str msg = MAL_SUCCEED;
    1826             : 
    1827             :         size_t counter = 0;
    1828             :         lng timeoffset = 0; /* deadline handed to GDK_CHECK_TIMEOUT; stays 0 when no start time or query timeout is set */
    1829        1150 :         QryCtx *qry_ctx = MT_thread_get_qry_ctx();
    1830        1150 :         if (qry_ctx != NULL) {
    1831        1150 :                 timeoffset = (qry_ctx->starttime && qry_ctx->querytimeout) ? (qry_ctx->starttime + qry_ctx->querytimeout) : 0;
    1832             :         }
    1833        1150 :         if ((msg = re_like_build(&re, &wpat, pat, caseignore, use_strcmp, esc)) != MAL_SUCCEED)
    1834           0 :                 goto bailout;
    1835             : 
    1836        1142 :         if (use_strcmp) { /* whole-string comparison */
    1837          97 :                 if (caseignore) {
    1838          11 :                         if (anti)
    1839           9 :                                 pcrescanloop(v && *v != '\200' && mywstrcasecmp(v, wpat) != 0);
    1840             :                         else
    1841          36 :                                 pcrescanloop(v && *v != '\200' && mywstrcasecmp(v, wpat) == 0);
    1842             :                 } else {
    1843          86 :                         if (anti)
    1844           0 :                                 pcrescanloop(v && *v != '\200' && strcmp(v, pat) != 0);
    1845             :                         else
    1846        1113 :                                 pcrescanloop(v && *v != '\200' && strcmp(v, pat) == 0);
    1847             :                 }
    1848             :         } else { /* matcher built by re_like_build */
    1849        1045 :                 if (caseignore) {
    1850          60 :                         if (anti)
    1851           0 :                                 pcrescanloop(v && *v != '\200' && !re_match_ignore(v, re));
    1852             :                         else
    1853       10967 :                                 pcrescanloop(v && *v != '\200' && re_match_ignore(v, re));
    1854             :                 } else {
    1855         985 :                         if (anti)
    1856       15166 :                                 pcrescanloop(v && *v != '\200' && !re_match_no_ignore(v, re));
    1857             :                         else
    1858       22810 :                                 pcrescanloop(v && *v != '\200' && re_match_no_ignore(v, re));
    1859             :                 }
    1860             :         }
    1861             : 
    1862           8 : bailout:
    1863        1148 :         bat_iterator_end(&bi);
    1864        1149 :         re_like_clean(&re, &wpat);
    1865        1148 :         *rcnt = cnt; /* set on the error path as well */
    1866        1148 :         return msg;
    1867             : }
    1868             : /* algebra.likeselect: stores in *ret a BAT with the oids of the values of BAT *bid (restricted to the candidate BAT *sid when sid is given and not nil) that match -- or, with *anti, do not match -- the LIKE pattern *pat with escape string *esc; *caseignore selects case-insensitive matching. Returns MAL_SUCCEED or an exception message. */
    1869             : static str
    1870        1228 : PCRElikeselect(bat *ret, const bat *bid, const bat *sid, const str *pat, const str *esc, const bit *caseignore, const bit *anti)
    1871             : {
    1872             :         BAT *b, *s = NULL, *bn = NULL;
    1873             :         str msg = MAL_SUCCEED;
    1874        1228 :         char *ppat = NULL;
    1875        1228 :         bool use_re = false, use_strcmp = false, empty = false;
    1876             : 
    1877        1228 :         if ((b = BATdescriptor(*bid)) == NULL) {
    1878           0 :                 msg = createException(MAL, "algebra.likeselect", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1879           0 :                 goto bailout;
    1880             :         }
    1881        1228 :         if (sid && !is_bat_nil(*sid) && (s = BATdescriptor(*sid)) == NULL) { /* the candidate list is optional */
    1882           0 :                 msg = createException(MAL, "algebra.likeselect", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1883           0 :                 goto bailout;
    1884             :         }
    1885             : 
    1886        1228 :         assert(ATOMstorage(b->ttype) == TYPE_str);
    1887        1228 :         if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty, pat, esc)) != MAL_SUCCEED)
    1888           0 :                 goto bailout;
    1889             : 
    1890        2354 :         MT_thread_setalgorithm(empty ? "pcrelike: trivially empty" : use_strcmp ? "pcrelike: pattern matching using strcmp" :
    1891        1131 :                                                    use_re ? "pcrelike: pattern matching using RE" : "pcrelike: pattern matching using pcre");
    1892             : 
    1893        1224 :         if (empty) { /* trivially empty result: no scan needed */
    1894           0 :                 if (!(bn = BATdense(0, 0, 0)))
    1895           0 :                         msg = createException(MAL, "algebra.likeselect", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1896             :         } else {
    1897        1224 :                 BUN p = 0, q = 0, rcnt = 0;
    1898             :                 struct canditer ci;
    1899             : 
    1900        1224 :                 canditer_init(&ci, b, s);
    1901        1226 :                 if (!(bn = COLnew(0, TYPE_oid, ci.ncand, TRANSIENT))) {
    1902           0 :                         msg = createException(MAL, "algebra.likeselect", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1903           0 :                         goto bailout;
    1904             :                 }
    1905             : 
    1906        1227 :                 if (!s || BATtdense(s)) { /* contiguous oid range [p, q), clipped to the oids of b; otherwise p = q = 0 and the scan walks ci */
    1907        1218 :                         if (s) {
    1908         723 :                                 assert(BATtdense(s));
    1909             :                                 p = (BUN) s->tseqbase;
    1910         723 :                                 q = p + BATcount(s);
    1911         723 :                                 if ((oid) p < b->hseqbase)
    1912             :                                         p = b->hseqbase;
    1913         723 :                                 if ((oid) q > b->hseqbase + BATcount(b))
    1914             :                                         q = b->hseqbase + BATcount(b);
    1915             :                         } else {
    1916         495 :                                 p = b->hseqbase;
    1917         495 :                                 q = BUNlast(b) + b->hseqbase;
    1918             :                         }
    1919             :                 }
    1920             : 
    1921        1227 :                 if (use_re) {
    1922        1149 :                         msg = re_likeselect(bn, b, s, &ci, p, q, &rcnt, *pat, (bool) *caseignore, (bool) *anti, use_strcmp, (unsigned char) **esc);
    1923             :                 } else {
    1924          78 :                         msg = pcre_likeselect(bn, b, s, &ci, p, q, &rcnt, ppat, (bool) *caseignore, (bool) *anti);
    1925             :                 }
    1926        1225 :                 if (!msg) { /* set some properties */
    1927        1226 :                         BATsetcount(bn, rcnt);
    1928        1225 :                         bn->tsorted = true;
    1929        1225 :                         bn->trevsorted = bn->batCount <= 1;
    1930        1225 :                         bn->tkey = true;
    1931        1225 :                         bn->tseqbase = rcnt == 0 ? 0 : rcnt == 1 ? *(const oid *) Tloc(bn, 0) : rcnt == b->batCount ? b->hseqbase : oid_nil; /* a single result is dense from the oid it holds (not necessarily b->hseqbase); rcnt == b->batCount means every row of b was selected */
    1932             :                 }
    1933             :         }
    1934             : 
    1935        1224 : bailout:
    1936        1224 :         if (b)
    1937        1224 :                 BBPunfix(b->batCacheid);
    1938        1228 :         if (s)
    1939         732 :                 BBPunfix(s->batCacheid);
    1940        1228 :         GDKfree(ppat);
    1941        1228 :         if (bn && !msg)
    1942        1228 :                 BBPkeepref(*ret = bn->batCacheid);
    1943           0 :         else if (bn) /* error after bn was created: drop the partial result */
    1944           0 :                 BBPreclaim(bn);
    1945        1228 :         return msg;
    1946             : }
    1947             : /* Helpers for the join loop below. */
    1948             : #define APPEND(b, o)    (((oid *) b->theap->base)[b->batCount++] = (o)) /* store o at position batCount of b's heap (viewed as oid array) and bump batCount; no capacity check here, the caller must have made room */
    1949             : #define VALUE(s, x)             (s##vars + VarHeapVal(s##vals, (x), s##i.width)) /* token-pastes s: VALUE(l, x) reads lvars, lvals and li.width, so those names must exist in the expanding scope */
    1950             : /* PCRE_EXEC runs the compiled pattern (pcrere / pcreex) on the string vl and stores the result in retval; PCRE_EXEC_COND is true when that result means "no match". Both expect pcrere, pcreex, vl and retval in the expanding scope. */
    1951             : #ifdef HAVE_LIBPCRE
    1952             : #define PCRE_EXEC \
    1953             :         do { \
    1954             :                 retval = pcre_exec(pcrere, pcreex, vl, (int) strlen(vl), 0, PCRE_NO_UTF8_CHECK, NULL, 0); \
    1955             :         } while (0)
    1956             : #define PCRE_EXEC_COND (retval < 0) /* any negative pcre_exec result */
    1957             : #else /* no libpcre: POSIX regexec on a regex_t */
    1958             : #define PCRE_EXEC \
    1959             :         do { \
    1960             :                 retval = regexec(&pcrere, vl, (size_t) 0, NULL, 0); \
    1961             :         } while (0)
    1962             : #define PCRE_EXEC_COND (retval == REG_NOMATCH || retval == REG_ENOSYS)
    1963             : #endif
    1964             : 
    1965             : /* Nested loop implementation for PCRE join. For every candidate ro of r the pattern value vr is classified with choose_like_path and a matcher is built; then every candidate lo of l is tested against it and each matching pair is appended (lo to r1, ro to r2 when r2 is given), growing the result BATs when full and updating their sorted/key/dense properties as pairs arrive. STRCMP, RE_MATCH and PCRE_COND are expressions that are true when the pair must be SKIPPED. Expects the locals of the expanding function (lci, rci, nlcand, nrcand, lbase, rbase, lo, ro, vl, vr, nl, lastl, rskipped, newcap, esc, caseignore, re, wpat, pcrepat, pcrere, pcreex, use_re, use_strcmp, empty, msg, timeoffset, counter) and the label `bailout`. */
    1966             : #define pcre_join_loop(STRCMP, RE_MATCH, PCRE_COND) \
    1967             :         do { \
    1968             :                 for (BUN ridx = 0; ridx < nrcand; ridx++) { \
    1969             :                         GDK_CHECK_TIMEOUT(timeoffset, counter, \
    1970             :                                         GOTO_LABEL_TIMEOUT_HANDLER(bailout)); \
    1971             :                         ro = canditer_next(&rci); \
    1972             :                         vr = VALUE(r, ro - rbase); \
    1973             :                         nl = 0; /* number of pairs appended for this ro */ \
    1974             :                         use_re = use_strcmp = empty = false; \
    1975             :                         if ((msg = choose_like_path(&pcrepat, &use_re, &use_strcmp, &empty, (const str*)&vr, (const str*)&esc))) \
    1976             :                                 goto bailout; \
    1977             :                         if (!empty) { \
    1978             :                                 if (use_re) { \
    1979             :                                         if ((msg = re_like_build(&re, &wpat, vr, caseignore, use_strcmp, (unsigned char) *esc)) != MAL_SUCCEED) \
    1980             :                                                 goto bailout; \
    1981             :                                 } else if (pcrepat) { \
    1982             :                                         if ((msg = pcre_like_build(&pcrere, &pcreex, pcrepat, caseignore, nlcand)) != MAL_SUCCEED) \
    1983             :                                                 goto bailout; \
    1984             :                                         GDKfree(pcrepat); \
    1985             :                                         pcrepat = NULL; \
    1986             :                                 } \
    1987             :                                 canditer_reset(&lci); /* restart the inner scan of l for every pattern */ \
    1988             :                                 for (BUN lidx = 0; lidx < nlcand; lidx++) { \
    1989             :                                         lo = canditer_next(&lci); \
    1990             :                                         vl = VALUE(l, lo - lbase); \
    1991             :                                         if (strNil(vl)) { \
    1992             :                                                 continue; \
    1993             :                                         } else if (use_re) { \
    1994             :                                                 if (use_strcmp) { \
    1995             :                                                         if (STRCMP) \
    1996             :                                                                 continue; \
    1997             :                                                 } else { \
    1998             :                                                         assert(re); \
    1999             :                                                         if (RE_MATCH) \
    2000             :                                                                 continue; \
    2001             :                                                 } \
    2002             :                                         } else { \
    2003             :                                                 int retval; \
    2004             :                                                 PCRE_EXEC;  \
    2005             :                                                 if (PCRE_COND) \
    2006             :                                                         continue; \
    2007             :                                         } \
    2008             :                                         if (BUNlast(r1) == BATcapacity(r1)) { /* result is full: grow r1 (and r2) before appending */ \
    2009             :                                                 newcap = BATgrows(r1); \
    2010             :                                                 BATsetcount(r1, BATcount(r1)); \
    2011             :                                                 if (r2) \
    2012             :                                                         BATsetcount(r2, BATcount(r2)); \
    2013             :                                                 if (BATextend(r1, newcap) != GDK_SUCCEED || (r2 && BATextend(r2, newcap) != GDK_SUCCEED)) { \
    2014             :                                                         msg = createException(MAL, "pcre.join", SQLSTATE(HY013) MAL_MALLOC_FAIL); \
    2015             :                                                         goto bailout; \
    2016             :                                                 } \
    2017             :                                                 assert(!r2 || BATcapacity(r1) == BATcapacity(r2)); \
    2018             :                                         } \
    2019             :                                         if (BATcount(r1) > 0) { /* compare with the previously appended lo to keep the properties of r1/r2 truthful */ \
    2020             :                                                 if (lastl + 1 != lo) \
    2021             :                                                         r1->tseqbase = oid_nil; \
    2022             :                                                 if (nl == 0) { \
    2023             :                                                         if (r2) \
    2024             :                                                                 r2->trevsorted = false; \
    2025             :                                                         if (lastl > lo) { \
    2026             :                                                                 r1->tsorted = false; \
    2027             :                                                                 r1->tkey = false; \
    2028             :                                                         } else if (lastl < lo) { \
    2029             :                                                                 r1->trevsorted = false; \
    2030             :                                                         } else { \
    2031             :                                                                 r1->tkey = false; \
    2032             :                                                         } \
    2033             :                                                 } \
    2034             :                                         } \
    2035             :                                         APPEND(r1, lo); \
    2036             :                                         if (r2) \
    2037             :                                                 APPEND(r2, ro); \
    2038             :                                         lastl = lo; \
    2039             :                                         nl++; \
    2040             :                                 } \
    2041             :                                 re_like_clean(&re, &wpat); \
    2042             :                                 pcre_clean(&pcrere, &pcreex); \
    2043             :                         } \
    2044             :                         if (r2) { \
    2045             :                                 if (nl > 1) { \
    2046             :                                         r2->tkey = false; \
    2047             :                                         r2->tseqbase = oid_nil; \
    2048             :                                         r1->trevsorted = false; \
    2049             :                                 } else if (nl == 0) { \
    2050             :                                         rskipped = BATcount(r2) > 0; \
    2051             :                                 } else if (rskipped) { \
    2052             :                                         r2->tseqbase = oid_nil; \
    2053             :                                 } \
    2054             :                         } else if (nl > 1) { \
    2055             :                                 r1->trevsorted = false; \
    2056             :                         } \
    2057             :                 } \
    2058             :         } while (0)
    2059             : 
    2060             : static char *
    2061          29 : pcrejoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, const char *esc, bit caseignore, bit anti)
    2062             : {
    2063             :         struct canditer lci, rci;
    2064             :         const char *lvals, *rvals, *lvars, *rvars, *vl, *vr;
    2065             :         int rskipped = 0;                       /* whether we skipped values in r */
    2066             :         oid lbase, rbase, lo, ro, lastl = 0;            /* last value inserted into r1 */
    2067             :         BUN nl, newcap, nlcand, nrcand;
    2068          29 :         char *pcrepat = NULL, *msg = MAL_SUCCEED;
    2069          29 :         struct RE *re = NULL;
    2070          29 :         bool use_re = false, use_strcmp = false, empty = false;
    2071          29 :         uint32_t *wpat = NULL;
    2072             : #ifdef HAVE_LIBPCRE
    2073          29 :         pcre *pcrere = NULL;
    2074          29 :         pcre_extra *pcreex = NULL;
    2075             : #else
    2076             :         regex_t pcrere = (regex_t) {0};
    2077             :         void *pcreex = NULL;
    2078             : #endif
    2079             : 
    2080             :         size_t counter = 0;
    2081             :         lng timeoffset = 0;
    2082          29 :         QryCtx *qry_ctx = MT_thread_get_qry_ctx();
    2083          29 :         if (qry_ctx != NULL) {
    2084          29 :                 timeoffset = (qry_ctx->starttime && qry_ctx->querytimeout) ? (qry_ctx->starttime + qry_ctx->querytimeout) : 0;
    2085             :         }
    2086             : 
    2087          29 :         TRC_DEBUG(ALGO,
    2088             :                           "pcrejoin(l=%s#" BUNFMT "[%s]%s%s,"
    2089             :                           "r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s,"
    2090             :                           "sr=%s#" BUNFMT "%s%s)\n",
    2091             :                           BATgetId(l), BATcount(l), ATOMname(l->ttype),
    2092             :                           l->tsorted ? "-sorted" : "",
    2093             :                           l->trevsorted ? "-revsorted" : "",
    2094             :                           BATgetId(r), BATcount(r), ATOMname(r->ttype),
    2095             :                           r->tsorted ? "-sorted" : "",
    2096             :                           r->trevsorted ? "-revsorted" : "",
    2097             :                           sl ? BATgetId(sl) : "NULL", sl ? BATcount(sl) : 0,
    2098             :                           sl && sl->tsorted ? "-sorted" : "",
    2099             :                           sl && sl->trevsorted ? "-revsorted" : "",
    2100             :                           sr ? BATgetId(sr) : "NULL", sr ? BATcount(sr) : 0,
    2101             :                           sr && sr->tsorted ? "-sorted" : "",
    2102             :                           sr && sr->trevsorted ? "-revsorted" : "");
    2103             : 
    2104          87 :         assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
    2105          29 :         assert(ATOMtype(l->ttype) == TYPE_str);
    2106             : 
    2107          29 :         nlcand = canditer_init(&lci, l, sl);
    2108          29 :         nrcand = canditer_init(&rci, r, sr);
    2109             : 
    2110          29 :         BATiter li = bat_iterator(l);
    2111          29 :         BATiter ri = bat_iterator(r);
    2112          29 :         lbase = l->hseqbase;
    2113          29 :         rbase = r->hseqbase;
    2114          29 :         lvals = (const char *) li.base;
    2115          29 :         rvals = (const char *) ri.base;
    2116          29 :         assert(r->tvarsized && r->ttype);
    2117          29 :         lvars = li.vh->base;
    2118          29 :         rvars = ri.vh->base;
    2119             : 
    2120          29 :         r1->tkey = true;
    2121          29 :         r1->tsorted = true;
    2122          29 :         r1->trevsorted = true;
    2123          29 :         if (r2) {
    2124          13 :                 r2->tkey = true;
    2125          13 :                 r2->tsorted = true;
    2126          13 :                 r2->trevsorted = true;
    2127             :         }
    2128             : 
    2129          29 :         if (anti) {
    2130          10 :                 if (caseignore) {
    2131           0 :                         pcre_join_loop(mywstrcasecmp(vl, wpat) == 0, re_match_ignore(vl, re), !PCRE_EXEC_COND);
    2132             :                 } else {
    2133          80 :                         pcre_join_loop(strcmp(vl, vr) == 0, re_match_no_ignore(vl, re), !PCRE_EXEC_COND);
    2134             :                 }
    2135             :         } else {
    2136          19 :                 if (caseignore) {
    2137           0 :                         pcre_join_loop(mywstrcasecmp(vl, wpat) != 0, !re_match_ignore(vl, re), PCRE_EXEC_COND);
    2138             :                 } else {
    2139         328 :                         pcre_join_loop(strcmp(vl, vr) != 0, !re_match_no_ignore(vl, re), PCRE_EXEC_COND);
    2140             :                 }
    2141             :         }
    2142          29 :         bat_iterator_end(&li);
    2143          29 :         bat_iterator_end(&ri);
    2144             : 
    2145          29 :         assert(!r2 || BATcount(r1) == BATcount(r2));
    2146             :         /* also set other bits of heap to correct value to indicate size */
    2147          29 :         BATsetcount(r1, BATcount(r1));
    2148          29 :         if (r2)
    2149          13 :                 BATsetcount(r2, BATcount(r2));
    2150          29 :         if (BATcount(r1) > 0) {
    2151          19 :                 if (BATtdense(r1))
    2152           7 :                         r1->tseqbase = ((oid *) r1->theap->base)[0];
    2153          19 :                 if (r2 && BATtdense(r2))
    2154           5 :                         r2->tseqbase = ((oid *) r2->theap->base)[0];
    2155             :         } else {
    2156          10 :                 r1->tseqbase = 0;
    2157          10 :                 if (r2)
    2158           4 :                         r2->tseqbase = 0;
    2159             :         }
    2160          29 :         if (r2)
    2161          13 :                 TRC_DEBUG(ALGO,
    2162             :                                 "pcrejoin(l=%s,r=%s)=(%s#"BUNFMT"%s%s,%s#"BUNFMT"%s%s\n",
    2163             :                                 BATgetId(l), BATgetId(r),
    2164             :                                 BATgetId(r1), BATcount(r1),
    2165             :                                 r1->tsorted ? "-sorted" : "",
    2166             :                                 r1->trevsorted ? "-revsorted" : "",
    2167             :                                 BATgetId(r2), BATcount(r2),
    2168             :                                 r2->tsorted ? "-sorted" : "",
    2169             :                                 r2->trevsorted ? "-revsorted" : "");
    2170             :         else
    2171          16 :                 TRC_DEBUG(ALGO,
    2172             :                         "pcrejoin(l=%s,r=%s)=(%s#"BUNFMT"%s%s\n",
    2173             :                         BATgetId(l), BATgetId(r),
    2174             :                         BATgetId(r1), BATcount(r1),
    2175             :                         r1->tsorted ? "-sorted" : "",
    2176             :                         r1->trevsorted ? "-revsorted" : "");
    2177             :         return MAL_SUCCEED;
    2178             : 
    2179           0 : bailout:
    2180           0 :         bat_iterator_end(&li);
    2181           0 :         bat_iterator_end(&ri);
    2182           0 :         GDKfree(pcrepat);
    2183           0 :         re_like_clean(&re, &wpat);
    2184           0 :         pcre_clean(&pcrere, &pcreex);
    2185           0 :         assert(msg != MAL_SUCCEED);
    2186             :         return msg;
    2187             : }
    2188             : /* PCREjoin: resolve the input bat ids, allocate the oid result BAT(s), read the single escape and case-ignore values and run pcrejoin.  On success the result ids are stored in *r1 (and *r2 when r2 is non-NULL) and MAL_SUCCEED is returned; otherwise everything acquired is released and an exception string is returned. */
    2189             : static str
    2190          29 : PCREjoin(bat *r1, bat *r2, bat lid, bat rid, bat slid, bat srid, bat elid, bat ciid, bit anti)
    2191             : {
    2192             :         BAT *left = NULL, *right = NULL, *escape = NULL, *caseignore = NULL, *candleft = NULL, *candright = NULL;
    2193             :         BAT *result1 = NULL, *result2 = NULL;
    2194             :         char *msg = MAL_SUCCEED, *esc = "";
    2195             :         bit ci;
    2196             : /* Acquire all input BATs; the two candidate lists are optional and only looked up when their id is not nil. */
    2197          29 :         if ((left = BATdescriptor(lid)) == NULL)
    2198           0 :                 goto fail;
    2199          29 :         if ((right = BATdescriptor(rid)) == NULL)
    2200           0 :                 goto fail;
    2201          29 :         if ((escape = BATdescriptor(elid)) == NULL)
    2202           0 :                 goto fail;
    2203          29 :         if ((caseignore = BATdescriptor(ciid)) == NULL)
    2204           0 :                 goto fail;
    2205          29 :         if (!is_bat_nil(slid) && (candleft = BATdescriptor(slid)) == NULL)
    2206           0 :                 goto fail;
    2207          29 :         if (!is_bat_nil(srid) && (candright = BATdescriptor(srid)) == NULL)
    2208           0 :                 goto fail;
    2209          29 :         result1 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT); /* initial capacity: size of the left input */
    2210          29 :         if (r2)
    2211          13 :                 result2 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT);
    2212          29 :         if (!result1 || (r2 && !result2)) {
    2213           0 :                 msg = createException(MAL, "pcre.join", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    2214           0 :                 goto fail;
    2215             :         }
    2216          29 :         result1->tnil = false; /* initial properties of an empty result */
    2217          29 :         result1->tnonil = true;
    2218          29 :         result1->tkey = true;
    2219          29 :         result1->tsorted = true;
    2220          29 :         result1->trevsorted = true;
    2221          29 :         result1->tseqbase = 0;
    2222          29 :         if (r2) {
    2223          13 :                 result2->tnil = false;
    2224          13 :                 result2->tnonil = true;
    2225          13 :                 result2->tkey = true;
    2226          13 :                 result2->tsorted = true;
    2227          13 :                 result2->trevsorted = true;
    2228          13 :                 result2->tseqbase = 0;
    2229             :         }
    2230          29 :         if (BATcount(escape) != 1) { /* exactly one escape value is supported */
    2231           0 :                 msg = createException(MAL, "pcre.join", SQLSTATE(42000) "At the moment, only one value is allowed for the escape input at pcre join");
    2232           0 :                 goto fail;
    2233             :         }
    2234             :         BATiter bi;
    2235          29 :         bi = bat_iterator(escape);
    2236          29 :         esc = BUNtvar(bi, 0); /* NOTE(review): esc is still used after the iterator is ended below; escape itself stays acquired until the end of this function -- confirm that is sufficient */
    2237          29 :         bat_iterator_end(&bi);
    2238          29 :         if (BATcount(caseignore) != 1) { /* exactly one case-ignore flag is supported */
    2239           0 :                 msg = createException(MAL, "pcre.join", SQLSTATE(42000) "At the moment, only one value is allowed for the case ignore input at pcre join");
    2240           0 :                 goto fail;
    2241             :         }
    2242          29 :         bi = bat_iterator(caseignore);
    2243          29 :         ci = *(bit*)BUNtail(bi, 0);
    2244          29 :         bat_iterator_end(&bi);
    2245          29 :         msg = pcrejoin(result1, result2, left, right, candleft, candright, esc, ci, anti);
    2246          29 :         if (msg)
    2247           0 :                 goto fail;
    2248          29 :         *r1 = result1->batCacheid; /* hand the results over to the caller */
    2249          29 :         BBPkeepref(*r1);
    2250          29 :         if (r2) {
    2251          13 :                 *r2 = result2->batCacheid;
    2252          13 :                 BBPkeepref(*r2);
    2253             :         }
    2254          29 :         BBPunfix(left->batCacheid);
    2255          29 :         BBPunfix(right->batCacheid);
    2256             :         if (escape) /* always non-NULL here: a NULL escape jumped to fail above */
    2257          29 :                 BBPunfix(escape->batCacheid);
    2258             :         if (caseignore) /* always non-NULL here as well */
    2259          29 :                 BBPunfix(caseignore->batCacheid);
    2260          29 :         if (candleft)
    2261           0 :                 BBPunfix(candleft->batCacheid);
    2262          29 :         if (candright)
    2263           0 :                 BBPunfix(candright->batCacheid);
    2264             :         return MAL_SUCCEED;
    2265             : /* Failure exit: release whatever was acquired so far; when no specific message was set (a BATdescriptor lookup failed) report a missing runtime object. */
    2266           0 :   fail:
    2267           0 :         if (left)
    2268           0 :                 BBPunfix(left->batCacheid);
    2269           0 :         if (right)
    2270           0 :                 BBPunfix(right->batCacheid);
    2271           0 :         if (escape)
    2272           0 :                 BBPunfix(escape->batCacheid);
    2273           0 :         if (caseignore)
    2274           0 :                 BBPunfix(caseignore->batCacheid);
    2275           0 :         if (candleft)
    2276           0 :                 BBPunfix(candleft->batCacheid);
    2277           0 :         if (candright)
    2278           0 :                 BBPunfix(candright->batCacheid);
    2279           0 :         if (result1)
    2280           0 :                 BBPunfix(result1->batCacheid);
    2281           0 :         if (result2)
    2282           0 :                 BBPunfix(result2->batCacheid);
    2283           0 :         if (msg)
    2284             :                 return msg;
    2285           0 :         throw(MAL, "pcre.join", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    2286             : }
    2287             : /* LIKEjoin: two-result entry point registered as algebra.likejoin; dereferences its pointer arguments and forwards to PCREjoin.  nil_matches and estimate are ignored.  A NULL slid/srid is forwarded as bat id 0 (NOTE(review): relies on is_bat_nil treating 0 as "no candidate list" -- confirm). */
    2288             : static str
    2289          13 : LIKEjoin(bat *r1, bat *r2, const bat *lid, const bat *rid, const bat *elid, const bat *cid, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate, const bit *anti)
    2290             : {
    2291             :         (void) nil_matches;
    2292             :         (void) estimate;
    2293          13 :         return PCREjoin(r1, r2, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0, *elid, *cid, *anti);
    2294             : }
    2295             : /* LIKEjoin1: single-result entry point registered as algebra.likejoin; forwards to PCREjoin with r2 set to NULL so only r1 is produced.  nil_matches and estimate are ignored; a NULL slid/srid is forwarded as bat id 0. */
    2296             : static str
    2297          16 : LIKEjoin1(bat *r1, const bat *lid, const bat *rid, const bat *elid, const bat *cid, const bat *slid, const bat *srid, const bit *nil_matches, const lng *estimate, const bit *anti)
    2298             : {
    2299             :         (void) nil_matches;
    2300             :         (void) estimate;
    2301          16 :         return PCREjoin(r1, NULL, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0, *elid, *cid, *anti);
    2302             : }
    2303             : /* Atom table handed to mal_module() at the end of this file: it declares the single atom "pcre"; the trailing { .cmp=NULL } entry is presumably the end-of-table marker. */
    2304             : #include "mel.h"
    2305             : mel_atom pcre_init_atoms[] = {
    2306             :  { .name="pcre", },  { .cmp=NULL }
    2307             : };
    2308             : mel_func pcre_init_funcs[] = { /* function table handed to mal_module(): each entry gives module, function name, C implementation, a flag, help text and argument signature */
    2309             :  command("pcre", "index", PCREindex, false, "match a pattern, return matched position (or 0 when not found)", args(1,3, arg("",int),arg("pat",pcre),arg("s",str))),
    2310             :  command("pcre", "match", PCREmatch, false, "Perl Compatible Regular Expression pattern matching against a string", args(1,3, arg("",bit),arg("s",str),arg("pat",str))),
    2311             :  command("pcre", "imatch", PCREimatch, false, "Caseless Perl Compatible Regular Expression pattern matching against a string", args(1,3, arg("",bit),arg("s",str),arg("pat",str))),
    2312             :  command("pcre", "patindex", PCREpatindex, false, "Location of the first POSIX pattern matching against a string", args(1,3, arg("",int),arg("pat",str),arg("s",str))),
    2313             :  command("pcre", "replace", PCREreplace_wrap, false, "Replace _all_ matches of \"pattern\" in \"origin_str\" with \"replacement\".\nParameter \"flags\" accept these flags: 'i', 'm', 's', and 'x'.\n'e': if present, an empty string is considered to be a valid match\n'i': if present, the match operates in case-insensitive mode.\nOtherwise, in case-sensitive mode.\n'm': if present, the match operates in multi-line mode.\n's': if present, the match operates in \"dot-all\"\nThe specifications of the flags can be found in \"man pcreapi\"\nThe flag letters may be repeated.\nNo other letters than 'e', 'i', 'm', 's' and 'x' are allowed in \"flags\".\nReturns the replaced string, or if no matches found, the original string.", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
    2314             :  command("pcre", "replace_first", PCREreplace_wrap, false, "Replace _the first_ match of \"pattern\" in \"origin_str\" with \"replacement\".\nParameter \"flags\" accept these flags: 'i', 'm', 's', and 'x'.\n'e': if present, an empty string is considered to be a valid match\n'i': if present, the match operates in case-insensitive mode.\nOtherwise, in case-sensitive mode.\n'm': if present, the match operates in multi-line mode.\n's': if present, the match operates in \"dot-all\"\nThe specifications of the flags can be found in \"man pcreapi\"\nThe flag letters may be repeated.\nNo other letters than 'e', 'i', 'm', 's' and 'x' are allowed in \"flags\".\nReturns the replaced string, or if no matches found, the original string.", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
    2315             :  command("pcre", "pcre_quote", PCREquote, false, "Return a PCRE pattern string that matches the argument exactly.", args(1,2, arg("",str),arg("s",str))),
    2316             :  command("pcre", "sql2pcre", PCREsql2pcre, false, "Convert a SQL like pattern with the given escape character into a PCRE pattern.", args(1,3, arg("",str),arg("pat",str),arg("esc",str))),
    2317             :  command("pcre", "prelude", pcre_init, false, "Initialize pcre", args(1,1, arg("",void))),
    2318             :  command("str", "replace", PCREreplace_wrap, false, "", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
    2319             :  command("batpcre", "replace", PCREreplace_bat_wrap, false, "", args(1,5, batarg("",str),batarg("orig",str),arg("pat",str),arg("repl",str),arg("flag",str))),
    2320             :  command("batpcre", "replace_first", PCREreplacefirst_bat_wrap, false, "", args(1,5, batarg("",str),batarg("orig",str),arg("pat",str),arg("repl",str),arg("flag",str))),
    2321             :  command("algebra", "like", PCRElike, false, "", args(1,5, arg("",bit),arg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2322             :  command("algebra", "not_like", PCREnotlike, false, "", args(1,5, arg("",bit),arg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2323             :  pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),batarg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2324             :  pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),arg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2325             :  pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),batarg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2326             :  pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),batarg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2327             :  pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),arg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2328             :  pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),batarg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2329             :  command("algebra", "likeselect", PCRElikeselect, false, "Select all head values of the first input BAT for which the\ntail value is \"like\" the given (SQL-style) pattern and for\nwhich the head value occurs in the tail of the second input\nBAT.\nInput is a dense-headed BAT, output is a dense-headed BAT with in\nthe tail the head value of the input BAT for which the\nrelationship holds.  The output BAT is sorted on the tail value.", args(1,7, batarg("",oid),batarg("b",str),batarg("s",oid),arg("pat",str),arg("esc",str),arg("caseignore",bit),arg("anti",bit))),
    2330             :  command("algebra", "likejoin", LIKEjoin, false, "Join the string bat L with the pattern bat R\nwith optional candidate lists SL and SR using pattern escape string ESC\nand doing a case sensitive match.\nThe result is two aligned bats with oids of matching rows.", args(2,11, batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("esc",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
    2331             :  command("algebra", "likejoin", LIKEjoin1, false, "The same as LIKEjoin_esc, but only produce one output", args(1,10,batarg("",oid),batarg("l",str),batarg("r",str),batarg("esc",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng), arg("anti",bit))),
    2332             :  { .imp=NULL } /* presumably the end-of-table marker */
    2333             : };
    2334             : #include "mal_import.h"
    2335             : #ifdef _MSC_VER
    2336             : #undef read
    2337             : #pragma section(".CRT$XCU",read)
    2338             : #endif
    2339         257 : LIB_STARTUP_FUNC(init_pcre_mal) /* presumably run when the library is loaded; verify against the LIB_STARTUP_FUNC definition */
    2340         257 : { mal_module("pcre", pcre_init_atoms, pcre_init_funcs); } /* hands the atom and function tables above to mal_module under the name "pcre" */

Generated by: LCOV version 1.14