LCOV - code coverage report
Current view: top level - monetdb5/modules/mal - txtsim.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 208 428 48.6 %
Date: 2021-10-27 03:06:47 Functions: 15 18 83.3 %

          Line data    Source code
       1             : /*
       2             :  * This Source Code Form is subject to the terms of the Mozilla Public
       3             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       4             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       5             :  *
       6             :  * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V.
       7             :  */
       8             : 
       9             : /*
      10             :  * @f txtsim
      11             :  * @t Module providing similarity metrics for strings
      12             :  * @a Romulo Goncalves (from M4 to M5)
      13             :  * @d 01/12/2007
      14             :  * @v 0.1
      15             :  *
      16             :  * @+ String metrics
      17             :  *
      18             :  * Provides basic similarity metrics for strings.
      19             :  *
      20             :  */
      21             : #include "monetdb_config.h"
      22             : #include "mal.h"
      23             : #include <string.h>
      24             : #include "gdk.h"
      25             : #include "str.h"
      26             : #include <limits.h>
      27             : #include "mal_exception.h"
      28             : 
      29             : 
      30             : #define RETURN_NIL_IF(b,t) \
      31             :         if (b) {\
      32             :            if (ATOMextern(t)) {\
      33             :               *(ptr*) res = (ptr) ATOMnil(t);\
      34             :                   if ( *(ptr *) res == NULL)\
      35             :                         throw(MAL,"txtsim", SQLSTATE(HY013) MAL_MALLOC_FAIL);\
      36             :            } else {\
      37             :               memcpy(res, ATOMnilptr(t), ATOMsize(t));\
      38             :            }\
      39             :            return MAL_SUCCEED; \
      40             :         }
      41             : 
      42             : /* =========================================================================
      43             :  * LEVENSH?TEIN FUNCTION
      44             :  * Source:
      45             :  * http://www.merriampark.com/ld.htm
      46             :  * =========================================================================
      47             :  */
      48             : 
      49             : #define MYMIN(x,y) ( (x<y) ? x : y )
      50             : #define SMALLEST_OF(x,y,z) ( MYMIN(MYMIN(x,y),z) )
      51             : #define SMALLEST_OF4(x,y,z,z2) ( MYMIN(MYMIN(MYMIN(x,y),z),z2) )
      52             : 
      53             : /***************************************************
      54             :  * Get a pointer to the specified cell of the matrix
      55             :  **************************************************/
      56             : 
      57             : static inline int *
      58             : levenshtein_GetCellPointer(int *pOrigin, int col, int row, int nCols)
      59             : {
      60         845 :         return pOrigin + col + (row * (nCols + 1));
      61             : }
      62             : 
      63             : /******************************************************
      64             :  * Get the contents of the specified cell in the matrix
      65             :  *****************************************************/
      66             : 
      67             : static inline int
      68             : levenshtein_GetAt(int *pOrigin, int col, int row, int nCols)
      69             : {
      70             :         int *pCell;
      71             : 
      72             :         pCell = levenshtein_GetCellPointer(pOrigin, col, row, nCols);
      73         493 :         return *pCell;
      74             : 
      75             : }
      76             : 
      77             : /********************************************************
      78             :  * Fill the specified cell in the matrix with the value x
      79             :  *******************************************************/
      80             : 
      81             : static inline void
      82             : levenshtein_PutAt(int *pOrigin, int col, int row, int nCols, int x)
      83             : {
      84             :         int *pCell;
      85             : 
      86             :         pCell = levenshtein_GetCellPointer(pOrigin, col, row, nCols);
      87         352 :         *pCell = x;
      88             : }
      89             : 
      90             : 
      91             : /******************************
      92             :  * Compute Levenshtein distance
      93             :  *****************************/
      94             : static str
      95          33 : levenshtein_impl(int *result, str *S, str *T, int *insdel_cost, int *replace_cost, int *transpose_cost)
      96             : {
      97          33 :         char *s = *S;
      98          33 :         char *t = *T;
      99             :         int *d;                 /* pointer to matrix */
     100             :         int n;                  /* length of s */
     101             :         int m;                  /* length of t */
     102             :         int i;                  /* iterates through s */
     103             :         int j;                  /* iterates through t */
     104             :         char s_i;               /* ith character of s */
     105             :         char t_j;               /* jth character of t */
     106             :         int cost;               /* cost */
     107             :         int cell;               /* contents of target cell */
     108             :         int above;              /* contents of cell immediately above */
     109             :         int left;               /* contents of cell immediately to left */
     110             :         int diag;               /* contents of cell immediately above and to left */
     111             :         int sz;                 /* number of cells in matrix */
     112             :         int diag2 = 0, cost2 = 0;
     113             : 
     114          58 :         if (strNil(*S) || strNil(*T)) {
     115          12 :                 *result = int_nil;
     116          12 :                 return MAL_SUCCEED;
     117             :         }
     118             : 
     119             :         /* Step 1 */
     120          21 :         n = (int) strlen(s);    /* 64bit: assume strings are less than 2 GB */
     121          21 :         m = (int) strlen(t);
     122          21 :         if (n == 0) {
     123           8 :                 *result = m;
     124           8 :                 return MAL_SUCCEED;
     125             :         }
     126          13 :         if (m == 0) {
     127           8 :                 *result = n;
     128           8 :                 return MAL_SUCCEED;
     129             :         }
     130           5 :         sz = (n + 1) * (m + 1) * sizeof(int);
     131           5 :         d = (int *) GDKmalloc(sz);
     132           5 :         if ( d == NULL)
     133           0 :                 throw(MAL,"levenshtein", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     134             : 
     135             :         /* Step 2 */
     136          45 :         for (i = 0; i <= n; i++) {
     137             :                 levenshtein_PutAt(d, i, 0, n, i);
     138             :         }
     139             : 
     140          42 :         for (j = 0; j <= m; j++) {
     141             :                 levenshtein_PutAt(d, 0, j, n, j);
     142             :         }
     143             : 
     144             :         /* Step 3 */
     145          40 :         for (i = 1; i <= n; i++) {
     146             : 
     147          35 :                 s_i = s[i - 1];
     148             : 
     149             :                 /* Step 4 */
     150         310 :                 for (j = 1; j <= m; j++) {
     151             : 
     152         275 :                         t_j = t[j - 1];
     153             : 
     154             :                         /* Step 5 */
     155         275 :                         if (s_i == t_j) {
     156             :                                 cost = 0;
     157             :                         } else {
     158         232 :                                 cost = *replace_cost;
     159             :                         }
     160             : 
     161             :                         /* Step 6 */
     162         275 :                         above = levenshtein_GetAt(d, i - 1, j, n);
     163         275 :                         left = levenshtein_GetAt(d, i, j - 1, n);
     164             :                         diag = levenshtein_GetAt(d, i - 1, j - 1, n);
     165             : 
     166         275 :                         if (j >= 2 && i >= 2) {
     167             :                                 /* NEW: detect transpositions */
     168             : 
     169         213 :                                 diag2 = levenshtein_GetAt(d, i - 2, j - 2, n);
     170         213 :                                 if (s[i - 2] == t[j - 1] && s[i - 1] == t[j - 2]) {
     171           4 :                                         cost2 = *transpose_cost;
     172             :                                 } else {
     173             :                                         cost2 = 2;
     174             :                                 }
     175         213 :                                 cell = SMALLEST_OF4(above + *insdel_cost, left + *insdel_cost, diag + cost, diag2 + cost2);
     176             :                         } else {
     177          62 :                                 cell = SMALLEST_OF(above + *insdel_cost, left + *insdel_cost, diag + cost);
     178             :                         }
     179             :                         levenshtein_PutAt(d, i, j, n, cell);
     180             :                 }
     181             :         }
     182             : 
     183             :         /* Step 7 */
     184           5 :         *result = levenshtein_GetAt(d, n, m, n);
     185           5 :         GDKfree(d);
     186           5 :         return MAL_SUCCEED;
     187             : }
     188             : 
     189             : static str
     190          20 : levenshteinbasic_impl(int *result, str *s, str *t)
     191             : {
     192          20 :         int insdel = 1, replace = 1, transpose = 2;
     193             : 
     194          20 :         return levenshtein_impl(result, s, t, &insdel, &replace, &transpose);
     195             : }
     196             : 
     197             : static str
     198           8 : levenshteinbasic2_impl(int *result, str *s, str *t)
     199             : {
     200           8 :         int insdel = 1, replace = 1, transpose = 1;
     201             : 
     202           8 :         return levenshtein_impl(result, s, t, &insdel, &replace, &transpose);
     203             : }
     204             : 
     205             : 
     206             : /* =========================================================================
     207             :  * SOUNDEX FUNCTION
     208             :  * Source:
     209             :  * http://www.mit.edu/afs/sipb/service/rtfm/src/freeWAIS-sf-1.0/ir/soundex.c
     210             :  * =========================================================================
     211             :  */
     212             : 
     213             : #define SoundexLen 4            /* length of a soundex code */
     214             : #define SoundexKey "Z000"     /* default key for soundex code */
     215             : 
     216             : /* set letter values */
     217             : static int Code[] = { 0, 1, 2, 3, 0, 1, 2, 0, 0, 2, 2, 4, 5, 5, 0,
     218             :         1, 2, 6, 2, 3, 0, 1, 0, 2, 0, 2
     219             : };
     220             : 
     221             : static inline char
     222          34 : SCode(unsigned char c)
     223             : {
     224          34 :         if (c == 95)
     225             :                 return (2);     /* german sz */
     226          34 :         return (Code[toupper(c) - 'A']);
     227             : }
     228             : 
     229             : static str
     230          20 : soundex_code(const char *Name, char *Key)
     231             : {
     232             :         char LastLetter;
     233             :         int Index;
     234             : 
     235          70 :         for (const char *p = Name; *p; p++)
     236          50 :                 if ((*p & 0x80) != 0)
     237           0 :                         throw(MAL,"soundex", SQLSTATE(42000) "Soundex function not available for non ASCII strings");
     238             : 
     239             :         /* set default key */
     240          20 :         strcpy(Key, SoundexKey);
     241             : 
     242             :         /* keep first letter */
     243          20 :         Key[0] = *Name;
     244          20 :         if (!isupper((unsigned char) (Key[0])))
     245          20 :                 Key[0] = toupper(Key[0]);
     246             : 
     247          20 :         LastLetter = *Name;
     248          20 :         if (!*Name)
     249             :                 return MAL_SUCCEED;
     250          10 :         Name++;
     251             : 
     252             :         /* scan rest of string */
     253          48 :         for (Index = 1; (Index <SoundexLen) &&*Name; Name++) {
     254             :                 /* use only letters */
     255          38 :                 if (isalpha((unsigned char) (*Name))) {
     256             :                         /* ignore duplicate successive chars */
     257          38 :                         if (LastLetter != *Name) {
     258             :                                 /* new LastLetter */
     259             :                                 LastLetter = *Name;
     260             : 
     261             :                                 /* ignore letters with code 0 */
     262          34 :                                 if (SCode(*Name) != 0) {
     263          22 :                                         Key[Index] = '0' + SCode(*Name);
     264          22 :                                         Index ++;
     265             :                                 }
     266             :                         }
     267             :                 }
     268             :         }
     269             :         return MAL_SUCCEED;
     270             : }
     271             : 
     272             : static str
     273          24 : soundex_impl(str *res, str *Name)
     274             : {
     275             :         str msg = MAL_SUCCEED;
     276             : 
     277          24 :         GDKfree(*res);
     278          48 :         RETURN_NIL_IF(strNil(*Name), TYPE_str);
     279             : 
     280          20 :         *res = (str) GDKmalloc(sizeof(char) * (SoundexLen + 1));
     281          20 :         if( *res == NULL)
     282           0 :                 throw(MAL,"soundex", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     283             : 
     284             :         /* calculate Key for Name */
     285          20 :         if ((msg = soundex_code(*Name, *res))) {
     286           0 :                 GDKfree(*res);
     287           0 :                 *res = NULL;
     288           0 :                 return msg;
     289             :         }
     290             :         return msg;
     291             : }
     292             : 
     293             : static str
     294           8 : stringdiff_impl(int *res, str *s1, str *s2)
     295             : {
     296             :         str r = MAL_SUCCEED;
     297           8 :         char *S1 = NULL, *S2 = NULL;
     298             : 
     299           8 :         r = soundex_impl(&S1, s1);
     300           8 :         if( r != MAL_SUCCEED)
     301             :                 return r;
     302           8 :         r = soundex_impl(&S2, s2);
     303           8 :         if( r != MAL_SUCCEED){
     304           0 :                 GDKfree(S1);
     305           0 :                 return r;
     306             :         }
     307           8 :         r = levenshteinbasic_impl(res, &S1, &S2);
     308           8 :         GDKfree(S1);
     309           8 :         GDKfree(S2);
     310           8 :         return r;
     311             : }
     312             : 
     313             : /******************************
     314             :  * QGRAMNORMALIZE
     315             :  *
     316             :  * This function 'normalizes' a string so valid q-grams can  be made of it:
     317             :  * All characters are transformed to uppercase, and all characters
     318             :  * which are not letters or digits are stripped to a single space.
     319             :  *
     320             :  * qgramnormalize("Hallo, allemaal!").print(); --> "HALLO ALLEMAAL"
     321             :  * qgramnormalize(" '' t ' est").print(); --> [ "T EST" ]
     322             :  *
     323             :  *****************************/
     324             : static str
     325          12 : CMDqgramnormalize(str *res, str *Input)
     326             : {
     327          12 :         char *input = *Input;
     328             :         int i, j = 0;
     329             :         char c, last = ' ';
     330             : 
     331          12 :         GDKfree(*res);
     332          12 :         RETURN_NIL_IF(strNil(input), TYPE_str);
     333          12 :         *res = (str) GDKmalloc(sizeof(char) * (strlen(input) + 1));     /* normalized strings are never longer than original */
     334          12 :         if (*res == NULL)
     335           0 :                 throw(MAL, "txtsim.qgramnormalize", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     336             : 
     337          87 :         for (i = 0; input[i]; i++) {
     338          75 :                 c = toupper(input[i]);
     339          75 :                 if (!(('A' <= c && c <= 'Z') || isdigit((unsigned char) c)))
     340             :                         c = ' ';
     341          75 :                 if (c != ' ' || last != ' ') {
     342          69 :                         (*res)[j++] = c;
     343             :                 }
     344             :                 last = c;
     345             :         }
     346          12 :         (*res)[j] = 0;
     347             :         /* strip final whitespace */
     348          14 :         while (j > 0 && (*res)[--j] == ' ')
     349           2 :                 (*res)[j] = 0;
     350             : 
     351             :         return MAL_SUCCEED;
     352             : }
     353             : 
     354             : /* =========================================================================
     355             :  * FSTRCMP FUNCTION
     356             :  * Source:
     357             :  * http://search.cpan.org/src/MLEHMANN/String-Similarity-1/fstrcmp.c
     358             :  * =========================================================================
     359             :  */
     360             : 
     361             : /*
     362             :  * Data on one input string being compared.
     363             :  */
     364             : struct string_data {
     365             :         /* The string to be compared. */
     366             :         const char *data;
     367             : 
     368             :         /* The length of the string to be compared. */
     369             :         int data_length;
     370             : 
     371             :         /* The number of characters inserted or deleted. */
     372             :         int edit_count;
     373             : };
     374             : 
     375             : struct partition {
     376             :         /* Midpoints of this partition.  */
     377             :         int xmid, ymid;
     378             : 
     379             :         /* Nonzero if low half will be analyzed minimally.  */
     380             :         int lo_minimal;
     381             : 
     382             :         /* Likewise for high half.  */
     383             :         int hi_minimal;
     384             : };
     385             : 
     386             : /* NAME
     387             :         diag - find diagonal path
     388             : 
     389             :    SYNOPSIS
     390             :         int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
     391             :                 struct partition *part);
     392             : 
     393             :    DESCRIPTION
     394             :         Find the midpoint of the shortest edit script for a specified
     395             :         portion of the two strings.
     396             : 
     397             :         Scan from the beginnings of the strings, and simultaneously from
     398             :         the ends, doing a breadth-first search through the space of
     399             :         edit-sequence.  When the two searches meet, we have found the
     400             :         midpoint of the shortest edit sequence.
     401             : 
     402             :         If MINIMAL is nonzero, find the minimal edit script regardless
     403             :         of expense.  Otherwise, if the search is too expensive, use
     404             :         heuristics to stop the search and report a suboptimal answer.
     405             : 
     406             :    RETURNS
     407             :         Set PART->(XMID,YMID) to the midpoint (XMID,YMID).  The diagonal
     408             :         number XMID - YMID equals the number of inserted characters
     409             :         minus the number of deleted characters (counting only characters
     410             :         before the midpoint).  Return the approximate edit cost; this is
     411             :         the total number of characters inserted or deleted (counting
     412             :         only characters before the midpoint), unless a heuristic is used
     413             :         to terminate the search prematurely.
     414             : 
     415             :         Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
     416             :         for the left half of the partition is known; similarly for
     417             :         PART->RIGHT_MINIMAL.
     418             : 
     419             :    CAVEAT
     420             :         This function assumes that the first characters of the specified
     421             :         portions of the two strings do not match, and likewise that the
     422             :         last characters do not match.  The caller must trim matching
     423             :         characters from the beginning and end of the portions it is
     424             :         going to specify.
     425             : 
     426             :         If we return the "wrong" partitions, the worst this can do is
     427             :         cause suboptimal diff output.  It cannot cause incorrect diff
     428             :         output.  */
     429             : 
     430             : static inline int
     431           0 : diag(int xoff, int xlim, int yoff, int ylim, int minimal, struct partition *part, int too_expensive, struct string_data *string, int *fdiag, int *bdiag)
     432             : {
     433             :         int *const fd = fdiag;  /* Give the compiler a chance. */
     434             :         int *const bd = bdiag;  /* Additional help for the compiler. */
     435           0 :         const char *const xv = string[0].data;  /* Still more help for the compiler. */
     436           0 :         const char *const yv = string[1].data;  /* And more and more . . . */
     437           0 :         const int dmin = xoff - ylim;   /* Minimum valid diagonal. */
     438           0 :         const int dmax = xlim - yoff;   /* Maximum valid diagonal. */
     439           0 :         const int fmid = xoff - yoff;   /* Center diagonal of top-down search. */
     440           0 :         const int bmid = xlim - ylim;   /* Center diagonal of bottom-up search. */
     441             :         int fmin = fmid;
     442             :         int fmax = fmid;        /* Limits of top-down search. */
     443             :         int bmin = bmid;
     444             :         int bmax = bmid;        /* Limits of bottom-up search. */
     445             :         int c;                  /* Cost. */
     446           0 :         int odd = (fmid - bmid) & 1;
     447             : 
     448             :         /*
     449             :          * True if southeast corner is on an odd diagonal with respect
     450             :          * to the northwest.
     451             :          */
     452           0 :         fd[fmid] = xoff;
     453           0 :         bd[bmid] = xlim;
     454           0 :         for (c = 1;; ++c) {
     455             :                 int d;          /* Active diagonal. */
     456             : 
     457             :                 /* Extend the top-down search by an edit step in each diagonal. */
     458           0 :                 if (fmin > dmin)
     459           0 :                         fd[--fmin - 1] = -1;
     460             :                 else
     461           0 :                         ++fmin;
     462           0 :                 if (fmax < dmax)
     463           0 :                         fd[++fmax + 1] = -1;
     464             :                 else
     465           0 :                         --fmax;
     466           0 :                 for (d = fmax; d >= fmin; d -= 2) {
     467             :                         int x;
     468             :                         int y;
     469             :                         int tlo;
     470             :                         int thi;
     471             : 
     472           0 :                         tlo = fd[d - 1], thi = fd[d + 1];
     473             : 
     474           0 :                         if (tlo >= thi)
     475           0 :                                 x = tlo + 1;
     476             :                         else
     477             :                                 x = thi;
     478           0 :                         y = x - d;
     479           0 :                         while (x < xlim && y < ylim && xv[x] == yv[y]) {
     480           0 :                                 ++x;
     481           0 :                                 ++y;
     482             :                         }
     483           0 :                         fd[d] = x;
     484           0 :                         if (odd && bmin <= d && d <= bmax && bd[d] <= x) {
     485           0 :                                 part->xmid = x;
     486           0 :                                 part->ymid = y;
     487           0 :                                 part->lo_minimal = part->hi_minimal = 1;
     488           0 :                                 return 2 * c - 1;
     489             :                         }
     490             :                 }
     491             :                 /* Similarly extend the bottom-up search.  */
     492           0 :                 if (bmin > dmin)
     493           0 :                         bd[--bmin - 1] = INT_MAX;
     494             :                 else
     495           0 :                         ++bmin;
     496           0 :                 if (bmax < dmax)
     497           0 :                         bd[++bmax + 1] = INT_MAX;
     498             :                 else
     499           0 :                         --bmax;
     500           0 :                 for (d = bmax; d >= bmin; d -= 2) {
     501             :                         int x;
     502             :                         int y;
     503             :                         int tlo;
     504             :                         int thi;
     505             : 
     506           0 :                         tlo = bd[d - 1], thi = bd[d + 1];
     507           0 :                         if (tlo < thi)
     508             :                                 x = tlo;
     509             :                         else
     510           0 :                                 x = thi - 1;
     511           0 :                         y = x - d;
     512           0 :                         while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) {
     513           0 :                                 --x;
     514           0 :                                 --y;
     515             :                         }
     516           0 :                         bd[d] = x;
     517           0 :                         if (!odd && fmin <= d && d <= fmax && x <= fd[d]) {
     518           0 :                                 part->xmid = x;
     519           0 :                                 part->ymid = y;
     520           0 :                                 part->lo_minimal = part->hi_minimal = 1;
     521           0 :                                 return 2 * c;
     522             :                         }
     523             :                 }
     524             : 
     525           0 :                 if (minimal)
     526           0 :                         continue;
     527             : 
     528             :                 /* Heuristic: if we've gone well beyond the call of duty, give up
     529             :                    and report halfway between our best results so far.  */
     530           0 :                 if (c >= too_expensive) {
     531             :                         int fxybest;
     532             :                         int fxbest;
     533             :                         int bxybest;
     534             :                         int bxbest;
     535             : 
     536             :                         /* Pacify `gcc -Wall'. */
     537             :                         fxbest = 0;
     538             :                         bxbest = 0;
     539             : 
     540             :                         /* Find forward diagonal that maximizes X + Y.  */
     541             :                         fxybest = -1;
     542           0 :                         for (d = fmax; d >= fmin; d -= 2) {
     543             :                                 int x;
     544             :                                 int y;
     545             : 
     546           0 :                                 x = fd[d] < xlim ? fd[d] : xlim;
     547           0 :                                 y = x - d;
     548             : 
     549           0 :                                 if (ylim < y) {
     550           0 :                                         x = ylim + d;
     551             :                                         y = ylim;
     552             :                                 }
     553           0 :                                 if (fxybest < x + y) {
     554             :                                         fxybest = x + y;
     555             :                                         fxbest = x;
     556             :                                 }
     557             :                         }
     558             :                         /* Find backward diagonal that minimizes X + Y.  */
     559             :                         bxybest = INT_MAX;
     560           0 :                         for (d = bmax; d >= bmin; d -= 2) {
     561             :                                 int x;
     562             :                                 int y;
     563             : 
     564           0 :                                 x = xoff > bd[d] ? xoff : bd[d];
     565           0 :                                 y = x - d;
     566             : 
     567           0 :                                 if (y < yoff) {
     568           0 :                                         x = yoff + d;
     569             :                                         y = yoff;
     570             :                                 }
     571           0 :                                 if (x + y < bxybest) {
     572             :                                         bxybest = x + y;
     573             :                                         bxbest = x;
     574             :                                 }
     575             :                         }
     576             :                         /* Use the better of the two diagonals.  */
     577           0 :                         if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) {
     578           0 :                                 part->xmid = fxbest;
     579           0 :                                 part->ymid = fxybest - fxbest;
     580           0 :                                 part->lo_minimal = 1;
     581           0 :                                 part->hi_minimal = 0;
     582             :                         } else {
     583           0 :                                 part->xmid = bxbest;
     584           0 :                                 part->ymid = bxybest - bxbest;
     585           0 :                                 part->lo_minimal = 0;
     586           0 :                                 part->hi_minimal = 1;
     587             :                         }
     588           0 :                         return 2 * c - 1;
     589             :                 }
     590             :         }
     591             : }
     592             : 
     593             : 
     594             : /* NAME
     595             :         compareseq - find edit sequence
     596             : 
     597             :    SYNOPSIS
     598             :         void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal);
     599             : 
     600             :    DESCRIPTION
     601             :         Compare in detail contiguous subsequences of the two strings
     602             :         which are known, as a whole, to match each other.
     603             : 
     604             :         The subsequence of string 0 is [XOFF, XLIM) and likewise for
     605             :         string 1.
     606             : 
     607             :         Note that XLIM, YLIM are exclusive bounds.  All character
     608             :         numbers are origin-0.
     609             : 
     610             :         If MINIMAL is nonzero, find a minimal difference no matter how
     611             :         expensive it is.  */
     612             : 
     613             : static inline void
     614      399841 : compareseq(int xoff, int xlim, int yoff, int ylim, int minimal, int max_edits, int too_expensive, struct string_data *string, int *fdiag, int *bdiag) /* compareseq stops when edits > max_edits */
     615             : {
     616      399841 :         const char *const xv = string[0].data;  /* Help the compiler.  */
     617      399841 :         const char *const yv = string[1].data;
     618             : 
     619      399841 :         if (string[1].edit_count + string[0].edit_count > max_edits)
     620             :                 return;
     621             : 
     622             :         /* Slide down the bottom initial diagonal. */
     623     2576608 :         while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff]) {
     624     2176767 :                 ++xoff;
     625     2176767 :                 ++yoff;
     626             :         }
     627             : 
     628             :         /* Slide up the top initial diagonal. */
     629      399846 :         while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1]) {
     630           5 :                 --xlim;
     631           5 :                 --ylim;
     632             :         }
     633             : 
     634             :         /* Handle simple cases. */
     635      399841 :         if (xoff == xlim) {
     636      399840 :                 while (yoff < ylim) {
     637           0 :                         ++string[1].edit_count;
     638           0 :                         ++yoff;
     639             :                 }
     640           1 :         } else if (yoff == ylim) {
     641           2 :                 while (xoff < xlim) {
     642           1 :                         ++string[0].edit_count;
     643           1 :                         ++xoff;
     644             :                 }
     645             :         } else {
     646             :                 int c;
     647             :                 struct partition part;
     648             : 
     649             :                 /* Find a point of correspondence in the middle of the strings.  */
     650           0 :                 c = diag(xoff, xlim, yoff, ylim, minimal, &part, too_expensive, string, fdiag, bdiag);
     651           0 :                 if (c == 1) {
     652             :                         /* The two subsequences differ by a single insert or delete;
     653             :                            record it and we are done.  */
     654           0 :                         if (part.xmid - part.ymid < xoff - yoff)
     655           0 :                                 ++string[1].edit_count;
     656             :                         else
     657           0 :                                 ++string[0].edit_count;
     658             :                 } else {
     659             :                         /* Use the partitions to split this problem into subproblems.  */
     660           0 :                         compareseq(xoff, part.xmid, yoff, part.ymid, part.lo_minimal, max_edits, too_expensive, string, fdiag, bdiag);
     661           0 :                         compareseq(part.xmid, xlim, part.ymid, ylim, part.hi_minimal, max_edits, too_expensive, string, fdiag, bdiag);
     662             :                 }
     663             :         }
     664             : }
     665             : 
     666             : /* NAME
     667             :         fstrcmp - fuzzy string compare
     668             : 
     669             :    SYNOPSIS
     670             :         double fstrcmp(const char *s1, int l1, const char *s2, int l2, double);
     671             : 
     672             :    DESCRIPTION
     673             :         The fstrcmp function may be used to compare two string for
     674             :         similarity.  It is very useful in reducing "cascade" or
     675             :         "secondary" errors in compilers or other situations where
     676             :         symbol tables occur.
     677             : 
     678             :    RETURNS
     679             :         double; 0 if the strings are entirly dissimilar, 1 if the
     680             :         strings are identical, and a number in between if they are
     681             :         similar.  */
     682             : 
     683             : #define INITIAL_INT_BUFFER_LENGTH 2048
     684             : 
     685             : #define CHECK_INT_BUFFER_LENGTH(BUFFER, BUFFER_LEN, NEXT_LEN, OP) \
     686             :         do { \
     687             :                 if ((NEXT_LEN) > *BUFFER_LEN) { \
     688             :                         size_t newlen = (((NEXT_LEN) + 1023) & ~1023); /* align to a multiple of 1024 bytes */ \
     689             :                         int *newbuf = GDKmalloc(newlen); \
     690             :                         if (!newbuf) \
     691             :                                 throw(MAL, OP, SQLSTATE(HY013) MAL_MALLOC_FAIL); \
     692             :                         GDKfree(*BUFFER); \
     693             :                         *BUFFER = newbuf; \
     694             :                         *BUFFER_LEN = newlen; \
     695             :                 } \
     696             :         } while (0)
     697             : 
     698             : static str
     699      399763 : fstrcmp_impl_internal(dbl *ret, int **fdiag_buf, size_t *fdiag_buflen, str string1, str string2, dbl minimum)
     700             : {
     701             :         int i, max_edits, *fdiag, *bdiag, too_expensive = 1;
     702             :         size_t fdiag_len;
     703             :         struct string_data string[2];
     704             : 
     705             :         /* set the info for each string.  */
     706      399763 :         string[0].data = string1;
     707      399763 :         string[0].data_length = (int) strlen(string1); /* 64bit: assume string not too long */
     708      399763 :         string[1].data = string2;
     709      399763 :         string[1].data_length = (int) strlen(string2); /* 64bit: assume string not too long */
     710             : 
     711             :         /* short-circuit obvious comparisons */
     712      399763 :         if (string[0].data_length == 0 && string[1].data_length == 0) {
     713           0 :                 *ret = 1.0;
     714           0 :                 return MAL_SUCCEED;
     715             :         }
     716      399763 :         if (string[0].data_length == 0 || string[1].data_length == 0) {
     717           0 :                 *ret = 0.0;
     718           0 :                 return MAL_SUCCEED;
     719             :         }
     720             : 
     721             :         /* Set TOO_EXPENSIVE to be approximate square root of input size,
     722             :            bounded below by 256.  */
     723     1199219 :         for (i = string[0].data_length + string[1].data_length; i != 0; i >>= 2)
     724      799456 :                 too_expensive <<= 1;
     725             :         if (too_expensive < 256)
     726             :                 too_expensive = 256;
     727             : 
     728             :         /* Because fstrcmp is typically called multiple times, while scanning
     729             :            symbol tables, etc, attempt to minimize the number of memory
     730             :            allocations performed.  Thus, we use a static buffer for the
     731             :            diagonal vectors, and never free them.  */
     732      399763 :         fdiag_len = string[0].data_length + string[1].data_length + 3;
     733      399763 :         CHECK_INT_BUFFER_LENGTH(fdiag_buf, fdiag_buflen, fdiag_len * 2 * sizeof(int), "txtsim.similarity");
     734      399763 :         fdiag = *fdiag_buf + string[1].data_length + 1;
     735      399763 :         bdiag = fdiag + fdiag_len;
     736             : 
     737      399763 :         max_edits = 1 + (int) ((string[0].data_length + string[1].data_length) * (1. - minimum));
     738             : 
     739             :         /* Now do the main comparison algorithm */
     740      399763 :         string[0].edit_count = 0;
     741      399763 :         string[1].edit_count = 0;
     742      399763 :         compareseq(0, string[0].data_length, 0, string[1].data_length, 0, max_edits, too_expensive, string, fdiag, bdiag);
     743             : 
     744             :         /* The result is
     745             :            ((number of chars in common) / (average length of the strings)).
     746             :            This is admittedly biased towards finding that the strings are
     747             :            similar, however it does produce meaningful results.  */
     748      399822 :         *ret = ((double)
     749      399822 :                 (string[0].data_length + string[1].data_length - string[1].edit_count - string[0].edit_count)
     750      399822 :                 / (string[0].data_length + string[1].data_length));
     751      399822 :         return MAL_SUCCEED;
     752             : }
     753             : 
     754             : static str
     755           0 : fstrcmp_impl(dbl *ret, str *string1, str *string2, dbl *minimum)
     756             : {
     757           0 :         str s1 = *string1, s2 = *string2;
     758           0 :         dbl min = *minimum;
     759             : 
     760           0 :         if (strNil(s1) || strNil(s2) || is_dbl_nil(min)) {
     761           0 :                 *ret = dbl_nil;
     762           0 :                 return MAL_SUCCEED;
     763             :         } else {
     764             :                 str msg = MAL_SUCCEED;
     765           0 :                 int *fdiag_buf = NULL;
     766           0 :                 size_t fdiag_buflen = INITIAL_INT_BUFFER_LENGTH;
     767             : 
     768           0 :                 if (!(fdiag_buf = GDKmalloc(fdiag_buflen)))
     769           0 :                         throw(MAL, "txtsim.similarity", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     770           0 :                 msg = fstrcmp_impl_internal(ret, &fdiag_buf, &fdiag_buflen, s1, s1, min);
     771           0 :                 GDKfree(fdiag_buf);
     772           0 :                 return msg;
     773             :         }
     774             : }
     775             : 
     776             : static str
     777           4 : fstrcmp0_impl(dbl *ret, str *string1, str *string2)
     778             : {
     779           4 :         str s1 = *string1, s2 = *string2;
     780             : 
     781           6 :         if (strNil(s1) || strNil(s2)) {
     782           3 :                 *ret = dbl_nil;
     783           3 :                 return MAL_SUCCEED;
     784             :         } else {
     785             :                 str msg = MAL_SUCCEED;
     786           1 :                 int *fdiag_buf = NULL;
     787           1 :                 size_t fdiag_buflen = INITIAL_INT_BUFFER_LENGTH;
     788             : 
     789           1 :                 if (!(fdiag_buf = GDKmalloc(fdiag_buflen)))
     790           0 :                         throw(MAL, "txtsim.similarity", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     791           1 :                 msg = fstrcmp_impl_internal(ret, &fdiag_buf, &fdiag_buflen, s1, s2, 0.0);
     792           1 :                 GDKfree(fdiag_buf);
     793           1 :                 return msg;
     794             :         }
     795             : }
     796             : 
     797             : static str
     798          16 : fstrcmp0_impl_bulk(bat *res, bat *strings1, bat *strings2)
     799             : {
     800             :         BATiter lefti, righti;
     801             :         BAT *bn = NULL, *left = NULL, *right = NULL;
     802             :         BUN q = 0;
     803          16 :         size_t fdiag_buflen = INITIAL_INT_BUFFER_LENGTH;
     804             :         str x, y, msg = MAL_SUCCEED;
     805             :         bool nils = false;
     806             :         dbl *restrict vals;
     807          16 :         int *fdiag_buf = GDKmalloc(fdiag_buflen);
     808             : 
     809          16 :         if (!fdiag_buf) {
     810           0 :                 msg = createException(MAL, "txtsim.similarity", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     811           0 :                 goto bailout;
     812             :         }
     813          16 :         if (!(left = BATdescriptor(*strings1)) || !(right = BATdescriptor(*strings2))) {
     814           0 :                 msg = createException(MAL, "txtsim.similarity", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
     815           0 :                 goto bailout;
     816             :         }
     817          16 :         q = BATcount(left);
     818          16 :         if (!(bn = COLnew(left->hseqbase, TYPE_dbl, q, TRANSIENT))) {
     819           0 :                 msg = createException(MAL, "txtsim.similarity", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     820           0 :                 goto bailout;
     821             :         }
     822             : 
     823          16 :         lefti = bat_iterator(left);
     824          16 :         righti = bat_iterator(right);
     825          16 :         vals = Tloc(bn, 0);
     826      399814 :         for (BUN i = 0; i < q && !msg; i++) {
     827      399798 :                 x = (str) BUNtvar(lefti, i);
     828      399798 :                 y = (str) BUNtvar(righti, i);
     829             : 
     830      799593 :                 if (strNil(x) || strNil(y)) {
     831          11 :                         vals[i] = dbl_nil;
     832          11 :                         nils = true;
     833             :                 } else {
     834      399787 :                         msg = fstrcmp_impl_internal(&vals[i], &fdiag_buf, &fdiag_buflen, x, y, 0.0);
     835             :                 }
     836             :         }
     837          16 :         bat_iterator_end(&lefti);
     838          16 :         bat_iterator_end(&righti);
     839             : 
     840          16 : bailout:
     841          16 :         GDKfree(fdiag_buf);
     842          16 :         if (bn && !msg) {
     843          16 :                 BATsetcount(bn, q);
     844          16 :                 bn->tnil = nils;
     845          16 :                 bn->tnonil = !nils;
     846          16 :                 bn->tkey = BATcount(bn) <= 1;
     847          16 :                 bn->tsorted = BATcount(bn) <= 1;
     848          16 :                 bn->trevsorted = BATcount(bn) <= 1;
     849          16 :                 BBPkeepref(*res = bn->batCacheid);
     850           0 :         } else if (bn)
     851           0 :                 BBPreclaim(bn);
     852          16 :         if (left)
     853          16 :                 BBPunfix(left->batCacheid);
     854          16 :         if (right)
     855          16 :                 BBPunfix(right->batCacheid);
     856          16 :         return msg;
     857             : }
     858             : 
     859             : 
     860             : /* ============ Q-GRAM SELF JOIN ============== */
     861             : 
     862             : static str
     863           0 : CMDqgramselfjoin(bat *res1, bat *res2, bat *qid, bat *bid, bat *pid, bat *lid, flt *c, int *k)
     864             : {
     865             :         BAT *qgram, *id, *pos, *len;
     866             :         BUN n;
     867             :         BUN i, j;
     868             :         BAT *bn, *bn2;
     869             :         oid *qbuf;
     870             :         int *ibuf;
     871             :         int *pbuf;
     872             :         int *lbuf;
     873             :         str msg = MAL_SUCCEED;
     874             : 
     875           0 :         qgram = BATdescriptor(*qid);
     876           0 :         id = BATdescriptor(*bid);
     877           0 :         pos = BATdescriptor(*pid);
     878           0 :         len = BATdescriptor(*lid);
     879           0 :         if (qgram == NULL || id == NULL || pos == NULL || len == NULL) {
     880           0 :                 if (qgram)
     881           0 :                         BBPunfix(qgram->batCacheid);
     882           0 :                 if (id)
     883           0 :                         BBPunfix(id->batCacheid);
     884           0 :                 if (pos)
     885           0 :                         BBPunfix(pos->batCacheid);
     886           0 :                 if (len)
     887           0 :                         BBPunfix(len->batCacheid);
     888           0 :                 throw(MAL, "txtsim.qgramselfjoin", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
     889             :         }
     890             : 
     891           0 :         if (qgram->ttype != TYPE_oid)
     892           0 :                 msg = createException(MAL, "tstsim.qgramselfjoin",
     893             :                           SEMANTIC_TYPE_MISMATCH ": tail of BAT qgram must be oid");
     894           0 :         else if (id->ttype != TYPE_int)
     895           0 :                 msg = createException(MAL, "tstsim.qgramselfjoin",
     896             :                           SEMANTIC_TYPE_MISMATCH ": tail of BAT id must be int");
     897           0 :         else if (pos->ttype != TYPE_int)
     898           0 :                 msg = createException(MAL, "tstsim.qgramselfjoin",
     899             :                           SEMANTIC_TYPE_MISMATCH ": tail of BAT pos must be int");
     900           0 :         else if (len->ttype != TYPE_int)
     901           0 :                 msg = createException(MAL, "tstsim.qgramselfjoin",
     902             :                           SEMANTIC_TYPE_MISMATCH ": tail of BAT len must be int");
     903             :         if (msg) {
     904           0 :                 BBPunfix(qgram->batCacheid);
     905           0 :                 BBPunfix(id->batCacheid);
     906           0 :                 BBPunfix(pos->batCacheid);
     907           0 :                 BBPunfix(len->batCacheid);
     908           0 :                 return msg;
     909             :         }
     910             : 
     911           0 :         n = BATcount(qgram);
     912             : 
     913             :         /* if (BATcount(qgram)>1 && !BATtordered(qgram)) throw(MAL, "tstsim.qgramselfjoin", SEMANTIC_TYPE_MISMATCH); */
     914             : 
     915           0 :         if (!ALIGNsynced(qgram, id))
     916           0 :                 msg = createException(MAL, "tstsim.qgramselfjoin",
     917             :                           SEMANTIC_TYPE_MISMATCH ": qgram and id are not synced");
     918             : 
     919           0 :         else if (!ALIGNsynced(qgram, pos))
     920           0 :                 msg = createException(MAL, "tstsim.qgramselfjoin",
     921             :                           SEMANTIC_TYPE_MISMATCH ": qgram and pos are not synced");
     922           0 :         else if (!ALIGNsynced(qgram, len))
     923           0 :                 msg = createException(MAL, "tstsim.qgramselfjoin",
     924             :                           SEMANTIC_TYPE_MISMATCH ": qgram and len are not synced");
     925             : 
     926           0 :         else if (Tsize(qgram) != ATOMsize(qgram->ttype))
     927           0 :                 msg = createException(MAL, "tstsim.qgramselfjoin",
     928             :                           SEMANTIC_TYPE_MISMATCH ": qgram is not a true void bat");
     929           0 :         else if (Tsize(id) != ATOMsize(id->ttype))
     930           0 :                 msg = createException(MAL, "tstsim.qgramselfjoin",
     931             :                           SEMANTIC_TYPE_MISMATCH ": id is not a true void bat");
     932             : 
     933           0 :         else if (Tsize(pos) != ATOMsize(pos->ttype))
     934           0 :                 msg = createException(MAL, "tstsim.qgramselfjoin",
     935             :                           SEMANTIC_TYPE_MISMATCH ": pos is not a true void bat");
     936           0 :         else if (Tsize(len) != ATOMsize(len->ttype))
     937           0 :                 msg = createException(MAL, "tstsim.qgramselfjoin",
     938             :                           SEMANTIC_TYPE_MISMATCH ": len is not a true void bat");
     939           0 :         if (msg) {
     940           0 :                 BBPunfix(qgram->batCacheid);
     941           0 :                 BBPunfix(id->batCacheid);
     942           0 :                 BBPunfix(pos->batCacheid);
     943           0 :                 BBPunfix(len->batCacheid);
     944           0 :                 return msg;
     945             :         }
     946             : 
     947           0 :         bn = COLnew(0, TYPE_int, n, TRANSIENT);
     948           0 :         bn2 = COLnew(0, TYPE_int, n, TRANSIENT);
     949           0 :         if (bn == NULL || bn2 == NULL){
     950           0 :                 BBPreclaim(bn);
     951           0 :                 BBPreclaim(bn2);
     952           0 :                 BBPunfix(qgram->batCacheid);
     953           0 :                 BBPunfix(id->batCacheid);
     954           0 :                 BBPunfix(pos->batCacheid);
     955           0 :                 BBPunfix(len->batCacheid);
     956           0 :                 throw(MAL, "txtsim.qgramselfjoin", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     957             :         }
     958             : 
     959           0 :         BATiter qgrami = bat_iterator(qgram);
     960           0 :         BATiter idi = bat_iterator(id);
     961           0 :         BATiter posi = bat_iterator(pos);
     962           0 :         BATiter leni = bat_iterator(len);
     963           0 :         qbuf = (oid *) qgrami.base;
     964           0 :         ibuf = (int *) idi.base;
     965           0 :         pbuf = (int *) posi.base;
     966           0 :         lbuf = (int *) leni.base;
     967           0 :         for (i = 0; i < n - 1; i++) {
     968           0 :                 for (j = i + 1; (j < n && qbuf[j] == qbuf[i] && pbuf[j] <= (pbuf[i] + (*k + *c * MYMIN(lbuf[i], lbuf[j])))); j++) {
     969           0 :                         if (ibuf[i] != ibuf[j] && abs(lbuf[i] - lbuf[j]) <= (*k + *c * MYMIN(lbuf[i], lbuf[j]))) {
     970           0 :                                 if (BUNappend(bn, ibuf + i, false) != GDK_SUCCEED ||
     971           0 :                                         BUNappend(bn2, ibuf + j, false) != GDK_SUCCEED) {
     972           0 :                                         bat_iterator_end(&qgrami);
     973           0 :                                         bat_iterator_end(&idi);
     974           0 :                                         bat_iterator_end(&posi);
     975           0 :                                         bat_iterator_end(&leni);
     976           0 :                                         BBPunfix(qgram->batCacheid);
     977           0 :                                         BBPunfix(id->batCacheid);
     978           0 :                                         BBPunfix(pos->batCacheid);
     979           0 :                                         BBPunfix(len->batCacheid);
     980           0 :                                         BBPreclaim(bn);
     981           0 :                                         BBPreclaim(bn2);
     982           0 :                                         throw(MAL, "txtsim.qgramselfjoin", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     983             :                                 }
     984             :                         }
     985             :                 }
     986             :         }
     987           0 :         bat_iterator_end(&qgrami);
     988           0 :         bat_iterator_end(&idi);
     989           0 :         bat_iterator_end(&posi);
     990           0 :         bat_iterator_end(&leni);
     991             : 
     992           0 :         BBPunfix(qgram->batCacheid);
     993           0 :         BBPunfix(id->batCacheid);
     994           0 :         BBPunfix(pos->batCacheid);
     995           0 :         BBPunfix(len->batCacheid);
     996             : 
     997           0 :         BBPkeepref(*res1 = bn->batCacheid);
     998           0 :         BBPkeepref(*res2 = bn2->batCacheid);
     999             : 
    1000           0 :         return MAL_SUCCEED;
    1001             : }
    1002             : 
    1003             : /* copy up to utf8len UTF-8 encoded characters from src to buf
    1004             :  * stop early if buf (size given by bufsize) is too small, or if src runs out
    1005             :  * return number of UTF-8 characters copied (excluding NUL)
    1006             :  * close with NUL if enough space */
    1007             : static size_t
    1008          26 : utf8strncpy(char *buf, size_t bufsize, const char *src, size_t utf8len)
    1009             : {
    1010             :         size_t cnt = 0;
    1011             : 
    1012         128 :         while (utf8len != 0 && *src != 0 && bufsize != 0) {
    1013         102 :                 bufsize--;
    1014         102 :                 utf8len--;
    1015         102 :                 cnt++;
    1016         102 :                 if (((*buf++ = *src++) & 0x80) != 0) {
    1017          40 :                         while ((*src & 0xC0) == 0x80 && bufsize != 0) {
    1018          20 :                                 *buf++ = *src++;
    1019          20 :                                 bufsize--;
    1020             :                         }
    1021             :                 }
    1022             :         }
    1023          26 :         if (bufsize != 0)
    1024          26 :                 *buf = 0;
    1025          26 :         return cnt;
    1026             : }
    1027             : 
    1028             : static str
    1029           2 : CMDstr2qgrams(bat *ret, str *val)
    1030             : {
    1031             :         BAT *bn;
    1032           2 :         size_t i, len = strlen(*val) + 5;
    1033           2 :         str s = GDKmalloc(len);
    1034             :         char qgram[4 * 6 + 1];          /* 4 UTF-8 code points plus NULL byte */
    1035             : 
    1036           2 :         if (s == NULL)
    1037           0 :                 throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1038           2 :         strcpy(s, "##");
    1039           2 :         strcpy(s + 2, *val);
    1040           2 :         strcpy(s + len - 3, "$$");
    1041           2 :         bn = COLnew(0, TYPE_str, (BUN) strlen(*val), TRANSIENT);
    1042           2 :         if (bn == NULL) {
    1043           0 :                 GDKfree(s);
    1044           0 :                 throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1045             :         }
    1046             : 
    1047             :         i = 0;
    1048          26 :         while (s[i]) {
    1049          26 :                 if (utf8strncpy(qgram, sizeof(qgram), s + i, 4) < 4)
    1050             :                         break;
    1051          24 :                 if (BUNappend(bn, qgram, false) != GDK_SUCCEED) {
    1052           0 :                         BBPreclaim(bn);
    1053           0 :                         GDKfree(s);
    1054           0 :                         throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1055             :                 }
    1056          24 :                 if ((s[i++] & 0xC0) == 0xC0) {
    1057           8 :                         while ((s[i] & 0xC0) == 0x80)
    1058           4 :                                 i++;
    1059             :                 }
    1060             :         }
    1061           2 :         BBPkeepref(*ret = bn->batCacheid);
    1062           2 :         GDKfree(s);
    1063           2 :         return MAL_SUCCEED;
    1064             : }
    1065             : 
    1066             : #include "mel.h"
    1067             : mel_func txtsim_init_funcs[] = {
    1068             :  command("txtsim", "levenshtein", levenshtein_impl, false, "Calculates Levenshtein distance (edit distance) between two strings, variable operation costs (ins/del, replacement, transposition)", args(1,6, arg("",int),arg("s",str),arg("t",str),arg("insdel_cost",int),arg("replace_cost",int),arg("transpose_cost",int))),
    1069             :  command("txtsim", "levenshtein", levenshteinbasic_impl, false, "Calculates Levenshtein distance (edit distance) between two strings", args(1,3, arg("",int),arg("s",str),arg("t",str))),
    1070             :  command("txtsim", "editdistance", levenshteinbasic_impl, false, "Alias for Levenshtein(str,str)", args(1,3, arg("",int),arg("s",str),arg("t",str))),
    1071             :  command("txtsim", "editdistance2", levenshteinbasic2_impl, false, "Calculates Levenshtein distance (edit distance) between two strings. Cost of transposition is 1 instead of 2", args(1,3, arg("",int),arg("s",str),arg("t",str))),
    1072             :  command("txtsim", "similarity", fstrcmp_impl, false, "Normalized edit distance between two strings", args(1,4, arg("",dbl),arg("string1",str),arg("string2",str),arg("minimum",dbl))),
    1073             :  command("txtsim", "similarity", fstrcmp0_impl, false, "Normalized edit distance between two strings", args(1,3, arg("",dbl),arg("string1",str),arg("string2",str))),
    1074             :  command("battxtsim", "similarity", fstrcmp0_impl_bulk, false, "Normalized edit distance between two strings", args(1,3, batarg("",dbl),batarg("string1",str),batarg("string2",str))),
    1075             :  command("txtsim", "soundex", soundex_impl, false, "Soundex function for phonetic matching", args(1,2, arg("",str),arg("name",str))),
    1076             :  command("txtsim", "stringdiff", stringdiff_impl, false, "calculate the soundexed editdistance", args(1,3, arg("",int),arg("s1",str),arg("s2",str))),
    1077             :  command("txtsim", "qgramnormalize", CMDqgramnormalize, false, "'Normalizes' strings (eg. toUpper and replaces non-alphanumerics with one space", args(1,2, arg("",str),arg("input",str))),
    1078             :  command("txtsim", "qgramselfjoin", CMDqgramselfjoin, false, "QGram self-join on ordered(!) qgram tables and sub-ordered q-gram positions", args(2,8, batarg("",int),batarg("",int),batarg("qgram",oid),batarg("id",oid),batarg("pos",int),batarg("len",int),arg("c",flt),arg("k",int))),
    1079             :  command("txtsim", "str2qgrams", CMDstr2qgrams, false, "Break the string into 4-grams", args(1,2, batarg("",str),arg("s",str))),
    1080             :  { .imp=NULL }
    1081             : };
    1082             : #include "mal_import.h"
    1083             : #ifdef _MSC_VER
    1084             : #undef read
    1085             : #pragma section(".CRT$XCU",read)
    1086             : #endif
    1087         257 : LIB_STARTUP_FUNC(init_txtsim_mal)
    1088         257 : { mal_module("txtsim", NULL, txtsim_init_funcs); }

Generated by: LCOV version 1.14