LCOV - code coverage report
Current view: top level - monetdb5/modules/mal - txtsim.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 200 404 49.5 %
Date: 2021-01-13 20:07:21 Functions: 14 17 82.4 %

          Line data    Source code
       1             : /*
       2             :  * This Source Code Form is subject to the terms of the Mozilla Public
       3             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       4             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       5             :  *
       6             :  * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V.
       7             :  */
       8             : 
       9             : /*
      10             :  * @f txtsim
      11             :  * @t Module providing similarity metrics for strings
      12             :  * @a Romulo Goncalves (from M4 to M5)
      13             :  * @d 01/12/2007
      14             :  * @v 0.1
      15             :  *
      16             :  * @+ String metrics
      17             :  *
      18             :  * Provides basic similarity metrics for strings.
      19             :  *
      20             :  */
      21             : #include "monetdb_config.h"
      22             : #include "mal.h"
      23             : #include <string.h>
      24             : #include "gdk.h"
      25             : #include "str.h"
      26             : #include <limits.h>
      27             : #include "mal_exception.h"
      28             : 
      29             : 
      30             : #define RETURN_NIL_IF(b,t) \
      31             :         if (b) {\
      32             :            if (ATOMextern(t)) {\
      33             :               *(ptr*) res = (ptr) ATOMnil(t);\
      34             :                   if ( *(ptr *) res == NULL)\
      35             :                         throw(MAL,"txtsim", SQLSTATE(HY013) MAL_MALLOC_FAIL);\
      36             :            } else {\
      37             :               memcpy(res, ATOMnilptr(t), ATOMsize(t));\
      38             :            }\
      39             :            return MAL_SUCCEED; \
      40             :         }
      41             : 
      42             : /* =========================================================================
      43             :  * LEVENSH?TEIN FUNCTION
      44             :  * Source:
      45             :  * http://www.merriampark.com/ld.htm
      46             :  * =========================================================================
      47             :  */
      48             : 
      49             : #define MYMIN(x,y) ( (x<y) ? x : y )
      50             : #define SMALLEST_OF(x,y,z) ( MYMIN(MYMIN(x,y),z) )
      51             : #define SMALLEST_OF4(x,y,z,z2) ( MYMIN(MYMIN(MYMIN(x,y),z),z2) )
      52             : 
      53             : /***************************************************
      54             :  * Get a pointer to the specified cell of the matrix
      55             :  **************************************************/
      56             : 
      57             : static inline int *
      58             : levenshtein_GetCellPointer(int *pOrigin, int col, int row, int nCols)
      59             : {
      60         845 :         return pOrigin + col + (row * (nCols + 1));
      61             : }
      62             : 
      63             : /******************************************************
      64             :  * Get the contents of the specified cell in the matrix
      65             :  *****************************************************/
      66             : 
      67             : static inline int
      68             : levenshtein_GetAt(int *pOrigin, int col, int row, int nCols)
      69             : {
      70             :         int *pCell;
      71             : 
      72             :         pCell = levenshtein_GetCellPointer(pOrigin, col, row, nCols);
      73         493 :         return *pCell;
      74             : 
      75             : }
      76             : 
      77             : /********************************************************
      78             :  * Fill the specified cell in the matrix with the value x
      79             :  *******************************************************/
      80             : 
      81             : static inline void
      82             : levenshtein_PutAt(int *pOrigin, int col, int row, int nCols, int x)
      83             : {
      84             :         int *pCell;
      85             : 
      86             :         pCell = levenshtein_GetCellPointer(pOrigin, col, row, nCols);
      87         352 :         *pCell = x;
      88             : }
      89             : 
      90             : 
      91             : /******************************
      92             :  * Compute Levenshtein distance
      93             :  *****************************/
      94             : static str
      95          32 : levenshtein_impl(int *result, str *S, str *T, int *insdel_cost, int *replace_cost, int *transpose_cost)
      96             : {
      97          32 :         char *s = *S;
      98          32 :         char *t = *T;
      99             :         int *d;                 /* pointer to matrix */
     100             :         int n;                  /* length of s */
     101             :         int m;                  /* length of t */
     102             :         int i;                  /* iterates through s */
     103             :         int j;                  /* iterates through t */
     104             :         char s_i;               /* ith character of s */
     105             :         char t_j;               /* jth character of t */
     106             :         int cost;               /* cost */
     107             :         int cell;               /* contents of target cell */
     108             :         int above;              /* contents of cell immediately above */
     109             :         int left;               /* contents of cell immediately to left */
     110             :         int diag;               /* contents of cell immediately above and to left */
     111             :         int sz;                 /* number of cells in matrix */
     112             :         int diag2 = 0, cost2 = 0;
     113             : 
     114          56 :         if (strNil(*S) || strNil(*T)) {
     115          12 :                 *result = int_nil;
     116          12 :                 return MAL_SUCCEED;
     117             :         }
     118             : 
     119             :         /* Step 1 */
     120          20 :         n = (int) strlen(s);    /* 64bit: assume strings are less than 2 GB */
     121          20 :         m = (int) strlen(t);
     122          20 :         if (n == 0) {
     123           7 :                 *result = m;
     124           7 :                 return MAL_SUCCEED;
     125             :         }
     126          13 :         if (m == 0) {
     127           8 :                 *result = n;
     128           8 :                 return MAL_SUCCEED;
     129             :         }
     130           5 :         sz = (n + 1) * (m + 1) * sizeof(int);
     131           5 :         d = (int *) GDKmalloc(sz);
     132           5 :         if ( d == NULL)
     133           0 :                 throw(MAL,"levenshtein", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     134             : 
     135             :         /* Step 2 */
     136          45 :         for (i = 0; i <= n; i++) {
     137             :                 levenshtein_PutAt(d, i, 0, n, i);
     138             :         }
     139             : 
     140          42 :         for (j = 0; j <= m; j++) {
     141             :                 levenshtein_PutAt(d, 0, j, n, j);
     142             :         }
     143             : 
     144             :         /* Step 3 */
     145          40 :         for (i = 1; i <= n; i++) {
     146             : 
     147          35 :                 s_i = s[i - 1];
     148             : 
     149             :                 /* Step 4 */
     150         310 :                 for (j = 1; j <= m; j++) {
     151             : 
     152         275 :                         t_j = t[j - 1];
     153             : 
     154             :                         /* Step 5 */
     155         275 :                         if (s_i == t_j) {
     156             :                                 cost = 0;
     157             :                         } else {
     158         232 :                                 cost = *replace_cost;
     159             :                         }
     160             : 
     161             :                         /* Step 6 */
     162         275 :                         above = levenshtein_GetAt(d, i - 1, j, n);
     163         275 :                         left = levenshtein_GetAt(d, i, j - 1, n);
     164             :                         diag = levenshtein_GetAt(d, i - 1, j - 1, n);
     165             : 
     166         275 :                         if (j >= 2 && i >= 2) {
     167             :                                 /* NEW: detect transpositions */
     168             : 
     169         213 :                                 diag2 = levenshtein_GetAt(d, i - 2, j - 2, n);
     170         213 :                                 if (s[i - 2] == t[j - 1] && s[i - 1] == t[j - 2]) {
     171           4 :                                         cost2 = *transpose_cost;
     172             :                                 } else {
     173             :                                         cost2 = 2;
     174             :                                 }
     175         213 :                                 cell = SMALLEST_OF4(above + *insdel_cost, left + *insdel_cost, diag + cost, diag2 + cost2);
     176             :                         } else {
     177          62 :                                 cell = SMALLEST_OF(above + *insdel_cost, left + *insdel_cost, diag + cost);
     178             :                         }
     179             :                         levenshtein_PutAt(d, i, j, n, cell);
     180             :                 }
     181             :         }
     182             : 
     183             :         /* Step 7 */
     184           5 :         *result = levenshtein_GetAt(d, n, m, n);
     185           5 :         GDKfree(d);
     186           5 :         return MAL_SUCCEED;
     187             : }
     188             : 
     189             : static str
     190          12 : levenshteinbasic_impl(int *result, str *s, str *t)
     191             : {
     192          19 :         int insdel = 1, replace = 1, transpose = 2;
     193             : 
     194          19 :         return levenshtein_impl(result, s, t, &insdel, &replace, &transpose);
     195             : }
     196             : 
     197             : static str
     198           8 : levenshteinbasic2_impl(int *result, str *s, str *t)
     199             : {
     200           8 :         int insdel = 1, replace = 1, transpose = 1;
     201             : 
     202           8 :         return levenshtein_impl(result, s, t, &insdel, &replace, &transpose);
     203             : }
     204             : 
     205             : 
     206             : /* =========================================================================
     207             :  * SOUNDEX FUNCTION
     208             :  * Source:
     209             :  * http://www.mit.edu/afs/sipb/service/rtfm/src/freeWAIS-sf-1.0/ir/soundex.c
     210             :  * =========================================================================
     211             :  */
     212             : 
     213             : #define SoundexLen 4            /* length of a soundex code */
     214             : #define SoundexKey "Z000"     /* default key for soundex code */
     215             : 
     216             : /* set letter values */
     217             : static int Code[] = { 0, 1, 2, 3, 0, 1, 2, 0, 0, 2, 2, 4, 5, 5, 0,
     218             :         1, 2, 6, 2, 3, 0, 1, 0, 2, 0, 2
     219             : };
     220             : 
     221             : static inline char
     222             : SCode(unsigned char c)
     223             : {
     224          56 :         if (c == 95)
     225             :                 return (2);     /* german sz */
     226          22 :         return (Code[toupper(c) - 'A']);
     227             : }
     228             : 
     229             : static void
     230          20 : soundex_code(char *Name, char *Key)
     231             : {
     232             :         char LastLetter;
     233             :         int Index;
     234             : 
     235             :         /* set default key */
     236          20 :         strcpy(Key, SoundexKey);
     237             : 
     238             :         /* keep first letter */
     239          20 :         Key[0] = *Name;
     240          20 :         if (!isupper((unsigned char) (Key[0])))
     241          20 :                 Key[0] = toupper(Key[0]);
     242             : 
     243          20 :         LastLetter = *Name;
     244          20 :         if (!*Name)
     245             :                 return;
     246          10 :         Name++;
     247             : 
     248             :         /* scan rest of string */
     249          48 :         for (Index = 1; (Index <SoundexLen) &&*Name; Name++) {
     250             :                 /* use only letters */
     251          38 :                 if (isalpha((unsigned char) (*Name))) {
     252             :                         /* ignore duplicate successive chars */
     253          38 :                         if (LastLetter != *Name) {
     254             :                                 /* new LastLetter */
     255             :                                 LastLetter = *Name;
     256             : 
     257             :                                 /* ignore letters with code 0 */
     258          34 :                                 if (SCode(*Name) != 0) {
     259          22 :                                         Key[Index] = '0' + SCode(*Name);
     260          22 :                                         Index ++;
     261             :                                 }
     262             :                         }
     263             :                 }
     264             :         }
     265             : }
     266             : 
     267             : static str
     268          24 : soundex_impl(str *res, str *Name)
     269             : {
     270          48 :         RETURN_NIL_IF(strNil(*Name), TYPE_str);
     271             : 
     272          20 :         *res = (str) GDKmalloc(sizeof(char) * (SoundexLen + 1));
     273          20 :         if( *res == NULL)
     274           0 :                 throw(MAL,"soundex", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     275             : 
     276             :         /* calculate Key for Name */
     277          20 :         soundex_code(*Name, *res);
     278             : 
     279          19 :         return MAL_SUCCEED;
     280             : }
     281             : 
     282             : static str
     283           8 : stringdiff_impl(int *res, str *s1, str *s2)
     284             : {
     285             :         str r = MAL_SUCCEED;
     286           8 :         char *S1 = NULL, *S2 = NULL;
     287             : 
     288           8 :         r = soundex_impl(&S1, s1);
     289           8 :         if( r != MAL_SUCCEED)
     290             :                 return r;
     291           8 :         r = soundex_impl(&S2, s2);
     292           7 :         if( r != MAL_SUCCEED){
     293           0 :                 GDKfree(S1);
     294           0 :                 return r;
     295             :         }
     296             :         r = levenshteinbasic_impl(res, &S1, &S2);
     297           7 :         GDKfree(S1);
     298           8 :         GDKfree(S2);
     299           8 :         return r;
     300             : }
     301             : 
     302             : /******************************
     303             :  * QGRAMNORMALIZE
     304             :  *
     305             :  * This function 'normalizes' a string so valid q-grams can  be made of it:
     306             :  * All characters are transformed to uppercase, and all characters
     307             :  * which are not letters or digits are stripped to a single space.
     308             :  *
     309             :  * qgramnormalize("Hallo, allemaal!").print(); --> "HALLO ALLEMAAL"
     310             :  * qgramnormalize(" '' t ' est").print(); --> [ "T EST" ]
     311             :  *
     312             :  *****************************/
     313             : static str
     314          12 : CMDqgramnormalize(str *res, str *Input)
     315             : {
     316          12 :         char *input = *Input;
     317             :         int i, j = 0;
     318             :         char c, last = ' ';
     319             : 
     320          12 :         RETURN_NIL_IF(strNil(input), TYPE_str);
     321          12 :         *res = (str) GDKmalloc(sizeof(char) * (strlen(input) + 1));     /* normalized strings are never longer than original */
     322          12 :         if (*res == NULL)
     323           0 :                 throw(MAL, "txtsim.qgramnormalize", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     324             : 
     325          70 :         for (i = 0; input[i]; i++) {
     326          58 :                 c = toupper(input[i]);
     327          58 :                 if (!(('A' <= c && c <= 'Z') || isdigit((unsigned char) c)))
     328             :                         c = ' ';
     329          58 :                 if (c != ' ' || last != ' ') {
     330          53 :                         (*res)[j++] = c;
     331             :                 }
     332             :                 last = c;
     333             :         }
     334          12 :         (*res)[j] = 0;
     335             :         /* strip final whitespace */
     336          14 :         while (j > 0 && (*res)[--j] == ' ')
     337           2 :                 (*res)[j] = 0;
     338             : 
     339             :         return MAL_SUCCEED;
     340             : }
     341             : 
     342             : /* =========================================================================
     343             :  * FSTRCMP FUNCTION
     344             :  * Source:
     345             :  * http://search.cpan.org/src/MLEHMANN/String-Similarity-1/fstrcmp.c
     346             :  * =========================================================================
     347             :  */
     348             : 
     349             : /*
     350             :  * Data on one input string being compared.
     351             :  */
     352             : struct string_data {
     353             :         /* The string to be compared. */
     354             :         const char *data;
     355             : 
     356             :         /* The length of the string to be compared. */
     357             :         int data_length;
     358             : 
     359             :         /* The number of characters inserted or deleted. */
     360             :         int edit_count;
     361             : };
     362             : 
     363             : struct partition {
     364             :         /* Midpoints of this partition.  */
     365             :         int xmid, ymid;
     366             : 
     367             :         /* Nonzero if low half will be analyzed minimally.  */
     368             :         int lo_minimal;
     369             : 
     370             :         /* Likewise for high half.  */
     371             :         int hi_minimal;
     372             : };
     373             : 
     374             : /* NAME
     375             :         diag - find diagonal path
     376             : 
     377             :    SYNOPSIS
     378             :         int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
     379             :                 struct partition *part);
     380             : 
     381             :    DESCRIPTION
     382             :         Find the midpoint of the shortest edit script for a specified
     383             :         portion of the two strings.
     384             : 
     385             :         Scan from the beginnings of the strings, and simultaneously from
     386             :         the ends, doing a breadth-first search through the space of
     387             :         edit-sequence.  When the two searches meet, we have found the
     388             :         midpoint of the shortest edit sequence.
     389             : 
     390             :         If MINIMAL is nonzero, find the minimal edit script regardless
     391             :         of expense.  Otherwise, if the search is too expensive, use
     392             :         heuristics to stop the search and report a suboptimal answer.
     393             : 
     394             :    RETURNS
     395             :         Set PART->(XMID,YMID) to the midpoint (XMID,YMID).  The diagonal
     396             :         number XMID - YMID equals the number of inserted characters
     397             :         minus the number of deleted characters (counting only characters
     398             :         before the midpoint).  Return the approximate edit cost; this is
     399             :         the total number of characters inserted or deleted (counting
     400             :         only characters before the midpoint), unless a heuristic is used
     401             :         to terminate the search prematurely.
     402             : 
     403             :         Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
     404             :         for the left half of the partition is known; similarly for
     405             :         PART->RIGHT_MINIMAL.
     406             : 
     407             :    CAVEAT
     408             :         This function assumes that the first characters of the specified
     409             :         portions of the two strings do not match, and likewise that the
     410             :         last characters do not match.  The caller must trim matching
     411             :         characters from the beginning and end of the portions it is
     412             :         going to specify.
     413             : 
     414             :         If we return the "wrong" partitions, the worst this can do is
     415             :         cause suboptimal diff output.  It cannot cause incorrect diff
     416             :         output.  */
     417             : 
     418             : static inline int
     419           0 : diag(int xoff, int xlim, int yoff, int ylim, int minimal, struct partition *part, int too_expensive, struct string_data *string, int *fdiag, int *bdiag)
     420             : {
     421             :         int *const fd = fdiag;  /* Give the compiler a chance. */
     422             :         int *const bd = bdiag;  /* Additional help for the compiler. */
     423           0 :         const char *const xv = string[0].data;  /* Still more help for the compiler. */
     424           0 :         const char *const yv = string[1].data;  /* And more and more . . . */
     425           0 :         const int dmin = xoff - ylim;   /* Minimum valid diagonal. */
     426           0 :         const int dmax = xlim - yoff;   /* Maximum valid diagonal. */
     427           0 :         const int fmid = xoff - yoff;   /* Center diagonal of top-down search. */
     428           0 :         const int bmid = xlim - ylim;   /* Center diagonal of bottom-up search. */
     429             :         int fmin = fmid;
     430             :         int fmax = fmid;        /* Limits of top-down search. */
     431             :         int bmin = bmid;
     432             :         int bmax = bmid;        /* Limits of bottom-up search. */
     433             :         int c;                  /* Cost. */
     434           0 :         int odd = (fmid - bmid) & 1;
     435             : 
     436             :         /*
     437             :          * True if southeast corner is on an odd diagonal with respect
     438             :          * to the northwest.
     439             :          */
     440           0 :         fd[fmid] = xoff;
     441           0 :         bd[bmid] = xlim;
     442           0 :         for (c = 1;; ++c) {
     443             :                 int d;          /* Active diagonal. */
     444             : 
     445             :                 /* Extend the top-down search by an edit step in each diagonal. */
     446           0 :                 if (fmin > dmin)
     447           0 :                         fd[--fmin - 1] = -1;
     448             :                 else
     449           0 :                         ++fmin;
     450           0 :                 if (fmax < dmax)
     451           0 :                         fd[++fmax + 1] = -1;
     452             :                 else
     453           0 :                         --fmax;
     454           0 :                 for (d = fmax; d >= fmin; d -= 2) {
     455             :                         int x;
     456             :                         int y;
     457             :                         int tlo;
     458             :                         int thi;
     459             : 
     460           0 :                         tlo = fd[d - 1], thi = fd[d + 1];
     461             : 
     462           0 :                         if (tlo >= thi)
     463           0 :                                 x = tlo + 1;
     464             :                         else
     465             :                                 x = thi;
     466           0 :                         y = x - d;
     467           0 :                         while (x < xlim && y < ylim && xv[x] == yv[y]) {
     468           0 :                                 ++x;
     469           0 :                                 ++y;
     470             :                         }
     471           0 :                         fd[d] = x;
     472           0 :                         if (odd && bmin <= d && d <= bmax && bd[d] <= x) {
     473           0 :                                 part->xmid = x;
     474           0 :                                 part->ymid = y;
     475           0 :                                 part->lo_minimal = part->hi_minimal = 1;
     476           0 :                                 return 2 * c - 1;
     477             :                         }
     478             :                 }
     479             :                 /* Similarly extend the bottom-up search.  */
     480           0 :                 if (bmin > dmin)
     481           0 :                         bd[--bmin - 1] = INT_MAX;
     482             :                 else
     483           0 :                         ++bmin;
     484           0 :                 if (bmax < dmax)
     485           0 :                         bd[++bmax + 1] = INT_MAX;
     486             :                 else
     487           0 :                         --bmax;
     488           0 :                 for (d = bmax; d >= bmin; d -= 2) {
     489             :                         int x;
     490             :                         int y;
     491             :                         int tlo;
     492             :                         int thi;
     493             : 
     494           0 :                         tlo = bd[d - 1], thi = bd[d + 1];
     495           0 :                         if (tlo < thi)
     496             :                                 x = tlo;
     497             :                         else
     498           0 :                                 x = thi - 1;
     499           0 :                         y = x - d;
     500           0 :                         while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) {
     501           0 :                                 --x;
     502           0 :                                 --y;
     503             :                         }
     504           0 :                         bd[d] = x;
     505           0 :                         if (!odd && fmin <= d && d <= fmax && x <= fd[d]) {
     506           0 :                                 part->xmid = x;
     507           0 :                                 part->ymid = y;
     508           0 :                                 part->lo_minimal = part->hi_minimal = 1;
     509           0 :                                 return 2 * c;
     510             :                         }
     511             :                 }
     512             : 
     513           0 :                 if (minimal)
     514           0 :                         continue;
     515             : 
     516             :                 /* Heuristic: if we've gone well beyond the call of duty, give up
     517             :                    and report halfway between our best results so far.  */
     518           0 :                 if (c >= too_expensive) {
     519             :                         int fxybest;
     520             :                         int fxbest;
     521             :                         int bxybest;
     522             :                         int bxbest;
     523             : 
     524             :                         /* Pacify `gcc -Wall'. */
     525             :                         fxbest = 0;
     526             :                         bxbest = 0;
     527             : 
     528             :                         /* Find forward diagonal that maximizes X + Y.  */
     529             :                         fxybest = -1;
     530           0 :                         for (d = fmax; d >= fmin; d -= 2) {
     531             :                                 int x;
     532             :                                 int y;
     533             : 
     534           0 :                                 x = fd[d] < xlim ? fd[d] : xlim;
     535           0 :                                 y = x - d;
     536             : 
     537           0 :                                 if (ylim < y) {
     538           0 :                                         x = ylim + d;
     539             :                                         y = ylim;
     540             :                                 }
     541           0 :                                 if (fxybest < x + y) {
     542             :                                         fxybest = x + y;
     543             :                                         fxbest = x;
     544             :                                 }
     545             :                         }
     546             :                         /* Find backward diagonal that minimizes X + Y.  */
     547             :                         bxybest = INT_MAX;
     548           0 :                         for (d = bmax; d >= bmin; d -= 2) {
     549             :                                 int x;
     550             :                                 int y;
     551             : 
     552           0 :                                 x = xoff > bd[d] ? xoff : bd[d];
     553           0 :                                 y = x - d;
     554             : 
     555           0 :                                 if (y < yoff) {
     556           0 :                                         x = yoff + d;
     557             :                                         y = yoff;
     558             :                                 }
     559           0 :                                 if (x + y < bxybest) {
     560             :                                         bxybest = x + y;
     561             :                                         bxbest = x;
     562             :                                 }
     563             :                         }
     564             :                         /* Use the better of the two diagonals.  */
     565           0 :                         if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) {
     566           0 :                                 part->xmid = fxbest;
     567           0 :                                 part->ymid = fxybest - fxbest;
     568           0 :                                 part->lo_minimal = 1;
     569           0 :                                 part->hi_minimal = 0;
     570             :                         } else {
     571           0 :                                 part->xmid = bxbest;
     572           0 :                                 part->ymid = bxybest - bxbest;
     573           0 :                                 part->lo_minimal = 0;
     574           0 :                                 part->hi_minimal = 1;
     575             :                         }
     576           0 :                         return 2 * c - 1;
     577             :                 }
     578             :         }
     579             : }
     580             : 
     581             : 
     582             : /* NAME
     583             :         compareseq - find edit sequence
     584             : 
     585             :    SYNOPSIS
     586             :         void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal);
     587             : 
     588             :    DESCRIPTION
     589             :         Compare in detail contiguous subsequences of the two strings
     590             :         which are known, as a whole, to match each other.
     591             : 
     592             :         The subsequence of string 0 is [XOFF, XLIM) and likewise for
     593             :         string 1.
     594             : 
     595             :         Note that XLIM, YLIM are exclusive bounds.  All character
     596             :         numbers are origin-0.
     597             : 
     598             :         If MINIMAL is nonzero, find a minimal difference no matter how
     599             :         expensive it is.  */
     600             : 
     601             : static inline void
     602      373680 : compareseq(int xoff, int xlim, int yoff, int ylim, int minimal, int max_edits, int too_expensive, struct string_data *string, int *fdiag, int *bdiag) /* compareseq stops when edits > max_edits */
     603             : {
     604      373680 :         const char *const xv = string[0].data;  /* Help the compiler.  */
     605      373680 :         const char *const yv = string[1].data;
     606             : 
     607      373680 :         if (string[1].edit_count + string[0].edit_count > max_edits)
     608             :                 return;
     609             : 
     610             :         /* Slide down the bottom initial diagonal. */
     611     2399780 :         while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff]) {
     612     2026100 :                 ++xoff;
     613     2026100 :                 ++yoff;
     614             :         }
     615             : 
     616             :         /* Slide up the top initial diagonal. */
     617      373685 :         while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1]) {
     618           5 :                 --xlim;
     619           5 :                 --ylim;
     620             :         }
     621             : 
     622             :         /* Handle simple cases. */
     623      373680 :         if (xoff == xlim) {
     624      373679 :                 while (yoff < ylim) {
     625           0 :                         ++string[1].edit_count;
     626           0 :                         ++yoff;
     627             :                 }
     628           1 :         } else if (yoff == ylim) {
     629           2 :                 while (xoff < xlim) {
     630           1 :                         ++string[0].edit_count;
     631           1 :                         ++xoff;
     632             :                 }
     633             :         } else {
     634             :                 int c;
     635             :                 struct partition part;
     636             : 
     637             :                 /* Find a point of correspondence in the middle of the strings.  */
     638           0 :                 c = diag(xoff, xlim, yoff, ylim, minimal, &part, too_expensive, string, fdiag, bdiag);
     639           0 :                 if (c == 1) {
     640             :                         /* The two subsequences differ by a single insert or delete;
     641             :                            record it and we are done.  */
     642           0 :                         if (part.xmid - part.ymid < xoff - yoff)
     643           0 :                                 ++string[1].edit_count;
     644             :                         else
     645           0 :                                 ++string[0].edit_count;
     646             :                 } else {
     647             :                         /* Use the partitions to split this problem into subproblems.  */
     648           0 :                         compareseq(xoff, part.xmid, yoff, part.ymid, part.lo_minimal, max_edits, too_expensive, string, fdiag, bdiag);
     649           0 :                         compareseq(part.xmid, xlim, part.ymid, ylim, part.hi_minimal, max_edits, too_expensive, string, fdiag, bdiag);
     650             :                 }
     651             :         }
     652             : }
     653             : 
     654             : /* NAME
     655             :         fstrcmp - fuzzy string compare
     656             : 
     657             :    SYNOPSIS
     658             :         double fstrcmp(const char *s1, int l1, const char *s2, int l2, double);
     659             : 
     660             :    DESCRIPTION
     661             :         The fstrcmp function may be used to compare two string for
     662             :         similarity.  It is very useful in reducing "cascade" or
     663             :         "secondary" errors in compilers or other situations where
     664             :         symbol tables occur.
     665             : 
     666             :    RETURNS
     667             :         double; 0 if the strings are entirly dissimilar, 1 if the
     668             :         strings are identical, and a number in between if they are
     669             :         similar.  */
     670             : 
     671             : #define INITIAL_INT_BUFFER_LENGTH 2048
     672             : 
     673             : #define CHECK_INT_BUFFER_LENGTH(BUFFER, BUFFER_LEN, NEXT_LEN, OP) \
     674             :         do { \
     675             :                 if ((NEXT_LEN) > *BUFFER_LEN) { \
     676             :                         size_t newlen = (((NEXT_LEN) + 1023) & ~1023); /* align to a multiple of 1024 bytes */ \
     677             :                         int *newbuf = GDKmalloc(newlen); \
     678             :                         if (!newbuf) \
     679             :                                 throw(MAL, OP, SQLSTATE(HY013) MAL_MALLOC_FAIL); \
     680             :                         GDKfree(*BUFFER); \
     681             :                         *BUFFER = newbuf; \
     682             :                         *BUFFER_LEN = newlen; \
     683             :                 } \
     684             :         } while (0)
     685             : 
     686             : static str
     687      372987 : fstrcmp_impl_internal(dbl *ret, int **fdiag_buf, size_t *fdiag_buflen, str string1, str string2, dbl minimum)
     688             : {
     689             :         int i, max_edits, *fdiag, *bdiag, too_expensive = 1;
     690             :         size_t fdiag_len;
     691             :         struct string_data string[2];
     692             : 
     693             :         /* set the info for each string.  */
     694      372987 :         string[0].data = string1;
     695      372987 :         string[0].data_length = (int) strlen(string1); /* 64bit: assume string not too long */
     696      372987 :         string[1].data = string2;
     697      372987 :         string[1].data_length = (int) strlen(string2); /* 64bit: assume string not too long */
     698             : 
     699             :         /* short-circuit obvious comparisons */
     700      372987 :         if (string[0].data_length == 0 && string[1].data_length == 0) {
     701           0 :                 *ret = 1.0;
     702           0 :                 return MAL_SUCCEED;
     703             :         }
     704      372987 :         if (string[0].data_length == 0 || string[1].data_length == 0) {
     705           0 :                 *ret = 0.0;
     706           0 :                 return MAL_SUCCEED;
     707             :         }
     708             : 
     709             :         /* Set TOO_EXPENSIVE to be approximate square root of input size,
     710             :            bounded below by 256.  */
     711     1118890 :         for (i = string[0].data_length + string[1].data_length; i != 0; i >>= 2)
     712      745900 :                 too_expensive <<= 1;
     713             :         if (too_expensive < 256)
     714             :                 too_expensive = 256;
     715             : 
     716             :         /* Because fstrcmp is typically called multiple times, while scanning
     717             :            symbol tables, etc, attempt to minimize the number of memory
     718             :            allocations performed.  Thus, we use a static buffer for the
     719             :            diagonal vectors, and never free them.  */
     720      372987 :         fdiag_len = string[0].data_length + string[1].data_length + 3;
     721      372987 :         CHECK_INT_BUFFER_LENGTH(fdiag_buf, fdiag_buflen, fdiag_len, "txtsim.similarity");
     722      372987 :         fdiag = *fdiag_buf + string[1].data_length + 1;
     723      372987 :         bdiag = fdiag + fdiag_len;
     724             : 
     725      372987 :         max_edits = 1 + (int) ((string[0].data_length + string[1].data_length) * (1. - minimum));
     726             : 
     727             :         /* Now do the main comparison algorithm */
     728      372987 :         string[0].edit_count = 0;
     729      372987 :         string[1].edit_count = 0;
     730      372987 :         compareseq(0, string[0].data_length, 0, string[1].data_length, 0, max_edits, too_expensive, string, fdiag, bdiag);
     731             : 
     732             :         /* The result is
     733             :            ((number of chars in common) / (average length of the strings)).
     734             :            This is admittedly biased towards finding that the strings are
     735             :            similar, however it does produce meaningful results.  */
     736      373751 :         *ret = ((double)
     737      373751 :                 (string[0].data_length + string[1].data_length - string[1].edit_count - string[0].edit_count)
     738      373751 :                 / (string[0].data_length + string[1].data_length));
     739      373751 :         return MAL_SUCCEED;
     740             : }
     741             : 
     742             : static str
     743           0 : fstrcmp_impl(dbl *ret, str *string1, str *string2, dbl *minimum)
     744             : {
     745           0 :         str s1 = *string1, s2 = *string2;
     746           0 :         dbl min = *minimum;
     747             : 
     748           0 :         if (strNil(s1) || strNil(s2) || is_dbl_nil(min)) {
     749           0 :                 *ret = dbl_nil;
     750           0 :                 return MAL_SUCCEED;
     751             :         } else {
     752             :                 str msg = MAL_SUCCEED;
     753           0 :                 int *fdiag_buf = NULL;
     754           0 :                 size_t fdiag_buflen = INITIAL_INT_BUFFER_LENGTH;
     755             : 
     756           0 :                 if (!(fdiag_buf = GDKmalloc(fdiag_buflen)))
     757           0 :                         throw(MAL, "txtsim.similarity", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     758           0 :                 msg = fstrcmp_impl_internal(ret, &fdiag_buf, &fdiag_buflen, s1, s1, min);
     759           0 :                 GDKfree(fdiag_buf);
     760           0 :                 return msg;
     761             :         }
     762             : }
     763             : 
     764             : static str
     765           4 : fstrcmp0_impl(dbl *ret, str *string1, str *string2)
     766             : {
     767           4 :         str s1 = *string1, s2 = *string2;
     768             : 
     769           6 :         if (strNil(s1) || strNil(s2)) {
     770           3 :                 *ret = dbl_nil;
     771           3 :                 return MAL_SUCCEED;
     772             :         } else {
     773             :                 str msg = MAL_SUCCEED;
     774           1 :                 int *fdiag_buf = NULL;
     775           1 :                 size_t fdiag_buflen = INITIAL_INT_BUFFER_LENGTH;
     776             : 
     777           1 :                 if (!(fdiag_buf = GDKmalloc(fdiag_buflen)))
     778           0 :                         throw(MAL, "txtsim.similarity", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     779           1 :                 msg = fstrcmp_impl_internal(ret, &fdiag_buf, &fdiag_buflen, s1, s2, 0.0);
     780           1 :                 GDKfree(fdiag_buf);
     781           1 :                 return msg;
     782             :         }
     783             : }
     784             : 
     785             : static str
     786           8 : fstrcmp0_impl_bulk(bat *res, bat *strings1, bat *strings2)
     787             : {
     788             :         BATiter lefti, righti;
     789             :         BAT *bn = NULL, *left = NULL, *right = NULL;
     790             :         BUN q = 0;
     791           8 :         size_t fdiag_buflen = INITIAL_INT_BUFFER_LENGTH;
     792             :         str x, y, msg = MAL_SUCCEED;
     793             :         bool nils = false;
     794             :         dbl *restrict vals;
     795           8 :         int *fdiag_buf = GDKmalloc(fdiag_buflen);
     796             : 
     797           8 :         if (!fdiag_buf) {
     798           0 :                 msg = createException(MAL, "txtsim.similarity", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     799           0 :                 goto bailout;
     800             :         }
     801           8 :         if (!(left = BATdescriptor(*strings1)) || !(right = BATdescriptor(*strings2))) {
     802           0 :                 msg = createException(MAL, "txtsim.similarity", SQLSTATE(HY005) RUNTIME_OBJECT_MISSING);
     803           0 :                 goto bailout;
     804             :         }
     805           8 :         q = BATcount(left);
     806           8 :         if (!(bn = COLnew(left->hseqbase, TYPE_dbl, q, TRANSIENT))) {
     807           0 :                 msg = createException(MAL, "txtsim.similarity", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     808           0 :                 goto bailout;
     809             :         }
     810             : 
     811             :         lefti = bat_iterator(left);
     812             :         righti = bat_iterator(right);
     813           8 :         vals = Tloc(bn, 0);
     814      373526 :         for (BUN i = 0; i < q && !msg; i++) {
     815      373518 :                 x = (str) BUNtvar(lefti, i);
     816      373518 :                 y = (str) BUNtvar(righti, i);
     817             : 
     818      746922 :                 if (strNil(x) || strNil(y)) {
     819         231 :                         vals[i] = dbl_nil;
     820         231 :                         nils = true;
     821             :                 } else {
     822      373287 :                         msg = fstrcmp_impl_internal(&vals[i], &fdiag_buf, &fdiag_buflen, x, y, 0.0);
     823             :                 }
     824             :         }
     825             : 
     826           8 : bailout:
     827           8 :         GDKfree(fdiag_buf);
     828           8 :         if (bn && !msg) {
     829           8 :                 BATsetcount(bn, q);
     830           8 :                 bn->tnil = nils;
     831           8 :                 bn->tnonil = !nils;
     832           8 :                 bn->tkey = BATcount(bn) <= 1;
     833           8 :                 bn->tsorted = BATcount(bn) <= 1;
     834           8 :                 bn->trevsorted = BATcount(bn) <= 1;
     835           8 :                 bn->theap.dirty = true;
     836           8 :                 BBPkeepref(*res = bn->batCacheid);
     837           0 :         } else if (bn)
     838           0 :                 BBPreclaim(bn);
     839           8 :         if (left)
     840           8 :                 BBPunfix(left->batCacheid);
     841           8 :         if (right)
     842           8 :                 BBPunfix(right->batCacheid);
     843           8 :         return msg;
     844             : }
     845             : 
     846             : 
     847             : /* ============ Q-GRAM SELF JOIN ============== */
     848             : 
     849             : static str
     850           0 : CMDqgramselfjoin(bat *res1, bat *res2, bat *qid, bat *bid, bat *pid, bat *lid, flt *c, int *k)
     851             : {
     852             :         BAT *qgram, *id, *pos, *len;
     853             :         BUN n;
     854             :         BUN i, j;
     855             :         BAT *bn, *bn2;
     856             :         oid *qbuf;
     857             :         int *ibuf;
     858             :         int *pbuf;
     859             :         int *lbuf;
     860             :         str msg = MAL_SUCCEED;
     861             : 
     862           0 :         qgram = BATdescriptor(*qid);
     863           0 :         id = BATdescriptor(*bid);
     864           0 :         pos = BATdescriptor(*pid);
     865           0 :         len = BATdescriptor(*lid);
     866           0 :         if (qgram == NULL || id == NULL || pos == NULL || len == NULL) {
     867           0 :                 if (qgram)
     868           0 :                         BBPunfix(qgram->batCacheid);
     869           0 :                 if (id)
     870           0 :                         BBPunfix(id->batCacheid);
     871           0 :                 if (pos)
     872           0 :                         BBPunfix(pos->batCacheid);
     873           0 :                 if (len)
     874           0 :                         BBPunfix(len->batCacheid);
     875           0 :                 throw(MAL, "txtsim.qgramselfjoin", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
     876             :         }
     877             : 
     878           0 :         if (qgram->ttype != TYPE_oid)
     879           0 :                 msg = createException(MAL, "tstsim.qgramselfjoin",
     880             :                           SEMANTIC_TYPE_MISMATCH ": tail of BAT qgram must be oid");
     881           0 :         else if (id->ttype != TYPE_int)
     882           0 :                 msg = createException(MAL, "tstsim.qgramselfjoin",
     883             :                           SEMANTIC_TYPE_MISMATCH ": tail of BAT id must be int");
     884           0 :         else if (pos->ttype != TYPE_int)
     885           0 :                 msg = createException(MAL, "tstsim.qgramselfjoin",
     886             :                           SEMANTIC_TYPE_MISMATCH ": tail of BAT pos must be int");
     887           0 :         else if (len->ttype != TYPE_int)
     888           0 :                 msg = createException(MAL, "tstsim.qgramselfjoin",
     889             :                           SEMANTIC_TYPE_MISMATCH ": tail of BAT len must be int");
     890             :         if (msg) {
     891           0 :                 BBPunfix(qgram->batCacheid);
     892           0 :                 BBPunfix(id->batCacheid);
     893           0 :                 BBPunfix(pos->batCacheid);
     894           0 :                 BBPunfix(len->batCacheid);
     895           0 :                 return msg;
     896             :         }
     897             : 
     898           0 :         n = BATcount(qgram);
     899           0 :         qbuf = (oid *) Tloc(qgram, 0);
     900           0 :         ibuf = (int *) Tloc(id, 0);
     901           0 :         pbuf = (int *) Tloc(pos, 0);
     902           0 :         lbuf = (int *) Tloc(len, 0);
     903             : 
     904             :         /* if (BATcount(qgram)>1 && !BATtordered(qgram)) throw(MAL, "tstsim.qgramselfjoin", SEMANTIC_TYPE_MISMATCH); */
     905             : 
     906           0 :         if (!ALIGNsynced(qgram, id))
     907           0 :                 msg = createException(MAL, "tstsim.qgramselfjoin",
     908             :                           SEMANTIC_TYPE_MISMATCH ": qgram and id are not synced");
     909             : 
     910           0 :         else if (!ALIGNsynced(qgram, pos))
     911           0 :                 msg = createException(MAL, "tstsim.qgramselfjoin",
     912             :                           SEMANTIC_TYPE_MISMATCH ": qgram and pos are not synced");
     913           0 :         else if (!ALIGNsynced(qgram, len))
     914           0 :                 msg = createException(MAL, "tstsim.qgramselfjoin",
     915             :                           SEMANTIC_TYPE_MISMATCH ": qgram and len are not synced");
     916             : 
     917           0 :         else if (Tsize(qgram) != ATOMsize(qgram->ttype))
     918           0 :                 msg = createException(MAL, "tstsim.qgramselfjoin",
     919             :                           SEMANTIC_TYPE_MISMATCH ": qgram is not a true void bat");
     920           0 :         else if (Tsize(id) != ATOMsize(id->ttype))
     921           0 :                 msg = createException(MAL, "tstsim.qgramselfjoin",
     922             :                           SEMANTIC_TYPE_MISMATCH ": id is not a true void bat");
     923             : 
     924           0 :         else if (Tsize(pos) != ATOMsize(pos->ttype))
     925           0 :                 msg = createException(MAL, "tstsim.qgramselfjoin",
     926             :                           SEMANTIC_TYPE_MISMATCH ": pos is not a true void bat");
     927           0 :         else if (Tsize(len) != ATOMsize(len->ttype))
     928           0 :                 msg = createException(MAL, "tstsim.qgramselfjoin",
     929             :                           SEMANTIC_TYPE_MISMATCH ": len is not a true void bat");
     930           0 :         if (msg) {
     931           0 :                 BBPunfix(qgram->batCacheid);
     932           0 :                 BBPunfix(id->batCacheid);
     933           0 :                 BBPunfix(pos->batCacheid);
     934           0 :                 BBPunfix(len->batCacheid);
     935           0 :                 return msg;
     936             :         }
     937             : 
     938           0 :         bn = COLnew(0, TYPE_int, n, TRANSIENT);
     939           0 :         bn2 = COLnew(0, TYPE_int, n, TRANSIENT);
     940           0 :         if (bn == NULL || bn2 == NULL){
     941           0 :                 BBPreclaim(bn);
     942           0 :                 BBPreclaim(bn2);
     943           0 :                 BBPunfix(qgram->batCacheid);
     944           0 :                 BBPunfix(id->batCacheid);
     945           0 :                 BBPunfix(pos->batCacheid);
     946           0 :                 BBPunfix(len->batCacheid);
     947           0 :                 throw(MAL, "txtsim.qgramselfjoin", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     948             :         }
     949             : 
     950           0 :         for (i = 0; i < n - 1; i++) {
     951           0 :                 for (j = i + 1; (j < n && qbuf[j] == qbuf[i] && pbuf[j] <= (pbuf[i] + (*k + *c * MYMIN(lbuf[i], lbuf[j])))); j++) {
     952           0 :                         if (ibuf[i] != ibuf[j] && abs(lbuf[i] - lbuf[j]) <= (*k + *c * MYMIN(lbuf[i], lbuf[j]))) {
     953           0 :                                 if (BUNappend(bn, ibuf + i, false) != GDK_SUCCEED ||
     954           0 :                                         BUNappend(bn2, ibuf + j, false) != GDK_SUCCEED) {
     955           0 :                                         BBPunfix(qgram->batCacheid);
     956           0 :                                         BBPunfix(id->batCacheid);
     957           0 :                                         BBPunfix(pos->batCacheid);
     958           0 :                                         BBPunfix(len->batCacheid);
     959           0 :                                         BBPreclaim(bn);
     960           0 :                                         BBPreclaim(bn2);
     961           0 :                                         throw(MAL, "txtsim.qgramselfjoin", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     962             :                                 }
     963             :                         }
     964             :                 }
     965             :         }
     966             : 
     967           0 :         BBPunfix(qgram->batCacheid);
     968           0 :         BBPunfix(id->batCacheid);
     969           0 :         BBPunfix(pos->batCacheid);
     970           0 :         BBPunfix(len->batCacheid);
     971             : 
     972           0 :         BBPkeepref(*res1 = bn->batCacheid);
     973           0 :         BBPkeepref(*res2 = bn2->batCacheid);
     974             : 
     975           0 :         return MAL_SUCCEED;
     976             : }
     977             : 
     978             : /* copy up to utf8len UTF-8 encoded characters from src to buf
     979             :  * stop early if buf (size given by bufsize) is too small, or if src runs out
     980             :  * return number of UTF-8 characters copied (excluding NUL)
     981             :  * close with NUL if enough space */
     982             : static size_t
     983          26 : utf8strncpy(char *buf, size_t bufsize, const char *src, size_t utf8len)
     984             : {
     985             :         size_t cnt = 0;
     986             : 
     987         128 :         while (utf8len != 0 && *src != 0 && bufsize != 0) {
     988         102 :                 bufsize--;
     989         102 :                 utf8len--;
     990         102 :                 cnt++;
     991         102 :                 if (((*buf++ = *src++) & 0x80) != 0) {
     992          40 :                         while ((*src & 0xC0) == 0x80 && bufsize != 0) {
     993          20 :                                 *buf++ = *src++;
     994          20 :                                 bufsize--;
     995             :                         }
     996             :                 }
     997             :         }
     998          26 :         if (bufsize != 0)
     999          26 :                 *buf = 0;
    1000          26 :         return cnt;
    1001             : }
    1002             : 
    1003             : static str
    1004           2 : CMDstr2qgrams(bat *ret, str *val)
    1005             : {
    1006             :         BAT *bn;
    1007           2 :         size_t i, len = strlen(*val) + 5;
    1008           2 :         str s = GDKmalloc(len);
    1009             :         char qgram[4 * 6 + 1];          /* 4 UTF-8 code points plus NULL byte */
    1010             : 
    1011           2 :         if (s == NULL)
    1012           0 :                 throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1013           2 :         strcpy(s, "##");
    1014           2 :         strcpy(s + 2, *val);
    1015           2 :         strcpy(s + len - 3, "$$");
    1016           2 :         bn = COLnew(0, TYPE_str, (BUN) strlen(*val), TRANSIENT);
    1017           2 :         if (bn == NULL) {
    1018           0 :                 GDKfree(s);
    1019           0 :                 throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1020             :         }
    1021             : 
    1022             :         i = 0;
    1023          26 :         while (s[i]) {
    1024          26 :                 if (utf8strncpy(qgram, sizeof(qgram), s + i, 4) < 4)
    1025             :                         break;
    1026          24 :                 if (BUNappend(bn, qgram, false) != GDK_SUCCEED) {
    1027           0 :                         BBPreclaim(bn);
    1028           0 :                         GDKfree(s);
    1029           0 :                         throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1030             :                 }
    1031          24 :                 if ((s[i++] & 0xC0) == 0xC0) {
    1032           8 :                         while ((s[i] & 0xC0) == 0x80)
    1033           4 :                                 i++;
    1034             :                 }
    1035             :         }
    1036           2 :         BBPkeepref(*ret = bn->batCacheid);
    1037           2 :         GDKfree(s);
    1038           2 :         return MAL_SUCCEED;
    1039             : }
    1040             : 
    1041             : #include "mel.h"
    1042             : mel_func txtsim_init_funcs[] = {
    1043             :  command("txtsim", "levenshtein", levenshtein_impl, false, "Calculates Levenshtein distance (edit distance) between two strings, variable operation costs (ins/del, replacement, transposition)", args(1,6, arg("",int),arg("s",str),arg("t",str),arg("insdel_cost",int),arg("replace_cost",int),arg("transpose_cost",int))),
    1044             :  command("txtsim", "levenshtein", levenshteinbasic_impl, false, "Calculates Levenshtein distance (edit distance) between two strings", args(1,3, arg("",int),arg("s",str),arg("t",str))),
    1045             :  command("txtsim", "editdistance", levenshteinbasic_impl, false, "Alias for Levenshtein(str,str)", args(1,3, arg("",int),arg("s",str),arg("t",str))),
    1046             :  command("txtsim", "editdistance2", levenshteinbasic2_impl, false, "Calculates Levenshtein distance (edit distance) between two strings. Cost of transposition is 1 instead of 2", args(1,3, arg("",int),arg("s",str),arg("t",str))),
    1047             :  command("txtsim", "similarity", fstrcmp_impl, false, "Normalized edit distance between two strings", args(1,4, arg("",dbl),arg("string1",str),arg("string2",str),arg("minimum",dbl))),
    1048             :  command("txtsim", "similarity", fstrcmp0_impl, false, "Normalized edit distance between two strings", args(1,3, arg("",dbl),arg("string1",str),arg("string2",str))),
    1049             :  command("battxtsim", "similarity", fstrcmp0_impl_bulk, false, "Normalized edit distance between two strings", args(1,3, batarg("",dbl),batarg("string1",str),batarg("string2",str))),
    1050             :  command("txtsim", "soundex", soundex_impl, false, "Soundex function for phonetic matching", args(1,2, arg("",str),arg("name",str))),
    1051             :  command("txtsim", "stringdiff", stringdiff_impl, false, "calculate the soundexed editdistance", args(1,3, arg("",int),arg("s1",str),arg("s2",str))),
    1052             :  command("txtsim", "qgramnormalize", CMDqgramnormalize, false, "'Normalizes' strings (eg. toUpper and replaces non-alphanumerics with one space", args(1,2, arg("",str),arg("input",str))),
    1053             :  command("txtsim", "qgramselfjoin", CMDqgramselfjoin, false, "QGram self-join on ordered(!) qgram tables and sub-ordered q-gram positions", args(2,8, batarg("",int),batarg("",int),batarg("qgram",oid),batarg("id",oid),batarg("pos",int),batarg("len",int),arg("c",flt),arg("k",int))),
    1054             :  command("txtsim", "str2qgrams", CMDstr2qgrams, false, "Break the string into 4-grams", args(1,2, batarg("",str),arg("s",str))),
    1055             :  { .imp=NULL }
    1056             : };
    1057             : #include "mal_import.h"
    1058             : #ifdef _MSC_VER
    1059             : #undef read
    1060             : #pragma section(".CRT$XCU",read)
    1061             : #endif
    1062         255 : LIB_STARTUP_FUNC(init_txtsim_mal)
    1063         255 : { mal_module("txtsim", NULL, txtsim_init_funcs); }

Generated by: LCOV version 1.14