Line data Source code
1 : /*
2 : * This Source Code Form is subject to the terms of the Mozilla Public
3 : * License, v. 2.0. If a copy of the MPL was not distributed with this
4 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
5 : *
6 : * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V.
7 : */
8 :
9 : /*
10 : * @f txtsim
11 : * @t Module providing similarity metrics for strings
12 : * @a Romulo Goncalves (from M4 to M5)
13 : * @d 01/12/2007
14 : * @v 0.1
15 : *
16 : * @+ String metrics
17 : *
18 : * Provides basic similarity metrics for strings.
19 : *
20 : */
21 : #include "monetdb_config.h"
22 : #include "mal.h"
23 : #include <string.h>
24 : #include "gdk.h"
25 : #include "str.h"
26 : #include <limits.h>
27 : #include "mal_exception.h"
28 :
29 :
30 : #define RETURN_NIL_IF(b,t) \
31 : if (b) {\
32 : if (ATOMextern(t)) {\
33 : *(ptr*) res = (ptr) ATOMnil(t);\
34 : if ( *(ptr *) res == NULL)\
35 : throw(MAL,"txtsim", SQLSTATE(HY013) MAL_MALLOC_FAIL);\
36 : } else {\
37 : memcpy(res, ATOMnilptr(t), ATOMsize(t));\
38 : }\
39 : return MAL_SUCCEED; \
40 : }
41 :
42 : /* =========================================================================
43 : * LEVENSH?TEIN FUNCTION
44 : * Source:
45 : * http://www.merriampark.com/ld.htm
46 : * =========================================================================
47 : */
48 :
49 : #define MYMIN(x,y) ( (x<y) ? x : y )
50 : #define SMALLEST_OF(x,y,z) ( MYMIN(MYMIN(x,y),z) )
51 : #define SMALLEST_OF4(x,y,z,z2) ( MYMIN(MYMIN(MYMIN(x,y),z),z2) )
52 :
53 : /***************************************************
54 : * Get a pointer to the specified cell of the matrix
55 : **************************************************/
56 :
57 : static inline int *
58 : levenshtein_GetCellPointer(int *pOrigin, int col, int row, int nCols)
59 : {
60 845 : return pOrigin + col + (row * (nCols + 1));
61 : }
62 :
63 : /******************************************************
64 : * Get the contents of the specified cell in the matrix
65 : *****************************************************/
66 :
67 : static inline int
68 : levenshtein_GetAt(int *pOrigin, int col, int row, int nCols)
69 : {
70 : int *pCell;
71 :
72 : pCell = levenshtein_GetCellPointer(pOrigin, col, row, nCols);
73 493 : return *pCell;
74 :
75 : }
76 :
77 : /********************************************************
78 : * Fill the specified cell in the matrix with the value x
79 : *******************************************************/
80 :
81 : static inline void
82 : levenshtein_PutAt(int *pOrigin, int col, int row, int nCols, int x)
83 : {
84 : int *pCell;
85 :
86 : pCell = levenshtein_GetCellPointer(pOrigin, col, row, nCols);
87 352 : *pCell = x;
88 : }
89 :
90 :
91 : /******************************
92 : * Compute Levenshtein distance
93 : *****************************/
94 : static str
95 33 : levenshtein_impl(int *result, str *S, str *T, int *insdel_cost, int *replace_cost, int *transpose_cost)
96 : {
97 33 : char *s = *S;
98 33 : char *t = *T;
99 : int *d; /* pointer to matrix */
100 : int n; /* length of s */
101 : int m; /* length of t */
102 : int i; /* iterates through s */
103 : int j; /* iterates through t */
104 : char s_i; /* ith character of s */
105 : char t_j; /* jth character of t */
106 : int cost; /* cost */
107 : int cell; /* contents of target cell */
108 : int above; /* contents of cell immediately above */
109 : int left; /* contents of cell immediately to left */
110 : int diag; /* contents of cell immediately above and to left */
111 : int sz; /* number of cells in matrix */
112 : int diag2 = 0, cost2 = 0;
113 :
114 58 : if (strNil(*S) || strNil(*T)) {
115 12 : *result = int_nil;
116 12 : return MAL_SUCCEED;
117 : }
118 :
119 : /* Step 1 */
120 21 : n = (int) strlen(s); /* 64bit: assume strings are less than 2 GB */
121 21 : m = (int) strlen(t);
122 21 : if (n == 0) {
123 8 : *result = m;
124 8 : return MAL_SUCCEED;
125 : }
126 13 : if (m == 0) {
127 8 : *result = n;
128 8 : return MAL_SUCCEED;
129 : }
130 5 : sz = (n + 1) * (m + 1) * sizeof(int);
131 5 : d = (int *) GDKmalloc(sz);
132 5 : if ( d == NULL)
133 0 : throw(MAL,"levenshtein", SQLSTATE(HY013) MAL_MALLOC_FAIL);
134 :
135 : /* Step 2 */
136 45 : for (i = 0; i <= n; i++) {
137 : levenshtein_PutAt(d, i, 0, n, i);
138 : }
139 :
140 42 : for (j = 0; j <= m; j++) {
141 : levenshtein_PutAt(d, 0, j, n, j);
142 : }
143 :
144 : /* Step 3 */
145 40 : for (i = 1; i <= n; i++) {
146 :
147 35 : s_i = s[i - 1];
148 :
149 : /* Step 4 */
150 310 : for (j = 1; j <= m; j++) {
151 :
152 275 : t_j = t[j - 1];
153 :
154 : /* Step 5 */
155 275 : if (s_i == t_j) {
156 : cost = 0;
157 : } else {
158 232 : cost = *replace_cost;
159 : }
160 :
161 : /* Step 6 */
162 275 : above = levenshtein_GetAt(d, i - 1, j, n);
163 275 : left = levenshtein_GetAt(d, i, j - 1, n);
164 : diag = levenshtein_GetAt(d, i - 1, j - 1, n);
165 :
166 275 : if (j >= 2 && i >= 2) {
167 : /* NEW: detect transpositions */
168 :
169 213 : diag2 = levenshtein_GetAt(d, i - 2, j - 2, n);
170 213 : if (s[i - 2] == t[j - 1] && s[i - 1] == t[j - 2]) {
171 4 : cost2 = *transpose_cost;
172 : } else {
173 : cost2 = 2;
174 : }
175 213 : cell = SMALLEST_OF4(above + *insdel_cost, left + *insdel_cost, diag + cost, diag2 + cost2);
176 : } else {
177 62 : cell = SMALLEST_OF(above + *insdel_cost, left + *insdel_cost, diag + cost);
178 : }
179 : levenshtein_PutAt(d, i, j, n, cell);
180 : }
181 : }
182 :
183 : /* Step 7 */
184 5 : *result = levenshtein_GetAt(d, n, m, n);
185 5 : GDKfree(d);
186 5 : return MAL_SUCCEED;
187 : }
188 :
189 : static str
190 20 : levenshteinbasic_impl(int *result, str *s, str *t)
191 : {
192 20 : int insdel = 1, replace = 1, transpose = 2;
193 :
194 20 : return levenshtein_impl(result, s, t, &insdel, &replace, &transpose);
195 : }
196 :
197 : static str
198 8 : levenshteinbasic2_impl(int *result, str *s, str *t)
199 : {
200 8 : int insdel = 1, replace = 1, transpose = 1;
201 :
202 8 : return levenshtein_impl(result, s, t, &insdel, &replace, &transpose);
203 : }
204 :
205 :
206 : /* =========================================================================
207 : * SOUNDEX FUNCTION
208 : * Source:
209 : * http://www.mit.edu/afs/sipb/service/rtfm/src/freeWAIS-sf-1.0/ir/soundex.c
210 : * =========================================================================
211 : */
212 :
213 : #define SoundexLen 4 /* length of a soundex code */
214 : #define SoundexKey "Z000" /* default key for soundex code */
215 :
216 : /* set letter values */
217 : static int Code[] = { 0, 1, 2, 3, 0, 1, 2, 0, 0, 2, 2, 4, 5, 5, 0,
218 : 1, 2, 6, 2, 3, 0, 1, 0, 2, 0, 2
219 : };
220 :
221 : static inline char
222 34 : SCode(unsigned char c)
223 : {
224 34 : if (c == 95)
225 : return (2); /* german sz */
226 34 : return (Code[toupper(c) - 'A']);
227 : }
228 :
229 : static str
230 20 : soundex_code(const char *Name, char *Key)
231 : {
232 : char LastLetter;
233 : int Index;
234 :
235 70 : for (const char *p = Name; *p; p++)
236 50 : if ((*p & 0x80) != 0)
237 0 : throw(MAL,"soundex", SQLSTATE(42000) "Soundex function not available for non ASCII strings");
238 :
239 : /* set default key */
240 20 : strcpy(Key, SoundexKey);
241 :
242 : /* keep first letter */
243 20 : Key[0] = *Name;
244 20 : if (!isupper((unsigned char) (Key[0])))
245 20 : Key[0] = toupper(Key[0]);
246 :
247 20 : LastLetter = *Name;
248 20 : if (!*Name)
249 : return MAL_SUCCEED;
250 10 : Name++;
251 :
252 : /* scan rest of string */
253 48 : for (Index = 1; (Index <SoundexLen) &&*Name; Name++) {
254 : /* use only letters */
255 38 : if (isalpha((unsigned char) (*Name))) {
256 : /* ignore duplicate successive chars */
257 38 : if (LastLetter != *Name) {
258 : /* new LastLetter */
259 : LastLetter = *Name;
260 :
261 : /* ignore letters with code 0 */
262 34 : if (SCode(*Name) != 0) {
263 22 : Key[Index] = '0' + SCode(*Name);
264 22 : Index ++;
265 : }
266 : }
267 : }
268 : }
269 : return MAL_SUCCEED;
270 : }
271 :
272 : static str
273 24 : soundex_impl(str *res, str *Name)
274 : {
275 : str msg = MAL_SUCCEED;
276 :
277 24 : GDKfree(*res);
278 48 : RETURN_NIL_IF(strNil(*Name), TYPE_str);
279 :
280 20 : *res = (str) GDKmalloc(sizeof(char) * (SoundexLen + 1));
281 20 : if( *res == NULL)
282 0 : throw(MAL,"soundex", SQLSTATE(HY013) MAL_MALLOC_FAIL);
283 :
284 : /* calculate Key for Name */
285 20 : if ((msg = soundex_code(*Name, *res))) {
286 0 : GDKfree(*res);
287 0 : *res = NULL;
288 0 : return msg;
289 : }
290 : return msg;
291 : }
292 :
293 : static str
294 8 : stringdiff_impl(int *res, str *s1, str *s2)
295 : {
296 : str r = MAL_SUCCEED;
297 8 : char *S1 = NULL, *S2 = NULL;
298 :
299 8 : r = soundex_impl(&S1, s1);
300 8 : if( r != MAL_SUCCEED)
301 : return r;
302 8 : r = soundex_impl(&S2, s2);
303 8 : if( r != MAL_SUCCEED){
304 0 : GDKfree(S1);
305 0 : return r;
306 : }
307 8 : r = levenshteinbasic_impl(res, &S1, &S2);
308 8 : GDKfree(S1);
309 8 : GDKfree(S2);
310 8 : return r;
311 : }
312 :
313 : /******************************
314 : * QGRAMNORMALIZE
315 : *
316 : * This function 'normalizes' a string so valid q-grams can be made of it:
317 : * All characters are transformed to uppercase, and all characters
318 : * which are not letters or digits are stripped to a single space.
319 : *
320 : * qgramnormalize("Hallo, allemaal!").print(); --> "HALLO ALLEMAAL"
321 : * qgramnormalize(" '' t ' est").print(); --> [ "T EST" ]
322 : *
323 : *****************************/
324 : static str
325 12 : CMDqgramnormalize(str *res, str *Input)
326 : {
327 12 : char *input = *Input;
328 : int i, j = 0;
329 : char c, last = ' ';
330 :
331 12 : GDKfree(*res);
332 12 : RETURN_NIL_IF(strNil(input), TYPE_str);
333 12 : *res = (str) GDKmalloc(sizeof(char) * (strlen(input) + 1)); /* normalized strings are never longer than original */
334 12 : if (*res == NULL)
335 0 : throw(MAL, "txtsim.qgramnormalize", SQLSTATE(HY013) MAL_MALLOC_FAIL);
336 :
337 87 : for (i = 0; input[i]; i++) {
338 75 : c = toupper(input[i]);
339 75 : if (!(('A' <= c && c <= 'Z') || isdigit((unsigned char) c)))
340 : c = ' ';
341 75 : if (c != ' ' || last != ' ') {
342 69 : (*res)[j++] = c;
343 : }
344 : last = c;
345 : }
346 12 : (*res)[j] = 0;
347 : /* strip final whitespace */
348 14 : while (j > 0 && (*res)[--j] == ' ')
349 2 : (*res)[j] = 0;
350 :
351 : return MAL_SUCCEED;
352 : }
353 :
354 : /* =========================================================================
355 : * FSTRCMP FUNCTION
356 : * Source:
357 : * http://search.cpan.org/src/MLEHMANN/String-Similarity-1/fstrcmp.c
358 : * =========================================================================
359 : */
360 :
361 : /*
362 : * Data on one input string being compared.
363 : */
364 : struct string_data {
365 : /* The string to be compared. */
366 : const char *data;
367 :
368 : /* The length of the string to be compared. */
369 : int data_length;
370 :
371 : /* The number of characters inserted or deleted. */
372 : int edit_count;
373 : };
374 :
375 : struct partition {
376 : /* Midpoints of this partition. */
377 : int xmid, ymid;
378 :
379 : /* Nonzero if low half will be analyzed minimally. */
380 : int lo_minimal;
381 :
382 : /* Likewise for high half. */
383 : int hi_minimal;
384 : };
385 :
386 : /* NAME
387 : diag - find diagonal path
388 :
389 : SYNOPSIS
390 : int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
391 : struct partition *part);
392 :
393 : DESCRIPTION
394 : Find the midpoint of the shortest edit script for a specified
395 : portion of the two strings.
396 :
397 : Scan from the beginnings of the strings, and simultaneously from
398 : the ends, doing a breadth-first search through the space of
399 : edit-sequence. When the two searches meet, we have found the
400 : midpoint of the shortest edit sequence.
401 :
402 : If MINIMAL is nonzero, find the minimal edit script regardless
403 : of expense. Otherwise, if the search is too expensive, use
404 : heuristics to stop the search and report a suboptimal answer.
405 :
406 : RETURNS
407 : Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal
408 : number XMID - YMID equals the number of inserted characters
409 : minus the number of deleted characters (counting only characters
410 : before the midpoint). Return the approximate edit cost; this is
411 : the total number of characters inserted or deleted (counting
412 : only characters before the midpoint), unless a heuristic is used
413 : to terminate the search prematurely.
414 :
415 : Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
416 : for the left half of the partition is known; similarly for
417 : PART->RIGHT_MINIMAL.
418 :
419 : CAVEAT
420 : This function assumes that the first characters of the specified
421 : portions of the two strings do not match, and likewise that the
422 : last characters do not match. The caller must trim matching
423 : characters from the beginning and end of the portions it is
424 : going to specify.
425 :
426 : If we return the "wrong" partitions, the worst this can do is
427 : cause suboptimal diff output. It cannot cause incorrect diff
428 : output. */
429 :
430 : static inline int
431 0 : diag(int xoff, int xlim, int yoff, int ylim, int minimal, struct partition *part, int too_expensive, struct string_data *string, int *fdiag, int *bdiag)
432 : {
433 : int *const fd = fdiag; /* Give the compiler a chance. */
434 : int *const bd = bdiag; /* Additional help for the compiler. */
435 0 : const char *const xv = string[0].data; /* Still more help for the compiler. */
436 0 : const char *const yv = string[1].data; /* And more and more . . . */
437 0 : const int dmin = xoff - ylim; /* Minimum valid diagonal. */
438 0 : const int dmax = xlim - yoff; /* Maximum valid diagonal. */
439 0 : const int fmid = xoff - yoff; /* Center diagonal of top-down search. */
440 0 : const int bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
441 : int fmin = fmid;
442 : int fmax = fmid; /* Limits of top-down search. */
443 : int bmin = bmid;
444 : int bmax = bmid; /* Limits of bottom-up search. */
445 : int c; /* Cost. */
446 0 : int odd = (fmid - bmid) & 1;
447 :
448 : /*
449 : * True if southeast corner is on an odd diagonal with respect
450 : * to the northwest.
451 : */
452 0 : fd[fmid] = xoff;
453 0 : bd[bmid] = xlim;
454 0 : for (c = 1;; ++c) {
455 : int d; /* Active diagonal. */
456 :
457 : /* Extend the top-down search by an edit step in each diagonal. */
458 0 : if (fmin > dmin)
459 0 : fd[--fmin - 1] = -1;
460 : else
461 0 : ++fmin;
462 0 : if (fmax < dmax)
463 0 : fd[++fmax + 1] = -1;
464 : else
465 0 : --fmax;
466 0 : for (d = fmax; d >= fmin; d -= 2) {
467 : int x;
468 : int y;
469 : int tlo;
470 : int thi;
471 :
472 0 : tlo = fd[d - 1], thi = fd[d + 1];
473 :
474 0 : if (tlo >= thi)
475 0 : x = tlo + 1;
476 : else
477 : x = thi;
478 0 : y = x - d;
479 0 : while (x < xlim && y < ylim && xv[x] == yv[y]) {
480 0 : ++x;
481 0 : ++y;
482 : }
483 0 : fd[d] = x;
484 0 : if (odd && bmin <= d && d <= bmax && bd[d] <= x) {
485 0 : part->xmid = x;
486 0 : part->ymid = y;
487 0 : part->lo_minimal = part->hi_minimal = 1;
488 0 : return 2 * c - 1;
489 : }
490 : }
491 : /* Similarly extend the bottom-up search. */
492 0 : if (bmin > dmin)
493 0 : bd[--bmin - 1] = INT_MAX;
494 : else
495 0 : ++bmin;
496 0 : if (bmax < dmax)
497 0 : bd[++bmax + 1] = INT_MAX;
498 : else
499 0 : --bmax;
500 0 : for (d = bmax; d >= bmin; d -= 2) {
501 : int x;
502 : int y;
503 : int tlo;
504 : int thi;
505 :
506 0 : tlo = bd[d - 1], thi = bd[d + 1];
507 0 : if (tlo < thi)
508 : x = tlo;
509 : else
510 0 : x = thi - 1;
511 0 : y = x - d;
512 0 : while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) {
513 0 : --x;
514 0 : --y;
515 : }
516 0 : bd[d] = x;
517 0 : if (!odd && fmin <= d && d <= fmax && x <= fd[d]) {
518 0 : part->xmid = x;
519 0 : part->ymid = y;
520 0 : part->lo_minimal = part->hi_minimal = 1;
521 0 : return 2 * c;
522 : }
523 : }
524 :
525 0 : if (minimal)
526 0 : continue;
527 :
528 : /* Heuristic: if we've gone well beyond the call of duty, give up
529 : and report halfway between our best results so far. */
530 0 : if (c >= too_expensive) {
531 : int fxybest;
532 : int fxbest;
533 : int bxybest;
534 : int bxbest;
535 :
536 : /* Pacify `gcc -Wall'. */
537 : fxbest = 0;
538 : bxbest = 0;
539 :
540 : /* Find forward diagonal that maximizes X + Y. */
541 : fxybest = -1;
542 0 : for (d = fmax; d >= fmin; d -= 2) {
543 : int x;
544 : int y;
545 :
546 0 : x = fd[d] < xlim ? fd[d] : xlim;
547 0 : y = x - d;
548 :
549 0 : if (ylim < y) {
550 0 : x = ylim + d;
551 : y = ylim;
552 : }
553 0 : if (fxybest < x + y) {
554 : fxybest = x + y;
555 : fxbest = x;
556 : }
557 : }
558 : /* Find backward diagonal that minimizes X + Y. */
559 : bxybest = INT_MAX;
560 0 : for (d = bmax; d >= bmin; d -= 2) {
561 : int x;
562 : int y;
563 :
564 0 : x = xoff > bd[d] ? xoff : bd[d];
565 0 : y = x - d;
566 :
567 0 : if (y < yoff) {
568 0 : x = yoff + d;
569 : y = yoff;
570 : }
571 0 : if (x + y < bxybest) {
572 : bxybest = x + y;
573 : bxbest = x;
574 : }
575 : }
576 : /* Use the better of the two diagonals. */
577 0 : if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) {
578 0 : part->xmid = fxbest;
579 0 : part->ymid = fxybest - fxbest;
580 0 : part->lo_minimal = 1;
581 0 : part->hi_minimal = 0;
582 : } else {
583 0 : part->xmid = bxbest;
584 0 : part->ymid = bxybest - bxbest;
585 0 : part->lo_minimal = 0;
586 0 : part->hi_minimal = 1;
587 : }
588 0 : return 2 * c - 1;
589 : }
590 : }
591 : }
592 :
593 :
594 : /* NAME
595 : compareseq - find edit sequence
596 :
597 : SYNOPSIS
598 : void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal);
599 :
600 : DESCRIPTION
601 : Compare in detail contiguous subsequences of the two strings
602 : which are known, as a whole, to match each other.
603 :
604 : The subsequence of string 0 is [XOFF, XLIM) and likewise for
605 : string 1.
606 :
607 : Note that XLIM, YLIM are exclusive bounds. All character
608 : numbers are origin-0.
609 :
610 : If MINIMAL is nonzero, find a minimal difference no matter how
611 : expensive it is. */
612 :
613 : static inline void
614 399841 : compareseq(int xoff, int xlim, int yoff, int ylim, int minimal, int max_edits, int too_expensive, struct string_data *string, int *fdiag, int *bdiag) /* compareseq stops when edits > max_edits */
615 : {
616 399841 : const char *const xv = string[0].data; /* Help the compiler. */
617 399841 : const char *const yv = string[1].data;
618 :
619 399841 : if (string[1].edit_count + string[0].edit_count > max_edits)
620 : return;
621 :
622 : /* Slide down the bottom initial diagonal. */
623 2576608 : while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff]) {
624 2176767 : ++xoff;
625 2176767 : ++yoff;
626 : }
627 :
628 : /* Slide up the top initial diagonal. */
629 399846 : while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1]) {
630 5 : --xlim;
631 5 : --ylim;
632 : }
633 :
634 : /* Handle simple cases. */
635 399841 : if (xoff == xlim) {
636 399840 : while (yoff < ylim) {
637 0 : ++string[1].edit_count;
638 0 : ++yoff;
639 : }
640 1 : } else if (yoff == ylim) {
641 2 : while (xoff < xlim) {
642 1 : ++string[0].edit_count;
643 1 : ++xoff;
644 : }
645 : } else {
646 : int c;
647 : struct partition part;
648 :
649 : /* Find a point of correspondence in the middle of the strings. */
650 0 : c = diag(xoff, xlim, yoff, ylim, minimal, &part, too_expensive, string, fdiag, bdiag);
651 0 : if (c == 1) {
652 : /* The two subsequences differ by a single insert or delete;
653 : record it and we are done. */
654 0 : if (part.xmid - part.ymid < xoff - yoff)
655 0 : ++string[1].edit_count;
656 : else
657 0 : ++string[0].edit_count;
658 : } else {
659 : /* Use the partitions to split this problem into subproblems. */
660 0 : compareseq(xoff, part.xmid, yoff, part.ymid, part.lo_minimal, max_edits, too_expensive, string, fdiag, bdiag);
661 0 : compareseq(part.xmid, xlim, part.ymid, ylim, part.hi_minimal, max_edits, too_expensive, string, fdiag, bdiag);
662 : }
663 : }
664 : }
665 :
666 : /* NAME
667 : fstrcmp - fuzzy string compare
668 :
669 : SYNOPSIS
670 : double fstrcmp(const char *s1, int l1, const char *s2, int l2, double);
671 :
672 : DESCRIPTION
673 : The fstrcmp function may be used to compare two string for
674 : similarity. It is very useful in reducing "cascade" or
675 : "secondary" errors in compilers or other situations where
676 : symbol tables occur.
677 :
678 : RETURNS
679 : double; 0 if the strings are entirly dissimilar, 1 if the
680 : strings are identical, and a number in between if they are
681 : similar. */
682 :
683 : #define INITIAL_INT_BUFFER_LENGTH 2048
684 :
685 : #define CHECK_INT_BUFFER_LENGTH(BUFFER, BUFFER_LEN, NEXT_LEN, OP) \
686 : do { \
687 : if ((NEXT_LEN) > *BUFFER_LEN) { \
688 : size_t newlen = (((NEXT_LEN) + 1023) & ~1023); /* align to a multiple of 1024 bytes */ \
689 : int *newbuf = GDKmalloc(newlen); \
690 : if (!newbuf) \
691 : throw(MAL, OP, SQLSTATE(HY013) MAL_MALLOC_FAIL); \
692 : GDKfree(*BUFFER); \
693 : *BUFFER = newbuf; \
694 : *BUFFER_LEN = newlen; \
695 : } \
696 : } while (0)
697 :
698 : static str
699 399763 : fstrcmp_impl_internal(dbl *ret, int **fdiag_buf, size_t *fdiag_buflen, str string1, str string2, dbl minimum)
700 : {
701 : int i, max_edits, *fdiag, *bdiag, too_expensive = 1;
702 : size_t fdiag_len;
703 : struct string_data string[2];
704 :
705 : /* set the info for each string. */
706 399763 : string[0].data = string1;
707 399763 : string[0].data_length = (int) strlen(string1); /* 64bit: assume string not too long */
708 399763 : string[1].data = string2;
709 399763 : string[1].data_length = (int) strlen(string2); /* 64bit: assume string not too long */
710 :
711 : /* short-circuit obvious comparisons */
712 399763 : if (string[0].data_length == 0 && string[1].data_length == 0) {
713 0 : *ret = 1.0;
714 0 : return MAL_SUCCEED;
715 : }
716 399763 : if (string[0].data_length == 0 || string[1].data_length == 0) {
717 0 : *ret = 0.0;
718 0 : return MAL_SUCCEED;
719 : }
720 :
721 : /* Set TOO_EXPENSIVE to be approximate square root of input size,
722 : bounded below by 256. */
723 1199219 : for (i = string[0].data_length + string[1].data_length; i != 0; i >>= 2)
724 799456 : too_expensive <<= 1;
725 : if (too_expensive < 256)
726 : too_expensive = 256;
727 :
728 : /* Because fstrcmp is typically called multiple times, while scanning
729 : symbol tables, etc, attempt to minimize the number of memory
730 : allocations performed. Thus, we use a static buffer for the
731 : diagonal vectors, and never free them. */
732 399763 : fdiag_len = string[0].data_length + string[1].data_length + 3;
733 399763 : CHECK_INT_BUFFER_LENGTH(fdiag_buf, fdiag_buflen, fdiag_len * 2 * sizeof(int), "txtsim.similarity");
734 399763 : fdiag = *fdiag_buf + string[1].data_length + 1;
735 399763 : bdiag = fdiag + fdiag_len;
736 :
737 399763 : max_edits = 1 + (int) ((string[0].data_length + string[1].data_length) * (1. - minimum));
738 :
739 : /* Now do the main comparison algorithm */
740 399763 : string[0].edit_count = 0;
741 399763 : string[1].edit_count = 0;
742 399763 : compareseq(0, string[0].data_length, 0, string[1].data_length, 0, max_edits, too_expensive, string, fdiag, bdiag);
743 :
744 : /* The result is
745 : ((number of chars in common) / (average length of the strings)).
746 : This is admittedly biased towards finding that the strings are
747 : similar, however it does produce meaningful results. */
748 399822 : *ret = ((double)
749 399822 : (string[0].data_length + string[1].data_length - string[1].edit_count - string[0].edit_count)
750 399822 : / (string[0].data_length + string[1].data_length));
751 399822 : return MAL_SUCCEED;
752 : }
753 :
754 : static str
755 0 : fstrcmp_impl(dbl *ret, str *string1, str *string2, dbl *minimum)
756 : {
757 0 : str s1 = *string1, s2 = *string2;
758 0 : dbl min = *minimum;
759 :
760 0 : if (strNil(s1) || strNil(s2) || is_dbl_nil(min)) {
761 0 : *ret = dbl_nil;
762 0 : return MAL_SUCCEED;
763 : } else {
764 : str msg = MAL_SUCCEED;
765 0 : int *fdiag_buf = NULL;
766 0 : size_t fdiag_buflen = INITIAL_INT_BUFFER_LENGTH;
767 :
768 0 : if (!(fdiag_buf = GDKmalloc(fdiag_buflen)))
769 0 : throw(MAL, "txtsim.similarity", SQLSTATE(HY013) MAL_MALLOC_FAIL);
770 0 : msg = fstrcmp_impl_internal(ret, &fdiag_buf, &fdiag_buflen, s1, s1, min);
771 0 : GDKfree(fdiag_buf);
772 0 : return msg;
773 : }
774 : }
775 :
776 : static str
777 4 : fstrcmp0_impl(dbl *ret, str *string1, str *string2)
778 : {
779 4 : str s1 = *string1, s2 = *string2;
780 :
781 6 : if (strNil(s1) || strNil(s2)) {
782 3 : *ret = dbl_nil;
783 3 : return MAL_SUCCEED;
784 : } else {
785 : str msg = MAL_SUCCEED;
786 1 : int *fdiag_buf = NULL;
787 1 : size_t fdiag_buflen = INITIAL_INT_BUFFER_LENGTH;
788 :
789 1 : if (!(fdiag_buf = GDKmalloc(fdiag_buflen)))
790 0 : throw(MAL, "txtsim.similarity", SQLSTATE(HY013) MAL_MALLOC_FAIL);
791 1 : msg = fstrcmp_impl_internal(ret, &fdiag_buf, &fdiag_buflen, s1, s2, 0.0);
792 1 : GDKfree(fdiag_buf);
793 1 : return msg;
794 : }
795 : }
796 :
797 : static str
798 16 : fstrcmp0_impl_bulk(bat *res, bat *strings1, bat *strings2)
799 : {
800 : BATiter lefti, righti;
801 : BAT *bn = NULL, *left = NULL, *right = NULL;
802 : BUN q = 0;
803 16 : size_t fdiag_buflen = INITIAL_INT_BUFFER_LENGTH;
804 : str x, y, msg = MAL_SUCCEED;
805 : bool nils = false;
806 : dbl *restrict vals;
807 16 : int *fdiag_buf = GDKmalloc(fdiag_buflen);
808 :
809 16 : if (!fdiag_buf) {
810 0 : msg = createException(MAL, "txtsim.similarity", SQLSTATE(HY013) MAL_MALLOC_FAIL);
811 0 : goto bailout;
812 : }
813 16 : if (!(left = BATdescriptor(*strings1)) || !(right = BATdescriptor(*strings2))) {
814 0 : msg = createException(MAL, "txtsim.similarity", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
815 0 : goto bailout;
816 : }
817 16 : q = BATcount(left);
818 16 : if (!(bn = COLnew(left->hseqbase, TYPE_dbl, q, TRANSIENT))) {
819 0 : msg = createException(MAL, "txtsim.similarity", SQLSTATE(HY013) MAL_MALLOC_FAIL);
820 0 : goto bailout;
821 : }
822 :
823 16 : lefti = bat_iterator(left);
824 16 : righti = bat_iterator(right);
825 16 : vals = Tloc(bn, 0);
826 399814 : for (BUN i = 0; i < q && !msg; i++) {
827 399798 : x = (str) BUNtvar(lefti, i);
828 399798 : y = (str) BUNtvar(righti, i);
829 :
830 799593 : if (strNil(x) || strNil(y)) {
831 11 : vals[i] = dbl_nil;
832 11 : nils = true;
833 : } else {
834 399787 : msg = fstrcmp_impl_internal(&vals[i], &fdiag_buf, &fdiag_buflen, x, y, 0.0);
835 : }
836 : }
837 16 : bat_iterator_end(&lefti);
838 16 : bat_iterator_end(&righti);
839 :
840 16 : bailout:
841 16 : GDKfree(fdiag_buf);
842 16 : if (bn && !msg) {
843 16 : BATsetcount(bn, q);
844 16 : bn->tnil = nils;
845 16 : bn->tnonil = !nils;
846 16 : bn->tkey = BATcount(bn) <= 1;
847 16 : bn->tsorted = BATcount(bn) <= 1;
848 16 : bn->trevsorted = BATcount(bn) <= 1;
849 16 : BBPkeepref(*res = bn->batCacheid);
850 0 : } else if (bn)
851 0 : BBPreclaim(bn);
852 16 : if (left)
853 16 : BBPunfix(left->batCacheid);
854 16 : if (right)
855 16 : BBPunfix(right->batCacheid);
856 16 : return msg;
857 : }
858 :
859 :
860 : /* ============ Q-GRAM SELF JOIN ============== */
861 :
862 : static str
863 0 : CMDqgramselfjoin(bat *res1, bat *res2, bat *qid, bat *bid, bat *pid, bat *lid, flt *c, int *k)
864 : {
865 : BAT *qgram, *id, *pos, *len;
866 : BUN n;
867 : BUN i, j;
868 : BAT *bn, *bn2;
869 : oid *qbuf;
870 : int *ibuf;
871 : int *pbuf;
872 : int *lbuf;
873 : str msg = MAL_SUCCEED;
874 :
875 0 : qgram = BATdescriptor(*qid);
876 0 : id = BATdescriptor(*bid);
877 0 : pos = BATdescriptor(*pid);
878 0 : len = BATdescriptor(*lid);
879 0 : if (qgram == NULL || id == NULL || pos == NULL || len == NULL) {
880 0 : if (qgram)
881 0 : BBPunfix(qgram->batCacheid);
882 0 : if (id)
883 0 : BBPunfix(id->batCacheid);
884 0 : if (pos)
885 0 : BBPunfix(pos->batCacheid);
886 0 : if (len)
887 0 : BBPunfix(len->batCacheid);
888 0 : throw(MAL, "txtsim.qgramselfjoin", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
889 : }
890 :
891 0 : if (qgram->ttype != TYPE_oid)
892 0 : msg = createException(MAL, "tstsim.qgramselfjoin",
893 : SEMANTIC_TYPE_MISMATCH ": tail of BAT qgram must be oid");
894 0 : else if (id->ttype != TYPE_int)
895 0 : msg = createException(MAL, "tstsim.qgramselfjoin",
896 : SEMANTIC_TYPE_MISMATCH ": tail of BAT id must be int");
897 0 : else if (pos->ttype != TYPE_int)
898 0 : msg = createException(MAL, "tstsim.qgramselfjoin",
899 : SEMANTIC_TYPE_MISMATCH ": tail of BAT pos must be int");
900 0 : else if (len->ttype != TYPE_int)
901 0 : msg = createException(MAL, "tstsim.qgramselfjoin",
902 : SEMANTIC_TYPE_MISMATCH ": tail of BAT len must be int");
903 : if (msg) {
904 0 : BBPunfix(qgram->batCacheid);
905 0 : BBPunfix(id->batCacheid);
906 0 : BBPunfix(pos->batCacheid);
907 0 : BBPunfix(len->batCacheid);
908 0 : return msg;
909 : }
910 :
911 0 : n = BATcount(qgram);
912 :
913 : /* if (BATcount(qgram)>1 && !BATtordered(qgram)) throw(MAL, "tstsim.qgramselfjoin", SEMANTIC_TYPE_MISMATCH); */
914 :
915 0 : if (!ALIGNsynced(qgram, id))
916 0 : msg = createException(MAL, "tstsim.qgramselfjoin",
917 : SEMANTIC_TYPE_MISMATCH ": qgram and id are not synced");
918 :
919 0 : else if (!ALIGNsynced(qgram, pos))
920 0 : msg = createException(MAL, "tstsim.qgramselfjoin",
921 : SEMANTIC_TYPE_MISMATCH ": qgram and pos are not synced");
922 0 : else if (!ALIGNsynced(qgram, len))
923 0 : msg = createException(MAL, "tstsim.qgramselfjoin",
924 : SEMANTIC_TYPE_MISMATCH ": qgram and len are not synced");
925 :
926 0 : else if (Tsize(qgram) != ATOMsize(qgram->ttype))
927 0 : msg = createException(MAL, "tstsim.qgramselfjoin",
928 : SEMANTIC_TYPE_MISMATCH ": qgram is not a true void bat");
929 0 : else if (Tsize(id) != ATOMsize(id->ttype))
930 0 : msg = createException(MAL, "tstsim.qgramselfjoin",
931 : SEMANTIC_TYPE_MISMATCH ": id is not a true void bat");
932 :
933 0 : else if (Tsize(pos) != ATOMsize(pos->ttype))
934 0 : msg = createException(MAL, "tstsim.qgramselfjoin",
935 : SEMANTIC_TYPE_MISMATCH ": pos is not a true void bat");
936 0 : else if (Tsize(len) != ATOMsize(len->ttype))
937 0 : msg = createException(MAL, "tstsim.qgramselfjoin",
938 : SEMANTIC_TYPE_MISMATCH ": len is not a true void bat");
939 0 : if (msg) {
940 0 : BBPunfix(qgram->batCacheid);
941 0 : BBPunfix(id->batCacheid);
942 0 : BBPunfix(pos->batCacheid);
943 0 : BBPunfix(len->batCacheid);
944 0 : return msg;
945 : }
946 :
947 0 : bn = COLnew(0, TYPE_int, n, TRANSIENT);
948 0 : bn2 = COLnew(0, TYPE_int, n, TRANSIENT);
949 0 : if (bn == NULL || bn2 == NULL){
950 0 : BBPreclaim(bn);
951 0 : BBPreclaim(bn2);
952 0 : BBPunfix(qgram->batCacheid);
953 0 : BBPunfix(id->batCacheid);
954 0 : BBPunfix(pos->batCacheid);
955 0 : BBPunfix(len->batCacheid);
956 0 : throw(MAL, "txtsim.qgramselfjoin", SQLSTATE(HY013) MAL_MALLOC_FAIL);
957 : }
958 :
959 0 : BATiter qgrami = bat_iterator(qgram);
960 0 : BATiter idi = bat_iterator(id);
961 0 : BATiter posi = bat_iterator(pos);
962 0 : BATiter leni = bat_iterator(len);
963 0 : qbuf = (oid *) qgrami.base;
964 0 : ibuf = (int *) idi.base;
965 0 : pbuf = (int *) posi.base;
966 0 : lbuf = (int *) leni.base;
967 0 : for (i = 0; i < n - 1; i++) {
968 0 : for (j = i + 1; (j < n && qbuf[j] == qbuf[i] && pbuf[j] <= (pbuf[i] + (*k + *c * MYMIN(lbuf[i], lbuf[j])))); j++) {
969 0 : if (ibuf[i] != ibuf[j] && abs(lbuf[i] - lbuf[j]) <= (*k + *c * MYMIN(lbuf[i], lbuf[j]))) {
970 0 : if (BUNappend(bn, ibuf + i, false) != GDK_SUCCEED ||
971 0 : BUNappend(bn2, ibuf + j, false) != GDK_SUCCEED) {
972 0 : bat_iterator_end(&qgrami);
973 0 : bat_iterator_end(&idi);
974 0 : bat_iterator_end(&posi);
975 0 : bat_iterator_end(&leni);
976 0 : BBPunfix(qgram->batCacheid);
977 0 : BBPunfix(id->batCacheid);
978 0 : BBPunfix(pos->batCacheid);
979 0 : BBPunfix(len->batCacheid);
980 0 : BBPreclaim(bn);
981 0 : BBPreclaim(bn2);
982 0 : throw(MAL, "txtsim.qgramselfjoin", SQLSTATE(HY013) MAL_MALLOC_FAIL);
983 : }
984 : }
985 : }
986 : }
987 0 : bat_iterator_end(&qgrami);
988 0 : bat_iterator_end(&idi);
989 0 : bat_iterator_end(&posi);
990 0 : bat_iterator_end(&leni);
991 :
992 0 : BBPunfix(qgram->batCacheid);
993 0 : BBPunfix(id->batCacheid);
994 0 : BBPunfix(pos->batCacheid);
995 0 : BBPunfix(len->batCacheid);
996 :
997 0 : BBPkeepref(*res1 = bn->batCacheid);
998 0 : BBPkeepref(*res2 = bn2->batCacheid);
999 :
1000 0 : return MAL_SUCCEED;
1001 : }
1002 :
1003 : /* copy up to utf8len UTF-8 encoded characters from src to buf
1004 : * stop early if buf (size given by bufsize) is too small, or if src runs out
1005 : * return number of UTF-8 characters copied (excluding NUL)
1006 : * close with NUL if enough space */
1007 : static size_t
1008 26 : utf8strncpy(char *buf, size_t bufsize, const char *src, size_t utf8len)
1009 : {
1010 : size_t cnt = 0;
1011 :
1012 128 : while (utf8len != 0 && *src != 0 && bufsize != 0) {
1013 102 : bufsize--;
1014 102 : utf8len--;
1015 102 : cnt++;
1016 102 : if (((*buf++ = *src++) & 0x80) != 0) {
1017 40 : while ((*src & 0xC0) == 0x80 && bufsize != 0) {
1018 20 : *buf++ = *src++;
1019 20 : bufsize--;
1020 : }
1021 : }
1022 : }
1023 26 : if (bufsize != 0)
1024 26 : *buf = 0;
1025 26 : return cnt;
1026 : }
1027 :
1028 : static str
1029 2 : CMDstr2qgrams(bat *ret, str *val)
1030 : {
1031 : BAT *bn;
1032 2 : size_t i, len = strlen(*val) + 5;
1033 2 : str s = GDKmalloc(len);
1034 : char qgram[4 * 6 + 1]; /* 4 UTF-8 code points plus NULL byte */
1035 :
1036 2 : if (s == NULL)
1037 0 : throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1038 2 : strcpy(s, "##");
1039 2 : strcpy(s + 2, *val);
1040 2 : strcpy(s + len - 3, "$$");
1041 2 : bn = COLnew(0, TYPE_str, (BUN) strlen(*val), TRANSIENT);
1042 2 : if (bn == NULL) {
1043 0 : GDKfree(s);
1044 0 : throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1045 : }
1046 :
1047 : i = 0;
1048 26 : while (s[i]) {
1049 26 : if (utf8strncpy(qgram, sizeof(qgram), s + i, 4) < 4)
1050 : break;
1051 24 : if (BUNappend(bn, qgram, false) != GDK_SUCCEED) {
1052 0 : BBPreclaim(bn);
1053 0 : GDKfree(s);
1054 0 : throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1055 : }
1056 24 : if ((s[i++] & 0xC0) == 0xC0) {
1057 8 : while ((s[i] & 0xC0) == 0x80)
1058 4 : i++;
1059 : }
1060 : }
1061 2 : BBPkeepref(*ret = bn->batCacheid);
1062 2 : GDKfree(s);
1063 2 : return MAL_SUCCEED;
1064 : }
1065 :
1066 : #include "mel.h"
1067 : mel_func txtsim_init_funcs[] = {
1068 : command("txtsim", "levenshtein", levenshtein_impl, false, "Calculates Levenshtein distance (edit distance) between two strings, variable operation costs (ins/del, replacement, transposition)", args(1,6, arg("",int),arg("s",str),arg("t",str),arg("insdel_cost",int),arg("replace_cost",int),arg("transpose_cost",int))),
1069 : command("txtsim", "levenshtein", levenshteinbasic_impl, false, "Calculates Levenshtein distance (edit distance) between two strings", args(1,3, arg("",int),arg("s",str),arg("t",str))),
1070 : command("txtsim", "editdistance", levenshteinbasic_impl, false, "Alias for Levenshtein(str,str)", args(1,3, arg("",int),arg("s",str),arg("t",str))),
1071 : command("txtsim", "editdistance2", levenshteinbasic2_impl, false, "Calculates Levenshtein distance (edit distance) between two strings. Cost of transposition is 1 instead of 2", args(1,3, arg("",int),arg("s",str),arg("t",str))),
1072 : command("txtsim", "similarity", fstrcmp_impl, false, "Normalized edit distance between two strings", args(1,4, arg("",dbl),arg("string1",str),arg("string2",str),arg("minimum",dbl))),
1073 : command("txtsim", "similarity", fstrcmp0_impl, false, "Normalized edit distance between two strings", args(1,3, arg("",dbl),arg("string1",str),arg("string2",str))),
1074 : command("battxtsim", "similarity", fstrcmp0_impl_bulk, false, "Normalized edit distance between two strings", args(1,3, batarg("",dbl),batarg("string1",str),batarg("string2",str))),
1075 : command("txtsim", "soundex", soundex_impl, false, "Soundex function for phonetic matching", args(1,2, arg("",str),arg("name",str))),
1076 : command("txtsim", "stringdiff", stringdiff_impl, false, "calculate the soundexed editdistance", args(1,3, arg("",int),arg("s1",str),arg("s2",str))),
1077 : command("txtsim", "qgramnormalize", CMDqgramnormalize, false, "'Normalizes' strings (eg. toUpper and replaces non-alphanumerics with one space", args(1,2, arg("",str),arg("input",str))),
1078 : command("txtsim", "qgramselfjoin", CMDqgramselfjoin, false, "QGram self-join on ordered(!) qgram tables and sub-ordered q-gram positions", args(2,8, batarg("",int),batarg("",int),batarg("qgram",oid),batarg("id",oid),batarg("pos",int),batarg("len",int),arg("c",flt),arg("k",int))),
1079 : command("txtsim", "str2qgrams", CMDstr2qgrams, false, "Break the string into 4-grams", args(1,2, batarg("",str),arg("s",str))),
1080 : { .imp=NULL }
1081 : };
1082 : #include "mal_import.h"
1083 : #ifdef _MSC_VER
1084 : #undef read
1085 : #pragma section(".CRT$XCU",read)
1086 : #endif
1087 257 : LIB_STARTUP_FUNC(init_txtsim_mal)
1088 257 : { mal_module("txtsim", NULL, txtsim_init_funcs); }
|