Line data Source code
1 : /*
2 : * This Source Code Form is subject to the terms of the Mozilla Public
3 : * License, v. 2.0. If a copy of the MPL was not distributed with this
4 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
5 : *
6 : * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V.
7 : */
8 :
9 : /*
10 : * @f txtsim
11 : * @t Module providing similarity metrics for strings
12 : * @a Romulo Goncalves (from M4 to M5)
13 : * @d 01/12/2007
14 : * @v 0.1
15 : *
16 : * @+ String metrics
17 : *
18 : * Provides basic similarity metrics for strings.
19 : *
20 : */
21 : #include "monetdb_config.h"
22 : #include "mal.h"
23 : #include <string.h>
24 : #include "gdk.h"
25 : #include "str.h"
26 : #include <limits.h>
27 : #include "mal_exception.h"
28 :
29 :
30 : #define RETURN_NIL_IF(b,t) \
31 : if (b) {\
32 : if (ATOMextern(t)) {\
33 : *(ptr*) res = (ptr) ATOMnil(t);\
34 : if ( *(ptr *) res == NULL)\
35 : throw(MAL,"txtsim", SQLSTATE(HY013) MAL_MALLOC_FAIL);\
36 : } else {\
37 : memcpy(res, ATOMnilptr(t), ATOMsize(t));\
38 : }\
39 : return MAL_SUCCEED; \
40 : }
41 :
42 : /* =========================================================================
43 : * LEVENSH?TEIN FUNCTION
44 : * Source:
45 : * http://www.merriampark.com/ld.htm
46 : * =========================================================================
47 : */
48 :
49 : #define MYMIN(x,y) ( (x<y) ? x : y )
50 : #define SMALLEST_OF(x,y,z) ( MYMIN(MYMIN(x,y),z) )
51 : #define SMALLEST_OF4(x,y,z,z2) ( MYMIN(MYMIN(MYMIN(x,y),z),z2) )
52 :
53 : /***************************************************
54 : * Get a pointer to the specified cell of the matrix
55 : **************************************************/
56 :
57 : static inline int *
58 : levenshtein_GetCellPointer(int *pOrigin, int col, int row, int nCols)
59 : {
60 845 : return pOrigin + col + (row * (nCols + 1));
61 : }
62 :
63 : /******************************************************
64 : * Get the contents of the specified cell in the matrix
65 : *****************************************************/
66 :
67 : static inline int
68 : levenshtein_GetAt(int *pOrigin, int col, int row, int nCols)
69 : {
70 : int *pCell;
71 :
72 : pCell = levenshtein_GetCellPointer(pOrigin, col, row, nCols);
73 493 : return *pCell;
74 :
75 : }
76 :
77 : /********************************************************
78 : * Fill the specified cell in the matrix with the value x
79 : *******************************************************/
80 :
81 : static inline void
82 : levenshtein_PutAt(int *pOrigin, int col, int row, int nCols, int x)
83 : {
84 : int *pCell;
85 :
86 : pCell = levenshtein_GetCellPointer(pOrigin, col, row, nCols);
87 352 : *pCell = x;
88 : }
89 :
90 :
91 : /******************************
92 : * Compute Levenshtein distance
93 : *****************************/
94 : static str
95 32 : levenshtein_impl(int *result, str *S, str *T, int *insdel_cost, int *replace_cost, int *transpose_cost)
96 : {
97 32 : char *s = *S;
98 32 : char *t = *T;
99 : int *d; /* pointer to matrix */
100 : int n; /* length of s */
101 : int m; /* length of t */
102 : int i; /* iterates through s */
103 : int j; /* iterates through t */
104 : char s_i; /* ith character of s */
105 : char t_j; /* jth character of t */
106 : int cost; /* cost */
107 : int cell; /* contents of target cell */
108 : int above; /* contents of cell immediately above */
109 : int left; /* contents of cell immediately to left */
110 : int diag; /* contents of cell immediately above and to left */
111 : int sz; /* number of cells in matrix */
112 : int diag2 = 0, cost2 = 0;
113 :
114 56 : if (strNil(*S) || strNil(*T)) {
115 12 : *result = int_nil;
116 12 : return MAL_SUCCEED;
117 : }
118 :
119 : /* Step 1 */
120 20 : n = (int) strlen(s); /* 64bit: assume strings are less than 2 GB */
121 20 : m = (int) strlen(t);
122 20 : if (n == 0) {
123 7 : *result = m;
124 7 : return MAL_SUCCEED;
125 : }
126 13 : if (m == 0) {
127 8 : *result = n;
128 8 : return MAL_SUCCEED;
129 : }
130 5 : sz = (n + 1) * (m + 1) * sizeof(int);
131 5 : d = (int *) GDKmalloc(sz);
132 5 : if ( d == NULL)
133 0 : throw(MAL,"levenshtein", SQLSTATE(HY013) MAL_MALLOC_FAIL);
134 :
135 : /* Step 2 */
136 45 : for (i = 0; i <= n; i++) {
137 : levenshtein_PutAt(d, i, 0, n, i);
138 : }
139 :
140 42 : for (j = 0; j <= m; j++) {
141 : levenshtein_PutAt(d, 0, j, n, j);
142 : }
143 :
144 : /* Step 3 */
145 40 : for (i = 1; i <= n; i++) {
146 :
147 35 : s_i = s[i - 1];
148 :
149 : /* Step 4 */
150 310 : for (j = 1; j <= m; j++) {
151 :
152 275 : t_j = t[j - 1];
153 :
154 : /* Step 5 */
155 275 : if (s_i == t_j) {
156 : cost = 0;
157 : } else {
158 232 : cost = *replace_cost;
159 : }
160 :
161 : /* Step 6 */
162 275 : above = levenshtein_GetAt(d, i - 1, j, n);
163 275 : left = levenshtein_GetAt(d, i, j - 1, n);
164 : diag = levenshtein_GetAt(d, i - 1, j - 1, n);
165 :
166 275 : if (j >= 2 && i >= 2) {
167 : /* NEW: detect transpositions */
168 :
169 213 : diag2 = levenshtein_GetAt(d, i - 2, j - 2, n);
170 213 : if (s[i - 2] == t[j - 1] && s[i - 1] == t[j - 2]) {
171 4 : cost2 = *transpose_cost;
172 : } else {
173 : cost2 = 2;
174 : }
175 213 : cell = SMALLEST_OF4(above + *insdel_cost, left + *insdel_cost, diag + cost, diag2 + cost2);
176 : } else {
177 62 : cell = SMALLEST_OF(above + *insdel_cost, left + *insdel_cost, diag + cost);
178 : }
179 : levenshtein_PutAt(d, i, j, n, cell);
180 : }
181 : }
182 :
183 : /* Step 7 */
184 5 : *result = levenshtein_GetAt(d, n, m, n);
185 5 : GDKfree(d);
186 5 : return MAL_SUCCEED;
187 : }
188 :
189 : static str
190 12 : levenshteinbasic_impl(int *result, str *s, str *t)
191 : {
192 19 : int insdel = 1, replace = 1, transpose = 2;
193 :
194 19 : return levenshtein_impl(result, s, t, &insdel, &replace, &transpose);
195 : }
196 :
197 : static str
198 8 : levenshteinbasic2_impl(int *result, str *s, str *t)
199 : {
200 8 : int insdel = 1, replace = 1, transpose = 1;
201 :
202 8 : return levenshtein_impl(result, s, t, &insdel, &replace, &transpose);
203 : }
204 :
205 :
206 : /* =========================================================================
207 : * SOUNDEX FUNCTION
208 : * Source:
209 : * http://www.mit.edu/afs/sipb/service/rtfm/src/freeWAIS-sf-1.0/ir/soundex.c
210 : * =========================================================================
211 : */
212 :
213 : #define SoundexLen 4 /* length of a soundex code */
214 : #define SoundexKey "Z000" /* default key for soundex code */
215 :
216 : /* set letter values */
217 : static int Code[] = { 0, 1, 2, 3, 0, 1, 2, 0, 0, 2, 2, 4, 5, 5, 0,
218 : 1, 2, 6, 2, 3, 0, 1, 0, 2, 0, 2
219 : };
220 :
221 : static inline char
222 : SCode(unsigned char c)
223 : {
224 56 : if (c == 95)
225 : return (2); /* german sz */
226 22 : return (Code[toupper(c) - 'A']);
227 : }
228 :
229 : static void
230 20 : soundex_code(char *Name, char *Key)
231 : {
232 : char LastLetter;
233 : int Index;
234 :
235 : /* set default key */
236 20 : strcpy(Key, SoundexKey);
237 :
238 : /* keep first letter */
239 20 : Key[0] = *Name;
240 20 : if (!isupper((unsigned char) (Key[0])))
241 20 : Key[0] = toupper(Key[0]);
242 :
243 20 : LastLetter = *Name;
244 20 : if (!*Name)
245 : return;
246 10 : Name++;
247 :
248 : /* scan rest of string */
249 48 : for (Index = 1; (Index <SoundexLen) &&*Name; Name++) {
250 : /* use only letters */
251 38 : if (isalpha((unsigned char) (*Name))) {
252 : /* ignore duplicate successive chars */
253 38 : if (LastLetter != *Name) {
254 : /* new LastLetter */
255 : LastLetter = *Name;
256 :
257 : /* ignore letters with code 0 */
258 34 : if (SCode(*Name) != 0) {
259 22 : Key[Index] = '0' + SCode(*Name);
260 22 : Index ++;
261 : }
262 : }
263 : }
264 : }
265 : }
266 :
267 : static str
268 24 : soundex_impl(str *res, str *Name)
269 : {
270 48 : RETURN_NIL_IF(strNil(*Name), TYPE_str);
271 :
272 20 : *res = (str) GDKmalloc(sizeof(char) * (SoundexLen + 1));
273 20 : if( *res == NULL)
274 0 : throw(MAL,"soundex", SQLSTATE(HY013) MAL_MALLOC_FAIL);
275 :
276 : /* calculate Key for Name */
277 20 : soundex_code(*Name, *res);
278 :
279 19 : return MAL_SUCCEED;
280 : }
281 :
282 : static str
283 8 : stringdiff_impl(int *res, str *s1, str *s2)
284 : {
285 : str r = MAL_SUCCEED;
286 8 : char *S1 = NULL, *S2 = NULL;
287 :
288 8 : r = soundex_impl(&S1, s1);
289 8 : if( r != MAL_SUCCEED)
290 : return r;
291 8 : r = soundex_impl(&S2, s2);
292 7 : if( r != MAL_SUCCEED){
293 0 : GDKfree(S1);
294 0 : return r;
295 : }
296 : r = levenshteinbasic_impl(res, &S1, &S2);
297 7 : GDKfree(S1);
298 8 : GDKfree(S2);
299 8 : return r;
300 : }
301 :
302 : /******************************
303 : * QGRAMNORMALIZE
304 : *
305 : * This function 'normalizes' a string so valid q-grams can be made of it:
306 : * All characters are transformed to uppercase, and all characters
307 : * which are not letters or digits are stripped to a single space.
308 : *
309 : * qgramnormalize("Hallo, allemaal!").print(); --> "HALLO ALLEMAAL"
310 : * qgramnormalize(" '' t ' est").print(); --> [ "T EST" ]
311 : *
312 : *****************************/
313 : static str
314 12 : CMDqgramnormalize(str *res, str *Input)
315 : {
316 12 : char *input = *Input;
317 : int i, j = 0;
318 : char c, last = ' ';
319 :
320 12 : RETURN_NIL_IF(strNil(input), TYPE_str);
321 12 : *res = (str) GDKmalloc(sizeof(char) * (strlen(input) + 1)); /* normalized strings are never longer than original */
322 12 : if (*res == NULL)
323 0 : throw(MAL, "txtsim.qgramnormalize", SQLSTATE(HY013) MAL_MALLOC_FAIL);
324 :
325 70 : for (i = 0; input[i]; i++) {
326 58 : c = toupper(input[i]);
327 58 : if (!(('A' <= c && c <= 'Z') || isdigit((unsigned char) c)))
328 : c = ' ';
329 58 : if (c != ' ' || last != ' ') {
330 53 : (*res)[j++] = c;
331 : }
332 : last = c;
333 : }
334 12 : (*res)[j] = 0;
335 : /* strip final whitespace */
336 14 : while (j > 0 && (*res)[--j] == ' ')
337 2 : (*res)[j] = 0;
338 :
339 : return MAL_SUCCEED;
340 : }
341 :
342 : /* =========================================================================
343 : * FSTRCMP FUNCTION
344 : * Source:
345 : * http://search.cpan.org/src/MLEHMANN/String-Similarity-1/fstrcmp.c
346 : * =========================================================================
347 : */
348 :
349 : /*
350 : * Data on one input string being compared.
351 : */
352 : struct string_data {
353 : /* The string to be compared. */
354 : const char *data;
355 :
356 : /* The length of the string to be compared. */
357 : int data_length;
358 :
359 : /* The number of characters inserted or deleted. */
360 : int edit_count;
361 : };
362 :
363 : struct partition {
364 : /* Midpoints of this partition. */
365 : int xmid, ymid;
366 :
367 : /* Nonzero if low half will be analyzed minimally. */
368 : int lo_minimal;
369 :
370 : /* Likewise for high half. */
371 : int hi_minimal;
372 : };
373 :
374 : /* NAME
375 : diag - find diagonal path
376 :
377 : SYNOPSIS
378 : int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
379 : struct partition *part);
380 :
381 : DESCRIPTION
382 : Find the midpoint of the shortest edit script for a specified
383 : portion of the two strings.
384 :
385 : Scan from the beginnings of the strings, and simultaneously from
386 : the ends, doing a breadth-first search through the space of
387 : edit-sequence. When the two searches meet, we have found the
388 : midpoint of the shortest edit sequence.
389 :
390 : If MINIMAL is nonzero, find the minimal edit script regardless
391 : of expense. Otherwise, if the search is too expensive, use
392 : heuristics to stop the search and report a suboptimal answer.
393 :
394 : RETURNS
395 : Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal
396 : number XMID - YMID equals the number of inserted characters
397 : minus the number of deleted characters (counting only characters
398 : before the midpoint). Return the approximate edit cost; this is
399 : the total number of characters inserted or deleted (counting
400 : only characters before the midpoint), unless a heuristic is used
401 : to terminate the search prematurely.
402 :
403 : Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
404 : for the left half of the partition is known; similarly for
405 : PART->RIGHT_MINIMAL.
406 :
407 : CAVEAT
408 : This function assumes that the first characters of the specified
409 : portions of the two strings do not match, and likewise that the
410 : last characters do not match. The caller must trim matching
411 : characters from the beginning and end of the portions it is
412 : going to specify.
413 :
414 : If we return the "wrong" partitions, the worst this can do is
415 : cause suboptimal diff output. It cannot cause incorrect diff
416 : output. */
417 :
418 : static inline int
419 0 : diag(int xoff, int xlim, int yoff, int ylim, int minimal, struct partition *part, int too_expensive, struct string_data *string, int *fdiag, int *bdiag)
420 : {
421 : int *const fd = fdiag; /* Give the compiler a chance. */
422 : int *const bd = bdiag; /* Additional help for the compiler. */
423 0 : const char *const xv = string[0].data; /* Still more help for the compiler. */
424 0 : const char *const yv = string[1].data; /* And more and more . . . */
425 0 : const int dmin = xoff - ylim; /* Minimum valid diagonal. */
426 0 : const int dmax = xlim - yoff; /* Maximum valid diagonal. */
427 0 : const int fmid = xoff - yoff; /* Center diagonal of top-down search. */
428 0 : const int bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
429 : int fmin = fmid;
430 : int fmax = fmid; /* Limits of top-down search. */
431 : int bmin = bmid;
432 : int bmax = bmid; /* Limits of bottom-up search. */
433 : int c; /* Cost. */
434 0 : int odd = (fmid - bmid) & 1;
435 :
436 : /*
437 : * True if southeast corner is on an odd diagonal with respect
438 : * to the northwest.
439 : */
440 0 : fd[fmid] = xoff;
441 0 : bd[bmid] = xlim;
442 0 : for (c = 1;; ++c) {
443 : int d; /* Active diagonal. */
444 :
445 : /* Extend the top-down search by an edit step in each diagonal. */
446 0 : if (fmin > dmin)
447 0 : fd[--fmin - 1] = -1;
448 : else
449 0 : ++fmin;
450 0 : if (fmax < dmax)
451 0 : fd[++fmax + 1] = -1;
452 : else
453 0 : --fmax;
454 0 : for (d = fmax; d >= fmin; d -= 2) {
455 : int x;
456 : int y;
457 : int tlo;
458 : int thi;
459 :
460 0 : tlo = fd[d - 1], thi = fd[d + 1];
461 :
462 0 : if (tlo >= thi)
463 0 : x = tlo + 1;
464 : else
465 : x = thi;
466 0 : y = x - d;
467 0 : while (x < xlim && y < ylim && xv[x] == yv[y]) {
468 0 : ++x;
469 0 : ++y;
470 : }
471 0 : fd[d] = x;
472 0 : if (odd && bmin <= d && d <= bmax && bd[d] <= x) {
473 0 : part->xmid = x;
474 0 : part->ymid = y;
475 0 : part->lo_minimal = part->hi_minimal = 1;
476 0 : return 2 * c - 1;
477 : }
478 : }
479 : /* Similarly extend the bottom-up search. */
480 0 : if (bmin > dmin)
481 0 : bd[--bmin - 1] = INT_MAX;
482 : else
483 0 : ++bmin;
484 0 : if (bmax < dmax)
485 0 : bd[++bmax + 1] = INT_MAX;
486 : else
487 0 : --bmax;
488 0 : for (d = bmax; d >= bmin; d -= 2) {
489 : int x;
490 : int y;
491 : int tlo;
492 : int thi;
493 :
494 0 : tlo = bd[d - 1], thi = bd[d + 1];
495 0 : if (tlo < thi)
496 : x = tlo;
497 : else
498 0 : x = thi - 1;
499 0 : y = x - d;
500 0 : while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) {
501 0 : --x;
502 0 : --y;
503 : }
504 0 : bd[d] = x;
505 0 : if (!odd && fmin <= d && d <= fmax && x <= fd[d]) {
506 0 : part->xmid = x;
507 0 : part->ymid = y;
508 0 : part->lo_minimal = part->hi_minimal = 1;
509 0 : return 2 * c;
510 : }
511 : }
512 :
513 0 : if (minimal)
514 0 : continue;
515 :
516 : /* Heuristic: if we've gone well beyond the call of duty, give up
517 : and report halfway between our best results so far. */
518 0 : if (c >= too_expensive) {
519 : int fxybest;
520 : int fxbest;
521 : int bxybest;
522 : int bxbest;
523 :
524 : /* Pacify `gcc -Wall'. */
525 : fxbest = 0;
526 : bxbest = 0;
527 :
528 : /* Find forward diagonal that maximizes X + Y. */
529 : fxybest = -1;
530 0 : for (d = fmax; d >= fmin; d -= 2) {
531 : int x;
532 : int y;
533 :
534 0 : x = fd[d] < xlim ? fd[d] : xlim;
535 0 : y = x - d;
536 :
537 0 : if (ylim < y) {
538 0 : x = ylim + d;
539 : y = ylim;
540 : }
541 0 : if (fxybest < x + y) {
542 : fxybest = x + y;
543 : fxbest = x;
544 : }
545 : }
546 : /* Find backward diagonal that minimizes X + Y. */
547 : bxybest = INT_MAX;
548 0 : for (d = bmax; d >= bmin; d -= 2) {
549 : int x;
550 : int y;
551 :
552 0 : x = xoff > bd[d] ? xoff : bd[d];
553 0 : y = x - d;
554 :
555 0 : if (y < yoff) {
556 0 : x = yoff + d;
557 : y = yoff;
558 : }
559 0 : if (x + y < bxybest) {
560 : bxybest = x + y;
561 : bxbest = x;
562 : }
563 : }
564 : /* Use the better of the two diagonals. */
565 0 : if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) {
566 0 : part->xmid = fxbest;
567 0 : part->ymid = fxybest - fxbest;
568 0 : part->lo_minimal = 1;
569 0 : part->hi_minimal = 0;
570 : } else {
571 0 : part->xmid = bxbest;
572 0 : part->ymid = bxybest - bxbest;
573 0 : part->lo_minimal = 0;
574 0 : part->hi_minimal = 1;
575 : }
576 0 : return 2 * c - 1;
577 : }
578 : }
579 : }
580 :
581 :
582 : /* NAME
583 : compareseq - find edit sequence
584 :
585 : SYNOPSIS
586 : void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal);
587 :
588 : DESCRIPTION
589 : Compare in detail contiguous subsequences of the two strings
590 : which are known, as a whole, to match each other.
591 :
592 : The subsequence of string 0 is [XOFF, XLIM) and likewise for
593 : string 1.
594 :
595 : Note that XLIM, YLIM are exclusive bounds. All character
596 : numbers are origin-0.
597 :
598 : If MINIMAL is nonzero, find a minimal difference no matter how
599 : expensive it is. */
600 :
601 : static inline void
602 373680 : compareseq(int xoff, int xlim, int yoff, int ylim, int minimal, int max_edits, int too_expensive, struct string_data *string, int *fdiag, int *bdiag) /* compareseq stops when edits > max_edits */
603 : {
604 373680 : const char *const xv = string[0].data; /* Help the compiler. */
605 373680 : const char *const yv = string[1].data;
606 :
607 373680 : if (string[1].edit_count + string[0].edit_count > max_edits)
608 : return;
609 :
610 : /* Slide down the bottom initial diagonal. */
611 2399780 : while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff]) {
612 2026100 : ++xoff;
613 2026100 : ++yoff;
614 : }
615 :
616 : /* Slide up the top initial diagonal. */
617 373685 : while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1]) {
618 5 : --xlim;
619 5 : --ylim;
620 : }
621 :
622 : /* Handle simple cases. */
623 373680 : if (xoff == xlim) {
624 373679 : while (yoff < ylim) {
625 0 : ++string[1].edit_count;
626 0 : ++yoff;
627 : }
628 1 : } else if (yoff == ylim) {
629 2 : while (xoff < xlim) {
630 1 : ++string[0].edit_count;
631 1 : ++xoff;
632 : }
633 : } else {
634 : int c;
635 : struct partition part;
636 :
637 : /* Find a point of correspondence in the middle of the strings. */
638 0 : c = diag(xoff, xlim, yoff, ylim, minimal, &part, too_expensive, string, fdiag, bdiag);
639 0 : if (c == 1) {
640 : /* The two subsequences differ by a single insert or delete;
641 : record it and we are done. */
642 0 : if (part.xmid - part.ymid < xoff - yoff)
643 0 : ++string[1].edit_count;
644 : else
645 0 : ++string[0].edit_count;
646 : } else {
647 : /* Use the partitions to split this problem into subproblems. */
648 0 : compareseq(xoff, part.xmid, yoff, part.ymid, part.lo_minimal, max_edits, too_expensive, string, fdiag, bdiag);
649 0 : compareseq(part.xmid, xlim, part.ymid, ylim, part.hi_minimal, max_edits, too_expensive, string, fdiag, bdiag);
650 : }
651 : }
652 : }
653 :
654 : /* NAME
655 : fstrcmp - fuzzy string compare
656 :
657 : SYNOPSIS
658 : double fstrcmp(const char *s1, int l1, const char *s2, int l2, double);
659 :
660 : DESCRIPTION
661 : The fstrcmp function may be used to compare two string for
662 : similarity. It is very useful in reducing "cascade" or
663 : "secondary" errors in compilers or other situations where
664 : symbol tables occur.
665 :
666 : RETURNS
667 : double; 0 if the strings are entirly dissimilar, 1 if the
668 : strings are identical, and a number in between if they are
669 : similar. */
670 :
671 : #define INITIAL_INT_BUFFER_LENGTH 2048
672 :
673 : #define CHECK_INT_BUFFER_LENGTH(BUFFER, BUFFER_LEN, NEXT_LEN, OP) \
674 : do { \
675 : if ((NEXT_LEN) > *BUFFER_LEN) { \
676 : size_t newlen = (((NEXT_LEN) + 1023) & ~1023); /* align to a multiple of 1024 bytes */ \
677 : int *newbuf = GDKmalloc(newlen); \
678 : if (!newbuf) \
679 : throw(MAL, OP, SQLSTATE(HY013) MAL_MALLOC_FAIL); \
680 : GDKfree(*BUFFER); \
681 : *BUFFER = newbuf; \
682 : *BUFFER_LEN = newlen; \
683 : } \
684 : } while (0)
685 :
686 : static str
687 372987 : fstrcmp_impl_internal(dbl *ret, int **fdiag_buf, size_t *fdiag_buflen, str string1, str string2, dbl minimum)
688 : {
689 : int i, max_edits, *fdiag, *bdiag, too_expensive = 1;
690 : size_t fdiag_len;
691 : struct string_data string[2];
692 :
693 : /* set the info for each string. */
694 372987 : string[0].data = string1;
695 372987 : string[0].data_length = (int) strlen(string1); /* 64bit: assume string not too long */
696 372987 : string[1].data = string2;
697 372987 : string[1].data_length = (int) strlen(string2); /* 64bit: assume string not too long */
698 :
699 : /* short-circuit obvious comparisons */
700 372987 : if (string[0].data_length == 0 && string[1].data_length == 0) {
701 0 : *ret = 1.0;
702 0 : return MAL_SUCCEED;
703 : }
704 372987 : if (string[0].data_length == 0 || string[1].data_length == 0) {
705 0 : *ret = 0.0;
706 0 : return MAL_SUCCEED;
707 : }
708 :
709 : /* Set TOO_EXPENSIVE to be approximate square root of input size,
710 : bounded below by 256. */
711 1118890 : for (i = string[0].data_length + string[1].data_length; i != 0; i >>= 2)
712 745900 : too_expensive <<= 1;
713 : if (too_expensive < 256)
714 : too_expensive = 256;
715 :
716 : /* Because fstrcmp is typically called multiple times, while scanning
717 : symbol tables, etc, attempt to minimize the number of memory
718 : allocations performed. Thus, we use a static buffer for the
719 : diagonal vectors, and never free them. */
720 372987 : fdiag_len = string[0].data_length + string[1].data_length + 3;
721 372987 : CHECK_INT_BUFFER_LENGTH(fdiag_buf, fdiag_buflen, fdiag_len, "txtsim.similarity");
722 372987 : fdiag = *fdiag_buf + string[1].data_length + 1;
723 372987 : bdiag = fdiag + fdiag_len;
724 :
725 372987 : max_edits = 1 + (int) ((string[0].data_length + string[1].data_length) * (1. - minimum));
726 :
727 : /* Now do the main comparison algorithm */
728 372987 : string[0].edit_count = 0;
729 372987 : string[1].edit_count = 0;
730 372987 : compareseq(0, string[0].data_length, 0, string[1].data_length, 0, max_edits, too_expensive, string, fdiag, bdiag);
731 :
732 : /* The result is
733 : ((number of chars in common) / (average length of the strings)).
734 : This is admittedly biased towards finding that the strings are
735 : similar, however it does produce meaningful results. */
736 373751 : *ret = ((double)
737 373751 : (string[0].data_length + string[1].data_length - string[1].edit_count - string[0].edit_count)
738 373751 : / (string[0].data_length + string[1].data_length));
739 373751 : return MAL_SUCCEED;
740 : }
741 :
742 : static str
743 0 : fstrcmp_impl(dbl *ret, str *string1, str *string2, dbl *minimum)
744 : {
745 0 : str s1 = *string1, s2 = *string2;
746 0 : dbl min = *minimum;
747 :
748 0 : if (strNil(s1) || strNil(s2) || is_dbl_nil(min)) {
749 0 : *ret = dbl_nil;
750 0 : return MAL_SUCCEED;
751 : } else {
752 : str msg = MAL_SUCCEED;
753 0 : int *fdiag_buf = NULL;
754 0 : size_t fdiag_buflen = INITIAL_INT_BUFFER_LENGTH;
755 :
756 0 : if (!(fdiag_buf = GDKmalloc(fdiag_buflen)))
757 0 : throw(MAL, "txtsim.similarity", SQLSTATE(HY013) MAL_MALLOC_FAIL);
758 0 : msg = fstrcmp_impl_internal(ret, &fdiag_buf, &fdiag_buflen, s1, s1, min);
759 0 : GDKfree(fdiag_buf);
760 0 : return msg;
761 : }
762 : }
763 :
764 : static str
765 4 : fstrcmp0_impl(dbl *ret, str *string1, str *string2)
766 : {
767 4 : str s1 = *string1, s2 = *string2;
768 :
769 6 : if (strNil(s1) || strNil(s2)) {
770 3 : *ret = dbl_nil;
771 3 : return MAL_SUCCEED;
772 : } else {
773 : str msg = MAL_SUCCEED;
774 1 : int *fdiag_buf = NULL;
775 1 : size_t fdiag_buflen = INITIAL_INT_BUFFER_LENGTH;
776 :
777 1 : if (!(fdiag_buf = GDKmalloc(fdiag_buflen)))
778 0 : throw(MAL, "txtsim.similarity", SQLSTATE(HY013) MAL_MALLOC_FAIL);
779 1 : msg = fstrcmp_impl_internal(ret, &fdiag_buf, &fdiag_buflen, s1, s2, 0.0);
780 1 : GDKfree(fdiag_buf);
781 1 : return msg;
782 : }
783 : }
784 :
785 : static str
786 8 : fstrcmp0_impl_bulk(bat *res, bat *strings1, bat *strings2)
787 : {
788 : BATiter lefti, righti;
789 : BAT *bn = NULL, *left = NULL, *right = NULL;
790 : BUN q = 0;
791 8 : size_t fdiag_buflen = INITIAL_INT_BUFFER_LENGTH;
792 : str x, y, msg = MAL_SUCCEED;
793 : bool nils = false;
794 : dbl *restrict vals;
795 8 : int *fdiag_buf = GDKmalloc(fdiag_buflen);
796 :
797 8 : if (!fdiag_buf) {
798 0 : msg = createException(MAL, "txtsim.similarity", SQLSTATE(HY013) MAL_MALLOC_FAIL);
799 0 : goto bailout;
800 : }
801 8 : if (!(left = BATdescriptor(*strings1)) || !(right = BATdescriptor(*strings2))) {
802 0 : msg = createException(MAL, "txtsim.similarity", SQLSTATE(HY005) RUNTIME_OBJECT_MISSING);
803 0 : goto bailout;
804 : }
805 8 : q = BATcount(left);
806 8 : if (!(bn = COLnew(left->hseqbase, TYPE_dbl, q, TRANSIENT))) {
807 0 : msg = createException(MAL, "txtsim.similarity", SQLSTATE(HY013) MAL_MALLOC_FAIL);
808 0 : goto bailout;
809 : }
810 :
811 : lefti = bat_iterator(left);
812 : righti = bat_iterator(right);
813 8 : vals = Tloc(bn, 0);
814 373526 : for (BUN i = 0; i < q && !msg; i++) {
815 373518 : x = (str) BUNtvar(lefti, i);
816 373518 : y = (str) BUNtvar(righti, i);
817 :
818 746922 : if (strNil(x) || strNil(y)) {
819 231 : vals[i] = dbl_nil;
820 231 : nils = true;
821 : } else {
822 373287 : msg = fstrcmp_impl_internal(&vals[i], &fdiag_buf, &fdiag_buflen, x, y, 0.0);
823 : }
824 : }
825 :
826 8 : bailout:
827 8 : GDKfree(fdiag_buf);
828 8 : if (bn && !msg) {
829 8 : BATsetcount(bn, q);
830 8 : bn->tnil = nils;
831 8 : bn->tnonil = !nils;
832 8 : bn->tkey = BATcount(bn) <= 1;
833 8 : bn->tsorted = BATcount(bn) <= 1;
834 8 : bn->trevsorted = BATcount(bn) <= 1;
835 8 : bn->theap.dirty = true;
836 8 : BBPkeepref(*res = bn->batCacheid);
837 0 : } else if (bn)
838 0 : BBPreclaim(bn);
839 8 : if (left)
840 8 : BBPunfix(left->batCacheid);
841 8 : if (right)
842 8 : BBPunfix(right->batCacheid);
843 8 : return msg;
844 : }
845 :
846 :
847 : /* ============ Q-GRAM SELF JOIN ============== */
848 :
849 : static str
850 0 : CMDqgramselfjoin(bat *res1, bat *res2, bat *qid, bat *bid, bat *pid, bat *lid, flt *c, int *k)
851 : {
852 : BAT *qgram, *id, *pos, *len;
853 : BUN n;
854 : BUN i, j;
855 : BAT *bn, *bn2;
856 : oid *qbuf;
857 : int *ibuf;
858 : int *pbuf;
859 : int *lbuf;
860 : str msg = MAL_SUCCEED;
861 :
862 0 : qgram = BATdescriptor(*qid);
863 0 : id = BATdescriptor(*bid);
864 0 : pos = BATdescriptor(*pid);
865 0 : len = BATdescriptor(*lid);
866 0 : if (qgram == NULL || id == NULL || pos == NULL || len == NULL) {
867 0 : if (qgram)
868 0 : BBPunfix(qgram->batCacheid);
869 0 : if (id)
870 0 : BBPunfix(id->batCacheid);
871 0 : if (pos)
872 0 : BBPunfix(pos->batCacheid);
873 0 : if (len)
874 0 : BBPunfix(len->batCacheid);
875 0 : throw(MAL, "txtsim.qgramselfjoin", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
876 : }
877 :
878 0 : if (qgram->ttype != TYPE_oid)
879 0 : msg = createException(MAL, "tstsim.qgramselfjoin",
880 : SEMANTIC_TYPE_MISMATCH ": tail of BAT qgram must be oid");
881 0 : else if (id->ttype != TYPE_int)
882 0 : msg = createException(MAL, "tstsim.qgramselfjoin",
883 : SEMANTIC_TYPE_MISMATCH ": tail of BAT id must be int");
884 0 : else if (pos->ttype != TYPE_int)
885 0 : msg = createException(MAL, "tstsim.qgramselfjoin",
886 : SEMANTIC_TYPE_MISMATCH ": tail of BAT pos must be int");
887 0 : else if (len->ttype != TYPE_int)
888 0 : msg = createException(MAL, "tstsim.qgramselfjoin",
889 : SEMANTIC_TYPE_MISMATCH ": tail of BAT len must be int");
890 : if (msg) {
891 0 : BBPunfix(qgram->batCacheid);
892 0 : BBPunfix(id->batCacheid);
893 0 : BBPunfix(pos->batCacheid);
894 0 : BBPunfix(len->batCacheid);
895 0 : return msg;
896 : }
897 :
898 0 : n = BATcount(qgram);
899 0 : qbuf = (oid *) Tloc(qgram, 0);
900 0 : ibuf = (int *) Tloc(id, 0);
901 0 : pbuf = (int *) Tloc(pos, 0);
902 0 : lbuf = (int *) Tloc(len, 0);
903 :
904 : /* if (BATcount(qgram)>1 && !BATtordered(qgram)) throw(MAL, "tstsim.qgramselfjoin", SEMANTIC_TYPE_MISMATCH); */
905 :
906 0 : if (!ALIGNsynced(qgram, id))
907 0 : msg = createException(MAL, "tstsim.qgramselfjoin",
908 : SEMANTIC_TYPE_MISMATCH ": qgram and id are not synced");
909 :
910 0 : else if (!ALIGNsynced(qgram, pos))
911 0 : msg = createException(MAL, "tstsim.qgramselfjoin",
912 : SEMANTIC_TYPE_MISMATCH ": qgram and pos are not synced");
913 0 : else if (!ALIGNsynced(qgram, len))
914 0 : msg = createException(MAL, "tstsim.qgramselfjoin",
915 : SEMANTIC_TYPE_MISMATCH ": qgram and len are not synced");
916 :
917 0 : else if (Tsize(qgram) != ATOMsize(qgram->ttype))
918 0 : msg = createException(MAL, "tstsim.qgramselfjoin",
919 : SEMANTIC_TYPE_MISMATCH ": qgram is not a true void bat");
920 0 : else if (Tsize(id) != ATOMsize(id->ttype))
921 0 : msg = createException(MAL, "tstsim.qgramselfjoin",
922 : SEMANTIC_TYPE_MISMATCH ": id is not a true void bat");
923 :
924 0 : else if (Tsize(pos) != ATOMsize(pos->ttype))
925 0 : msg = createException(MAL, "tstsim.qgramselfjoin",
926 : SEMANTIC_TYPE_MISMATCH ": pos is not a true void bat");
927 0 : else if (Tsize(len) != ATOMsize(len->ttype))
928 0 : msg = createException(MAL, "tstsim.qgramselfjoin",
929 : SEMANTIC_TYPE_MISMATCH ": len is not a true void bat");
930 0 : if (msg) {
931 0 : BBPunfix(qgram->batCacheid);
932 0 : BBPunfix(id->batCacheid);
933 0 : BBPunfix(pos->batCacheid);
934 0 : BBPunfix(len->batCacheid);
935 0 : return msg;
936 : }
937 :
938 0 : bn = COLnew(0, TYPE_int, n, TRANSIENT);
939 0 : bn2 = COLnew(0, TYPE_int, n, TRANSIENT);
940 0 : if (bn == NULL || bn2 == NULL){
941 0 : BBPreclaim(bn);
942 0 : BBPreclaim(bn2);
943 0 : BBPunfix(qgram->batCacheid);
944 0 : BBPunfix(id->batCacheid);
945 0 : BBPunfix(pos->batCacheid);
946 0 : BBPunfix(len->batCacheid);
947 0 : throw(MAL, "txtsim.qgramselfjoin", SQLSTATE(HY013) MAL_MALLOC_FAIL);
948 : }
949 :
950 0 : for (i = 0; i < n - 1; i++) {
951 0 : for (j = i + 1; (j < n && qbuf[j] == qbuf[i] && pbuf[j] <= (pbuf[i] + (*k + *c * MYMIN(lbuf[i], lbuf[j])))); j++) {
952 0 : if (ibuf[i] != ibuf[j] && abs(lbuf[i] - lbuf[j]) <= (*k + *c * MYMIN(lbuf[i], lbuf[j]))) {
953 0 : if (BUNappend(bn, ibuf + i, false) != GDK_SUCCEED ||
954 0 : BUNappend(bn2, ibuf + j, false) != GDK_SUCCEED) {
955 0 : BBPunfix(qgram->batCacheid);
956 0 : BBPunfix(id->batCacheid);
957 0 : BBPunfix(pos->batCacheid);
958 0 : BBPunfix(len->batCacheid);
959 0 : BBPreclaim(bn);
960 0 : BBPreclaim(bn2);
961 0 : throw(MAL, "txtsim.qgramselfjoin", SQLSTATE(HY013) MAL_MALLOC_FAIL);
962 : }
963 : }
964 : }
965 : }
966 :
967 0 : BBPunfix(qgram->batCacheid);
968 0 : BBPunfix(id->batCacheid);
969 0 : BBPunfix(pos->batCacheid);
970 0 : BBPunfix(len->batCacheid);
971 :
972 0 : BBPkeepref(*res1 = bn->batCacheid);
973 0 : BBPkeepref(*res2 = bn2->batCacheid);
974 :
975 0 : return MAL_SUCCEED;
976 : }
977 :
978 : /* copy up to utf8len UTF-8 encoded characters from src to buf
979 : * stop early if buf (size given by bufsize) is too small, or if src runs out
980 : * return number of UTF-8 characters copied (excluding NUL)
981 : * close with NUL if enough space */
982 : static size_t
983 26 : utf8strncpy(char *buf, size_t bufsize, const char *src, size_t utf8len)
984 : {
985 : size_t cnt = 0;
986 :
987 128 : while (utf8len != 0 && *src != 0 && bufsize != 0) {
988 102 : bufsize--;
989 102 : utf8len--;
990 102 : cnt++;
991 102 : if (((*buf++ = *src++) & 0x80) != 0) {
992 40 : while ((*src & 0xC0) == 0x80 && bufsize != 0) {
993 20 : *buf++ = *src++;
994 20 : bufsize--;
995 : }
996 : }
997 : }
998 26 : if (bufsize != 0)
999 26 : *buf = 0;
1000 26 : return cnt;
1001 : }
1002 :
1003 : static str
1004 2 : CMDstr2qgrams(bat *ret, str *val)
1005 : {
1006 : BAT *bn;
1007 2 : size_t i, len = strlen(*val) + 5;
1008 2 : str s = GDKmalloc(len);
1009 : char qgram[4 * 6 + 1]; /* 4 UTF-8 code points plus NULL byte */
1010 :
1011 2 : if (s == NULL)
1012 0 : throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1013 2 : strcpy(s, "##");
1014 2 : strcpy(s + 2, *val);
1015 2 : strcpy(s + len - 3, "$$");
1016 2 : bn = COLnew(0, TYPE_str, (BUN) strlen(*val), TRANSIENT);
1017 2 : if (bn == NULL) {
1018 0 : GDKfree(s);
1019 0 : throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1020 : }
1021 :
1022 : i = 0;
1023 26 : while (s[i]) {
1024 26 : if (utf8strncpy(qgram, sizeof(qgram), s + i, 4) < 4)
1025 : break;
1026 24 : if (BUNappend(bn, qgram, false) != GDK_SUCCEED) {
1027 0 : BBPreclaim(bn);
1028 0 : GDKfree(s);
1029 0 : throw(MAL, "txtsim.str2qgram", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1030 : }
1031 24 : if ((s[i++] & 0xC0) == 0xC0) {
1032 8 : while ((s[i] & 0xC0) == 0x80)
1033 4 : i++;
1034 : }
1035 : }
1036 2 : BBPkeepref(*ret = bn->batCacheid);
1037 2 : GDKfree(s);
1038 2 : return MAL_SUCCEED;
1039 : }
1040 :
1041 : #include "mel.h"
1042 : mel_func txtsim_init_funcs[] = {
1043 : command("txtsim", "levenshtein", levenshtein_impl, false, "Calculates Levenshtein distance (edit distance) between two strings, variable operation costs (ins/del, replacement, transposition)", args(1,6, arg("",int),arg("s",str),arg("t",str),arg("insdel_cost",int),arg("replace_cost",int),arg("transpose_cost",int))),
1044 : command("txtsim", "levenshtein", levenshteinbasic_impl, false, "Calculates Levenshtein distance (edit distance) between two strings", args(1,3, arg("",int),arg("s",str),arg("t",str))),
1045 : command("txtsim", "editdistance", levenshteinbasic_impl, false, "Alias for Levenshtein(str,str)", args(1,3, arg("",int),arg("s",str),arg("t",str))),
1046 : command("txtsim", "editdistance2", levenshteinbasic2_impl, false, "Calculates Levenshtein distance (edit distance) between two strings. Cost of transposition is 1 instead of 2", args(1,3, arg("",int),arg("s",str),arg("t",str))),
1047 : command("txtsim", "similarity", fstrcmp_impl, false, "Normalized edit distance between two strings", args(1,4, arg("",dbl),arg("string1",str),arg("string2",str),arg("minimum",dbl))),
1048 : command("txtsim", "similarity", fstrcmp0_impl, false, "Normalized edit distance between two strings", args(1,3, arg("",dbl),arg("string1",str),arg("string2",str))),
1049 : command("battxtsim", "similarity", fstrcmp0_impl_bulk, false, "Normalized edit distance between two strings", args(1,3, batarg("",dbl),batarg("string1",str),batarg("string2",str))),
1050 : command("txtsim", "soundex", soundex_impl, false, "Soundex function for phonetic matching", args(1,2, arg("",str),arg("name",str))),
1051 : command("txtsim", "stringdiff", stringdiff_impl, false, "calculate the soundexed editdistance", args(1,3, arg("",int),arg("s1",str),arg("s2",str))),
1052 : command("txtsim", "qgramnormalize", CMDqgramnormalize, false, "'Normalizes' strings (eg. toUpper and replaces non-alphanumerics with one space", args(1,2, arg("",str),arg("input",str))),
1053 : command("txtsim", "qgramselfjoin", CMDqgramselfjoin, false, "QGram self-join on ordered(!) qgram tables and sub-ordered q-gram positions", args(2,8, batarg("",int),batarg("",int),batarg("qgram",oid),batarg("id",oid),batarg("pos",int),batarg("len",int),arg("c",flt),arg("k",int))),
1054 : command("txtsim", "str2qgrams", CMDstr2qgrams, false, "Break the string into 4-grams", args(1,2, batarg("",str),arg("s",str))),
1055 : { .imp=NULL }
1056 : };
1057 : #include "mal_import.h"
1058 : #ifdef _MSC_VER
1059 : #undef read
1060 : #pragma section(".CRT$XCU",read)
1061 : #endif
1062 255 : LIB_STARTUP_FUNC(init_txtsim_mal)
1063 255 : { mal_module("txtsim", NULL, txtsim_init_funcs); }
|