1/*
2 * This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
5 *
6 * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V.
7 */
8
9/*
10 * @f txtsim
11 * @t Module providing similarity metrics for strings
12 * @a Romulo Goncalves (from M4 to M5)
13 * @d 01/12/2007
14 * @v 0.1
15 *
16 * @+ String metrics
17 *
18 * Provides basic similarity metrics for strings.
19 *
20 */
21#include "monetdb_config.h"
22#include "txtsim.h"
23#include "mal_exception.h"
24
25
26#define RETURN_NIL_IF(b,t) \
27 if (b) {\
28 if (ATOMextern(t)) {\
29 *(ptr*) res = (ptr) ATOMnil(t);\
30 if ( *(ptr *) res == NULL)\
31 throw(MAL,"txtsim", SQLSTATE(HY001) MAL_MALLOC_FAIL);\
32 } else {\
33 memcpy(res, ATOMnilptr(t), ATOMsize(t));\
34 }\
35 return MAL_SUCCEED; \
36 }
37
38/* =========================================================================
39 * LEVENSH?TEIN FUNCTION
40 * Source:
41 * http://www.merriampark.com/ld.htm
42 * =========================================================================
43 */
44
45#define MYMIN(x,y) ( (x<y) ? x : y )
46#define SMALLEST_OF(x,y,z) ( MYMIN(MYMIN(x,y),z) )
47#define SMALLEST_OF4(x,y,z,z2) ( MYMIN(MYMIN(MYMIN(x,y),z),z2) )
48
49/***************************************************
50 * Get a pointer to the specified cell of the matrix
51 **************************************************/
52
53static inline int *
54levenshtein_GetCellPointer(int *pOrigin, int col, int row, int nCols)
55{
56 return pOrigin + col + (row * (nCols + 1));
57}
58
59/******************************************************
60 * Get the contents of the specified cell in the matrix
61 *****************************************************/
62
63static inline int
64levenshtein_GetAt(int *pOrigin, int col, int row, int nCols)
65{
66 int *pCell;
67
68 pCell = levenshtein_GetCellPointer(pOrigin, col, row, nCols);
69 return *pCell;
70
71}
72
73/********************************************************
74 * Fill the specified cell in the matrix with the value x
75 *******************************************************/
76
77static inline void
78levenshtein_PutAt(int *pOrigin, int col, int row, int nCols, int x)
79{
80 int *pCell;
81
82 pCell = levenshtein_GetCellPointer(pOrigin, col, row, nCols);
83 *pCell = x;
84}
85
86
87/******************************
88 * Compute Levenshtein distance
89 *****************************/
90str
91levenshtein_impl(int *result, str *S, str *T, int *insdel_cost, int *replace_cost, int *transpose_cost)
92{
93 char *s = *S;
94 char *t = *T;
95 int *d; /* pointer to matrix */
96 int n; /* length of s */
97 int m; /* length of t */
98 int i; /* iterates through s */
99 int j; /* iterates through t */
100 char s_i; /* ith character of s */
101 char t_j; /* jth character of t */
102 int cost; /* cost */
103 int cell; /* contents of target cell */
104 int above; /* contents of cell immediately above */
105 int left; /* contents of cell immediately to left */
106 int diag; /* contents of cell immediately above and to left */
107 int sz; /* number of cells in matrix */
108 int diag2 = 0, cost2 = 0;
109
110 /* Step 1 */
111 n = (int) strlen(s); /* 64bit: assume strings are less than 2 GB */
112 m = (int) strlen(t);
113 if (n == 0) {
114 *result = m;
115 return MAL_SUCCEED;
116 }
117 if (m == 0) {
118 *result = n;
119 return MAL_SUCCEED;
120 }
121 sz = (n + 1) * (m + 1) * sizeof(int);
122 d = (int *) GDKmalloc(sz);
123 if ( d == NULL)
124 throw(MAL,"levenshtein", SQLSTATE(HY001) MAL_MALLOC_FAIL);
125
126 /* Step 2 */
127 for (i = 0; i <= n; i++) {
128 levenshtein_PutAt(d, i, 0, n, i);
129 }
130
131 for (j = 0; j <= m; j++) {
132 levenshtein_PutAt(d, 0, j, n, j);
133 }
134
135 /* Step 3 */
136 for (i = 1; i <= n; i++) {
137
138 s_i = s[i - 1];
139
140 /* Step 4 */
141 for (j = 1; j <= m; j++) {
142
143 t_j = t[j - 1];
144
145 /* Step 5 */
146 if (s_i == t_j) {
147 cost = 0;
148 } else {
149 cost = *replace_cost;
150 }
151
152 /* Step 6 */
153 above = levenshtein_GetAt(d, i - 1, j, n);
154 left = levenshtein_GetAt(d, i, j - 1, n);
155 diag = levenshtein_GetAt(d, i - 1, j - 1, n);
156
157 if (j >= 2 && i >= 2) {
158 /* NEW: detect transpositions */
159
160 diag2 = levenshtein_GetAt(d, i - 2, j - 2, n);
161 if (s[i - 2] == t[j - 1] && s[i - 1] == t[j - 2]) {
162 cost2 = *transpose_cost;
163 } else {
164 cost2 = 2;
165 }
166 cell = SMALLEST_OF4(above + *insdel_cost, left + *insdel_cost, diag + cost, diag2 + cost2);
167 } else {
168 cell = SMALLEST_OF(above + *insdel_cost, left + *insdel_cost, diag + cost);
169 }
170 levenshtein_PutAt(d, i, j, n, cell);
171 }
172 }
173
174 /* Step 7 */
175 *result = levenshtein_GetAt(d, n, m, n);
176 GDKfree(d);
177 return MAL_SUCCEED;
178}
179
180str
181levenshteinbasic_impl(int *result, str *s, str *t)
182{
183 int insdel = 1, replace = 1, transpose = 2;
184
185 return levenshtein_impl(result, s, t, &insdel, &replace, &transpose);
186}
187
188str
189levenshteinbasic2_impl(int *result, str *s, str *t)
190{
191 int insdel = 1, replace = 1, transpose = 1;
192
193 return levenshtein_impl(result, s, t, &insdel, &replace, &transpose);
194}
195
196
197/* =========================================================================
198 * SOUNDEX FUNCTION
199 * Source:
200 * http://www.mit.edu/afs/sipb/service/rtfm/src/freeWAIS-sf-1.0/ir/soundex.c
201 * =========================================================================
202 */
203
204#define SoundexLen 4 /* length of a soundex code */
205#define SoundexKey "Z000" /* default key for soundex code */
206
207/* set letter values */
208static int Code[] = { 0, 1, 2, 3, 0, 1, 2, 0, 0, 2, 2, 4, 5, 5, 0,
209 1, 2, 6, 2, 3, 0, 1, 0, 2, 0, 2
210};
211
212static inline char
213SCode(unsigned char c)
214{
215 if (c == 95)
216 return (2); /* german sz */
217 return (Code[toupper(c) - 'A']);
218}
219
220static void
221soundex_code(char *Name, char *Key)
222{
223 char LastLetter;
224 int Index;
225
226 /* set default key */
227 strcpy(Key, SoundexKey);
228
229 /* keep first letter */
230 Key[0] = *Name;
231 if (!isupper((unsigned char) (Key[0])))
232 Key[0] = toupper(Key[0]);
233
234 LastLetter = *Name;
235 if (!*Name)
236 return;
237 Name++;
238
239 /* scan rest of string */
240 for (Index = 1; (Index <SoundexLen) &&*Name; Name++) {
241 /* use only letters */
242 if (isalpha((unsigned char) (*Name))) {
243 /* ignore duplicate successive chars */
244 if (LastLetter != *Name) {
245 /* new LastLetter */
246 LastLetter = *Name;
247
248 /* ignore letters with code 0 */
249 if (SCode(*Name) != 0) {
250 Key[Index] = '0' + SCode(*Name);
251 Index ++;
252 }
253 }
254 }
255 }
256}
257
258
259str
260soundex_impl(str *res, str *Name)
261{
262 RETURN_NIL_IF(strNil(*Name), TYPE_str);
263
264 *res = (str) GDKmalloc(sizeof(char) * (SoundexLen + 1));
265 if( *res == NULL)
266 throw(MAL,"soundex", SQLSTATE(HY001) MAL_MALLOC_FAIL);
267
268 /* calculate Key for Name */
269 soundex_code(*Name, *res);
270
271 return MAL_SUCCEED;
272}
273
274str
275stringdiff_impl(int *res, str *s1, str *s2)
276{
277 str r = MAL_SUCCEED;
278 char *S1 = NULL, *S2 = NULL;
279
280 r = soundex_impl(&S1, s1);
281 if( r != MAL_SUCCEED)
282 return r;
283 r = soundex_impl(&S2, s2);
284 if( r != MAL_SUCCEED){
285 GDKfree(S1);
286 return r;
287 }
288 r = levenshteinbasic_impl(res, &S1, &S2);
289 GDKfree(S1);
290 GDKfree(S2);
291 return r;
292}
293
294/******************************
295 * QGRAMNORMALIZE
296 *
297 * This function 'normalizes' a string so valid q-grams can be made of it:
298 * All characters are transformed to uppercase, and all characters
299 * which are not letters or digits are stripped to a single space.
300 *
301 * qgramnormalize("Hallo, allemaal!").print(); --> "HALLO ALLEMAAL"
302 * qgramnormalize(" '' t ' est").print(); --> [ "T EST" ]
303 *
304 *****************************/
305str
306CMDqgramnormalize(str *res, str *Input)
307{
308 char *input = *Input;
309 int i, j = 0;
310 char c, last = ' ';
311
312 RETURN_NIL_IF(strNil(input), TYPE_str);
313 *res = (str) GDKmalloc(sizeof(char) * (strlen(input) + 1)); /* normalized strings are never longer than original */
314 if (*res == NULL)
315 throw(MAL, "txtsim.qgramnormalize", SQLSTATE(HY001) MAL_MALLOC_FAIL);
316
317 for (i = 0; input[i]; i++) {
318 c = toupper(input[i]);
319 if (!(('A' <= c && c <= 'Z') || isdigit((unsigned char) c)))
320 c = ' ';
321 if (c != ' ' || last != ' ') {
322 (*res)[j++] = c;
323 }
324 last = c;
325 }
326 (*res)[j] = 0;
327 /* strip final whitespace */
328 while (j > 0 && (*res)[--j] == ' ')
329 (*res)[j] = 0;
330
331 return MAL_SUCCEED;
332}
333
334/* =========================================================================
335 * FSTRCMP FUNCTION
336 * Source:
337 * http://search.cpan.org/src/MLEHMANN/String-Similarity-1/fstrcmp.c
338 * =========================================================================
339 */
340
341#define PARAMS(proto) proto
342
343/*
344 * Data on one input string being compared.
345 */
346struct string_data {
347 /* The string to be compared. */
348 const char *data;
349
350 /* The length of the string to be compared. */
351 int data_length;
352
353 /* The number of characters inserted or deleted. */
354 int edit_count;
355};
356
357static struct string_data string[2];
358
359static int max_edits; /* compareseq stops when edits > max_edits */
360
361#ifdef MINUS_H_FLAG
362
363/* This corresponds to the diff -H flag. With this heuristic, for
364 strings with a constant small density of changes, the algorithm is
365 linear in the strings size. This is unlikely in typical uses of
366 fstrcmp, and so is usually compiled out. Besides, there is no
367 interface to set it true. */
368static int heuristic;
369
370#endif
371
372
373/* Vector, indexed by diagonal, containing 1 + the X coordinate of the
374 point furthest along the given diagonal in the forward search of the
375 edit matrix. */
376static int *fdiag;
377
378/* Vector, indexed by diagonal, containing the X coordinate of the point
379 furthest along the given diagonal in the backward search of the edit
380 matrix. */
381static int *bdiag;
382
383/* Edit scripts longer than this are too expensive to compute. */
384static int too_expensive;
385
386/* Snakes bigger than this are considered `big'. */
387#define SNAKE_LIMIT 20
388
389struct partition {
390 /* Midpoints of this partition. */
391 int xmid, ymid;
392
393 /* Nonzero if low half will be analyzed minimally. */
394 int lo_minimal;
395
396 /* Likewise for high half. */
397 int hi_minimal;
398};
399
400
401/* NAME
402 diag - find diagonal path
403
404 SYNOPSIS
405 int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
406 struct partition *part);
407
408 DESCRIPTION
409 Find the midpoint of the shortest edit script for a specified
410 portion of the two strings.
411
412 Scan from the beginnings of the strings, and simultaneously from
413 the ends, doing a breadth-first search through the space of
414 edit-sequence. When the two searches meet, we have found the
415 midpoint of the shortest edit sequence.
416
417 If MINIMAL is nonzero, find the minimal edit script regardless
418 of expense. Otherwise, if the search is too expensive, use
419 heuristics to stop the search and report a suboptimal answer.
420
421 RETURNS
422 Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal
423 number XMID - YMID equals the number of inserted characters
424 minus the number of deleted characters (counting only characters
425 before the midpoint). Return the approximate edit cost; this is
426 the total number of characters inserted or deleted (counting
427 only characters before the midpoint), unless a heuristic is used
428 to terminate the search prematurely.
429
430 Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
431 for the left half of the partition is known; similarly for
432 PART->RIGHT_MINIMAL.
433
434 CAVEAT
435 This function assumes that the first characters of the specified
436 portions of the two strings do not match, and likewise that the
437 last characters do not match. The caller must trim matching
438 characters from the beginning and end of the portions it is
439 going to specify.
440
441 If we return the "wrong" partitions, the worst this can do is
442 cause suboptimal diff output. It cannot cause incorrect diff
443 output. */
444
445static int diag PARAMS((int, int, int, int, int, struct partition *));
446
447static int
448diag(int xoff, int xlim, int yoff, int ylim, int minimal, struct partition *part)
449{
450 int *const fd = fdiag; /* Give the compiler a chance. */
451 int *const bd = bdiag; /* Additional help for the compiler. */
452 const char *const xv = string[0].data; /* Still more help for the compiler. */
453 const char *const yv = string[1].data; /* And more and more . . . */
454 const int dmin = xoff - ylim; /* Minimum valid diagonal. */
455 const int dmax = xlim - yoff; /* Maximum valid diagonal. */
456 const int fmid = xoff - yoff; /* Center diagonal of top-down search. */
457 const int bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
458 int fmin = fmid;
459 int fmax = fmid; /* Limits of top-down search. */
460 int bmin = bmid;
461 int bmax = bmid; /* Limits of bottom-up search. */
462 int c; /* Cost. */
463 int odd = (fmid - bmid) & 1;
464
465 /*
466 * True if southeast corner is on an odd diagonal with respect
467 * to the northwest.
468 */
469 fd[fmid] = xoff;
470 bd[bmid] = xlim;
471 for (c = 1;; ++c) {
472 int d; /* Active diagonal. */
473 int big_snake;
474
475 big_snake = 0;
476 /* Extend the top-down search by an edit step in each diagonal. */
477 if (fmin > dmin)
478 fd[--fmin - 1] = -1;
479 else
480 ++fmin;
481 if (fmax < dmax)
482 fd[++fmax + 1] = -1;
483 else
484 --fmax;
485 for (d = fmax; d >= fmin; d -= 2) {
486 int x;
487 int y;
488 int oldx;
489 int tlo;
490 int thi;
491
492 tlo = fd[d - 1], thi = fd[d + 1];
493
494 if (tlo >= thi)
495 x = tlo + 1;
496 else
497 x = thi;
498 oldx = x;
499 y = x - d;
500 while (x < xlim && y < ylim && xv[x] == yv[y]) {
501 ++x;
502 ++y;
503 }
504 if (x - oldx > SNAKE_LIMIT)
505 big_snake = 1;
506 fd[d] = x;
507 if (odd && bmin <= d && d <= bmax && bd[d] <= x) {
508 part->xmid = x;
509 part->ymid = y;
510 part->lo_minimal = part->hi_minimal = 1;
511 return 2 * c - 1;
512 }
513 }
514 /* Similarly extend the bottom-up search. */
515 if (bmin > dmin)
516 bd[--bmin - 1] = INT_MAX;
517 else
518 ++bmin;
519 if (bmax < dmax)
520 bd[++bmax + 1] = INT_MAX;
521 else
522 --bmax;
523 for (d = bmax; d >= bmin; d -= 2) {
524 int x;
525 int y;
526 int oldx;
527 int tlo;
528 int thi;
529
530 tlo = bd[d - 1], thi = bd[d + 1];
531 if (tlo < thi)
532 x = tlo;
533 else
534 x = thi - 1;
535 oldx = x;
536 y = x - d;
537 while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) {
538 --x;
539 --y;
540 }
541 if (oldx - x > SNAKE_LIMIT)
542 big_snake = 1;
543 bd[d] = x;
544 if (!odd && fmin <= d && d <= fmax && x <= fd[d]) {
545 part->xmid = x;
546 part->ymid = y;
547 part->lo_minimal = part->hi_minimal = 1;
548 return 2 * c;
549 }
550 }
551
552 if (minimal)
553 continue;
554
555#ifdef MINUS_H_FLAG
556 /* Heuristic: check occasionally for a diagonal that has made lots
557 of progress compared with the edit distance. If we have any
558 such, find the one that has made the most progress and return
559 it as if it had succeeded.
560
561 With this heuristic, for strings with a constant small density
562 of changes, the algorithm is linear in the strings size. */
563 if (c > 200 && big_snake && heuristic) {
564 int best;
565
566 best = 0;
567 for (d = fmax; d >= fmin; d -= 2) {
568 int dd;
569 int x;
570 int y;
571 int v;
572
573 dd = d - fmid;
574 x = fd[d];
575 y = x - d;
576 v = (x - xoff) * 2 - dd;
577
578 if (v > 12 * (c + (dd < 0 ? -dd : dd))) {
579 if (v > best && xoff + SNAKE_LIMIT <= x && x < xlim && yoff + SNAKE_LIMIT <= y && y < ylim) {
580 /* We have a good enough best diagonal; now insist
581 that it end with a significant snake. */
582 int k;
583
584 for (k = 1; xv[x - k] == yv[y - k]; k++) {
585 if (k == SNAKE_LIMIT) {
586 best = v;
587 part->xmid = x;
588 part->ymid = y;
589 break;
590 }
591 }
592 }
593 }
594 }
595 if (best > 0) {
596 part->lo_minimal = 1;
597 part->hi_minimal = 0;
598 return 2 * c - 1;
599 }
600 best = 0;
601 for (d = bmax; d >= bmin; d -= 2) {
602 int dd;
603 int x;
604 int y;
605 int v;
606
607 dd = d - bmid;
608 x = bd[d];
609 y = x - d;
610 v = (xlim - x) * 2 + dd;
611
612 if (v > 12 * (c + (dd < 0 ? -dd : dd))) {
613 if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT && yoff < y && y <= ylim - SNAKE_LIMIT) {
614 /* We have a good enough best diagonal; now insist
615 that it end with a significant snake. */
616 int k;
617
618 for (k = 0; xv[x + k] == yv[y + k]; k++) {
619 if (k == SNAKE_LIMIT - 1) {
620 best = v;
621 part->xmid = x;
622 part->ymid = y;
623 break;
624 }
625 }
626 }
627 }
628 }
629 if (best > 0) {
630 part->lo_minimal = 0;
631 part->hi_minimal = 1;
632 return 2 * c - 1;
633 }
634 }
635#else
636 (void) big_snake;
637#endif /* MINUS_H_FLAG */
638
639 /* Heuristic: if we've gone well beyond the call of duty, give up
640 and report halfway between our best results so far. */
641 if (c >= too_expensive) {
642 int fxybest;
643 int fxbest;
644 int bxybest;
645 int bxbest;
646
647 /* Pacify `gcc -Wall'. */
648 fxbest = 0;
649 bxbest = 0;
650
651 /* Find forward diagonal that maximizes X + Y. */
652 fxybest = -1;
653 for (d = fmax; d >= fmin; d -= 2) {
654 int x;
655 int y;
656
657 x = fd[d] < xlim ? fd[d] : xlim;
658 y = x - d;
659
660 if (ylim < y) {
661 x = ylim + d;
662 y = ylim;
663 }
664 if (fxybest < x + y) {
665 fxybest = x + y;
666 fxbest = x;
667 }
668 }
669 /* Find backward diagonal that minimizes X + Y. */
670 bxybest = INT_MAX;
671 for (d = bmax; d >= bmin; d -= 2) {
672 int x;
673 int y;
674
675 x = xoff > bd[d] ? xoff : bd[d];
676 y = x - d;
677
678 if (y < yoff) {
679 x = yoff + d;
680 y = yoff;
681 }
682 if (x + y < bxybest) {
683 bxybest = x + y;
684 bxbest = x;
685 }
686 }
687 /* Use the better of the two diagonals. */
688 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) {
689 part->xmid = fxbest;
690 part->ymid = fxybest - fxbest;
691 part->lo_minimal = 1;
692 part->hi_minimal = 0;
693 } else {
694 part->xmid = bxbest;
695 part->ymid = bxybest - bxbest;
696 part->lo_minimal = 0;
697 part->hi_minimal = 1;
698 }
699 return 2 * c - 1;
700 }
701 }
702}
703
704
705/* NAME
706 compareseq - find edit sequence
707
708 SYNOPSIS
709 void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal);
710
711 DESCRIPTION
712 Compare in detail contiguous subsequences of the two strings
713 which are known, as a whole, to match each other.
714
715 The subsequence of string 0 is [XOFF, XLIM) and likewise for
716 string 1.
717
718 Note that XLIM, YLIM are exclusive bounds. All character
719 numbers are origin-0.
720
721 If MINIMAL is nonzero, find a minimal difference no matter how
722 expensive it is. */
723
724static void compareseq PARAMS((int, int, int, int, int));
725
726static void
727compareseq(int xoff, int xlim, int yoff, int ylim, int minimal)
728{
729 const char *const xv = string[0].data; /* Help the compiler. */
730 const char *const yv = string[1].data;
731
732 if (string[1].edit_count + string[0].edit_count > max_edits)
733 return;
734
735 /* Slide down the bottom initial diagonal. */
736 while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff]) {
737 ++xoff;
738 ++yoff;
739 }
740
741 /* Slide up the top initial diagonal. */
742 while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1]) {
743 --xlim;
744 --ylim;
745 }
746
747 /* Handle simple cases. */
748 if (xoff == xlim) {
749 while (yoff < ylim) {
750 ++string[1].edit_count;
751 ++yoff;
752 }
753 } else if (yoff == ylim) {
754 while (xoff < xlim) {
755 ++string[0].edit_count;
756 ++xoff;
757 }
758 } else {
759 int c;
760 struct partition part;
761
762 /* Find a point of correspondence in the middle of the strings. */
763 c = diag(xoff, xlim, yoff, ylim, minimal, &part);
764 if (c == 1) {
765#if 0
766 /* This should be impossible, because it implies that one of
767 the two subsequences is empty, and that case was handled
768 above without calling `diag'. Let's verify that this is
769 true. */
770 abort();
771#else
772 /* The two subsequences differ by a single insert or delete;
773 record it and we are done. */
774 if (part.xmid - part.ymid < xoff - yoff)
775 ++string[1].edit_count;
776 else
777 ++string[0].edit_count;
778#endif
779 } else {
780 /* Use the partitions to split this problem into subproblems. */
781 compareseq(xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
782 compareseq(part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
783 }
784 }
785}
786
787/* NAME
788 fstrcmp - fuzzy string compare
789
790 SYNOPSIS
791 double fstrcmp(const char *s1, int l1, const char *s2, int l2, double);
792
793 DESCRIPTION
794 The fstrcmp function may be used to compare two string for
795 similarity. It is very useful in reducing "cascade" or
796 "secondary" errors in compilers or other situations where
797 symbol tables occur.
798
799 RETURNS
800 double; 0 if the strings are entirly dissimilar, 1 if the
801 strings are identical, and a number in between if they are
802 similar. */
803
804str
805fstrcmp_impl(dbl *ret, str *S1, str *S2, dbl *minimum)
806{
807 char *string1 = *S1;
808 char *string2 = *S2;
809 int i;
810
811 size_t fdiag_len;
812 static int *fdiag_buf;
813 static size_t fdiag_max;
814
815 /* set the info for each string. */
816 string[0].data = string1;
817 string[0].data_length = (int) strlen(string1); /* 64bit: assume string not too long */
818 string[1].data = string2;
819 string[1].data_length = (int) strlen(string2); /* 64bit: assume string not too long */
820
821 /* short-circuit obvious comparisons */
822 if (string[0].data_length == 0 && string[1].data_length == 0) {
823 *ret = 1.0;
824 return MAL_SUCCEED;
825 }
826 if (string[0].data_length == 0 || string[1].data_length == 0) {
827 *ret = 0.0;
828 return MAL_SUCCEED;
829 }
830
831 /* Set TOO_EXPENSIVE to be approximate square root of input size,
832 bounded below by 256. */
833 too_expensive = 1;
834 for (i = string[0].data_length + string[1].data_length; i != 0; i >>= 2)
835 too_expensive <<= 1;
836 if (too_expensive < 256)
837 too_expensive = 256;
838
839 /* Because fstrcmp is typically called multiple times, while scanning
840 symbol tables, etc, attempt to minimize the number of memory
841 allocations performed. Thus, we use a static buffer for the
842 diagonal vectors, and never free them. */
843 fdiag_len = string[0].data_length + string[1].data_length + 3;
844 if (fdiag_len > fdiag_max) {
845 fdiag_max = fdiag_len;
846 fdiag_buf = realloc(fdiag_buf, fdiag_max * (2 * sizeof(int)));
847 }
848 fdiag = fdiag_buf + string[1].data_length + 1;
849 bdiag = fdiag + fdiag_len;
850
851 max_edits = 1 + (int) ((string[0].data_length + string[1].data_length) * (1. - *minimum));
852
853 /* Now do the main comparison algorithm */
854 string[0].edit_count = 0;
855 string[1].edit_count = 0;
856 compareseq(0, string[0].data_length, 0, string[1].data_length, 0);
857
858 /* The result is
859 ((number of chars in common) / (average length of the strings)).
860 This is admittedly biased towards finding that the strings are
861 similar, however it does produce meaningful results. */
862 *ret = ((double)
863 (string[0].data_length + string[1].data_length - string[1].edit_count - string[0].edit_count)
864 / (string[0].data_length + string[1].data_length));
865 return MAL_SUCCEED;
866}
867
868str
869fstrcmp0_impl(dbl *ret, str *string1, str *string2)
870{
871 double min = 0.0;
872
873 return fstrcmp_impl(ret, string1, string2, &min);
874}
875
876
877/* ============ Q-GRAM SELF JOIN ============== */
878
879str
880CMDqgramselfjoin(bat *res1, bat *res2, bat *qid, bat *bid, bat *pid, bat *lid, flt *c, int *k)
881{
882 BAT *qgram, *id, *pos, *len;
883 BUN n;
884 BUN i, j;
885 BAT *bn, *bn2;
886 oid *qbuf;
887 int *ibuf;
888 int *pbuf;
889 int *lbuf;
890 str msg = MAL_SUCCEED;
891
892 qgram = BATdescriptor(*qid);
893 id = BATdescriptor(*bid);
894 pos = BATdescriptor(*pid);
895 len = BATdescriptor(*lid);
896 if (qgram == NULL || id == NULL || pos == NULL || len == NULL) {
897 if (qgram)
898 BBPunfix(qgram->batCacheid);
899 if (id)
900 BBPunfix(id->batCacheid);
901 if (pos)
902 BBPunfix(pos->batCacheid);
903 if (len)
904 BBPunfix(len->batCacheid);
905 throw(MAL, "txtsim.qgramselfjoin", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
906 }
907
908 if (qgram->ttype != TYPE_oid)
909 msg = createException(MAL, "tstsim.qgramselfjoin",
910 SEMANTIC_TYPE_MISMATCH ": tail of BAT qgram must be oid");
911 else if (id->ttype != TYPE_int)
912 msg = createException(MAL, "tstsim.qgramselfjoin",
913 SEMANTIC_TYPE_MISMATCH ": tail of BAT id must be int");
914 else if (pos->ttype != TYPE_int)
915 msg = createException(MAL, "tstsim.qgramselfjoin",
916 SEMANTIC_TYPE_MISMATCH ": tail of BAT pos must be int");
917 else if (len->ttype != TYPE_int)
918 msg = createException(MAL, "tstsim.qgramselfjoin",
919 SEMANTIC_TYPE_MISMATCH ": tail of BAT len must be int");
920 if (msg) {
921 BBPunfix(qgram->batCacheid);
922 BBPunfix(id->batCacheid);
923 BBPunfix(pos->batCacheid);
924 BBPunfix(len->batCacheid);
925 return msg;
926 }
927
928 n = BATcount(qgram);
929 qbuf = (oid *) Tloc(qgram, 0);
930 ibuf = (int *) Tloc(id, 0);
931 pbuf = (int *) Tloc(pos, 0);
932 lbuf = (int *) Tloc(len, 0);
933
934 /* if (BATcount(qgram)>1 && !BATtordered(qgram)) throw(MAL, "tstsim.qgramselfjoin", SEMANTIC_TYPE_MISMATCH); */
935
936 if (!ALIGNsynced(qgram, id))
937 msg = createException(MAL, "tstsim.qgramselfjoin",
938 SEMANTIC_TYPE_MISMATCH ": qgram and id are not synced");
939
940 else if (!ALIGNsynced(qgram, pos))
941 msg = createException(MAL, "tstsim.qgramselfjoin",
942 SEMANTIC_TYPE_MISMATCH ": qgram and pos are not synced");
943 else if (!ALIGNsynced(qgram, len))
944 msg = createException(MAL, "tstsim.qgramselfjoin",
945 SEMANTIC_TYPE_MISMATCH ": qgram and len are not synced");
946
947 else if (Tsize(qgram) != ATOMsize(qgram->ttype))
948 msg = createException(MAL, "tstsim.qgramselfjoin",
949 SEMANTIC_TYPE_MISMATCH ": qgram is not a true void bat");
950 else if (Tsize(id) != ATOMsize(id->ttype))
951 msg = createException(MAL, "tstsim.qgramselfjoin",
952 SEMANTIC_TYPE_MISMATCH ": id is not a true void bat");
953
954 else if (Tsize(pos) != ATOMsize(pos->ttype))
955 msg = createException(MAL, "tstsim.qgramselfjoin",
956 SEMANTIC_TYPE_MISMATCH ": pos is not a true void bat");
957 else if (Tsize(len) != ATOMsize(len->ttype))
958 msg = createException(MAL, "tstsim.qgramselfjoin",
959 SEMANTIC_TYPE_MISMATCH ": len is not a true void bat");
960 if (msg) {
961 BBPunfix(qgram->batCacheid);
962 BBPunfix(id->batCacheid);
963 BBPunfix(pos->batCacheid);
964 BBPunfix(len->batCacheid);
965 return msg;
966 }
967
968 bn = COLnew(0, TYPE_int, n, TRANSIENT);
969 bn2 = COLnew(0, TYPE_int, n, TRANSIENT);
970 if (bn == NULL || bn2 == NULL){
971 BBPreclaim(bn);
972 BBPreclaim(bn2);
973 BBPunfix(qgram->batCacheid);
974 BBPunfix(id->batCacheid);
975 BBPunfix(pos->batCacheid);
976 BBPunfix(len->batCacheid);
977 throw(MAL, "txtsim.qgramselfjoin", SQLSTATE(HY001) MAL_MALLOC_FAIL);
978 }
979
980 for (i = 0; i < n - 1; i++) {
981 for (j = i + 1; (j < n && qbuf[j] == qbuf[i] && pbuf[j] <= (pbuf[i] + (*k + *c * MYMIN(lbuf[i], lbuf[j])))); j++) {
982 if (ibuf[i] != ibuf[j] && abs(lbuf[i] - lbuf[j]) <= (*k + *c * MYMIN(lbuf[i], lbuf[j]))) {
983 if (BUNappend(bn, ibuf + i, false) != GDK_SUCCEED ||
984 BUNappend(bn2, ibuf + j, false) != GDK_SUCCEED) {
985 BBPunfix(qgram->batCacheid);
986 BBPunfix(id->batCacheid);
987 BBPunfix(pos->batCacheid);
988 BBPunfix(len->batCacheid);
989 BBPreclaim(bn);
990 BBPreclaim(bn2);
991 throw(MAL, "txtsim.qgramselfjoin", SQLSTATE(HY001) MAL_MALLOC_FAIL);
992 }
993 }
994 }
995 }
996
997 BBPunfix(qgram->batCacheid);
998 BBPunfix(id->batCacheid);
999 BBPunfix(pos->batCacheid);
1000 BBPunfix(len->batCacheid);
1001
1002 BBPkeepref(*res1 = bn->batCacheid);
1003 BBPkeepref(*res2 = bn2->batCacheid);
1004
1005 return MAL_SUCCEED;
1006}
1007
1008/* copy up to utf8len UTF-8 encoded characters from src to buf
1009 * stop early if buf (size given by bufsize) is too small, or if src runs out
1010 * return number of UTF-8 characters copied (excluding NUL)
1011 * close with NUL if enough space */
1012static size_t
1013utf8strncpy(char *buf, size_t bufsize, const char *src, size_t utf8len)
1014{
1015 size_t cnt = 0;
1016
1017 while (utf8len != 0 && *src != 0 && bufsize != 0) {
1018 bufsize--;
1019 utf8len--;
1020 cnt++;
1021 if (((*buf++ = *src++) & 0x80) != 0) {
1022 while ((*src & 0xC0) == 0x80 && bufsize != 0) {
1023 *buf++ = *src++;
1024 bufsize--;
1025 }
1026 }
1027 }
1028 if (bufsize != 0)
1029 *buf = 0;
1030 return cnt;
1031}
1032
1033str
1034CMDstr2qgrams(bat *ret, str *val)
1035{
1036 BAT *bn;
1037 size_t i, len = strlen(*val) + 5;
1038 str s = GDKmalloc(len);
1039 char qgram[4 * 6 + 1]; /* 4 UTF-8 code points plus NULL byte */
1040
1041 if (s == NULL)
1042 throw(MAL, "txtsim.str2qgram", SQLSTATE(HY001) MAL_MALLOC_FAIL);
1043 strcpy(s, "##");
1044 strcpy(s + 2, *val);
1045 strcpy(s + len - 3, "$$");
1046 bn = COLnew(0, TYPE_str, (BUN) strlen(*val), TRANSIENT);
1047 if (bn == NULL) {
1048 GDKfree(s);
1049 throw(MAL, "txtsim.str2qgram", SQLSTATE(HY001) MAL_MALLOC_FAIL);
1050 }
1051
1052 i = 0;
1053 while (s[i]) {
1054 if (utf8strncpy(qgram, sizeof(qgram), s + i, 4) < 4)
1055 break;
1056 if (BUNappend(bn, qgram, false) != GDK_SUCCEED) {
1057 BBPreclaim(bn);
1058 GDKfree(s);
1059 throw(MAL, "txtsim.str2qgram", SQLSTATE(HY001) MAL_MALLOC_FAIL);
1060 }
1061 if ((s[i++] & 0xC0) == 0xC0) {
1062 while ((s[i] & 0xC0) == 0x80)
1063 i++;
1064 }
1065 }
1066 BBPkeepref(*ret = bn->batCacheid);
1067 GDKfree(s);
1068 return MAL_SUCCEED;
1069}
1070