1 | /* |
2 | * This Source Code Form is subject to the terms of the Mozilla Public |
3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
5 | * |
6 | * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V. |
7 | */ |
8 | |
9 | /* |
10 | * @f txtsim |
11 | * @t Module providing similarity metrics for strings |
12 | * @a Romulo Goncalves (from M4 to M5) |
13 | * @d 01/12/2007 |
14 | * @v 0.1 |
15 | * |
16 | * @+ String metrics |
17 | * |
18 | * Provides basic similarity metrics for strings. |
19 | * |
20 | */ |
21 | #include "monetdb_config.h" |
22 | #include "txtsim.h" |
23 | #include "mal_exception.h" |
24 | |
25 | |
26 | #define RETURN_NIL_IF(b,t) \ |
27 | if (b) {\ |
28 | if (ATOMextern(t)) {\ |
29 | *(ptr*) res = (ptr) ATOMnil(t);\ |
30 | if ( *(ptr *) res == NULL)\ |
31 | throw(MAL,"txtsim", SQLSTATE(HY001) MAL_MALLOC_FAIL);\ |
32 | } else {\ |
33 | memcpy(res, ATOMnilptr(t), ATOMsize(t));\ |
34 | }\ |
35 | return MAL_SUCCEED; \ |
36 | } |
37 | |
38 | /* ========================================================================= |
39 | * LEVENSH?TEIN FUNCTION |
40 | * Source: |
41 | * http://www.merriampark.com/ld.htm |
42 | * ========================================================================= |
43 | */ |
44 | |
45 | #define MYMIN(x,y) ( (x<y) ? x : y ) |
46 | #define SMALLEST_OF(x,y,z) ( MYMIN(MYMIN(x,y),z) ) |
47 | #define SMALLEST_OF4(x,y,z,z2) ( MYMIN(MYMIN(MYMIN(x,y),z),z2) ) |
48 | |
49 | /*************************************************** |
50 | * Get a pointer to the specified cell of the matrix |
51 | **************************************************/ |
52 | |
53 | static inline int * |
54 | levenshtein_GetCellPointer(int *pOrigin, int col, int row, int nCols) |
55 | { |
56 | return pOrigin + col + (row * (nCols + 1)); |
57 | } |
58 | |
59 | /****************************************************** |
60 | * Get the contents of the specified cell in the matrix |
61 | *****************************************************/ |
62 | |
63 | static inline int |
64 | levenshtein_GetAt(int *pOrigin, int col, int row, int nCols) |
65 | { |
66 | int *pCell; |
67 | |
68 | pCell = levenshtein_GetCellPointer(pOrigin, col, row, nCols); |
69 | return *pCell; |
70 | |
71 | } |
72 | |
73 | /******************************************************** |
74 | * Fill the specified cell in the matrix with the value x |
75 | *******************************************************/ |
76 | |
77 | static inline void |
78 | levenshtein_PutAt(int *pOrigin, int col, int row, int nCols, int x) |
79 | { |
80 | int *pCell; |
81 | |
82 | pCell = levenshtein_GetCellPointer(pOrigin, col, row, nCols); |
83 | *pCell = x; |
84 | } |
85 | |
86 | |
87 | /****************************** |
88 | * Compute Levenshtein distance |
89 | *****************************/ |
90 | str |
91 | levenshtein_impl(int *result, str *S, str *T, int *insdel_cost, int *replace_cost, int *transpose_cost) |
92 | { |
93 | char *s = *S; |
94 | char *t = *T; |
95 | int *d; /* pointer to matrix */ |
96 | int n; /* length of s */ |
97 | int m; /* length of t */ |
98 | int i; /* iterates through s */ |
99 | int j; /* iterates through t */ |
100 | char s_i; /* ith character of s */ |
101 | char t_j; /* jth character of t */ |
102 | int cost; /* cost */ |
103 | int cell; /* contents of target cell */ |
104 | int above; /* contents of cell immediately above */ |
105 | int left; /* contents of cell immediately to left */ |
106 | int diag; /* contents of cell immediately above and to left */ |
107 | int sz; /* number of cells in matrix */ |
108 | int diag2 = 0, cost2 = 0; |
109 | |
110 | /* Step 1 */ |
111 | n = (int) strlen(s); /* 64bit: assume strings are less than 2 GB */ |
112 | m = (int) strlen(t); |
113 | if (n == 0) { |
114 | *result = m; |
115 | return MAL_SUCCEED; |
116 | } |
117 | if (m == 0) { |
118 | *result = n; |
119 | return MAL_SUCCEED; |
120 | } |
121 | sz = (n + 1) * (m + 1) * sizeof(int); |
122 | d = (int *) GDKmalloc(sz); |
123 | if ( d == NULL) |
124 | throw(MAL,"levenshtein" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
125 | |
126 | /* Step 2 */ |
127 | for (i = 0; i <= n; i++) { |
128 | levenshtein_PutAt(d, i, 0, n, i); |
129 | } |
130 | |
131 | for (j = 0; j <= m; j++) { |
132 | levenshtein_PutAt(d, 0, j, n, j); |
133 | } |
134 | |
135 | /* Step 3 */ |
136 | for (i = 1; i <= n; i++) { |
137 | |
138 | s_i = s[i - 1]; |
139 | |
140 | /* Step 4 */ |
141 | for (j = 1; j <= m; j++) { |
142 | |
143 | t_j = t[j - 1]; |
144 | |
145 | /* Step 5 */ |
146 | if (s_i == t_j) { |
147 | cost = 0; |
148 | } else { |
149 | cost = *replace_cost; |
150 | } |
151 | |
152 | /* Step 6 */ |
153 | above = levenshtein_GetAt(d, i - 1, j, n); |
154 | left = levenshtein_GetAt(d, i, j - 1, n); |
155 | diag = levenshtein_GetAt(d, i - 1, j - 1, n); |
156 | |
157 | if (j >= 2 && i >= 2) { |
158 | /* NEW: detect transpositions */ |
159 | |
160 | diag2 = levenshtein_GetAt(d, i - 2, j - 2, n); |
161 | if (s[i - 2] == t[j - 1] && s[i - 1] == t[j - 2]) { |
162 | cost2 = *transpose_cost; |
163 | } else { |
164 | cost2 = 2; |
165 | } |
166 | cell = SMALLEST_OF4(above + *insdel_cost, left + *insdel_cost, diag + cost, diag2 + cost2); |
167 | } else { |
168 | cell = SMALLEST_OF(above + *insdel_cost, left + *insdel_cost, diag + cost); |
169 | } |
170 | levenshtein_PutAt(d, i, j, n, cell); |
171 | } |
172 | } |
173 | |
174 | /* Step 7 */ |
175 | *result = levenshtein_GetAt(d, n, m, n); |
176 | GDKfree(d); |
177 | return MAL_SUCCEED; |
178 | } |
179 | |
180 | str |
181 | levenshteinbasic_impl(int *result, str *s, str *t) |
182 | { |
183 | int insdel = 1, replace = 1, transpose = 2; |
184 | |
185 | return levenshtein_impl(result, s, t, &insdel, &replace, &transpose); |
186 | } |
187 | |
188 | str |
189 | levenshteinbasic2_impl(int *result, str *s, str *t) |
190 | { |
191 | int insdel = 1, replace = 1, transpose = 1; |
192 | |
193 | return levenshtein_impl(result, s, t, &insdel, &replace, &transpose); |
194 | } |
195 | |
196 | |
197 | /* ========================================================================= |
198 | * SOUNDEX FUNCTION |
199 | * Source: |
200 | * http://www.mit.edu/afs/sipb/service/rtfm/src/freeWAIS-sf-1.0/ir/soundex.c |
201 | * ========================================================================= |
202 | */ |
203 | |
204 | #define SoundexLen 4 /* length of a soundex code */ |
205 | #define SoundexKey "Z000" /* default key for soundex code */ |
206 | |
207 | /* set letter values */ |
208 | static int Code[] = { 0, 1, 2, 3, 0, 1, 2, 0, 0, 2, 2, 4, 5, 5, 0, |
209 | 1, 2, 6, 2, 3, 0, 1, 0, 2, 0, 2 |
210 | }; |
211 | |
212 | static inline char |
213 | SCode(unsigned char c) |
214 | { |
215 | if (c == 95) |
216 | return (2); /* german sz */ |
217 | return (Code[toupper(c) - 'A']); |
218 | } |
219 | |
220 | static void |
221 | soundex_code(char *Name, char *Key) |
222 | { |
223 | char LastLetter; |
224 | int Index; |
225 | |
226 | /* set default key */ |
227 | strcpy(Key, SoundexKey); |
228 | |
229 | /* keep first letter */ |
230 | Key[0] = *Name; |
231 | if (!isupper((unsigned char) (Key[0]))) |
232 | Key[0] = toupper(Key[0]); |
233 | |
234 | LastLetter = *Name; |
235 | if (!*Name) |
236 | return; |
237 | Name++; |
238 | |
239 | /* scan rest of string */ |
240 | for (Index = 1; (Index <SoundexLen) &&*Name; Name++) { |
241 | /* use only letters */ |
242 | if (isalpha((unsigned char) (*Name))) { |
243 | /* ignore duplicate successive chars */ |
244 | if (LastLetter != *Name) { |
245 | /* new LastLetter */ |
246 | LastLetter = *Name; |
247 | |
248 | /* ignore letters with code 0 */ |
249 | if (SCode(*Name) != 0) { |
250 | Key[Index] = '0' + SCode(*Name); |
251 | Index ++; |
252 | } |
253 | } |
254 | } |
255 | } |
256 | } |
257 | |
258 | |
259 | str |
260 | soundex_impl(str *res, str *Name) |
261 | { |
262 | RETURN_NIL_IF(strNil(*Name), TYPE_str); |
263 | |
264 | *res = (str) GDKmalloc(sizeof(char) * (SoundexLen + 1)); |
265 | if( *res == NULL) |
266 | throw(MAL,"soundex" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
267 | |
268 | /* calculate Key for Name */ |
269 | soundex_code(*Name, *res); |
270 | |
271 | return MAL_SUCCEED; |
272 | } |
273 | |
274 | str |
275 | stringdiff_impl(int *res, str *s1, str *s2) |
276 | { |
277 | str r = MAL_SUCCEED; |
278 | char *S1 = NULL, *S2 = NULL; |
279 | |
280 | r = soundex_impl(&S1, s1); |
281 | if( r != MAL_SUCCEED) |
282 | return r; |
283 | r = soundex_impl(&S2, s2); |
284 | if( r != MAL_SUCCEED){ |
285 | GDKfree(S1); |
286 | return r; |
287 | } |
288 | r = levenshteinbasic_impl(res, &S1, &S2); |
289 | GDKfree(S1); |
290 | GDKfree(S2); |
291 | return r; |
292 | } |
293 | |
294 | /****************************** |
295 | * QGRAMNORMALIZE |
296 | * |
297 | * This function 'normalizes' a string so valid q-grams can be made of it: |
298 | * All characters are transformed to uppercase, and all characters |
299 | * which are not letters or digits are stripped to a single space. |
300 | * |
301 | * qgramnormalize("Hallo, allemaal!").print(); --> "HALLO ALLEMAAL" |
302 | * qgramnormalize(" '' t ' est").print(); --> [ "T EST" ] |
303 | * |
304 | *****************************/ |
305 | str |
306 | CMDqgramnormalize(str *res, str *Input) |
307 | { |
308 | char *input = *Input; |
309 | int i, j = 0; |
310 | char c, last = ' '; |
311 | |
312 | RETURN_NIL_IF(strNil(input), TYPE_str); |
313 | *res = (str) GDKmalloc(sizeof(char) * (strlen(input) + 1)); /* normalized strings are never longer than original */ |
314 | if (*res == NULL) |
315 | throw(MAL, "txtsim.qgramnormalize" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
316 | |
317 | for (i = 0; input[i]; i++) { |
318 | c = toupper(input[i]); |
319 | if (!(('A' <= c && c <= 'Z') || isdigit((unsigned char) c))) |
320 | c = ' '; |
321 | if (c != ' ' || last != ' ') { |
322 | (*res)[j++] = c; |
323 | } |
324 | last = c; |
325 | } |
326 | (*res)[j] = 0; |
327 | /* strip final whitespace */ |
328 | while (j > 0 && (*res)[--j] == ' ') |
329 | (*res)[j] = 0; |
330 | |
331 | return MAL_SUCCEED; |
332 | } |
333 | |
334 | /* ========================================================================= |
335 | * FSTRCMP FUNCTION |
336 | * Source: |
337 | * http://search.cpan.org/src/MLEHMANN/String-Similarity-1/fstrcmp.c |
338 | * ========================================================================= |
339 | */ |
340 | |
341 | #define PARAMS(proto) proto |
342 | |
343 | /* |
344 | * Data on one input string being compared. |
345 | */ |
346 | struct string_data { |
347 | /* The string to be compared. */ |
348 | const char *data; |
349 | |
350 | /* The length of the string to be compared. */ |
351 | int data_length; |
352 | |
353 | /* The number of characters inserted or deleted. */ |
354 | int edit_count; |
355 | }; |
356 | |
357 | static struct string_data string[2]; |
358 | |
359 | static int max_edits; /* compareseq stops when edits > max_edits */ |
360 | |
361 | #ifdef MINUS_H_FLAG |
362 | |
363 | /* This corresponds to the diff -H flag. With this heuristic, for |
364 | strings with a constant small density of changes, the algorithm is |
365 | linear in the strings size. This is unlikely in typical uses of |
366 | fstrcmp, and so is usually compiled out. Besides, there is no |
367 | interface to set it true. */ |
368 | static int heuristic; |
369 | |
370 | #endif |
371 | |
372 | |
373 | /* Vector, indexed by diagonal, containing 1 + the X coordinate of the |
374 | point furthest along the given diagonal in the forward search of the |
375 | edit matrix. */ |
376 | static int *fdiag; |
377 | |
378 | /* Vector, indexed by diagonal, containing the X coordinate of the point |
379 | furthest along the given diagonal in the backward search of the edit |
380 | matrix. */ |
381 | static int *bdiag; |
382 | |
383 | /* Edit scripts longer than this are too expensive to compute. */ |
384 | static int too_expensive; |
385 | |
386 | /* Snakes bigger than this are considered `big'. */ |
387 | #define SNAKE_LIMIT 20 |
388 | |
389 | struct partition { |
390 | /* Midpoints of this partition. */ |
391 | int xmid, ymid; |
392 | |
393 | /* Nonzero if low half will be analyzed minimally. */ |
394 | int lo_minimal; |
395 | |
396 | /* Likewise for high half. */ |
397 | int hi_minimal; |
398 | }; |
399 | |
400 | |
401 | /* NAME |
402 | diag - find diagonal path |
403 | |
404 | SYNOPSIS |
405 | int diag(int xoff, int xlim, int yoff, int ylim, int minimal, |
406 | struct partition *part); |
407 | |
408 | DESCRIPTION |
409 | Find the midpoint of the shortest edit script for a specified |
410 | portion of the two strings. |
411 | |
412 | Scan from the beginnings of the strings, and simultaneously from |
413 | the ends, doing a breadth-first search through the space of |
414 | edit-sequence. When the two searches meet, we have found the |
415 | midpoint of the shortest edit sequence. |
416 | |
417 | If MINIMAL is nonzero, find the minimal edit script regardless |
418 | of expense. Otherwise, if the search is too expensive, use |
419 | heuristics to stop the search and report a suboptimal answer. |
420 | |
421 | RETURNS |
422 | Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal |
423 | number XMID - YMID equals the number of inserted characters |
424 | minus the number of deleted characters (counting only characters |
425 | before the midpoint). Return the approximate edit cost; this is |
426 | the total number of characters inserted or deleted (counting |
427 | only characters before the midpoint), unless a heuristic is used |
428 | to terminate the search prematurely. |
429 | |
430 | Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script |
431 | for the left half of the partition is known; similarly for |
432 | PART->RIGHT_MINIMAL. |
433 | |
434 | CAVEAT |
435 | This function assumes that the first characters of the specified |
436 | portions of the two strings do not match, and likewise that the |
437 | last characters do not match. The caller must trim matching |
438 | characters from the beginning and end of the portions it is |
439 | going to specify. |
440 | |
441 | If we return the "wrong" partitions, the worst this can do is |
442 | cause suboptimal diff output. It cannot cause incorrect diff |
443 | output. */ |
444 | |
445 | static int diag PARAMS((int, int, int, int, int, struct partition *)); |
446 | |
447 | static int |
448 | diag(int xoff, int xlim, int yoff, int ylim, int minimal, struct partition *part) |
449 | { |
450 | int *const fd = fdiag; /* Give the compiler a chance. */ |
451 | int *const bd = bdiag; /* Additional help for the compiler. */ |
452 | const char *const xv = string[0].data; /* Still more help for the compiler. */ |
453 | const char *const yv = string[1].data; /* And more and more . . . */ |
454 | const int dmin = xoff - ylim; /* Minimum valid diagonal. */ |
455 | const int dmax = xlim - yoff; /* Maximum valid diagonal. */ |
456 | const int fmid = xoff - yoff; /* Center diagonal of top-down search. */ |
457 | const int bmid = xlim - ylim; /* Center diagonal of bottom-up search. */ |
458 | int fmin = fmid; |
459 | int fmax = fmid; /* Limits of top-down search. */ |
460 | int bmin = bmid; |
461 | int bmax = bmid; /* Limits of bottom-up search. */ |
462 | int c; /* Cost. */ |
463 | int odd = (fmid - bmid) & 1; |
464 | |
465 | /* |
466 | * True if southeast corner is on an odd diagonal with respect |
467 | * to the northwest. |
468 | */ |
469 | fd[fmid] = xoff; |
470 | bd[bmid] = xlim; |
471 | for (c = 1;; ++c) { |
472 | int d; /* Active diagonal. */ |
473 | int big_snake; |
474 | |
475 | big_snake = 0; |
476 | /* Extend the top-down search by an edit step in each diagonal. */ |
477 | if (fmin > dmin) |
478 | fd[--fmin - 1] = -1; |
479 | else |
480 | ++fmin; |
481 | if (fmax < dmax) |
482 | fd[++fmax + 1] = -1; |
483 | else |
484 | --fmax; |
485 | for (d = fmax; d >= fmin; d -= 2) { |
486 | int x; |
487 | int y; |
488 | int oldx; |
489 | int tlo; |
490 | int thi; |
491 | |
492 | tlo = fd[d - 1], thi = fd[d + 1]; |
493 | |
494 | if (tlo >= thi) |
495 | x = tlo + 1; |
496 | else |
497 | x = thi; |
498 | oldx = x; |
499 | y = x - d; |
500 | while (x < xlim && y < ylim && xv[x] == yv[y]) { |
501 | ++x; |
502 | ++y; |
503 | } |
504 | if (x - oldx > SNAKE_LIMIT) |
505 | big_snake = 1; |
506 | fd[d] = x; |
507 | if (odd && bmin <= d && d <= bmax && bd[d] <= x) { |
508 | part->xmid = x; |
509 | part->ymid = y; |
510 | part->lo_minimal = part->hi_minimal = 1; |
511 | return 2 * c - 1; |
512 | } |
513 | } |
514 | /* Similarly extend the bottom-up search. */ |
515 | if (bmin > dmin) |
516 | bd[--bmin - 1] = INT_MAX; |
517 | else |
518 | ++bmin; |
519 | if (bmax < dmax) |
520 | bd[++bmax + 1] = INT_MAX; |
521 | else |
522 | --bmax; |
523 | for (d = bmax; d >= bmin; d -= 2) { |
524 | int x; |
525 | int y; |
526 | int oldx; |
527 | int tlo; |
528 | int thi; |
529 | |
530 | tlo = bd[d - 1], thi = bd[d + 1]; |
531 | if (tlo < thi) |
532 | x = tlo; |
533 | else |
534 | x = thi - 1; |
535 | oldx = x; |
536 | y = x - d; |
537 | while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) { |
538 | --x; |
539 | --y; |
540 | } |
541 | if (oldx - x > SNAKE_LIMIT) |
542 | big_snake = 1; |
543 | bd[d] = x; |
544 | if (!odd && fmin <= d && d <= fmax && x <= fd[d]) { |
545 | part->xmid = x; |
546 | part->ymid = y; |
547 | part->lo_minimal = part->hi_minimal = 1; |
548 | return 2 * c; |
549 | } |
550 | } |
551 | |
552 | if (minimal) |
553 | continue; |
554 | |
555 | #ifdef MINUS_H_FLAG |
556 | /* Heuristic: check occasionally for a diagonal that has made lots |
557 | of progress compared with the edit distance. If we have any |
558 | such, find the one that has made the most progress and return |
559 | it as if it had succeeded. |
560 | |
561 | With this heuristic, for strings with a constant small density |
562 | of changes, the algorithm is linear in the strings size. */ |
563 | if (c > 200 && big_snake && heuristic) { |
564 | int best; |
565 | |
566 | best = 0; |
567 | for (d = fmax; d >= fmin; d -= 2) { |
568 | int dd; |
569 | int x; |
570 | int y; |
571 | int v; |
572 | |
573 | dd = d - fmid; |
574 | x = fd[d]; |
575 | y = x - d; |
576 | v = (x - xoff) * 2 - dd; |
577 | |
578 | if (v > 12 * (c + (dd < 0 ? -dd : dd))) { |
579 | if (v > best && xoff + SNAKE_LIMIT <= x && x < xlim && yoff + SNAKE_LIMIT <= y && y < ylim) { |
580 | /* We have a good enough best diagonal; now insist |
581 | that it end with a significant snake. */ |
582 | int k; |
583 | |
584 | for (k = 1; xv[x - k] == yv[y - k]; k++) { |
585 | if (k == SNAKE_LIMIT) { |
586 | best = v; |
587 | part->xmid = x; |
588 | part->ymid = y; |
589 | break; |
590 | } |
591 | } |
592 | } |
593 | } |
594 | } |
595 | if (best > 0) { |
596 | part->lo_minimal = 1; |
597 | part->hi_minimal = 0; |
598 | return 2 * c - 1; |
599 | } |
600 | best = 0; |
601 | for (d = bmax; d >= bmin; d -= 2) { |
602 | int dd; |
603 | int x; |
604 | int y; |
605 | int v; |
606 | |
607 | dd = d - bmid; |
608 | x = bd[d]; |
609 | y = x - d; |
610 | v = (xlim - x) * 2 + dd; |
611 | |
612 | if (v > 12 * (c + (dd < 0 ? -dd : dd))) { |
613 | if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT && yoff < y && y <= ylim - SNAKE_LIMIT) { |
614 | /* We have a good enough best diagonal; now insist |
615 | that it end with a significant snake. */ |
616 | int k; |
617 | |
618 | for (k = 0; xv[x + k] == yv[y + k]; k++) { |
619 | if (k == SNAKE_LIMIT - 1) { |
620 | best = v; |
621 | part->xmid = x; |
622 | part->ymid = y; |
623 | break; |
624 | } |
625 | } |
626 | } |
627 | } |
628 | } |
629 | if (best > 0) { |
630 | part->lo_minimal = 0; |
631 | part->hi_minimal = 1; |
632 | return 2 * c - 1; |
633 | } |
634 | } |
635 | #else |
636 | (void) big_snake; |
637 | #endif /* MINUS_H_FLAG */ |
638 | |
639 | /* Heuristic: if we've gone well beyond the call of duty, give up |
640 | and report halfway between our best results so far. */ |
641 | if (c >= too_expensive) { |
642 | int fxybest; |
643 | int fxbest; |
644 | int bxybest; |
645 | int bxbest; |
646 | |
647 | /* Pacify `gcc -Wall'. */ |
648 | fxbest = 0; |
649 | bxbest = 0; |
650 | |
651 | /* Find forward diagonal that maximizes X + Y. */ |
652 | fxybest = -1; |
653 | for (d = fmax; d >= fmin; d -= 2) { |
654 | int x; |
655 | int y; |
656 | |
657 | x = fd[d] < xlim ? fd[d] : xlim; |
658 | y = x - d; |
659 | |
660 | if (ylim < y) { |
661 | x = ylim + d; |
662 | y = ylim; |
663 | } |
664 | if (fxybest < x + y) { |
665 | fxybest = x + y; |
666 | fxbest = x; |
667 | } |
668 | } |
669 | /* Find backward diagonal that minimizes X + Y. */ |
670 | bxybest = INT_MAX; |
671 | for (d = bmax; d >= bmin; d -= 2) { |
672 | int x; |
673 | int y; |
674 | |
675 | x = xoff > bd[d] ? xoff : bd[d]; |
676 | y = x - d; |
677 | |
678 | if (y < yoff) { |
679 | x = yoff + d; |
680 | y = yoff; |
681 | } |
682 | if (x + y < bxybest) { |
683 | bxybest = x + y; |
684 | bxbest = x; |
685 | } |
686 | } |
687 | /* Use the better of the two diagonals. */ |
688 | if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) { |
689 | part->xmid = fxbest; |
690 | part->ymid = fxybest - fxbest; |
691 | part->lo_minimal = 1; |
692 | part->hi_minimal = 0; |
693 | } else { |
694 | part->xmid = bxbest; |
695 | part->ymid = bxybest - bxbest; |
696 | part->lo_minimal = 0; |
697 | part->hi_minimal = 1; |
698 | } |
699 | return 2 * c - 1; |
700 | } |
701 | } |
702 | } |
703 | |
704 | |
705 | /* NAME |
706 | compareseq - find edit sequence |
707 | |
708 | SYNOPSIS |
709 | void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal); |
710 | |
711 | DESCRIPTION |
712 | Compare in detail contiguous subsequences of the two strings |
713 | which are known, as a whole, to match each other. |
714 | |
715 | The subsequence of string 0 is [XOFF, XLIM) and likewise for |
716 | string 1. |
717 | |
718 | Note that XLIM, YLIM are exclusive bounds. All character |
719 | numbers are origin-0. |
720 | |
721 | If MINIMAL is nonzero, find a minimal difference no matter how |
722 | expensive it is. */ |
723 | |
724 | static void compareseq PARAMS((int, int, int, int, int)); |
725 | |
726 | static void |
727 | compareseq(int xoff, int xlim, int yoff, int ylim, int minimal) |
728 | { |
729 | const char *const xv = string[0].data; /* Help the compiler. */ |
730 | const char *const yv = string[1].data; |
731 | |
732 | if (string[1].edit_count + string[0].edit_count > max_edits) |
733 | return; |
734 | |
735 | /* Slide down the bottom initial diagonal. */ |
736 | while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff]) { |
737 | ++xoff; |
738 | ++yoff; |
739 | } |
740 | |
741 | /* Slide up the top initial diagonal. */ |
742 | while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1]) { |
743 | --xlim; |
744 | --ylim; |
745 | } |
746 | |
747 | /* Handle simple cases. */ |
748 | if (xoff == xlim) { |
749 | while (yoff < ylim) { |
750 | ++string[1].edit_count; |
751 | ++yoff; |
752 | } |
753 | } else if (yoff == ylim) { |
754 | while (xoff < xlim) { |
755 | ++string[0].edit_count; |
756 | ++xoff; |
757 | } |
758 | } else { |
759 | int c; |
760 | struct partition part; |
761 | |
762 | /* Find a point of correspondence in the middle of the strings. */ |
763 | c = diag(xoff, xlim, yoff, ylim, minimal, &part); |
764 | if (c == 1) { |
765 | #if 0 |
766 | /* This should be impossible, because it implies that one of |
767 | the two subsequences is empty, and that case was handled |
768 | above without calling `diag'. Let's verify that this is |
769 | true. */ |
770 | abort(); |
771 | #else |
772 | /* The two subsequences differ by a single insert or delete; |
773 | record it and we are done. */ |
774 | if (part.xmid - part.ymid < xoff - yoff) |
775 | ++string[1].edit_count; |
776 | else |
777 | ++string[0].edit_count; |
778 | #endif |
779 | } else { |
780 | /* Use the partitions to split this problem into subproblems. */ |
781 | compareseq(xoff, part.xmid, yoff, part.ymid, part.lo_minimal); |
782 | compareseq(part.xmid, xlim, part.ymid, ylim, part.hi_minimal); |
783 | } |
784 | } |
785 | } |
786 | |
787 | /* NAME |
788 | fstrcmp - fuzzy string compare |
789 | |
790 | SYNOPSIS |
791 | double fstrcmp(const char *s1, int l1, const char *s2, int l2, double); |
792 | |
793 | DESCRIPTION |
794 | The fstrcmp function may be used to compare two string for |
795 | similarity. It is very useful in reducing "cascade" or |
796 | "secondary" errors in compilers or other situations where |
797 | symbol tables occur. |
798 | |
799 | RETURNS |
800 | double; 0 if the strings are entirly dissimilar, 1 if the |
801 | strings are identical, and a number in between if they are |
802 | similar. */ |
803 | |
804 | str |
805 | fstrcmp_impl(dbl *ret, str *S1, str *S2, dbl *minimum) |
806 | { |
807 | char *string1 = *S1; |
808 | char *string2 = *S2; |
809 | int i; |
810 | |
811 | size_t fdiag_len; |
812 | static int *fdiag_buf; |
813 | static size_t fdiag_max; |
814 | |
815 | /* set the info for each string. */ |
816 | string[0].data = string1; |
817 | string[0].data_length = (int) strlen(string1); /* 64bit: assume string not too long */ |
818 | string[1].data = string2; |
819 | string[1].data_length = (int) strlen(string2); /* 64bit: assume string not too long */ |
820 | |
821 | /* short-circuit obvious comparisons */ |
822 | if (string[0].data_length == 0 && string[1].data_length == 0) { |
823 | *ret = 1.0; |
824 | return MAL_SUCCEED; |
825 | } |
826 | if (string[0].data_length == 0 || string[1].data_length == 0) { |
827 | *ret = 0.0; |
828 | return MAL_SUCCEED; |
829 | } |
830 | |
831 | /* Set TOO_EXPENSIVE to be approximate square root of input size, |
832 | bounded below by 256. */ |
833 | too_expensive = 1; |
834 | for (i = string[0].data_length + string[1].data_length; i != 0; i >>= 2) |
835 | too_expensive <<= 1; |
836 | if (too_expensive < 256) |
837 | too_expensive = 256; |
838 | |
839 | /* Because fstrcmp is typically called multiple times, while scanning |
840 | symbol tables, etc, attempt to minimize the number of memory |
841 | allocations performed. Thus, we use a static buffer for the |
842 | diagonal vectors, and never free them. */ |
843 | fdiag_len = string[0].data_length + string[1].data_length + 3; |
844 | if (fdiag_len > fdiag_max) { |
845 | fdiag_max = fdiag_len; |
846 | fdiag_buf = realloc(fdiag_buf, fdiag_max * (2 * sizeof(int))); |
847 | } |
848 | fdiag = fdiag_buf + string[1].data_length + 1; |
849 | bdiag = fdiag + fdiag_len; |
850 | |
851 | max_edits = 1 + (int) ((string[0].data_length + string[1].data_length) * (1. - *minimum)); |
852 | |
853 | /* Now do the main comparison algorithm */ |
854 | string[0].edit_count = 0; |
855 | string[1].edit_count = 0; |
856 | compareseq(0, string[0].data_length, 0, string[1].data_length, 0); |
857 | |
858 | /* The result is |
859 | ((number of chars in common) / (average length of the strings)). |
860 | This is admittedly biased towards finding that the strings are |
861 | similar, however it does produce meaningful results. */ |
862 | *ret = ((double) |
863 | (string[0].data_length + string[1].data_length - string[1].edit_count - string[0].edit_count) |
864 | / (string[0].data_length + string[1].data_length)); |
865 | return MAL_SUCCEED; |
866 | } |
867 | |
868 | str |
869 | fstrcmp0_impl(dbl *ret, str *string1, str *string2) |
870 | { |
871 | double min = 0.0; |
872 | |
873 | return fstrcmp_impl(ret, string1, string2, &min); |
874 | } |
875 | |
876 | |
877 | /* ============ Q-GRAM SELF JOIN ============== */ |
878 | |
879 | str |
880 | CMDqgramselfjoin(bat *res1, bat *res2, bat *qid, bat *bid, bat *pid, bat *lid, flt *c, int *k) |
881 | { |
882 | BAT *qgram, *id, *pos, *len; |
883 | BUN n; |
884 | BUN i, j; |
885 | BAT *bn, *bn2; |
886 | oid *qbuf; |
887 | int *ibuf; |
888 | int *pbuf; |
889 | int *lbuf; |
890 | str msg = MAL_SUCCEED; |
891 | |
892 | qgram = BATdescriptor(*qid); |
893 | id = BATdescriptor(*bid); |
894 | pos = BATdescriptor(*pid); |
895 | len = BATdescriptor(*lid); |
896 | if (qgram == NULL || id == NULL || pos == NULL || len == NULL) { |
897 | if (qgram) |
898 | BBPunfix(qgram->batCacheid); |
899 | if (id) |
900 | BBPunfix(id->batCacheid); |
901 | if (pos) |
902 | BBPunfix(pos->batCacheid); |
903 | if (len) |
904 | BBPunfix(len->batCacheid); |
905 | throw(MAL, "txtsim.qgramselfjoin" , SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); |
906 | } |
907 | |
908 | if (qgram->ttype != TYPE_oid) |
909 | msg = createException(MAL, "tstsim.qgramselfjoin" , |
910 | SEMANTIC_TYPE_MISMATCH ": tail of BAT qgram must be oid" ); |
911 | else if (id->ttype != TYPE_int) |
912 | msg = createException(MAL, "tstsim.qgramselfjoin" , |
913 | SEMANTIC_TYPE_MISMATCH ": tail of BAT id must be int" ); |
914 | else if (pos->ttype != TYPE_int) |
915 | msg = createException(MAL, "tstsim.qgramselfjoin" , |
916 | SEMANTIC_TYPE_MISMATCH ": tail of BAT pos must be int" ); |
917 | else if (len->ttype != TYPE_int) |
918 | msg = createException(MAL, "tstsim.qgramselfjoin" , |
919 | SEMANTIC_TYPE_MISMATCH ": tail of BAT len must be int" ); |
920 | if (msg) { |
921 | BBPunfix(qgram->batCacheid); |
922 | BBPunfix(id->batCacheid); |
923 | BBPunfix(pos->batCacheid); |
924 | BBPunfix(len->batCacheid); |
925 | return msg; |
926 | } |
927 | |
928 | n = BATcount(qgram); |
929 | qbuf = (oid *) Tloc(qgram, 0); |
930 | ibuf = (int *) Tloc(id, 0); |
931 | pbuf = (int *) Tloc(pos, 0); |
932 | lbuf = (int *) Tloc(len, 0); |
933 | |
934 | /* if (BATcount(qgram)>1 && !BATtordered(qgram)) throw(MAL, "tstsim.qgramselfjoin", SEMANTIC_TYPE_MISMATCH); */ |
935 | |
936 | if (!ALIGNsynced(qgram, id)) |
937 | msg = createException(MAL, "tstsim.qgramselfjoin" , |
938 | SEMANTIC_TYPE_MISMATCH ": qgram and id are not synced" ); |
939 | |
940 | else if (!ALIGNsynced(qgram, pos)) |
941 | msg = createException(MAL, "tstsim.qgramselfjoin" , |
942 | SEMANTIC_TYPE_MISMATCH ": qgram and pos are not synced" ); |
943 | else if (!ALIGNsynced(qgram, len)) |
944 | msg = createException(MAL, "tstsim.qgramselfjoin" , |
945 | SEMANTIC_TYPE_MISMATCH ": qgram and len are not synced" ); |
946 | |
947 | else if (Tsize(qgram) != ATOMsize(qgram->ttype)) |
948 | msg = createException(MAL, "tstsim.qgramselfjoin" , |
949 | SEMANTIC_TYPE_MISMATCH ": qgram is not a true void bat" ); |
950 | else if (Tsize(id) != ATOMsize(id->ttype)) |
951 | msg = createException(MAL, "tstsim.qgramselfjoin" , |
952 | SEMANTIC_TYPE_MISMATCH ": id is not a true void bat" ); |
953 | |
954 | else if (Tsize(pos) != ATOMsize(pos->ttype)) |
955 | msg = createException(MAL, "tstsim.qgramselfjoin" , |
956 | SEMANTIC_TYPE_MISMATCH ": pos is not a true void bat" ); |
957 | else if (Tsize(len) != ATOMsize(len->ttype)) |
958 | msg = createException(MAL, "tstsim.qgramselfjoin" , |
959 | SEMANTIC_TYPE_MISMATCH ": len is not a true void bat" ); |
960 | if (msg) { |
961 | BBPunfix(qgram->batCacheid); |
962 | BBPunfix(id->batCacheid); |
963 | BBPunfix(pos->batCacheid); |
964 | BBPunfix(len->batCacheid); |
965 | return msg; |
966 | } |
967 | |
968 | bn = COLnew(0, TYPE_int, n, TRANSIENT); |
969 | bn2 = COLnew(0, TYPE_int, n, TRANSIENT); |
970 | if (bn == NULL || bn2 == NULL){ |
971 | BBPreclaim(bn); |
972 | BBPreclaim(bn2); |
973 | BBPunfix(qgram->batCacheid); |
974 | BBPunfix(id->batCacheid); |
975 | BBPunfix(pos->batCacheid); |
976 | BBPunfix(len->batCacheid); |
977 | throw(MAL, "txtsim.qgramselfjoin" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
978 | } |
979 | |
980 | for (i = 0; i < n - 1; i++) { |
981 | for (j = i + 1; (j < n && qbuf[j] == qbuf[i] && pbuf[j] <= (pbuf[i] + (*k + *c * MYMIN(lbuf[i], lbuf[j])))); j++) { |
982 | if (ibuf[i] != ibuf[j] && abs(lbuf[i] - lbuf[j]) <= (*k + *c * MYMIN(lbuf[i], lbuf[j]))) { |
983 | if (BUNappend(bn, ibuf + i, false) != GDK_SUCCEED || |
984 | BUNappend(bn2, ibuf + j, false) != GDK_SUCCEED) { |
985 | BBPunfix(qgram->batCacheid); |
986 | BBPunfix(id->batCacheid); |
987 | BBPunfix(pos->batCacheid); |
988 | BBPunfix(len->batCacheid); |
989 | BBPreclaim(bn); |
990 | BBPreclaim(bn2); |
991 | throw(MAL, "txtsim.qgramselfjoin" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
992 | } |
993 | } |
994 | } |
995 | } |
996 | |
997 | BBPunfix(qgram->batCacheid); |
998 | BBPunfix(id->batCacheid); |
999 | BBPunfix(pos->batCacheid); |
1000 | BBPunfix(len->batCacheid); |
1001 | |
1002 | BBPkeepref(*res1 = bn->batCacheid); |
1003 | BBPkeepref(*res2 = bn2->batCacheid); |
1004 | |
1005 | return MAL_SUCCEED; |
1006 | } |
1007 | |
1008 | /* copy up to utf8len UTF-8 encoded characters from src to buf |
1009 | * stop early if buf (size given by bufsize) is too small, or if src runs out |
1010 | * return number of UTF-8 characters copied (excluding NUL) |
1011 | * close with NUL if enough space */ |
1012 | static size_t |
1013 | utf8strncpy(char *buf, size_t bufsize, const char *src, size_t utf8len) |
1014 | { |
1015 | size_t cnt = 0; |
1016 | |
1017 | while (utf8len != 0 && *src != 0 && bufsize != 0) { |
1018 | bufsize--; |
1019 | utf8len--; |
1020 | cnt++; |
1021 | if (((*buf++ = *src++) & 0x80) != 0) { |
1022 | while ((*src & 0xC0) == 0x80 && bufsize != 0) { |
1023 | *buf++ = *src++; |
1024 | bufsize--; |
1025 | } |
1026 | } |
1027 | } |
1028 | if (bufsize != 0) |
1029 | *buf = 0; |
1030 | return cnt; |
1031 | } |
1032 | |
1033 | str |
1034 | CMDstr2qgrams(bat *ret, str *val) |
1035 | { |
1036 | BAT *bn; |
1037 | size_t i, len = strlen(*val) + 5; |
1038 | str s = GDKmalloc(len); |
1039 | char qgram[4 * 6 + 1]; /* 4 UTF-8 code points plus NULL byte */ |
1040 | |
1041 | if (s == NULL) |
1042 | throw(MAL, "txtsim.str2qgram" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
1043 | strcpy(s, "##" ); |
1044 | strcpy(s + 2, *val); |
1045 | strcpy(s + len - 3, "$$" ); |
1046 | bn = COLnew(0, TYPE_str, (BUN) strlen(*val), TRANSIENT); |
1047 | if (bn == NULL) { |
1048 | GDKfree(s); |
1049 | throw(MAL, "txtsim.str2qgram" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
1050 | } |
1051 | |
1052 | i = 0; |
1053 | while (s[i]) { |
1054 | if (utf8strncpy(qgram, sizeof(qgram), s + i, 4) < 4) |
1055 | break; |
1056 | if (BUNappend(bn, qgram, false) != GDK_SUCCEED) { |
1057 | BBPreclaim(bn); |
1058 | GDKfree(s); |
1059 | throw(MAL, "txtsim.str2qgram" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
1060 | } |
1061 | if ((s[i++] & 0xC0) == 0xC0) { |
1062 | while ((s[i] & 0xC0) == 0x80) |
1063 | i++; |
1064 | } |
1065 | } |
1066 | BBPkeepref(*ret = bn->batCacheid); |
1067 | GDKfree(s); |
1068 | return MAL_SUCCEED; |
1069 | } |
1070 | |