| 1 | /* |
| 2 | * This Source Code Form is subject to the terms of the Mozilla Public |
| 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
| 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| 5 | * |
| 6 | * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V. |
| 7 | */ |
| 8 | |
| 9 | /* |
| 10 | * @f txtsim |
| 11 | * @t Module providing similarity metrics for strings |
| 12 | * @a Romulo Goncalves (from M4 to M5) |
| 13 | * @d 01/12/2007 |
| 14 | * @v 0.1 |
| 15 | * |
| 16 | * @+ String metrics |
| 17 | * |
| 18 | * Provides basic similarity metrics for strings. |
| 19 | * |
| 20 | */ |
| 21 | #include "monetdb_config.h" |
| 22 | #include "txtsim.h" |
| 23 | #include "mal_exception.h" |
| 24 | |
| 25 | |
| 26 | #define RETURN_NIL_IF(b,t) \ |
| 27 | if (b) {\ |
| 28 | if (ATOMextern(t)) {\ |
| 29 | *(ptr*) res = (ptr) ATOMnil(t);\ |
| 30 | if ( *(ptr *) res == NULL)\ |
| 31 | throw(MAL,"txtsim", SQLSTATE(HY001) MAL_MALLOC_FAIL);\ |
| 32 | } else {\ |
| 33 | memcpy(res, ATOMnilptr(t), ATOMsize(t));\ |
| 34 | }\ |
| 35 | return MAL_SUCCEED; \ |
| 36 | } |
| 37 | |
| 38 | /* ========================================================================= |
| 39 | * LEVENSH?TEIN FUNCTION |
| 40 | * Source: |
| 41 | * http://www.merriampark.com/ld.htm |
| 42 | * ========================================================================= |
| 43 | */ |
| 44 | |
| 45 | #define MYMIN(x,y) ( (x<y) ? x : y ) |
| 46 | #define SMALLEST_OF(x,y,z) ( MYMIN(MYMIN(x,y),z) ) |
| 47 | #define SMALLEST_OF4(x,y,z,z2) ( MYMIN(MYMIN(MYMIN(x,y),z),z2) ) |
| 48 | |
| 49 | /*************************************************** |
| 50 | * Get a pointer to the specified cell of the matrix |
| 51 | **************************************************/ |
| 52 | |
| 53 | static inline int * |
| 54 | levenshtein_GetCellPointer(int *pOrigin, int col, int row, int nCols) |
| 55 | { |
| 56 | return pOrigin + col + (row * (nCols + 1)); |
| 57 | } |
| 58 | |
| 59 | /****************************************************** |
| 60 | * Get the contents of the specified cell in the matrix |
| 61 | *****************************************************/ |
| 62 | |
| 63 | static inline int |
| 64 | levenshtein_GetAt(int *pOrigin, int col, int row, int nCols) |
| 65 | { |
| 66 | int *pCell; |
| 67 | |
| 68 | pCell = levenshtein_GetCellPointer(pOrigin, col, row, nCols); |
| 69 | return *pCell; |
| 70 | |
| 71 | } |
| 72 | |
| 73 | /******************************************************** |
| 74 | * Fill the specified cell in the matrix with the value x |
| 75 | *******************************************************/ |
| 76 | |
| 77 | static inline void |
| 78 | levenshtein_PutAt(int *pOrigin, int col, int row, int nCols, int x) |
| 79 | { |
| 80 | int *pCell; |
| 81 | |
| 82 | pCell = levenshtein_GetCellPointer(pOrigin, col, row, nCols); |
| 83 | *pCell = x; |
| 84 | } |
| 85 | |
| 86 | |
| 87 | /****************************** |
| 88 | * Compute Levenshtein distance |
| 89 | *****************************/ |
| 90 | str |
| 91 | levenshtein_impl(int *result, str *S, str *T, int *insdel_cost, int *replace_cost, int *transpose_cost) |
| 92 | { |
| 93 | char *s = *S; |
| 94 | char *t = *T; |
| 95 | int *d; /* pointer to matrix */ |
| 96 | int n; /* length of s */ |
| 97 | int m; /* length of t */ |
| 98 | int i; /* iterates through s */ |
| 99 | int j; /* iterates through t */ |
| 100 | char s_i; /* ith character of s */ |
| 101 | char t_j; /* jth character of t */ |
| 102 | int cost; /* cost */ |
| 103 | int cell; /* contents of target cell */ |
| 104 | int above; /* contents of cell immediately above */ |
| 105 | int left; /* contents of cell immediately to left */ |
| 106 | int diag; /* contents of cell immediately above and to left */ |
| 107 | int sz; /* number of cells in matrix */ |
| 108 | int diag2 = 0, cost2 = 0; |
| 109 | |
| 110 | /* Step 1 */ |
| 111 | n = (int) strlen(s); /* 64bit: assume strings are less than 2 GB */ |
| 112 | m = (int) strlen(t); |
| 113 | if (n == 0) { |
| 114 | *result = m; |
| 115 | return MAL_SUCCEED; |
| 116 | } |
| 117 | if (m == 0) { |
| 118 | *result = n; |
| 119 | return MAL_SUCCEED; |
| 120 | } |
| 121 | sz = (n + 1) * (m + 1) * sizeof(int); |
| 122 | d = (int *) GDKmalloc(sz); |
| 123 | if ( d == NULL) |
| 124 | throw(MAL,"levenshtein" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 125 | |
| 126 | /* Step 2 */ |
| 127 | for (i = 0; i <= n; i++) { |
| 128 | levenshtein_PutAt(d, i, 0, n, i); |
| 129 | } |
| 130 | |
| 131 | for (j = 0; j <= m; j++) { |
| 132 | levenshtein_PutAt(d, 0, j, n, j); |
| 133 | } |
| 134 | |
| 135 | /* Step 3 */ |
| 136 | for (i = 1; i <= n; i++) { |
| 137 | |
| 138 | s_i = s[i - 1]; |
| 139 | |
| 140 | /* Step 4 */ |
| 141 | for (j = 1; j <= m; j++) { |
| 142 | |
| 143 | t_j = t[j - 1]; |
| 144 | |
| 145 | /* Step 5 */ |
| 146 | if (s_i == t_j) { |
| 147 | cost = 0; |
| 148 | } else { |
| 149 | cost = *replace_cost; |
| 150 | } |
| 151 | |
| 152 | /* Step 6 */ |
| 153 | above = levenshtein_GetAt(d, i - 1, j, n); |
| 154 | left = levenshtein_GetAt(d, i, j - 1, n); |
| 155 | diag = levenshtein_GetAt(d, i - 1, j - 1, n); |
| 156 | |
| 157 | if (j >= 2 && i >= 2) { |
| 158 | /* NEW: detect transpositions */ |
| 159 | |
| 160 | diag2 = levenshtein_GetAt(d, i - 2, j - 2, n); |
| 161 | if (s[i - 2] == t[j - 1] && s[i - 1] == t[j - 2]) { |
| 162 | cost2 = *transpose_cost; |
| 163 | } else { |
| 164 | cost2 = 2; |
| 165 | } |
| 166 | cell = SMALLEST_OF4(above + *insdel_cost, left + *insdel_cost, diag + cost, diag2 + cost2); |
| 167 | } else { |
| 168 | cell = SMALLEST_OF(above + *insdel_cost, left + *insdel_cost, diag + cost); |
| 169 | } |
| 170 | levenshtein_PutAt(d, i, j, n, cell); |
| 171 | } |
| 172 | } |
| 173 | |
| 174 | /* Step 7 */ |
| 175 | *result = levenshtein_GetAt(d, n, m, n); |
| 176 | GDKfree(d); |
| 177 | return MAL_SUCCEED; |
| 178 | } |
| 179 | |
| 180 | str |
| 181 | levenshteinbasic_impl(int *result, str *s, str *t) |
| 182 | { |
| 183 | int insdel = 1, replace = 1, transpose = 2; |
| 184 | |
| 185 | return levenshtein_impl(result, s, t, &insdel, &replace, &transpose); |
| 186 | } |
| 187 | |
| 188 | str |
| 189 | levenshteinbasic2_impl(int *result, str *s, str *t) |
| 190 | { |
| 191 | int insdel = 1, replace = 1, transpose = 1; |
| 192 | |
| 193 | return levenshtein_impl(result, s, t, &insdel, &replace, &transpose); |
| 194 | } |
| 195 | |
| 196 | |
| 197 | /* ========================================================================= |
| 198 | * SOUNDEX FUNCTION |
| 199 | * Source: |
| 200 | * http://www.mit.edu/afs/sipb/service/rtfm/src/freeWAIS-sf-1.0/ir/soundex.c |
| 201 | * ========================================================================= |
| 202 | */ |
| 203 | |
| 204 | #define SoundexLen 4 /* length of a soundex code */ |
| 205 | #define SoundexKey "Z000" /* default key for soundex code */ |
| 206 | |
| 207 | /* set letter values */ |
| 208 | static int Code[] = { 0, 1, 2, 3, 0, 1, 2, 0, 0, 2, 2, 4, 5, 5, 0, |
| 209 | 1, 2, 6, 2, 3, 0, 1, 0, 2, 0, 2 |
| 210 | }; |
| 211 | |
| 212 | static inline char |
| 213 | SCode(unsigned char c) |
| 214 | { |
| 215 | if (c == 95) |
| 216 | return (2); /* german sz */ |
| 217 | return (Code[toupper(c) - 'A']); |
| 218 | } |
| 219 | |
| 220 | static void |
| 221 | soundex_code(char *Name, char *Key) |
| 222 | { |
| 223 | char LastLetter; |
| 224 | int Index; |
| 225 | |
| 226 | /* set default key */ |
| 227 | strcpy(Key, SoundexKey); |
| 228 | |
| 229 | /* keep first letter */ |
| 230 | Key[0] = *Name; |
| 231 | if (!isupper((unsigned char) (Key[0]))) |
| 232 | Key[0] = toupper(Key[0]); |
| 233 | |
| 234 | LastLetter = *Name; |
| 235 | if (!*Name) |
| 236 | return; |
| 237 | Name++; |
| 238 | |
| 239 | /* scan rest of string */ |
| 240 | for (Index = 1; (Index <SoundexLen) &&*Name; Name++) { |
| 241 | /* use only letters */ |
| 242 | if (isalpha((unsigned char) (*Name))) { |
| 243 | /* ignore duplicate successive chars */ |
| 244 | if (LastLetter != *Name) { |
| 245 | /* new LastLetter */ |
| 246 | LastLetter = *Name; |
| 247 | |
| 248 | /* ignore letters with code 0 */ |
| 249 | if (SCode(*Name) != 0) { |
| 250 | Key[Index] = '0' + SCode(*Name); |
| 251 | Index ++; |
| 252 | } |
| 253 | } |
| 254 | } |
| 255 | } |
| 256 | } |
| 257 | |
| 258 | |
| 259 | str |
| 260 | soundex_impl(str *res, str *Name) |
| 261 | { |
| 262 | RETURN_NIL_IF(strNil(*Name), TYPE_str); |
| 263 | |
| 264 | *res = (str) GDKmalloc(sizeof(char) * (SoundexLen + 1)); |
| 265 | if( *res == NULL) |
| 266 | throw(MAL,"soundex" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 267 | |
| 268 | /* calculate Key for Name */ |
| 269 | soundex_code(*Name, *res); |
| 270 | |
| 271 | return MAL_SUCCEED; |
| 272 | } |
| 273 | |
| 274 | str |
| 275 | stringdiff_impl(int *res, str *s1, str *s2) |
| 276 | { |
| 277 | str r = MAL_SUCCEED; |
| 278 | char *S1 = NULL, *S2 = NULL; |
| 279 | |
| 280 | r = soundex_impl(&S1, s1); |
| 281 | if( r != MAL_SUCCEED) |
| 282 | return r; |
| 283 | r = soundex_impl(&S2, s2); |
| 284 | if( r != MAL_SUCCEED){ |
| 285 | GDKfree(S1); |
| 286 | return r; |
| 287 | } |
| 288 | r = levenshteinbasic_impl(res, &S1, &S2); |
| 289 | GDKfree(S1); |
| 290 | GDKfree(S2); |
| 291 | return r; |
| 292 | } |
| 293 | |
| 294 | /****************************** |
| 295 | * QGRAMNORMALIZE |
| 296 | * |
| 297 | * This function 'normalizes' a string so valid q-grams can be made of it: |
| 298 | * All characters are transformed to uppercase, and all characters |
| 299 | * which are not letters or digits are stripped to a single space. |
| 300 | * |
| 301 | * qgramnormalize("Hallo, allemaal!").print(); --> "HALLO ALLEMAAL" |
| 302 | * qgramnormalize(" '' t ' est").print(); --> [ "T EST" ] |
| 303 | * |
| 304 | *****************************/ |
| 305 | str |
| 306 | CMDqgramnormalize(str *res, str *Input) |
| 307 | { |
| 308 | char *input = *Input; |
| 309 | int i, j = 0; |
| 310 | char c, last = ' '; |
| 311 | |
| 312 | RETURN_NIL_IF(strNil(input), TYPE_str); |
| 313 | *res = (str) GDKmalloc(sizeof(char) * (strlen(input) + 1)); /* normalized strings are never longer than original */ |
| 314 | if (*res == NULL) |
| 315 | throw(MAL, "txtsim.qgramnormalize" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 316 | |
| 317 | for (i = 0; input[i]; i++) { |
| 318 | c = toupper(input[i]); |
| 319 | if (!(('A' <= c && c <= 'Z') || isdigit((unsigned char) c))) |
| 320 | c = ' '; |
| 321 | if (c != ' ' || last != ' ') { |
| 322 | (*res)[j++] = c; |
| 323 | } |
| 324 | last = c; |
| 325 | } |
| 326 | (*res)[j] = 0; |
| 327 | /* strip final whitespace */ |
| 328 | while (j > 0 && (*res)[--j] == ' ') |
| 329 | (*res)[j] = 0; |
| 330 | |
| 331 | return MAL_SUCCEED; |
| 332 | } |
| 333 | |
| 334 | /* ========================================================================= |
| 335 | * FSTRCMP FUNCTION |
| 336 | * Source: |
| 337 | * http://search.cpan.org/src/MLEHMANN/String-Similarity-1/fstrcmp.c |
| 338 | * ========================================================================= |
| 339 | */ |
| 340 | |
| 341 | #define PARAMS(proto) proto |
| 342 | |
| 343 | /* |
| 344 | * Data on one input string being compared. |
| 345 | */ |
| 346 | struct string_data { |
| 347 | /* The string to be compared. */ |
| 348 | const char *data; |
| 349 | |
| 350 | /* The length of the string to be compared. */ |
| 351 | int data_length; |
| 352 | |
| 353 | /* The number of characters inserted or deleted. */ |
| 354 | int edit_count; |
| 355 | }; |
| 356 | |
| 357 | static struct string_data string[2]; |
| 358 | |
| 359 | static int max_edits; /* compareseq stops when edits > max_edits */ |
| 360 | |
| 361 | #ifdef MINUS_H_FLAG |
| 362 | |
| 363 | /* This corresponds to the diff -H flag. With this heuristic, for |
| 364 | strings with a constant small density of changes, the algorithm is |
| 365 | linear in the strings size. This is unlikely in typical uses of |
| 366 | fstrcmp, and so is usually compiled out. Besides, there is no |
| 367 | interface to set it true. */ |
| 368 | static int heuristic; |
| 369 | |
| 370 | #endif |
| 371 | |
| 372 | |
| 373 | /* Vector, indexed by diagonal, containing 1 + the X coordinate of the |
| 374 | point furthest along the given diagonal in the forward search of the |
| 375 | edit matrix. */ |
| 376 | static int *fdiag; |
| 377 | |
| 378 | /* Vector, indexed by diagonal, containing the X coordinate of the point |
| 379 | furthest along the given diagonal in the backward search of the edit |
| 380 | matrix. */ |
| 381 | static int *bdiag; |
| 382 | |
| 383 | /* Edit scripts longer than this are too expensive to compute. */ |
| 384 | static int too_expensive; |
| 385 | |
| 386 | /* Snakes bigger than this are considered `big'. */ |
| 387 | #define SNAKE_LIMIT 20 |
| 388 | |
| 389 | struct partition { |
| 390 | /* Midpoints of this partition. */ |
| 391 | int xmid, ymid; |
| 392 | |
| 393 | /* Nonzero if low half will be analyzed minimally. */ |
| 394 | int lo_minimal; |
| 395 | |
| 396 | /* Likewise for high half. */ |
| 397 | int hi_minimal; |
| 398 | }; |
| 399 | |
| 400 | |
| 401 | /* NAME |
| 402 | diag - find diagonal path |
| 403 | |
| 404 | SYNOPSIS |
| 405 | int diag(int xoff, int xlim, int yoff, int ylim, int minimal, |
| 406 | struct partition *part); |
| 407 | |
| 408 | DESCRIPTION |
| 409 | Find the midpoint of the shortest edit script for a specified |
| 410 | portion of the two strings. |
| 411 | |
| 412 | Scan from the beginnings of the strings, and simultaneously from |
| 413 | the ends, doing a breadth-first search through the space of |
| 414 | edit-sequence. When the two searches meet, we have found the |
| 415 | midpoint of the shortest edit sequence. |
| 416 | |
| 417 | If MINIMAL is nonzero, find the minimal edit script regardless |
| 418 | of expense. Otherwise, if the search is too expensive, use |
| 419 | heuristics to stop the search and report a suboptimal answer. |
| 420 | |
| 421 | RETURNS |
| 422 | Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal |
| 423 | number XMID - YMID equals the number of inserted characters |
| 424 | minus the number of deleted characters (counting only characters |
| 425 | before the midpoint). Return the approximate edit cost; this is |
| 426 | the total number of characters inserted or deleted (counting |
| 427 | only characters before the midpoint), unless a heuristic is used |
| 428 | to terminate the search prematurely. |
| 429 | |
| 430 | Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script |
| 431 | for the left half of the partition is known; similarly for |
| 432 | PART->RIGHT_MINIMAL. |
| 433 | |
| 434 | CAVEAT |
| 435 | This function assumes that the first characters of the specified |
| 436 | portions of the two strings do not match, and likewise that the |
| 437 | last characters do not match. The caller must trim matching |
| 438 | characters from the beginning and end of the portions it is |
| 439 | going to specify. |
| 440 | |
| 441 | If we return the "wrong" partitions, the worst this can do is |
| 442 | cause suboptimal diff output. It cannot cause incorrect diff |
| 443 | output. */ |
| 444 | |
| 445 | static int diag PARAMS((int, int, int, int, int, struct partition *)); |
| 446 | |
| 447 | static int |
| 448 | diag(int xoff, int xlim, int yoff, int ylim, int minimal, struct partition *part) |
| 449 | { |
| 450 | int *const fd = fdiag; /* Give the compiler a chance. */ |
| 451 | int *const bd = bdiag; /* Additional help for the compiler. */ |
| 452 | const char *const xv = string[0].data; /* Still more help for the compiler. */ |
| 453 | const char *const yv = string[1].data; /* And more and more . . . */ |
| 454 | const int dmin = xoff - ylim; /* Minimum valid diagonal. */ |
| 455 | const int dmax = xlim - yoff; /* Maximum valid diagonal. */ |
| 456 | const int fmid = xoff - yoff; /* Center diagonal of top-down search. */ |
| 457 | const int bmid = xlim - ylim; /* Center diagonal of bottom-up search. */ |
| 458 | int fmin = fmid; |
| 459 | int fmax = fmid; /* Limits of top-down search. */ |
| 460 | int bmin = bmid; |
| 461 | int bmax = bmid; /* Limits of bottom-up search. */ |
| 462 | int c; /* Cost. */ |
| 463 | int odd = (fmid - bmid) & 1; |
| 464 | |
| 465 | /* |
| 466 | * True if southeast corner is on an odd diagonal with respect |
| 467 | * to the northwest. |
| 468 | */ |
| 469 | fd[fmid] = xoff; |
| 470 | bd[bmid] = xlim; |
| 471 | for (c = 1;; ++c) { |
| 472 | int d; /* Active diagonal. */ |
| 473 | int big_snake; |
| 474 | |
| 475 | big_snake = 0; |
| 476 | /* Extend the top-down search by an edit step in each diagonal. */ |
| 477 | if (fmin > dmin) |
| 478 | fd[--fmin - 1] = -1; |
| 479 | else |
| 480 | ++fmin; |
| 481 | if (fmax < dmax) |
| 482 | fd[++fmax + 1] = -1; |
| 483 | else |
| 484 | --fmax; |
| 485 | for (d = fmax; d >= fmin; d -= 2) { |
| 486 | int x; |
| 487 | int y; |
| 488 | int oldx; |
| 489 | int tlo; |
| 490 | int thi; |
| 491 | |
| 492 | tlo = fd[d - 1], thi = fd[d + 1]; |
| 493 | |
| 494 | if (tlo >= thi) |
| 495 | x = tlo + 1; |
| 496 | else |
| 497 | x = thi; |
| 498 | oldx = x; |
| 499 | y = x - d; |
| 500 | while (x < xlim && y < ylim && xv[x] == yv[y]) { |
| 501 | ++x; |
| 502 | ++y; |
| 503 | } |
| 504 | if (x - oldx > SNAKE_LIMIT) |
| 505 | big_snake = 1; |
| 506 | fd[d] = x; |
| 507 | if (odd && bmin <= d && d <= bmax && bd[d] <= x) { |
| 508 | part->xmid = x; |
| 509 | part->ymid = y; |
| 510 | part->lo_minimal = part->hi_minimal = 1; |
| 511 | return 2 * c - 1; |
| 512 | } |
| 513 | } |
| 514 | /* Similarly extend the bottom-up search. */ |
| 515 | if (bmin > dmin) |
| 516 | bd[--bmin - 1] = INT_MAX; |
| 517 | else |
| 518 | ++bmin; |
| 519 | if (bmax < dmax) |
| 520 | bd[++bmax + 1] = INT_MAX; |
| 521 | else |
| 522 | --bmax; |
| 523 | for (d = bmax; d >= bmin; d -= 2) { |
| 524 | int x; |
| 525 | int y; |
| 526 | int oldx; |
| 527 | int tlo; |
| 528 | int thi; |
| 529 | |
| 530 | tlo = bd[d - 1], thi = bd[d + 1]; |
| 531 | if (tlo < thi) |
| 532 | x = tlo; |
| 533 | else |
| 534 | x = thi - 1; |
| 535 | oldx = x; |
| 536 | y = x - d; |
| 537 | while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) { |
| 538 | --x; |
| 539 | --y; |
| 540 | } |
| 541 | if (oldx - x > SNAKE_LIMIT) |
| 542 | big_snake = 1; |
| 543 | bd[d] = x; |
| 544 | if (!odd && fmin <= d && d <= fmax && x <= fd[d]) { |
| 545 | part->xmid = x; |
| 546 | part->ymid = y; |
| 547 | part->lo_minimal = part->hi_minimal = 1; |
| 548 | return 2 * c; |
| 549 | } |
| 550 | } |
| 551 | |
| 552 | if (minimal) |
| 553 | continue; |
| 554 | |
| 555 | #ifdef MINUS_H_FLAG |
| 556 | /* Heuristic: check occasionally for a diagonal that has made lots |
| 557 | of progress compared with the edit distance. If we have any |
| 558 | such, find the one that has made the most progress and return |
| 559 | it as if it had succeeded. |
| 560 | |
| 561 | With this heuristic, for strings with a constant small density |
| 562 | of changes, the algorithm is linear in the strings size. */ |
| 563 | if (c > 200 && big_snake && heuristic) { |
| 564 | int best; |
| 565 | |
| 566 | best = 0; |
| 567 | for (d = fmax; d >= fmin; d -= 2) { |
| 568 | int dd; |
| 569 | int x; |
| 570 | int y; |
| 571 | int v; |
| 572 | |
| 573 | dd = d - fmid; |
| 574 | x = fd[d]; |
| 575 | y = x - d; |
| 576 | v = (x - xoff) * 2 - dd; |
| 577 | |
| 578 | if (v > 12 * (c + (dd < 0 ? -dd : dd))) { |
| 579 | if (v > best && xoff + SNAKE_LIMIT <= x && x < xlim && yoff + SNAKE_LIMIT <= y && y < ylim) { |
| 580 | /* We have a good enough best diagonal; now insist |
| 581 | that it end with a significant snake. */ |
| 582 | int k; |
| 583 | |
| 584 | for (k = 1; xv[x - k] == yv[y - k]; k++) { |
| 585 | if (k == SNAKE_LIMIT) { |
| 586 | best = v; |
| 587 | part->xmid = x; |
| 588 | part->ymid = y; |
| 589 | break; |
| 590 | } |
| 591 | } |
| 592 | } |
| 593 | } |
| 594 | } |
| 595 | if (best > 0) { |
| 596 | part->lo_minimal = 1; |
| 597 | part->hi_minimal = 0; |
| 598 | return 2 * c - 1; |
| 599 | } |
| 600 | best = 0; |
| 601 | for (d = bmax; d >= bmin; d -= 2) { |
| 602 | int dd; |
| 603 | int x; |
| 604 | int y; |
| 605 | int v; |
| 606 | |
| 607 | dd = d - bmid; |
| 608 | x = bd[d]; |
| 609 | y = x - d; |
| 610 | v = (xlim - x) * 2 + dd; |
| 611 | |
| 612 | if (v > 12 * (c + (dd < 0 ? -dd : dd))) { |
| 613 | if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT && yoff < y && y <= ylim - SNAKE_LIMIT) { |
| 614 | /* We have a good enough best diagonal; now insist |
| 615 | that it end with a significant snake. */ |
| 616 | int k; |
| 617 | |
| 618 | for (k = 0; xv[x + k] == yv[y + k]; k++) { |
| 619 | if (k == SNAKE_LIMIT - 1) { |
| 620 | best = v; |
| 621 | part->xmid = x; |
| 622 | part->ymid = y; |
| 623 | break; |
| 624 | } |
| 625 | } |
| 626 | } |
| 627 | } |
| 628 | } |
| 629 | if (best > 0) { |
| 630 | part->lo_minimal = 0; |
| 631 | part->hi_minimal = 1; |
| 632 | return 2 * c - 1; |
| 633 | } |
| 634 | } |
| 635 | #else |
| 636 | (void) big_snake; |
| 637 | #endif /* MINUS_H_FLAG */ |
| 638 | |
| 639 | /* Heuristic: if we've gone well beyond the call of duty, give up |
| 640 | and report halfway between our best results so far. */ |
| 641 | if (c >= too_expensive) { |
| 642 | int fxybest; |
| 643 | int fxbest; |
| 644 | int bxybest; |
| 645 | int bxbest; |
| 646 | |
| 647 | /* Pacify `gcc -Wall'. */ |
| 648 | fxbest = 0; |
| 649 | bxbest = 0; |
| 650 | |
| 651 | /* Find forward diagonal that maximizes X + Y. */ |
| 652 | fxybest = -1; |
| 653 | for (d = fmax; d >= fmin; d -= 2) { |
| 654 | int x; |
| 655 | int y; |
| 656 | |
| 657 | x = fd[d] < xlim ? fd[d] : xlim; |
| 658 | y = x - d; |
| 659 | |
| 660 | if (ylim < y) { |
| 661 | x = ylim + d; |
| 662 | y = ylim; |
| 663 | } |
| 664 | if (fxybest < x + y) { |
| 665 | fxybest = x + y; |
| 666 | fxbest = x; |
| 667 | } |
| 668 | } |
| 669 | /* Find backward diagonal that minimizes X + Y. */ |
| 670 | bxybest = INT_MAX; |
| 671 | for (d = bmax; d >= bmin; d -= 2) { |
| 672 | int x; |
| 673 | int y; |
| 674 | |
| 675 | x = xoff > bd[d] ? xoff : bd[d]; |
| 676 | y = x - d; |
| 677 | |
| 678 | if (y < yoff) { |
| 679 | x = yoff + d; |
| 680 | y = yoff; |
| 681 | } |
| 682 | if (x + y < bxybest) { |
| 683 | bxybest = x + y; |
| 684 | bxbest = x; |
| 685 | } |
| 686 | } |
| 687 | /* Use the better of the two diagonals. */ |
| 688 | if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) { |
| 689 | part->xmid = fxbest; |
| 690 | part->ymid = fxybest - fxbest; |
| 691 | part->lo_minimal = 1; |
| 692 | part->hi_minimal = 0; |
| 693 | } else { |
| 694 | part->xmid = bxbest; |
| 695 | part->ymid = bxybest - bxbest; |
| 696 | part->lo_minimal = 0; |
| 697 | part->hi_minimal = 1; |
| 698 | } |
| 699 | return 2 * c - 1; |
| 700 | } |
| 701 | } |
| 702 | } |
| 703 | |
| 704 | |
| 705 | /* NAME |
| 706 | compareseq - find edit sequence |
| 707 | |
| 708 | SYNOPSIS |
| 709 | void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal); |
| 710 | |
| 711 | DESCRIPTION |
| 712 | Compare in detail contiguous subsequences of the two strings |
| 713 | which are known, as a whole, to match each other. |
| 714 | |
| 715 | The subsequence of string 0 is [XOFF, XLIM) and likewise for |
| 716 | string 1. |
| 717 | |
| 718 | Note that XLIM, YLIM are exclusive bounds. All character |
| 719 | numbers are origin-0. |
| 720 | |
| 721 | If MINIMAL is nonzero, find a minimal difference no matter how |
| 722 | expensive it is. */ |
| 723 | |
| 724 | static void compareseq PARAMS((int, int, int, int, int)); |
| 725 | |
| 726 | static void |
| 727 | compareseq(int xoff, int xlim, int yoff, int ylim, int minimal) |
| 728 | { |
| 729 | const char *const xv = string[0].data; /* Help the compiler. */ |
| 730 | const char *const yv = string[1].data; |
| 731 | |
| 732 | if (string[1].edit_count + string[0].edit_count > max_edits) |
| 733 | return; |
| 734 | |
| 735 | /* Slide down the bottom initial diagonal. */ |
| 736 | while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff]) { |
| 737 | ++xoff; |
| 738 | ++yoff; |
| 739 | } |
| 740 | |
| 741 | /* Slide up the top initial diagonal. */ |
| 742 | while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1]) { |
| 743 | --xlim; |
| 744 | --ylim; |
| 745 | } |
| 746 | |
| 747 | /* Handle simple cases. */ |
| 748 | if (xoff == xlim) { |
| 749 | while (yoff < ylim) { |
| 750 | ++string[1].edit_count; |
| 751 | ++yoff; |
| 752 | } |
| 753 | } else if (yoff == ylim) { |
| 754 | while (xoff < xlim) { |
| 755 | ++string[0].edit_count; |
| 756 | ++xoff; |
| 757 | } |
| 758 | } else { |
| 759 | int c; |
| 760 | struct partition part; |
| 761 | |
| 762 | /* Find a point of correspondence in the middle of the strings. */ |
| 763 | c = diag(xoff, xlim, yoff, ylim, minimal, &part); |
| 764 | if (c == 1) { |
| 765 | #if 0 |
| 766 | /* This should be impossible, because it implies that one of |
| 767 | the two subsequences is empty, and that case was handled |
| 768 | above without calling `diag'. Let's verify that this is |
| 769 | true. */ |
| 770 | abort(); |
| 771 | #else |
| 772 | /* The two subsequences differ by a single insert or delete; |
| 773 | record it and we are done. */ |
| 774 | if (part.xmid - part.ymid < xoff - yoff) |
| 775 | ++string[1].edit_count; |
| 776 | else |
| 777 | ++string[0].edit_count; |
| 778 | #endif |
| 779 | } else { |
| 780 | /* Use the partitions to split this problem into subproblems. */ |
| 781 | compareseq(xoff, part.xmid, yoff, part.ymid, part.lo_minimal); |
| 782 | compareseq(part.xmid, xlim, part.ymid, ylim, part.hi_minimal); |
| 783 | } |
| 784 | } |
| 785 | } |
| 786 | |
| 787 | /* NAME |
| 788 | fstrcmp - fuzzy string compare |
| 789 | |
| 790 | SYNOPSIS |
| 791 | double fstrcmp(const char *s1, int l1, const char *s2, int l2, double); |
| 792 | |
| 793 | DESCRIPTION |
| 794 | The fstrcmp function may be used to compare two string for |
| 795 | similarity. It is very useful in reducing "cascade" or |
| 796 | "secondary" errors in compilers or other situations where |
| 797 | symbol tables occur. |
| 798 | |
| 799 | RETURNS |
| 800 | double; 0 if the strings are entirly dissimilar, 1 if the |
| 801 | strings are identical, and a number in between if they are |
| 802 | similar. */ |
| 803 | |
| 804 | str |
| 805 | fstrcmp_impl(dbl *ret, str *S1, str *S2, dbl *minimum) |
| 806 | { |
| 807 | char *string1 = *S1; |
| 808 | char *string2 = *S2; |
| 809 | int i; |
| 810 | |
| 811 | size_t fdiag_len; |
| 812 | static int *fdiag_buf; |
| 813 | static size_t fdiag_max; |
| 814 | |
| 815 | /* set the info for each string. */ |
| 816 | string[0].data = string1; |
| 817 | string[0].data_length = (int) strlen(string1); /* 64bit: assume string not too long */ |
| 818 | string[1].data = string2; |
| 819 | string[1].data_length = (int) strlen(string2); /* 64bit: assume string not too long */ |
| 820 | |
| 821 | /* short-circuit obvious comparisons */ |
| 822 | if (string[0].data_length == 0 && string[1].data_length == 0) { |
| 823 | *ret = 1.0; |
| 824 | return MAL_SUCCEED; |
| 825 | } |
| 826 | if (string[0].data_length == 0 || string[1].data_length == 0) { |
| 827 | *ret = 0.0; |
| 828 | return MAL_SUCCEED; |
| 829 | } |
| 830 | |
| 831 | /* Set TOO_EXPENSIVE to be approximate square root of input size, |
| 832 | bounded below by 256. */ |
| 833 | too_expensive = 1; |
| 834 | for (i = string[0].data_length + string[1].data_length; i != 0; i >>= 2) |
| 835 | too_expensive <<= 1; |
| 836 | if (too_expensive < 256) |
| 837 | too_expensive = 256; |
| 838 | |
| 839 | /* Because fstrcmp is typically called multiple times, while scanning |
| 840 | symbol tables, etc, attempt to minimize the number of memory |
| 841 | allocations performed. Thus, we use a static buffer for the |
| 842 | diagonal vectors, and never free them. */ |
| 843 | fdiag_len = string[0].data_length + string[1].data_length + 3; |
| 844 | if (fdiag_len > fdiag_max) { |
| 845 | fdiag_max = fdiag_len; |
| 846 | fdiag_buf = realloc(fdiag_buf, fdiag_max * (2 * sizeof(int))); |
| 847 | } |
| 848 | fdiag = fdiag_buf + string[1].data_length + 1; |
| 849 | bdiag = fdiag + fdiag_len; |
| 850 | |
| 851 | max_edits = 1 + (int) ((string[0].data_length + string[1].data_length) * (1. - *minimum)); |
| 852 | |
| 853 | /* Now do the main comparison algorithm */ |
| 854 | string[0].edit_count = 0; |
| 855 | string[1].edit_count = 0; |
| 856 | compareseq(0, string[0].data_length, 0, string[1].data_length, 0); |
| 857 | |
| 858 | /* The result is |
| 859 | ((number of chars in common) / (average length of the strings)). |
| 860 | This is admittedly biased towards finding that the strings are |
| 861 | similar, however it does produce meaningful results. */ |
| 862 | *ret = ((double) |
| 863 | (string[0].data_length + string[1].data_length - string[1].edit_count - string[0].edit_count) |
| 864 | / (string[0].data_length + string[1].data_length)); |
| 865 | return MAL_SUCCEED; |
| 866 | } |
| 867 | |
| 868 | str |
| 869 | fstrcmp0_impl(dbl *ret, str *string1, str *string2) |
| 870 | { |
| 871 | double min = 0.0; |
| 872 | |
| 873 | return fstrcmp_impl(ret, string1, string2, &min); |
| 874 | } |
| 875 | |
| 876 | |
| 877 | /* ============ Q-GRAM SELF JOIN ============== */ |
| 878 | |
| 879 | str |
| 880 | CMDqgramselfjoin(bat *res1, bat *res2, bat *qid, bat *bid, bat *pid, bat *lid, flt *c, int *k) |
| 881 | { |
| 882 | BAT *qgram, *id, *pos, *len; |
| 883 | BUN n; |
| 884 | BUN i, j; |
| 885 | BAT *bn, *bn2; |
| 886 | oid *qbuf; |
| 887 | int *ibuf; |
| 888 | int *pbuf; |
| 889 | int *lbuf; |
| 890 | str msg = MAL_SUCCEED; |
| 891 | |
| 892 | qgram = BATdescriptor(*qid); |
| 893 | id = BATdescriptor(*bid); |
| 894 | pos = BATdescriptor(*pid); |
| 895 | len = BATdescriptor(*lid); |
| 896 | if (qgram == NULL || id == NULL || pos == NULL || len == NULL) { |
| 897 | if (qgram) |
| 898 | BBPunfix(qgram->batCacheid); |
| 899 | if (id) |
| 900 | BBPunfix(id->batCacheid); |
| 901 | if (pos) |
| 902 | BBPunfix(pos->batCacheid); |
| 903 | if (len) |
| 904 | BBPunfix(len->batCacheid); |
| 905 | throw(MAL, "txtsim.qgramselfjoin" , SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); |
| 906 | } |
| 907 | |
| 908 | if (qgram->ttype != TYPE_oid) |
| 909 | msg = createException(MAL, "tstsim.qgramselfjoin" , |
| 910 | SEMANTIC_TYPE_MISMATCH ": tail of BAT qgram must be oid" ); |
| 911 | else if (id->ttype != TYPE_int) |
| 912 | msg = createException(MAL, "tstsim.qgramselfjoin" , |
| 913 | SEMANTIC_TYPE_MISMATCH ": tail of BAT id must be int" ); |
| 914 | else if (pos->ttype != TYPE_int) |
| 915 | msg = createException(MAL, "tstsim.qgramselfjoin" , |
| 916 | SEMANTIC_TYPE_MISMATCH ": tail of BAT pos must be int" ); |
| 917 | else if (len->ttype != TYPE_int) |
| 918 | msg = createException(MAL, "tstsim.qgramselfjoin" , |
| 919 | SEMANTIC_TYPE_MISMATCH ": tail of BAT len must be int" ); |
| 920 | if (msg) { |
| 921 | BBPunfix(qgram->batCacheid); |
| 922 | BBPunfix(id->batCacheid); |
| 923 | BBPunfix(pos->batCacheid); |
| 924 | BBPunfix(len->batCacheid); |
| 925 | return msg; |
| 926 | } |
| 927 | |
| 928 | n = BATcount(qgram); |
| 929 | qbuf = (oid *) Tloc(qgram, 0); |
| 930 | ibuf = (int *) Tloc(id, 0); |
| 931 | pbuf = (int *) Tloc(pos, 0); |
| 932 | lbuf = (int *) Tloc(len, 0); |
| 933 | |
| 934 | /* if (BATcount(qgram)>1 && !BATtordered(qgram)) throw(MAL, "tstsim.qgramselfjoin", SEMANTIC_TYPE_MISMATCH); */ |
| 935 | |
| 936 | if (!ALIGNsynced(qgram, id)) |
| 937 | msg = createException(MAL, "tstsim.qgramselfjoin" , |
| 938 | SEMANTIC_TYPE_MISMATCH ": qgram and id are not synced" ); |
| 939 | |
| 940 | else if (!ALIGNsynced(qgram, pos)) |
| 941 | msg = createException(MAL, "tstsim.qgramselfjoin" , |
| 942 | SEMANTIC_TYPE_MISMATCH ": qgram and pos are not synced" ); |
| 943 | else if (!ALIGNsynced(qgram, len)) |
| 944 | msg = createException(MAL, "tstsim.qgramselfjoin" , |
| 945 | SEMANTIC_TYPE_MISMATCH ": qgram and len are not synced" ); |
| 946 | |
| 947 | else if (Tsize(qgram) != ATOMsize(qgram->ttype)) |
| 948 | msg = createException(MAL, "tstsim.qgramselfjoin" , |
| 949 | SEMANTIC_TYPE_MISMATCH ": qgram is not a true void bat" ); |
| 950 | else if (Tsize(id) != ATOMsize(id->ttype)) |
| 951 | msg = createException(MAL, "tstsim.qgramselfjoin" , |
| 952 | SEMANTIC_TYPE_MISMATCH ": id is not a true void bat" ); |
| 953 | |
| 954 | else if (Tsize(pos) != ATOMsize(pos->ttype)) |
| 955 | msg = createException(MAL, "tstsim.qgramselfjoin" , |
| 956 | SEMANTIC_TYPE_MISMATCH ": pos is not a true void bat" ); |
| 957 | else if (Tsize(len) != ATOMsize(len->ttype)) |
| 958 | msg = createException(MAL, "tstsim.qgramselfjoin" , |
| 959 | SEMANTIC_TYPE_MISMATCH ": len is not a true void bat" ); |
| 960 | if (msg) { |
| 961 | BBPunfix(qgram->batCacheid); |
| 962 | BBPunfix(id->batCacheid); |
| 963 | BBPunfix(pos->batCacheid); |
| 964 | BBPunfix(len->batCacheid); |
| 965 | return msg; |
| 966 | } |
| 967 | |
| 968 | bn = COLnew(0, TYPE_int, n, TRANSIENT); |
| 969 | bn2 = COLnew(0, TYPE_int, n, TRANSIENT); |
| 970 | if (bn == NULL || bn2 == NULL){ |
| 971 | BBPreclaim(bn); |
| 972 | BBPreclaim(bn2); |
| 973 | BBPunfix(qgram->batCacheid); |
| 974 | BBPunfix(id->batCacheid); |
| 975 | BBPunfix(pos->batCacheid); |
| 976 | BBPunfix(len->batCacheid); |
| 977 | throw(MAL, "txtsim.qgramselfjoin" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 978 | } |
| 979 | |
| 980 | for (i = 0; i < n - 1; i++) { |
| 981 | for (j = i + 1; (j < n && qbuf[j] == qbuf[i] && pbuf[j] <= (pbuf[i] + (*k + *c * MYMIN(lbuf[i], lbuf[j])))); j++) { |
| 982 | if (ibuf[i] != ibuf[j] && abs(lbuf[i] - lbuf[j]) <= (*k + *c * MYMIN(lbuf[i], lbuf[j]))) { |
| 983 | if (BUNappend(bn, ibuf + i, false) != GDK_SUCCEED || |
| 984 | BUNappend(bn2, ibuf + j, false) != GDK_SUCCEED) { |
| 985 | BBPunfix(qgram->batCacheid); |
| 986 | BBPunfix(id->batCacheid); |
| 987 | BBPunfix(pos->batCacheid); |
| 988 | BBPunfix(len->batCacheid); |
| 989 | BBPreclaim(bn); |
| 990 | BBPreclaim(bn2); |
| 991 | throw(MAL, "txtsim.qgramselfjoin" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 992 | } |
| 993 | } |
| 994 | } |
| 995 | } |
| 996 | |
| 997 | BBPunfix(qgram->batCacheid); |
| 998 | BBPunfix(id->batCacheid); |
| 999 | BBPunfix(pos->batCacheid); |
| 1000 | BBPunfix(len->batCacheid); |
| 1001 | |
| 1002 | BBPkeepref(*res1 = bn->batCacheid); |
| 1003 | BBPkeepref(*res2 = bn2->batCacheid); |
| 1004 | |
| 1005 | return MAL_SUCCEED; |
| 1006 | } |
| 1007 | |
| 1008 | /* copy up to utf8len UTF-8 encoded characters from src to buf |
| 1009 | * stop early if buf (size given by bufsize) is too small, or if src runs out |
| 1010 | * return number of UTF-8 characters copied (excluding NUL) |
| 1011 | * close with NUL if enough space */ |
| 1012 | static size_t |
| 1013 | utf8strncpy(char *buf, size_t bufsize, const char *src, size_t utf8len) |
| 1014 | { |
| 1015 | size_t cnt = 0; |
| 1016 | |
| 1017 | while (utf8len != 0 && *src != 0 && bufsize != 0) { |
| 1018 | bufsize--; |
| 1019 | utf8len--; |
| 1020 | cnt++; |
| 1021 | if (((*buf++ = *src++) & 0x80) != 0) { |
| 1022 | while ((*src & 0xC0) == 0x80 && bufsize != 0) { |
| 1023 | *buf++ = *src++; |
| 1024 | bufsize--; |
| 1025 | } |
| 1026 | } |
| 1027 | } |
| 1028 | if (bufsize != 0) |
| 1029 | *buf = 0; |
| 1030 | return cnt; |
| 1031 | } |
| 1032 | |
| 1033 | str |
| 1034 | CMDstr2qgrams(bat *ret, str *val) |
| 1035 | { |
| 1036 | BAT *bn; |
| 1037 | size_t i, len = strlen(*val) + 5; |
| 1038 | str s = GDKmalloc(len); |
| 1039 | char qgram[4 * 6 + 1]; /* 4 UTF-8 code points plus NULL byte */ |
| 1040 | |
| 1041 | if (s == NULL) |
| 1042 | throw(MAL, "txtsim.str2qgram" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 1043 | strcpy(s, "##" ); |
| 1044 | strcpy(s + 2, *val); |
| 1045 | strcpy(s + len - 3, "$$" ); |
| 1046 | bn = COLnew(0, TYPE_str, (BUN) strlen(*val), TRANSIENT); |
| 1047 | if (bn == NULL) { |
| 1048 | GDKfree(s); |
| 1049 | throw(MAL, "txtsim.str2qgram" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 1050 | } |
| 1051 | |
| 1052 | i = 0; |
| 1053 | while (s[i]) { |
| 1054 | if (utf8strncpy(qgram, sizeof(qgram), s + i, 4) < 4) |
| 1055 | break; |
| 1056 | if (BUNappend(bn, qgram, false) != GDK_SUCCEED) { |
| 1057 | BBPreclaim(bn); |
| 1058 | GDKfree(s); |
| 1059 | throw(MAL, "txtsim.str2qgram" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 1060 | } |
| 1061 | if ((s[i++] & 0xC0) == 0xC0) { |
| 1062 | while ((s[i] & 0xC0) == 0x80) |
| 1063 | i++; |
| 1064 | } |
| 1065 | } |
| 1066 | BBPkeepref(*ret = bn->batCacheid); |
| 1067 | GDKfree(s); |
| 1068 | return MAL_SUCCEED; |
| 1069 | } |
| 1070 | |