txtsim.c source code [MonetDB/monetdb5/modules/mal/txtsim.c]

1	/*
2	* This Source Code Form is subject to the terms of the Mozilla Public
3	* License, v. 2.0. If a copy of the MPL was not distributed with this
4	* file, You can obtain one at http://mozilla.org/MPL/2.0/.
5	*
6	* Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V.
7	*/
8
9	/*
10	* @f txtsim
11	* @t Module providing similarity metrics for strings
12	* @a Romulo Goncalves (from M4 to M5)
13	* @d 01/12/2007
14	* @v 0.1
15	*
16	* @+ String metrics
17	*
18	* Provides basic similarity metrics for strings.
19	*
20	*/
21	#include "monetdb_config.h"
22	#include "txtsim.h"
23	#include "mal_exception.h"
24
25
26	#define RETURN_NIL_IF(b,t) \
27	if (b) {\
28	if (ATOMextern(t)) {\
29	(ptr) res = (ptr) ATOMnil(t);\
30	if ( (ptr ) res == NULL)\
31	throw(MAL,"txtsim", SQLSTATE(HY001) MAL_MALLOC_FAIL);\
32	} else {\
33	memcpy(res, ATOMnilptr(t), ATOMsize(t));\
34	}\
35	return MAL_SUCCEED; \
36	}
37
38	/ =========================================================================*
39	* LEVENSH?TEIN FUNCTION
40	* Source:
41	* http://www.merriampark.com/ld.htm
42	* =========================================================================
43	*/
44
45	#define MYMIN(x,y) ( (x<y) ? x : y )
46	#define SMALLEST_OF(x,y,z) ( MYMIN(MYMIN(x,y),z) )
47	#define SMALLEST_OF4(x,y,z,z2) ( MYMIN(MYMIN(MYMIN(x,y),z),z2) )
48
49	/***************************************************
50	* Get a pointer to the specified cell of the matrix
51	**************************************************/
52
53	static inline int *
54	levenshtein_GetCellPointer(int pOrigin, int* col, int row, int nCols)
55	{
56	return pOrigin + col + (row * (nCols + `1`));
57	}
58
59	/******************************************************
60	* Get the contents of the specified cell in the matrix
61	*****************************************************/
62
63	static inline int
64	levenshtein_GetAt(int pOrigin, int* col, int row, int nCols)
65	{
66	int *pCell;
67
68	pCell = levenshtein_GetCellPointer(pOrigin, col, row, nCols);
69	return *pCell;
70
71	}
72
73	/********************************************************
74	* Fill the specified cell in the matrix with the value x
75	*******************************************************/
76
77	static inline void
78	levenshtein_PutAt(int pOrigin, int* col, int row, int nCols, int x)
79	{
80	int *pCell;
81
82	pCell = levenshtein_GetCellPointer(pOrigin, col, row, nCols);
83	*pCell = x;
84	}
85
86
87	/******************************
88	* Compute Levenshtein distance
89	*****************************/
90	str
91	levenshtein_impl(int result, str S, str T, int* insdel_cost, int* replace_cost, int* *transpose_cost)
92	{
93	char s = S;
94	char t = T;
95	int d; /* pointer to matrix /
96	int n; / length of s /
97	int m; / length of t /
98	int i; / iterates through s /
99	int j; / iterates through t /
100	char s_i; / ith character of s /
101	char t_j; / jth character of t /
102	int cost; / cost /
103	int cell; / contents of target cell /
104	int above; / contents of cell immediately above /
105	int left; / contents of cell immediately to left /
106	int diag; / contents of cell immediately above and to left /
107	int sz; / number of cells in matrix /
108	int diag2 = `0`, cost2 = `0`;
109
110	/ Step 1 /
111	n = (int) strlen(s); / 64bit: assume strings are less than 2 GB /
112	m = (int) strlen(t);
113	if (n == `0`) {
114	*result = m;
115	return MAL_SUCCEED;
116	}
117	if (m == `0`) {
118	*result = n;
119	return MAL_SUCCEED;
120	}
121	sz = (n + `1`) * (m + `1`) * sizeof(int);
122	d = (int *) GDKmalloc(sz);
123	if ( d == NULL)
124	throw(MAL,"levenshtein", SQLSTATE(HY001) MAL_MALLOC_FAIL);
125
126	/ Step 2 /
127	for (i = `0`; i <= n; i++) {
128	levenshtein_PutAt(d, i, `0`, n, i);
129	}
130
131	for (j = `0`; j <= m; j++) {
132	levenshtein_PutAt(d, `0`, j, n, j);
133	}
134
135	/ Step 3 /
136	for (i = `1`; i <= n; i++) {
137
138	s_i = s[i - `1`];
139
140	/ Step 4 /
141	for (j = `1`; j <= m; j++) {
142
143	t_j = t[j - `1`];
144
145	/ Step 5 /
146	if (s_i == t_j) {
147	cost = `0`;
148	} else {
149	cost = *replace_cost;
150	}
151
152	/ Step 6 /
153	above = levenshtein_GetAt(d, i - `1`, j, n);
154	left = levenshtein_GetAt(d, i, j - `1`, n);
155	diag = levenshtein_GetAt(d, i - `1`, j - `1`, n);
156
157	if (j >= `2` && i >= `2`) {
158	/ NEW: detect transpositions /
159
160	diag2 = levenshtein_GetAt(d, i - `2`, j - `2`, n);
161	if (s[i - `2`] == t[j - `1`] && s[i - `1`] == t[j - `2`]) {
162	cost2 = *transpose_cost;
163	} else {
164	cost2 = `2`;
165	}
166	cell = SMALLEST_OF4(above + insdel_cost, left + insdel_cost, diag + cost, diag2 + cost2);
167	} else {
168	cell = SMALLEST_OF(above + insdel_cost, left + insdel_cost, diag + cost);
169	}
170	levenshtein_PutAt(d, i, j, n, cell);
171	}
172	}
173
174	/ Step 7 /
175	*result = levenshtein_GetAt(d, n, m, n);
176	GDKfree(d);
177	return MAL_SUCCEED;
178	}
179
180	str
181	levenshteinbasic_impl(int result, str s, str *t)
182	{
183	int insdel = `1`, replace = `1`, transpose = `2`;
184
185	return levenshtein_impl(result, s, t, &insdel, &replace, &transpose);
186	}
187
188	str
189	levenshteinbasic2_impl(int result, str s, str *t)
190	{
191	int insdel = `1`, replace = `1`, transpose = `1`;
192
193	return levenshtein_impl(result, s, t, &insdel, &replace, &transpose);
194	}
195
196
197	/ =========================================================================*
198	* SOUNDEX FUNCTION
199	* Source:
200	* http://www.mit.edu/afs/sipb/service/rtfm/src/freeWAIS-sf-1.0/ir/soundex.c
201	* =========================================================================
202	*/
203
204	#define SoundexLen 4 /* length of a soundex code */
205	#define SoundexKey "Z000" /* default key for soundex code */
206
207	/ set letter values /
208	static int Code[] = { `0`, `1`, `2`, `3`, `0`, `1`, `2`, `0`, `0`, `2`, `2`, `4`, `5`, `5`, `0`,
209	`1`, `2`, `6`, `2`, `3`, `0`, `1`, `0`, `2`, `0`, `2`
210	};
211
212	static inline char
213	SCode(unsigned char c)
214	{
215	if (c == `95`)
216	return (`2`); / german sz /
217	return (Code[toupper(c) - `'A'`]);
218	}
219
220	static void
221	soundex_code(char Name, char* *Key)
222	{
223	char LastLetter;
224	int Index;
225
226	/ set default key /
227	strcpy(Key, SoundexKey);
228
229	/ keep first letter /
230	Key[`0`] = *Name;
231	if (!isupper((unsigned char) (Key[`0`])))
232	Key[`0`] = toupper(Key[`0`]);
233
234	LastLetter = *Name;
235	if (!*Name)
236	return;
237	Name++;
238
239	/ scan rest of string /
240	for (Index = `1`; (Index <SoundexLen) &&*Name; Name++) {
241	/ use only letters /
242	if (isalpha((unsigned char) (*Name))) {
243	/ ignore duplicate successive chars /
244	if (LastLetter != *Name) {
245	/ new LastLetter /
246	LastLetter = *Name;
247
248	/ ignore letters with code 0 /
249	if (SCode(*Name) != `0`) {
250	Key[Index] = `'0'` + SCode(*Name);
251	Index ++;
252	}
253	}
254	}
255	}
256	}
257
258
259	str
260	soundex_impl(str res, str Name)
261	{
262	RETURN_NIL_IF(strNil(*Name), TYPE_str);
263
264	res = (str) GDKmalloc(sizeof(char) (SoundexLen + `1`));
265	if( *res == NULL)
266	throw(MAL,"soundex", SQLSTATE(HY001) MAL_MALLOC_FAIL);
267
268	/ calculate Key for Name /
269	soundex_code(Name, res);
270
271	return MAL_SUCCEED;
272	}
273
274	str
275	stringdiff_impl(int res, str s1, str *s2)
276	{
277	str r = MAL_SUCCEED;
278	char S1 = NULL, S2 = NULL;
279
280	r = soundex_impl(&S1, s1);
281	if( r != MAL_SUCCEED)
282	return r;
283	r = soundex_impl(&S2, s2);
284	if( r != MAL_SUCCEED){
285	GDKfree(S1);
286	return r;
287	}
288	r = levenshteinbasic_impl(res, &S1, &S2);
289	GDKfree(S1);
290	GDKfree(S2);
291	return r;
292	}
293
294	/******************************
295	* QGRAMNORMALIZE
296	*
297	* This function 'normalizes' a string so valid q-grams can be made of it:
298	* All characters are transformed to uppercase, and all characters
299	* which are not letters or digits are stripped to a single space.
300	*
301	* qgramnormalize("Hallo, allemaal!").print(); --> "HALLO ALLEMAAL"
302	* qgramnormalize(" '' t ' est").print(); --> [ "T EST" ]
303	*
304	*****************************/
305	str
306	CMDqgramnormalize(str res, str Input)
307	{
308	char input = Input;
309	int i, j = `0`;
310	char c, last = `' '`;
311
312	RETURN_NIL_IF(strNil(input), TYPE_str);
313	res = (str) GDKmalloc(sizeof(char) (strlen(input) + `1`)); / normalized strings are never longer than original /
314	if (*res == NULL)
315	throw(MAL, "txtsim.qgramnormalize", SQLSTATE(HY001) MAL_MALLOC_FAIL);
316
317	for (i = `0`; input[i]; i++) {
318	c = toupper(input[i]);
319	if (!((`'A'` <= c && c <= `'Z'`) \|\| isdigit((unsigned char) c)))
320	c = `' '`;
321	if (c != `' '` \|\| last != `' '`) {
322	(*res)[j++] = c;
323	}
324	last = c;
325	}
326	(*res)[j] = `0`;
327	/ strip final whitespace /
328	while (j > `0` && (*res)[--j] == `' '`)
329	(*res)[j] = `0`;
330
331	return MAL_SUCCEED;
332	}
333
334	/ =========================================================================*
335	* FSTRCMP FUNCTION
336	* Source:
337	* http://search.cpan.org/src/MLEHMANN/String-Similarity-1/fstrcmp.c
338	* =========================================================================
339	*/
340
341	#define PARAMS(proto) proto
342
343	/*
344	* Data on one input string being compared.
345	*/
346	struct string_data {
347	/ The string to be compared. /
348	const char *data;
349
350	/ The length of the string to be compared. /
351	int data_length;
352
353	/ The number of characters inserted or deleted. /
354	int edit_count;
355	};
356
357	static struct string_data string[`2`];
358
359	static int max_edits; / compareseq stops when edits > max_edits /
360
361	#ifdef MINUS_H_FLAG
362
363	/ This corresponds to the diff -H flag. With this heuristic, for*
364	strings with a constant small density of changes, the algorithm is
365	linear in the strings size. This is unlikely in typical uses of
366	fstrcmp, and so is usually compiled out. Besides, there is no
367	interface to set it true. /*
368	static int heuristic;
369
370	#endif
371
372
373	/ Vector, indexed by diagonal, containing 1 + the X coordinate of the*
374	point furthest along the given diagonal in the forward search of the
375	edit matrix. /*
376	static int *fdiag;
377
378	/ Vector, indexed by diagonal, containing the X coordinate of the point*
379	furthest along the given diagonal in the backward search of the edit
380	matrix. /*
381	static int *bdiag;
382
383	/ Edit scripts longer than this are too expensive to compute. /
384	static int too_expensive;
385
386	/ Snakes bigger than this are considered `big'. /
387	#define SNAKE_LIMIT 20
388
389	struct partition {
390	/ Midpoints of this partition. /
391	int xmid, ymid;
392
393	/ Nonzero if low half will be analyzed minimally. /
394	int lo_minimal;
395
396	/ Likewise for high half. /
397	int hi_minimal;
398	};
399
400
401	/ NAME*
402	diag - find diagonal path
403
404	SYNOPSIS
405	int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
406	struct partition part);*
407
408	DESCRIPTION
409	Find the midpoint of the shortest edit script for a specified
410	portion of the two strings.
411
412	Scan from the beginnings of the strings, and simultaneously from
413	the ends, doing a breadth-first search through the space of
414	edit-sequence. When the two searches meet, we have found the
415	midpoint of the shortest edit sequence.
416
417	If MINIMAL is nonzero, find the minimal edit script regardless
418	of expense. Otherwise, if the search is too expensive, use
419	heuristics to stop the search and report a suboptimal answer.
420
421	RETURNS
422	Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal
423	number XMID - YMID equals the number of inserted characters
424	minus the number of deleted characters (counting only characters
425	before the midpoint). Return the approximate edit cost; this is
426	the total number of characters inserted or deleted (counting
427	only characters before the midpoint), unless a heuristic is used
428	to terminate the search prematurely.
429
430	Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
431	for the left half of the partition is known; similarly for
432	PART->RIGHT_MINIMAL.
433
434	CAVEAT
435	This function assumes that the first characters of the specified
436	portions of the two strings do not match, and likewise that the
437	last characters do not match. The caller must trim matching
438	characters from the beginning and end of the portions it is
439	going to specify.
440
441	If we return the "wrong" partitions, the worst this can do is
442	cause suboptimal diff output. It cannot cause incorrect diff
443	output. /*
444
445	static int diag PARAMS((int, int, int, int, int, struct partition *));
446
447	static int
448	diag(int xoff, int xlim, int yoff, int ylim, int minimal, struct partition *part)
449	{
450	int *const fd = fdiag; / Give the compiler a chance. /
451	int *const bd = bdiag; / Additional help for the compiler. /
452	const char *const xv = string[`0`].data; / Still more help for the compiler. /
453	const char *const yv = string[`1`].data; / And more and more . . . /
454	const int dmin = xoff - ylim; / Minimum valid diagonal. /
455	const int dmax = xlim - yoff; / Maximum valid diagonal. /
456	const int fmid = xoff - yoff; / Center diagonal of top-down search. /
457	const int bmid = xlim - ylim; / Center diagonal of bottom-up search. /
458	int fmin = fmid;
459	int fmax = fmid; / Limits of top-down search. /
460	int bmin = bmid;
461	int bmax = bmid; / Limits of bottom-up search. /
462	int c; / Cost. /
463	int odd = (fmid - bmid) & `1`;
464
465	/*
466	* True if southeast corner is on an odd diagonal with respect
467	* to the northwest.
468	*/
469	fd[fmid] = xoff;
470	bd[bmid] = xlim;
471	for (c = `1`;; ++c) {
472	int d; / Active diagonal. /
473	int big_snake;
474
475	big_snake = `0`;
476	/ Extend the top-down search by an edit step in each diagonal. /
477	if (fmin > dmin)
478	fd[--fmin - `1`] = -`1`;
479	else
480	++fmin;
481	if (fmax < dmax)
482	fd[++fmax + `1`] = -`1`;
483	else
484	--fmax;
485	for (d = fmax; d >= fmin; d -= `2`) {
486	int x;
487	int y;
488	int oldx;
489	int tlo;
490	int thi;
491
492	tlo = fd[d - `1`], thi = fd[d + `1`];
493
494	if (tlo >= thi)
495	x = tlo + `1`;
496	else
497	x = thi;
498	oldx = x;
499	y = x - d;
500	while (x < xlim && y < ylim && xv[x] == yv[y]) {
501	++x;
502	++y;
503	}
504	if (x - oldx > SNAKE_LIMIT)
505	big_snake = `1`;
506	fd[d] = x;
507	if (odd && bmin <= d && d <= bmax && bd[d] <= x) {
508	part->xmid = x;
509	part->ymid = y;
510	part->lo_minimal = part->hi_minimal = `1`;
511	return `2` * c - `1`;
512	}
513	}
514	/ Similarly extend the bottom-up search. /
515	if (bmin > dmin)
516	bd[--bmin - `1`] = INT_MAX;
517	else
518	++bmin;
519	if (bmax < dmax)
520	bd[++bmax + `1`] = INT_MAX;
521	else
522	--bmax;
523	for (d = bmax; d >= bmin; d -= `2`) {
524	int x;
525	int y;
526	int oldx;
527	int tlo;
528	int thi;
529
530	tlo = bd[d - `1`], thi = bd[d + `1`];
531	if (tlo < thi)
532	x = tlo;
533	else
534	x = thi - `1`;
535	oldx = x;
536	y = x - d;
537	while (x > xoff && y > yoff && xv[x - `1`] == yv[y - `1`]) {
538	--x;
539	--y;
540	}
541	if (oldx - x > SNAKE_LIMIT)
542	big_snake = `1`;
543	bd[d] = x;
544	if (!odd && fmin <= d && d <= fmax && x <= fd[d]) {
545	part->xmid = x;
546	part->ymid = y;
547	part->lo_minimal = part->hi_minimal = `1`;
548	return `2` * c;
549	}
550	}
551
552	if (minimal)
553	continue;
554
555	#ifdef MINUS_H_FLAG
556	/ Heuristic: check occasionally for a diagonal that has made lots*
557	of progress compared with the edit distance. If we have any
558	such, find the one that has made the most progress and return
559	it as if it had succeeded.
560
561	With this heuristic, for strings with a constant small density
562	of changes, the algorithm is linear in the strings size. /*
563	if (c > `200` && big_snake && heuristic) {
564	int best;
565
566	best = `0`;
567	for (d = fmax; d >= fmin; d -= `2`) {
568	int dd;
569	int x;
570	int y;
571	int v;
572
573	dd = d - fmid;
574	x = fd[d];
575	y = x - d;
576	v = (x - xoff) * `2` - dd;
577
578	if (v > `12` * (c + (dd < `0` ? -dd : dd))) {
579	if (v > best && xoff + SNAKE_LIMIT <= x && x < xlim && yoff + SNAKE_LIMIT <= y && y < ylim) {
580	/ We have a good enough best diagonal; now insist*
581	that it end with a significant snake. /*
582	int k;
583
584	for (k = `1`; xv[x - k] == yv[y - k]; k++) {
585	if (k == SNAKE_LIMIT) {
586	best = v;
587	part->xmid = x;
588	part->ymid = y;
589	break;
590	}
591	}
592	}
593	}
594	}
595	if (best > `0`) {
596	part->lo_minimal = `1`;
597	part->hi_minimal = `0`;
598	return `2` * c - `1`;
599	}
600	best = `0`;
601	for (d = bmax; d >= bmin; d -= `2`) {
602	int dd;
603	int x;
604	int y;
605	int v;
606
607	dd = d - bmid;
608	x = bd[d];
609	y = x - d;
610	v = (xlim - x) * `2` + dd;
611
612	if (v > `12` * (c + (dd < `0` ? -dd : dd))) {
613	if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT && yoff < y && y <= ylim - SNAKE_LIMIT) {
614	/ We have a good enough best diagonal; now insist*
615	that it end with a significant snake. /*
616	int k;
617
618	for (k = `0`; xv[x + k] == yv[y + k]; k++) {
619	if (k == SNAKE_LIMIT - `1`) {
620	best = v;
621	part->xmid = x;
622	part->ymid = y;
623	break;
624	}
625	}
626	}
627	}
628	}
629	if (best > `0`) {
630	part->lo_minimal = `0`;
631	part->hi_minimal = `1`;
632	return `2` * c - `1`;
633	}
634	}
635	#else
636	(void) big_snake;
637	#endif /* MINUS_H_FLAG */
638
639	/ Heuristic: if we've gone well beyond the call of duty, give up*
640	and report halfway between our best results so far. /*
641	if (c >= too_expensive) {
642	int fxybest;
643	int fxbest;
644	int bxybest;
645	int bxbest;
646
647	/ Pacify `gcc -Wall'. /
648	fxbest = `0`;
649	bxbest = `0`;
650
651	/ Find forward diagonal that maximizes X + Y. /
652	fxybest = -`1`;
653	for (d = fmax; d >= fmin; d -= `2`) {
654	int x;
655	int y;
656
657	x = fd[d] < xlim ? fd[d] : xlim;
658	y = x - d;
659
660	if (ylim < y) {
661	x = ylim + d;
662	y = ylim;
663	}
664	if (fxybest < x + y) {
665	fxybest = x + y;
666	fxbest = x;
667	}
668	}
669	/ Find backward diagonal that minimizes X + Y. /
670	bxybest = INT_MAX;
671	for (d = bmax; d >= bmin; d -= `2`) {
672	int x;
673	int y;
674
675	x = xoff > bd[d] ? xoff : bd[d];
676	y = x - d;
677
678	if (y < yoff) {
679	x = yoff + d;
680	y = yoff;
681	}
682	if (x + y < bxybest) {
683	bxybest = x + y;
684	bxbest = x;
685	}
686	}
687	/ Use the better of the two diagonals. /
688	if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) {
689	part->xmid = fxbest;
690	part->ymid = fxybest - fxbest;
691	part->lo_minimal = `1`;
692	part->hi_minimal = `0`;
693	} else {
694	part->xmid = bxbest;
695	part->ymid = bxybest - bxbest;
696	part->lo_minimal = `0`;
697	part->hi_minimal = `1`;
698	}
699	return `2` * c - `1`;
700	}
701	}
702	}
703
704
705	/ NAME*
706	compareseq - find edit sequence
707
708	SYNOPSIS
709	void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal);
710
711	DESCRIPTION
712	Compare in detail contiguous subsequences of the two strings
713	which are known, as a whole, to match each other.
714
715	The subsequence of string 0 is [XOFF, XLIM) and likewise for
716	string 1.
717
718	Note that XLIM, YLIM are exclusive bounds. All character
719	numbers are origin-0.
720
721	If MINIMAL is nonzero, find a minimal difference no matter how
722	expensive it is. /*
723
724	static void compareseq PARAMS((int, int, int, int, int));
725
726	static void
727	compareseq(int xoff, int xlim, int yoff, int ylim, int minimal)
728	{
729	const char *const xv = string[`0`].data; / Help the compiler. /
730	const char *const yv = string[`1`].data;
731
732	if (string[`1`].edit_count + string[`0`].edit_count > max_edits)
733	return;
734
735	/ Slide down the bottom initial diagonal. /
736	while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff]) {
737	++xoff;
738	++yoff;
739	}
740
741	/ Slide up the top initial diagonal. /
742	while (xlim > xoff && ylim > yoff && xv[xlim - `1`] == yv[ylim - `1`]) {
743	--xlim;
744	--ylim;
745	}
746
747	/ Handle simple cases. /
748	if (xoff == xlim) {
749	while (yoff < ylim) {
750	++string[`1`].edit_count;
751	++yoff;
752	}
753	} else if (yoff == ylim) {
754	while (xoff < xlim) {
755	++string[`0`].edit_count;
756	++xoff;
757	}
758	} else {
759	int c;
760	struct partition part;
761
762	/ Find a point of correspondence in the middle of the strings. /
763	c = diag(xoff, xlim, yoff, ylim, minimal, &part);
764	if (c == `1`) {
765	#if 0
766	/ This should be impossible, because it implies that one of*
767	the two subsequences is empty, and that case was handled
768	above without calling `diag'. Let's verify that this is
769	true. /*
770	abort();
771	#else
772	/ The two subsequences differ by a single insert or delete;*
773	record it and we are done. /*
774	if (part.xmid - part.ymid < xoff - yoff)
775	++string[`1`].edit_count;
776	else
777	++string[`0`].edit_count;
778	#endif
779	} else {
780	/ Use the partitions to split this problem into subproblems. /
781	compareseq(xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
782	compareseq(part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
783	}
784	}
785	}
786
787	/ NAME*
788	fstrcmp - fuzzy string compare
789
790	SYNOPSIS
791	double fstrcmp(const char s1, int l1, const char s2, int l2, double);
792
793	DESCRIPTION
794	The fstrcmp function may be used to compare two string for
795	similarity. It is very useful in reducing "cascade" or
796	"secondary" errors in compilers or other situations where
797	symbol tables occur.
798
799	RETURNS
800	double; 0 if the strings are entirly dissimilar, 1 if the
801	strings are identical, and a number in between if they are
802	similar. /*
803
804	str
805	fstrcmp_impl(dbl ret, str S1, str S2, dbl minimum)
806	{
807	char string1 = S1;
808	char string2 = S2;
809	int i;
810
811	size_t fdiag_len;
812	static int *fdiag_buf;
813	static size_t fdiag_max;
814
815	/ set the info for each string. /
816	string[`0`].data = string1;
817	string[`0`].data_length = (int) strlen(string1); / 64bit: assume string not too long /
818	string[`1`].data = string2;
819	string[`1`].data_length = (int) strlen(string2); / 64bit: assume string not too long /
820
821	/ short-circuit obvious comparisons /
822	if (string[`0`].data_length == `0` && string[`1`].data_length == `0`) {
823	*ret = `1.0`;
824	return MAL_SUCCEED;
825	}
826	if (string[`0`].data_length == `0` \|\| string[`1`].data_length == `0`) {
827	*ret = `0.0`;
828	return MAL_SUCCEED;
829	}
830
831	/ Set TOO_EXPENSIVE to be approximate square root of input size,*
832	bounded below by 256. /*
833	too_expensive = `1`;
834	for (i = string[`0`].data_length + string[`1`].data_length; i != `0`; i >>= `2`)
835	too_expensive <<= `1`;
836	if (too_expensive < `256`)
837	too_expensive = `256`;
838
839	/ Because fstrcmp is typically called multiple times, while scanning*
840	symbol tables, etc, attempt to minimize the number of memory
841	allocations performed. Thus, we use a static buffer for the
842	diagonal vectors, and never free them. /*
843	fdiag_len = string[`0`].data_length + string[`1`].data_length + `3`;
844	if (fdiag_len > fdiag_max) {
845	fdiag_max = fdiag_len;
846	fdiag_buf = realloc(fdiag_buf, fdiag_max * (`2` * sizeof(int)));
847	}
848	fdiag = fdiag_buf + string[`1`].data_length + `1`;
849	bdiag = fdiag + fdiag_len;
850
851	max_edits = `1` + (int) ((string[`0`].data_length + string[`1`].data_length) * (`1.` - *minimum));
852
853	/ Now do the main comparison algorithm /
854	string[`0`].edit_count = `0`;
855	string[`1`].edit_count = `0`;
856	compareseq(`0`, string[`0`].data_length, `0`, string[`1`].data_length, `0`);
857
858	/ The result is*
859	((number of chars in common) / (average length of the strings)).
860	This is admittedly biased towards finding that the strings are
861	similar, however it does produce meaningful results. /*
862	ret = ((double*)
863	(string[`0`].data_length + string[`1`].data_length - string[`1`].edit_count - string[`0`].edit_count)
864	/ (string[`0`].data_length + string[`1`].data_length));
865	return MAL_SUCCEED;
866	}
867
868	str
869	fstrcmp0_impl(dbl ret, str string1, str *string2)
870	{
871	double min = `0.0`;
872
873	return fstrcmp_impl(ret, string1, string2, &min);
874	}
875
876
877	/ ============ Q-GRAM SELF JOIN ============== /
878
879	str
880	CMDqgramselfjoin(bat res1, bat res2, bat qid, bat bid, bat pid, bat lid, flt c, int* *k)
881	{
882	BAT qgram, id, pos, len;
883	BUN n;
884	BUN i, j;
885	BAT bn, bn2;
886	oid *qbuf;
887	int *ibuf;
888	int *pbuf;
889	int *lbuf;
890	str msg = MAL_SUCCEED;
891
892	qgram = BATdescriptor(*qid);
893	id = BATdescriptor(*bid);
894	pos = BATdescriptor(*pid);
895	len = BATdescriptor(*lid);
896	if (qgram == NULL \|\| id == NULL \|\| pos == NULL \|\| len == NULL) {
897	if (qgram)
898	BBPunfix(qgram->batCacheid);
899	if (id)
900	BBPunfix(id->batCacheid);
901	if (pos)
902	BBPunfix(pos->batCacheid);
903	if (len)
904	BBPunfix(len->batCacheid);
905	throw(MAL, "txtsim.qgramselfjoin", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
906	}
907
908	if (qgram->ttype != TYPE_oid)
909	msg = createException(MAL, "tstsim.qgramselfjoin",
910	SEMANTIC_TYPE_MISMATCH ": tail of BAT qgram must be oid");
911	else if (id->ttype != TYPE_int)
912	msg = createException(MAL, "tstsim.qgramselfjoin",
913	SEMANTIC_TYPE_MISMATCH ": tail of BAT id must be int");
914	else if (pos->ttype != TYPE_int)
915	msg = createException(MAL, "tstsim.qgramselfjoin",
916	SEMANTIC_TYPE_MISMATCH ": tail of BAT pos must be int");
917	else if (len->ttype != TYPE_int)
918	msg = createException(MAL, "tstsim.qgramselfjoin",
919	SEMANTIC_TYPE_MISMATCH ": tail of BAT len must be int");
920	if (msg) {
921	BBPunfix(qgram->batCacheid);
922	BBPunfix(id->batCacheid);
923	BBPunfix(pos->batCacheid);
924	BBPunfix(len->batCacheid);
925	return msg;
926	}
927
928	n = BATcount(qgram);
929	qbuf = (oid *) Tloc(qgram, `0`);
930	ibuf = (int *) Tloc(id, `0`);
931	pbuf = (int *) Tloc(pos, `0`);
932	lbuf = (int *) Tloc(len, `0`);
933
934	/ if (BATcount(qgram)>1 && !BATtordered(qgram)) throw(MAL, "tstsim.qgramselfjoin", SEMANTIC_TYPE_MISMATCH); /
935
936	if (!ALIGNsynced(qgram, id))
937	msg = createException(MAL, "tstsim.qgramselfjoin",
938	SEMANTIC_TYPE_MISMATCH ": qgram and id are not synced");
939
940	else if (!ALIGNsynced(qgram, pos))
941	msg = createException(MAL, "tstsim.qgramselfjoin",
942	SEMANTIC_TYPE_MISMATCH ": qgram and pos are not synced");
943	else if (!ALIGNsynced(qgram, len))
944	msg = createException(MAL, "tstsim.qgramselfjoin",
945	SEMANTIC_TYPE_MISMATCH ": qgram and len are not synced");
946
947	else if (Tsize(qgram) != ATOMsize(qgram->ttype))
948	msg = createException(MAL, "tstsim.qgramselfjoin",
949	SEMANTIC_TYPE_MISMATCH ": qgram is not a true void bat");
950	else if (Tsize(id) != ATOMsize(id->ttype))
951	msg = createException(MAL, "tstsim.qgramselfjoin",
952	SEMANTIC_TYPE_MISMATCH ": id is not a true void bat");
953
954	else if (Tsize(pos) != ATOMsize(pos->ttype))
955	msg = createException(MAL, "tstsim.qgramselfjoin",
956	SEMANTIC_TYPE_MISMATCH ": pos is not a true void bat");
957	else if (Tsize(len) != ATOMsize(len->ttype))
958	msg = createException(MAL, "tstsim.qgramselfjoin",
959	SEMANTIC_TYPE_MISMATCH ": len is not a true void bat");
960	if (msg) {
961	BBPunfix(qgram->batCacheid);
962	BBPunfix(id->batCacheid);
963	BBPunfix(pos->batCacheid);
964	BBPunfix(len->batCacheid);
965	return msg;
966	}
967
968	bn = COLnew(`0`, TYPE_int, n, TRANSIENT);
969	bn2 = COLnew(`0`, TYPE_int, n, TRANSIENT);
970	if (bn == NULL \|\| bn2 == NULL){
971	BBPreclaim(bn);
972	BBPreclaim(bn2);
973	BBPunfix(qgram->batCacheid);
974	BBPunfix(id->batCacheid);
975	BBPunfix(pos->batCacheid);
976	BBPunfix(len->batCacheid);
977	throw(MAL, "txtsim.qgramselfjoin", SQLSTATE(HY001) MAL_MALLOC_FAIL);
978	}
979
980	for (i = `0`; i < n - `1`; i++) {
981	for (j = i + `1`; (j < n && qbuf[j] == qbuf[i] && pbuf[j] <= (pbuf[i] + (k + c * MYMIN(lbuf[i], lbuf[j])))); j++) {
982	if (ibuf[i] != ibuf[j] && abs(lbuf[i] - lbuf[j]) <= (k + c * MYMIN(lbuf[i], lbuf[j]))) {
983	if (BUNappend(bn, ibuf + i, false) != GDK_SUCCEED \|\|
984	BUNappend(bn2, ibuf + j, false) != GDK_SUCCEED) {
985	BBPunfix(qgram->batCacheid);
986	BBPunfix(id->batCacheid);
987	BBPunfix(pos->batCacheid);
988	BBPunfix(len->batCacheid);
989	BBPreclaim(bn);
990	BBPreclaim(bn2);
991	throw(MAL, "txtsim.qgramselfjoin", SQLSTATE(HY001) MAL_MALLOC_FAIL);
992	}
993	}
994	}
995	}
996
997	BBPunfix(qgram->batCacheid);
998	BBPunfix(id->batCacheid);
999	BBPunfix(pos->batCacheid);
1000	BBPunfix(len->batCacheid);
1001
1002	BBPkeepref(*res1 = bn->batCacheid);
1003	BBPkeepref(*res2 = bn2->batCacheid);
1004
1005	return MAL_SUCCEED;
1006	}
1007
1008	/ copy up to utf8len UTF-8 encoded characters from src to buf*
1009	* stop early if buf (size given by bufsize) is too small, or if src runs out
1010	* return number of UTF-8 characters copied (excluding NUL)
1011	* close with NUL if enough space */
1012	static size_t
1013	utf8strncpy(char buf, size_t bufsize, const* char *src, size_t utf8len)
1014	{
1015	size_t cnt = `0`;
1016
1017	while (utf8len != `0` && *src != `0` && bufsize != `0`) {
1018	bufsize--;
1019	utf8len--;
1020	cnt++;
1021	if (((buf++ = src++) & `0x80`) != `0`) {
1022	while ((*src & `0xC0`) == `0x80` && bufsize != `0`) {
1023	buf++ = src++;
1024	bufsize--;
1025	}
1026	}
1027	}
1028	if (bufsize != `0`)
1029	*buf = `0`;
1030	return cnt;
1031	}
1032
1033	str
1034	CMDstr2qgrams(bat ret, str val)
1035	{
1036	BAT *bn;
1037	size_t i, len = strlen(*val) + `5`;
1038	str s = GDKmalloc(len);
1039	char qgram[`4` * `6` + `1`]; / 4 UTF-8 code points plus NULL byte /
1040
1041	if (s == NULL)
1042	throw(MAL, "txtsim.str2qgram", SQLSTATE(HY001) MAL_MALLOC_FAIL);
1043	strcpy(s, "##");
1044	strcpy(s + `2`, *val);
1045	strcpy(s + len - `3`, "$$");
1046	bn = COLnew(`0`, TYPE_str, (BUN) strlen(*val), TRANSIENT);
1047	if (bn == NULL) {
1048	GDKfree(s);
1049	throw(MAL, "txtsim.str2qgram", SQLSTATE(HY001) MAL_MALLOC_FAIL);
1050	}
1051
1052	i = `0`;
1053	while (s[i]) {
1054	if (utf8strncpy(qgram, sizeof(qgram), s + i, `4`) < `4`)
1055	break;
1056	if (BUNappend(bn, qgram, false) != GDK_SUCCEED) {
1057	BBPreclaim(bn);
1058	GDKfree(s);
1059	throw(MAL, "txtsim.str2qgram", SQLSTATE(HY001) MAL_MALLOC_FAIL);
1060	}
1061	if ((s[i++] & `0xC0`) == `0xC0`) {
1062	while ((s[i] & `0xC0`) == `0x80`)
1063	i++;
1064	}
1065	}
1066	BBPkeepref(*ret = bn->batCacheid);
1067	GDKfree(s);
1068	return MAL_SUCCEED;
1069	}
1070

Browse the source code of MonetDB/monetdb5/modules/mal/txtsim.c