1/*-------------------------------------------------------------------------
2 *
3 * tsrank.c
4 * rank tsvector by tsquery
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/utils/adt/tsrank.c
11 *
12 *-------------------------------------------------------------------------
13 */
14#include "postgres.h"
15
16#include <limits.h>
17#include <math.h>
18
19#include "tsearch/ts_utils.h"
20#include "utils/array.h"
21#include "utils/builtins.h"
22#include "miscadmin.h"
23
24
25static const float weights[] = {0.1f, 0.2f, 0.4f, 1.0f};
26
27#define wpos(wep) ( w[ WEP_GETWEIGHT(wep) ] )
28
29#define RANK_NO_NORM 0x00
30#define RANK_NORM_LOGLENGTH 0x01
31#define RANK_NORM_LENGTH 0x02
32#define RANK_NORM_EXTDIST 0x04
33#define RANK_NORM_UNIQ 0x08
34#define RANK_NORM_LOGUNIQ 0x10
35#define RANK_NORM_RDIVRPLUS1 0x20
36#define DEF_NORM_METHOD RANK_NO_NORM
37
38static float calc_rank_or(const float *w, TSVector t, TSQuery q);
39static float calc_rank_and(const float *w, TSVector t, TSQuery q);
40
41/*
42 * Returns a weight of a word collocation
43 */
44static float4
45word_distance(int32 w)
46{
47 if (w > 100)
48 return 1e-30f;
49
50 return 1.0 / (1.005 + 0.05 * exp(((float4) w) / 1.5 - 2));
51}
52
53static int
54cnt_length(TSVector t)
55{
56 WordEntry *ptr = ARRPTR(t),
57 *end = (WordEntry *) STRPTR(t);
58 int len = 0;
59
60 while (ptr < end)
61 {
62 int clen = POSDATALEN(t, ptr);
63
64 if (clen == 0)
65 len += 1;
66 else
67 len += clen;
68
69 ptr++;
70 }
71
72 return len;
73}
74
75
76#define WordECompareQueryItem(e,q,p,i,m) \
77 tsCompareString((q) + (i)->distance, (i)->length, \
78 (e) + (p)->pos, (p)->len, (m))
79
80
81/*
82 * Returns a pointer to a WordEntry's array corresponding to 'item' from
83 * tsvector 't'. 'q' is the TSQuery containing 'item'.
84 * Returns NULL if not found.
85 */
86static WordEntry *
87find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem)
88{
89 WordEntry *StopLow = ARRPTR(t);
90 WordEntry *StopHigh = (WordEntry *) STRPTR(t);
91 WordEntry *StopMiddle = StopHigh;
92 int difference;
93
94 *nitem = 0;
95
96 /* Loop invariant: StopLow <= item < StopHigh */
97 while (StopLow < StopHigh)
98 {
99 StopMiddle = StopLow + (StopHigh - StopLow) / 2;
100 difference = WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, false);
101 if (difference == 0)
102 {
103 StopHigh = StopMiddle;
104 *nitem = 1;
105 break;
106 }
107 else if (difference > 0)
108 StopLow = StopMiddle + 1;
109 else
110 StopHigh = StopMiddle;
111 }
112
113 if (item->prefix)
114 {
115 if (StopLow >= StopHigh)
116 StopMiddle = StopHigh;
117
118 *nitem = 0;
119
120 while (StopMiddle < (WordEntry *) STRPTR(t) &&
121 WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, true) == 0)
122 {
123 (*nitem)++;
124 StopMiddle++;
125 }
126 }
127
128 return (*nitem > 0) ? StopHigh : NULL;
129}
130
131
132/*
133 * sort QueryOperands by (length, word)
134 */
135static int
136compareQueryOperand(const void *a, const void *b, void *arg)
137{
138 char *operand = (char *) arg;
139 QueryOperand *qa = (*(QueryOperand *const *) a);
140 QueryOperand *qb = (*(QueryOperand *const *) b);
141
142 return tsCompareString(operand + qa->distance, qa->length,
143 operand + qb->distance, qb->length,
144 false);
145}
146
147/*
148 * Returns a sorted, de-duplicated array of QueryOperands in a query.
149 * The returned QueryOperands are pointers to the original QueryOperands
150 * in the query.
151 *
152 * Length of the returned array is stored in *size
153 */
154static QueryOperand **
155SortAndUniqItems(TSQuery q, int *size)
156{
157 char *operand = GETOPERAND(q);
158 QueryItem *item = GETQUERY(q);
159 QueryOperand **res,
160 **ptr,
161 **prevptr;
162
163 ptr = res = (QueryOperand **) palloc(sizeof(QueryOperand *) * *size);
164
165 /* Collect all operands from the tree to res */
166 while ((*size)--)
167 {
168 if (item->type == QI_VAL)
169 {
170 *ptr = (QueryOperand *) item;
171 ptr++;
172 }
173 item++;
174 }
175
176 *size = ptr - res;
177 if (*size < 2)
178 return res;
179
180 qsort_arg(res, *size, sizeof(QueryOperand *), compareQueryOperand, (void *) operand);
181
182 ptr = res + 1;
183 prevptr = res;
184
185 /* remove duplicates */
186 while (ptr - res < *size)
187 {
188 if (compareQueryOperand((void *) ptr, (void *) prevptr, (void *) operand) != 0)
189 {
190 prevptr++;
191 *prevptr = *ptr;
192 }
193 ptr++;
194 }
195
196 *size = prevptr + 1 - res;
197 return res;
198}
199
200static float
201calc_rank_and(const float *w, TSVector t, TSQuery q)
202{
203 WordEntryPosVector **pos;
204 WordEntryPosVector1 posnull;
205 WordEntryPosVector *POSNULL;
206 int i,
207 k,
208 l,
209 p;
210 WordEntry *entry,
211 *firstentry;
212 WordEntryPos *post,
213 *ct;
214 int32 dimt,
215 lenct,
216 dist,
217 nitem;
218 float res = -1.0;
219 QueryOperand **item;
220 int size = q->size;
221
222 item = SortAndUniqItems(q, &size);
223 if (size < 2)
224 {
225 pfree(item);
226 return calc_rank_or(w, t, q);
227 }
228 pos = (WordEntryPosVector **) palloc0(sizeof(WordEntryPosVector *) * q->size);
229
230 /* A dummy WordEntryPos array to use when haspos is false */
231 posnull.npos = 1;
232 posnull.pos[0] = 0;
233 WEP_SETPOS(posnull.pos[0], MAXENTRYPOS - 1);
234 POSNULL = (WordEntryPosVector *) &posnull;
235
236 for (i = 0; i < size; i++)
237 {
238 firstentry = entry = find_wordentry(t, q, item[i], &nitem);
239 if (!entry)
240 continue;
241
242 while (entry - firstentry < nitem)
243 {
244 if (entry->haspos)
245 pos[i] = _POSVECPTR(t, entry);
246 else
247 pos[i] = POSNULL;
248
249 dimt = pos[i]->npos;
250 post = pos[i]->pos;
251 for (k = 0; k < i; k++)
252 {
253 if (!pos[k])
254 continue;
255 lenct = pos[k]->npos;
256 ct = pos[k]->pos;
257 for (l = 0; l < dimt; l++)
258 {
259 for (p = 0; p < lenct; p++)
260 {
261 dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p]));
262 if (dist || (dist == 0 && (pos[i] == POSNULL || pos[k] == POSNULL)))
263 {
264 float curw;
265
266 if (!dist)
267 dist = MAXENTRYPOS;
268 curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist));
269 res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw);
270 }
271 }
272 }
273 }
274
275 entry++;
276 }
277 }
278 pfree(pos);
279 pfree(item);
280 return res;
281}
282
283static float
284calc_rank_or(const float *w, TSVector t, TSQuery q)
285{
286 WordEntry *entry,
287 *firstentry;
288 WordEntryPosVector1 posnull;
289 WordEntryPos *post;
290 int32 dimt,
291 j,
292 i,
293 nitem;
294 float res = 0.0;
295 QueryOperand **item;
296 int size = q->size;
297
298 /* A dummy WordEntryPos array to use when haspos is false */
299 posnull.npos = 1;
300 posnull.pos[0] = 0;
301
302 item = SortAndUniqItems(q, &size);
303
304 for (i = 0; i < size; i++)
305 {
306 float resj,
307 wjm;
308 int32 jm;
309
310 firstentry = entry = find_wordentry(t, q, item[i], &nitem);
311 if (!entry)
312 continue;
313
314 while (entry - firstentry < nitem)
315 {
316 if (entry->haspos)
317 {
318 dimt = POSDATALEN(t, entry);
319 post = POSDATAPTR(t, entry);
320 }
321 else
322 {
323 dimt = posnull.npos;
324 post = posnull.pos;
325 }
326
327 resj = 0.0;
328 wjm = -1.0;
329 jm = 0;
330 for (j = 0; j < dimt; j++)
331 {
332 resj = resj + wpos(post[j]) / ((j + 1) * (j + 1));
333 if (wpos(post[j]) > wjm)
334 {
335 wjm = wpos(post[j]);
336 jm = j;
337 }
338 }
339/*
340 limit (sum(i/i^2),i->inf) = pi^2/6
341 resj = sum(wi/i^2),i=1,noccurence,
342 wi - should be sorted desc,
343 don't sort for now, just choose maximum weight. This should be corrected
344 Oleg Bartunov
345*/
346 res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685;
347
348 entry++;
349 }
350 }
351 if (size > 0)
352 res = res / size;
353 pfree(item);
354 return res;
355}
356
357static float
358calc_rank(const float *w, TSVector t, TSQuery q, int32 method)
359{
360 QueryItem *item = GETQUERY(q);
361 float res = 0.0;
362 int len;
363
364 if (!t->size || !q->size)
365 return 0.0;
366
367 /* XXX: What about NOT? */
368 res = (item->type == QI_OPR && (item->qoperator.oper == OP_AND ||
369 item->qoperator.oper == OP_PHRASE)) ?
370 calc_rank_and(w, t, q) :
371 calc_rank_or(w, t, q);
372
373 if (res < 0)
374 res = 1e-20f;
375
376 if ((method & RANK_NORM_LOGLENGTH) && t->size > 0)
377 res /= log((double) (cnt_length(t) + 1)) / log(2.0);
378
379 if (method & RANK_NORM_LENGTH)
380 {
381 len = cnt_length(t);
382 if (len > 0)
383 res /= (float) len;
384 }
385
386 /* RANK_NORM_EXTDIST not applicable */
387
388 if ((method & RANK_NORM_UNIQ) && t->size > 0)
389 res /= (float) (t->size);
390
391 if ((method & RANK_NORM_LOGUNIQ) && t->size > 0)
392 res /= log((double) (t->size + 1)) / log(2.0);
393
394 if (method & RANK_NORM_RDIVRPLUS1)
395 res /= (res + 1);
396
397 return res;
398}
399
400static const float *
401getWeights(ArrayType *win)
402{
403 static float ws[lengthof(weights)];
404 int i;
405 float4 *arrdata;
406
407 if (win == NULL)
408 return weights;
409
410 if (ARR_NDIM(win) != 1)
411 ereport(ERROR,
412 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
413 errmsg("array of weight must be one-dimensional")));
414
415 if (ArrayGetNItems(ARR_NDIM(win), ARR_DIMS(win)) < lengthof(weights))
416 ereport(ERROR,
417 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
418 errmsg("array of weight is too short")));
419
420 if (array_contains_nulls(win))
421 ereport(ERROR,
422 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
423 errmsg("array of weight must not contain nulls")));
424
425 arrdata = (float4 *) ARR_DATA_PTR(win);
426 for (i = 0; i < lengthof(weights); i++)
427 {
428 ws[i] = (arrdata[i] >= 0) ? arrdata[i] : weights[i];
429 if (ws[i] > 1.0)
430 ereport(ERROR,
431 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
432 errmsg("weight out of range")));
433 }
434
435 return ws;
436}
437
438Datum
439ts_rank_wttf(PG_FUNCTION_ARGS)
440{
441 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
442 TSVector txt = PG_GETARG_TSVECTOR(1);
443 TSQuery query = PG_GETARG_TSQUERY(2);
444 int method = PG_GETARG_INT32(3);
445 float res;
446
447 res = calc_rank(getWeights(win), txt, query, method);
448
449 PG_FREE_IF_COPY(win, 0);
450 PG_FREE_IF_COPY(txt, 1);
451 PG_FREE_IF_COPY(query, 2);
452 PG_RETURN_FLOAT4(res);
453}
454
455Datum
456ts_rank_wtt(PG_FUNCTION_ARGS)
457{
458 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
459 TSVector txt = PG_GETARG_TSVECTOR(1);
460 TSQuery query = PG_GETARG_TSQUERY(2);
461 float res;
462
463 res = calc_rank(getWeights(win), txt, query, DEF_NORM_METHOD);
464
465 PG_FREE_IF_COPY(win, 0);
466 PG_FREE_IF_COPY(txt, 1);
467 PG_FREE_IF_COPY(query, 2);
468 PG_RETURN_FLOAT4(res);
469}
470
471Datum
472ts_rank_ttf(PG_FUNCTION_ARGS)
473{
474 TSVector txt = PG_GETARG_TSVECTOR(0);
475 TSQuery query = PG_GETARG_TSQUERY(1);
476 int method = PG_GETARG_INT32(2);
477 float res;
478
479 res = calc_rank(getWeights(NULL), txt, query, method);
480
481 PG_FREE_IF_COPY(txt, 0);
482 PG_FREE_IF_COPY(query, 1);
483 PG_RETURN_FLOAT4(res);
484}
485
486Datum
487ts_rank_tt(PG_FUNCTION_ARGS)
488{
489 TSVector txt = PG_GETARG_TSVECTOR(0);
490 TSQuery query = PG_GETARG_TSQUERY(1);
491 float res;
492
493 res = calc_rank(getWeights(NULL), txt, query, DEF_NORM_METHOD);
494
495 PG_FREE_IF_COPY(txt, 0);
496 PG_FREE_IF_COPY(query, 1);
497 PG_RETURN_FLOAT4(res);
498}
499
500typedef struct
501{
502 union
503 {
504 struct
505 { /* compiled doc representation */
506 QueryItem **items;
507 int16 nitem;
508 } query;
509 struct
510 { /* struct is used for preparing doc
511 * representation */
512 QueryItem *item;
513 WordEntry *entry;
514 } map;
515 } data;
516 WordEntryPos pos;
517} DocRepresentation;
518
519static int
520compareDocR(const void *va, const void *vb)
521{
522 const DocRepresentation *a = (const DocRepresentation *) va;
523 const DocRepresentation *b = (const DocRepresentation *) vb;
524
525 if (WEP_GETPOS(a->pos) == WEP_GETPOS(b->pos))
526 {
527 if (WEP_GETWEIGHT(a->pos) == WEP_GETWEIGHT(b->pos))
528 {
529 if (a->data.map.entry == b->data.map.entry)
530 return 0;
531
532 return (a->data.map.entry > b->data.map.entry) ? 1 : -1;
533 }
534
535 return (WEP_GETWEIGHT(a->pos) > WEP_GETWEIGHT(b->pos)) ? 1 : -1;
536 }
537
538 return (WEP_GETPOS(a->pos) > WEP_GETPOS(b->pos)) ? 1 : -1;
539}
540
541#define MAXQROPOS MAXENTRYPOS
542typedef struct
543{
544 bool operandexists;
545 bool reverseinsert; /* indicates insert order, true means
546 * descending order */
547 uint32 npos;
548 WordEntryPos pos[MAXQROPOS];
549} QueryRepresentationOperand;
550
551typedef struct
552{
553 TSQuery query;
554 QueryRepresentationOperand *operandData;
555} QueryRepresentation;
556
557#define QR_GET_OPERAND_DATA(q, v) \
558 ( (q)->operandData + (((QueryItem*)(v)) - GETQUERY((q)->query)) )
559
560static bool
561checkcondition_QueryOperand(void *checkval, QueryOperand *val, ExecPhraseData *data)
562{
563 QueryRepresentation *qr = (QueryRepresentation *) checkval;
564 QueryRepresentationOperand *opData = QR_GET_OPERAND_DATA(qr, val);
565
566 if (!opData->operandexists)
567 return false;
568
569 if (data)
570 {
571 data->npos = opData->npos;
572 data->pos = opData->pos;
573 if (opData->reverseinsert)
574 data->pos += MAXQROPOS - opData->npos;
575 }
576
577 return true;
578}
579
580typedef struct
581{
582 int pos;
583 int p;
584 int q;
585 DocRepresentation *begin;
586 DocRepresentation *end;
587} CoverExt;
588
589static void
590resetQueryRepresentation(QueryRepresentation *qr, bool reverseinsert)
591{
592 int i;
593
594 for (i = 0; i < qr->query->size; i++)
595 {
596 qr->operandData[i].operandexists = false;
597 qr->operandData[i].reverseinsert = reverseinsert;
598 qr->operandData[i].npos = 0;
599 }
600}
601
602static void
603fillQueryRepresentationData(QueryRepresentation *qr, DocRepresentation *entry)
604{
605 int i;
606 int lastPos;
607 QueryRepresentationOperand *opData;
608
609 for (i = 0; i < entry->data.query.nitem; i++)
610 {
611 if (entry->data.query.items[i]->type != QI_VAL)
612 continue;
613
614 opData = QR_GET_OPERAND_DATA(qr, entry->data.query.items[i]);
615
616 opData->operandexists = true;
617
618 if (opData->npos == 0)
619 {
620 lastPos = (opData->reverseinsert) ? (MAXQROPOS - 1) : 0;
621 opData->pos[lastPos] = entry->pos;
622 opData->npos++;
623 continue;
624 }
625
626 lastPos = opData->reverseinsert ?
627 (MAXQROPOS - opData->npos) :
628 (opData->npos - 1);
629
630 if (WEP_GETPOS(opData->pos[lastPos]) != WEP_GETPOS(entry->pos))
631 {
632 lastPos = opData->reverseinsert ?
633 (MAXQROPOS - 1 - opData->npos) :
634 (opData->npos);
635
636 opData->pos[lastPos] = entry->pos;
637 opData->npos++;
638 }
639 }
640}
641
642static bool
643Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext)
644{
645 DocRepresentation *ptr;
646 int lastpos = ext->pos;
647 bool found = false;
648
649 /*
650 * since this function recurses, it could be driven to stack overflow.
651 * (though any decent compiler will optimize away the tail-recursion.
652 */
653 check_stack_depth();
654
655 resetQueryRepresentation(qr, false);
656
657 ext->p = INT_MAX;
658 ext->q = 0;
659 ptr = doc + ext->pos;
660
661 /* find upper bound of cover from current position, move up */
662 while (ptr - doc < len)
663 {
664 fillQueryRepresentationData(qr, ptr);
665
666 if (TS_execute(GETQUERY(qr->query), (void *) qr,
667 TS_EXEC_EMPTY, checkcondition_QueryOperand))
668 {
669 if (WEP_GETPOS(ptr->pos) > ext->q)
670 {
671 ext->q = WEP_GETPOS(ptr->pos);
672 ext->end = ptr;
673 lastpos = ptr - doc;
674 found = true;
675 }
676 break;
677 }
678 ptr++;
679 }
680
681 if (!found)
682 return false;
683
684 resetQueryRepresentation(qr, true);
685
686 ptr = doc + lastpos;
687
688 /* find lower bound of cover from found upper bound, move down */
689 while (ptr >= doc + ext->pos)
690 {
691 /*
692 * we scan doc from right to left, so pos info in reverse order!
693 */
694 fillQueryRepresentationData(qr, ptr);
695
696 if (TS_execute(GETQUERY(qr->query), (void *) qr,
697 TS_EXEC_CALC_NOT, checkcondition_QueryOperand))
698 {
699 if (WEP_GETPOS(ptr->pos) < ext->p)
700 {
701 ext->begin = ptr;
702 ext->p = WEP_GETPOS(ptr->pos);
703 }
704 break;
705 }
706 ptr--;
707 }
708
709 if (ext->p <= ext->q)
710 {
711 /*
712 * set position for next try to next lexeme after beginning of found
713 * cover
714 */
715 ext->pos = (ptr - doc) + 1;
716 return true;
717 }
718
719 ext->pos++;
720 return Cover(doc, len, qr, ext);
721}
722
723static DocRepresentation *
724get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
725{
726 QueryItem *item = GETQUERY(qr->query);
727 WordEntry *entry,
728 *firstentry;
729 WordEntryPos *post;
730 int32 dimt, /* number of 'post' items */
731 j,
732 i,
733 nitem;
734 int len = qr->query->size * 4,
735 cur = 0;
736 DocRepresentation *doc;
737
738 doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len);
739
740 /*
741 * Iterate through query to make DocRepresentaion for words and it's
742 * entries satisfied by query
743 */
744 for (i = 0; i < qr->query->size; i++)
745 {
746 QueryOperand *curoperand;
747
748 if (item[i].type != QI_VAL)
749 continue;
750
751 curoperand = &item[i].qoperand;
752
753 firstentry = entry = find_wordentry(txt, qr->query, curoperand, &nitem);
754 if (!entry)
755 continue;
756
757 /* iterations over entries in tsvector */
758 while (entry - firstentry < nitem)
759 {
760 if (entry->haspos)
761 {
762 dimt = POSDATALEN(txt, entry);
763 post = POSDATAPTR(txt, entry);
764 }
765 else
766 {
767 /* ignore words without positions */
768 entry++;
769 continue;
770 }
771
772 while (cur + dimt >= len)
773 {
774 len *= 2;
775 doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len);
776 }
777
778 /* iterations over entry's positions */
779 for (j = 0; j < dimt; j++)
780 {
781 if (curoperand->weight == 0 ||
782 curoperand->weight & (1 << WEP_GETWEIGHT(post[j])))
783 {
784 doc[cur].pos = post[j];
785 doc[cur].data.map.entry = entry;
786 doc[cur].data.map.item = (QueryItem *) curoperand;
787 cur++;
788 }
789 }
790
791 entry++;
792 }
793 }
794
795 if (cur > 0)
796 {
797 DocRepresentation *rptr = doc + 1,
798 *wptr = doc,
799 storage;
800
801 /*
802 * Sort representation in ascending order by pos and entry
803 */
804 qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR);
805
806 /*
807 * Join QueryItem per WordEntry and it's position
808 */
809 storage.pos = doc->pos;
810 storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size);
811 storage.data.query.items[0] = doc->data.map.item;
812 storage.data.query.nitem = 1;
813
814 while (rptr - doc < cur)
815 {
816 if (rptr->pos == (rptr - 1)->pos &&
817 rptr->data.map.entry == (rptr - 1)->data.map.entry)
818 {
819 storage.data.query.items[storage.data.query.nitem] = rptr->data.map.item;
820 storage.data.query.nitem++;
821 }
822 else
823 {
824 *wptr = storage;
825 wptr++;
826 storage.pos = rptr->pos;
827 storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size);
828 storage.data.query.items[0] = rptr->data.map.item;
829 storage.data.query.nitem = 1;
830 }
831
832 rptr++;
833 }
834
835 *wptr = storage;
836 wptr++;
837
838 *doclen = wptr - doc;
839 return doc;
840 }
841
842 pfree(doc);
843 return NULL;
844}
845
846static float4
847calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method)
848{
849 DocRepresentation *doc;
850 int len,
851 i,
852 doclen = 0;
853 CoverExt ext;
854 double Wdoc = 0.0;
855 double invws[lengthof(weights)];
856 double SumDist = 0.0,
857 PrevExtPos = 0.0,
858 CurExtPos = 0.0;
859 int NExtent = 0;
860 QueryRepresentation qr;
861
862
863 for (i = 0; i < lengthof(weights); i++)
864 {
865 invws[i] = ((double) ((arrdata[i] >= 0) ? arrdata[i] : weights[i]));
866 if (invws[i] > 1.0)
867 ereport(ERROR,
868 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
869 errmsg("weight out of range")));
870 invws[i] = 1.0 / invws[i];
871 }
872
873 qr.query = query;
874 qr.operandData = (QueryRepresentationOperand *)
875 palloc0(sizeof(QueryRepresentationOperand) * query->size);
876
877 doc = get_docrep(txt, &qr, &doclen);
878 if (!doc)
879 {
880 pfree(qr.operandData);
881 return 0.0;
882 }
883
884 MemSet(&ext, 0, sizeof(CoverExt));
885 while (Cover(doc, doclen, &qr, &ext))
886 {
887 double Cpos = 0.0;
888 double InvSum = 0.0;
889 int nNoise;
890 DocRepresentation *ptr = ext.begin;
891
892 while (ptr <= ext.end)
893 {
894 InvSum += invws[WEP_GETWEIGHT(ptr->pos)];
895 ptr++;
896 }
897
898 Cpos = ((double) (ext.end - ext.begin + 1)) / InvSum;
899
900 /*
901 * if doc are big enough then ext.q may be equal to ext.p due to limit
902 * of positional information. In this case we approximate number of
903 * noise word as half cover's length
904 */
905 nNoise = (ext.q - ext.p) - (ext.end - ext.begin);
906 if (nNoise < 0)
907 nNoise = (ext.end - ext.begin) / 2;
908 Wdoc += Cpos / ((double) (1 + nNoise));
909
910 CurExtPos = ((double) (ext.q + ext.p)) / 2.0;
911 if (NExtent > 0 && CurExtPos > PrevExtPos /* prevent division by
912 * zero in a case of
913 * multiple lexize */ )
914 SumDist += 1.0 / (CurExtPos - PrevExtPos);
915
916 PrevExtPos = CurExtPos;
917 NExtent++;
918 }
919
920 if ((method & RANK_NORM_LOGLENGTH) && txt->size > 0)
921 Wdoc /= log((double) (cnt_length(txt) + 1));
922
923 if (method & RANK_NORM_LENGTH)
924 {
925 len = cnt_length(txt);
926 if (len > 0)
927 Wdoc /= (double) len;
928 }
929
930 if ((method & RANK_NORM_EXTDIST) && NExtent > 0 && SumDist > 0)
931 Wdoc /= ((double) NExtent) / SumDist;
932
933 if ((method & RANK_NORM_UNIQ) && txt->size > 0)
934 Wdoc /= (double) (txt->size);
935
936 if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0)
937 Wdoc /= log((double) (txt->size + 1)) / log(2.0);
938
939 if (method & RANK_NORM_RDIVRPLUS1)
940 Wdoc /= (Wdoc + 1);
941
942 pfree(doc);
943
944 pfree(qr.operandData);
945
946 return (float4) Wdoc;
947}
948
949Datum
950ts_rankcd_wttf(PG_FUNCTION_ARGS)
951{
952 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
953 TSVector txt = PG_GETARG_TSVECTOR(1);
954 TSQuery query = PG_GETARG_TSQUERY(2);
955 int method = PG_GETARG_INT32(3);
956 float res;
957
958 res = calc_rank_cd(getWeights(win), txt, query, method);
959
960 PG_FREE_IF_COPY(win, 0);
961 PG_FREE_IF_COPY(txt, 1);
962 PG_FREE_IF_COPY(query, 2);
963 PG_RETURN_FLOAT4(res);
964}
965
966Datum
967ts_rankcd_wtt(PG_FUNCTION_ARGS)
968{
969 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
970 TSVector txt = PG_GETARG_TSVECTOR(1);
971 TSQuery query = PG_GETARG_TSQUERY(2);
972 float res;
973
974 res = calc_rank_cd(getWeights(win), txt, query, DEF_NORM_METHOD);
975
976 PG_FREE_IF_COPY(win, 0);
977 PG_FREE_IF_COPY(txt, 1);
978 PG_FREE_IF_COPY(query, 2);
979 PG_RETURN_FLOAT4(res);
980}
981
982Datum
983ts_rankcd_ttf(PG_FUNCTION_ARGS)
984{
985 TSVector txt = PG_GETARG_TSVECTOR(0);
986 TSQuery query = PG_GETARG_TSQUERY(1);
987 int method = PG_GETARG_INT32(2);
988 float res;
989
990 res = calc_rank_cd(getWeights(NULL), txt, query, method);
991
992 PG_FREE_IF_COPY(txt, 0);
993 PG_FREE_IF_COPY(query, 1);
994 PG_RETURN_FLOAT4(res);
995}
996
997Datum
998ts_rankcd_tt(PG_FUNCTION_ARGS)
999{
1000 TSVector txt = PG_GETARG_TSVECTOR(0);
1001 TSQuery query = PG_GETARG_TSQUERY(1);
1002 float res;
1003
1004 res = calc_rank_cd(getWeights(NULL), txt, query, DEF_NORM_METHOD);
1005
1006 PG_FREE_IF_COPY(txt, 0);
1007 PG_FREE_IF_COPY(query, 1);
1008 PG_RETURN_FLOAT4(res);
1009}
1010