1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * tsrank.c |
4 | * rank tsvector by tsquery |
5 | * |
6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
7 | * |
8 | * |
9 | * IDENTIFICATION |
10 | * src/backend/utils/adt/tsrank.c |
11 | * |
12 | *------------------------------------------------------------------------- |
13 | */ |
14 | #include "postgres.h" |
15 | |
16 | #include <limits.h> |
17 | #include <math.h> |
18 | |
19 | #include "tsearch/ts_utils.h" |
20 | #include "utils/array.h" |
21 | #include "utils/builtins.h" |
22 | #include "miscadmin.h" |
23 | |
24 | |
25 | static const float weights[] = {0.1f, 0.2f, 0.4f, 1.0f}; |
26 | |
27 | #define wpos(wep) ( w[ WEP_GETWEIGHT(wep) ] ) |
28 | |
29 | #define RANK_NO_NORM 0x00 |
30 | #define RANK_NORM_LOGLENGTH 0x01 |
31 | #define RANK_NORM_LENGTH 0x02 |
32 | #define RANK_NORM_EXTDIST 0x04 |
33 | #define RANK_NORM_UNIQ 0x08 |
34 | #define RANK_NORM_LOGUNIQ 0x10 |
35 | #define RANK_NORM_RDIVRPLUS1 0x20 |
36 | #define DEF_NORM_METHOD RANK_NO_NORM |
37 | |
38 | static float calc_rank_or(const float *w, TSVector t, TSQuery q); |
39 | static float calc_rank_and(const float *w, TSVector t, TSQuery q); |
40 | |
41 | /* |
42 | * Returns a weight of a word collocation |
43 | */ |
44 | static float4 |
45 | word_distance(int32 w) |
46 | { |
47 | if (w > 100) |
48 | return 1e-30f; |
49 | |
50 | return 1.0 / (1.005 + 0.05 * exp(((float4) w) / 1.5 - 2)); |
51 | } |
52 | |
53 | static int |
54 | cnt_length(TSVector t) |
55 | { |
56 | WordEntry *ptr = ARRPTR(t), |
57 | *end = (WordEntry *) STRPTR(t); |
58 | int len = 0; |
59 | |
60 | while (ptr < end) |
61 | { |
62 | int clen = POSDATALEN(t, ptr); |
63 | |
64 | if (clen == 0) |
65 | len += 1; |
66 | else |
67 | len += clen; |
68 | |
69 | ptr++; |
70 | } |
71 | |
72 | return len; |
73 | } |
74 | |
75 | |
76 | #define WordECompareQueryItem(e,q,p,i,m) \ |
77 | tsCompareString((q) + (i)->distance, (i)->length, \ |
78 | (e) + (p)->pos, (p)->len, (m)) |
79 | |
80 | |
81 | /* |
82 | * Returns a pointer to a WordEntry's array corresponding to 'item' from |
83 | * tsvector 't'. 'q' is the TSQuery containing 'item'. |
84 | * Returns NULL if not found. |
85 | */ |
86 | static WordEntry * |
87 | find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem) |
88 | { |
89 | WordEntry *StopLow = ARRPTR(t); |
90 | WordEntry *StopHigh = (WordEntry *) STRPTR(t); |
91 | WordEntry *StopMiddle = StopHigh; |
92 | int difference; |
93 | |
94 | *nitem = 0; |
95 | |
96 | /* Loop invariant: StopLow <= item < StopHigh */ |
97 | while (StopLow < StopHigh) |
98 | { |
99 | StopMiddle = StopLow + (StopHigh - StopLow) / 2; |
100 | difference = WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, false); |
101 | if (difference == 0) |
102 | { |
103 | StopHigh = StopMiddle; |
104 | *nitem = 1; |
105 | break; |
106 | } |
107 | else if (difference > 0) |
108 | StopLow = StopMiddle + 1; |
109 | else |
110 | StopHigh = StopMiddle; |
111 | } |
112 | |
113 | if (item->prefix) |
114 | { |
115 | if (StopLow >= StopHigh) |
116 | StopMiddle = StopHigh; |
117 | |
118 | *nitem = 0; |
119 | |
120 | while (StopMiddle < (WordEntry *) STRPTR(t) && |
121 | WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, true) == 0) |
122 | { |
123 | (*nitem)++; |
124 | StopMiddle++; |
125 | } |
126 | } |
127 | |
128 | return (*nitem > 0) ? StopHigh : NULL; |
129 | } |
130 | |
131 | |
132 | /* |
133 | * sort QueryOperands by (length, word) |
134 | */ |
135 | static int |
136 | compareQueryOperand(const void *a, const void *b, void *arg) |
137 | { |
138 | char *operand = (char *) arg; |
139 | QueryOperand *qa = (*(QueryOperand *const *) a); |
140 | QueryOperand *qb = (*(QueryOperand *const *) b); |
141 | |
142 | return tsCompareString(operand + qa->distance, qa->length, |
143 | operand + qb->distance, qb->length, |
144 | false); |
145 | } |
146 | |
147 | /* |
148 | * Returns a sorted, de-duplicated array of QueryOperands in a query. |
149 | * The returned QueryOperands are pointers to the original QueryOperands |
150 | * in the query. |
151 | * |
152 | * Length of the returned array is stored in *size |
153 | */ |
154 | static QueryOperand ** |
155 | SortAndUniqItems(TSQuery q, int *size) |
156 | { |
157 | char *operand = GETOPERAND(q); |
158 | QueryItem *item = GETQUERY(q); |
159 | QueryOperand **res, |
160 | **ptr, |
161 | **prevptr; |
162 | |
163 | ptr = res = (QueryOperand **) palloc(sizeof(QueryOperand *) * *size); |
164 | |
165 | /* Collect all operands from the tree to res */ |
166 | while ((*size)--) |
167 | { |
168 | if (item->type == QI_VAL) |
169 | { |
170 | *ptr = (QueryOperand *) item; |
171 | ptr++; |
172 | } |
173 | item++; |
174 | } |
175 | |
176 | *size = ptr - res; |
177 | if (*size < 2) |
178 | return res; |
179 | |
180 | qsort_arg(res, *size, sizeof(QueryOperand *), compareQueryOperand, (void *) operand); |
181 | |
182 | ptr = res + 1; |
183 | prevptr = res; |
184 | |
185 | /* remove duplicates */ |
186 | while (ptr - res < *size) |
187 | { |
188 | if (compareQueryOperand((void *) ptr, (void *) prevptr, (void *) operand) != 0) |
189 | { |
190 | prevptr++; |
191 | *prevptr = *ptr; |
192 | } |
193 | ptr++; |
194 | } |
195 | |
196 | *size = prevptr + 1 - res; |
197 | return res; |
198 | } |
199 | |
200 | static float |
201 | calc_rank_and(const float *w, TSVector t, TSQuery q) |
202 | { |
203 | WordEntryPosVector **pos; |
204 | WordEntryPosVector1 posnull; |
205 | WordEntryPosVector *POSNULL; |
206 | int i, |
207 | k, |
208 | l, |
209 | p; |
210 | WordEntry *entry, |
211 | *firstentry; |
212 | WordEntryPos *post, |
213 | *ct; |
214 | int32 dimt, |
215 | lenct, |
216 | dist, |
217 | nitem; |
218 | float res = -1.0; |
219 | QueryOperand **item; |
220 | int size = q->size; |
221 | |
222 | item = SortAndUniqItems(q, &size); |
223 | if (size < 2) |
224 | { |
225 | pfree(item); |
226 | return calc_rank_or(w, t, q); |
227 | } |
228 | pos = (WordEntryPosVector **) palloc0(sizeof(WordEntryPosVector *) * q->size); |
229 | |
230 | /* A dummy WordEntryPos array to use when haspos is false */ |
231 | posnull.npos = 1; |
232 | posnull.pos[0] = 0; |
233 | WEP_SETPOS(posnull.pos[0], MAXENTRYPOS - 1); |
234 | POSNULL = (WordEntryPosVector *) &posnull; |
235 | |
236 | for (i = 0; i < size; i++) |
237 | { |
238 | firstentry = entry = find_wordentry(t, q, item[i], &nitem); |
239 | if (!entry) |
240 | continue; |
241 | |
242 | while (entry - firstentry < nitem) |
243 | { |
244 | if (entry->haspos) |
245 | pos[i] = _POSVECPTR(t, entry); |
246 | else |
247 | pos[i] = POSNULL; |
248 | |
249 | dimt = pos[i]->npos; |
250 | post = pos[i]->pos; |
251 | for (k = 0; k < i; k++) |
252 | { |
253 | if (!pos[k]) |
254 | continue; |
255 | lenct = pos[k]->npos; |
256 | ct = pos[k]->pos; |
257 | for (l = 0; l < dimt; l++) |
258 | { |
259 | for (p = 0; p < lenct; p++) |
260 | { |
261 | dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p])); |
262 | if (dist || (dist == 0 && (pos[i] == POSNULL || pos[k] == POSNULL))) |
263 | { |
264 | float curw; |
265 | |
266 | if (!dist) |
267 | dist = MAXENTRYPOS; |
268 | curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist)); |
269 | res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw); |
270 | } |
271 | } |
272 | } |
273 | } |
274 | |
275 | entry++; |
276 | } |
277 | } |
278 | pfree(pos); |
279 | pfree(item); |
280 | return res; |
281 | } |
282 | |
283 | static float |
284 | calc_rank_or(const float *w, TSVector t, TSQuery q) |
285 | { |
286 | WordEntry *entry, |
287 | *firstentry; |
288 | WordEntryPosVector1 posnull; |
289 | WordEntryPos *post; |
290 | int32 dimt, |
291 | j, |
292 | i, |
293 | nitem; |
294 | float res = 0.0; |
295 | QueryOperand **item; |
296 | int size = q->size; |
297 | |
298 | /* A dummy WordEntryPos array to use when haspos is false */ |
299 | posnull.npos = 1; |
300 | posnull.pos[0] = 0; |
301 | |
302 | item = SortAndUniqItems(q, &size); |
303 | |
304 | for (i = 0; i < size; i++) |
305 | { |
306 | float resj, |
307 | wjm; |
308 | int32 jm; |
309 | |
310 | firstentry = entry = find_wordentry(t, q, item[i], &nitem); |
311 | if (!entry) |
312 | continue; |
313 | |
314 | while (entry - firstentry < nitem) |
315 | { |
316 | if (entry->haspos) |
317 | { |
318 | dimt = POSDATALEN(t, entry); |
319 | post = POSDATAPTR(t, entry); |
320 | } |
321 | else |
322 | { |
323 | dimt = posnull.npos; |
324 | post = posnull.pos; |
325 | } |
326 | |
327 | resj = 0.0; |
328 | wjm = -1.0; |
329 | jm = 0; |
330 | for (j = 0; j < dimt; j++) |
331 | { |
332 | resj = resj + wpos(post[j]) / ((j + 1) * (j + 1)); |
333 | if (wpos(post[j]) > wjm) |
334 | { |
335 | wjm = wpos(post[j]); |
336 | jm = j; |
337 | } |
338 | } |
339 | /* |
340 | limit (sum(i/i^2),i->inf) = pi^2/6 |
341 | resj = sum(wi/i^2),i=1,noccurence, |
342 | wi - should be sorted desc, |
343 | don't sort for now, just choose maximum weight. This should be corrected |
344 | Oleg Bartunov |
345 | */ |
346 | res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685; |
347 | |
348 | entry++; |
349 | } |
350 | } |
351 | if (size > 0) |
352 | res = res / size; |
353 | pfree(item); |
354 | return res; |
355 | } |
356 | |
357 | static float |
358 | calc_rank(const float *w, TSVector t, TSQuery q, int32 method) |
359 | { |
360 | QueryItem *item = GETQUERY(q); |
361 | float res = 0.0; |
362 | int len; |
363 | |
364 | if (!t->size || !q->size) |
365 | return 0.0; |
366 | |
367 | /* XXX: What about NOT? */ |
368 | res = (item->type == QI_OPR && (item->qoperator.oper == OP_AND || |
369 | item->qoperator.oper == OP_PHRASE)) ? |
370 | calc_rank_and(w, t, q) : |
371 | calc_rank_or(w, t, q); |
372 | |
373 | if (res < 0) |
374 | res = 1e-20f; |
375 | |
376 | if ((method & RANK_NORM_LOGLENGTH) && t->size > 0) |
377 | res /= log((double) (cnt_length(t) + 1)) / log(2.0); |
378 | |
379 | if (method & RANK_NORM_LENGTH) |
380 | { |
381 | len = cnt_length(t); |
382 | if (len > 0) |
383 | res /= (float) len; |
384 | } |
385 | |
386 | /* RANK_NORM_EXTDIST not applicable */ |
387 | |
388 | if ((method & RANK_NORM_UNIQ) && t->size > 0) |
389 | res /= (float) (t->size); |
390 | |
391 | if ((method & RANK_NORM_LOGUNIQ) && t->size > 0) |
392 | res /= log((double) (t->size + 1)) / log(2.0); |
393 | |
394 | if (method & RANK_NORM_RDIVRPLUS1) |
395 | res /= (res + 1); |
396 | |
397 | return res; |
398 | } |
399 | |
400 | static const float * |
401 | getWeights(ArrayType *win) |
402 | { |
403 | static float ws[lengthof(weights)]; |
404 | int i; |
405 | float4 *arrdata; |
406 | |
407 | if (win == NULL) |
408 | return weights; |
409 | |
410 | if (ARR_NDIM(win) != 1) |
411 | ereport(ERROR, |
412 | (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), |
413 | errmsg("array of weight must be one-dimensional" ))); |
414 | |
415 | if (ArrayGetNItems(ARR_NDIM(win), ARR_DIMS(win)) < lengthof(weights)) |
416 | ereport(ERROR, |
417 | (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), |
418 | errmsg("array of weight is too short" ))); |
419 | |
420 | if (array_contains_nulls(win)) |
421 | ereport(ERROR, |
422 | (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), |
423 | errmsg("array of weight must not contain nulls" ))); |
424 | |
425 | arrdata = (float4 *) ARR_DATA_PTR(win); |
426 | for (i = 0; i < lengthof(weights); i++) |
427 | { |
428 | ws[i] = (arrdata[i] >= 0) ? arrdata[i] : weights[i]; |
429 | if (ws[i] > 1.0) |
430 | ereport(ERROR, |
431 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
432 | errmsg("weight out of range" ))); |
433 | } |
434 | |
435 | return ws; |
436 | } |
437 | |
438 | Datum |
439 | ts_rank_wttf(PG_FUNCTION_ARGS) |
440 | { |
441 | ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); |
442 | TSVector txt = PG_GETARG_TSVECTOR(1); |
443 | TSQuery query = PG_GETARG_TSQUERY(2); |
444 | int method = PG_GETARG_INT32(3); |
445 | float res; |
446 | |
447 | res = calc_rank(getWeights(win), txt, query, method); |
448 | |
449 | PG_FREE_IF_COPY(win, 0); |
450 | PG_FREE_IF_COPY(txt, 1); |
451 | PG_FREE_IF_COPY(query, 2); |
452 | PG_RETURN_FLOAT4(res); |
453 | } |
454 | |
455 | Datum |
456 | ts_rank_wtt(PG_FUNCTION_ARGS) |
457 | { |
458 | ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); |
459 | TSVector txt = PG_GETARG_TSVECTOR(1); |
460 | TSQuery query = PG_GETARG_TSQUERY(2); |
461 | float res; |
462 | |
463 | res = calc_rank(getWeights(win), txt, query, DEF_NORM_METHOD); |
464 | |
465 | PG_FREE_IF_COPY(win, 0); |
466 | PG_FREE_IF_COPY(txt, 1); |
467 | PG_FREE_IF_COPY(query, 2); |
468 | PG_RETURN_FLOAT4(res); |
469 | } |
470 | |
471 | Datum |
472 | ts_rank_ttf(PG_FUNCTION_ARGS) |
473 | { |
474 | TSVector txt = PG_GETARG_TSVECTOR(0); |
475 | TSQuery query = PG_GETARG_TSQUERY(1); |
476 | int method = PG_GETARG_INT32(2); |
477 | float res; |
478 | |
479 | res = calc_rank(getWeights(NULL), txt, query, method); |
480 | |
481 | PG_FREE_IF_COPY(txt, 0); |
482 | PG_FREE_IF_COPY(query, 1); |
483 | PG_RETURN_FLOAT4(res); |
484 | } |
485 | |
486 | Datum |
487 | ts_rank_tt(PG_FUNCTION_ARGS) |
488 | { |
489 | TSVector txt = PG_GETARG_TSVECTOR(0); |
490 | TSQuery query = PG_GETARG_TSQUERY(1); |
491 | float res; |
492 | |
493 | res = calc_rank(getWeights(NULL), txt, query, DEF_NORM_METHOD); |
494 | |
495 | PG_FREE_IF_COPY(txt, 0); |
496 | PG_FREE_IF_COPY(query, 1); |
497 | PG_RETURN_FLOAT4(res); |
498 | } |
499 | |
500 | typedef struct |
501 | { |
502 | union |
503 | { |
504 | struct |
505 | { /* compiled doc representation */ |
506 | QueryItem **items; |
507 | int16 nitem; |
508 | } query; |
509 | struct |
510 | { /* struct is used for preparing doc |
511 | * representation */ |
512 | QueryItem *item; |
513 | WordEntry *entry; |
514 | } map; |
515 | } data; |
516 | WordEntryPos pos; |
517 | } DocRepresentation; |
518 | |
519 | static int |
520 | compareDocR(const void *va, const void *vb) |
521 | { |
522 | const DocRepresentation *a = (const DocRepresentation *) va; |
523 | const DocRepresentation *b = (const DocRepresentation *) vb; |
524 | |
525 | if (WEP_GETPOS(a->pos) == WEP_GETPOS(b->pos)) |
526 | { |
527 | if (WEP_GETWEIGHT(a->pos) == WEP_GETWEIGHT(b->pos)) |
528 | { |
529 | if (a->data.map.entry == b->data.map.entry) |
530 | return 0; |
531 | |
532 | return (a->data.map.entry > b->data.map.entry) ? 1 : -1; |
533 | } |
534 | |
535 | return (WEP_GETWEIGHT(a->pos) > WEP_GETWEIGHT(b->pos)) ? 1 : -1; |
536 | } |
537 | |
538 | return (WEP_GETPOS(a->pos) > WEP_GETPOS(b->pos)) ? 1 : -1; |
539 | } |
540 | |
541 | #define MAXQROPOS MAXENTRYPOS |
542 | typedef struct |
543 | { |
544 | bool operandexists; |
545 | bool reverseinsert; /* indicates insert order, true means |
546 | * descending order */ |
547 | uint32 npos; |
548 | WordEntryPos pos[MAXQROPOS]; |
549 | } QueryRepresentationOperand; |
550 | |
551 | typedef struct |
552 | { |
553 | TSQuery query; |
554 | QueryRepresentationOperand *operandData; |
555 | } QueryRepresentation; |
556 | |
557 | #define QR_GET_OPERAND_DATA(q, v) \ |
558 | ( (q)->operandData + (((QueryItem*)(v)) - GETQUERY((q)->query)) ) |
559 | |
560 | static bool |
561 | checkcondition_QueryOperand(void *checkval, QueryOperand *val, ExecPhraseData *data) |
562 | { |
563 | QueryRepresentation *qr = (QueryRepresentation *) checkval; |
564 | QueryRepresentationOperand *opData = QR_GET_OPERAND_DATA(qr, val); |
565 | |
566 | if (!opData->operandexists) |
567 | return false; |
568 | |
569 | if (data) |
570 | { |
571 | data->npos = opData->npos; |
572 | data->pos = opData->pos; |
573 | if (opData->reverseinsert) |
574 | data->pos += MAXQROPOS - opData->npos; |
575 | } |
576 | |
577 | return true; |
578 | } |
579 | |
580 | typedef struct |
581 | { |
582 | int pos; |
583 | int p; |
584 | int q; |
585 | DocRepresentation *begin; |
586 | DocRepresentation *end; |
587 | } CoverExt; |
588 | |
589 | static void |
590 | resetQueryRepresentation(QueryRepresentation *qr, bool reverseinsert) |
591 | { |
592 | int i; |
593 | |
594 | for (i = 0; i < qr->query->size; i++) |
595 | { |
596 | qr->operandData[i].operandexists = false; |
597 | qr->operandData[i].reverseinsert = reverseinsert; |
598 | qr->operandData[i].npos = 0; |
599 | } |
600 | } |
601 | |
602 | static void |
603 | fillQueryRepresentationData(QueryRepresentation *qr, DocRepresentation *entry) |
604 | { |
605 | int i; |
606 | int lastPos; |
607 | QueryRepresentationOperand *opData; |
608 | |
609 | for (i = 0; i < entry->data.query.nitem; i++) |
610 | { |
611 | if (entry->data.query.items[i]->type != QI_VAL) |
612 | continue; |
613 | |
614 | opData = QR_GET_OPERAND_DATA(qr, entry->data.query.items[i]); |
615 | |
616 | opData->operandexists = true; |
617 | |
618 | if (opData->npos == 0) |
619 | { |
620 | lastPos = (opData->reverseinsert) ? (MAXQROPOS - 1) : 0; |
621 | opData->pos[lastPos] = entry->pos; |
622 | opData->npos++; |
623 | continue; |
624 | } |
625 | |
626 | lastPos = opData->reverseinsert ? |
627 | (MAXQROPOS - opData->npos) : |
628 | (opData->npos - 1); |
629 | |
630 | if (WEP_GETPOS(opData->pos[lastPos]) != WEP_GETPOS(entry->pos)) |
631 | { |
632 | lastPos = opData->reverseinsert ? |
633 | (MAXQROPOS - 1 - opData->npos) : |
634 | (opData->npos); |
635 | |
636 | opData->pos[lastPos] = entry->pos; |
637 | opData->npos++; |
638 | } |
639 | } |
640 | } |
641 | |
642 | static bool |
643 | Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext) |
644 | { |
645 | DocRepresentation *ptr; |
646 | int lastpos = ext->pos; |
647 | bool found = false; |
648 | |
649 | /* |
650 | * since this function recurses, it could be driven to stack overflow. |
651 | * (though any decent compiler will optimize away the tail-recursion. |
652 | */ |
653 | check_stack_depth(); |
654 | |
655 | resetQueryRepresentation(qr, false); |
656 | |
657 | ext->p = INT_MAX; |
658 | ext->q = 0; |
659 | ptr = doc + ext->pos; |
660 | |
661 | /* find upper bound of cover from current position, move up */ |
662 | while (ptr - doc < len) |
663 | { |
664 | fillQueryRepresentationData(qr, ptr); |
665 | |
666 | if (TS_execute(GETQUERY(qr->query), (void *) qr, |
667 | TS_EXEC_EMPTY, checkcondition_QueryOperand)) |
668 | { |
669 | if (WEP_GETPOS(ptr->pos) > ext->q) |
670 | { |
671 | ext->q = WEP_GETPOS(ptr->pos); |
672 | ext->end = ptr; |
673 | lastpos = ptr - doc; |
674 | found = true; |
675 | } |
676 | break; |
677 | } |
678 | ptr++; |
679 | } |
680 | |
681 | if (!found) |
682 | return false; |
683 | |
684 | resetQueryRepresentation(qr, true); |
685 | |
686 | ptr = doc + lastpos; |
687 | |
688 | /* find lower bound of cover from found upper bound, move down */ |
689 | while (ptr >= doc + ext->pos) |
690 | { |
691 | /* |
692 | * we scan doc from right to left, so pos info in reverse order! |
693 | */ |
694 | fillQueryRepresentationData(qr, ptr); |
695 | |
696 | if (TS_execute(GETQUERY(qr->query), (void *) qr, |
697 | TS_EXEC_CALC_NOT, checkcondition_QueryOperand)) |
698 | { |
699 | if (WEP_GETPOS(ptr->pos) < ext->p) |
700 | { |
701 | ext->begin = ptr; |
702 | ext->p = WEP_GETPOS(ptr->pos); |
703 | } |
704 | break; |
705 | } |
706 | ptr--; |
707 | } |
708 | |
709 | if (ext->p <= ext->q) |
710 | { |
711 | /* |
712 | * set position for next try to next lexeme after beginning of found |
713 | * cover |
714 | */ |
715 | ext->pos = (ptr - doc) + 1; |
716 | return true; |
717 | } |
718 | |
719 | ext->pos++; |
720 | return Cover(doc, len, qr, ext); |
721 | } |
722 | |
723 | static DocRepresentation * |
724 | get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen) |
725 | { |
726 | QueryItem *item = GETQUERY(qr->query); |
727 | WordEntry *entry, |
728 | *firstentry; |
729 | WordEntryPos *post; |
730 | int32 dimt, /* number of 'post' items */ |
731 | j, |
732 | i, |
733 | nitem; |
734 | int len = qr->query->size * 4, |
735 | cur = 0; |
736 | DocRepresentation *doc; |
737 | |
738 | doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len); |
739 | |
740 | /* |
741 | * Iterate through query to make DocRepresentaion for words and it's |
742 | * entries satisfied by query |
743 | */ |
744 | for (i = 0; i < qr->query->size; i++) |
745 | { |
746 | QueryOperand *curoperand; |
747 | |
748 | if (item[i].type != QI_VAL) |
749 | continue; |
750 | |
751 | curoperand = &item[i].qoperand; |
752 | |
753 | firstentry = entry = find_wordentry(txt, qr->query, curoperand, &nitem); |
754 | if (!entry) |
755 | continue; |
756 | |
757 | /* iterations over entries in tsvector */ |
758 | while (entry - firstentry < nitem) |
759 | { |
760 | if (entry->haspos) |
761 | { |
762 | dimt = POSDATALEN(txt, entry); |
763 | post = POSDATAPTR(txt, entry); |
764 | } |
765 | else |
766 | { |
767 | /* ignore words without positions */ |
768 | entry++; |
769 | continue; |
770 | } |
771 | |
772 | while (cur + dimt >= len) |
773 | { |
774 | len *= 2; |
775 | doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len); |
776 | } |
777 | |
778 | /* iterations over entry's positions */ |
779 | for (j = 0; j < dimt; j++) |
780 | { |
781 | if (curoperand->weight == 0 || |
782 | curoperand->weight & (1 << WEP_GETWEIGHT(post[j]))) |
783 | { |
784 | doc[cur].pos = post[j]; |
785 | doc[cur].data.map.entry = entry; |
786 | doc[cur].data.map.item = (QueryItem *) curoperand; |
787 | cur++; |
788 | } |
789 | } |
790 | |
791 | entry++; |
792 | } |
793 | } |
794 | |
795 | if (cur > 0) |
796 | { |
797 | DocRepresentation *rptr = doc + 1, |
798 | *wptr = doc, |
799 | storage; |
800 | |
801 | /* |
802 | * Sort representation in ascending order by pos and entry |
803 | */ |
804 | qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR); |
805 | |
806 | /* |
807 | * Join QueryItem per WordEntry and it's position |
808 | */ |
809 | storage.pos = doc->pos; |
810 | storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size); |
811 | storage.data.query.items[0] = doc->data.map.item; |
812 | storage.data.query.nitem = 1; |
813 | |
814 | while (rptr - doc < cur) |
815 | { |
816 | if (rptr->pos == (rptr - 1)->pos && |
817 | rptr->data.map.entry == (rptr - 1)->data.map.entry) |
818 | { |
819 | storage.data.query.items[storage.data.query.nitem] = rptr->data.map.item; |
820 | storage.data.query.nitem++; |
821 | } |
822 | else |
823 | { |
824 | *wptr = storage; |
825 | wptr++; |
826 | storage.pos = rptr->pos; |
827 | storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size); |
828 | storage.data.query.items[0] = rptr->data.map.item; |
829 | storage.data.query.nitem = 1; |
830 | } |
831 | |
832 | rptr++; |
833 | } |
834 | |
835 | *wptr = storage; |
836 | wptr++; |
837 | |
838 | *doclen = wptr - doc; |
839 | return doc; |
840 | } |
841 | |
842 | pfree(doc); |
843 | return NULL; |
844 | } |
845 | |
846 | static float4 |
847 | calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method) |
848 | { |
849 | DocRepresentation *doc; |
850 | int len, |
851 | i, |
852 | doclen = 0; |
853 | CoverExt ext; |
854 | double Wdoc = 0.0; |
855 | double invws[lengthof(weights)]; |
856 | double SumDist = 0.0, |
857 | PrevExtPos = 0.0, |
858 | CurExtPos = 0.0; |
859 | int NExtent = 0; |
860 | QueryRepresentation qr; |
861 | |
862 | |
863 | for (i = 0; i < lengthof(weights); i++) |
864 | { |
865 | invws[i] = ((double) ((arrdata[i] >= 0) ? arrdata[i] : weights[i])); |
866 | if (invws[i] > 1.0) |
867 | ereport(ERROR, |
868 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
869 | errmsg("weight out of range" ))); |
870 | invws[i] = 1.0 / invws[i]; |
871 | } |
872 | |
873 | qr.query = query; |
874 | qr.operandData = (QueryRepresentationOperand *) |
875 | palloc0(sizeof(QueryRepresentationOperand) * query->size); |
876 | |
877 | doc = get_docrep(txt, &qr, &doclen); |
878 | if (!doc) |
879 | { |
880 | pfree(qr.operandData); |
881 | return 0.0; |
882 | } |
883 | |
884 | MemSet(&ext, 0, sizeof(CoverExt)); |
885 | while (Cover(doc, doclen, &qr, &ext)) |
886 | { |
887 | double Cpos = 0.0; |
888 | double InvSum = 0.0; |
889 | int nNoise; |
890 | DocRepresentation *ptr = ext.begin; |
891 | |
892 | while (ptr <= ext.end) |
893 | { |
894 | InvSum += invws[WEP_GETWEIGHT(ptr->pos)]; |
895 | ptr++; |
896 | } |
897 | |
898 | Cpos = ((double) (ext.end - ext.begin + 1)) / InvSum; |
899 | |
900 | /* |
901 | * if doc are big enough then ext.q may be equal to ext.p due to limit |
902 | * of positional information. In this case we approximate number of |
903 | * noise word as half cover's length |
904 | */ |
905 | nNoise = (ext.q - ext.p) - (ext.end - ext.begin); |
906 | if (nNoise < 0) |
907 | nNoise = (ext.end - ext.begin) / 2; |
908 | Wdoc += Cpos / ((double) (1 + nNoise)); |
909 | |
910 | CurExtPos = ((double) (ext.q + ext.p)) / 2.0; |
911 | if (NExtent > 0 && CurExtPos > PrevExtPos /* prevent division by |
912 | * zero in a case of |
913 | * multiple lexize */ ) |
914 | SumDist += 1.0 / (CurExtPos - PrevExtPos); |
915 | |
916 | PrevExtPos = CurExtPos; |
917 | NExtent++; |
918 | } |
919 | |
920 | if ((method & RANK_NORM_LOGLENGTH) && txt->size > 0) |
921 | Wdoc /= log((double) (cnt_length(txt) + 1)); |
922 | |
923 | if (method & RANK_NORM_LENGTH) |
924 | { |
925 | len = cnt_length(txt); |
926 | if (len > 0) |
927 | Wdoc /= (double) len; |
928 | } |
929 | |
930 | if ((method & RANK_NORM_EXTDIST) && NExtent > 0 && SumDist > 0) |
931 | Wdoc /= ((double) NExtent) / SumDist; |
932 | |
933 | if ((method & RANK_NORM_UNIQ) && txt->size > 0) |
934 | Wdoc /= (double) (txt->size); |
935 | |
936 | if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0) |
937 | Wdoc /= log((double) (txt->size + 1)) / log(2.0); |
938 | |
939 | if (method & RANK_NORM_RDIVRPLUS1) |
940 | Wdoc /= (Wdoc + 1); |
941 | |
942 | pfree(doc); |
943 | |
944 | pfree(qr.operandData); |
945 | |
946 | return (float4) Wdoc; |
947 | } |
948 | |
949 | Datum |
950 | ts_rankcd_wttf(PG_FUNCTION_ARGS) |
951 | { |
952 | ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); |
953 | TSVector txt = PG_GETARG_TSVECTOR(1); |
954 | TSQuery query = PG_GETARG_TSQUERY(2); |
955 | int method = PG_GETARG_INT32(3); |
956 | float res; |
957 | |
958 | res = calc_rank_cd(getWeights(win), txt, query, method); |
959 | |
960 | PG_FREE_IF_COPY(win, 0); |
961 | PG_FREE_IF_COPY(txt, 1); |
962 | PG_FREE_IF_COPY(query, 2); |
963 | PG_RETURN_FLOAT4(res); |
964 | } |
965 | |
966 | Datum |
967 | ts_rankcd_wtt(PG_FUNCTION_ARGS) |
968 | { |
969 | ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); |
970 | TSVector txt = PG_GETARG_TSVECTOR(1); |
971 | TSQuery query = PG_GETARG_TSQUERY(2); |
972 | float res; |
973 | |
974 | res = calc_rank_cd(getWeights(win), txt, query, DEF_NORM_METHOD); |
975 | |
976 | PG_FREE_IF_COPY(win, 0); |
977 | PG_FREE_IF_COPY(txt, 1); |
978 | PG_FREE_IF_COPY(query, 2); |
979 | PG_RETURN_FLOAT4(res); |
980 | } |
981 | |
982 | Datum |
983 | ts_rankcd_ttf(PG_FUNCTION_ARGS) |
984 | { |
985 | TSVector txt = PG_GETARG_TSVECTOR(0); |
986 | TSQuery query = PG_GETARG_TSQUERY(1); |
987 | int method = PG_GETARG_INT32(2); |
988 | float res; |
989 | |
990 | res = calc_rank_cd(getWeights(NULL), txt, query, method); |
991 | |
992 | PG_FREE_IF_COPY(txt, 0); |
993 | PG_FREE_IF_COPY(query, 1); |
994 | PG_RETURN_FLOAT4(res); |
995 | } |
996 | |
997 | Datum |
998 | ts_rankcd_tt(PG_FUNCTION_ARGS) |
999 | { |
1000 | TSVector txt = PG_GETARG_TSVECTOR(0); |
1001 | TSQuery query = PG_GETARG_TSQUERY(1); |
1002 | float res; |
1003 | |
1004 | res = calc_rank_cd(getWeights(NULL), txt, query, DEF_NORM_METHOD); |
1005 | |
1006 | PG_FREE_IF_COPY(txt, 0); |
1007 | PG_FREE_IF_COPY(query, 1); |
1008 | PG_RETURN_FLOAT4(res); |
1009 | } |
1010 | |