1/*-------------------------------------------------------------------------
2 *
3 * tsvector_op.c
4 * operations over tsvector
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/utils/adt/tsvector_op.c
11 *
12 *-------------------------------------------------------------------------
13 */
14#include "postgres.h"
15
16#include <limits.h>
17
18#include "access/htup_details.h"
19#include "catalog/namespace.h"
20#include "catalog/pg_type.h"
21#include "commands/trigger.h"
22#include "executor/spi.h"
23#include "funcapi.h"
24#include "mb/pg_wchar.h"
25#include "miscadmin.h"
26#include "parser/parse_coerce.h"
27#include "tsearch/ts_utils.h"
28#include "utils/builtins.h"
29#include "utils/lsyscache.h"
30#include "utils/regproc.h"
31#include "utils/rel.h"
32
33
/*
 * Search state passed to checkcondition_str() through its void * argument.
 */
typedef struct
{
	WordEntry  *arrb;			/* first WordEntry of the range to search */
	WordEntry  *arre;			/* end of that range (exclusive bound) */
	char	   *values;			/* base address WordEntry.pos offsets are
								 * relative to */
	char	   *operand;		/* base address QueryOperand.distance offsets
								 * are relative to */
} CHKVAL;
41
42
/*
 * Node of a binary tree keyed by lexeme.  The lexeme bytes are stored inline
 * after the fixed-size header (flexible array member).
 *
 * NOTE(review): the code that builds and walks this tree is not in this part
 * of the file; ndoc/nentry look like per-lexeme document and occurrence
 * counters -- confirm against the users.
 */
typedef struct StatEntry
{
	uint32		ndoc;			/* zero indicates that we were already here
								 * while walking through the tree */
	uint32		nentry;
	struct StatEntry *left;
	struct StatEntry *right;
	uint32		lenlexeme;		/* presumably length of lexeme[] in bytes */
	char		lexeme[FLEXIBLE_ARRAY_MEMBER];
} StatEntry;

/* Size of a StatEntry up to, but not including, the inline lexeme bytes */
#define STATENTRYHDRSZ	(offsetof(StatEntry, lexeme))
55
/*
 * Container for a StatEntry tree plus an explicit stack of node pointers.
 *
 * NOTE(review): the users of this struct are outside this part of the file;
 * the field notes below are inferred from names only -- confirm.
 */
typedef struct
{
	int32		weight;			/* presumably a weight filter mask */

	uint32		maxdepth;		/* presumably deepest level reached in tree */

	StatEntry **stack;			/* presumably traversal stack */
	uint32		stackpos;		/* presumably current index into stack[] */

	StatEntry  *root;			/* root of the StatEntry tree */
} TSVectorStat;
67
/* Prototypes for file-local (static) functions */
static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column);
static int	tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len);
70
71/*
72 * Order: haspos, len, word, for all positions (pos, weight)
73 */
74static int
75silly_cmp_tsvector(const TSVector a, const TSVector b)
76{
77 if (VARSIZE(a) < VARSIZE(b))
78 return -1;
79 else if (VARSIZE(a) > VARSIZE(b))
80 return 1;
81 else if (a->size < b->size)
82 return -1;
83 else if (a->size > b->size)
84 return 1;
85 else
86 {
87 WordEntry *aptr = ARRPTR(a);
88 WordEntry *bptr = ARRPTR(b);
89 int i = 0;
90 int res;
91
92
93 for (i = 0; i < a->size; i++)
94 {
95 if (aptr->haspos != bptr->haspos)
96 {
97 return (aptr->haspos > bptr->haspos) ? -1 : 1;
98 }
99 else if ((res = tsCompareString(STRPTR(a) + aptr->pos, aptr->len, STRPTR(b) + bptr->pos, bptr->len, false)) != 0)
100 {
101 return res;
102 }
103 else if (aptr->haspos)
104 {
105 WordEntryPos *ap = POSDATAPTR(a, aptr);
106 WordEntryPos *bp = POSDATAPTR(b, bptr);
107 int j;
108
109 if (POSDATALEN(a, aptr) != POSDATALEN(b, bptr))
110 return (POSDATALEN(a, aptr) > POSDATALEN(b, bptr)) ? -1 : 1;
111
112 for (j = 0; j < POSDATALEN(a, aptr); j++)
113 {
114 if (WEP_GETPOS(*ap) != WEP_GETPOS(*bp))
115 {
116 return (WEP_GETPOS(*ap) > WEP_GETPOS(*bp)) ? -1 : 1;
117 }
118 else if (WEP_GETWEIGHT(*ap) != WEP_GETWEIGHT(*bp))
119 {
120 return (WEP_GETWEIGHT(*ap) > WEP_GETWEIGHT(*bp)) ? -1 : 1;
121 }
122 ap++, bp++;
123 }
124 }
125
126 aptr++;
127 bptr++;
128 }
129 }
130
131 return 0;
132}
133
/*
 * Generate one SQL-callable comparison function named tsvector_<type>.
 *
 *	 type	- suffix of the generated function name
 *	 action - C operator applied as "res action 0", where res is the
 *			  result of silly_cmp_tsvector(a, b)
 *	 ret	- suffix of the PG_RETURN_xxx macro used to return the value
 *
 * The trailing "extern int no_such_variable" declaration exists only to
 * absorb the semicolon written after each macro invocation.
 */
#define TSVECTORCMPFUNC( type, action, ret )			\
Datum													\
tsvector_##type(PG_FUNCTION_ARGS)						\
{														\
	TSVector	a = PG_GETARG_TSVECTOR(0);				\
	TSVector	b = PG_GETARG_TSVECTOR(1);				\
	int			res = silly_cmp_tsvector(a, b);			\
	PG_FREE_IF_COPY(a,0);								\
	PG_FREE_IF_COPY(b,1);								\
	PG_RETURN_##ret( res action 0 );					\
}	\
/* keep compiler quiet - no extra ; */					\
extern int no_such_variable

/* Boolean operators: compare the three-way result against zero */
TSVECTORCMPFUNC(lt, <, BOOL);
TSVECTORCMPFUNC(le, <=, BOOL);
TSVECTORCMPFUNC(eq, ==, BOOL);
TSVECTORCMPFUNC(ge, >=, BOOL);
TSVECTORCMPFUNC(gt, >, BOOL);
TSVECTORCMPFUNC(ne, !=, BOOL);
/* Three-way comparison: "res + 0" simply returns res unchanged */
TSVECTORCMPFUNC(cmp, +, INT32);
155
156Datum
157tsvector_strip(PG_FUNCTION_ARGS)
158{
159 TSVector in = PG_GETARG_TSVECTOR(0);
160 TSVector out;
161 int i,
162 len = 0;
163 WordEntry *arrin = ARRPTR(in),
164 *arrout;
165 char *cur;
166
167 for (i = 0; i < in->size; i++)
168 len += arrin[i].len;
169
170 len = CALCDATASIZE(in->size, len);
171 out = (TSVector) palloc0(len);
172 SET_VARSIZE(out, len);
173 out->size = in->size;
174 arrout = ARRPTR(out);
175 cur = STRPTR(out);
176 for (i = 0; i < in->size; i++)
177 {
178 memcpy(cur, STRPTR(in) + arrin[i].pos, arrin[i].len);
179 arrout[i].haspos = 0;
180 arrout[i].len = arrin[i].len;
181 arrout[i].pos = cur - STRPTR(out);
182 cur += arrout[i].len;
183 }
184
185 PG_FREE_IF_COPY(in, 0);
186 PG_RETURN_POINTER(out);
187}
188
189Datum
190tsvector_length(PG_FUNCTION_ARGS)
191{
192 TSVector in = PG_GETARG_TSVECTOR(0);
193 int32 ret = in->size;
194
195 PG_FREE_IF_COPY(in, 0);
196 PG_RETURN_INT32(ret);
197}
198
199Datum
200tsvector_setweight(PG_FUNCTION_ARGS)
201{
202 TSVector in = PG_GETARG_TSVECTOR(0);
203 char cw = PG_GETARG_CHAR(1);
204 TSVector out;
205 int i,
206 j;
207 WordEntry *entry;
208 WordEntryPos *p;
209 int w = 0;
210
211 switch (cw)
212 {
213 case 'A':
214 case 'a':
215 w = 3;
216 break;
217 case 'B':
218 case 'b':
219 w = 2;
220 break;
221 case 'C':
222 case 'c':
223 w = 1;
224 break;
225 case 'D':
226 case 'd':
227 w = 0;
228 break;
229 default:
230 /* internal error */
231 elog(ERROR, "unrecognized weight: %d", cw);
232 }
233
234 out = (TSVector) palloc(VARSIZE(in));
235 memcpy(out, in, VARSIZE(in));
236 entry = ARRPTR(out);
237 i = out->size;
238 while (i--)
239 {
240 if ((j = POSDATALEN(out, entry)) != 0)
241 {
242 p = POSDATAPTR(out, entry);
243 while (j--)
244 {
245 WEP_SETWEIGHT(*p, w);
246 p++;
247 }
248 }
249 entry++;
250 }
251
252 PG_FREE_IF_COPY(in, 0);
253 PG_RETURN_POINTER(out);
254}
255
256/*
257 * setweight(tsin tsvector, char_weight "char", lexemes "text"[])
258 *
259 * Assign weight w to elements of tsin that are listed in lexemes.
260 */
261Datum
262tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
263{
264 TSVector tsin = PG_GETARG_TSVECTOR(0);
265 char char_weight = PG_GETARG_CHAR(1);
266 ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(2);
267
268 TSVector tsout;
269 int i,
270 j,
271 nlexemes,
272 weight;
273 WordEntry *entry;
274 Datum *dlexemes;
275 bool *nulls;
276
277 switch (char_weight)
278 {
279 case 'A':
280 case 'a':
281 weight = 3;
282 break;
283 case 'B':
284 case 'b':
285 weight = 2;
286 break;
287 case 'C':
288 case 'c':
289 weight = 1;
290 break;
291 case 'D':
292 case 'd':
293 weight = 0;
294 break;
295 default:
296 /* internal error */
297 elog(ERROR, "unrecognized weight: %c", char_weight);
298 }
299
300 tsout = (TSVector) palloc(VARSIZE(tsin));
301 memcpy(tsout, tsin, VARSIZE(tsin));
302 entry = ARRPTR(tsout);
303
304 deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
305 &dlexemes, &nulls, &nlexemes);
306
307 /*
308 * Assuming that lexemes array is significantly shorter than tsvector we
309 * can iterate through lexemes performing binary search of each lexeme
310 * from lexemes in tsvector.
311 */
312 for (i = 0; i < nlexemes; i++)
313 {
314 char *lex;
315 int lex_len,
316 lex_pos;
317
318 if (nulls[i])
319 ereport(ERROR,
320 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
321 errmsg("lexeme array may not contain nulls")));
322
323 lex = VARDATA(dlexemes[i]);
324 lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
325 lex_pos = tsvector_bsearch(tsout, lex, lex_len);
326
327 if (lex_pos >= 0 && (j = POSDATALEN(tsout, entry + lex_pos)) != 0)
328 {
329 WordEntryPos *p = POSDATAPTR(tsout, entry + lex_pos);
330
331 while (j--)
332 {
333 WEP_SETWEIGHT(*p, weight);
334 p++;
335 }
336 }
337 }
338
339 PG_FREE_IF_COPY(tsin, 0);
340 PG_FREE_IF_COPY(lexemes, 2);
341
342 PG_RETURN_POINTER(tsout);
343}
344
/*
 * Compare the lexemes of two WordEntrys by full (non-prefix) comparison.
 * pa/pb are the base addresses that a->pos and b->pos are offsets from.
 */
#define compareEntry(pa, a, pb, b) \
	tsCompareString((pa) + (a)->pos, (a)->len,	\
					(pb) + (b)->pos, (b)->len,	\
					false)
349
350/*
351 * Add positions from src to dest after offsetting them by maxpos.
352 * Return the number added (might be less than expected due to overflow)
353 */
354static int32
355add_pos(TSVector src, WordEntry *srcptr,
356 TSVector dest, WordEntry *destptr,
357 int32 maxpos)
358{
359 uint16 *clen = &_POSVECPTR(dest, destptr)->npos;
360 int i;
361 uint16 slen = POSDATALEN(src, srcptr),
362 startlen;
363 WordEntryPos *spos = POSDATAPTR(src, srcptr),
364 *dpos = POSDATAPTR(dest, destptr);
365
366 if (!destptr->haspos)
367 *clen = 0;
368
369 startlen = *clen;
370 for (i = 0;
371 i < slen && *clen < MAXNUMPOS &&
372 (*clen == 0 || WEP_GETPOS(dpos[*clen - 1]) != MAXENTRYPOS - 1);
373 i++)
374 {
375 WEP_SETWEIGHT(dpos[*clen], WEP_GETWEIGHT(spos[i]));
376 WEP_SETPOS(dpos[*clen], LIMITPOS(WEP_GETPOS(spos[i]) + maxpos));
377 (*clen)++;
378 }
379
380 if (*clen != startlen)
381 destptr->haspos = 1;
382 return *clen - startlen;
383}
384
385/*
386 * Perform binary search of given lexeme in TSVector.
387 * Returns lexeme position in TSVector's entry array or -1 if lexeme wasn't
388 * found.
389 */
390static int
391tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len)
392{
393 WordEntry *arrin = ARRPTR(tsv);
394 int StopLow = 0,
395 StopHigh = tsv->size,
396 StopMiddle,
397 cmp;
398
399 while (StopLow < StopHigh)
400 {
401 StopMiddle = (StopLow + StopHigh) / 2;
402
403 cmp = tsCompareString(lexeme, lexeme_len,
404 STRPTR(tsv) + arrin[StopMiddle].pos,
405 arrin[StopMiddle].len,
406 false);
407
408 if (cmp < 0)
409 StopHigh = StopMiddle;
410 else if (cmp > 0)
411 StopLow = StopMiddle + 1;
412 else /* found it */
413 return StopMiddle;
414 }
415
416 return -1;
417}
418
419/*
420 * qsort comparator functions
421 */
422
/*
 * qsort comparator for plain ints, ascending order.
 * Returns -1, 0 or 1.
 */
static int
compare_int(const void *va, const void *vb)
{
	const int	lhs = *(const int *) va;
	const int	rhs = *(const int *) vb;

	/* difference of two 0/1 comparisons; avoids overflow of lhs - rhs */
	return (lhs > rhs) - (lhs < rhs);
}
433
434static int
435compare_text_lexemes(const void *va, const void *vb)
436{
437 Datum a = *((const Datum *) va);
438 Datum b = *((const Datum *) vb);
439 char *alex = VARDATA_ANY(a);
440 int alex_len = VARSIZE_ANY_EXHDR(a);
441 char *blex = VARDATA_ANY(b);
442 int blex_len = VARSIZE_ANY_EXHDR(b);
443
444 return tsCompareString(alex, alex_len, blex, blex_len, false);
445}
446
447/*
448 * Internal routine to delete lexemes from TSVector by array of offsets.
449 *
450 * int *indices_to_delete -- array of lexeme offsets to delete (modified here!)
451 * int indices_count -- size of that array
452 *
453 * Returns new TSVector without given lexemes along with their positions
454 * and weights.
455 */
456static TSVector
457tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
458 int indices_count)
459{
460 TSVector tsout;
461 WordEntry *arrin = ARRPTR(tsv),
462 *arrout;
463 char *data = STRPTR(tsv),
464 *dataout;
465 int i, /* index in arrin */
466 j, /* index in arrout */
467 k, /* index in indices_to_delete */
468 curoff; /* index in dataout area */
469
470 /*
471 * Sort the filter array to simplify membership checks below. Also, get
472 * rid of any duplicate entries, so that we can assume that indices_count
473 * is exactly equal to the number of lexemes that will be removed.
474 */
475 if (indices_count > 1)
476 {
477 int kp;
478
479 qsort(indices_to_delete, indices_count, sizeof(int), compare_int);
480 kp = 0;
481 for (k = 1; k < indices_count; k++)
482 {
483 if (indices_to_delete[k] != indices_to_delete[kp])
484 indices_to_delete[++kp] = indices_to_delete[k];
485 }
486 indices_count = ++kp;
487 }
488
489 /*
490 * Here we overestimate tsout size, since we don't know how much space is
491 * used by the deleted lexeme(s). We will set exact size below.
492 */
493 tsout = (TSVector) palloc0(VARSIZE(tsv));
494
495 /* This count must be correct because STRPTR(tsout) relies on it. */
496 tsout->size = tsv->size - indices_count;
497
498 /*
499 * Copy tsv to tsout, skipping lexemes listed in indices_to_delete.
500 */
501 arrout = ARRPTR(tsout);
502 dataout = STRPTR(tsout);
503 curoff = 0;
504 for (i = j = k = 0; i < tsv->size; i++)
505 {
506 /*
507 * If current i is present in indices_to_delete, skip this lexeme.
508 * Since indices_to_delete is already sorted, we only need to check
509 * the current (k'th) entry.
510 */
511 if (k < indices_count && i == indices_to_delete[k])
512 {
513 k++;
514 continue;
515 }
516
517 /* Copy lexeme and its positions and weights */
518 memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len);
519 arrout[j].haspos = arrin[i].haspos;
520 arrout[j].len = arrin[i].len;
521 arrout[j].pos = curoff;
522 curoff += arrin[i].len;
523 if (arrin[i].haspos)
524 {
525 int len = POSDATALEN(tsv, arrin + i) * sizeof(WordEntryPos)
526 + sizeof(uint16);
527
528 curoff = SHORTALIGN(curoff);
529 memcpy(dataout + curoff,
530 STRPTR(tsv) + SHORTALIGN(arrin[i].pos + arrin[i].len),
531 len);
532 curoff += len;
533 }
534
535 j++;
536 }
537
538 /*
539 * k should now be exactly equal to indices_count. If it isn't then the
540 * caller provided us with indices outside of [0, tsv->size) range and
541 * estimation of tsout's size is wrong.
542 */
543 Assert(k == indices_count);
544
545 SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff));
546 return tsout;
547}
548
549/*
550 * Delete given lexeme from tsvector.
551 * Implementation of user-level ts_delete(tsvector, text).
552 */
553Datum
554tsvector_delete_str(PG_FUNCTION_ARGS)
555{
556 TSVector tsin = PG_GETARG_TSVECTOR(0),
557 tsout;
558 text *tlexeme = PG_GETARG_TEXT_PP(1);
559 char *lexeme = VARDATA_ANY(tlexeme);
560 int lexeme_len = VARSIZE_ANY_EXHDR(tlexeme),
561 skip_index;
562
563 if ((skip_index = tsvector_bsearch(tsin, lexeme, lexeme_len)) == -1)
564 PG_RETURN_POINTER(tsin);
565
566 tsout = tsvector_delete_by_indices(tsin, &skip_index, 1);
567
568 PG_FREE_IF_COPY(tsin, 0);
569 PG_FREE_IF_COPY(tlexeme, 1);
570 PG_RETURN_POINTER(tsout);
571}
572
573/*
574 * Delete given array of lexemes from tsvector.
575 * Implementation of user-level ts_delete(tsvector, text[]).
576 */
577Datum
578tsvector_delete_arr(PG_FUNCTION_ARGS)
579{
580 TSVector tsin = PG_GETARG_TSVECTOR(0),
581 tsout;
582 ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(1);
583 int i,
584 nlex,
585 skip_count,
586 *skip_indices;
587 Datum *dlexemes;
588 bool *nulls;
589
590 deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
591 &dlexemes, &nulls, &nlex);
592
593 /*
594 * In typical use case array of lexemes to delete is relatively small. So
595 * here we optimize things for that scenario: iterate through lexarr
596 * performing binary search of each lexeme from lexarr in tsvector.
597 */
598 skip_indices = palloc0(nlex * sizeof(int));
599 for (i = skip_count = 0; i < nlex; i++)
600 {
601 char *lex;
602 int lex_len,
603 lex_pos;
604
605 if (nulls[i])
606 ereport(ERROR,
607 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
608 errmsg("lexeme array may not contain nulls")));
609
610 lex = VARDATA(dlexemes[i]);
611 lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
612 lex_pos = tsvector_bsearch(tsin, lex, lex_len);
613
614 if (lex_pos >= 0)
615 skip_indices[skip_count++] = lex_pos;
616 }
617
618 tsout = tsvector_delete_by_indices(tsin, skip_indices, skip_count);
619
620 pfree(skip_indices);
621 PG_FREE_IF_COPY(tsin, 0);
622 PG_FREE_IF_COPY(lexemes, 1);
623
624 PG_RETURN_POINTER(tsout);
625}
626
627/*
628 * Expand tsvector as table with following columns:
629 * lexeme: lexeme text
630 * positions: integer array of lexeme positions
631 * weights: char array of weights corresponding to positions
632 */
633Datum
634tsvector_unnest(PG_FUNCTION_ARGS)
635{
636 FuncCallContext *funcctx;
637 TSVector tsin;
638
639 if (SRF_IS_FIRSTCALL())
640 {
641 MemoryContext oldcontext;
642 TupleDesc tupdesc;
643
644 funcctx = SRF_FIRSTCALL_INIT();
645 oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
646
647 tupdesc = CreateTemplateTupleDesc(3);
648 TupleDescInitEntry(tupdesc, (AttrNumber) 1, "lexeme",
649 TEXTOID, -1, 0);
650 TupleDescInitEntry(tupdesc, (AttrNumber) 2, "positions",
651 INT2ARRAYOID, -1, 0);
652 TupleDescInitEntry(tupdesc, (AttrNumber) 3, "weights",
653 TEXTARRAYOID, -1, 0);
654 funcctx->tuple_desc = BlessTupleDesc(tupdesc);
655
656 funcctx->user_fctx = PG_GETARG_TSVECTOR_COPY(0);
657
658 MemoryContextSwitchTo(oldcontext);
659 }
660
661 funcctx = SRF_PERCALL_SETUP();
662 tsin = (TSVector) funcctx->user_fctx;
663
664 if (funcctx->call_cntr < tsin->size)
665 {
666 WordEntry *arrin = ARRPTR(tsin);
667 char *data = STRPTR(tsin);
668 HeapTuple tuple;
669 int j,
670 i = funcctx->call_cntr;
671 bool nulls[] = {false, false, false};
672 Datum values[3];
673
674 values[0] = PointerGetDatum(
675 cstring_to_text_with_len(data + arrin[i].pos, arrin[i].len)
676 );
677
678 if (arrin[i].haspos)
679 {
680 WordEntryPosVector *posv;
681 Datum *positions;
682 Datum *weights;
683 char weight;
684
685 /*
686 * Internally tsvector stores position and weight in the same
687 * uint16 (2 bits for weight, 14 for position). Here we extract
688 * that in two separate arrays.
689 */
690 posv = _POSVECPTR(tsin, arrin + i);
691 positions = palloc(posv->npos * sizeof(Datum));
692 weights = palloc(posv->npos * sizeof(Datum));
693 for (j = 0; j < posv->npos; j++)
694 {
695 positions[j] = Int16GetDatum(WEP_GETPOS(posv->pos[j]));
696 weight = 'D' - WEP_GETWEIGHT(posv->pos[j]);
697 weights[j] = PointerGetDatum(
698 cstring_to_text_with_len(&weight, 1)
699 );
700 }
701
702 values[1] = PointerGetDatum(
703 construct_array(positions, posv->npos, INT2OID, 2, true, 's'));
704 values[2] = PointerGetDatum(
705 construct_array(weights, posv->npos, TEXTOID, -1, false, 'i'));
706 }
707 else
708 {
709 nulls[1] = nulls[2] = true;
710 }
711
712 tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
713 SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
714 }
715 else
716 {
717 pfree(tsin);
718 SRF_RETURN_DONE(funcctx);
719 }
720}
721
722/*
723 * Convert tsvector to array of lexemes.
724 */
725Datum
726tsvector_to_array(PG_FUNCTION_ARGS)
727{
728 TSVector tsin = PG_GETARG_TSVECTOR(0);
729 WordEntry *arrin = ARRPTR(tsin);
730 Datum *elements;
731 int i;
732 ArrayType *array;
733
734 elements = palloc(tsin->size * sizeof(Datum));
735
736 for (i = 0; i < tsin->size; i++)
737 {
738 elements[i] = PointerGetDatum(
739 cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos, arrin[i].len)
740 );
741 }
742
743 array = construct_array(elements, tsin->size, TEXTOID, -1, false, 'i');
744
745 pfree(elements);
746 PG_FREE_IF_COPY(tsin, 0);
747 PG_RETURN_POINTER(array);
748}
749
750/*
751 * Build tsvector from array of lexemes.
752 */
753Datum
754array_to_tsvector(PG_FUNCTION_ARGS)
755{
756 ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
757 TSVector tsout;
758 Datum *dlexemes;
759 WordEntry *arrout;
760 bool *nulls;
761 int nitems,
762 i,
763 j,
764 tslen,
765 datalen = 0;
766 char *cur;
767
768 deconstruct_array(v, TEXTOID, -1, false, 'i', &dlexemes, &nulls, &nitems);
769
770 /* Reject nulls (maybe we should just ignore them, instead?) */
771 for (i = 0; i < nitems; i++)
772 {
773 if (nulls[i])
774 ereport(ERROR,
775 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
776 errmsg("lexeme array may not contain nulls")));
777 }
778
779 /* Sort and de-dup, because this is required for a valid tsvector. */
780 if (nitems > 1)
781 {
782 qsort(dlexemes, nitems, sizeof(Datum), compare_text_lexemes);
783 j = 0;
784 for (i = 1; i < nitems; i++)
785 {
786 if (compare_text_lexemes(&dlexemes[j], &dlexemes[i]) < 0)
787 dlexemes[++j] = dlexemes[i];
788 }
789 nitems = ++j;
790 }
791
792 /* Calculate space needed for surviving lexemes. */
793 for (i = 0; i < nitems; i++)
794 datalen += VARSIZE(dlexemes[i]) - VARHDRSZ;
795 tslen = CALCDATASIZE(nitems, datalen);
796
797 /* Allocate and fill tsvector. */
798 tsout = (TSVector) palloc0(tslen);
799 SET_VARSIZE(tsout, tslen);
800 tsout->size = nitems;
801
802 arrout = ARRPTR(tsout);
803 cur = STRPTR(tsout);
804 for (i = 0; i < nitems; i++)
805 {
806 char *lex = VARDATA(dlexemes[i]);
807 int lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
808
809 memcpy(cur, lex, lex_len);
810 arrout[i].haspos = 0;
811 arrout[i].len = lex_len;
812 arrout[i].pos = cur - STRPTR(tsout);
813 cur += lex_len;
814 }
815
816 PG_FREE_IF_COPY(v, 0);
817 PG_RETURN_POINTER(tsout);
818}
819
/*
 * ts_filter(): keep only lexemes with given weights in tsvector.
 *
 * Argument 0 is the tsvector, argument 1 a "char" array of weight letters
 * (A-D, case-insensitive).  A NULL element or an unrecognized letter raises
 * an error.  For each lexeme only the positions whose weight is in the set
 * are kept; lexemes without position data, or left with no position after
 * filtering, are dropped from the result.
 */
Datum
tsvector_filter(PG_FUNCTION_ARGS)
{
	TSVector	tsin = PG_GETARG_TSVECTOR(0),
				tsout;
	ArrayType  *weights = PG_GETARG_ARRAYTYPE_P(1);
	WordEntry  *arrin = ARRPTR(tsin),
			   *arrout;
	char	   *datain = STRPTR(tsin),
			   *dataout;
	Datum	   *dweights;
	bool	   *nulls;
	int			nweights;
	int			i,
				j;
	int			cur_pos = 0;	/* write offset within the output data area */
	char		mask = 0;		/* bit (1 << w) set for each wanted weight w */

	deconstruct_array(weights, CHAROID, 1, true, 'c',
					  &dweights, &nulls, &nweights);

	/* Translate the weight letters into a bitmask: D=bit 0 ... A=bit 3 */
	for (i = 0; i < nweights; i++)
	{
		char		char_weight;

		if (nulls[i])
			ereport(ERROR,
					(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
					 errmsg("weight array may not contain nulls")));

		char_weight = DatumGetChar(dweights[i]);
		switch (char_weight)
		{
			case 'A':
			case 'a':
				mask = mask | 8;
				break;
			case 'B':
			case 'b':
				mask = mask | 4;
				break;
			case 'C':
			case 'c':
				mask = mask | 2;
				break;
			case 'D':
			case 'd':
				mask = mask | 1;
				break;
			default:
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						 errmsg("unrecognized weight: \"%c\"", char_weight)));
		}
	}

	/*
	 * The output can be no bigger than the input.  Provisionally use the
	 * input's entry count so that STRPTR(tsout) leaves room for the largest
	 * possible entry array; the data is moved down at the end if fewer
	 * entries survive.
	 */
	tsout = (TSVector) palloc0(VARSIZE(tsin));
	tsout->size = tsin->size;
	arrout = ARRPTR(tsout);
	dataout = STRPTR(tsout);

	/* i walks the input entries, j counts the output entries */
	for (i = j = 0; i < tsin->size; i++)
	{
		WordEntryPosVector *posvin,
				   *posvout;
		int			npos = 0;
		int			k;

		/* a lexeme with no positions has no weights to match */
		if (!arrin[i].haspos)
			continue;

		/*
		 * Write the surviving positions where this entry's position vector
		 * would go in the output.  cur_pos is not advanced yet, so if the
		 * entry ends up rejected the next candidate overwrites this spot.
		 */
		posvin = _POSVECPTR(tsin, arrin + i);
		posvout = (WordEntryPosVector *)
			(dataout + SHORTALIGN(cur_pos + arrin[i].len));

		for (k = 0; k < posvin->npos; k++)
		{
			if (mask & (1 << WEP_GETWEIGHT(posvin->pos[k])))
				posvout->pos[npos++] = posvin->pos[k];
		}

		/* if no satisfactory positions found, skip lexeme */
		if (!npos)
			continue;

		arrout[j].haspos = true;
		arrout[j].len = arrin[i].len;
		arrout[j].pos = cur_pos;

		memcpy(dataout + cur_pos, datain + arrin[i].pos, arrin[i].len);
		posvout->npos = npos;

		/*
		 * Advance past the aligned lexeme and its position vector.  Both
		 * increments are even, so cur_pos stays short-aligned, which is what
		 * makes this agree with the posvout address computed above.
		 */
		cur_pos += SHORTALIGN(arrin[i].len);
		cur_pos += POSDATALEN(tsout, arrout + j) * sizeof(WordEntryPos) +
			sizeof(uint16);
		j++;
	}

	/* Set the real entry count; that may move where STRPTR(tsout) points */
	tsout->size = j;
	if (dataout != STRPTR(tsout))
		memmove(STRPTR(tsout), dataout, cur_pos);

	SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, cur_pos));

	PG_FREE_IF_COPY(tsin, 0);
	PG_RETURN_POINTER(tsout);
}
929
/*
 * Concatenate two tsvectors.
 *
 * The entry arrays of in1 and in2 are merged in lexeme order.  A lexeme
 * present in both inputs yields a single output entry.  Positions coming
 * from in1 are copied unchanged; positions coming from in2 are appended via
 * add_pos(), which offsets them by the largest position found in in1.
 * Raises an error if the resulting data area exceeds MAXSTRPOS bytes.
 */
Datum
tsvector_concat(PG_FUNCTION_ARGS)
{
	TSVector	in1 = PG_GETARG_TSVECTOR(0);
	TSVector	in2 = PG_GETARG_TSVECTOR(1);
	TSVector	out;
	WordEntry  *ptr;
	WordEntry  *ptr1,
			   *ptr2;
	WordEntryPos *p;
	int			maxpos = 0,
				i,
				j,
				i1,
				i2,
				dataoff,
				output_bytes,
				output_size;
	char	   *data,
			   *data1,
			   *data2;

	/* Get max position in in1; we'll need this to offset in2's positions */
	ptr = ARRPTR(in1);
	i = in1->size;
	while (i--)
	{
		if ((j = POSDATALEN(in1, ptr)) != 0)
		{
			p = POSDATAPTR(in1, ptr);
			while (j--)
			{
				if (WEP_GETPOS(*p) > maxpos)
					maxpos = WEP_GETPOS(*p);
				p++;
			}
		}
		ptr++;
	}

	/* ptr1/ptr2 are merge cursors; i1/i2 count the entries still unread */
	ptr1 = ARRPTR(in1);
	ptr2 = ARRPTR(in2);
	data1 = STRPTR(in1);
	data2 = STRPTR(in2);
	i1 = in1->size;
	i2 = in2->size;

	/*
	 * Conservative estimate of space needed.  We might need all the data in
	 * both inputs, and conceivably add a pad byte before position data for
	 * each item where there was none before.
	 */
	output_bytes = VARSIZE(in1) + VARSIZE(in2) + i1 + i2;

	/* zero-filled: add_pos() relies on unused position counts reading 0 */
	out = (TSVector) palloc0(output_bytes);
	SET_VARSIZE(out, output_bytes);

	/*
	 * We must make out->size valid so that STRPTR(out) is sensible.  We'll
	 * collapse out any unused space at the end.
	 */
	out->size = in1->size + in2->size;

	ptr = ARRPTR(out);
	data = STRPTR(out);
	dataoff = 0;
	while (i1 && i2)
	{
		int			cmp = compareEntry(data1, ptr1, data2, ptr2);

		if (cmp < 0)
		{						/* in1 first */
			ptr->haspos = ptr1->haspos;
			ptr->len = ptr1->len;
			memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
			ptr->pos = dataoff;
			dataoff += ptr1->len;
			if (ptr->haspos)
			{
				/* copy in1's position vector (count + positions) verbatim */
				dataoff = SHORTALIGN(dataoff);
				memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
				dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
			}

			ptr++;
			ptr1++;
			i1--;
		}
		else if (cmp > 0)
		{						/* in2 first */
			ptr->haspos = ptr2->haspos;
			ptr->len = ptr2->len;
			memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len);
			ptr->pos = dataoff;
			dataoff += ptr2->len;
			if (ptr->haspos)
			{
				/* add_pos writes the shifted positions directly into out */
				int			addlen = add_pos(in2, ptr2, out, ptr, maxpos);

				/* nothing could be added: emit the lexeme without positions */
				if (addlen == 0)
					ptr->haspos = 0;
				else
				{
					dataoff = SHORTALIGN(dataoff);
					dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
				}
			}

			ptr++;
			ptr2++;
			i2--;
		}
		else
		{
			/* same lexeme in both inputs: emit once, merging positions */
			ptr->haspos = ptr1->haspos | ptr2->haspos;
			ptr->len = ptr1->len;
			memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
			ptr->pos = dataoff;
			dataoff += ptr1->len;
			if (ptr->haspos)
			{
				if (ptr1->haspos)
				{
					/* in1's positions first, then append in2's (if any) */
					dataoff = SHORTALIGN(dataoff);
					memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
					dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
					if (ptr2->haspos)
						dataoff += add_pos(in2, ptr2, out, ptr, maxpos) * sizeof(WordEntryPos);
				}
				else			/* must have ptr2->haspos */
				{
					int			addlen = add_pos(in2, ptr2, out, ptr, maxpos);

					if (addlen == 0)
						ptr->haspos = 0;
					else
					{
						dataoff = SHORTALIGN(dataoff);
						dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
					}
				}
			}

			ptr++;
			ptr1++;
			ptr2++;
			i1--;
			i2--;
		}
	}

	/* Leftover entries of in1: copied exactly like the "in1 first" case */
	while (i1)
	{
		ptr->haspos = ptr1->haspos;
		ptr->len = ptr1->len;
		memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
		ptr->pos = dataoff;
		dataoff += ptr1->len;
		if (ptr->haspos)
		{
			dataoff = SHORTALIGN(dataoff);
			memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
			dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
		}

		ptr++;
		ptr1++;
		i1--;
	}

	/* Leftover entries of in2: handled exactly like the "in2 first" case */
	while (i2)
	{
		ptr->haspos = ptr2->haspos;
		ptr->len = ptr2->len;
		memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len);
		ptr->pos = dataoff;
		dataoff += ptr2->len;
		if (ptr->haspos)
		{
			int			addlen = add_pos(in2, ptr2, out, ptr, maxpos);

			if (addlen == 0)
				ptr->haspos = 0;
			else
			{
				dataoff = SHORTALIGN(dataoff);
				dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
			}
		}

		ptr++;
		ptr2++;
		i2--;
	}

	/*
	 * Instead of checking each offset individually, we check for overflow of
	 * pos fields once at the end.
	 */
	if (dataoff > MAXSTRPOS)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", dataoff, MAXSTRPOS)));

	/*
	 * Adjust sizes (asserting that we didn't overrun the original estimates)
	 * and collapse out any unused array entries.
	 */
	output_size = ptr - ARRPTR(out);
	Assert(output_size <= out->size);
	out->size = output_size;
	/* shrinking out->size moves STRPTR(out); slide the data down to it */
	if (data != STRPTR(out))
		memmove(STRPTR(out), data, dataoff);
	output_bytes = CALCDATASIZE(out->size, dataoff);
	Assert(output_bytes <= VARSIZE(out));
	SET_VARSIZE(out, output_bytes);

	PG_FREE_IF_COPY(in1, 0);
	PG_FREE_IF_COPY(in2, 1);
	PG_RETURN_POINTER(out);
}
1151
1152/*
1153 * Compare two strings by tsvector rules.
1154 *
1155 * if isPrefix = true then it returns zero value iff b has prefix a
1156 */
1157int32
1158tsCompareString(char *a, int lena, char *b, int lenb, bool prefix)
1159{
1160 int cmp;
1161
1162 if (lena == 0)
1163 {
1164 if (prefix)
1165 cmp = 0; /* empty string is prefix of anything */
1166 else
1167 cmp = (lenb > 0) ? -1 : 0;
1168 }
1169 else if (lenb == 0)
1170 {
1171 cmp = (lena > 0) ? 1 : 0;
1172 }
1173 else
1174 {
1175 cmp = memcmp(a, b, Min(lena, lenb));
1176
1177 if (prefix)
1178 {
1179 if (cmp == 0 && lena > lenb)
1180 cmp = 1; /* a is longer, so not a prefix of b */
1181 }
1182 else if (cmp == 0 && lena != lenb)
1183 {
1184 cmp = (lena < lenb) ? -1 : 1;
1185 }
1186 }
1187
1188 return cmp;
1189}
1190
1191/*
1192 * Check weight info or/and fill 'data' with the required positions
1193 */
1194static bool
1195checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val,
1196 ExecPhraseData *data)
1197{
1198 bool result = false;
1199
1200 if (entry->haspos && (val->weight || data))
1201 {
1202 WordEntryPosVector *posvec;
1203
1204 /*
1205 * We can't use the _POSVECPTR macro here because the pointer to the
1206 * tsvector's lexeme storage is already contained in chkval->values.
1207 */
1208 posvec = (WordEntryPosVector *)
1209 (chkval->values + SHORTALIGN(entry->pos + entry->len));
1210
1211 if (val->weight && data)
1212 {
1213 WordEntryPos *posvec_iter = posvec->pos;
1214 WordEntryPos *dptr;
1215
1216 /*
1217 * Filter position information by weights
1218 */
1219 dptr = data->pos = palloc(sizeof(WordEntryPos) * posvec->npos);
1220 data->allocated = true;
1221
1222 /* Is there a position with a matching weight? */
1223 while (posvec_iter < posvec->pos + posvec->npos)
1224 {
1225 /* If true, append this position to the data->pos */
1226 if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter)))
1227 {
1228 *dptr = WEP_GETPOS(*posvec_iter);
1229 dptr++;
1230 }
1231
1232 posvec_iter++;
1233 }
1234
1235 data->npos = dptr - data->pos;
1236
1237 if (data->npos > 0)
1238 result = true;
1239 }
1240 else if (val->weight)
1241 {
1242 WordEntryPos *posvec_iter = posvec->pos;
1243
1244 /* Is there a position with a matching weight? */
1245 while (posvec_iter < posvec->pos + posvec->npos)
1246 {
1247 if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter)))
1248 {
1249 result = true;
1250 break; /* no need to go further */
1251 }
1252
1253 posvec_iter++;
1254 }
1255 }
1256 else /* data != NULL */
1257 {
1258 data->npos = posvec->npos;
1259 data->pos = posvec->pos;
1260 data->allocated = false;
1261 result = true;
1262 }
1263 }
1264 else
1265 {
1266 result = true;
1267 }
1268
1269 return result;
1270}
1271
1272/*
1273 * Removes duplicate pos entries. We can't use uniquePos() from
1274 * tsvector.c because array might be longer than MAXENTRYPOS
1275 *
1276 * Returns new length.
1277 */
1278static int
1279uniqueLongPos(WordEntryPos *pos, int npos)
1280{
1281 WordEntryPos *pos_iter,
1282 *result;
1283
1284 if (npos <= 1)
1285 return npos;
1286
1287 qsort((void *) pos, npos, sizeof(WordEntryPos), compareWordEntryPos);
1288
1289 result = pos;
1290 pos_iter = pos + 1;
1291 while (pos_iter < pos + npos)
1292 {
1293 if (WEP_GETPOS(*pos_iter) != WEP_GETPOS(*result))
1294 {
1295 result++;
1296 *result = WEP_GETPOS(*pos_iter);
1297 }
1298
1299 pos_iter++;
1300 }
1301
1302 return result + 1 - pos;
1303}
1304
1305/*
1306 * is there value 'val' in array or not ?
1307 */
1308static bool
1309checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
1310{
1311 CHKVAL *chkval = (CHKVAL *) checkval;
1312 WordEntry *StopLow = chkval->arrb;
1313 WordEntry *StopHigh = chkval->arre;
1314 WordEntry *StopMiddle = StopHigh;
1315 int difference = -1;
1316 bool res = false;
1317
1318 /* Loop invariant: StopLow <= val < StopHigh */
1319 while (StopLow < StopHigh)
1320 {
1321 StopMiddle = StopLow + (StopHigh - StopLow) / 2;
1322 difference = tsCompareString(chkval->operand + val->distance,
1323 val->length,
1324 chkval->values + StopMiddle->pos,
1325 StopMiddle->len,
1326 false);
1327
1328 if (difference == 0)
1329 {
1330 /* Check weight info & fill 'data' with positions */
1331 res = checkclass_str(chkval, StopMiddle, val, data);
1332 break;
1333 }
1334 else if (difference > 0)
1335 StopLow = StopMiddle + 1;
1336 else
1337 StopHigh = StopMiddle;
1338 }
1339
1340 if ((!res || data) && val->prefix)
1341 {
1342 WordEntryPos *allpos = NULL;
1343 int npos = 0,
1344 totalpos = 0;
1345
1346 /*
1347 * there was a failed exact search, so we should scan further to find
1348 * a prefix match. We also need to do so if caller needs position info
1349 */
1350 if (StopLow >= StopHigh)
1351 StopMiddle = StopHigh;
1352
1353 while ((!res || data) && StopMiddle < chkval->arre &&
1354 tsCompareString(chkval->operand + val->distance,
1355 val->length,
1356 chkval->values + StopMiddle->pos,
1357 StopMiddle->len,
1358 true) == 0)
1359 {
1360 if (data)
1361 {
1362 /*
1363 * We need to join position information
1364 */
1365 res = checkclass_str(chkval, StopMiddle, val, data);
1366
1367 if (res)
1368 {
1369 while (npos + data->npos >= totalpos)
1370 {
1371 if (totalpos == 0)
1372 {
1373 totalpos = 256;
1374 allpos = palloc(sizeof(WordEntryPos) * totalpos);
1375 }
1376 else
1377 {
1378 totalpos *= 2;
1379 allpos = repalloc(allpos, sizeof(WordEntryPos) * totalpos);
1380 }
1381 }
1382
1383 memcpy(allpos + npos, data->pos, sizeof(WordEntryPos) * data->npos);
1384 npos += data->npos;
1385 }
1386 }
1387 else
1388 {
1389 res = checkclass_str(chkval, StopMiddle, val, NULL);
1390 }
1391
1392 StopMiddle++;
1393 }
1394
1395 if (res && data)
1396 {
1397 /* Sort and make unique array of found positions */
1398 data->pos = allpos;
1399 data->npos = uniqueLongPos(allpos, npos);
1400 data->allocated = true;
1401 }
1402 }
1403
1404 return res;
1405}
1406
1407/*
1408 * Compute output position list for a tsquery operator in phrase mode.
1409 *
1410 * Merge the position lists in Ldata and Rdata as specified by "emit",
1411 * returning the result list into *data. The input position lists must be
1412 * sorted and unique, and the output will be as well.
1413 *
1414 * data: pointer to initially-all-zeroes output struct, or NULL
1415 * Ldata, Rdata: input position lists
1416 * emit: bitmask of TSPO_XXX flags
1417 * Loffset: offset to be added to Ldata positions before comparing/outputting
1418 * Roffset: offset to be added to Rdata positions before comparing/outputting
1419 * max_npos: maximum possible required size of output position array
1420 *
1421 * Loffset and Roffset should not be negative, else we risk trying to output
1422 * negative positions, which won't fit into WordEntryPos.
1423 *
1424 * Returns true if any positions were emitted to *data; or if data is NULL,
1425 * returns true if any positions would have been emitted.
1426 */
1427#define TSPO_L_ONLY 0x01 /* emit positions appearing only in L */
1428#define TSPO_R_ONLY 0x02 /* emit positions appearing only in R */
1429#define TSPO_BOTH 0x04 /* emit positions appearing in both L&R */
1430
1431static bool
1432TS_phrase_output(ExecPhraseData *data,
1433 ExecPhraseData *Ldata,
1434 ExecPhraseData *Rdata,
1435 int emit,
1436 int Loffset,
1437 int Roffset,
1438 int max_npos)
1439{
1440 int Lindex,
1441 Rindex;
1442
1443 /* Loop until both inputs are exhausted */
1444 Lindex = Rindex = 0;
1445 while (Lindex < Ldata->npos || Rindex < Rdata->npos)
1446 {
1447 int Lpos,
1448 Rpos;
1449 int output_pos = 0;
1450
1451 /*
1452 * Fetch current values to compare. WEP_GETPOS() is needed because
1453 * ExecPhraseData->data can point to a tsvector's WordEntryPosVector.
1454 */
1455 if (Lindex < Ldata->npos)
1456 Lpos = WEP_GETPOS(Ldata->pos[Lindex]) + Loffset;
1457 else
1458 {
1459 /* L array exhausted, so we're done if R_ONLY isn't set */
1460 if (!(emit & TSPO_R_ONLY))
1461 break;
1462 Lpos = INT_MAX;
1463 }
1464 if (Rindex < Rdata->npos)
1465 Rpos = WEP_GETPOS(Rdata->pos[Rindex]) + Roffset;
1466 else
1467 {
1468 /* R array exhausted, so we're done if L_ONLY isn't set */
1469 if (!(emit & TSPO_L_ONLY))
1470 break;
1471 Rpos = INT_MAX;
1472 }
1473
1474 /* Merge-join the two input lists */
1475 if (Lpos < Rpos)
1476 {
1477 /* Lpos is not matched in Rdata, should we output it? */
1478 if (emit & TSPO_L_ONLY)
1479 output_pos = Lpos;
1480 Lindex++;
1481 }
1482 else if (Lpos == Rpos)
1483 {
1484 /* Lpos and Rpos match ... should we output it? */
1485 if (emit & TSPO_BOTH)
1486 output_pos = Rpos;
1487 Lindex++;
1488 Rindex++;
1489 }
1490 else /* Lpos > Rpos */
1491 {
1492 /* Rpos is not matched in Ldata, should we output it? */
1493 if (emit & TSPO_R_ONLY)
1494 output_pos = Rpos;
1495 Rindex++;
1496 }
1497
1498 if (output_pos > 0)
1499 {
1500 if (data)
1501 {
1502 /* Store position, first allocating output array if needed */
1503 if (data->pos == NULL)
1504 {
1505 data->pos = (WordEntryPos *)
1506 palloc(max_npos * sizeof(WordEntryPos));
1507 data->allocated = true;
1508 }
1509 data->pos[data->npos++] = output_pos;
1510 }
1511 else
1512 {
1513 /*
1514 * Exact positions not needed, so return true as soon as we
1515 * know there is at least one.
1516 */
1517 return true;
1518 }
1519 }
1520 }
1521
1522 if (data && data->npos > 0)
1523 {
1524 /* Let's assert we didn't overrun the array */
1525 Assert(data->npos <= max_npos);
1526 return true;
1527 }
1528 return false;
1529}
1530
1531/*
1532 * Execute tsquery at or below an OP_PHRASE operator.
1533 *
1534 * This handles tsquery execution at recursion levels where we need to care
1535 * about match locations.
1536 *
1537 * In addition to the same arguments used for TS_execute, the caller may pass
1538 * a preinitialized-to-zeroes ExecPhraseData struct, to be filled with lexeme
1539 * match position info on success. data == NULL if no position data need be
1540 * returned. (In practice, outside callers pass NULL, and only the internal
1541 * recursion cases pass a data pointer.)
1542 * Note: the function assumes data != NULL for operators other than OP_PHRASE.
1543 * This is OK because an outside call always starts from an OP_PHRASE node.
1544 *
1545 * The detailed semantics of the match data, given that the function returned
1546 * "true" (successful match, or possible match), are:
1547 *
1548 * npos > 0, negate = false:
1549 * query is matched at specified position(s) (and only those positions)
1550 * npos > 0, negate = true:
1551 * query is matched at all positions *except* specified position(s)
1552 * npos = 0, negate = false:
1553 * query is possibly matched, matching position(s) are unknown
1554 * (this should only be returned when TS_EXEC_PHRASE_NO_POS flag is set)
1555 * npos = 0, negate = true:
1556 * query is matched at all positions
1557 *
1558 * Successful matches also return a "width" value which is the match width in
1559 * lexemes, less one. Hence, "width" is zero for simple one-lexeme matches,
1560 * and is the sum of the phrase operator distances for phrase matches. Note
1561 * that when width > 0, the listed positions represent the ends of matches not
1562 * the starts. (This unintuitive rule is needed to avoid possibly generating
1563 * negative positions, which wouldn't fit into the WordEntryPos arrays.)
1564 *
1565 * When the function returns "false" (no match), it must return npos = 0,
1566 * negate = false (which is the state initialized by the caller); but the
1567 * "width" output in such cases is undefined.
1568 */
1569static bool
1570TS_phrase_execute(QueryItem *curitem, void *arg, uint32 flags,
1571 TSExecuteCallback chkcond,
1572 ExecPhraseData *data)
1573{
1574 ExecPhraseData Ldata,
1575 Rdata;
1576 bool lmatch,
1577 rmatch;
1578 int Loffset,
1579 Roffset,
1580 maxwidth;
1581
1582 /* since this function recurses, it could be driven to stack overflow */
1583 check_stack_depth();
1584
1585 if (curitem->type == QI_VAL)
1586 return chkcond(arg, (QueryOperand *) curitem, data);
1587
1588 switch (curitem->qoperator.oper)
1589 {
1590 case OP_NOT:
1591
1592 /*
1593 * Because a "true" result with no specific positions is taken as
1594 * uncertain, we need no special care here for !TS_EXEC_CALC_NOT.
1595 * If it's a false positive, the right things happen anyway.
1596 *
1597 * Also, we need not touch data->width, since a NOT operation does
1598 * not change the match width.
1599 */
1600 if (TS_phrase_execute(curitem + 1, arg, flags, chkcond, data))
1601 {
1602 if (data->npos > 0)
1603 {
1604 /* we have some positions, invert negate flag */
1605 data->negate = !data->negate;
1606 return true;
1607 }
1608 else if (data->negate)
1609 {
1610 /* change "match everywhere" to "match nowhere" */
1611 data->negate = false;
1612 return false;
1613 }
1614 /* match positions are, and remain, uncertain */
1615 return true;
1616 }
1617 else
1618 {
1619 /* change "match nowhere" to "match everywhere" */
1620 Assert(data->npos == 0 && !data->negate);
1621 data->negate = true;
1622 return true;
1623 }
1624
1625 case OP_PHRASE:
1626 case OP_AND:
1627 memset(&Ldata, 0, sizeof(Ldata));
1628 memset(&Rdata, 0, sizeof(Rdata));
1629
1630 if (!TS_phrase_execute(curitem + curitem->qoperator.left,
1631 arg, flags, chkcond, &Ldata))
1632 return false;
1633
1634 if (!TS_phrase_execute(curitem + 1,
1635 arg, flags, chkcond, &Rdata))
1636 return false;
1637
1638 /*
1639 * If either operand has no position information, then we can't
1640 * return position data, only a "possible match" result. "Possible
1641 * match" answers are only wanted when TS_EXEC_PHRASE_NO_POS flag
1642 * is set, otherwise return false.
1643 */
1644 if ((Ldata.npos == 0 && !Ldata.negate) ||
1645 (Rdata.npos == 0 && !Rdata.negate))
1646 return (flags & TS_EXEC_PHRASE_NO_POS) ? true : false;
1647
1648 if (curitem->qoperator.oper == OP_PHRASE)
1649 {
1650 /*
1651 * Compute Loffset and Roffset suitable for phrase match, and
1652 * compute overall width of whole phrase match.
1653 */
1654 Loffset = curitem->qoperator.distance + Rdata.width;
1655 Roffset = 0;
1656 if (data)
1657 data->width = curitem->qoperator.distance +
1658 Ldata.width + Rdata.width;
1659 }
1660 else
1661 {
1662 /*
1663 * For OP_AND, set output width and alignment like OP_OR (see
1664 * comment below)
1665 */
1666 maxwidth = Max(Ldata.width, Rdata.width);
1667 Loffset = maxwidth - Ldata.width;
1668 Roffset = maxwidth - Rdata.width;
1669 if (data)
1670 data->width = maxwidth;
1671 }
1672
1673 if (Ldata.negate && Rdata.negate)
1674 {
1675 /* !L & !R: treat as !(L | R) */
1676 (void) TS_phrase_output(data, &Ldata, &Rdata,
1677 TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY,
1678 Loffset, Roffset,
1679 Ldata.npos + Rdata.npos);
1680 if (data)
1681 data->negate = true;
1682 return true;
1683 }
1684 else if (Ldata.negate)
1685 {
1686 /* !L & R */
1687 return TS_phrase_output(data, &Ldata, &Rdata,
1688 TSPO_R_ONLY,
1689 Loffset, Roffset,
1690 Rdata.npos);
1691 }
1692 else if (Rdata.negate)
1693 {
1694 /* L & !R */
1695 return TS_phrase_output(data, &Ldata, &Rdata,
1696 TSPO_L_ONLY,
1697 Loffset, Roffset,
1698 Ldata.npos);
1699 }
1700 else
1701 {
1702 /* straight AND */
1703 return TS_phrase_output(data, &Ldata, &Rdata,
1704 TSPO_BOTH,
1705 Loffset, Roffset,
1706 Min(Ldata.npos, Rdata.npos));
1707 }
1708
1709 case OP_OR:
1710 memset(&Ldata, 0, sizeof(Ldata));
1711 memset(&Rdata, 0, sizeof(Rdata));
1712
1713 lmatch = TS_phrase_execute(curitem + curitem->qoperator.left,
1714 arg, flags, chkcond, &Ldata);
1715 rmatch = TS_phrase_execute(curitem + 1,
1716 arg, flags, chkcond, &Rdata);
1717
1718 if (!lmatch && !rmatch)
1719 return false;
1720
1721 /*
1722 * If a valid operand has no position information, then we can't
1723 * return position data, only a "possible match" result. "Possible
1724 * match" answers are only wanted when TS_EXEC_PHRASE_NO_POS flag
1725 * is set, otherwise return false.
1726 */
1727 if ((lmatch && Ldata.npos == 0 && !Ldata.negate) ||
1728 (rmatch && Rdata.npos == 0 && !Rdata.negate))
1729 return (flags & TS_EXEC_PHRASE_NO_POS) ? true : false;
1730
1731 /*
1732 * Cope with undefined output width from failed submatch. (This
1733 * takes less code than trying to ensure that all failure returns
1734 * set data->width to zero.)
1735 */
1736 if (!lmatch)
1737 Ldata.width = 0;
1738 if (!rmatch)
1739 Rdata.width = 0;
1740
1741 /*
1742 * For OP_AND and OP_OR, report the width of the wider of the two
1743 * inputs, and align the narrower input's positions to the right
1744 * end of that width. This rule deals at least somewhat
1745 * reasonably with cases like "x <-> (y | z <-> q)".
1746 */
1747 maxwidth = Max(Ldata.width, Rdata.width);
1748 Loffset = maxwidth - Ldata.width;
1749 Roffset = maxwidth - Rdata.width;
1750 data->width = maxwidth;
1751
1752 if (Ldata.negate && Rdata.negate)
1753 {
1754 /* !L | !R: treat as !(L & R) */
1755 (void) TS_phrase_output(data, &Ldata, &Rdata,
1756 TSPO_BOTH,
1757 Loffset, Roffset,
1758 Min(Ldata.npos, Rdata.npos));
1759 data->negate = true;
1760 return true;
1761 }
1762 else if (Ldata.negate)
1763 {
1764 /* !L | R: treat as !(L & !R) */
1765 (void) TS_phrase_output(data, &Ldata, &Rdata,
1766 TSPO_L_ONLY,
1767 Loffset, Roffset,
1768 Ldata.npos);
1769 data->negate = true;
1770 return true;
1771 }
1772 else if (Rdata.negate)
1773 {
1774 /* L | !R: treat as !(!L & R) */
1775 (void) TS_phrase_output(data, &Ldata, &Rdata,
1776 TSPO_R_ONLY,
1777 Loffset, Roffset,
1778 Rdata.npos);
1779 data->negate = true;
1780 return true;
1781 }
1782 else
1783 {
1784 /* straight OR */
1785 return TS_phrase_output(data, &Ldata, &Rdata,
1786 TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY,
1787 Loffset, Roffset,
1788 Ldata.npos + Rdata.npos);
1789 }
1790
1791 default:
1792 elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
1793 }
1794
1795 /* not reachable, but keep compiler quiet */
1796 return false;
1797}
1798
1799
1800/*
1801 * Evaluate tsquery boolean expression.
1802 *
1803 * curitem: current tsquery item (initially, the first one)
1804 * arg: opaque value to pass through to callback function
1805 * flags: bitmask of flag bits shown in ts_utils.h
1806 * chkcond: callback function to check whether a primitive value is present
1807 *
1808 * The logic here deals only with operators above any phrase operator, for
1809 * which we do not need to worry about lexeme positions. As soon as we hit an
1810 * OP_PHRASE operator, we pass it off to TS_phrase_execute which does worry.
1811 */
1812bool
1813TS_execute(QueryItem *curitem, void *arg, uint32 flags,
1814 TSExecuteCallback chkcond)
1815{
1816 /* since this function recurses, it could be driven to stack overflow */
1817 check_stack_depth();
1818
1819 if (curitem->type == QI_VAL)
1820 return chkcond(arg, (QueryOperand *) curitem,
1821 NULL /* we don't need position info */ );
1822
1823 switch (curitem->qoperator.oper)
1824 {
1825 case OP_NOT:
1826 if (flags & TS_EXEC_CALC_NOT)
1827 return !TS_execute(curitem + 1, arg, flags, chkcond);
1828 else
1829 return true;
1830
1831 case OP_AND:
1832 if (TS_execute(curitem + curitem->qoperator.left, arg, flags, chkcond))
1833 return TS_execute(curitem + 1, arg, flags, chkcond);
1834 else
1835 return false;
1836
1837 case OP_OR:
1838 if (TS_execute(curitem + curitem->qoperator.left, arg, flags, chkcond))
1839 return true;
1840 else
1841 return TS_execute(curitem + 1, arg, flags, chkcond);
1842
1843 case OP_PHRASE:
1844 return TS_phrase_execute(curitem, arg, flags, chkcond, NULL);
1845
1846 default:
1847 elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
1848 }
1849
1850 /* not reachable, but keep compiler quiet */
1851 return false;
1852}
1853
1854/*
1855 * Detect whether a tsquery boolean expression requires any positive matches
1856 * to values shown in the tsquery.
1857 *
1858 * This is needed to know whether a GIN index search requires full index scan.
1859 * For example, 'x & !y' requires a match of x, so it's sufficient to scan
1860 * entries for x; but 'x | !y' could match rows containing neither x nor y.
1861 */
1862bool
1863tsquery_requires_match(QueryItem *curitem)
1864{
1865 /* since this function recurses, it could be driven to stack overflow */
1866 check_stack_depth();
1867
1868 if (curitem->type == QI_VAL)
1869 return true;
1870
1871 switch (curitem->qoperator.oper)
1872 {
1873 case OP_NOT:
1874
1875 /*
1876 * Assume there are no required matches underneath a NOT. For
1877 * some cases with nested NOTs, we could prove there's a required
1878 * match, but it seems unlikely to be worth the trouble.
1879 */
1880 return false;
1881
1882 case OP_PHRASE:
1883
1884 /*
1885 * Treat OP_PHRASE as OP_AND here
1886 */
1887 case OP_AND:
1888 /* If either side requires a match, we're good */
1889 if (tsquery_requires_match(curitem + curitem->qoperator.left))
1890 return true;
1891 else
1892 return tsquery_requires_match(curitem + 1);
1893
1894 case OP_OR:
1895 /* Both sides must require a match */
1896 if (tsquery_requires_match(curitem + curitem->qoperator.left))
1897 return tsquery_requires_match(curitem + 1);
1898 else
1899 return false;
1900
1901 default:
1902 elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
1903 }
1904
1905 /* not reachable, but keep compiler quiet */
1906 return false;
1907}
1908
1909/*
1910 * boolean operations
1911 */
1912Datum
1913ts_match_qv(PG_FUNCTION_ARGS)
1914{
1915 PG_RETURN_DATUM(DirectFunctionCall2(ts_match_vq,
1916 PG_GETARG_DATUM(1),
1917 PG_GETARG_DATUM(0)));
1918}
1919
1920Datum
1921ts_match_vq(PG_FUNCTION_ARGS)
1922{
1923 TSVector val = PG_GETARG_TSVECTOR(0);
1924 TSQuery query = PG_GETARG_TSQUERY(1);
1925 CHKVAL chkval;
1926 bool result;
1927
1928 /* empty query matches nothing */
1929 if (!query->size)
1930 {
1931 PG_FREE_IF_COPY(val, 0);
1932 PG_FREE_IF_COPY(query, 1);
1933 PG_RETURN_BOOL(false);
1934 }
1935
1936 chkval.arrb = ARRPTR(val);
1937 chkval.arre = chkval.arrb + val->size;
1938 chkval.values = STRPTR(val);
1939 chkval.operand = GETOPERAND(query);
1940 result = TS_execute(GETQUERY(query),
1941 &chkval,
1942 TS_EXEC_CALC_NOT,
1943 checkcondition_str);
1944
1945 PG_FREE_IF_COPY(val, 0);
1946 PG_FREE_IF_COPY(query, 1);
1947 PG_RETURN_BOOL(result);
1948}
1949
1950Datum
1951ts_match_tt(PG_FUNCTION_ARGS)
1952{
1953 TSVector vector;
1954 TSQuery query;
1955 bool res;
1956
1957 vector = DatumGetTSVector(DirectFunctionCall1(to_tsvector,
1958 PG_GETARG_DATUM(0)));
1959 query = DatumGetTSQuery(DirectFunctionCall1(plainto_tsquery,
1960 PG_GETARG_DATUM(1)));
1961
1962 res = DatumGetBool(DirectFunctionCall2(ts_match_vq,
1963 TSVectorGetDatum(vector),
1964 TSQueryGetDatum(query)));
1965
1966 pfree(vector);
1967 pfree(query);
1968
1969 PG_RETURN_BOOL(res);
1970}
1971
1972Datum
1973ts_match_tq(PG_FUNCTION_ARGS)
1974{
1975 TSVector vector;
1976 TSQuery query = PG_GETARG_TSQUERY(1);
1977 bool res;
1978
1979 vector = DatumGetTSVector(DirectFunctionCall1(to_tsvector,
1980 PG_GETARG_DATUM(0)));
1981
1982 res = DatumGetBool(DirectFunctionCall2(ts_match_vq,
1983 TSVectorGetDatum(vector),
1984 TSQueryGetDatum(query)));
1985
1986 pfree(vector);
1987 PG_FREE_IF_COPY(query, 1);
1988
1989 PG_RETURN_BOOL(res);
1990}
1991
1992/*
1993 * ts_stat statistic function support
1994 */
1995
1996
1997/*
1998 * Returns the number of positions in value 'wptr' within tsvector 'txt',
1999 * that have a weight equal to one of the weights in 'weight' bitmask.
2000 */
2001static int
2002check_weight(TSVector txt, WordEntry *wptr, int8 weight)
2003{
2004 int len = POSDATALEN(txt, wptr);
2005 int num = 0;
2006 WordEntryPos *ptr = POSDATAPTR(txt, wptr);
2007
2008 while (len--)
2009 {
2010 if (weight & (1 << WEP_GETWEIGHT(*ptr)))
2011 num++;
2012 ptr++;
2013 }
2014 return num;
2015}
2016
2017#define compareStatWord(a,e,t) \
2018 tsCompareString((a)->lexeme, (a)->lenlexeme, \
2019 STRPTR(t) + (e)->pos, (e)->len, \
2020 false)
2021
2022static void
2023insertStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector txt, uint32 off)
2024{
2025 WordEntry *we = ARRPTR(txt) + off;
2026 StatEntry *node = stat->root,
2027 *pnode = NULL;
2028 int n,
2029 res = 0;
2030 uint32 depth = 1;
2031
2032 if (stat->weight == 0)
2033 n = (we->haspos) ? POSDATALEN(txt, we) : 1;
2034 else
2035 n = (we->haspos) ? check_weight(txt, we, stat->weight) : 0;
2036
2037 if (n == 0)
2038 return; /* nothing to insert */
2039
2040 while (node)
2041 {
2042 res = compareStatWord(node, we, txt);
2043
2044 if (res == 0)
2045 {
2046 break;
2047 }
2048 else
2049 {
2050 pnode = node;
2051 node = (res < 0) ? node->left : node->right;
2052 }
2053 depth++;
2054 }
2055
2056 if (depth > stat->maxdepth)
2057 stat->maxdepth = depth;
2058
2059 if (node == NULL)
2060 {
2061 node = MemoryContextAlloc(persistentContext, STATENTRYHDRSZ + we->len);
2062 node->left = node->right = NULL;
2063 node->ndoc = 1;
2064 node->nentry = n;
2065 node->lenlexeme = we->len;
2066 memcpy(node->lexeme, STRPTR(txt) + we->pos, node->lenlexeme);
2067
2068 if (pnode == NULL)
2069 {
2070 stat->root = node;
2071 }
2072 else
2073 {
2074 if (res < 0)
2075 pnode->left = node;
2076 else
2077 pnode->right = node;
2078 }
2079
2080 }
2081 else
2082 {
2083 node->ndoc++;
2084 node->nentry += n;
2085 }
2086}
2087
2088static void
2089chooseNextStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector txt,
2090 uint32 low, uint32 high, uint32 offset)
2091{
2092 uint32 pos;
2093 uint32 middle = (low + high) >> 1;
2094
2095 pos = (low + middle) >> 1;
2096 if (low != middle && pos >= offset && pos - offset < txt->size)
2097 insertStatEntry(persistentContext, stat, txt, pos - offset);
2098 pos = (high + middle + 1) >> 1;
2099 if (middle + 1 != high && pos >= offset && pos - offset < txt->size)
2100 insertStatEntry(persistentContext, stat, txt, pos - offset);
2101
2102 if (low != middle)
2103 chooseNextStatEntry(persistentContext, stat, txt, low, middle, offset);
2104 if (high != middle + 1)
2105 chooseNextStatEntry(persistentContext, stat, txt, middle + 1, high, offset);
2106}
2107
2108/*
2109 * This is written like a custom aggregate function, because the
2110 * original plan was to do just that. Unfortunately, an aggregate function
2111 * can't return a set, so that plan was abandoned. If that limitation is
2112 * lifted in the future, ts_stat could be a real aggregate function so that
2113 * you could use it like this:
2114 *
2115 * SELECT ts_stat(vector_column) FROM vector_table;
2116 *
2117 * where vector_column is a tsvector-type column in vector_table.
2118 */
2119
2120static TSVectorStat *
2121ts_accum(MemoryContext persistentContext, TSVectorStat *stat, Datum data)
2122{
2123 TSVector txt = DatumGetTSVector(data);
2124 uint32 i,
2125 nbit = 0,
2126 offset;
2127
2128 if (stat == NULL)
2129 { /* Init in first */
2130 stat = MemoryContextAllocZero(persistentContext, sizeof(TSVectorStat));
2131 stat->maxdepth = 1;
2132 }
2133
2134 /* simple check of correctness */
2135 if (txt == NULL || txt->size == 0)
2136 {
2137 if (txt && txt != (TSVector) DatumGetPointer(data))
2138 pfree(txt);
2139 return stat;
2140 }
2141
2142 i = txt->size - 1;
2143 for (; i > 0; i >>= 1)
2144 nbit++;
2145
2146 nbit = 1 << nbit;
2147 offset = (nbit - txt->size) / 2;
2148
2149 insertStatEntry(persistentContext, stat, txt, (nbit >> 1) - offset);
2150 chooseNextStatEntry(persistentContext, stat, txt, 0, nbit, offset);
2151
2152 return stat;
2153}
2154
2155static void
2156ts_setup_firstcall(FunctionCallInfo fcinfo, FuncCallContext *funcctx,
2157 TSVectorStat *stat)
2158{
2159 TupleDesc tupdesc;
2160 MemoryContext oldcontext;
2161 StatEntry *node;
2162
2163 funcctx->user_fctx = (void *) stat;
2164
2165 oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
2166
2167 stat->stack = palloc0(sizeof(StatEntry *) * (stat->maxdepth + 1));
2168 stat->stackpos = 0;
2169
2170 node = stat->root;
2171 /* find leftmost value */
2172 if (node == NULL)
2173 stat->stack[stat->stackpos] = NULL;
2174 else
2175 for (;;)
2176 {
2177 stat->stack[stat->stackpos] = node;
2178 if (node->left)
2179 {
2180 stat->stackpos++;
2181 node = node->left;
2182 }
2183 else
2184 break;
2185 }
2186 Assert(stat->stackpos <= stat->maxdepth);
2187
2188 tupdesc = CreateTemplateTupleDesc(3);
2189 TupleDescInitEntry(tupdesc, (AttrNumber) 1, "word",
2190 TEXTOID, -1, 0);
2191 TupleDescInitEntry(tupdesc, (AttrNumber) 2, "ndoc",
2192 INT4OID, -1, 0);
2193 TupleDescInitEntry(tupdesc, (AttrNumber) 3, "nentry",
2194 INT4OID, -1, 0);
2195 funcctx->tuple_desc = BlessTupleDesc(tupdesc);
2196 funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc);
2197
2198 MemoryContextSwitchTo(oldcontext);
2199}
2200
2201static StatEntry *
2202walkStatEntryTree(TSVectorStat *stat)
2203{
2204 StatEntry *node = stat->stack[stat->stackpos];
2205
2206 if (node == NULL)
2207 return NULL;
2208
2209 if (node->ndoc != 0)
2210 {
2211 /* return entry itself: we already was at left sublink */
2212 return node;
2213 }
2214 else if (node->right && node->right != stat->stack[stat->stackpos + 1])
2215 {
2216 /* go on right sublink */
2217 stat->stackpos++;
2218 node = node->right;
2219
2220 /* find most-left value */
2221 for (;;)
2222 {
2223 stat->stack[stat->stackpos] = node;
2224 if (node->left)
2225 {
2226 stat->stackpos++;
2227 node = node->left;
2228 }
2229 else
2230 break;
2231 }
2232 Assert(stat->stackpos <= stat->maxdepth);
2233 }
2234 else
2235 {
2236 /* we already return all left subtree, itself and right subtree */
2237 if (stat->stackpos == 0)
2238 return NULL;
2239
2240 stat->stackpos--;
2241 return walkStatEntryTree(stat);
2242 }
2243
2244 return node;
2245}
2246
2247static Datum
2248ts_process_call(FuncCallContext *funcctx)
2249{
2250 TSVectorStat *st;
2251 StatEntry *entry;
2252
2253 st = (TSVectorStat *) funcctx->user_fctx;
2254
2255 entry = walkStatEntryTree(st);
2256
2257 if (entry != NULL)
2258 {
2259 Datum result;
2260 char *values[3];
2261 char ndoc[16];
2262 char nentry[16];
2263 HeapTuple tuple;
2264
2265 values[0] = palloc(entry->lenlexeme + 1);
2266 memcpy(values[0], entry->lexeme, entry->lenlexeme);
2267 (values[0])[entry->lenlexeme] = '\0';
2268 sprintf(ndoc, "%d", entry->ndoc);
2269 values[1] = ndoc;
2270 sprintf(nentry, "%d", entry->nentry);
2271 values[2] = nentry;
2272
2273 tuple = BuildTupleFromCStrings(funcctx->attinmeta, values);
2274 result = HeapTupleGetDatum(tuple);
2275
2276 pfree(values[0]);
2277
2278 /* mark entry as already visited */
2279 entry->ndoc = 0;
2280
2281 return result;
2282 }
2283
2284 return (Datum) 0;
2285}
2286
2287static TSVectorStat *
2288ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws)
2289{
2290 char *query = text_to_cstring(txt);
2291 TSVectorStat *stat;
2292 bool isnull;
2293 Portal portal;
2294 SPIPlanPtr plan;
2295
2296 if ((plan = SPI_prepare(query, 0, NULL)) == NULL)
2297 /* internal error */
2298 elog(ERROR, "SPI_prepare(\"%s\") failed", query);
2299
2300 if ((portal = SPI_cursor_open(NULL, plan, NULL, NULL, true)) == NULL)
2301 /* internal error */
2302 elog(ERROR, "SPI_cursor_open(\"%s\") failed", query);
2303
2304 SPI_cursor_fetch(portal, true, 100);
2305
2306 if (SPI_tuptable == NULL ||
2307 SPI_tuptable->tupdesc->natts != 1 ||
2308 !IsBinaryCoercible(SPI_gettypeid(SPI_tuptable->tupdesc, 1),
2309 TSVECTOROID))
2310 ereport(ERROR,
2311 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2312 errmsg("ts_stat query must return one tsvector column")));
2313
2314 stat = MemoryContextAllocZero(persistentContext, sizeof(TSVectorStat));
2315 stat->maxdepth = 1;
2316
2317 if (ws)
2318 {
2319 char *buf;
2320
2321 buf = VARDATA_ANY(ws);
2322 while (buf - VARDATA_ANY(ws) < VARSIZE_ANY_EXHDR(ws))
2323 {
2324 if (pg_mblen(buf) == 1)
2325 {
2326 switch (*buf)
2327 {
2328 case 'A':
2329 case 'a':
2330 stat->weight |= 1 << 3;
2331 break;
2332 case 'B':
2333 case 'b':
2334 stat->weight |= 1 << 2;
2335 break;
2336 case 'C':
2337 case 'c':
2338 stat->weight |= 1 << 1;
2339 break;
2340 case 'D':
2341 case 'd':
2342 stat->weight |= 1;
2343 break;
2344 default:
2345 stat->weight |= 0;
2346 }
2347 }
2348 buf += pg_mblen(buf);
2349 }
2350 }
2351
2352 while (SPI_processed > 0)
2353 {
2354 uint64 i;
2355
2356 for (i = 0; i < SPI_processed; i++)
2357 {
2358 Datum data = SPI_getbinval(SPI_tuptable->vals[i], SPI_tuptable->tupdesc, 1, &isnull);
2359
2360 if (!isnull)
2361 stat = ts_accum(persistentContext, stat, data);
2362 }
2363
2364 SPI_freetuptable(SPI_tuptable);
2365 SPI_cursor_fetch(portal, true, 100);
2366 }
2367
2368 SPI_freetuptable(SPI_tuptable);
2369 SPI_cursor_close(portal);
2370 SPI_freeplan(plan);
2371 pfree(query);
2372
2373 return stat;
2374}
2375
2376Datum
2377ts_stat1(PG_FUNCTION_ARGS)
2378{
2379 FuncCallContext *funcctx;
2380 Datum result;
2381
2382 if (SRF_IS_FIRSTCALL())
2383 {
2384 TSVectorStat *stat;
2385 text *txt = PG_GETARG_TEXT_PP(0);
2386
2387 funcctx = SRF_FIRSTCALL_INIT();
2388 SPI_connect();
2389 stat = ts_stat_sql(funcctx->multi_call_memory_ctx, txt, NULL);
2390 PG_FREE_IF_COPY(txt, 0);
2391 ts_setup_firstcall(fcinfo, funcctx, stat);
2392 SPI_finish();
2393 }
2394
2395 funcctx = SRF_PERCALL_SETUP();
2396 if ((result = ts_process_call(funcctx)) != (Datum) 0)
2397 SRF_RETURN_NEXT(funcctx, result);
2398 SRF_RETURN_DONE(funcctx);
2399}
2400
2401Datum
2402ts_stat2(PG_FUNCTION_ARGS)
2403{
2404 FuncCallContext *funcctx;
2405 Datum result;
2406
2407 if (SRF_IS_FIRSTCALL())
2408 {
2409 TSVectorStat *stat;
2410 text *txt = PG_GETARG_TEXT_PP(0);
2411 text *ws = PG_GETARG_TEXT_PP(1);
2412
2413 funcctx = SRF_FIRSTCALL_INIT();
2414 SPI_connect();
2415 stat = ts_stat_sql(funcctx->multi_call_memory_ctx, txt, ws);
2416 PG_FREE_IF_COPY(txt, 0);
2417 PG_FREE_IF_COPY(ws, 1);
2418 ts_setup_firstcall(fcinfo, funcctx, stat);
2419 SPI_finish();
2420 }
2421
2422 funcctx = SRF_PERCALL_SETUP();
2423 if ((result = ts_process_call(funcctx)) != (Datum) 0)
2424 SRF_RETURN_NEXT(funcctx, result);
2425 SRF_RETURN_DONE(funcctx);
2426}
2427
2428
2429/*
2430 * Triggers for automatic update of a tsvector column from text column(s)
2431 *
2432 * Trigger arguments are either
2433 * name of tsvector col, name of tsconfig to use, name(s) of text col(s)
2434 * name of tsvector col, name of regconfig col, name(s) of text col(s)
2435 * ie, tsconfig can either be specified by name, or indirectly as the
2436 * contents of a regconfig field in the row. If the name is used, it must
2437 * be explicitly schema-qualified.
2438 */
2439Datum
2440tsvector_update_trigger_byid(PG_FUNCTION_ARGS)
2441{
2442 return tsvector_update_trigger(fcinfo, false);
2443}
2444
2445Datum
2446tsvector_update_trigger_bycolumn(PG_FUNCTION_ARGS)
2447{
2448 return tsvector_update_trigger(fcinfo, true);
2449}
2450
2451static Datum
2452tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column)
2453{
2454 TriggerData *trigdata;
2455 Trigger *trigger;
2456 Relation rel;
2457 HeapTuple rettuple = NULL;
2458 int tsvector_attr_num,
2459 i;
2460 ParsedText prs;
2461 Datum datum;
2462 bool isnull;
2463 text *txt;
2464 Oid cfgId;
2465
2466 /* Check call context */
2467 if (!CALLED_AS_TRIGGER(fcinfo)) /* internal error */
2468 elog(ERROR, "tsvector_update_trigger: not fired by trigger manager");
2469
2470 trigdata = (TriggerData *) fcinfo->context;
2471 if (!TRIGGER_FIRED_FOR_ROW(trigdata->tg_event))
2472 elog(ERROR, "tsvector_update_trigger: must be fired for row");
2473 if (!TRIGGER_FIRED_BEFORE(trigdata->tg_event))
2474 elog(ERROR, "tsvector_update_trigger: must be fired BEFORE event");
2475
2476 if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event))
2477 rettuple = trigdata->tg_trigtuple;
2478 else if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event))
2479 rettuple = trigdata->tg_newtuple;
2480 else
2481 elog(ERROR, "tsvector_update_trigger: must be fired for INSERT or UPDATE");
2482
2483 trigger = trigdata->tg_trigger;
2484 rel = trigdata->tg_relation;
2485
2486 if (trigger->tgnargs < 3)
2487 elog(ERROR, "tsvector_update_trigger: arguments must be tsvector_field, ts_config, text_field1, ...)");
2488
2489 /* Find the target tsvector column */
2490 tsvector_attr_num = SPI_fnumber(rel->rd_att, trigger->tgargs[0]);
2491 if (tsvector_attr_num == SPI_ERROR_NOATTRIBUTE)
2492 ereport(ERROR,
2493 (errcode(ERRCODE_UNDEFINED_COLUMN),
2494 errmsg("tsvector column \"%s\" does not exist",
2495 trigger->tgargs[0])));
2496 /* This will effectively reject system columns, so no separate test: */
2497 if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, tsvector_attr_num),
2498 TSVECTOROID))
2499 ereport(ERROR,
2500 (errcode(ERRCODE_DATATYPE_MISMATCH),
2501 errmsg("column \"%s\" is not of tsvector type",
2502 trigger->tgargs[0])));
2503
2504 /* Find the configuration to use */
2505 if (config_column)
2506 {
2507 int config_attr_num;
2508
2509 config_attr_num = SPI_fnumber(rel->rd_att, trigger->tgargs[1]);
2510 if (config_attr_num == SPI_ERROR_NOATTRIBUTE)
2511 ereport(ERROR,
2512 (errcode(ERRCODE_UNDEFINED_COLUMN),
2513 errmsg("configuration column \"%s\" does not exist",
2514 trigger->tgargs[1])));
2515 if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, config_attr_num),
2516 REGCONFIGOID))
2517 ereport(ERROR,
2518 (errcode(ERRCODE_DATATYPE_MISMATCH),
2519 errmsg("column \"%s\" is not of regconfig type",
2520 trigger->tgargs[1])));
2521
2522 datum = SPI_getbinval(rettuple, rel->rd_att, config_attr_num, &isnull);
2523 if (isnull)
2524 ereport(ERROR,
2525 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
2526 errmsg("configuration column \"%s\" must not be null",
2527 trigger->tgargs[1])));
2528 cfgId = DatumGetObjectId(datum);
2529 }
2530 else
2531 {
2532 List *names;
2533
2534 names = stringToQualifiedNameList(trigger->tgargs[1]);
2535 /* require a schema so that results are not search path dependent */
2536 if (list_length(names) < 2)
2537 ereport(ERROR,
2538 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2539 errmsg("text search configuration name \"%s\" must be schema-qualified",
2540 trigger->tgargs[1])));
2541 cfgId = get_ts_config_oid(names, false);
2542 }
2543
2544 /* initialize parse state */
2545 prs.lenwords = 32;
2546 prs.curwords = 0;
2547 prs.pos = 0;
2548 prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
2549
2550 /* find all words in indexable column(s) */
2551 for (i = 2; i < trigger->tgnargs; i++)
2552 {
2553 int numattr;
2554
2555 numattr = SPI_fnumber(rel->rd_att, trigger->tgargs[i]);
2556 if (numattr == SPI_ERROR_NOATTRIBUTE)
2557 ereport(ERROR,
2558 (errcode(ERRCODE_UNDEFINED_COLUMN),
2559 errmsg("column \"%s\" does not exist",
2560 trigger->tgargs[i])));
2561 if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, numattr), TEXTOID))
2562 ereport(ERROR,
2563 (errcode(ERRCODE_DATATYPE_MISMATCH),
2564 errmsg("column \"%s\" is not of a character type",
2565 trigger->tgargs[i])));
2566
2567 datum = SPI_getbinval(rettuple, rel->rd_att, numattr, &isnull);
2568 if (isnull)
2569 continue;
2570
2571 txt = DatumGetTextPP(datum);
2572
2573 parsetext(cfgId, &prs, VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt));
2574
2575 if (txt != (text *) DatumGetPointer(datum))
2576 pfree(txt);
2577 }
2578
2579 /* make tsvector value */
2580 datum = TSVectorGetDatum(make_tsvector(&prs));
2581 isnull = false;
2582
2583 /* and insert it into tuple */
2584 rettuple = heap_modify_tuple_by_cols(rettuple, rel->rd_att,
2585 1, &tsvector_attr_num,
2586 &datum, &isnull);
2587
2588 pfree(DatumGetPointer(datum));
2589
2590 return PointerGetDatum(rettuple);
2591}
2592