1/*-------------------------------------------------------------------------
2 *
3 * to_tsany.c
4 * to_ts* function definitions
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/tsearch/to_tsany.c
11 *
12 *-------------------------------------------------------------------------
13 */
14#include "postgres.h"
15
16#include "tsearch/ts_cache.h"
17#include "tsearch/ts_utils.h"
18#include "utils/builtins.h"
19#include "utils/jsonapi.h"
20
21
/*
 * Opaque data that the *_to_tsquery_byid functions in this file hand (as a
 * Datum) to parse_tsquery(), and that pushval_morph() reads back.
 */
typedef struct MorphOpaque
{
	Oid			cfg_id;			/* configuration OID passed to parsetext() */
	int			qoperator;		/* query operator pushed between word
								 * positions (OP_AND or OP_PHRASE here) */
} MorphOpaque;
27
/*
 * State handed by the json(b) worker functions to the value iterators and
 * consumed by add_to_tsvector() for every visited element.
 */
typedef struct TSVectorBuildState
{
	ParsedText *prs;			/* accumulates the words of all elements */
	Oid			cfgId;			/* configuration OID passed to parsetext() */
} TSVectorBuildState;
33
/* Callback given to iterate_jsonb_values()/iterate_json_values(); defined below. */
static void add_to_tsvector(void *_state, char *elem_value, int elem_len);
35
36
37Datum
38get_current_ts_config(PG_FUNCTION_ARGS)
39{
40 PG_RETURN_OID(getTSCurrentConfig(true));
41}
42
43/*
44 * to_tsvector
45 */
46static int
47compareWORD(const void *a, const void *b)
48{
49 int res;
50
51 res = tsCompareString(
52 ((const ParsedWord *) a)->word, ((const ParsedWord *) a)->len,
53 ((const ParsedWord *) b)->word, ((const ParsedWord *) b)->len,
54 false);
55
56 if (res == 0)
57 {
58 if (((const ParsedWord *) a)->pos.pos == ((const ParsedWord *) b)->pos.pos)
59 return 0;
60
61 res = (((const ParsedWord *) a)->pos.pos > ((const ParsedWord *) b)->pos.pos) ? 1 : -1;
62 }
63
64 return res;
65}
66
67static int
68uniqueWORD(ParsedWord *a, int32 l)
69{
70 ParsedWord *ptr,
71 *res;
72 int tmppos;
73
74 if (l == 1)
75 {
76 tmppos = LIMITPOS(a->pos.pos);
77 a->alen = 2;
78 a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
79 a->pos.apos[0] = 1;
80 a->pos.apos[1] = tmppos;
81 return l;
82 }
83
84 res = a;
85 ptr = a + 1;
86
87 /*
88 * Sort words with its positions
89 */
90 qsort((void *) a, l, sizeof(ParsedWord), compareWORD);
91
92 /*
93 * Initialize first word and its first position
94 */
95 tmppos = LIMITPOS(a->pos.pos);
96 a->alen = 2;
97 a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
98 a->pos.apos[0] = 1;
99 a->pos.apos[1] = tmppos;
100
101 /*
102 * Summarize position information for each word
103 */
104 while (ptr - a < l)
105 {
106 if (!(ptr->len == res->len &&
107 strncmp(ptr->word, res->word, res->len) == 0))
108 {
109 /*
110 * Got a new word, so put it in result
111 */
112 res++;
113 res->len = ptr->len;
114 res->word = ptr->word;
115 tmppos = LIMITPOS(ptr->pos.pos);
116 res->alen = 2;
117 res->pos.apos = (uint16 *) palloc(sizeof(uint16) * res->alen);
118 res->pos.apos[0] = 1;
119 res->pos.apos[1] = tmppos;
120 }
121 else
122 {
123 /*
124 * The word already exists, so adjust position information. But
125 * before we should check size of position's array, max allowed
126 * value for position and uniqueness of position
127 */
128 pfree(ptr->word);
129 if (res->pos.apos[0] < MAXNUMPOS - 1 && res->pos.apos[res->pos.apos[0]] != MAXENTRYPOS - 1 &&
130 res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos))
131 {
132 if (res->pos.apos[0] + 1 >= res->alen)
133 {
134 res->alen *= 2;
135 res->pos.apos = (uint16 *) repalloc(res->pos.apos, sizeof(uint16) * res->alen);
136 }
137 if (res->pos.apos[0] == 0 || res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos))
138 {
139 res->pos.apos[res->pos.apos[0] + 1] = LIMITPOS(ptr->pos.pos);
140 res->pos.apos[0]++;
141 }
142 }
143 }
144 ptr++;
145 }
146
147 return res + 1 - a;
148}
149
150/*
151 * make value of tsvector, given parsed text
152 *
153 * Note: frees prs->words and subsidiary data.
154 */
155TSVector
156make_tsvector(ParsedText *prs)
157{
158 int i,
159 j,
160 lenstr = 0,
161 totallen;
162 TSVector in;
163 WordEntry *ptr;
164 char *str;
165 int stroff;
166
167 /* Merge duplicate words */
168 if (prs->curwords > 0)
169 prs->curwords = uniqueWORD(prs->words, prs->curwords);
170
171 /* Determine space needed */
172 for (i = 0; i < prs->curwords; i++)
173 {
174 lenstr += prs->words[i].len;
175 if (prs->words[i].alen)
176 {
177 lenstr = SHORTALIGN(lenstr);
178 lenstr += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos);
179 }
180 }
181
182 if (lenstr > MAXSTRPOS)
183 ereport(ERROR,
184 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
185 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", lenstr, MAXSTRPOS)));
186
187 totallen = CALCDATASIZE(prs->curwords, lenstr);
188 in = (TSVector) palloc0(totallen);
189 SET_VARSIZE(in, totallen);
190 in->size = prs->curwords;
191
192 ptr = ARRPTR(in);
193 str = STRPTR(in);
194 stroff = 0;
195 for (i = 0; i < prs->curwords; i++)
196 {
197 ptr->len = prs->words[i].len;
198 ptr->pos = stroff;
199 memcpy(str + stroff, prs->words[i].word, prs->words[i].len);
200 stroff += prs->words[i].len;
201 pfree(prs->words[i].word);
202 if (prs->words[i].alen)
203 {
204 int k = prs->words[i].pos.apos[0];
205 WordEntryPos *wptr;
206
207 if (k > 0xFFFF)
208 elog(ERROR, "positions array too long");
209
210 ptr->haspos = 1;
211 stroff = SHORTALIGN(stroff);
212 *(uint16 *) (str + stroff) = (uint16) k;
213 wptr = POSDATAPTR(in, ptr);
214 for (j = 0; j < k; j++)
215 {
216 WEP_SETWEIGHT(wptr[j], 0);
217 WEP_SETPOS(wptr[j], prs->words[i].pos.apos[j + 1]);
218 }
219 stroff += sizeof(uint16) + k * sizeof(WordEntryPos);
220 pfree(prs->words[i].pos.apos);
221 }
222 else
223 ptr->haspos = 0;
224 ptr++;
225 }
226
227 if (prs->words)
228 pfree(prs->words);
229
230 return in;
231}
232
233Datum
234to_tsvector_byid(PG_FUNCTION_ARGS)
235{
236 Oid cfgId = PG_GETARG_OID(0);
237 text *in = PG_GETARG_TEXT_PP(1);
238 ParsedText prs;
239 TSVector out;
240
241 prs.lenwords = VARSIZE_ANY_EXHDR(in) / 6; /* just estimation of word's
242 * number */
243 if (prs.lenwords < 2)
244 prs.lenwords = 2;
245 prs.curwords = 0;
246 prs.pos = 0;
247 prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
248
249 parsetext(cfgId, &prs, VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
250
251 PG_FREE_IF_COPY(in, 1);
252
253 out = make_tsvector(&prs);
254
255 PG_RETURN_TSVECTOR(out);
256}
257
258Datum
259to_tsvector(PG_FUNCTION_ARGS)
260{
261 text *in = PG_GETARG_TEXT_PP(0);
262 Oid cfgId;
263
264 cfgId = getTSCurrentConfig(true);
265 PG_RETURN_DATUM(DirectFunctionCall2(to_tsvector_byid,
266 ObjectIdGetDatum(cfgId),
267 PointerGetDatum(in)));
268}
269
270/*
271 * Worker function for jsonb(_string)_to_tsvector(_byid)
272 */
273static TSVector
274jsonb_to_tsvector_worker(Oid cfgId, Jsonb *jb, uint32 flags)
275{
276 TSVectorBuildState state;
277 ParsedText prs;
278
279 prs.words = NULL;
280 prs.curwords = 0;
281 state.prs = &prs;
282 state.cfgId = cfgId;
283
284 iterate_jsonb_values(jb, flags, &state, add_to_tsvector);
285
286 return make_tsvector(&prs);
287}
288
289Datum
290jsonb_string_to_tsvector_byid(PG_FUNCTION_ARGS)
291{
292 Oid cfgId = PG_GETARG_OID(0);
293 Jsonb *jb = PG_GETARG_JSONB_P(1);
294 TSVector result;
295
296 result = jsonb_to_tsvector_worker(cfgId, jb, jtiString);
297 PG_FREE_IF_COPY(jb, 1);
298
299 PG_RETURN_TSVECTOR(result);
300}
301
302Datum
303jsonb_string_to_tsvector(PG_FUNCTION_ARGS)
304{
305 Jsonb *jb = PG_GETARG_JSONB_P(0);
306 Oid cfgId;
307 TSVector result;
308
309 cfgId = getTSCurrentConfig(true);
310 result = jsonb_to_tsvector_worker(cfgId, jb, jtiString);
311 PG_FREE_IF_COPY(jb, 0);
312
313 PG_RETURN_TSVECTOR(result);
314}
315
316Datum
317jsonb_to_tsvector_byid(PG_FUNCTION_ARGS)
318{
319 Oid cfgId = PG_GETARG_OID(0);
320 Jsonb *jb = PG_GETARG_JSONB_P(1);
321 Jsonb *jbFlags = PG_GETARG_JSONB_P(2);
322 TSVector result;
323 uint32 flags = parse_jsonb_index_flags(jbFlags);
324
325 result = jsonb_to_tsvector_worker(cfgId, jb, flags);
326 PG_FREE_IF_COPY(jb, 1);
327 PG_FREE_IF_COPY(jbFlags, 2);
328
329 PG_RETURN_TSVECTOR(result);
330}
331
332Datum
333jsonb_to_tsvector(PG_FUNCTION_ARGS)
334{
335 Jsonb *jb = PG_GETARG_JSONB_P(0);
336 Jsonb *jbFlags = PG_GETARG_JSONB_P(1);
337 Oid cfgId;
338 TSVector result;
339 uint32 flags = parse_jsonb_index_flags(jbFlags);
340
341 cfgId = getTSCurrentConfig(true);
342 result = jsonb_to_tsvector_worker(cfgId, jb, flags);
343 PG_FREE_IF_COPY(jb, 0);
344 PG_FREE_IF_COPY(jbFlags, 1);
345
346 PG_RETURN_TSVECTOR(result);
347}
348
349/*
350 * Worker function for json(_string)_to_tsvector(_byid)
351 */
352static TSVector
353json_to_tsvector_worker(Oid cfgId, text *json, uint32 flags)
354{
355 TSVectorBuildState state;
356 ParsedText prs;
357
358 prs.words = NULL;
359 prs.curwords = 0;
360 state.prs = &prs;
361 state.cfgId = cfgId;
362
363 iterate_json_values(json, flags, &state, add_to_tsvector);
364
365 return make_tsvector(&prs);
366}
367
368Datum
369json_string_to_tsvector_byid(PG_FUNCTION_ARGS)
370{
371 Oid cfgId = PG_GETARG_OID(0);
372 text *json = PG_GETARG_TEXT_P(1);
373 TSVector result;
374
375 result = json_to_tsvector_worker(cfgId, json, jtiString);
376 PG_FREE_IF_COPY(json, 1);
377
378 PG_RETURN_TSVECTOR(result);
379}
380
381Datum
382json_string_to_tsvector(PG_FUNCTION_ARGS)
383{
384 text *json = PG_GETARG_TEXT_P(0);
385 Oid cfgId;
386 TSVector result;
387
388 cfgId = getTSCurrentConfig(true);
389 result = json_to_tsvector_worker(cfgId, json, jtiString);
390 PG_FREE_IF_COPY(json, 0);
391
392 PG_RETURN_TSVECTOR(result);
393}
394
395Datum
396json_to_tsvector_byid(PG_FUNCTION_ARGS)
397{
398 Oid cfgId = PG_GETARG_OID(0);
399 text *json = PG_GETARG_TEXT_P(1);
400 Jsonb *jbFlags = PG_GETARG_JSONB_P(2);
401 TSVector result;
402 uint32 flags = parse_jsonb_index_flags(jbFlags);
403
404 result = json_to_tsvector_worker(cfgId, json, flags);
405 PG_FREE_IF_COPY(json, 1);
406 PG_FREE_IF_COPY(jbFlags, 2);
407
408 PG_RETURN_TSVECTOR(result);
409}
410
411Datum
412json_to_tsvector(PG_FUNCTION_ARGS)
413{
414 text *json = PG_GETARG_TEXT_P(0);
415 Jsonb *jbFlags = PG_GETARG_JSONB_P(1);
416 Oid cfgId;
417 TSVector result;
418 uint32 flags = parse_jsonb_index_flags(jbFlags);
419
420 cfgId = getTSCurrentConfig(true);
421 result = json_to_tsvector_worker(cfgId, json, flags);
422 PG_FREE_IF_COPY(json, 0);
423 PG_FREE_IF_COPY(jbFlags, 1);
424
425 PG_RETURN_TSVECTOR(result);
426}
427
428/*
429 * Parse lexemes in an element of a json(b) value, add to TSVectorBuildState.
430 */
431static void
432add_to_tsvector(void *_state, char *elem_value, int elem_len)
433{
434 TSVectorBuildState *state = (TSVectorBuildState *) _state;
435 ParsedText *prs = state->prs;
436 int32 prevwords;
437
438 if (prs->words == NULL)
439 {
440 /*
441 * First time through: initialize words array to a reasonable size.
442 * (parsetext() will realloc it bigger as needed.)
443 */
444 prs->lenwords = 16;
445 prs->words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs->lenwords);
446 prs->curwords = 0;
447 prs->pos = 0;
448 }
449
450 prevwords = prs->curwords;
451
452 parsetext(state->cfgId, prs, elem_value, elem_len);
453
454 /*
455 * If we extracted any words from this JSON element, advance pos to create
456 * an artificial break between elements. This is because we don't want
457 * phrase searches to think that the last word in this element is adjacent
458 * to the first word in the next one.
459 */
460 if (prs->curwords > prevwords)
461 prs->pos += 1;
462}
463
464
465/*
466 * to_tsquery
467 */
468
469
470/*
471 * This function is used for morph parsing.
472 *
473 * The value is passed to parsetext which will call the right dictionary to
474 * lexize the word. If it turns out to be a stopword, we push a QI_VALSTOP
475 * to the stack.
476 *
477 * All words belonging to the same variant are pushed as an ANDed list,
478 * and different variants are ORed together.
479 */
480static void
481pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix)
482{
483 int32 count = 0;
484 ParsedText prs;
485 uint32 variant,
486 pos = 0,
487 cntvar = 0,
488 cntpos = 0,
489 cnt = 0;
490 MorphOpaque *data = (MorphOpaque *) DatumGetPointer(opaque);
491
492 prs.lenwords = 4;
493 prs.curwords = 0;
494 prs.pos = 0;
495 prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
496
497 parsetext(data->cfg_id, &prs, strval, lenval);
498
499 if (prs.curwords > 0)
500 {
501 while (count < prs.curwords)
502 {
503 /*
504 * Were any stop words removed? If so, fill empty positions with
505 * placeholders linked by an appropriate operator.
506 */
507 if (pos > 0 && pos + 1 < prs.words[count].pos.pos)
508 {
509 while (pos + 1 < prs.words[count].pos.pos)
510 {
511 /* put placeholders for each missing stop word */
512 pushStop(state);
513 if (cntpos)
514 pushOperator(state, data->qoperator, 1);
515 cntpos++;
516 pos++;
517 }
518 }
519
520 /* save current word's position */
521 pos = prs.words[count].pos.pos;
522
523 /* Go through all variants obtained from this token */
524 cntvar = 0;
525 while (count < prs.curwords && pos == prs.words[count].pos.pos)
526 {
527 variant = prs.words[count].nvariant;
528
529 /* Push all words belonging to the same variant */
530 cnt = 0;
531 while (count < prs.curwords &&
532 pos == prs.words[count].pos.pos &&
533 variant == prs.words[count].nvariant)
534 {
535 pushValue(state,
536 prs.words[count].word,
537 prs.words[count].len,
538 weight,
539 ((prs.words[count].flags & TSL_PREFIX) || prefix));
540 pfree(prs.words[count].word);
541 if (cnt)
542 pushOperator(state, OP_AND, 0);
543 cnt++;
544 count++;
545 }
546
547 if (cntvar)
548 pushOperator(state, OP_OR, 0);
549 cntvar++;
550 }
551
552 if (cntpos)
553 {
554 /* distance may be useful */
555 pushOperator(state, data->qoperator, 1);
556 }
557
558 cntpos++;
559 }
560
561 pfree(prs.words);
562
563 }
564 else
565 pushStop(state);
566}
567
568Datum
569to_tsquery_byid(PG_FUNCTION_ARGS)
570{
571 text *in = PG_GETARG_TEXT_PP(1);
572 TSQuery query;
573 MorphOpaque data;
574
575 data.cfg_id = PG_GETARG_OID(0);
576 data.qoperator = OP_AND;
577
578 query = parse_tsquery(text_to_cstring(in),
579 pushval_morph,
580 PointerGetDatum(&data),
581 0);
582
583 PG_RETURN_TSQUERY(query);
584}
585
586Datum
587to_tsquery(PG_FUNCTION_ARGS)
588{
589 text *in = PG_GETARG_TEXT_PP(0);
590 Oid cfgId;
591
592 cfgId = getTSCurrentConfig(true);
593 PG_RETURN_DATUM(DirectFunctionCall2(to_tsquery_byid,
594 ObjectIdGetDatum(cfgId),
595 PointerGetDatum(in)));
596}
597
598Datum
599plainto_tsquery_byid(PG_FUNCTION_ARGS)
600{
601 text *in = PG_GETARG_TEXT_PP(1);
602 TSQuery query;
603 MorphOpaque data;
604
605 data.cfg_id = PG_GETARG_OID(0);
606 data.qoperator = OP_AND;
607
608 query = parse_tsquery(text_to_cstring(in),
609 pushval_morph,
610 PointerGetDatum(&data),
611 P_TSQ_PLAIN);
612
613 PG_RETURN_POINTER(query);
614}
615
616Datum
617plainto_tsquery(PG_FUNCTION_ARGS)
618{
619 text *in = PG_GETARG_TEXT_PP(0);
620 Oid cfgId;
621
622 cfgId = getTSCurrentConfig(true);
623 PG_RETURN_DATUM(DirectFunctionCall2(plainto_tsquery_byid,
624 ObjectIdGetDatum(cfgId),
625 PointerGetDatum(in)));
626}
627
628
629Datum
630phraseto_tsquery_byid(PG_FUNCTION_ARGS)
631{
632 text *in = PG_GETARG_TEXT_PP(1);
633 TSQuery query;
634 MorphOpaque data;
635
636 data.cfg_id = PG_GETARG_OID(0);
637 data.qoperator = OP_PHRASE;
638
639 query = parse_tsquery(text_to_cstring(in),
640 pushval_morph,
641 PointerGetDatum(&data),
642 P_TSQ_PLAIN);
643
644 PG_RETURN_TSQUERY(query);
645}
646
647Datum
648phraseto_tsquery(PG_FUNCTION_ARGS)
649{
650 text *in = PG_GETARG_TEXT_PP(0);
651 Oid cfgId;
652
653 cfgId = getTSCurrentConfig(true);
654 PG_RETURN_DATUM(DirectFunctionCall2(phraseto_tsquery_byid,
655 ObjectIdGetDatum(cfgId),
656 PointerGetDatum(in)));
657}
658
659Datum
660websearch_to_tsquery_byid(PG_FUNCTION_ARGS)
661{
662 text *in = PG_GETARG_TEXT_PP(1);
663 MorphOpaque data;
664 TSQuery query = NULL;
665
666 data.cfg_id = PG_GETARG_OID(0);
667
668 data.qoperator = OP_AND;
669
670 query = parse_tsquery(text_to_cstring(in),
671 pushval_morph,
672 PointerGetDatum(&data),
673 P_TSQ_WEB);
674
675 PG_RETURN_TSQUERY(query);
676}
677
678Datum
679websearch_to_tsquery(PG_FUNCTION_ARGS)
680{
681 text *in = PG_GETARG_TEXT_PP(0);
682 Oid cfgId;
683
684 cfgId = getTSCurrentConfig(true);
685 PG_RETURN_DATUM(DirectFunctionCall2(websearch_to_tsquery_byid,
686 ObjectIdGetDatum(cfgId),
687 PointerGetDatum(in)));
688
689}
690