1/*-------------------------------------------------------------------------
2 *
3 * ts_parse.c
4 * main parse functions for tsearch
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/tsearch/ts_parse.c
11 *
12 *-------------------------------------------------------------------------
13 */
14
15#include "postgres.h"
16
17#include "tsearch/ts_cache.h"
18#include "tsearch/ts_utils.h"
19
/*
 * When defined, parsetext() and hlparsetext() report a token whose length is
 * MAXSTRLEN or more with a NOTICE and skip it; when not defined they raise an
 * ERROR for such a token instead.
 */
#define IGNORE_LONGLEXEME 1
21
22/*
23 * Lexize subsystem
24 */
25
/*
 * One token as produced by the parser, kept as a node of a singly linked
 * list (see ListParsedLex).
 */
typedef struct ParsedLex
{
	int			type;			/* token type as returned by the parser's
								 * prstoken function; 0 is handled as the
								 * end-of-input marker by LexizeExec */
	char	   *lemm;			/* token text as handed out by the parser */
	int			lenlemm;		/* length of lemm */
	struct ParsedLex *next;		/* next node in the list, or NULL */
} ParsedLex;
33
/*
 * Singly linked list of ParsedLex nodes with O(1) append (LPLAddTail) and
 * O(1) removal from the front (LPLRemoveHead).  Both pointers are NULL when
 * the list is empty.
 */
typedef struct ListParsedLex
{
	ParsedLex  *head;			/* first node, or NULL */
	ParsedLex  *tail;			/* last node, or NULL */
} ListParsedLex;
39
/*
 * Working state of the lexize subsystem, carried across LexizeExec() calls.
 */
typedef struct
{
	TSConfigCacheEntry *cfg;	/* text search configuration in use */
	Oid			curDictId;		/* InvalidOid in the usual one-word mode,
								 * otherwise the dictionary that asked for
								 * the following words */
	int			posDict;		/* index in the token type's dictionary list
								 * at which the one-word mode starts */
	DictSubState dictState;		/* state block passed to each lexize call */
	ParsedLex  *curSub;			/* next lexeme to hand to curDictId */
	ListParsedLex towork;		/* current list to work */
	ListParsedLex waste;		/* list of lexemes that already lexized */

	/*
	 * Fields to store the last variant to lexize (used by a thesaurus or a
	 * similar dictionary that wants several lexemes).
	 */

	ParsedLex  *lastRes;		/* lexeme for which tmpRes was stored */
	TSLexeme   *tmpRes;			/* last result returned while the dictionary
								 * was still asking for more input */
} LexizeData;
58
59static void
60LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
61{
62 ld->cfg = cfg;
63 ld->curDictId = InvalidOid;
64 ld->posDict = 0;
65 ld->towork.head = ld->towork.tail = ld->curSub = NULL;
66 ld->waste.head = ld->waste.tail = NULL;
67 ld->lastRes = NULL;
68 ld->tmpRes = NULL;
69}
70
71static void
72LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
73{
74 if (list->tail)
75 {
76 list->tail->next = newpl;
77 list->tail = newpl;
78 }
79 else
80 list->head = list->tail = newpl;
81 newpl->next = NULL;
82}
83
84static ParsedLex *
85LPLRemoveHead(ListParsedLex *list)
86{
87 ParsedLex *res = list->head;
88
89 if (list->head)
90 list->head = list->head->next;
91
92 if (list->head == NULL)
93 list->tail = NULL;
94
95 return res;
96}
97
98static void
99LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
100{
101 ParsedLex *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
102
103 newpl->type = type;
104 newpl->lemm = lemm;
105 newpl->lenlemm = lenlemm;
106 LPLAddTail(&ld->towork, newpl);
107 ld->curSub = ld->towork.tail;
108}
109
110static void
111RemoveHead(LexizeData *ld)
112{
113 LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
114
115 ld->posDict = 0;
116}
117
118static void
119setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
120{
121 if (correspondLexem)
122 {
123 *correspondLexem = ld->waste.head;
124 }
125 else
126 {
127 ParsedLex *tmp,
128 *ptr = ld->waste.head;
129
130 while (ptr)
131 {
132 tmp = ptr->next;
133 pfree(ptr);
134 ptr = tmp;
135 }
136 }
137 ld->waste.head = ld->waste.tail = NULL;
138}
139
140static void
141moveToWaste(LexizeData *ld, ParsedLex *stop)
142{
143 bool go = true;
144
145 while (ld->towork.head && go)
146 {
147 if (ld->towork.head == stop)
148 {
149 ld->curSub = stop->next;
150 go = false;
151 }
152 RemoveHead(ld);
153 }
154}
155
156static void
157setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res)
158{
159 if (ld->tmpRes)
160 {
161 TSLexeme *ptr;
162
163 for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
164 pfree(ptr->lexeme);
165 pfree(ld->tmpRes);
166 }
167 ld->tmpRes = res;
168 ld->lastRes = lex;
169}
170
/*
 * Run the dictionaries over the queued lexemes and return the next result.
 *
 * Works in one of two modes, selected by ld->curDictId:
 *
 * - InvalidOid ("usual" mode): the head of the work list is offered to the
 *   dictionaries mapped to its token type, in order, starting at ld->posDict.
 * - a valid Oid (multi-word mode): that dictionary set getnext on an earlier
 *   call, so the following lexemes (ld->curSub onwards) are fed to it.
 *
 * Switching modes is done by updating ld and calling LexizeExec recursively.
 *
 * Returns the TSLexeme array produced by a dictionary, or NULL when nothing
 * more can be produced from the lexemes queued so far.  Before a result (or
 * NULL) is returned, setCorrLex() is called: the lexemes consumed so far are
 * passed back through *correspondLexem, or freed if correspondLexem is NULL.
 */
static TSLexeme *
LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
{
	int			i;
	ListDictionary *map;
	TSDictionaryCacheEntry *dict;
	TSLexeme   *res;

	if (ld->curDictId == InvalidOid)
	{
		/*
		 * usual mode: dictionary wants only one word, but we should keep in
		 * mind that we should go through all stack
		 */

		while (ld->towork.head)
		{
			ParsedLex  *curVal = ld->towork.head;
			char	   *curValLemm = curVal->lemm;
			int			curValLenLemm = curVal->lenlemm;

			/*
			 * NOTE(review): the pointer is formed before type is checked
			 * against lenmap; it is only dereferenced after that check.
			 */
			map = ld->cfg->map + curVal->type;

			if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
			{
				/* skip this type of lexeme */
				RemoveHead(ld);
				continue;
			}

			for (i = ld->posDict; i < map->len; i++)
			{
				dict = lookup_ts_dictionary_cache(map->dictIds[i]);

				/* every dictionary starts with a clean sub-state */
				ld->dictState.isend = ld->dictState.getnext = false;
				ld->dictState.private_state = NULL;
				res = (TSLexeme *) DatumGetPointer(FunctionCall4(
																 &(dict->lexize),
																 PointerGetDatum(dict->dictData),
																 PointerGetDatum(curValLemm),
																 Int32GetDatum(curValLenLemm),
																 PointerGetDatum(&ld->dictState)
																 ));

				if (ld->dictState.getnext)
				{
					/*
					 * dictionary wants next word, so setup and store current
					 * position and go to multiword mode
					 */

					ld->curDictId = DatumGetObjectId(map->dictIds[i]);
					/* on fallback, usual mode resumes after this dictionary */
					ld->posDict = i + 1;
					ld->curSub = curVal->next;
					if (res)
						setNewTmpRes(ld, curVal, res);
					return LexizeExec(ld, correspondLexem);
				}

				if (!res)		/* dictionary doesn't know this lexeme */
					continue;

				if (res->flags & TSL_FILTER)
				{
					/*
					 * filtering dictionary: pass its output, instead of the
					 * original token, to the remaining dictionaries
					 */
					curValLemm = res->lexeme;
					curValLenLemm = strlen(res->lexeme);
					continue;
				}

				/* lexeme recognized: consume it and hand back the result */
				RemoveHead(ld);
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/* no dictionary produced a result for this lexeme: drop it */
			RemoveHead(ld);
		}
	}
	else
	{							/* curDictId is valid */
		dict = lookup_ts_dictionary_cache(ld->curDictId);

		/*
		 * Dictionary ld->curDictId asks us about following words
		 */

		while (ld->curSub)
		{
			ParsedLex  *curVal = ld->curSub;

			/*
			 * NOTE(review): as in usual mode, the pointer is formed before
			 * the range check and only used after it.
			 */
			map = ld->cfg->map + curVal->type;

			if (curVal->type != 0)
			{
				bool		dictExists = false;

				if (curVal->type >= ld->cfg->lenmap || map->len == 0)
				{
					/* skip this type of lexeme */
					ld->curSub = curVal->next;
					continue;
				}

				/*
				 * We must be sure that the current lexeme type is handled by
				 * our dictionary: check whether the dictionary appears in the
				 * list of dictionaries mapped to this type.
				 */
				for (i = 0; i < map->len && !dictExists; i++)
					if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
						dictExists = true;

				if (!dictExists)
				{
					/*
					 * Dictionary can't work with the current type of lexeme,
					 * return to basic mode and redo all stored lexemes.
					 *
					 * NOTE(review): tmpRes and lastRes are not reset on this
					 * path -- confirm that is intended.
					 */
					ld->curDictId = InvalidOid;
					return LexizeExec(ld, correspondLexem);
				}
			}

			/* a lexeme of type 0 tells the dictionary that input has ended */
			ld->dictState.isend = (curVal->type == 0) ? true : false;
			ld->dictState.getnext = false;

			res = (TSLexeme *) DatumGetPointer(FunctionCall4(
															 &(dict->lexize),
															 PointerGetDatum(dict->dictData),
															 PointerGetDatum(curVal->lemm),
															 Int32GetDatum(curVal->lenlemm),
															 PointerGetDatum(&ld->dictState)
															 ));

			if (ld->dictState.getnext)
			{
				/* Dictionary wants one more */
				ld->curSub = curVal->next;
				if (res)
					setNewTmpRes(ld, curVal, res);
				continue;
			}

			if (res || ld->tmpRes)
			{
				/*
				 * Dictionary normalizes lexemes, so we remove from stack all
				 * used lexemes, return to basic mode and redo end of stack
				 * (if it exists)
				 */
				if (res)
				{
					/* fresh result: everything up to curSub was used */
					moveToWaste(ld, ld->curSub);
				}
				else
				{
					/* fall back to the stored result and its last lexeme */
					res = ld->tmpRes;
					moveToWaste(ld, ld->lastRes);
				}

				/* reset to initial state */
				ld->curDictId = InvalidOid;
				ld->posDict = 0;
				ld->lastRes = NULL;
				ld->tmpRes = NULL;
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/*
			 * Dictionary doesn't want the next lexeme and didn't recognize
			 * anything, so redo from ld->towork.head
			 */
			ld->curDictId = InvalidOid;
			return LexizeExec(ld, correspondLexem);
		}
	}

	/*
	 * Work list exhausted (usual mode) or no more lexemes to offer
	 * (multi-word mode; curDictId stays set for the next call).
	 */
	setCorrLex(ld, correspondLexem);
	return NULL;
}
351
352/*
353 * Parse string and lexize words.
354 *
355 * prs will be filled in.
356 */
357void
358parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
359{
360 int type,
361 lenlemm;
362 char *lemm = NULL;
363 LexizeData ldata;
364 TSLexeme *norms;
365 TSConfigCacheEntry *cfg;
366 TSParserCacheEntry *prsobj;
367 void *prsdata;
368
369 cfg = lookup_ts_config_cache(cfgId);
370 prsobj = lookup_ts_parser_cache(cfg->prsId);
371
372 prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
373 PointerGetDatum(buf),
374 Int32GetDatum(buflen)));
375
376 LexizeInit(&ldata, cfg);
377
378 do
379 {
380 type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
381 PointerGetDatum(prsdata),
382 PointerGetDatum(&lemm),
383 PointerGetDatum(&lenlemm)));
384
385 if (type > 0 && lenlemm >= MAXSTRLEN)
386 {
387#ifdef IGNORE_LONGLEXEME
388 ereport(NOTICE,
389 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
390 errmsg("word is too long to be indexed"),
391 errdetail("Words longer than %d characters are ignored.",
392 MAXSTRLEN)));
393 continue;
394#else
395 ereport(ERROR,
396 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
397 errmsg("word is too long to be indexed"),
398 errdetail("Words longer than %d characters are ignored.",
399 MAXSTRLEN)));
400#endif
401 }
402
403 LexizeAddLemm(&ldata, type, lemm, lenlemm);
404
405 while ((norms = LexizeExec(&ldata, NULL)) != NULL)
406 {
407 TSLexeme *ptr = norms;
408
409 prs->pos++; /* set pos */
410
411 while (ptr->lexeme)
412 {
413 if (prs->curwords == prs->lenwords)
414 {
415 prs->lenwords *= 2;
416 prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
417 }
418
419 if (ptr->flags & TSL_ADDPOS)
420 prs->pos++;
421 prs->words[prs->curwords].len = strlen(ptr->lexeme);
422 prs->words[prs->curwords].word = ptr->lexeme;
423 prs->words[prs->curwords].nvariant = ptr->nvariant;
424 prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX;
425 prs->words[prs->curwords].alen = 0;
426 prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
427 ptr++;
428 prs->curwords++;
429 }
430 pfree(norms);
431 }
432 } while (type > 0);
433
434 FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
435}
436
437/*
438 * Headline framework
439 */
440static void
441hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
442{
443 while (prs->curwords >= prs->lenwords)
444 {
445 prs->lenwords *= 2;
446 prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
447 }
448 memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
449 prs->words[prs->curwords].type = (uint8) type;
450 prs->words[prs->curwords].len = buflen;
451 prs->words[prs->curwords].word = palloc(buflen);
452 memcpy(prs->words[prs->curwords].word, buf, buflen);
453 prs->curwords++;
454}
455
456static void
457hlfinditem(HeadlineParsedText *prs, TSQuery query, int32 pos, char *buf, int buflen)
458{
459 int i;
460 QueryItem *item = GETQUERY(query);
461 HeadlineWordEntry *word;
462
463 while (prs->curwords + query->size >= prs->lenwords)
464 {
465 prs->lenwords *= 2;
466 prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
467 }
468
469 word = &(prs->words[prs->curwords - 1]);
470 word->pos = LIMITPOS(pos);
471 for (i = 0; i < query->size; i++)
472 {
473 if (item->type == QI_VAL &&
474 tsCompareString(GETOPERAND(query) + item->qoperand.distance, item->qoperand.length,
475 buf, buflen, item->qoperand.prefix) == 0)
476 {
477 if (word->item)
478 {
479 memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
480 prs->words[prs->curwords].item = &item->qoperand;
481 prs->words[prs->curwords].repeated = 1;
482 prs->curwords++;
483 }
484 else
485 word->item = &item->qoperand;
486 }
487 item++;
488 }
489}
490
491static void
492addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
493{
494 ParsedLex *tmplexs;
495 TSLexeme *ptr;
496 int32 savedpos;
497
498 while (lexs)
499 {
500 if (lexs->type > 0)
501 hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
502
503 ptr = norms;
504 savedpos = prs->vectorpos;
505 while (ptr && ptr->lexeme)
506 {
507 if (ptr->flags & TSL_ADDPOS)
508 savedpos++;
509 hlfinditem(prs, query, savedpos, ptr->lexeme, strlen(ptr->lexeme));
510 ptr++;
511 }
512
513 tmplexs = lexs->next;
514 pfree(lexs);
515 lexs = tmplexs;
516 }
517
518 if (norms)
519 {
520 ptr = norms;
521 while (ptr->lexeme)
522 {
523 if (ptr->flags & TSL_ADDPOS)
524 prs->vectorpos++;
525 pfree(ptr->lexeme);
526 ptr++;
527 }
528 pfree(norms);
529 }
530}
531
532void
533hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
534{
535 int type,
536 lenlemm;
537 char *lemm = NULL;
538 LexizeData ldata;
539 TSLexeme *norms;
540 ParsedLex *lexs;
541 TSConfigCacheEntry *cfg;
542 TSParserCacheEntry *prsobj;
543 void *prsdata;
544
545 cfg = lookup_ts_config_cache(cfgId);
546 prsobj = lookup_ts_parser_cache(cfg->prsId);
547
548 prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
549 PointerGetDatum(buf),
550 Int32GetDatum(buflen)));
551
552 LexizeInit(&ldata, cfg);
553
554 do
555 {
556 type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
557 PointerGetDatum(prsdata),
558 PointerGetDatum(&lemm),
559 PointerGetDatum(&lenlemm)));
560
561 if (type > 0 && lenlemm >= MAXSTRLEN)
562 {
563#ifdef IGNORE_LONGLEXEME
564 ereport(NOTICE,
565 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
566 errmsg("word is too long to be indexed"),
567 errdetail("Words longer than %d characters are ignored.",
568 MAXSTRLEN)));
569 continue;
570#else
571 ereport(ERROR,
572 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
573 errmsg("word is too long to be indexed"),
574 errdetail("Words longer than %d characters are ignored.",
575 MAXSTRLEN)));
576#endif
577 }
578
579 LexizeAddLemm(&ldata, type, lemm, lenlemm);
580
581 do
582 {
583 if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
584 {
585 prs->vectorpos++;
586 addHLParsedLex(prs, query, lexs, norms);
587 }
588 else
589 addHLParsedLex(prs, query, lexs, NULL);
590 } while (norms);
591
592 } while (type > 0);
593
594 FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
595}
596
597text *
598generateHeadline(HeadlineParsedText *prs)
599{
600 text *out;
601 char *ptr;
602 int len = 128;
603 int numfragments = 0;
604 int16 infrag = 0;
605
606 HeadlineWordEntry *wrd = prs->words;
607
608 out = (text *) palloc(len);
609 ptr = ((char *) out) + VARHDRSZ;
610
611 while (wrd - prs->words < prs->curwords)
612 {
613 while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
614 {
615 int dist = ptr - ((char *) out);
616
617 len *= 2;
618 out = (text *) repalloc(out, len);
619 ptr = ((char *) out) + dist;
620 }
621
622 if (wrd->in && !wrd->repeated)
623 {
624 if (!infrag)
625 {
626
627 /* start of a new fragment */
628 infrag = 1;
629 numfragments++;
630 /* add a fragment delimiter if this is after the first one */
631 if (numfragments > 1)
632 {
633 memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
634 ptr += prs->fragdelimlen;
635 }
636
637 }
638 if (wrd->replace)
639 {
640 *ptr = ' ';
641 ptr++;
642 }
643 else if (!wrd->skip)
644 {
645 if (wrd->selected)
646 {
647 memcpy(ptr, prs->startsel, prs->startsellen);
648 ptr += prs->startsellen;
649 }
650 memcpy(ptr, wrd->word, wrd->len);
651 ptr += wrd->len;
652 if (wrd->selected)
653 {
654 memcpy(ptr, prs->stopsel, prs->stopsellen);
655 ptr += prs->stopsellen;
656 }
657 }
658 }
659 else if (!wrd->repeated)
660 {
661 if (infrag)
662 infrag = 0;
663 pfree(wrd->word);
664 }
665
666 wrd++;
667 }
668
669 SET_VARSIZE(out, ptr - ((char *) out));
670 return out;
671}
672