ts_parse.c source code [PostgreSQL/src/backend/tsearch/ts_parse.c]

/*-------------------------------------------------------------------------
 *
 * ts_parse.c
 *		main parse functions for tsearch
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  src/backend/tsearch/ts_parse.c
 *
 *-------------------------------------------------------------------------
 */
14
15	#include "postgres.h"
16
17	#include "tsearch/ts_cache.h"
18	#include "tsearch/ts_utils.h"
19
/*
 * When defined, parsetext() and hlparsetext() report a word of MAXSTRLEN or
 * more bytes with a NOTICE and skip it; when undefined they raise an ERROR
 * instead.
 */
#define IGNORE_LONGLEXEME	1
21
/*
 * Lexize subsystem
 */
25
/*
 * One token produced by the parser, kept as a node of a singly linked list.
 */
typedef struct ParsedLex
{
	int			type;			/* token type returned by the parser */
	char	   *lemm;			/* start of the token text */
	int			lenlemm;		/* length of the token text */
	struct ParsedLex *next;		/* next node in the list, or NULL */
} ParsedLex;
33
34	typedef struct ListParsedLex
35	{
36	ParsedLex *head;
37	ParsedLex *tail;
38	} ListParsedLex;
39
40	typedef struct
41	{
42	TSConfigCacheEntry *cfg;
43	Oid curDictId;
44	int posDict;
45	DictSubState dictState;
46	ParsedLex *curSub;
47	ListParsedLex towork; / current list to work /
48	ListParsedLex waste; / list of lexemes that already lexized /
49
50	/*
51	* fields to store last variant to lexize (basically, thesaurus or similar
52	* to, which wants several lexemes
53	*/
54
55	ParsedLex *lastRes;
56	TSLexeme *tmpRes;
57	} LexizeData;
58
59	static void
60	LexizeInit(LexizeData ld, TSConfigCacheEntry cfg)
61	{
62	ld->cfg = cfg;
63	ld->curDictId = InvalidOid;
64	ld->posDict = `0`;
65	ld->towork.head = ld->towork.tail = ld->curSub = NULL;
66	ld->waste.head = ld->waste.tail = NULL;
67	ld->lastRes = NULL;
68	ld->tmpRes = NULL;
69	}
70
71	static void
72	LPLAddTail(ListParsedLex list, ParsedLex newpl)
73	{
74	if (list->tail)
75	{
76	list->tail->next = newpl;
77	list->tail = newpl;
78	}
79	else
80	list->head = list->tail = newpl;
81	newpl->next = NULL;
82	}
83
84	static ParsedLex *
85	LPLRemoveHead(ListParsedLex *list)
86	{
87	ParsedLex *res = list->head;
88
89	if (list->head)
90	list->head = list->head->next;
91
92	if (list->head == NULL)
93	list->tail = NULL;
94
95	return res;
96	}
97
98	static void
99	LexizeAddLemm(LexizeData ld, int* type, char lemm, int* lenlemm)
100	{
101	ParsedLex newpl = (ParsedLex ) palloc(sizeof(ParsedLex));
102
103	newpl->type = type;
104	newpl->lemm = lemm;
105	newpl->lenlemm = lenlemm;
106	LPLAddTail(&ld->towork, newpl);
107	ld->curSub = ld->towork.tail;
108	}
109
110	static void
111	RemoveHead(LexizeData *ld)
112	{
113	LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
114
115	ld->posDict = `0`;
116	}
117
118	static void
119	setCorrLex(LexizeData ld, ParsedLex *correspondLexem)
120	{
121	if (correspondLexem)
122	{
123	*correspondLexem = ld->waste.head;
124	}
125	else
126	{
127	ParsedLex *tmp,
128	*ptr = ld->waste.head;
129
130	while (ptr)
131	{
132	tmp = ptr->next;
133	pfree(ptr);
134	ptr = tmp;
135	}
136	}
137	ld->waste.head = ld->waste.tail = NULL;
138	}
139
140	static void
141	moveToWaste(LexizeData ld, ParsedLex stop)
142	{
143	bool go = true;
144
145	while (ld->towork.head && go)
146	{
147	if (ld->towork.head == stop)
148	{
149	ld->curSub = stop->next;
150	go = false;
151	}
152	RemoveHead(ld);
153	}
154	}
155
156	static void
157	setNewTmpRes(LexizeData ld, ParsedLex lex, TSLexeme *res)
158	{
159	if (ld->tmpRes)
160	{
161	TSLexeme *ptr;
162
163	for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
164	pfree(ptr->lexeme);
165	pfree(ld->tmpRes);
166	}
167	ld->tmpRes = res;
168	ld->lastRes = lex;
169	}
170
171	static TSLexeme *
172	LexizeExec(LexizeData ld, ParsedLex *correspondLexem)
173	{
174	int i;
175	ListDictionary *map;
176	TSDictionaryCacheEntry *dict;
177	TSLexeme *res;
178
179	if (ld->curDictId == InvalidOid)
180	{
181	/*
182	* usual mode: dictionary wants only one word, but we should keep in
183	* mind that we should go through all stack
184	*/
185
186	while (ld->towork.head)
187	{
188	ParsedLex *curVal = ld->towork.head;
189	char *curValLemm = curVal->lemm;
190	int curValLenLemm = curVal->lenlemm;
191
192	map = ld->cfg->map + curVal->type;
193
194	if (curVal->type == `0` \|\| curVal->type >= ld->cfg->lenmap \|\| map->len == `0`)
195	{
196	/ skip this type of lexeme /
197	RemoveHead(ld);
198	continue;
199	}
200
201	for (i = ld->posDict; i < map->len; i++)
202	{
203	dict = lookup_ts_dictionary_cache(map->dictIds[i]);
204
205	ld->dictState.isend = ld->dictState.getnext = false;
206	ld->dictState.private_state = NULL;
207	res = (TSLexeme *) DatumGetPointer(FunctionCall4(
208	&(dict->lexize),
209	PointerGetDatum(dict->dictData),
210	PointerGetDatum(curValLemm),
211	Int32GetDatum(curValLenLemm),
212	PointerGetDatum(&ld->dictState)
213	));
214
215	if (ld->dictState.getnext)
216	{
217	/*
218	* dictionary wants next word, so setup and store current
219	* position and go to multiword mode
220	*/
221
222	ld->curDictId = DatumGetObjectId(map->dictIds[i]);
223	ld->posDict = i + `1`;
224	ld->curSub = curVal->next;
225	if (res)
226	setNewTmpRes(ld, curVal, res);
227	return LexizeExec(ld, correspondLexem);
228	}
229
230	if (!res) / dictionary doesn't know this lexeme /
231	continue;
232
233	if (res->flags & TSL_FILTER)
234	{
235	curValLemm = res->lexeme;
236	curValLenLemm = strlen(res->lexeme);
237	continue;
238	}
239
240	RemoveHead(ld);
241	setCorrLex(ld, correspondLexem);
242	return res;
243	}
244
245	RemoveHead(ld);
246	}
247	}
248	else
249	{ / curDictId is valid /
250	dict = lookup_ts_dictionary_cache(ld->curDictId);
251
252	/*
253	* Dictionary ld->curDictId asks us about following words
254	*/
255
256	while (ld->curSub)
257	{
258	ParsedLex *curVal = ld->curSub;
259
260	map = ld->cfg->map + curVal->type;
261
262	if (curVal->type != `0`)
263	{
264	bool dictExists = false;
265
266	if (curVal->type >= ld->cfg->lenmap \|\| map->len == `0`)
267	{
268	/ skip this type of lexeme /
269	ld->curSub = curVal->next;
270	continue;
271	}
272
273	/*
274	* We should be sure that current type of lexeme is recognized
275	* by our dictionary: we just check is it exist in list of
276	* dictionaries ?
277	*/
278	for (i = `0`; i < map->len && !dictExists; i++)
279	if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
280	dictExists = true;
281
282	if (!dictExists)
283	{
284	/*
285	* Dictionary can't work with current tpe of lexeme,
286	* return to basic mode and redo all stored lexemes
287	*/
288	ld->curDictId = InvalidOid;
289	return LexizeExec(ld, correspondLexem);
290	}
291	}
292
293	ld->dictState.isend = (curVal->type == `0`) ? true : false;
294	ld->dictState.getnext = false;
295
296	res = (TSLexeme *) DatumGetPointer(FunctionCall4(
297	&(dict->lexize),
298	PointerGetDatum(dict->dictData),
299	PointerGetDatum(curVal->lemm),
300	Int32GetDatum(curVal->lenlemm),
301	PointerGetDatum(&ld->dictState)
302	));
303
304	if (ld->dictState.getnext)
305	{
306	/ Dictionary wants one more /
307	ld->curSub = curVal->next;
308	if (res)
309	setNewTmpRes(ld, curVal, res);
310	continue;
311	}
312
313	if (res \|\| ld->tmpRes)
314	{
315	/*
316	* Dictionary normalizes lexemes, so we remove from stack all
317	* used lexemes, return to basic mode and redo end of stack
318	* (if it exists)
319	*/
320	if (res)
321	{
322	moveToWaste(ld, ld->curSub);
323	}
324	else
325	{
326	res = ld->tmpRes;
327	moveToWaste(ld, ld->lastRes);
328	}
329
330	/ reset to initial state /
331	ld->curDictId = InvalidOid;
332	ld->posDict = `0`;
333	ld->lastRes = NULL;
334	ld->tmpRes = NULL;
335	setCorrLex(ld, correspondLexem);
336	return res;
337	}
338
339	/*
340	* Dict don't want next lexem and didn't recognize anything, redo
341	* from ld->towork.head
342	*/
343	ld->curDictId = InvalidOid;
344	return LexizeExec(ld, correspondLexem);
345	}
346	}
347
348	setCorrLex(ld, correspondLexem);
349	return NULL;
350	}
351
352	/*
353	* Parse string and lexize words.
354	*
355	* prs will be filled in.
356	*/
357	void
358	parsetext(Oid cfgId, ParsedText prs, char* buf, int* buflen)
359	{
360	int type,
361	lenlemm;
362	char *lemm = NULL;
363	LexizeData ldata;
364	TSLexeme *norms;
365	TSConfigCacheEntry *cfg;
366	TSParserCacheEntry *prsobj;
367	void *prsdata;
368
369	cfg = lookup_ts_config_cache(cfgId);
370	prsobj = lookup_ts_parser_cache(cfg->prsId);
371
372	prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
373	PointerGetDatum(buf),
374	Int32GetDatum(buflen)));
375
376	LexizeInit(&ldata, cfg);
377
378	do
379	{
380	type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
381	PointerGetDatum(prsdata),
382	PointerGetDatum(&lemm),
383	PointerGetDatum(&lenlemm)));
384
385	if (type > `0` && lenlemm >= MAXSTRLEN)
386	{
387	#ifdef IGNORE_LONGLEXEME
388	ereport(NOTICE,
389	(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
390	errmsg("word is too long to be indexed"),
391	errdetail("Words longer than %d characters are ignored.",
392	MAXSTRLEN)));
393	continue;
394	#else
395	ereport(ERROR,
396	(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
397	errmsg("word is too long to be indexed"),
398	errdetail("Words longer than %d characters are ignored.",
399	MAXSTRLEN)));
400	#endif
401	}
402
403	LexizeAddLemm(&ldata, type, lemm, lenlemm);
404
405	while ((norms = LexizeExec(&ldata, NULL)) != NULL)
406	{
407	TSLexeme *ptr = norms;
408
409	prs->pos++; / set pos /
410
411	while (ptr->lexeme)
412	{
413	if (prs->curwords == prs->lenwords)
414	{
415	prs->lenwords *= `2`;
416	prs->words = (ParsedWord ) repalloc((void* ) prs->words, prs->lenwords sizeof(ParsedWord));
417	}
418
419	if (ptr->flags & TSL_ADDPOS)
420	prs->pos++;
421	prs->words[prs->curwords].len = strlen(ptr->lexeme);
422	prs->words[prs->curwords].word = ptr->lexeme;
423	prs->words[prs->curwords].nvariant = ptr->nvariant;
424	prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX;
425	prs->words[prs->curwords].alen = `0`;
426	prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
427	ptr++;
428	prs->curwords++;
429	}
430	pfree(norms);
431	}
432	} while (type > `0`);
433
434	FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
435	}
436
/*
 * Headline framework
 */
440	static void
441	hladdword(HeadlineParsedText prs, char* buf, int* buflen, int type)
442	{
443	while (prs->curwords >= prs->lenwords)
444	{
445	prs->lenwords *= `2`;
446	prs->words = (HeadlineWordEntry ) repalloc((void* ) prs->words, prs->lenwords sizeof(HeadlineWordEntry));
447	}
448	memset(&(prs->words[prs->curwords]), `0`, sizeof(HeadlineWordEntry));
449	prs->words[prs->curwords].type = (uint8) type;
450	prs->words[prs->curwords].len = buflen;
451	prs->words[prs->curwords].word = palloc(buflen);
452	memcpy(prs->words[prs->curwords].word, buf, buflen);
453	prs->curwords++;
454	}
455
456	static void
457	hlfinditem(HeadlineParsedText prs, TSQuery query, int32 pos, char* buf, int* buflen)
458	{
459	int i;
460	QueryItem *item = GETQUERY(query);
461	HeadlineWordEntry *word;
462
463	while (prs->curwords + query->size >= prs->lenwords)
464	{
465	prs->lenwords *= `2`;
466	prs->words = (HeadlineWordEntry ) repalloc((void* ) prs->words, prs->lenwords sizeof(HeadlineWordEntry));
467	}
468
469	word = &(prs->words[prs->curwords - `1`]);
470	word->pos = LIMITPOS(pos);
471	for (i = `0`; i < query->size; i++)
472	{
473	if (item->type == QI_VAL &&
474	tsCompareString(GETOPERAND(query) + item->qoperand.distance, item->qoperand.length,
475	buf, buflen, item->qoperand.prefix) == `0`)
476	{
477	if (word->item)
478	{
479	memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
480	prs->words[prs->curwords].item = &item->qoperand;
481	prs->words[prs->curwords].repeated = `1`;
482	prs->curwords++;
483	}
484	else
485	word->item = &item->qoperand;
486	}
487	item++;
488	}
489	}
490
491	static void
492	addHLParsedLex(HeadlineParsedText prs, TSQuery query, ParsedLex lexs, TSLexeme *norms)
493	{
494	ParsedLex *tmplexs;
495	TSLexeme *ptr;
496	int32 savedpos;
497
498	while (lexs)
499	{
500	if (lexs->type > `0`)
501	hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
502
503	ptr = norms;
504	savedpos = prs->vectorpos;
505	while (ptr && ptr->lexeme)
506	{
507	if (ptr->flags & TSL_ADDPOS)
508	savedpos++;
509	hlfinditem(prs, query, savedpos, ptr->lexeme, strlen(ptr->lexeme));
510	ptr++;
511	}
512
513	tmplexs = lexs->next;
514	pfree(lexs);
515	lexs = tmplexs;
516	}
517
518	if (norms)
519	{
520	ptr = norms;
521	while (ptr->lexeme)
522	{
523	if (ptr->flags & TSL_ADDPOS)
524	prs->vectorpos++;
525	pfree(ptr->lexeme);
526	ptr++;
527	}
528	pfree(norms);
529	}
530	}
531
532	void
533	hlparsetext(Oid cfgId, HeadlineParsedText prs, TSQuery query, char* buf, int* buflen)
534	{
535	int type,
536	lenlemm;
537	char *lemm = NULL;
538	LexizeData ldata;
539	TSLexeme *norms;
540	ParsedLex *lexs;
541	TSConfigCacheEntry *cfg;
542	TSParserCacheEntry *prsobj;
543	void *prsdata;
544
545	cfg = lookup_ts_config_cache(cfgId);
546	prsobj = lookup_ts_parser_cache(cfg->prsId);
547
548	prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
549	PointerGetDatum(buf),
550	Int32GetDatum(buflen)));
551
552	LexizeInit(&ldata, cfg);
553
554	do
555	{
556	type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
557	PointerGetDatum(prsdata),
558	PointerGetDatum(&lemm),
559	PointerGetDatum(&lenlemm)));
560
561	if (type > `0` && lenlemm >= MAXSTRLEN)
562	{
563	#ifdef IGNORE_LONGLEXEME
564	ereport(NOTICE,
565	(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
566	errmsg("word is too long to be indexed"),
567	errdetail("Words longer than %d characters are ignored.",
568	MAXSTRLEN)));
569	continue;
570	#else
571	ereport(ERROR,
572	(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
573	errmsg("word is too long to be indexed"),
574	errdetail("Words longer than %d characters are ignored.",
575	MAXSTRLEN)));
576	#endif
577	}
578
579	LexizeAddLemm(&ldata, type, lemm, lenlemm);
580
581	do
582	{
583	if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
584	{
585	prs->vectorpos++;
586	addHLParsedLex(prs, query, lexs, norms);
587	}
588	else
589	addHLParsedLex(prs, query, lexs, NULL);
590	} while (norms);
591
592	} while (type > `0`);
593
594	FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
595	}
596
597	text *
598	generateHeadline(HeadlineParsedText *prs)
599	{
600	text *out;
601	char *ptr;
602	int len = `128`;
603	int numfragments = `0`;
604	int16 infrag = `0`;
605
606	HeadlineWordEntry *wrd = prs->words;
607
608	out = (text *) palloc(len);
609	ptr = ((char *) out) + VARHDRSZ;
610
611	while (wrd - prs->words < prs->curwords)
612	{
613	while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
614	{
615	int dist = ptr - ((char *) out);
616
617	len *= `2`;
618	out = (text *) repalloc(out, len);
619	ptr = ((char *) out) + dist;
620	}
621
622	if (wrd->in && !wrd->repeated)
623	{
624	if (!infrag)
625	{
626
627	/ start of a new fragment /
628	infrag = `1`;
629	numfragments++;
630	/ add a fragment delimiter if this is after the first one /
631	if (numfragments > `1`)
632	{
633	memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
634	ptr += prs->fragdelimlen;
635	}
636
637	}
638	if (wrd->replace)
639	{
640	*ptr = `' '`;
641	ptr++;
642	}
643	else if (!wrd->skip)
644	{
645	if (wrd->selected)
646	{
647	memcpy(ptr, prs->startsel, prs->startsellen);
648	ptr += prs->startsellen;
649	}
650	memcpy(ptr, wrd->word, wrd->len);
651	ptr += wrd->len;
652	if (wrd->selected)
653	{
654	memcpy(ptr, prs->stopsel, prs->stopsellen);
655	ptr += prs->stopsellen;
656	}
657	}
658	}
659	else if (!wrd->repeated)
660	{
661	if (infrag)
662	infrag = `0`;
663	pfree(wrd->word);
664	}
665
666	wrd++;
667	}
668
669	SET_VARSIZE(out, ptr - ((char *) out));
670	return out;
671	}
672

Browse the source code of PostgreSQL/src/backend/tsearch/ts_parse.c