tsvector.c source code [PostgreSQL/src/backend/utils/adt/tsvector.c]

/*-------------------------------------------------------------------------
 *
 * tsvector.c
 *	  I/O functions for tsvector
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  src/backend/utils/adt/tsvector.c
 *
 *-------------------------------------------------------------------------
 */
14
15	#include "postgres.h"
16
17	#include "libpq/pqformat.h"
18	#include "tsearch/ts_locale.h"
19	#include "tsearch/ts_utils.h"
20	#include "utils/builtins.h"
21	#include "utils/memutils.h"
22
23	typedef struct
24	{
25	WordEntry entry; / must be first! /
26	WordEntryPos *pos;
27	int poslen; / number of elements in pos /
28	} WordEntryIN;
29
30
31	/ Compare two WordEntryPos values for qsort /
32	int
33	compareWordEntryPos(const void a, const* void *b)
34	{
35	int apos = WEP_GETPOS((const* WordEntryPos *) a);
36	int bpos = WEP_GETPOS((const* WordEntryPos *) b);
37
38	if (apos == bpos)
39	return `0`;
40	return (apos > bpos) ? `1` : -`1`;
41	}
42
43	/*
44	* Removes duplicate pos entries. If there's two entries with same pos
45	* but different weight, the higher weight is retained.
46	*
47	* Returns new length.
48	*/
49	static int
50	uniquePos(WordEntryPos a, int* l)
51	{
52	WordEntryPos *ptr,
53	*res;
54
55	if (l <= `1`)
56	return l;
57
58	qsort((void ) a, l, sizeof*(WordEntryPos), compareWordEntryPos);
59
60	res = a;
61	ptr = a + `1`;
62	while (ptr - a < l)
63	{
64	if (WEP_GETPOS(ptr) != WEP_GETPOS(res))
65	{
66	res++;
67	res = ptr;
68	if (res - a >= MAXNUMPOS - `1` \|\|
69	WEP_GETPOS(*res) == MAXENTRYPOS - `1`)
70	break;
71	}
72	else if (WEP_GETWEIGHT(ptr) > WEP_GETWEIGHT(res))
73	WEP_SETWEIGHT(res, WEP_GETWEIGHT(ptr));
74	ptr++;
75	}
76
77	return res + `1` - a;
78	}
79
80	/ Compare two WordEntryIN values for qsort /
81	static int
82	compareentry(const void va, const* void vb, void* *arg)
83	{
84	const WordEntryIN a = (const* WordEntryIN *) va;
85	const WordEntryIN b = (const* WordEntryIN *) vb;
86	char BufferStr = (char* *) arg;
87
88	return tsCompareString(&BufferStr[a->entry.pos], a->entry.len,
89	&BufferStr[b->entry.pos], b->entry.len,
90	false);
91	}
92
93	/*
94	* Sort an array of WordEntryIN, remove duplicates.
95	* *outbuflen receives the amount of space needed for strings and positions.
96	*/
97	static int
98	uniqueentry(WordEntryIN a, int* l, char buf, int* *outbuflen)
99	{
100	int buflen;
101	WordEntryIN *ptr,
102	*res;
103
104	Assert(l >= `1`);
105
106	if (l > `1`)
107	qsort_arg((void ) a, l, sizeof*(WordEntryIN), compareentry,
108	(void *) buf);
109
110	buflen = `0`;
111	res = a;
112	ptr = a + `1`;
113	while (ptr - a < l)
114	{
115	if (!(ptr->entry.len == res->entry.len &&
116	strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
117	res->entry.len) == `0`))
118	{
119	/ done accumulating data into res, count space needed /*
120	buflen += res->entry.len;
121	if (res->entry.haspos)
122	{
123	res->poslen = uniquePos(res->pos, res->poslen);
124	buflen = SHORTALIGN(buflen);
125	buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
126	}
127	res++;
128	if (res != ptr)
129	memcpy(res, ptr, sizeof(WordEntryIN));
130	}
131	else if (ptr->entry.haspos)
132	{
133	if (res->entry.haspos)
134	{
135	/ append ptr's positions to res's positions /
136	int newlen = ptr->poslen + res->poslen;
137
138	res->pos = (WordEntryPos *)
139	repalloc(res->pos, newlen * sizeof(WordEntryPos));
140	memcpy(&res->pos[res->poslen], ptr->pos,
141	ptr->poslen * sizeof(WordEntryPos));
142	res->poslen = newlen;
143	pfree(ptr->pos);
144	}
145	else
146	{
147	/ just give ptr's positions to pos /
148	res->entry.haspos = `1`;
149	res->pos = ptr->pos;
150	res->poslen = ptr->poslen;
151	}
152	}
153	ptr++;
154	}
155
156	/ count space needed for last item /
157	buflen += res->entry.len;
158	if (res->entry.haspos)
159	{
160	res->poslen = uniquePos(res->pos, res->poslen);
161	buflen = SHORTALIGN(buflen);
162	buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
163	}
164
165	*outbuflen = buflen;
166	return res + `1` - a;
167	}
168
169	static int
170	WordEntryCMP(WordEntry a, WordEntry b, char *buf)
171	{
172	return compareentry(a, b, buf);
173	}
174
175
176	Datum
177	tsvectorin(PG_FUNCTION_ARGS)
178	{
179	char *buf = PG_GETARG_CSTRING(`0`);
180	TSVectorParseState state;
181	WordEntryIN *arr;
182	int totallen;
183	int arrlen; / allocated size of arr /
184	WordEntry *inarr;
185	int len = `0`;
186	TSVector in;
187	int i;
188	char *token;
189	int toklen;
190	WordEntryPos *pos;
191	int poslen;
192	char *strbuf;
193	int stroff;
194
195	/*
196	* Tokens are appended to tmpbuf, cur is a pointer to the end of used
197	* space in tmpbuf.
198	*/
199	char *tmpbuf;
200	char *cur;
201	int buflen = `256`; / allocated size of tmpbuf /
202
203	state = init_tsvector_parser(buf, `0`);
204
205	arrlen = `64`;
206	arr = (WordEntryIN ) palloc(sizeof(WordEntryIN) arrlen);
207	cur = tmpbuf = (char *) palloc(buflen);
208
209	while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
210	{
211	if (toklen >= MAXSTRLEN)
212	ereport(ERROR,
213	(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
214	errmsg("word is too long (%ld bytes, max %ld bytes)",
215	(long) toklen,
216	(long) (MAXSTRLEN - `1`))));
217
218	if (cur - tmpbuf > MAXSTRPOS)
219	ereport(ERROR,
220	(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
221	errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)",
222	(long) (cur - tmpbuf), (long) MAXSTRPOS)));
223
224	/*
225	* Enlarge buffers if needed
226	*/
227	if (len >= arrlen)
228	{
229	arrlen *= `2`;
230	arr = (WordEntryIN *)
231	repalloc((void ) arr, sizeof(WordEntryIN) arrlen);
232	}
233	while ((cur - tmpbuf) + toklen >= buflen)
234	{
235	int dist = cur - tmpbuf;
236
237	buflen *= `2`;
238	tmpbuf = (char ) repalloc((void* *) tmpbuf, buflen);
239	cur = tmpbuf + dist;
240	}
241	arr[len].entry.len = toklen;
242	arr[len].entry.pos = cur - tmpbuf;
243	memcpy((void ) cur, (void* *) token, toklen);
244	cur += toklen;
245
246	if (poslen != `0`)
247	{
248	arr[len].entry.haspos = `1`;
249	arr[len].pos = pos;
250	arr[len].poslen = poslen;
251	}
252	else
253	{
254	arr[len].entry.haspos = `0`;
255	arr[len].pos = NULL;
256	arr[len].poslen = `0`;
257	}
258	len++;
259	}
260
261	close_tsvector_parser(state);
262
263	if (len > `0`)
264	len = uniqueentry(arr, len, tmpbuf, &buflen);
265	else
266	buflen = `0`;
267
268	if (buflen > MAXSTRPOS)
269	ereport(ERROR,
270	(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
271	errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS)));
272
273	totallen = CALCDATASIZE(len, buflen);
274	in = (TSVector) palloc0(totallen);
275	SET_VARSIZE(in, totallen);
276	in->size = len;
277	inarr = ARRPTR(in);
278	strbuf = STRPTR(in);
279	stroff = `0`;
280	for (i = `0`; i < len; i++)
281	{
282	memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
283	arr[i].entry.pos = stroff;
284	stroff += arr[i].entry.len;
285	if (arr[i].entry.haspos)
286	{
287	if (arr[i].poslen > `0xFFFF`)
288	elog(ERROR, "positions array too long");
289
290	/ Copy number of positions /
291	stroff = SHORTALIGN(stroff);
292	(uint16 ) (strbuf + stroff) = (uint16) arr[i].poslen;
293	stroff += sizeof(uint16);
294
295	/ Copy positions /
296	memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
297	stroff += arr[i].poslen * sizeof(WordEntryPos);
298
299	pfree(arr[i].pos);
300	}
301	inarr[i] = arr[i].entry;
302	}
303
304	Assert((strbuf + stroff - (char *) in) == totallen);
305
306	PG_RETURN_TSVECTOR(in);
307	}
308
309	Datum
310	tsvectorout(PG_FUNCTION_ARGS)
311	{
312	TSVector out = PG_GETARG_TSVECTOR(`0`);
313	char *outbuf;
314	int32 i,
315	lenbuf = `0`,
316	pp;
317	WordEntry *ptr = ARRPTR(out);
318	char *curbegin,
319	*curin,
320	*curout;
321
322	lenbuf = out->size * `2` / '' / + out->size - `1` / space / + `2` / \0 / ;
323	for (i = `0`; i < out->size; i++)
324	{
325	lenbuf += ptr[i].len * `2` * pg_database_encoding_max_length() / for escape / ;
326	if (ptr[i].haspos)
327	lenbuf += `1` / : / + `7` / int2 + , + weight / * POSDATALEN(out, &(ptr[i]));
328	}
329
330	curout = outbuf = (char *) palloc(lenbuf);
331	for (i = `0`; i < out->size; i++)
332	{
333	curbegin = curin = STRPTR(out) + ptr->pos;
334	if (i != `0`)
335	*curout++ = `' '`;
336	*curout++ = `'\''`;
337	while (curin - curbegin < ptr->len)
338	{
339	int len = pg_mblen(curin);
340
341	if (t_iseq(curin, `'\''`))
342	*curout++ = `'\''`;
343	else if (t_iseq(curin, `'\\'`))
344	*curout++ = `'\\'`;
345
346	while (len--)
347	curout++ = curin++;
348	}
349
350	*curout++ = `'\''`;
351	if ((pp = POSDATALEN(out, ptr)) != `0`)
352	{
353	WordEntryPos *wptr;
354
355	*curout++ = `':'`;
356	wptr = POSDATAPTR(out, ptr);
357	while (pp)
358	{
359	curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
360	switch (WEP_GETWEIGHT(*wptr))
361	{
362	case `3`:
363	*curout++ = `'A'`;
364	break;
365	case `2`:
366	*curout++ = `'B'`;
367	break;
368	case `1`:
369	*curout++ = `'C'`;
370	break;
371	case `0`:
372	default:
373	break;
374	}
375
376	if (pp > `1`)
377	*curout++ = `','`;
378	pp--;
379	wptr++;
380	}
381	}
382	ptr++;
383	}
384
385	*curout = `'\0'`;
386	PG_FREE_IF_COPY(out, `0`);
387	PG_RETURN_CSTRING(outbuf);
388	}
389
390	/*
391	* Binary Input / Output functions. The binary format is as follows:
392	*
393	* uint32 number of lexemes
394	*
395	* for each lexeme:
396	* lexeme text in client encoding, null-terminated
397	* uint16 number of positions
398	* for each position:
399	* uint16 WordEntryPos
400	*/
401
402	Datum
403	tsvectorsend(PG_FUNCTION_ARGS)
404	{
405	TSVector vec = PG_GETARG_TSVECTOR(`0`);
406	StringInfoData buf;
407	int i,
408	j;
409	WordEntry *weptr = ARRPTR(vec);
410
411	pq_begintypsend(&buf);
412
413	pq_sendint32(&buf, vec->size);
414	for (i = `0`; i < vec->size; i++)
415	{
416	uint16 npos;
417
418	/*
419	* the strings in the TSVector array are not null-terminated, so we
420	* have to send the null-terminator separately
421	*/
422	pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
423	pq_sendbyte(&buf, `'\0'`);
424
425	npos = POSDATALEN(vec, weptr);
426	pq_sendint16(&buf, npos);
427
428	if (npos > `0`)
429	{
430	WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
431
432	for (j = `0`; j < npos; j++)
433	pq_sendint16(&buf, wepptr[j]);
434	}
435	weptr++;
436	}
437
438	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
439	}
440
441	Datum
442	tsvectorrecv(PG_FUNCTION_ARGS)
443	{
444	StringInfo buf = (StringInfo) PG_GETARG_POINTER(`0`);
445	TSVector vec;
446	int i;
447	int32 nentries;
448	int datalen; / number of bytes used in the variable size*
449	* area after fixed size TSVector header and
450	* WordEntries */
451	Size hdrlen;
452	Size len; / allocated size of vec /
453	bool needSort = false;
454
455	nentries = pq_getmsgint(buf, sizeof(int32));
456	if (nentries < `0` \|\| nentries > (MaxAllocSize / sizeof(WordEntry)))
457	elog(ERROR, "invalid size of tsvector");
458
459	hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
460
461	len = hdrlen * `2`; / times two to make room for lexemes /
462	vec = (TSVector) palloc0(len);
463	vec->size = nentries;
464
465	datalen = `0`;
466	for (i = `0`; i < nentries; i++)
467	{
468	const char *lexeme;
469	uint16 npos;
470	size_t lex_len;
471
472	lexeme = pq_getmsgstring(buf);
473	npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
474
475	/ sanity checks /
476
477	lex_len = strlen(lexeme);
478	if (lex_len > MAXSTRLEN)
479	elog(ERROR, "invalid tsvector: lexeme too long");
480
481	if (datalen > MAXSTRPOS)
482	elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded");
483
484	if (npos > MAXNUMPOS)
485	elog(ERROR, "unexpected number of tsvector positions");
486
487	/*
488	* Looks valid. Fill the WordEntry struct, and copy lexeme.
489	*
490	* But make sure the buffer is large enough first.
491	*/
492	while (hdrlen + SHORTALIGN(datalen + lex_len) +
493	(npos + `1`) * sizeof(WordEntryPos) >= len)
494	{
495	len *= `2`;
496	vec = (TSVector) repalloc(vec, len);
497	}
498
499	vec->entries[i].haspos = (npos > `0`) ? `1` : `0`;
500	vec->entries[i].len = lex_len;
501	vec->entries[i].pos = datalen;
502
503	memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
504
505	datalen += lex_len;
506
507	if (i > `0` && WordEntryCMP(&vec->entries[i],
508	&vec->entries[i - `1`],
509	STRPTR(vec)) <= `0`)
510	needSort = true;
511
512	/ Receive positions /
513	if (npos > `0`)
514	{
515	uint16 j;
516	WordEntryPos *wepptr;
517
518	/*
519	* Pad to 2-byte alignment if necessary. Though we used palloc0
520	* for the initial allocation, subsequent repalloc'd memory areas
521	* are not initialized to zero.
522	*/
523	if (datalen != SHORTALIGN(datalen))
524	{
525	*(STRPTR(vec) + datalen) = `'\0'`;
526	datalen = SHORTALIGN(datalen);
527	}
528
529	memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
530
531	wepptr = POSDATAPTR(vec, &vec->entries[i]);
532	for (j = `0`; j < npos; j++)
533	{
534	wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
535	if (j > `0` && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - `1`]))
536	elog(ERROR, "position information is misordered");
537	}
538
539	datalen += (npos + `1`) * sizeof(WordEntry);
540	}
541	}
542
543	SET_VARSIZE(vec, hdrlen + datalen);
544
545	if (needSort)
546	qsort_arg((void ) ARRPTR(vec), vec->size, sizeof*(WordEntry),
547	compareentry, (void *) STRPTR(vec));
548
549	PG_RETURN_TSVECTOR(vec);
550	}
551

Browse the source code of PostgreSQL/src/backend/utils/adt/tsvector.c