1/*-------------------------------------------------------------------------
2 *
3 * tsvector.c
4 * I/O functions for tsvector
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/utils/adt/tsvector.c
11 *
12 *-------------------------------------------------------------------------
13 */
14
15#include "postgres.h"
16
17#include "libpq/pqformat.h"
18#include "tsearch/ts_locale.h"
19#include "tsearch/ts_utils.h"
20#include "utils/builtins.h"
21#include "utils/memutils.h"
22
23typedef struct
24{
25 WordEntry entry; /* must be first! */
26 WordEntryPos *pos;
27 int poslen; /* number of elements in pos */
28} WordEntryIN;
29
30
31/* Compare two WordEntryPos values for qsort */
32int
33compareWordEntryPos(const void *a, const void *b)
34{
35 int apos = WEP_GETPOS(*(const WordEntryPos *) a);
36 int bpos = WEP_GETPOS(*(const WordEntryPos *) b);
37
38 if (apos == bpos)
39 return 0;
40 return (apos > bpos) ? 1 : -1;
41}
42
43/*
44 * Removes duplicate pos entries. If there's two entries with same pos
45 * but different weight, the higher weight is retained.
46 *
47 * Returns new length.
48 */
49static int
50uniquePos(WordEntryPos *a, int l)
51{
52 WordEntryPos *ptr,
53 *res;
54
55 if (l <= 1)
56 return l;
57
58 qsort((void *) a, l, sizeof(WordEntryPos), compareWordEntryPos);
59
60 res = a;
61 ptr = a + 1;
62 while (ptr - a < l)
63 {
64 if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
65 {
66 res++;
67 *res = *ptr;
68 if (res - a >= MAXNUMPOS - 1 ||
69 WEP_GETPOS(*res) == MAXENTRYPOS - 1)
70 break;
71 }
72 else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
73 WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr));
74 ptr++;
75 }
76
77 return res + 1 - a;
78}
79
80/* Compare two WordEntryIN values for qsort */
81static int
82compareentry(const void *va, const void *vb, void *arg)
83{
84 const WordEntryIN *a = (const WordEntryIN *) va;
85 const WordEntryIN *b = (const WordEntryIN *) vb;
86 char *BufferStr = (char *) arg;
87
88 return tsCompareString(&BufferStr[a->entry.pos], a->entry.len,
89 &BufferStr[b->entry.pos], b->entry.len,
90 false);
91}
92
93/*
94 * Sort an array of WordEntryIN, remove duplicates.
95 * *outbuflen receives the amount of space needed for strings and positions.
96 */
97static int
98uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
99{
100 int buflen;
101 WordEntryIN *ptr,
102 *res;
103
104 Assert(l >= 1);
105
106 if (l > 1)
107 qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry,
108 (void *) buf);
109
110 buflen = 0;
111 res = a;
112 ptr = a + 1;
113 while (ptr - a < l)
114 {
115 if (!(ptr->entry.len == res->entry.len &&
116 strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
117 res->entry.len) == 0))
118 {
119 /* done accumulating data into *res, count space needed */
120 buflen += res->entry.len;
121 if (res->entry.haspos)
122 {
123 res->poslen = uniquePos(res->pos, res->poslen);
124 buflen = SHORTALIGN(buflen);
125 buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
126 }
127 res++;
128 if (res != ptr)
129 memcpy(res, ptr, sizeof(WordEntryIN));
130 }
131 else if (ptr->entry.haspos)
132 {
133 if (res->entry.haspos)
134 {
135 /* append ptr's positions to res's positions */
136 int newlen = ptr->poslen + res->poslen;
137
138 res->pos = (WordEntryPos *)
139 repalloc(res->pos, newlen * sizeof(WordEntryPos));
140 memcpy(&res->pos[res->poslen], ptr->pos,
141 ptr->poslen * sizeof(WordEntryPos));
142 res->poslen = newlen;
143 pfree(ptr->pos);
144 }
145 else
146 {
147 /* just give ptr's positions to pos */
148 res->entry.haspos = 1;
149 res->pos = ptr->pos;
150 res->poslen = ptr->poslen;
151 }
152 }
153 ptr++;
154 }
155
156 /* count space needed for last item */
157 buflen += res->entry.len;
158 if (res->entry.haspos)
159 {
160 res->poslen = uniquePos(res->pos, res->poslen);
161 buflen = SHORTALIGN(buflen);
162 buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
163 }
164
165 *outbuflen = buflen;
166 return res + 1 - a;
167}
168
169static int
170WordEntryCMP(WordEntry *a, WordEntry *b, char *buf)
171{
172 return compareentry(a, b, buf);
173}
174
175
176Datum
177tsvectorin(PG_FUNCTION_ARGS)
178{
179 char *buf = PG_GETARG_CSTRING(0);
180 TSVectorParseState state;
181 WordEntryIN *arr;
182 int totallen;
183 int arrlen; /* allocated size of arr */
184 WordEntry *inarr;
185 int len = 0;
186 TSVector in;
187 int i;
188 char *token;
189 int toklen;
190 WordEntryPos *pos;
191 int poslen;
192 char *strbuf;
193 int stroff;
194
195 /*
196 * Tokens are appended to tmpbuf, cur is a pointer to the end of used
197 * space in tmpbuf.
198 */
199 char *tmpbuf;
200 char *cur;
201 int buflen = 256; /* allocated size of tmpbuf */
202
203 state = init_tsvector_parser(buf, 0);
204
205 arrlen = 64;
206 arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
207 cur = tmpbuf = (char *) palloc(buflen);
208
209 while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
210 {
211 if (toklen >= MAXSTRLEN)
212 ereport(ERROR,
213 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
214 errmsg("word is too long (%ld bytes, max %ld bytes)",
215 (long) toklen,
216 (long) (MAXSTRLEN - 1))));
217
218 if (cur - tmpbuf > MAXSTRPOS)
219 ereport(ERROR,
220 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
221 errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)",
222 (long) (cur - tmpbuf), (long) MAXSTRPOS)));
223
224 /*
225 * Enlarge buffers if needed
226 */
227 if (len >= arrlen)
228 {
229 arrlen *= 2;
230 arr = (WordEntryIN *)
231 repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);
232 }
233 while ((cur - tmpbuf) + toklen >= buflen)
234 {
235 int dist = cur - tmpbuf;
236
237 buflen *= 2;
238 tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
239 cur = tmpbuf + dist;
240 }
241 arr[len].entry.len = toklen;
242 arr[len].entry.pos = cur - tmpbuf;
243 memcpy((void *) cur, (void *) token, toklen);
244 cur += toklen;
245
246 if (poslen != 0)
247 {
248 arr[len].entry.haspos = 1;
249 arr[len].pos = pos;
250 arr[len].poslen = poslen;
251 }
252 else
253 {
254 arr[len].entry.haspos = 0;
255 arr[len].pos = NULL;
256 arr[len].poslen = 0;
257 }
258 len++;
259 }
260
261 close_tsvector_parser(state);
262
263 if (len > 0)
264 len = uniqueentry(arr, len, tmpbuf, &buflen);
265 else
266 buflen = 0;
267
268 if (buflen > MAXSTRPOS)
269 ereport(ERROR,
270 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
271 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS)));
272
273 totallen = CALCDATASIZE(len, buflen);
274 in = (TSVector) palloc0(totallen);
275 SET_VARSIZE(in, totallen);
276 in->size = len;
277 inarr = ARRPTR(in);
278 strbuf = STRPTR(in);
279 stroff = 0;
280 for (i = 0; i < len; i++)
281 {
282 memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
283 arr[i].entry.pos = stroff;
284 stroff += arr[i].entry.len;
285 if (arr[i].entry.haspos)
286 {
287 if (arr[i].poslen > 0xFFFF)
288 elog(ERROR, "positions array too long");
289
290 /* Copy number of positions */
291 stroff = SHORTALIGN(stroff);
292 *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
293 stroff += sizeof(uint16);
294
295 /* Copy positions */
296 memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
297 stroff += arr[i].poslen * sizeof(WordEntryPos);
298
299 pfree(arr[i].pos);
300 }
301 inarr[i] = arr[i].entry;
302 }
303
304 Assert((strbuf + stroff - (char *) in) == totallen);
305
306 PG_RETURN_TSVECTOR(in);
307}
308
309Datum
310tsvectorout(PG_FUNCTION_ARGS)
311{
312 TSVector out = PG_GETARG_TSVECTOR(0);
313 char *outbuf;
314 int32 i,
315 lenbuf = 0,
316 pp;
317 WordEntry *ptr = ARRPTR(out);
318 char *curbegin,
319 *curin,
320 *curout;
321
322 lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
323 for (i = 0; i < out->size; i++)
324 {
325 lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
326 if (ptr[i].haspos)
327 lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
328 }
329
330 curout = outbuf = (char *) palloc(lenbuf);
331 for (i = 0; i < out->size; i++)
332 {
333 curbegin = curin = STRPTR(out) + ptr->pos;
334 if (i != 0)
335 *curout++ = ' ';
336 *curout++ = '\'';
337 while (curin - curbegin < ptr->len)
338 {
339 int len = pg_mblen(curin);
340
341 if (t_iseq(curin, '\''))
342 *curout++ = '\'';
343 else if (t_iseq(curin, '\\'))
344 *curout++ = '\\';
345
346 while (len--)
347 *curout++ = *curin++;
348 }
349
350 *curout++ = '\'';
351 if ((pp = POSDATALEN(out, ptr)) != 0)
352 {
353 WordEntryPos *wptr;
354
355 *curout++ = ':';
356 wptr = POSDATAPTR(out, ptr);
357 while (pp)
358 {
359 curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
360 switch (WEP_GETWEIGHT(*wptr))
361 {
362 case 3:
363 *curout++ = 'A';
364 break;
365 case 2:
366 *curout++ = 'B';
367 break;
368 case 1:
369 *curout++ = 'C';
370 break;
371 case 0:
372 default:
373 break;
374 }
375
376 if (pp > 1)
377 *curout++ = ',';
378 pp--;
379 wptr++;
380 }
381 }
382 ptr++;
383 }
384
385 *curout = '\0';
386 PG_FREE_IF_COPY(out, 0);
387 PG_RETURN_CSTRING(outbuf);
388}
389
390/*
391 * Binary Input / Output functions. The binary format is as follows:
392 *
393 * uint32 number of lexemes
394 *
395 * for each lexeme:
396 * lexeme text in client encoding, null-terminated
397 * uint16 number of positions
398 * for each position:
399 * uint16 WordEntryPos
400 */
401
402Datum
403tsvectorsend(PG_FUNCTION_ARGS)
404{
405 TSVector vec = PG_GETARG_TSVECTOR(0);
406 StringInfoData buf;
407 int i,
408 j;
409 WordEntry *weptr = ARRPTR(vec);
410
411 pq_begintypsend(&buf);
412
413 pq_sendint32(&buf, vec->size);
414 for (i = 0; i < vec->size; i++)
415 {
416 uint16 npos;
417
418 /*
419 * the strings in the TSVector array are not null-terminated, so we
420 * have to send the null-terminator separately
421 */
422 pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
423 pq_sendbyte(&buf, '\0');
424
425 npos = POSDATALEN(vec, weptr);
426 pq_sendint16(&buf, npos);
427
428 if (npos > 0)
429 {
430 WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
431
432 for (j = 0; j < npos; j++)
433 pq_sendint16(&buf, wepptr[j]);
434 }
435 weptr++;
436 }
437
438 PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
439}
440
441Datum
442tsvectorrecv(PG_FUNCTION_ARGS)
443{
444 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
445 TSVector vec;
446 int i;
447 int32 nentries;
448 int datalen; /* number of bytes used in the variable size
449 * area after fixed size TSVector header and
450 * WordEntries */
451 Size hdrlen;
452 Size len; /* allocated size of vec */
453 bool needSort = false;
454
455 nentries = pq_getmsgint(buf, sizeof(int32));
456 if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
457 elog(ERROR, "invalid size of tsvector");
458
459 hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
460
461 len = hdrlen * 2; /* times two to make room for lexemes */
462 vec = (TSVector) palloc0(len);
463 vec->size = nentries;
464
465 datalen = 0;
466 for (i = 0; i < nentries; i++)
467 {
468 const char *lexeme;
469 uint16 npos;
470 size_t lex_len;
471
472 lexeme = pq_getmsgstring(buf);
473 npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
474
475 /* sanity checks */
476
477 lex_len = strlen(lexeme);
478 if (lex_len > MAXSTRLEN)
479 elog(ERROR, "invalid tsvector: lexeme too long");
480
481 if (datalen > MAXSTRPOS)
482 elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded");
483
484 if (npos > MAXNUMPOS)
485 elog(ERROR, "unexpected number of tsvector positions");
486
487 /*
488 * Looks valid. Fill the WordEntry struct, and copy lexeme.
489 *
490 * But make sure the buffer is large enough first.
491 */
492 while (hdrlen + SHORTALIGN(datalen + lex_len) +
493 (npos + 1) * sizeof(WordEntryPos) >= len)
494 {
495 len *= 2;
496 vec = (TSVector) repalloc(vec, len);
497 }
498
499 vec->entries[i].haspos = (npos > 0) ? 1 : 0;
500 vec->entries[i].len = lex_len;
501 vec->entries[i].pos = datalen;
502
503 memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
504
505 datalen += lex_len;
506
507 if (i > 0 && WordEntryCMP(&vec->entries[i],
508 &vec->entries[i - 1],
509 STRPTR(vec)) <= 0)
510 needSort = true;
511
512 /* Receive positions */
513 if (npos > 0)
514 {
515 uint16 j;
516 WordEntryPos *wepptr;
517
518 /*
519 * Pad to 2-byte alignment if necessary. Though we used palloc0
520 * for the initial allocation, subsequent repalloc'd memory areas
521 * are not initialized to zero.
522 */
523 if (datalen != SHORTALIGN(datalen))
524 {
525 *(STRPTR(vec) + datalen) = '\0';
526 datalen = SHORTALIGN(datalen);
527 }
528
529 memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
530
531 wepptr = POSDATAPTR(vec, &vec->entries[i]);
532 for (j = 0; j < npos; j++)
533 {
534 wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
535 if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
536 elog(ERROR, "position information is misordered");
537 }
538
539 datalen += (npos + 1) * sizeof(WordEntry);
540 }
541 }
542
543 SET_VARSIZE(vec, hdrlen + datalen);
544
545 if (needSort)
546 qsort_arg((void *) ARRPTR(vec), vec->size, sizeof(WordEntry),
547 compareentry, (void *) STRPTR(vec));
548
549 PG_RETURN_TSVECTOR(vec);
550}
551