1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * tsvector.c |
4 | * I/O functions for tsvector |
5 | * |
6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
7 | * |
8 | * |
9 | * IDENTIFICATION |
10 | * src/backend/utils/adt/tsvector.c |
11 | * |
12 | *------------------------------------------------------------------------- |
13 | */ |
14 | |
15 | #include "postgres.h" |
16 | |
17 | #include "libpq/pqformat.h" |
18 | #include "tsearch/ts_locale.h" |
19 | #include "tsearch/ts_utils.h" |
20 | #include "utils/builtins.h" |
21 | #include "utils/memutils.h" |
22 | |
23 | typedef struct |
24 | { |
25 | WordEntry entry; /* must be first! */ |
26 | WordEntryPos *pos; |
27 | int poslen; /* number of elements in pos */ |
28 | } WordEntryIN; |
29 | |
30 | |
31 | /* Compare two WordEntryPos values for qsort */ |
32 | int |
33 | compareWordEntryPos(const void *a, const void *b) |
34 | { |
35 | int apos = WEP_GETPOS(*(const WordEntryPos *) a); |
36 | int bpos = WEP_GETPOS(*(const WordEntryPos *) b); |
37 | |
38 | if (apos == bpos) |
39 | return 0; |
40 | return (apos > bpos) ? 1 : -1; |
41 | } |
42 | |
43 | /* |
44 | * Removes duplicate pos entries. If there's two entries with same pos |
45 | * but different weight, the higher weight is retained. |
46 | * |
47 | * Returns new length. |
48 | */ |
49 | static int |
50 | uniquePos(WordEntryPos *a, int l) |
51 | { |
52 | WordEntryPos *ptr, |
53 | *res; |
54 | |
55 | if (l <= 1) |
56 | return l; |
57 | |
58 | qsort((void *) a, l, sizeof(WordEntryPos), compareWordEntryPos); |
59 | |
60 | res = a; |
61 | ptr = a + 1; |
62 | while (ptr - a < l) |
63 | { |
64 | if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res)) |
65 | { |
66 | res++; |
67 | *res = *ptr; |
68 | if (res - a >= MAXNUMPOS - 1 || |
69 | WEP_GETPOS(*res) == MAXENTRYPOS - 1) |
70 | break; |
71 | } |
72 | else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res)) |
73 | WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr)); |
74 | ptr++; |
75 | } |
76 | |
77 | return res + 1 - a; |
78 | } |
79 | |
80 | /* Compare two WordEntryIN values for qsort */ |
81 | static int |
82 | compareentry(const void *va, const void *vb, void *arg) |
83 | { |
84 | const WordEntryIN *a = (const WordEntryIN *) va; |
85 | const WordEntryIN *b = (const WordEntryIN *) vb; |
86 | char *BufferStr = (char *) arg; |
87 | |
88 | return tsCompareString(&BufferStr[a->entry.pos], a->entry.len, |
89 | &BufferStr[b->entry.pos], b->entry.len, |
90 | false); |
91 | } |
92 | |
93 | /* |
94 | * Sort an array of WordEntryIN, remove duplicates. |
95 | * *outbuflen receives the amount of space needed for strings and positions. |
96 | */ |
97 | static int |
98 | uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen) |
99 | { |
100 | int buflen; |
101 | WordEntryIN *ptr, |
102 | *res; |
103 | |
104 | Assert(l >= 1); |
105 | |
106 | if (l > 1) |
107 | qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry, |
108 | (void *) buf); |
109 | |
110 | buflen = 0; |
111 | res = a; |
112 | ptr = a + 1; |
113 | while (ptr - a < l) |
114 | { |
115 | if (!(ptr->entry.len == res->entry.len && |
116 | strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos], |
117 | res->entry.len) == 0)) |
118 | { |
119 | /* done accumulating data into *res, count space needed */ |
120 | buflen += res->entry.len; |
121 | if (res->entry.haspos) |
122 | { |
123 | res->poslen = uniquePos(res->pos, res->poslen); |
124 | buflen = SHORTALIGN(buflen); |
125 | buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16); |
126 | } |
127 | res++; |
128 | if (res != ptr) |
129 | memcpy(res, ptr, sizeof(WordEntryIN)); |
130 | } |
131 | else if (ptr->entry.haspos) |
132 | { |
133 | if (res->entry.haspos) |
134 | { |
135 | /* append ptr's positions to res's positions */ |
136 | int newlen = ptr->poslen + res->poslen; |
137 | |
138 | res->pos = (WordEntryPos *) |
139 | repalloc(res->pos, newlen * sizeof(WordEntryPos)); |
140 | memcpy(&res->pos[res->poslen], ptr->pos, |
141 | ptr->poslen * sizeof(WordEntryPos)); |
142 | res->poslen = newlen; |
143 | pfree(ptr->pos); |
144 | } |
145 | else |
146 | { |
147 | /* just give ptr's positions to pos */ |
148 | res->entry.haspos = 1; |
149 | res->pos = ptr->pos; |
150 | res->poslen = ptr->poslen; |
151 | } |
152 | } |
153 | ptr++; |
154 | } |
155 | |
156 | /* count space needed for last item */ |
157 | buflen += res->entry.len; |
158 | if (res->entry.haspos) |
159 | { |
160 | res->poslen = uniquePos(res->pos, res->poslen); |
161 | buflen = SHORTALIGN(buflen); |
162 | buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16); |
163 | } |
164 | |
165 | *outbuflen = buflen; |
166 | return res + 1 - a; |
167 | } |
168 | |
169 | static int |
170 | WordEntryCMP(WordEntry *a, WordEntry *b, char *buf) |
171 | { |
172 | return compareentry(a, b, buf); |
173 | } |
174 | |
175 | |
176 | Datum |
177 | tsvectorin(PG_FUNCTION_ARGS) |
178 | { |
179 | char *buf = PG_GETARG_CSTRING(0); |
180 | TSVectorParseState state; |
181 | WordEntryIN *arr; |
182 | int totallen; |
183 | int arrlen; /* allocated size of arr */ |
184 | WordEntry *inarr; |
185 | int len = 0; |
186 | TSVector in; |
187 | int i; |
188 | char *token; |
189 | int toklen; |
190 | WordEntryPos *pos; |
191 | int poslen; |
192 | char *strbuf; |
193 | int stroff; |
194 | |
195 | /* |
196 | * Tokens are appended to tmpbuf, cur is a pointer to the end of used |
197 | * space in tmpbuf. |
198 | */ |
199 | char *tmpbuf; |
200 | char *cur; |
201 | int buflen = 256; /* allocated size of tmpbuf */ |
202 | |
203 | state = init_tsvector_parser(buf, 0); |
204 | |
205 | arrlen = 64; |
206 | arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen); |
207 | cur = tmpbuf = (char *) palloc(buflen); |
208 | |
209 | while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL)) |
210 | { |
211 | if (toklen >= MAXSTRLEN) |
212 | ereport(ERROR, |
213 | (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
214 | errmsg("word is too long (%ld bytes, max %ld bytes)" , |
215 | (long) toklen, |
216 | (long) (MAXSTRLEN - 1)))); |
217 | |
218 | if (cur - tmpbuf > MAXSTRPOS) |
219 | ereport(ERROR, |
220 | (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
221 | errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)" , |
222 | (long) (cur - tmpbuf), (long) MAXSTRPOS))); |
223 | |
224 | /* |
225 | * Enlarge buffers if needed |
226 | */ |
227 | if (len >= arrlen) |
228 | { |
229 | arrlen *= 2; |
230 | arr = (WordEntryIN *) |
231 | repalloc((void *) arr, sizeof(WordEntryIN) * arrlen); |
232 | } |
233 | while ((cur - tmpbuf) + toklen >= buflen) |
234 | { |
235 | int dist = cur - tmpbuf; |
236 | |
237 | buflen *= 2; |
238 | tmpbuf = (char *) repalloc((void *) tmpbuf, buflen); |
239 | cur = tmpbuf + dist; |
240 | } |
241 | arr[len].entry.len = toklen; |
242 | arr[len].entry.pos = cur - tmpbuf; |
243 | memcpy((void *) cur, (void *) token, toklen); |
244 | cur += toklen; |
245 | |
246 | if (poslen != 0) |
247 | { |
248 | arr[len].entry.haspos = 1; |
249 | arr[len].pos = pos; |
250 | arr[len].poslen = poslen; |
251 | } |
252 | else |
253 | { |
254 | arr[len].entry.haspos = 0; |
255 | arr[len].pos = NULL; |
256 | arr[len].poslen = 0; |
257 | } |
258 | len++; |
259 | } |
260 | |
261 | close_tsvector_parser(state); |
262 | |
263 | if (len > 0) |
264 | len = uniqueentry(arr, len, tmpbuf, &buflen); |
265 | else |
266 | buflen = 0; |
267 | |
268 | if (buflen > MAXSTRPOS) |
269 | ereport(ERROR, |
270 | (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
271 | errmsg("string is too long for tsvector (%d bytes, max %d bytes)" , buflen, MAXSTRPOS))); |
272 | |
273 | totallen = CALCDATASIZE(len, buflen); |
274 | in = (TSVector) palloc0(totallen); |
275 | SET_VARSIZE(in, totallen); |
276 | in->size = len; |
277 | inarr = ARRPTR(in); |
278 | strbuf = STRPTR(in); |
279 | stroff = 0; |
280 | for (i = 0; i < len; i++) |
281 | { |
282 | memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len); |
283 | arr[i].entry.pos = stroff; |
284 | stroff += arr[i].entry.len; |
285 | if (arr[i].entry.haspos) |
286 | { |
287 | if (arr[i].poslen > 0xFFFF) |
288 | elog(ERROR, "positions array too long" ); |
289 | |
290 | /* Copy number of positions */ |
291 | stroff = SHORTALIGN(stroff); |
292 | *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen; |
293 | stroff += sizeof(uint16); |
294 | |
295 | /* Copy positions */ |
296 | memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos)); |
297 | stroff += arr[i].poslen * sizeof(WordEntryPos); |
298 | |
299 | pfree(arr[i].pos); |
300 | } |
301 | inarr[i] = arr[i].entry; |
302 | } |
303 | |
304 | Assert((strbuf + stroff - (char *) in) == totallen); |
305 | |
306 | PG_RETURN_TSVECTOR(in); |
307 | } |
308 | |
309 | Datum |
310 | tsvectorout(PG_FUNCTION_ARGS) |
311 | { |
312 | TSVector out = PG_GETARG_TSVECTOR(0); |
313 | char *outbuf; |
314 | int32 i, |
315 | lenbuf = 0, |
316 | pp; |
317 | WordEntry *ptr = ARRPTR(out); |
318 | char *curbegin, |
319 | *curin, |
320 | *curout; |
321 | |
322 | lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ; |
323 | for (i = 0; i < out->size; i++) |
324 | { |
325 | lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ; |
326 | if (ptr[i].haspos) |
327 | lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i])); |
328 | } |
329 | |
330 | curout = outbuf = (char *) palloc(lenbuf); |
331 | for (i = 0; i < out->size; i++) |
332 | { |
333 | curbegin = curin = STRPTR(out) + ptr->pos; |
334 | if (i != 0) |
335 | *curout++ = ' '; |
336 | *curout++ = '\''; |
337 | while (curin - curbegin < ptr->len) |
338 | { |
339 | int len = pg_mblen(curin); |
340 | |
341 | if (t_iseq(curin, '\'')) |
342 | *curout++ = '\''; |
343 | else if (t_iseq(curin, '\\')) |
344 | *curout++ = '\\'; |
345 | |
346 | while (len--) |
347 | *curout++ = *curin++; |
348 | } |
349 | |
350 | *curout++ = '\''; |
351 | if ((pp = POSDATALEN(out, ptr)) != 0) |
352 | { |
353 | WordEntryPos *wptr; |
354 | |
355 | *curout++ = ':'; |
356 | wptr = POSDATAPTR(out, ptr); |
357 | while (pp) |
358 | { |
359 | curout += sprintf(curout, "%d" , WEP_GETPOS(*wptr)); |
360 | switch (WEP_GETWEIGHT(*wptr)) |
361 | { |
362 | case 3: |
363 | *curout++ = 'A'; |
364 | break; |
365 | case 2: |
366 | *curout++ = 'B'; |
367 | break; |
368 | case 1: |
369 | *curout++ = 'C'; |
370 | break; |
371 | case 0: |
372 | default: |
373 | break; |
374 | } |
375 | |
376 | if (pp > 1) |
377 | *curout++ = ','; |
378 | pp--; |
379 | wptr++; |
380 | } |
381 | } |
382 | ptr++; |
383 | } |
384 | |
385 | *curout = '\0'; |
386 | PG_FREE_IF_COPY(out, 0); |
387 | PG_RETURN_CSTRING(outbuf); |
388 | } |
389 | |
390 | /* |
391 | * Binary Input / Output functions. The binary format is as follows: |
392 | * |
393 | * uint32 number of lexemes |
394 | * |
395 | * for each lexeme: |
396 | * lexeme text in client encoding, null-terminated |
397 | * uint16 number of positions |
398 | * for each position: |
399 | * uint16 WordEntryPos |
400 | */ |
401 | |
402 | Datum |
403 | tsvectorsend(PG_FUNCTION_ARGS) |
404 | { |
405 | TSVector vec = PG_GETARG_TSVECTOR(0); |
406 | StringInfoData buf; |
407 | int i, |
408 | j; |
409 | WordEntry *weptr = ARRPTR(vec); |
410 | |
411 | pq_begintypsend(&buf); |
412 | |
413 | pq_sendint32(&buf, vec->size); |
414 | for (i = 0; i < vec->size; i++) |
415 | { |
416 | uint16 npos; |
417 | |
418 | /* |
419 | * the strings in the TSVector array are not null-terminated, so we |
420 | * have to send the null-terminator separately |
421 | */ |
422 | pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len); |
423 | pq_sendbyte(&buf, '\0'); |
424 | |
425 | npos = POSDATALEN(vec, weptr); |
426 | pq_sendint16(&buf, npos); |
427 | |
428 | if (npos > 0) |
429 | { |
430 | WordEntryPos *wepptr = POSDATAPTR(vec, weptr); |
431 | |
432 | for (j = 0; j < npos; j++) |
433 | pq_sendint16(&buf, wepptr[j]); |
434 | } |
435 | weptr++; |
436 | } |
437 | |
438 | PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); |
439 | } |
440 | |
441 | Datum |
442 | tsvectorrecv(PG_FUNCTION_ARGS) |
443 | { |
444 | StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); |
445 | TSVector vec; |
446 | int i; |
447 | int32 nentries; |
448 | int datalen; /* number of bytes used in the variable size |
449 | * area after fixed size TSVector header and |
450 | * WordEntries */ |
451 | Size hdrlen; |
452 | Size len; /* allocated size of vec */ |
453 | bool needSort = false; |
454 | |
455 | nentries = pq_getmsgint(buf, sizeof(int32)); |
456 | if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry))) |
457 | elog(ERROR, "invalid size of tsvector" ); |
458 | |
459 | hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries; |
460 | |
461 | len = hdrlen * 2; /* times two to make room for lexemes */ |
462 | vec = (TSVector) palloc0(len); |
463 | vec->size = nentries; |
464 | |
465 | datalen = 0; |
466 | for (i = 0; i < nentries; i++) |
467 | { |
468 | const char *lexeme; |
469 | uint16 npos; |
470 | size_t lex_len; |
471 | |
472 | lexeme = pq_getmsgstring(buf); |
473 | npos = (uint16) pq_getmsgint(buf, sizeof(uint16)); |
474 | |
475 | /* sanity checks */ |
476 | |
477 | lex_len = strlen(lexeme); |
478 | if (lex_len > MAXSTRLEN) |
479 | elog(ERROR, "invalid tsvector: lexeme too long" ); |
480 | |
481 | if (datalen > MAXSTRPOS) |
482 | elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded" ); |
483 | |
484 | if (npos > MAXNUMPOS) |
485 | elog(ERROR, "unexpected number of tsvector positions" ); |
486 | |
487 | /* |
488 | * Looks valid. Fill the WordEntry struct, and copy lexeme. |
489 | * |
490 | * But make sure the buffer is large enough first. |
491 | */ |
492 | while (hdrlen + SHORTALIGN(datalen + lex_len) + |
493 | (npos + 1) * sizeof(WordEntryPos) >= len) |
494 | { |
495 | len *= 2; |
496 | vec = (TSVector) repalloc(vec, len); |
497 | } |
498 | |
499 | vec->entries[i].haspos = (npos > 0) ? 1 : 0; |
500 | vec->entries[i].len = lex_len; |
501 | vec->entries[i].pos = datalen; |
502 | |
503 | memcpy(STRPTR(vec) + datalen, lexeme, lex_len); |
504 | |
505 | datalen += lex_len; |
506 | |
507 | if (i > 0 && WordEntryCMP(&vec->entries[i], |
508 | &vec->entries[i - 1], |
509 | STRPTR(vec)) <= 0) |
510 | needSort = true; |
511 | |
512 | /* Receive positions */ |
513 | if (npos > 0) |
514 | { |
515 | uint16 j; |
516 | WordEntryPos *wepptr; |
517 | |
518 | /* |
519 | * Pad to 2-byte alignment if necessary. Though we used palloc0 |
520 | * for the initial allocation, subsequent repalloc'd memory areas |
521 | * are not initialized to zero. |
522 | */ |
523 | if (datalen != SHORTALIGN(datalen)) |
524 | { |
525 | *(STRPTR(vec) + datalen) = '\0'; |
526 | datalen = SHORTALIGN(datalen); |
527 | } |
528 | |
529 | memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16)); |
530 | |
531 | wepptr = POSDATAPTR(vec, &vec->entries[i]); |
532 | for (j = 0; j < npos; j++) |
533 | { |
534 | wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos)); |
535 | if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1])) |
536 | elog(ERROR, "position information is misordered" ); |
537 | } |
538 | |
539 | datalen += (npos + 1) * sizeof(WordEntry); |
540 | } |
541 | } |
542 | |
543 | SET_VARSIZE(vec, hdrlen + datalen); |
544 | |
545 | if (needSort) |
546 | qsort_arg((void *) ARRPTR(vec), vec->size, sizeof(WordEntry), |
547 | compareentry, (void *) STRPTR(vec)); |
548 | |
549 | PG_RETURN_TSVECTOR(vec); |
550 | } |
551 | |