| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * tsvector.c |
| 4 | * I/O functions for tsvector |
| 5 | * |
| 6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 7 | * |
| 8 | * |
| 9 | * IDENTIFICATION |
| 10 | * src/backend/utils/adt/tsvector.c |
| 11 | * |
| 12 | *------------------------------------------------------------------------- |
| 13 | */ |
| 14 | |
| 15 | #include "postgres.h" |
| 16 | |
| 17 | #include "libpq/pqformat.h" |
| 18 | #include "tsearch/ts_locale.h" |
| 19 | #include "tsearch/ts_utils.h" |
| 20 | #include "utils/builtins.h" |
| 21 | #include "utils/memutils.h" |
| 22 | |
| 23 | typedef struct |
| 24 | { |
| 25 | WordEntry entry; /* must be first! */ |
| 26 | WordEntryPos *pos; |
| 27 | int poslen; /* number of elements in pos */ |
| 28 | } WordEntryIN; |
| 29 | |
| 30 | |
| 31 | /* Compare two WordEntryPos values for qsort */ |
| 32 | int |
| 33 | compareWordEntryPos(const void *a, const void *b) |
| 34 | { |
| 35 | int apos = WEP_GETPOS(*(const WordEntryPos *) a); |
| 36 | int bpos = WEP_GETPOS(*(const WordEntryPos *) b); |
| 37 | |
| 38 | if (apos == bpos) |
| 39 | return 0; |
| 40 | return (apos > bpos) ? 1 : -1; |
| 41 | } |
| 42 | |
| 43 | /* |
| 44 | * Removes duplicate pos entries. If there's two entries with same pos |
| 45 | * but different weight, the higher weight is retained. |
| 46 | * |
| 47 | * Returns new length. |
| 48 | */ |
| 49 | static int |
| 50 | uniquePos(WordEntryPos *a, int l) |
| 51 | { |
| 52 | WordEntryPos *ptr, |
| 53 | *res; |
| 54 | |
| 55 | if (l <= 1) |
| 56 | return l; |
| 57 | |
| 58 | qsort((void *) a, l, sizeof(WordEntryPos), compareWordEntryPos); |
| 59 | |
| 60 | res = a; |
| 61 | ptr = a + 1; |
| 62 | while (ptr - a < l) |
| 63 | { |
| 64 | if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res)) |
| 65 | { |
| 66 | res++; |
| 67 | *res = *ptr; |
| 68 | if (res - a >= MAXNUMPOS - 1 || |
| 69 | WEP_GETPOS(*res) == MAXENTRYPOS - 1) |
| 70 | break; |
| 71 | } |
| 72 | else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res)) |
| 73 | WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr)); |
| 74 | ptr++; |
| 75 | } |
| 76 | |
| 77 | return res + 1 - a; |
| 78 | } |
| 79 | |
| 80 | /* Compare two WordEntryIN values for qsort */ |
| 81 | static int |
| 82 | compareentry(const void *va, const void *vb, void *arg) |
| 83 | { |
| 84 | const WordEntryIN *a = (const WordEntryIN *) va; |
| 85 | const WordEntryIN *b = (const WordEntryIN *) vb; |
| 86 | char *BufferStr = (char *) arg; |
| 87 | |
| 88 | return tsCompareString(&BufferStr[a->entry.pos], a->entry.len, |
| 89 | &BufferStr[b->entry.pos], b->entry.len, |
| 90 | false); |
| 91 | } |
| 92 | |
| 93 | /* |
| 94 | * Sort an array of WordEntryIN, remove duplicates. |
| 95 | * *outbuflen receives the amount of space needed for strings and positions. |
| 96 | */ |
| 97 | static int |
| 98 | uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen) |
| 99 | { |
| 100 | int buflen; |
| 101 | WordEntryIN *ptr, |
| 102 | *res; |
| 103 | |
| 104 | Assert(l >= 1); |
| 105 | |
| 106 | if (l > 1) |
| 107 | qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry, |
| 108 | (void *) buf); |
| 109 | |
| 110 | buflen = 0; |
| 111 | res = a; |
| 112 | ptr = a + 1; |
| 113 | while (ptr - a < l) |
| 114 | { |
| 115 | if (!(ptr->entry.len == res->entry.len && |
| 116 | strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos], |
| 117 | res->entry.len) == 0)) |
| 118 | { |
| 119 | /* done accumulating data into *res, count space needed */ |
| 120 | buflen += res->entry.len; |
| 121 | if (res->entry.haspos) |
| 122 | { |
| 123 | res->poslen = uniquePos(res->pos, res->poslen); |
| 124 | buflen = SHORTALIGN(buflen); |
| 125 | buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16); |
| 126 | } |
| 127 | res++; |
| 128 | if (res != ptr) |
| 129 | memcpy(res, ptr, sizeof(WordEntryIN)); |
| 130 | } |
| 131 | else if (ptr->entry.haspos) |
| 132 | { |
| 133 | if (res->entry.haspos) |
| 134 | { |
| 135 | /* append ptr's positions to res's positions */ |
| 136 | int newlen = ptr->poslen + res->poslen; |
| 137 | |
| 138 | res->pos = (WordEntryPos *) |
| 139 | repalloc(res->pos, newlen * sizeof(WordEntryPos)); |
| 140 | memcpy(&res->pos[res->poslen], ptr->pos, |
| 141 | ptr->poslen * sizeof(WordEntryPos)); |
| 142 | res->poslen = newlen; |
| 143 | pfree(ptr->pos); |
| 144 | } |
| 145 | else |
| 146 | { |
| 147 | /* just give ptr's positions to pos */ |
| 148 | res->entry.haspos = 1; |
| 149 | res->pos = ptr->pos; |
| 150 | res->poslen = ptr->poslen; |
| 151 | } |
| 152 | } |
| 153 | ptr++; |
| 154 | } |
| 155 | |
| 156 | /* count space needed for last item */ |
| 157 | buflen += res->entry.len; |
| 158 | if (res->entry.haspos) |
| 159 | { |
| 160 | res->poslen = uniquePos(res->pos, res->poslen); |
| 161 | buflen = SHORTALIGN(buflen); |
| 162 | buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16); |
| 163 | } |
| 164 | |
| 165 | *outbuflen = buflen; |
| 166 | return res + 1 - a; |
| 167 | } |
| 168 | |
| 169 | static int |
| 170 | WordEntryCMP(WordEntry *a, WordEntry *b, char *buf) |
| 171 | { |
| 172 | return compareentry(a, b, buf); |
| 173 | } |
| 174 | |
| 175 | |
| 176 | Datum |
| 177 | tsvectorin(PG_FUNCTION_ARGS) |
| 178 | { |
| 179 | char *buf = PG_GETARG_CSTRING(0); |
| 180 | TSVectorParseState state; |
| 181 | WordEntryIN *arr; |
| 182 | int totallen; |
| 183 | int arrlen; /* allocated size of arr */ |
| 184 | WordEntry *inarr; |
| 185 | int len = 0; |
| 186 | TSVector in; |
| 187 | int i; |
| 188 | char *token; |
| 189 | int toklen; |
| 190 | WordEntryPos *pos; |
| 191 | int poslen; |
| 192 | char *strbuf; |
| 193 | int stroff; |
| 194 | |
| 195 | /* |
| 196 | * Tokens are appended to tmpbuf, cur is a pointer to the end of used |
| 197 | * space in tmpbuf. |
| 198 | */ |
| 199 | char *tmpbuf; |
| 200 | char *cur; |
| 201 | int buflen = 256; /* allocated size of tmpbuf */ |
| 202 | |
| 203 | state = init_tsvector_parser(buf, 0); |
| 204 | |
| 205 | arrlen = 64; |
| 206 | arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen); |
| 207 | cur = tmpbuf = (char *) palloc(buflen); |
| 208 | |
| 209 | while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL)) |
| 210 | { |
| 211 | if (toklen >= MAXSTRLEN) |
| 212 | ereport(ERROR, |
| 213 | (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| 214 | errmsg("word is too long (%ld bytes, max %ld bytes)" , |
| 215 | (long) toklen, |
| 216 | (long) (MAXSTRLEN - 1)))); |
| 217 | |
| 218 | if (cur - tmpbuf > MAXSTRPOS) |
| 219 | ereport(ERROR, |
| 220 | (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| 221 | errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)" , |
| 222 | (long) (cur - tmpbuf), (long) MAXSTRPOS))); |
| 223 | |
| 224 | /* |
| 225 | * Enlarge buffers if needed |
| 226 | */ |
| 227 | if (len >= arrlen) |
| 228 | { |
| 229 | arrlen *= 2; |
| 230 | arr = (WordEntryIN *) |
| 231 | repalloc((void *) arr, sizeof(WordEntryIN) * arrlen); |
| 232 | } |
| 233 | while ((cur - tmpbuf) + toklen >= buflen) |
| 234 | { |
| 235 | int dist = cur - tmpbuf; |
| 236 | |
| 237 | buflen *= 2; |
| 238 | tmpbuf = (char *) repalloc((void *) tmpbuf, buflen); |
| 239 | cur = tmpbuf + dist; |
| 240 | } |
| 241 | arr[len].entry.len = toklen; |
| 242 | arr[len].entry.pos = cur - tmpbuf; |
| 243 | memcpy((void *) cur, (void *) token, toklen); |
| 244 | cur += toklen; |
| 245 | |
| 246 | if (poslen != 0) |
| 247 | { |
| 248 | arr[len].entry.haspos = 1; |
| 249 | arr[len].pos = pos; |
| 250 | arr[len].poslen = poslen; |
| 251 | } |
| 252 | else |
| 253 | { |
| 254 | arr[len].entry.haspos = 0; |
| 255 | arr[len].pos = NULL; |
| 256 | arr[len].poslen = 0; |
| 257 | } |
| 258 | len++; |
| 259 | } |
| 260 | |
| 261 | close_tsvector_parser(state); |
| 262 | |
| 263 | if (len > 0) |
| 264 | len = uniqueentry(arr, len, tmpbuf, &buflen); |
| 265 | else |
| 266 | buflen = 0; |
| 267 | |
| 268 | if (buflen > MAXSTRPOS) |
| 269 | ereport(ERROR, |
| 270 | (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| 271 | errmsg("string is too long for tsvector (%d bytes, max %d bytes)" , buflen, MAXSTRPOS))); |
| 272 | |
| 273 | totallen = CALCDATASIZE(len, buflen); |
| 274 | in = (TSVector) palloc0(totallen); |
| 275 | SET_VARSIZE(in, totallen); |
| 276 | in->size = len; |
| 277 | inarr = ARRPTR(in); |
| 278 | strbuf = STRPTR(in); |
| 279 | stroff = 0; |
| 280 | for (i = 0; i < len; i++) |
| 281 | { |
| 282 | memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len); |
| 283 | arr[i].entry.pos = stroff; |
| 284 | stroff += arr[i].entry.len; |
| 285 | if (arr[i].entry.haspos) |
| 286 | { |
| 287 | if (arr[i].poslen > 0xFFFF) |
| 288 | elog(ERROR, "positions array too long" ); |
| 289 | |
| 290 | /* Copy number of positions */ |
| 291 | stroff = SHORTALIGN(stroff); |
| 292 | *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen; |
| 293 | stroff += sizeof(uint16); |
| 294 | |
| 295 | /* Copy positions */ |
| 296 | memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos)); |
| 297 | stroff += arr[i].poslen * sizeof(WordEntryPos); |
| 298 | |
| 299 | pfree(arr[i].pos); |
| 300 | } |
| 301 | inarr[i] = arr[i].entry; |
| 302 | } |
| 303 | |
| 304 | Assert((strbuf + stroff - (char *) in) == totallen); |
| 305 | |
| 306 | PG_RETURN_TSVECTOR(in); |
| 307 | } |
| 308 | |
| 309 | Datum |
| 310 | tsvectorout(PG_FUNCTION_ARGS) |
| 311 | { |
| 312 | TSVector out = PG_GETARG_TSVECTOR(0); |
| 313 | char *outbuf; |
| 314 | int32 i, |
| 315 | lenbuf = 0, |
| 316 | pp; |
| 317 | WordEntry *ptr = ARRPTR(out); |
| 318 | char *curbegin, |
| 319 | *curin, |
| 320 | *curout; |
| 321 | |
| 322 | lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ; |
| 323 | for (i = 0; i < out->size; i++) |
| 324 | { |
| 325 | lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ; |
| 326 | if (ptr[i].haspos) |
| 327 | lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i])); |
| 328 | } |
| 329 | |
| 330 | curout = outbuf = (char *) palloc(lenbuf); |
| 331 | for (i = 0; i < out->size; i++) |
| 332 | { |
| 333 | curbegin = curin = STRPTR(out) + ptr->pos; |
| 334 | if (i != 0) |
| 335 | *curout++ = ' '; |
| 336 | *curout++ = '\''; |
| 337 | while (curin - curbegin < ptr->len) |
| 338 | { |
| 339 | int len = pg_mblen(curin); |
| 340 | |
| 341 | if (t_iseq(curin, '\'')) |
| 342 | *curout++ = '\''; |
| 343 | else if (t_iseq(curin, '\\')) |
| 344 | *curout++ = '\\'; |
| 345 | |
| 346 | while (len--) |
| 347 | *curout++ = *curin++; |
| 348 | } |
| 349 | |
| 350 | *curout++ = '\''; |
| 351 | if ((pp = POSDATALEN(out, ptr)) != 0) |
| 352 | { |
| 353 | WordEntryPos *wptr; |
| 354 | |
| 355 | *curout++ = ':'; |
| 356 | wptr = POSDATAPTR(out, ptr); |
| 357 | while (pp) |
| 358 | { |
| 359 | curout += sprintf(curout, "%d" , WEP_GETPOS(*wptr)); |
| 360 | switch (WEP_GETWEIGHT(*wptr)) |
| 361 | { |
| 362 | case 3: |
| 363 | *curout++ = 'A'; |
| 364 | break; |
| 365 | case 2: |
| 366 | *curout++ = 'B'; |
| 367 | break; |
| 368 | case 1: |
| 369 | *curout++ = 'C'; |
| 370 | break; |
| 371 | case 0: |
| 372 | default: |
| 373 | break; |
| 374 | } |
| 375 | |
| 376 | if (pp > 1) |
| 377 | *curout++ = ','; |
| 378 | pp--; |
| 379 | wptr++; |
| 380 | } |
| 381 | } |
| 382 | ptr++; |
| 383 | } |
| 384 | |
| 385 | *curout = '\0'; |
| 386 | PG_FREE_IF_COPY(out, 0); |
| 387 | PG_RETURN_CSTRING(outbuf); |
| 388 | } |
| 389 | |
| 390 | /* |
| 391 | * Binary Input / Output functions. The binary format is as follows: |
| 392 | * |
| 393 | * uint32 number of lexemes |
| 394 | * |
| 395 | * for each lexeme: |
| 396 | * lexeme text in client encoding, null-terminated |
| 397 | * uint16 number of positions |
| 398 | * for each position: |
| 399 | * uint16 WordEntryPos |
| 400 | */ |
| 401 | |
| 402 | Datum |
| 403 | tsvectorsend(PG_FUNCTION_ARGS) |
| 404 | { |
| 405 | TSVector vec = PG_GETARG_TSVECTOR(0); |
| 406 | StringInfoData buf; |
| 407 | int i, |
| 408 | j; |
| 409 | WordEntry *weptr = ARRPTR(vec); |
| 410 | |
| 411 | pq_begintypsend(&buf); |
| 412 | |
| 413 | pq_sendint32(&buf, vec->size); |
| 414 | for (i = 0; i < vec->size; i++) |
| 415 | { |
| 416 | uint16 npos; |
| 417 | |
| 418 | /* |
| 419 | * the strings in the TSVector array are not null-terminated, so we |
| 420 | * have to send the null-terminator separately |
| 421 | */ |
| 422 | pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len); |
| 423 | pq_sendbyte(&buf, '\0'); |
| 424 | |
| 425 | npos = POSDATALEN(vec, weptr); |
| 426 | pq_sendint16(&buf, npos); |
| 427 | |
| 428 | if (npos > 0) |
| 429 | { |
| 430 | WordEntryPos *wepptr = POSDATAPTR(vec, weptr); |
| 431 | |
| 432 | for (j = 0; j < npos; j++) |
| 433 | pq_sendint16(&buf, wepptr[j]); |
| 434 | } |
| 435 | weptr++; |
| 436 | } |
| 437 | |
| 438 | PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); |
| 439 | } |
| 440 | |
| 441 | Datum |
| 442 | tsvectorrecv(PG_FUNCTION_ARGS) |
| 443 | { |
| 444 | StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); |
| 445 | TSVector vec; |
| 446 | int i; |
| 447 | int32 nentries; |
| 448 | int datalen; /* number of bytes used in the variable size |
| 449 | * area after fixed size TSVector header and |
| 450 | * WordEntries */ |
| 451 | Size hdrlen; |
| 452 | Size len; /* allocated size of vec */ |
| 453 | bool needSort = false; |
| 454 | |
| 455 | nentries = pq_getmsgint(buf, sizeof(int32)); |
| 456 | if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry))) |
| 457 | elog(ERROR, "invalid size of tsvector" ); |
| 458 | |
| 459 | hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries; |
| 460 | |
| 461 | len = hdrlen * 2; /* times two to make room for lexemes */ |
| 462 | vec = (TSVector) palloc0(len); |
| 463 | vec->size = nentries; |
| 464 | |
| 465 | datalen = 0; |
| 466 | for (i = 0; i < nentries; i++) |
| 467 | { |
| 468 | const char *lexeme; |
| 469 | uint16 npos; |
| 470 | size_t lex_len; |
| 471 | |
| 472 | lexeme = pq_getmsgstring(buf); |
| 473 | npos = (uint16) pq_getmsgint(buf, sizeof(uint16)); |
| 474 | |
| 475 | /* sanity checks */ |
| 476 | |
| 477 | lex_len = strlen(lexeme); |
| 478 | if (lex_len > MAXSTRLEN) |
| 479 | elog(ERROR, "invalid tsvector: lexeme too long" ); |
| 480 | |
| 481 | if (datalen > MAXSTRPOS) |
| 482 | elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded" ); |
| 483 | |
| 484 | if (npos > MAXNUMPOS) |
| 485 | elog(ERROR, "unexpected number of tsvector positions" ); |
| 486 | |
| 487 | /* |
| 488 | * Looks valid. Fill the WordEntry struct, and copy lexeme. |
| 489 | * |
| 490 | * But make sure the buffer is large enough first. |
| 491 | */ |
| 492 | while (hdrlen + SHORTALIGN(datalen + lex_len) + |
| 493 | (npos + 1) * sizeof(WordEntryPos) >= len) |
| 494 | { |
| 495 | len *= 2; |
| 496 | vec = (TSVector) repalloc(vec, len); |
| 497 | } |
| 498 | |
| 499 | vec->entries[i].haspos = (npos > 0) ? 1 : 0; |
| 500 | vec->entries[i].len = lex_len; |
| 501 | vec->entries[i].pos = datalen; |
| 502 | |
| 503 | memcpy(STRPTR(vec) + datalen, lexeme, lex_len); |
| 504 | |
| 505 | datalen += lex_len; |
| 506 | |
| 507 | if (i > 0 && WordEntryCMP(&vec->entries[i], |
| 508 | &vec->entries[i - 1], |
| 509 | STRPTR(vec)) <= 0) |
| 510 | needSort = true; |
| 511 | |
| 512 | /* Receive positions */ |
| 513 | if (npos > 0) |
| 514 | { |
| 515 | uint16 j; |
| 516 | WordEntryPos *wepptr; |
| 517 | |
| 518 | /* |
| 519 | * Pad to 2-byte alignment if necessary. Though we used palloc0 |
| 520 | * for the initial allocation, subsequent repalloc'd memory areas |
| 521 | * are not initialized to zero. |
| 522 | */ |
| 523 | if (datalen != SHORTALIGN(datalen)) |
| 524 | { |
| 525 | *(STRPTR(vec) + datalen) = '\0'; |
| 526 | datalen = SHORTALIGN(datalen); |
| 527 | } |
| 528 | |
| 529 | memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16)); |
| 530 | |
| 531 | wepptr = POSDATAPTR(vec, &vec->entries[i]); |
| 532 | for (j = 0; j < npos; j++) |
| 533 | { |
| 534 | wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos)); |
| 535 | if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1])) |
| 536 | elog(ERROR, "position information is misordered" ); |
| 537 | } |
| 538 | |
| 539 | datalen += (npos + 1) * sizeof(WordEntry); |
| 540 | } |
| 541 | } |
| 542 | |
| 543 | SET_VARSIZE(vec, hdrlen + datalen); |
| 544 | |
| 545 | if (needSort) |
| 546 | qsort_arg((void *) ARRPTR(vec), vec->size, sizeof(WordEntry), |
| 547 | compareentry, (void *) STRPTR(vec)); |
| 548 | |
| 549 | PG_RETURN_TSVECTOR(vec); |
| 550 | } |
| 551 | |