| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * ginpostinglist.c |
| 4 | * routines for dealing with posting lists. |
| 5 | * |
| 6 | * |
| 7 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 8 | * Portions Copyright (c) 1994, Regents of the University of California |
| 9 | * |
| 10 | * IDENTIFICATION |
| 11 | * src/backend/access/gin/ginpostinglist.c |
| 12 | *------------------------------------------------------------------------- |
| 13 | */ |
| 14 | |
| 15 | #include "postgres.h" |
| 16 | |
| 17 | #include "access/gin_private.h" |
| 18 | |
| 19 | #ifdef USE_ASSERT_CHECKING |
| 20 | #define CHECK_ENCODING_ROUNDTRIP |
| 21 | #endif |
| 22 | |
| 23 | /* |
| 24 | * For encoding purposes, item pointers are represented as 64-bit unsigned |
| 25 | * integers. The lowest 11 bits represent the offset number, and the next |
| 26 | * lowest 32 bits are the block number. That leaves 21 bits unused, i.e. |
| 27 | * only 43 low bits are used. |
| 28 | * |
| 29 | * 11 bits is enough for the offset number, because MaxHeapTuplesPerPage < |
| 30 | * 2^11 on all supported block sizes. We are frugal with the bits, because |
| 31 | * smaller integers use fewer bytes in the varbyte encoding, saving disk |
| 32 | * space. (If we get a new table AM in the future that wants to use the full |
| 33 | * range of possible offset numbers, we'll need to change this.) |
| 34 | * |
| 35 | * These 43-bit integers are encoded using varbyte encoding. In each byte, |
| 36 | * the 7 low bits contain data, while the highest bit is a continuation bit. |
| 37 | * When the continuation bit is set, the next byte is part of the same |
| 38 | * integer, otherwise this is the last byte of this integer. 43 bits need |
| 39 | * at most 7 bytes in this encoding: |
| 40 | * |
| 41 | * 0XXXXXXX |
| 42 | * 1XXXXXXX 0XXXXYYY |
| 43 | * 1XXXXXXX 1XXXXYYY 0YYYYYYY |
| 44 | * 1XXXXXXX 1XXXXYYY 1YYYYYYY 0YYYYYYY |
| 45 | * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY |
| 46 | * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY |
| 47 | * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0uuuuuuY |
| 48 | * |
| 49 | * X = bits used for offset number |
| 50 | * Y = bits used for block number |
| 51 | * u = unused bit |
| 52 | * |
| 53 | * The bytes are in stored in little-endian order. |
| 54 | * |
| 55 | * An important property of this encoding is that removing an item from list |
| 56 | * never increases the size of the resulting compressed posting list. Proof: |
| 57 | * |
| 58 | * Removing number is actually replacement of two numbers with their sum. We |
| 59 | * have to prove that varbyte encoding of a sum can't be longer than varbyte |
| 60 | * encoding of its summands. Sum of two numbers is at most one bit wider than |
| 61 | * the larger of the summands. Widening a number by one bit enlarges its length |
| 62 | * in varbyte encoding by at most one byte. Therefore, varbyte encoding of sum |
| 63 | * is at most one byte longer than varbyte encoding of larger summand. Lesser |
| 64 | * summand is at least one byte, so the sum cannot take more space than the |
| 65 | * summands, Q.E.D. |
| 66 | * |
| 67 | * This property greatly simplifies VACUUM, which can assume that posting |
| 68 | * lists always fit on the same page after vacuuming. Note that even though |
| 69 | * that holds for removing items from a posting list, you must also be |
| 70 | * careful to not cause expansion e.g. when merging uncompressed items on the |
| 71 | * page into the compressed lists, when vacuuming. |
| 72 | */ |
| 73 | |
| 74 | /* |
| 75 | * How many bits do you need to encode offset number? OffsetNumber is a 16-bit |
| 76 | * integer, but you can't fit that many items on a page. 11 ought to be more |
| 77 | * than enough. It's tempting to derive this from MaxHeapTuplesPerPage, and |
| 78 | * use the minimum number of bits, but that would require changing the on-disk |
| 79 | * format if MaxHeapTuplesPerPage changes. Better to leave some slack. |
| 80 | */ |
| 81 | #define MaxHeapTuplesPerPageBits 11 |
| 82 | |
| 83 | /* Max. number of bytes needed to encode the largest supported integer. */ |
| 84 | #define MaxBytesPerInteger 7 |
| 85 | |
| 86 | static inline uint64 |
| 87 | itemptr_to_uint64(const ItemPointer iptr) |
| 88 | { |
| 89 | uint64 val; |
| 90 | |
| 91 | Assert(ItemPointerIsValid(iptr)); |
| 92 | Assert(GinItemPointerGetOffsetNumber(iptr) < (1 << MaxHeapTuplesPerPageBits)); |
| 93 | |
| 94 | val = GinItemPointerGetBlockNumber(iptr); |
| 95 | val <<= MaxHeapTuplesPerPageBits; |
| 96 | val |= GinItemPointerGetOffsetNumber(iptr); |
| 97 | |
| 98 | return val; |
| 99 | } |
| 100 | |
| 101 | static inline void |
| 102 | uint64_to_itemptr(uint64 val, ItemPointer iptr) |
| 103 | { |
| 104 | GinItemPointerSetOffsetNumber(iptr, val & ((1 << MaxHeapTuplesPerPageBits) - 1)); |
| 105 | val = val >> MaxHeapTuplesPerPageBits; |
| 106 | GinItemPointerSetBlockNumber(iptr, val); |
| 107 | |
| 108 | Assert(ItemPointerIsValid(iptr)); |
| 109 | } |
| 110 | |
| 111 | /* |
| 112 | * Varbyte-encode 'val' into *ptr. *ptr is incremented to next integer. |
| 113 | */ |
| 114 | static void |
| 115 | encode_varbyte(uint64 val, unsigned char **ptr) |
| 116 | { |
| 117 | unsigned char *p = *ptr; |
| 118 | |
| 119 | while (val > 0x7F) |
| 120 | { |
| 121 | *(p++) = 0x80 | (val & 0x7F); |
| 122 | val >>= 7; |
| 123 | } |
| 124 | *(p++) = (unsigned char) val; |
| 125 | |
| 126 | *ptr = p; |
| 127 | } |
| 128 | |
| 129 | /* |
| 130 | * Decode varbyte-encoded integer at *ptr. *ptr is incremented to next integer. |
| 131 | */ |
| 132 | static uint64 |
| 133 | decode_varbyte(unsigned char **ptr) |
| 134 | { |
| 135 | uint64 val; |
| 136 | unsigned char *p = *ptr; |
| 137 | uint64 c; |
| 138 | |
| 139 | /* 1st byte */ |
| 140 | c = *(p++); |
| 141 | val = c & 0x7F; |
| 142 | if (c & 0x80) |
| 143 | { |
| 144 | /* 2nd byte */ |
| 145 | c = *(p++); |
| 146 | val |= (c & 0x7F) << 7; |
| 147 | if (c & 0x80) |
| 148 | { |
| 149 | /* 3rd byte */ |
| 150 | c = *(p++); |
| 151 | val |= (c & 0x7F) << 14; |
| 152 | if (c & 0x80) |
| 153 | { |
| 154 | /* 4th byte */ |
| 155 | c = *(p++); |
| 156 | val |= (c & 0x7F) << 21; |
| 157 | if (c & 0x80) |
| 158 | { |
| 159 | /* 5th byte */ |
| 160 | c = *(p++); |
| 161 | val |= (c & 0x7F) << 28; |
| 162 | if (c & 0x80) |
| 163 | { |
| 164 | /* 6th byte */ |
| 165 | c = *(p++); |
| 166 | val |= (c & 0x7F) << 35; |
| 167 | if (c & 0x80) |
| 168 | { |
| 169 | /* 7th byte, should not have continuation bit */ |
| 170 | c = *(p++); |
| 171 | val |= c << 42; |
| 172 | Assert((c & 0x80) == 0); |
| 173 | } |
| 174 | } |
| 175 | } |
| 176 | } |
| 177 | } |
| 178 | } |
| 179 | |
| 180 | *ptr = p; |
| 181 | |
| 182 | return val; |
| 183 | } |
| 184 | |
| 185 | /* |
| 186 | * Encode a posting list. |
| 187 | * |
| 188 | * The encoded list is returned in a palloc'd struct, which will be at most |
| 189 | * 'maxsize' bytes in size. The number items in the returned segment is |
| 190 | * returned in *nwritten. If it's not equal to nipd, not all the items fit |
| 191 | * in 'maxsize', and only the first *nwritten were encoded. |
| 192 | * |
| 193 | * The allocated size of the returned struct is short-aligned, and the padding |
| 194 | * byte at the end, if any, is zero. |
| 195 | */ |
| 196 | GinPostingList * |
| 197 | ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize, |
| 198 | int *nwritten) |
| 199 | { |
| 200 | uint64 prev; |
| 201 | int totalpacked = 0; |
| 202 | int maxbytes; |
| 203 | GinPostingList *result; |
| 204 | unsigned char *ptr; |
| 205 | unsigned char *endptr; |
| 206 | |
| 207 | maxsize = SHORTALIGN_DOWN(maxsize); |
| 208 | |
| 209 | result = palloc(maxsize); |
| 210 | |
| 211 | maxbytes = maxsize - offsetof(GinPostingList, bytes); |
| 212 | Assert(maxbytes > 0); |
| 213 | |
| 214 | /* Store the first special item */ |
| 215 | result->first = ipd[0]; |
| 216 | |
| 217 | prev = itemptr_to_uint64(&result->first); |
| 218 | |
| 219 | ptr = result->bytes; |
| 220 | endptr = result->bytes + maxbytes; |
| 221 | for (totalpacked = 1; totalpacked < nipd; totalpacked++) |
| 222 | { |
| 223 | uint64 val = itemptr_to_uint64(&ipd[totalpacked]); |
| 224 | uint64 delta = val - prev; |
| 225 | |
| 226 | Assert(val > prev); |
| 227 | |
| 228 | if (endptr - ptr >= MaxBytesPerInteger) |
| 229 | encode_varbyte(delta, &ptr); |
| 230 | else |
| 231 | { |
| 232 | /* |
| 233 | * There are less than 7 bytes left. Have to check if the next |
| 234 | * item fits in that space before writing it out. |
| 235 | */ |
| 236 | unsigned char buf[MaxBytesPerInteger]; |
| 237 | unsigned char *p = buf; |
| 238 | |
| 239 | encode_varbyte(delta, &p); |
| 240 | if (p - buf > (endptr - ptr)) |
| 241 | break; /* output is full */ |
| 242 | |
| 243 | memcpy(ptr, buf, p - buf); |
| 244 | ptr += (p - buf); |
| 245 | } |
| 246 | prev = val; |
| 247 | } |
| 248 | result->nbytes = ptr - result->bytes; |
| 249 | |
| 250 | /* |
| 251 | * If we wrote an odd number of bytes, zero out the padding byte at the |
| 252 | * end. |
| 253 | */ |
| 254 | if (result->nbytes != SHORTALIGN(result->nbytes)) |
| 255 | result->bytes[result->nbytes] = 0; |
| 256 | |
| 257 | if (nwritten) |
| 258 | *nwritten = totalpacked; |
| 259 | |
| 260 | Assert(SizeOfGinPostingList(result) <= maxsize); |
| 261 | |
| 262 | /* |
| 263 | * Check that the encoded segment decodes back to the original items. |
| 264 | */ |
| 265 | #if defined (CHECK_ENCODING_ROUNDTRIP) |
| 266 | { |
| 267 | int ndecoded; |
| 268 | ItemPointer tmp = ginPostingListDecode(result, &ndecoded); |
| 269 | int i; |
| 270 | |
| 271 | Assert(ndecoded == totalpacked); |
| 272 | for (i = 0; i < ndecoded; i++) |
| 273 | Assert(memcmp(&tmp[i], &ipd[i], sizeof(ItemPointerData)) == 0); |
| 274 | pfree(tmp); |
| 275 | } |
| 276 | #endif |
| 277 | |
| 278 | return result; |
| 279 | } |
| 280 | |
| 281 | /* |
| 282 | * Decode a compressed posting list into an array of item pointers. |
| 283 | * The number of items is returned in *ndecoded. |
| 284 | */ |
| 285 | ItemPointer |
| 286 | ginPostingListDecode(GinPostingList *plist, int *ndecoded) |
| 287 | { |
| 288 | return ginPostingListDecodeAllSegments(plist, |
| 289 | SizeOfGinPostingList(plist), |
| 290 | ndecoded); |
| 291 | } |
| 292 | |
| 293 | /* |
| 294 | * Decode multiple posting list segments into an array of item pointers. |
| 295 | * The number of items is returned in *ndecoded_out. The segments are stored |
| 296 | * one after each other, with total size 'len' bytes. |
| 297 | */ |
| 298 | ItemPointer |
| 299 | ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_out) |
| 300 | { |
| 301 | ItemPointer result; |
| 302 | int nallocated; |
| 303 | uint64 val; |
| 304 | char *endseg = ((char *) segment) + len; |
| 305 | int ndecoded; |
| 306 | unsigned char *ptr; |
| 307 | unsigned char *endptr; |
| 308 | |
| 309 | /* |
| 310 | * Guess an initial size of the array. |
| 311 | */ |
| 312 | nallocated = segment->nbytes * 2 + 1; |
| 313 | result = palloc(nallocated * sizeof(ItemPointerData)); |
| 314 | |
| 315 | ndecoded = 0; |
| 316 | while ((char *) segment < endseg) |
| 317 | { |
| 318 | /* enlarge output array if needed */ |
| 319 | if (ndecoded >= nallocated) |
| 320 | { |
| 321 | nallocated *= 2; |
| 322 | result = repalloc(result, nallocated * sizeof(ItemPointerData)); |
| 323 | } |
| 324 | |
| 325 | /* copy the first item */ |
| 326 | Assert(OffsetNumberIsValid(ItemPointerGetOffsetNumber(&segment->first))); |
| 327 | Assert(ndecoded == 0 || ginCompareItemPointers(&segment->first, &result[ndecoded - 1]) > 0); |
| 328 | result[ndecoded] = segment->first; |
| 329 | ndecoded++; |
| 330 | |
| 331 | val = itemptr_to_uint64(&segment->first); |
| 332 | ptr = segment->bytes; |
| 333 | endptr = segment->bytes + segment->nbytes; |
| 334 | while (ptr < endptr) |
| 335 | { |
| 336 | /* enlarge output array if needed */ |
| 337 | if (ndecoded >= nallocated) |
| 338 | { |
| 339 | nallocated *= 2; |
| 340 | result = repalloc(result, nallocated * sizeof(ItemPointerData)); |
| 341 | } |
| 342 | |
| 343 | val += decode_varbyte(&ptr); |
| 344 | |
| 345 | uint64_to_itemptr(val, &result[ndecoded]); |
| 346 | ndecoded++; |
| 347 | } |
| 348 | segment = GinNextPostingListSegment(segment); |
| 349 | } |
| 350 | |
| 351 | if (ndecoded_out) |
| 352 | *ndecoded_out = ndecoded; |
| 353 | return result; |
| 354 | } |
| 355 | |
| 356 | /* |
| 357 | * Add all item pointers from a bunch of posting lists to a TIDBitmap. |
| 358 | */ |
| 359 | int |
| 360 | ginPostingListDecodeAllSegmentsToTbm(GinPostingList *ptr, int len, |
| 361 | TIDBitmap *tbm) |
| 362 | { |
| 363 | int ndecoded; |
| 364 | ItemPointer items; |
| 365 | |
| 366 | items = ginPostingListDecodeAllSegments(ptr, len, &ndecoded); |
| 367 | tbm_add_tuples(tbm, items, ndecoded, false); |
| 368 | pfree(items); |
| 369 | |
| 370 | return ndecoded; |
| 371 | } |
| 372 | |
| 373 | /* |
| 374 | * Merge two ordered arrays of itempointers, eliminating any duplicates. |
| 375 | * |
| 376 | * Returns a palloc'd array, and *nmerged is set to the number of items in |
| 377 | * the result, after eliminating duplicates. |
| 378 | */ |
| 379 | ItemPointer |
| 380 | ginMergeItemPointers(ItemPointerData *a, uint32 na, |
| 381 | ItemPointerData *b, uint32 nb, |
| 382 | int *nmerged) |
| 383 | { |
| 384 | ItemPointerData *dst; |
| 385 | |
| 386 | dst = (ItemPointer) palloc((na + nb) * sizeof(ItemPointerData)); |
| 387 | |
| 388 | /* |
| 389 | * If the argument arrays don't overlap, we can just append them to each |
| 390 | * other. |
| 391 | */ |
| 392 | if (na == 0 || nb == 0 || ginCompareItemPointers(&a[na - 1], &b[0]) < 0) |
| 393 | { |
| 394 | memcpy(dst, a, na * sizeof(ItemPointerData)); |
| 395 | memcpy(&dst[na], b, nb * sizeof(ItemPointerData)); |
| 396 | *nmerged = na + nb; |
| 397 | } |
| 398 | else if (ginCompareItemPointers(&b[nb - 1], &a[0]) < 0) |
| 399 | { |
| 400 | memcpy(dst, b, nb * sizeof(ItemPointerData)); |
| 401 | memcpy(&dst[nb], a, na * sizeof(ItemPointerData)); |
| 402 | *nmerged = na + nb; |
| 403 | } |
| 404 | else |
| 405 | { |
| 406 | ItemPointerData *dptr = dst; |
| 407 | ItemPointerData *aptr = a; |
| 408 | ItemPointerData *bptr = b; |
| 409 | |
| 410 | while (aptr - a < na && bptr - b < nb) |
| 411 | { |
| 412 | int cmp = ginCompareItemPointers(aptr, bptr); |
| 413 | |
| 414 | if (cmp > 0) |
| 415 | *dptr++ = *bptr++; |
| 416 | else if (cmp == 0) |
| 417 | { |
| 418 | /* only keep one copy of the identical items */ |
| 419 | *dptr++ = *bptr++; |
| 420 | aptr++; |
| 421 | } |
| 422 | else |
| 423 | *dptr++ = *aptr++; |
| 424 | } |
| 425 | |
| 426 | while (aptr - a < na) |
| 427 | *dptr++ = *aptr++; |
| 428 | |
| 429 | while (bptr - b < nb) |
| 430 | *dptr++ = *bptr++; |
| 431 | |
| 432 | *nmerged = dptr - dst; |
| 433 | } |
| 434 | |
| 435 | return dst; |
| 436 | } |
| 437 | |