1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * ginpostinglist.c |
4 | * routines for dealing with posting lists. |
5 | * |
6 | * |
7 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
8 | * Portions Copyright (c) 1994, Regents of the University of California |
9 | * |
10 | * IDENTIFICATION |
11 | * src/backend/access/gin/ginpostinglist.c |
12 | *------------------------------------------------------------------------- |
13 | */ |
14 | |
15 | #include "postgres.h" |
16 | |
17 | #include "access/gin_private.h" |
18 | |
19 | #ifdef USE_ASSERT_CHECKING |
20 | #define CHECK_ENCODING_ROUNDTRIP |
21 | #endif |
22 | |
23 | /* |
24 | * For encoding purposes, item pointers are represented as 64-bit unsigned |
25 | * integers. The lowest 11 bits represent the offset number, and the next |
26 | * lowest 32 bits are the block number. That leaves 21 bits unused, i.e. |
27 | * only 43 low bits are used. |
28 | * |
29 | * 11 bits is enough for the offset number, because MaxHeapTuplesPerPage < |
30 | * 2^11 on all supported block sizes. We are frugal with the bits, because |
31 | * smaller integers use fewer bytes in the varbyte encoding, saving disk |
32 | * space. (If we get a new table AM in the future that wants to use the full |
33 | * range of possible offset numbers, we'll need to change this.) |
34 | * |
35 | * These 43-bit integers are encoded using varbyte encoding. In each byte, |
36 | * the 7 low bits contain data, while the highest bit is a continuation bit. |
37 | * When the continuation bit is set, the next byte is part of the same |
38 | * integer, otherwise this is the last byte of this integer. 43 bits need |
39 | * at most 7 bytes in this encoding: |
40 | * |
41 | * 0XXXXXXX |
42 | * 1XXXXXXX 0XXXXYYY |
43 | * 1XXXXXXX 1XXXXYYY 0YYYYYYY |
44 | * 1XXXXXXX 1XXXXYYY 1YYYYYYY 0YYYYYYY |
45 | * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY |
46 | * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY |
47 | * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0uuuuuuY |
48 | * |
49 | * X = bits used for offset number |
50 | * Y = bits used for block number |
51 | * u = unused bit |
52 | * |
53 | * The bytes are in stored in little-endian order. |
54 | * |
55 | * An important property of this encoding is that removing an item from list |
56 | * never increases the size of the resulting compressed posting list. Proof: |
57 | * |
58 | * Removing number is actually replacement of two numbers with their sum. We |
59 | * have to prove that varbyte encoding of a sum can't be longer than varbyte |
60 | * encoding of its summands. Sum of two numbers is at most one bit wider than |
61 | * the larger of the summands. Widening a number by one bit enlarges its length |
62 | * in varbyte encoding by at most one byte. Therefore, varbyte encoding of sum |
63 | * is at most one byte longer than varbyte encoding of larger summand. Lesser |
64 | * summand is at least one byte, so the sum cannot take more space than the |
65 | * summands, Q.E.D. |
66 | * |
67 | * This property greatly simplifies VACUUM, which can assume that posting |
68 | * lists always fit on the same page after vacuuming. Note that even though |
69 | * that holds for removing items from a posting list, you must also be |
70 | * careful to not cause expansion e.g. when merging uncompressed items on the |
71 | * page into the compressed lists, when vacuuming. |
72 | */ |
73 | |
74 | /* |
75 | * How many bits do you need to encode offset number? OffsetNumber is a 16-bit |
76 | * integer, but you can't fit that many items on a page. 11 ought to be more |
77 | * than enough. It's tempting to derive this from MaxHeapTuplesPerPage, and |
78 | * use the minimum number of bits, but that would require changing the on-disk |
79 | * format if MaxHeapTuplesPerPage changes. Better to leave some slack. |
80 | */ |
81 | #define MaxHeapTuplesPerPageBits 11 |
82 | |
83 | /* Max. number of bytes needed to encode the largest supported integer. */ |
84 | #define MaxBytesPerInteger 7 |
85 | |
86 | static inline uint64 |
87 | itemptr_to_uint64(const ItemPointer iptr) |
88 | { |
89 | uint64 val; |
90 | |
91 | Assert(ItemPointerIsValid(iptr)); |
92 | Assert(GinItemPointerGetOffsetNumber(iptr) < (1 << MaxHeapTuplesPerPageBits)); |
93 | |
94 | val = GinItemPointerGetBlockNumber(iptr); |
95 | val <<= MaxHeapTuplesPerPageBits; |
96 | val |= GinItemPointerGetOffsetNumber(iptr); |
97 | |
98 | return val; |
99 | } |
100 | |
101 | static inline void |
102 | uint64_to_itemptr(uint64 val, ItemPointer iptr) |
103 | { |
104 | GinItemPointerSetOffsetNumber(iptr, val & ((1 << MaxHeapTuplesPerPageBits) - 1)); |
105 | val = val >> MaxHeapTuplesPerPageBits; |
106 | GinItemPointerSetBlockNumber(iptr, val); |
107 | |
108 | Assert(ItemPointerIsValid(iptr)); |
109 | } |
110 | |
111 | /* |
112 | * Varbyte-encode 'val' into *ptr. *ptr is incremented to next integer. |
113 | */ |
114 | static void |
115 | encode_varbyte(uint64 val, unsigned char **ptr) |
116 | { |
117 | unsigned char *p = *ptr; |
118 | |
119 | while (val > 0x7F) |
120 | { |
121 | *(p++) = 0x80 | (val & 0x7F); |
122 | val >>= 7; |
123 | } |
124 | *(p++) = (unsigned char) val; |
125 | |
126 | *ptr = p; |
127 | } |
128 | |
129 | /* |
130 | * Decode varbyte-encoded integer at *ptr. *ptr is incremented to next integer. |
131 | */ |
132 | static uint64 |
133 | decode_varbyte(unsigned char **ptr) |
134 | { |
135 | uint64 val; |
136 | unsigned char *p = *ptr; |
137 | uint64 c; |
138 | |
139 | /* 1st byte */ |
140 | c = *(p++); |
141 | val = c & 0x7F; |
142 | if (c & 0x80) |
143 | { |
144 | /* 2nd byte */ |
145 | c = *(p++); |
146 | val |= (c & 0x7F) << 7; |
147 | if (c & 0x80) |
148 | { |
149 | /* 3rd byte */ |
150 | c = *(p++); |
151 | val |= (c & 0x7F) << 14; |
152 | if (c & 0x80) |
153 | { |
154 | /* 4th byte */ |
155 | c = *(p++); |
156 | val |= (c & 0x7F) << 21; |
157 | if (c & 0x80) |
158 | { |
159 | /* 5th byte */ |
160 | c = *(p++); |
161 | val |= (c & 0x7F) << 28; |
162 | if (c & 0x80) |
163 | { |
164 | /* 6th byte */ |
165 | c = *(p++); |
166 | val |= (c & 0x7F) << 35; |
167 | if (c & 0x80) |
168 | { |
169 | /* 7th byte, should not have continuation bit */ |
170 | c = *(p++); |
171 | val |= c << 42; |
172 | Assert((c & 0x80) == 0); |
173 | } |
174 | } |
175 | } |
176 | } |
177 | } |
178 | } |
179 | |
180 | *ptr = p; |
181 | |
182 | return val; |
183 | } |
184 | |
185 | /* |
186 | * Encode a posting list. |
187 | * |
188 | * The encoded list is returned in a palloc'd struct, which will be at most |
189 | * 'maxsize' bytes in size. The number items in the returned segment is |
190 | * returned in *nwritten. If it's not equal to nipd, not all the items fit |
191 | * in 'maxsize', and only the first *nwritten were encoded. |
192 | * |
193 | * The allocated size of the returned struct is short-aligned, and the padding |
194 | * byte at the end, if any, is zero. |
195 | */ |
196 | GinPostingList * |
197 | ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize, |
198 | int *nwritten) |
199 | { |
200 | uint64 prev; |
201 | int totalpacked = 0; |
202 | int maxbytes; |
203 | GinPostingList *result; |
204 | unsigned char *ptr; |
205 | unsigned char *endptr; |
206 | |
207 | maxsize = SHORTALIGN_DOWN(maxsize); |
208 | |
209 | result = palloc(maxsize); |
210 | |
211 | maxbytes = maxsize - offsetof(GinPostingList, bytes); |
212 | Assert(maxbytes > 0); |
213 | |
214 | /* Store the first special item */ |
215 | result->first = ipd[0]; |
216 | |
217 | prev = itemptr_to_uint64(&result->first); |
218 | |
219 | ptr = result->bytes; |
220 | endptr = result->bytes + maxbytes; |
221 | for (totalpacked = 1; totalpacked < nipd; totalpacked++) |
222 | { |
223 | uint64 val = itemptr_to_uint64(&ipd[totalpacked]); |
224 | uint64 delta = val - prev; |
225 | |
226 | Assert(val > prev); |
227 | |
228 | if (endptr - ptr >= MaxBytesPerInteger) |
229 | encode_varbyte(delta, &ptr); |
230 | else |
231 | { |
232 | /* |
233 | * There are less than 7 bytes left. Have to check if the next |
234 | * item fits in that space before writing it out. |
235 | */ |
236 | unsigned char buf[MaxBytesPerInteger]; |
237 | unsigned char *p = buf; |
238 | |
239 | encode_varbyte(delta, &p); |
240 | if (p - buf > (endptr - ptr)) |
241 | break; /* output is full */ |
242 | |
243 | memcpy(ptr, buf, p - buf); |
244 | ptr += (p - buf); |
245 | } |
246 | prev = val; |
247 | } |
248 | result->nbytes = ptr - result->bytes; |
249 | |
250 | /* |
251 | * If we wrote an odd number of bytes, zero out the padding byte at the |
252 | * end. |
253 | */ |
254 | if (result->nbytes != SHORTALIGN(result->nbytes)) |
255 | result->bytes[result->nbytes] = 0; |
256 | |
257 | if (nwritten) |
258 | *nwritten = totalpacked; |
259 | |
260 | Assert(SizeOfGinPostingList(result) <= maxsize); |
261 | |
262 | /* |
263 | * Check that the encoded segment decodes back to the original items. |
264 | */ |
265 | #if defined (CHECK_ENCODING_ROUNDTRIP) |
266 | { |
267 | int ndecoded; |
268 | ItemPointer tmp = ginPostingListDecode(result, &ndecoded); |
269 | int i; |
270 | |
271 | Assert(ndecoded == totalpacked); |
272 | for (i = 0; i < ndecoded; i++) |
273 | Assert(memcmp(&tmp[i], &ipd[i], sizeof(ItemPointerData)) == 0); |
274 | pfree(tmp); |
275 | } |
276 | #endif |
277 | |
278 | return result; |
279 | } |
280 | |
281 | /* |
282 | * Decode a compressed posting list into an array of item pointers. |
283 | * The number of items is returned in *ndecoded. |
284 | */ |
285 | ItemPointer |
286 | ginPostingListDecode(GinPostingList *plist, int *ndecoded) |
287 | { |
288 | return ginPostingListDecodeAllSegments(plist, |
289 | SizeOfGinPostingList(plist), |
290 | ndecoded); |
291 | } |
292 | |
293 | /* |
294 | * Decode multiple posting list segments into an array of item pointers. |
295 | * The number of items is returned in *ndecoded_out. The segments are stored |
296 | * one after each other, with total size 'len' bytes. |
297 | */ |
298 | ItemPointer |
299 | ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_out) |
300 | { |
301 | ItemPointer result; |
302 | int nallocated; |
303 | uint64 val; |
304 | char *endseg = ((char *) segment) + len; |
305 | int ndecoded; |
306 | unsigned char *ptr; |
307 | unsigned char *endptr; |
308 | |
309 | /* |
310 | * Guess an initial size of the array. |
311 | */ |
312 | nallocated = segment->nbytes * 2 + 1; |
313 | result = palloc(nallocated * sizeof(ItemPointerData)); |
314 | |
315 | ndecoded = 0; |
316 | while ((char *) segment < endseg) |
317 | { |
318 | /* enlarge output array if needed */ |
319 | if (ndecoded >= nallocated) |
320 | { |
321 | nallocated *= 2; |
322 | result = repalloc(result, nallocated * sizeof(ItemPointerData)); |
323 | } |
324 | |
325 | /* copy the first item */ |
326 | Assert(OffsetNumberIsValid(ItemPointerGetOffsetNumber(&segment->first))); |
327 | Assert(ndecoded == 0 || ginCompareItemPointers(&segment->first, &result[ndecoded - 1]) > 0); |
328 | result[ndecoded] = segment->first; |
329 | ndecoded++; |
330 | |
331 | val = itemptr_to_uint64(&segment->first); |
332 | ptr = segment->bytes; |
333 | endptr = segment->bytes + segment->nbytes; |
334 | while (ptr < endptr) |
335 | { |
336 | /* enlarge output array if needed */ |
337 | if (ndecoded >= nallocated) |
338 | { |
339 | nallocated *= 2; |
340 | result = repalloc(result, nallocated * sizeof(ItemPointerData)); |
341 | } |
342 | |
343 | val += decode_varbyte(&ptr); |
344 | |
345 | uint64_to_itemptr(val, &result[ndecoded]); |
346 | ndecoded++; |
347 | } |
348 | segment = GinNextPostingListSegment(segment); |
349 | } |
350 | |
351 | if (ndecoded_out) |
352 | *ndecoded_out = ndecoded; |
353 | return result; |
354 | } |
355 | |
356 | /* |
357 | * Add all item pointers from a bunch of posting lists to a TIDBitmap. |
358 | */ |
359 | int |
360 | ginPostingListDecodeAllSegmentsToTbm(GinPostingList *ptr, int len, |
361 | TIDBitmap *tbm) |
362 | { |
363 | int ndecoded; |
364 | ItemPointer items; |
365 | |
366 | items = ginPostingListDecodeAllSegments(ptr, len, &ndecoded); |
367 | tbm_add_tuples(tbm, items, ndecoded, false); |
368 | pfree(items); |
369 | |
370 | return ndecoded; |
371 | } |
372 | |
373 | /* |
374 | * Merge two ordered arrays of itempointers, eliminating any duplicates. |
375 | * |
376 | * Returns a palloc'd array, and *nmerged is set to the number of items in |
377 | * the result, after eliminating duplicates. |
378 | */ |
379 | ItemPointer |
380 | ginMergeItemPointers(ItemPointerData *a, uint32 na, |
381 | ItemPointerData *b, uint32 nb, |
382 | int *nmerged) |
383 | { |
384 | ItemPointerData *dst; |
385 | |
386 | dst = (ItemPointer) palloc((na + nb) * sizeof(ItemPointerData)); |
387 | |
388 | /* |
389 | * If the argument arrays don't overlap, we can just append them to each |
390 | * other. |
391 | */ |
392 | if (na == 0 || nb == 0 || ginCompareItemPointers(&a[na - 1], &b[0]) < 0) |
393 | { |
394 | memcpy(dst, a, na * sizeof(ItemPointerData)); |
395 | memcpy(&dst[na], b, nb * sizeof(ItemPointerData)); |
396 | *nmerged = na + nb; |
397 | } |
398 | else if (ginCompareItemPointers(&b[nb - 1], &a[0]) < 0) |
399 | { |
400 | memcpy(dst, b, nb * sizeof(ItemPointerData)); |
401 | memcpy(&dst[nb], a, na * sizeof(ItemPointerData)); |
402 | *nmerged = na + nb; |
403 | } |
404 | else |
405 | { |
406 | ItemPointerData *dptr = dst; |
407 | ItemPointerData *aptr = a; |
408 | ItemPointerData *bptr = b; |
409 | |
410 | while (aptr - a < na && bptr - b < nb) |
411 | { |
412 | int cmp = ginCompareItemPointers(aptr, bptr); |
413 | |
414 | if (cmp > 0) |
415 | *dptr++ = *bptr++; |
416 | else if (cmp == 0) |
417 | { |
418 | /* only keep one copy of the identical items */ |
419 | *dptr++ = *bptr++; |
420 | aptr++; |
421 | } |
422 | else |
423 | *dptr++ = *aptr++; |
424 | } |
425 | |
426 | while (aptr - a < na) |
427 | *dptr++ = *aptr++; |
428 | |
429 | while (bptr - b < nb) |
430 | *dptr++ = *bptr++; |
431 | |
432 | *nmerged = dptr - dst; |
433 | } |
434 | |
435 | return dst; |
436 | } |
437 | |