ginpostinglist.c source code [PostgreSQL/src/backend/access/gin/ginpostinglist.c]

1	/-------------------------------------------------------------------------*
2	*
3	* ginpostinglist.c
4	* routines for dealing with posting lists.
5	*
6	*
7	* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
8	* Portions Copyright (c) 1994, Regents of the University of California
9	*
10	* IDENTIFICATION
11	* src/backend/access/gin/ginpostinglist.c
12	*-------------------------------------------------------------------------
13	*/
14
15	#include "postgres.h"
16
17	#include "access/gin_private.h"
18
19	#ifdef USE_ASSERT_CHECKING
20	#define CHECK_ENCODING_ROUNDTRIP
21	#endif
22
23	/*
24	* For encoding purposes, item pointers are represented as 64-bit unsigned
25	* integers. The lowest 11 bits represent the offset number, and the next
26	* lowest 32 bits are the block number. That leaves 21 bits unused, i.e.
27	* only 43 low bits are used.
28	*
29	* 11 bits is enough for the offset number, because MaxHeapTuplesPerPage <
30	* 2^11 on all supported block sizes. We are frugal with the bits, because
31	* smaller integers use fewer bytes in the varbyte encoding, saving disk
32	* space. (If we get a new table AM in the future that wants to use the full
33	* range of possible offset numbers, we'll need to change this.)
34	*
35	* These 43-bit integers are encoded using varbyte encoding. In each byte,
36	* the 7 low bits contain data, while the highest bit is a continuation bit.
37	* When the continuation bit is set, the next byte is part of the same
38	* integer, otherwise this is the last byte of this integer. 43 bits need
39	* at most 7 bytes in this encoding:
40	*
41	* 0XXXXXXX
42	* 1XXXXXXX 0XXXXYYY
43	* 1XXXXXXX 1XXXXYYY 0YYYYYYY
44	* 1XXXXXXX 1XXXXYYY 1YYYYYYY 0YYYYYYY
45	* 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
46	* 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
47	* 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0uuuuuuY
48	*
49	* X = bits used for offset number
50	* Y = bits used for block number
51	* u = unused bit
52	*
53	* The bytes are in stored in little-endian order.
54	*
55	* An important property of this encoding is that removing an item from list
56	* never increases the size of the resulting compressed posting list. Proof:
57	*
58	* Removing number is actually replacement of two numbers with their sum. We
59	* have to prove that varbyte encoding of a sum can't be longer than varbyte
60	* encoding of its summands. Sum of two numbers is at most one bit wider than
61	* the larger of the summands. Widening a number by one bit enlarges its length
62	* in varbyte encoding by at most one byte. Therefore, varbyte encoding of sum
63	* is at most one byte longer than varbyte encoding of larger summand. Lesser
64	* summand is at least one byte, so the sum cannot take more space than the
65	* summands, Q.E.D.
66	*
67	* This property greatly simplifies VACUUM, which can assume that posting
68	* lists always fit on the same page after vacuuming. Note that even though
69	* that holds for removing items from a posting list, you must also be
70	* careful to not cause expansion e.g. when merging uncompressed items on the
71	* page into the compressed lists, when vacuuming.
72	*/
73
74	/*
75	* How many bits do you need to encode offset number? OffsetNumber is a 16-bit
76	* integer, but you can't fit that many items on a page. 11 ought to be more
77	* than enough. It's tempting to derive this from MaxHeapTuplesPerPage, and
78	* use the minimum number of bits, but that would require changing the on-disk
79	* format if MaxHeapTuplesPerPage changes. Better to leave some slack.
80	*/
81	#define MaxHeapTuplesPerPageBits 11
82
83	/ Max. number of bytes needed to encode the largest supported integer. /
84	#define MaxBytesPerInteger 7
85
86	static inline uint64
87	itemptr_to_uint64(const ItemPointer iptr)
88	{
89	uint64 val;
90
91	Assert(ItemPointerIsValid(iptr));
92	Assert(GinItemPointerGetOffsetNumber(iptr) < (`1` << MaxHeapTuplesPerPageBits));
93
94	val = GinItemPointerGetBlockNumber(iptr);
95	val <<= MaxHeapTuplesPerPageBits;
96	val \|= GinItemPointerGetOffsetNumber(iptr);
97
98	return val;
99	}
100
101	static inline void
102	uint64_to_itemptr(uint64 val, ItemPointer iptr)
103	{
104	GinItemPointerSetOffsetNumber(iptr, val & ((`1` << MaxHeapTuplesPerPageBits) - `1`));
105	val = val >> MaxHeapTuplesPerPageBits;
106	GinItemPointerSetBlockNumber(iptr, val);
107
108	Assert(ItemPointerIsValid(iptr));
109	}
110
111	/*
112	* Varbyte-encode 'val' into ptr. ptr is incremented to next integer.
113	*/
114	static void
115	encode_varbyte(uint64 val, unsigned char **ptr)
116	{
117	unsigned char p = ptr;
118
119	while (val > `0x7F`)
120	{
121	*(p++) = `0x80` \| (val & `0x7F`);
122	val >>= `7`;
123	}
124	(p++) = (unsigned* char) val;
125
126	*ptr = p;
127	}
128
129	/*
130	* Decode varbyte-encoded integer at ptr. ptr is incremented to next integer.
131	*/
132	static uint64
133	decode_varbyte(unsigned char **ptr)
134	{
135	uint64 val;
136	unsigned char p = ptr;
137	uint64 c;
138
139	/ 1st byte /
140	c = *(p++);
141	val = c & `0x7F`;
142	if (c & `0x80`)
143	{
144	/ 2nd byte /
145	c = *(p++);
146	val \|= (c & `0x7F`) << `7`;
147	if (c & `0x80`)
148	{
149	/ 3rd byte /
150	c = *(p++);
151	val \|= (c & `0x7F`) << `14`;
152	if (c & `0x80`)
153	{
154	/ 4th byte /
155	c = *(p++);
156	val \|= (c & `0x7F`) << `21`;
157	if (c & `0x80`)
158	{
159	/ 5th byte /
160	c = *(p++);
161	val \|= (c & `0x7F`) << `28`;
162	if (c & `0x80`)
163	{
164	/ 6th byte /
165	c = *(p++);
166	val \|= (c & `0x7F`) << `35`;
167	if (c & `0x80`)
168	{
169	/ 7th byte, should not have continuation bit /
170	c = *(p++);
171	val \|= c << `42`;
172	Assert((c & `0x80`) == `0`);
173	}
174	}
175	}
176	}
177	}
178	}
179
180	*ptr = p;
181
182	return val;
183	}
184
185	/*
186	* Encode a posting list.
187	*
188	* The encoded list is returned in a palloc'd struct, which will be at most
189	* 'maxsize' bytes in size. The number items in the returned segment is
190	* returned in *nwritten. If it's not equal to nipd, not all the items fit
191	* in 'maxsize', and only the first *nwritten were encoded.
192	*
193	* The allocated size of the returned struct is short-aligned, and the padding
194	* byte at the end, if any, is zero.
195	*/
196	GinPostingList *
197	ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize,
198	int *nwritten)
199	{
200	uint64 prev;
201	int totalpacked = `0`;
202	int maxbytes;
203	GinPostingList *result;
204	unsigned char *ptr;
205	unsigned char *endptr;
206
207	maxsize = SHORTALIGN_DOWN(maxsize);
208
209	result = palloc(maxsize);
210
211	maxbytes = maxsize - offsetof(GinPostingList, bytes);
212	Assert(maxbytes > `0`);
213
214	/ Store the first special item /
215	result->first = ipd[`0`];
216
217	prev = itemptr_to_uint64(&result->first);
218
219	ptr = result->bytes;
220	endptr = result->bytes + maxbytes;
221	for (totalpacked = `1`; totalpacked < nipd; totalpacked++)
222	{
223	uint64 val = itemptr_to_uint64(&ipd[totalpacked]);
224	uint64 delta = val - prev;
225
226	Assert(val > prev);
227
228	if (endptr - ptr >= MaxBytesPerInteger)
229	encode_varbyte(delta, &ptr);
230	else
231	{
232	/*
233	* There are less than 7 bytes left. Have to check if the next
234	* item fits in that space before writing it out.
235	*/
236	unsigned char buf[MaxBytesPerInteger];
237	unsigned char *p = buf;
238
239	encode_varbyte(delta, &p);
240	if (p - buf > (endptr - ptr))
241	break; / output is full /
242
243	memcpy(ptr, buf, p - buf);
244	ptr += (p - buf);
245	}
246	prev = val;
247	}
248	result->nbytes = ptr - result->bytes;
249
250	/*
251	* If we wrote an odd number of bytes, zero out the padding byte at the
252	* end.
253	*/
254	if (result->nbytes != SHORTALIGN(result->nbytes))
255	result->bytes[result->nbytes] = `0`;
256
257	if (nwritten)
258	*nwritten = totalpacked;
259
260	Assert(SizeOfGinPostingList(result) <= maxsize);
261
262	/*
263	* Check that the encoded segment decodes back to the original items.
264	*/
265	#if defined (CHECK_ENCODING_ROUNDTRIP)
266	{
267	int ndecoded;
268	ItemPointer tmp = ginPostingListDecode(result, &ndecoded);
269	int i;
270
271	Assert(ndecoded == totalpacked);
272	for (i = `0`; i < ndecoded; i++)
273	Assert(memcmp(&tmp[i], &ipd[i], sizeof(ItemPointerData)) == `0`);
274	pfree(tmp);
275	}
276	#endif
277
278	return result;
279	}
280
281	/*
282	* Decode a compressed posting list into an array of item pointers.
283	* The number of items is returned in *ndecoded.
284	*/
285	ItemPointer
286	ginPostingListDecode(GinPostingList plist, int* *ndecoded)
287	{
288	return ginPostingListDecodeAllSegments(plist,
289	SizeOfGinPostingList(plist),
290	ndecoded);
291	}
292
293	/*
294	* Decode multiple posting list segments into an array of item pointers.
295	* The number of items is returned in *ndecoded_out. The segments are stored
296	* one after each other, with total size 'len' bytes.
297	*/
298	ItemPointer
299	ginPostingListDecodeAllSegments(GinPostingList segment, int* len, int *ndecoded_out)
300	{
301	ItemPointer result;
302	int nallocated;
303	uint64 val;
304	char endseg = ((char* *) segment) + len;
305	int ndecoded;
306	unsigned char *ptr;
307	unsigned char *endptr;
308
309	/*
310	* Guess an initial size of the array.
311	*/
312	nallocated = segment->nbytes * `2` + `1`;
313	result = palloc(nallocated * sizeof(ItemPointerData));
314
315	ndecoded = `0`;
316	while ((char *) segment < endseg)
317	{
318	/ enlarge output array if needed /
319	if (ndecoded >= nallocated)
320	{
321	nallocated *= `2`;
322	result = repalloc(result, nallocated * sizeof(ItemPointerData));
323	}
324
325	/ copy the first item /
326	Assert(OffsetNumberIsValid(ItemPointerGetOffsetNumber(&segment->first)));
327	Assert(ndecoded == `0` \|\| ginCompareItemPointers(&segment->first, &result[ndecoded - `1`]) > `0`);
328	result[ndecoded] = segment->first;
329	ndecoded++;
330
331	val = itemptr_to_uint64(&segment->first);
332	ptr = segment->bytes;
333	endptr = segment->bytes + segment->nbytes;
334	while (ptr < endptr)
335	{
336	/ enlarge output array if needed /
337	if (ndecoded >= nallocated)
338	{
339	nallocated *= `2`;
340	result = repalloc(result, nallocated * sizeof(ItemPointerData));
341	}
342
343	val += decode_varbyte(&ptr);
344
345	uint64_to_itemptr(val, &result[ndecoded]);
346	ndecoded++;
347	}
348	segment = GinNextPostingListSegment(segment);
349	}
350
351	if (ndecoded_out)
352	*ndecoded_out = ndecoded;
353	return result;
354	}
355
356	/*
357	* Add all item pointers from a bunch of posting lists to a TIDBitmap.
358	*/
359	int
360	ginPostingListDecodeAllSegmentsToTbm(GinPostingList ptr, int* len,
361	TIDBitmap *tbm)
362	{
363	int ndecoded;
364	ItemPointer items;
365
366	items = ginPostingListDecodeAllSegments(ptr, len, &ndecoded);
367	tbm_add_tuples(tbm, items, ndecoded, false);
368	pfree(items);
369
370	return ndecoded;
371	}
372
373	/*
374	* Merge two ordered arrays of itempointers, eliminating any duplicates.
375	*
376	* Returns a palloc'd array, and *nmerged is set to the number of items in
377	* the result, after eliminating duplicates.
378	*/
379	ItemPointer
380	ginMergeItemPointers(ItemPointerData *a, uint32 na,
381	ItemPointerData *b, uint32 nb,
382	int *nmerged)
383	{
384	ItemPointerData *dst;
385
386	dst = (ItemPointer) palloc((na + nb) * sizeof(ItemPointerData));
387
388	/*
389	* If the argument arrays don't overlap, we can just append them to each
390	* other.
391	*/
392	if (na == `0` \|\| nb == `0` \|\| ginCompareItemPointers(&a[na - `1`], &b[`0`]) < `0`)
393	{
394	memcpy(dst, a, na * sizeof(ItemPointerData));
395	memcpy(&dst[na], b, nb * sizeof(ItemPointerData));
396	*nmerged = na + nb;
397	}
398	else if (ginCompareItemPointers(&b[nb - `1`], &a[`0`]) < `0`)
399	{
400	memcpy(dst, b, nb * sizeof(ItemPointerData));
401	memcpy(&dst[nb], a, na * sizeof(ItemPointerData));
402	*nmerged = na + nb;
403	}
404	else
405	{
406	ItemPointerData *dptr = dst;
407	ItemPointerData *aptr = a;
408	ItemPointerData *bptr = b;
409
410	while (aptr - a < na && bptr - b < nb)
411	{
412	int cmp = ginCompareItemPointers(aptr, bptr);
413
414	if (cmp > `0`)
415	dptr++ = bptr++;
416	else if (cmp == `0`)
417	{
418	/ only keep one copy of the identical items /
419	dptr++ = bptr++;
420	aptr++;
421	}
422	else
423	dptr++ = aptr++;
424	}
425
426	while (aptr - a < na)
427	dptr++ = aptr++;
428
429	while (bptr - b < nb)
430	dptr++ = bptr++;
431
432	*nmerged = dptr - dst;
433	}
434
435	return dst;
436	}
437

Browse the source code of PostgreSQL/src/backend/access/gin/ginpostinglist.c