/*-------------------------------------------------------------------------
 *
 * hashinsert.c
 *    Item insertion in hash tables for Postgres.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/access/hash/hashinsert.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/hash.h"
#include "access/hash_xlog.h"
#include "miscadmin.h"
#include "utils/rel.h"
#include "storage/lwlock.h"
#include "storage/buf_internals.h"
#include "storage/predicate.h"

static void _hash_vacuum_one_page(Relation rel, Relation hrel,
                                  Buffer metabuf, Buffer buf);

/*
 * _hash_doinsert() -- Handle insertion of a single index tuple.
 *
 *    This routine is called by the public interface routines, hashbuild
 *    and hashinsert.  By here, itup is completely filled in.
 */
void
_hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel)
{
    Buffer      buf = InvalidBuffer;
    Buffer      bucket_buf;
    Buffer      metabuf;
    HashMetaPage metap;
    HashMetaPage usedmetap = NULL;
    Page        metapage;
    Page        page;
    HashPageOpaque pageopaque;
    Size        itemsz;
    bool        do_expand;
    uint32      hashkey;
    Bucket      bucket;
    OffsetNumber itup_off;

    /*
     * Get the hash key for the item (it's stored in the index tuple itself).
     */
    hashkey = _hash_get_indextuple_hashkey(itup);

    /* compute item size too */
    itemsz = IndexTupleSize(itup);
    itemsz = MAXALIGN(itemsz);  /* be safe, PageAddItem will do this but we
                                 * need to be consistent */

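    /*
     * We may have to come back here and retry the insertion from scratch if
     * we release our locks to finish an in-progress split of the target
     * bucket (see below).
     */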
restart_insert:

    /*
     * Read the metapage.  We don't lock it yet; HashMaxItemSize() will
     * examine pd_pagesize_version, but that can't change so we can examine
     * it without a lock.
     */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
    metapage = BufferGetPage(metabuf);

    /*
     * Check whether the item can fit on a hash page at all.  (Eventually, we
     * ought to try to apply TOAST methods if not.)  Note that at this point,
     * itemsz doesn't include the ItemId.
     *
     * XXX this is useless code if we are only storing hash keys.
     */
    if (itemsz > HashMaxItemSize(metapage))
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("index row size %zu exceeds hash maximum %zu",
                        itemsz, HashMaxItemSize(metapage)),
                 errhint("Values larger than a buffer page cannot be indexed.")));

    /* Lock the primary bucket page for the target bucket. */
    buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE,
                                          &usedmetap);
    Assert(usedmetap != NULL);

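    /*
     * Check for serialization conflicts: under SSI, predicate locks on a
     * hash index are taken on the primary bucket page, so an insertion into
     * this bucket must be reported as a rw-conflict against any concurrent
     * serializable reader of the bucket.
     */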
    CheckForSerializableConflictIn(rel, NULL, buf);

    /* remember the primary bucket buffer so we can release its pin at the end */
    bucket_buf = buf;

    page = BufferGetPage(buf);
    pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
    bucket = pageopaque->hasho_bucket;

    /*
     * If this bucket is in the process of being split, try to finish the
     * split before inserting, because that might create room for the
     * insertion to proceed without allocating an additional overflow page.
     * It's only interesting to finish the split if we're trying to insert
     * into the bucket from which we're removing tuples (the "old" bucket),
     * not if we're trying to insert into the bucket into which tuples are
     * being moved (the "new" bucket).
     */
    if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf))
    {
        /* release the lock on bucket buffer, before completing the split. */
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);

        _hash_finish_split(rel, metabuf, buf, bucket,
                           usedmetap->hashm_maxbucket,
                           usedmetap->hashm_highmask,
                           usedmetap->hashm_lowmask);

        /* release the pin on old and meta buffer.  retry for insert. */
        _hash_dropbuf(rel, buf);
        _hash_dropbuf(rel, metabuf);
        goto restart_insert;
    }

    /*
     * Find a page in the bucket chain with enough space for the new tuple;
     * the insertion itself happens below, once the metapage is locked.
     */
    while (PageGetFreeSpace(page) < itemsz)
    {
        BlockNumber nextblkno;

        /*
         * Check if the current page has any DEAD tuples.  If yes, delete
         * those tuples and see if that frees enough space for the new item
         * before moving on to the next page in the bucket chain.
         */
        if (H_HAS_DEAD_TUPLES(pageopaque))
        {
            if (IsBufferCleanupOK(buf))
            {
                _hash_vacuum_one_page(rel, heapRel, metabuf, buf);

                if (PageGetFreeSpace(page) >= itemsz)
                    break;      /* OK, now we have enough space */
            }
        }

        /*
         * no space on this page; check for an overflow page
         */
        nextblkno = pageopaque->hasho_nextblkno;

        if (BlockNumberIsValid(nextblkno))
        {
            /*
             * ovfl page exists; go get it.  if it doesn't have room, we'll
             * find out next pass through the loop test above.  we always
             * release both the lock and pin if this is an overflow page, but
             * only the lock if this is the primary bucket page, since the
             * pin on the primary bucket must be retained throughout the
             * scan.
             */
            if (buf != bucket_buf)
                _hash_relbuf(rel, buf);
            else
                LockBuffer(buf, BUFFER_LOCK_UNLOCK);
            buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
            page = BufferGetPage(buf);
        }
        else
        {
            /*
             * we're at the end of the bucket chain and we haven't found a
             * page with enough room.  allocate a new overflow page.
             */

            /* release our write lock without modifying buffer */
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);

            /* chain to a new overflow page */
            buf = _hash_addovflpage(rel, metabuf, buf, (buf == bucket_buf));
            page = BufferGetPage(buf);

            /* should fit now, given test above */
            Assert(PageGetFreeSpace(page) >= itemsz);
        }
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
        Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE);
        Assert(pageopaque->hasho_bucket == bucket);
    }

    /*
     * Write-lock the metapage so we can increment the tuple count.  After
     * incrementing it, check to see if it's time for a split.
     */
    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

    /* Do the update.  No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /* found page with enough space, so add the item here */
    itup_off = _hash_pgaddtup(rel, buf, itemsz, itup);
    MarkBufferDirty(buf);

    /* metapage operations */
    metap = HashPageGetMeta(metapage);
    metap->hashm_ntuples += 1;

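    /*
     * Decide whether the table needs to be expanded: a split is wanted once
     * the total tuple count exceeds the fill factor times the current number
     * of buckets, i.e. once the average bucket has become overfull.
     */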
    /* Make sure this stays in sync with _hash_expandtable() */
    do_expand = metap->hashm_ntuples >
        (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);

    MarkBufferDirty(metabuf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        xl_hash_insert xlrec;
        XLogRecPtr  recptr;

        xlrec.offnum = itup_off;

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashInsert);

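        /*
         * Register the metapage as well, so that replay of this record also
         * reflects the incremented tuple count.
         */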
        XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT);

        PageSetLSN(BufferGetPage(buf), recptr);
        PageSetLSN(BufferGetPage(metabuf), recptr);
    }

    END_CRIT_SECTION();

    /* drop lock on metapage, but keep pin */
    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

    /*
     * Release the page we inserted into; if it was an overflow page, also
     * drop the pin we still hold on the primary bucket page.
     */
    _hash_relbuf(rel, buf);
    if (buf != bucket_buf)
        _hash_dropbuf(rel, bucket_buf);

    /* Attempt to split if a split is needed */
    if (do_expand)
        _hash_expandtable(rel, metabuf);

    /* Finally drop our pin on the metapage */
    _hash_dropbuf(rel, metabuf);
}

/*
 * _hash_pgaddtup() -- add a tuple to a particular page in the index.
 *
 * This routine adds the tuple to the page as requested; it does not write out
 * the page.  It is an error to call _hash_pgaddtup() without a pin and write
 * lock on the target buffer.
 *
 * Returns the offset number at which the tuple was inserted.  This function
 * is responsible for preserving the condition that tuples in a hash index
 * page are sorted by hashkey value.
 */
OffsetNumber
_hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup)
{
    OffsetNumber itup_off;
    Page        page;
    uint32      hashkey;

    _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
    page = BufferGetPage(buf);

    /* Find where to insert the tuple (preserving page's hashkey ordering) */
    hashkey = _hash_get_indextuple_hashkey(itup);
    itup_off = _hash_binsearch(page, hashkey);

    if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false)
        == InvalidOffsetNumber)
        elog(ERROR, "failed to add index item to \"%s\"",
             RelationGetRelationName(rel));

    return itup_off;
}

/*
 * _hash_pgaddmultitup() -- add a tuple vector to a particular page in the
 *                          index.
 *
 * This routine has the same requirements for locking and tuple ordering as
 * _hash_pgaddtup().
 *
 * The offset numbers at which the tuples were inserted are returned in the
 * itup_offsets array.
 */
void
_hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups,
                    OffsetNumber *itup_offsets, uint16 nitups)
{
    OffsetNumber itup_off;
    Page        page;
    uint32      hashkey;
    int         i;

    _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
    page = BufferGetPage(buf);

    for (i = 0; i < nitups; i++)
    {
        Size        itemsize;

        itemsize = IndexTupleSize(itups[i]);
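        /* be safe, PageAddItem will do this but we need to be consistent */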
        itemsize = MAXALIGN(itemsize);

        /* Find where to insert the tuple (preserving page's hashkey ordering) */
        hashkey = _hash_get_indextuple_hashkey(itups[i]);
        itup_off = _hash_binsearch(page, hashkey);

        itup_offsets[i] = itup_off;

        if (PageAddItem(page, (Item) itups[i], itemsize, itup_off, false, false)
            == InvalidOffsetNumber)
            elog(ERROR, "failed to add index item to \"%s\"",
                 RelationGetRelationName(rel));
    }
}

/*
 * _hash_vacuum_one_page - vacuum just one index page.
 *
 * Try to remove LP_DEAD items from the given page.  The caller must hold a
 * cleanup lock on the page being modified before calling this function.
 */
static void
_hash_vacuum_one_page(Relation rel, Relation hrel, Buffer metabuf, Buffer buf)
{
    OffsetNumber deletable[MaxOffsetNumber];
    int         ndeletable = 0;
    OffsetNumber offnum,
                maxoff;
    Page        page = BufferGetPage(buf);
    HashPageOpaque pageopaque;
    HashMetaPage metap;

    /* Scan each tuple on the page to see if it is marked as LP_DEAD */
    maxoff = PageGetMaxOffsetNumber(page);
    for (offnum = FirstOffsetNumber;
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        ItemId      itemId = PageGetItemId(page, offnum);

        if (ItemIdIsDead(itemId))
            deletable[ndeletable++] = offnum;
    }

    if (ndeletable > 0)
    {
        TransactionId latestRemovedXid;

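        /*
         * Find the newest xid among the heap tuples referenced by the index
         * entries being removed; replay uses it to resolve conflicts with
         * queries running on a hot standby.
         */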
        latestRemovedXid =
            index_compute_xid_horizon_for_tuples(rel, hrel, buf,
                                                 deletable, ndeletable);

        /*
         * Write-lock the meta page so that we can decrement the tuple count.
         */
        LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

        /* No ereport(ERROR) until changes are logged */
        START_CRIT_SECTION();

        PageIndexMultiDelete(page, deletable, ndeletable);

        /*
         * Mark the page as not containing any LP_DEAD items.  This is not
         * necessarily true (there might be some that have recently been
         * marked, but weren't included in our target-item list), but it will
         * almost always be true and it doesn't seem worth an additional page
         * scan to check it.  Remember that LH_PAGE_HAS_DEAD_TUPLES is only a
         * hint anyway.
         */
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
        pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;

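        /*
         * The hash metapage keeps a running tuple count that drives the
         * bucket-split decision, so account for the removed tuples here.
         */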
        metap = HashPageGetMeta(BufferGetPage(metabuf));
        metap->hashm_ntuples -= ndeletable;

        MarkBufferDirty(buf);
        MarkBufferDirty(metabuf);

        /* XLOG stuff */
        if (RelationNeedsWAL(rel))
        {
            xl_hash_vacuum_one_page xlrec;
            XLogRecPtr  recptr;

            xlrec.latestRemovedXid = latestRemovedXid;
            xlrec.ntuples = ndeletable;

            XLogBeginInsert();
            XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
            XLogRegisterData((char *) &xlrec, SizeOfHashVacuumOnePage);

            /*
             * We need the target-offsets array whether or not we store the
             * whole buffer, to allow us to find the latestRemovedXid on a
             * standby server.
             */
            XLogRegisterData((char *) deletable,
                             ndeletable * sizeof(OffsetNumber));

            XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

            recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_VACUUM_ONE_PAGE);

            PageSetLSN(BufferGetPage(buf), recptr);
            PageSetLSN(BufferGetPage(metabuf), recptr);
        }

        END_CRIT_SECTION();

        /*
         * Release the write lock on the meta page now that the tuple count
         * has been updated.
         */
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
    }
}