/*-------------------------------------------------------------------------
 *
 * hio.c
 *	  POSTGRES heap access method input/output code.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/heap/hio.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/heapam.h"
#include "access/hio.h"
#include "access/htup_details.h"
#include "access/visibilitymap.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"


/*
 * RelationPutHeapTuple - place tuple at specified page
 *
 * !!! EREPORT(ERROR) IS DISALLOWED HERE !!!  Must PANIC on failure!!!
 *
 * Note - caller must hold BUFFER_LOCK_EXCLUSIVE on the buffer.
 */
void
RelationPutHeapTuple(Relation relation,
					 Buffer buffer,
					 HeapTuple tuple,
					 bool token)
{
	Page		pageHeader;
	OffsetNumber offnum;

	/*
	 * A tuple that's being inserted speculatively should already have its
	 * token set.
	 */
	Assert(!token || HeapTupleHeaderIsSpeculative(tuple->t_data));

	/* Add the tuple to the page */
	pageHeader = BufferGetPage(buffer);

	offnum = PageAddItem(pageHeader, (Item) tuple->t_data,
						 tuple->t_len, InvalidOffsetNumber, false, true);

	if (offnum == InvalidOffsetNumber)
		elog(PANIC, "failed to add tuple to page");

	/* Update tuple->t_self to the actual position where it was stored */
	ItemPointerSet(&(tuple->t_self), BufferGetBlockNumber(buffer), offnum);

	/*
	 * Insert the correct position into CTID of the stored tuple, too (unless
	 * this is a speculative insertion, in which case the token is held in
	 * CTID field instead)
	 */
	if (!token)
	{
		ItemId		itemId = PageGetItemId(pageHeader, offnum);
		HeapTupleHeader item = (HeapTupleHeader) PageGetItem(pageHeader, itemId);

		item->t_ctid = tuple->t_self;
	}
}
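
/*
 * Illustrative caller pattern (a simplified sketch, not the exact code; see
 * heap_insert in heapam.c for the real sequence).  The tuple is placed, the
 * buffer dirtied, and the WAL record written all inside a critical section,
 * which is why this routine must PANIC rather than ERROR on failure:
 *
 *		START_CRIT_SECTION();
 *		RelationPutHeapTuple(relation, buffer, heaptup, false);
 *		MarkBufferDirty(buffer);
 *		... construct and insert the WAL record, if the relation needs WAL ...
 *		END_CRIT_SECTION();
 */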

/*
 * Read in a buffer in mode, using bulk-insert strategy if bistate isn't NULL.
 */
static Buffer
ReadBufferBI(Relation relation, BlockNumber targetBlock,
			 ReadBufferMode mode, BulkInsertState bistate)
{
	Buffer		buffer;

	/* If not bulk-insert, exactly like ReadBuffer */
	if (!bistate)
		return ReadBufferExtended(relation, MAIN_FORKNUM, targetBlock,
								  mode, NULL);

	/* If we have the desired block already pinned, re-pin and return it */
	if (bistate->current_buf != InvalidBuffer)
	{
		if (BufferGetBlockNumber(bistate->current_buf) == targetBlock)
		{
			/*
			 * Currently the LOCK variants are only used when extending the
			 * relation, and that code path should never reach this branch.
			 */
			Assert(mode != RBM_ZERO_AND_LOCK &&
				   mode != RBM_ZERO_AND_CLEANUP_LOCK);

			IncrBufferRefCount(bistate->current_buf);
			return bistate->current_buf;
		}
		/* ... else drop the old buffer */
		ReleaseBuffer(bistate->current_buf);
		bistate->current_buf = InvalidBuffer;
	}

	/* Perform a read using the buffer strategy */
	buffer = ReadBufferExtended(relation, MAIN_FORKNUM, targetBlock,
								mode, bistate->strategy);

	/* Save the selected block as target for future inserts */
	IncrBufferRefCount(buffer);
	bistate->current_buf = buffer;

	return buffer;
}
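
/*
 * Rough usage sketch of the bulk-insert path (hypothetical caller shown for
 * illustration; COPY-style code in the backend follows this shape, see
 * heapam.c and copy.c):
 *
 *		BulkInsertState bistate = GetBulkInsertState();
 *		... for each incoming row ...
 *			heap_insert(relation, tuple, cid, options, bistate);
 *		FreeBulkInsertState(bistate);
 *
 * Keeping the current target buffer pinned in the BulkInsertState lets
 * consecutive insertions into the same page skip a buffer-manager lookup,
 * and the BULKWRITE strategy keeps a large load from flooding shared buffers.
 */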

/*
 * For each heap page which is all-visible, acquire a pin on the appropriate
 * visibility map page, if we haven't already got one.
 *
 * buffer2 may be InvalidBuffer, if only one buffer is involved.  buffer1
 * must not be InvalidBuffer.  If both buffers are specified, block1 must
 * be less than block2.
 */
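/*
 * Illustrative call (a sketch of how RelationGetBufferForTuple, below, uses
 * this routine once it has ordered its two heap buffers by block number):
 *
 *		GetVisibilityMapPins(relation, buffer, otherBuffer,
 *							 targetBlock, otherBlock, vmbuffer,
 *							 vmbuffer_other);
 */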
static void
GetVisibilityMapPins(Relation relation, Buffer buffer1, Buffer buffer2,
					 BlockNumber block1, BlockNumber block2,
					 Buffer *vmbuffer1, Buffer *vmbuffer2)
{
	bool		need_to_pin_buffer1;
	bool		need_to_pin_buffer2;

	Assert(BufferIsValid(buffer1));
	Assert(buffer2 == InvalidBuffer || block1 <= block2);

	while (1)
	{
		/* Figure out which pins we need but don't have. */
		need_to_pin_buffer1 = PageIsAllVisible(BufferGetPage(buffer1))
			&& !visibilitymap_pin_ok(block1, *vmbuffer1);
		need_to_pin_buffer2 = buffer2 != InvalidBuffer
			&& PageIsAllVisible(BufferGetPage(buffer2))
			&& !visibilitymap_pin_ok(block2, *vmbuffer2);
		if (!need_to_pin_buffer1 && !need_to_pin_buffer2)
			return;

		/* We must unlock both buffers before doing any I/O. */
		LockBuffer(buffer1, BUFFER_LOCK_UNLOCK);
		if (buffer2 != InvalidBuffer && buffer2 != buffer1)
			LockBuffer(buffer2, BUFFER_LOCK_UNLOCK);

		/* Get pins. */
		if (need_to_pin_buffer1)
			visibilitymap_pin(relation, block1, vmbuffer1);
		if (need_to_pin_buffer2)
			visibilitymap_pin(relation, block2, vmbuffer2);

		/* Relock buffers. */
		LockBuffer(buffer1, BUFFER_LOCK_EXCLUSIVE);
		if (buffer2 != InvalidBuffer && buffer2 != buffer1)
			LockBuffer(buffer2, BUFFER_LOCK_EXCLUSIVE);

		/*
		 * If there are two buffers involved and we pinned just one of them,
		 * it's possible that the second one became all-visible while we were
		 * busy pinning the first one.  If it looks like that's a possible
		 * scenario, we'll need to make a second pass through this loop.
		 */
		if (buffer2 == InvalidBuffer || buffer1 == buffer2
			|| (need_to_pin_buffer1 && need_to_pin_buffer2))
			break;
	}
}

/*
 * Extend a relation by multiple blocks to avoid future contention on the
 * relation extension lock.  Our goal is to pre-extend the relation by an
 * amount which ramps up as the degree of contention ramps up, but limiting
 * the result to some sane overall value.
 */
static void
RelationAddExtraBlocks(Relation relation, BulkInsertState bistate)
{
	BlockNumber blockNum,
				firstBlock = InvalidBlockNumber;
	int			extraBlocks;
	int			lockWaiters;

	/* Use the length of the lock wait queue to judge how much to extend. */
	lockWaiters = RelationExtensionLockWaiterCount(relation);
	if (lockWaiters <= 0)
		return;

	/*
	 * It might seem like multiplying the number of lock waiters by as much as
	 * 20 is too aggressive, but benchmarking revealed that smaller numbers
	 * were insufficient.  512 is just an arbitrary cap to prevent
	 * pathological results.
	 */
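	/* For example: 4 waiters => 80 extra blocks; 26 or more hit the 512 cap. */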
	extraBlocks = Min(512, lockWaiters * 20);

	do
	{
		Buffer		buffer;
		Page		page;
		Size		freespace;

		/*
		 * Extend by one page.  This should generally match the main-line
		 * extension code in RelationGetBufferForTuple, except that we hold
		 * the relation extension lock throughout, and we don't immediately
		 * initialize the page (see below).
		 */
		buffer = ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate);
		page = BufferGetPage(buffer);

		if (!PageIsNew(page))
			elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
				 BufferGetBlockNumber(buffer),
				 RelationGetRelationName(relation));

		/*
		 * Add the page to the FSM without initializing.  If we were to
		 * initialize here, the page would potentially get flushed out to disk
		 * before we add any useful content.  There's no guarantee that would
		 * happen before a potential crash, and since we have to be able to
		 * cope with uninitialized pages anyway, skipping the initialization
		 * here avoids the risk of unnecessary writes.
		 */

		/* we'll need this info below */
		blockNum = BufferGetBlockNumber(buffer);
		freespace = BufferGetPageSize(buffer) - SizeOfPageHeaderData;

		UnlockReleaseBuffer(buffer);

		/* Remember first block number thus added. */
		if (firstBlock == InvalidBlockNumber)
			firstBlock = blockNum;

		/*
		 * Immediately update the bottom level of the FSM.  This has a good
		 * chance of making this page visible to other concurrently inserting
		 * backends, and we want that to happen without delay.
		 */
		RecordPageWithFreeSpace(relation, blockNum, freespace);
	}
	while (--extraBlocks > 0);

	/*
	 * Updating the upper levels of the free space map is too expensive to do
	 * for every block, but it's worth doing once at the end to make sure that
	 * subsequent insertion activity sees all of those nifty free pages we
	 * just inserted.
	 */
	FreeSpaceMapVacuumRange(relation, firstBlock, blockNum + 1);
}

/*
 * RelationGetBufferForTuple
 *
 *	Returns pinned and exclusive-locked buffer of a page in given relation
 *	with free space >= given len.
 *
 *	If otherBuffer is not InvalidBuffer, then it references a previously
 *	pinned buffer of another page in the same relation; on return, this
 *	buffer will also be exclusive-locked.  (This case is used by heap_update;
 *	the otherBuffer contains the tuple being updated.)
 *
 *	The reason for passing otherBuffer is that if two backends are doing
 *	concurrent heap_update operations, a deadlock could occur if they try
 *	to lock the same two buffers in opposite orders.  To ensure that this
 *	can't happen, we impose the rule that buffers of a relation must be
 *	locked in increasing page number order.  This is most conveniently done
 *	by having RelationGetBufferForTuple lock them both, with suitable care
 *	for ordering.
 *
 *	NOTE: it is unlikely, but not quite impossible, for otherBuffer to be the
 *	same buffer we select for insertion of the new tuple (this could only
 *	happen if space is freed in that page after heap_update finds there's not
 *	enough there).  In that case, the page will be pinned and locked only once.
 *
 *	For the vmbuffer and vmbuffer_other arguments, we avoid deadlock by
 *	locking them only after locking the corresponding heap page, and taking
 *	no further lwlocks while they are locked.
 *
 *	We normally use FSM to help us find free space.  However,
 *	if HEAP_INSERT_SKIP_FSM is specified, we just append a new empty page to
 *	the end of the relation if the tuple won't fit on the current target page.
 *	This can save some cycles when we know the relation is new and doesn't
 *	contain useful amounts of free space.
 *
 *	HEAP_INSERT_SKIP_FSM is also useful for non-WAL-logged additions to a
 *	relation, if the caller holds exclusive lock and is careful to invalidate
 *	relation's smgr_targblock before the first insertion --- that ensures that
 *	all insertions will occur into newly added pages and not be intermixed
 *	with tuples from other transactions.  That way, a crash can't risk losing
 *	any committed data of other transactions.  (See heap_insert's comments
 *	for additional constraints needed for safe usage of this behavior.)
 *
 *	The caller can also provide a BulkInsertState object to optimize many
 *	insertions into the same relation.  This keeps a pin on the current
 *	insertion target page (to save pin/unpin cycles) and also passes a
 *	BULKWRITE buffer selection strategy object to the buffer manager.
 *	Passing NULL for bistate selects the default behavior.
 *
 *	We always try to avoid filling existing pages further than the fillfactor.
 *	This is OK since this routine is not consulted when updating a tuple and
 *	keeping it on the same page, which is the scenario fillfactor is meant
 *	to reserve space for.
 *
 *	ereport(ERROR) is allowed here, so this routine *must* be called
 *	before any (unlogged) changes are made in buffer pool.
 */
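/*
 * Illustrative call shapes (simplified sketches only, not the exact call
 * sites; variable names are placeholders, see heap_insert and heap_update
 * in heapam.c for the real code):
 *
 *	plain insert:
 *		buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
 *										   InvalidBuffer, options, bistate,
 *										   &vmbuffer, NULL);
 *
 *	update, passing the old tuple's buffer as otherBuffer so that both pages
 *	come back exclusive-locked in a deadlock-safe order:
 *		newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
 *										   buffer, 0, NULL,
 *										   &vmbuffer_new, &vmbuffer);
 */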
Buffer
RelationGetBufferForTuple(Relation relation, Size len,
						  Buffer otherBuffer, int options,
						  BulkInsertState bistate,
						  Buffer *vmbuffer, Buffer *vmbuffer_other)
{
	bool		use_fsm = !(options & HEAP_INSERT_SKIP_FSM);
	Buffer		buffer = InvalidBuffer;
	Page		page;
	Size		pageFreeSpace = 0,
				saveFreeSpace = 0;
	BlockNumber targetBlock,
				otherBlock;
	bool		needLock;

	len = MAXALIGN(len);		/* be conservative */

	/* Bulk insert is not supported for updates, only inserts. */
	Assert(otherBuffer == InvalidBuffer || !bistate);

	/*
	 * If we're going to fail for an oversize tuple, do it right away
	 */
	if (len > MaxHeapTupleSize)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("row is too big: size %zu, maximum size %zu",
						len, MaxHeapTupleSize)));

	/* Compute desired extra freespace due to fillfactor option */
	saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
												   HEAP_DEFAULT_FILLFACTOR);

	if (otherBuffer != InvalidBuffer)
		otherBlock = BufferGetBlockNumber(otherBuffer);
	else
		otherBlock = InvalidBlockNumber;	/* just to keep compiler quiet */

	/*
	 * We first try to put the tuple on the same page we last inserted a tuple
	 * on, as cached in the BulkInsertState or relcache entry.  If that
	 * doesn't work, we ask the Free Space Map to locate a suitable page.
	 * Since the FSM's info might be out of date, we have to be prepared to
	 * loop around and retry multiple times.  (To ensure this isn't an infinite
	 * loop, we must update the FSM with the correct amount of free space on
	 * each page that proves not to be suitable.)  If the FSM has no record of
	 * a page with enough free space, we give up and extend the relation.
	 *
	 * When use_fsm is false, we either put the tuple onto the existing target
	 * page or extend the relation.
	 */
	if (len + saveFreeSpace > MaxHeapTupleSize)
	{
		/* can't fit, don't bother asking FSM */
		targetBlock = InvalidBlockNumber;
		use_fsm = false;
	}
	else if (bistate && bistate->current_buf != InvalidBuffer)
		targetBlock = BufferGetBlockNumber(bistate->current_buf);
	else
		targetBlock = RelationGetTargetBlock(relation);

	if (targetBlock == InvalidBlockNumber && use_fsm)
	{
		/*
		 * We have no cached target page, so ask the FSM for an initial
		 * target.
		 */
		targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);

		/*
		 * If the FSM knows nothing of the rel, try the last page before we
		 * give up and extend.  This avoids one-tuple-per-page syndrome during
		 * bootstrapping or in a recently-started system.
		 */
		if (targetBlock == InvalidBlockNumber)
		{
			BlockNumber nblocks = RelationGetNumberOfBlocks(relation);

			if (nblocks > 0)
				targetBlock = nblocks - 1;
		}
	}

loop:
	while (targetBlock != InvalidBlockNumber)
	{
		/*
		 * Read and exclusive-lock the target block, as well as the other
		 * block if one was given, taking suitable care with lock ordering and
		 * the possibility they are the same block.
		 *
		 * If the page-level all-visible flag is set, caller will need to
		 * clear both that and the corresponding visibility map bit.  However,
		 * by the time we return, we'll have x-locked the buffer, and we don't
		 * want to do any I/O while in that state.  So we check the bit here
		 * before taking the lock, and pin the page if it appears necessary.
		 * Checking without the lock creates a risk of getting the wrong
		 * answer, so we'll have to recheck after acquiring the lock.
		 */
		if (otherBuffer == InvalidBuffer)
		{
			/* easy case */
			buffer = ReadBufferBI(relation, targetBlock, RBM_NORMAL, bistate);
			if (PageIsAllVisible(BufferGetPage(buffer)))
				visibilitymap_pin(relation, targetBlock, vmbuffer);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		else if (otherBlock == targetBlock)
		{
			/* also easy case */
			buffer = otherBuffer;
			if (PageIsAllVisible(BufferGetPage(buffer)))
				visibilitymap_pin(relation, targetBlock, vmbuffer);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		else if (otherBlock < targetBlock)
		{
			/* lock other buffer first */
			buffer = ReadBuffer(relation, targetBlock);
			if (PageIsAllVisible(BufferGetPage(buffer)))
				visibilitymap_pin(relation, targetBlock, vmbuffer);
			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		else
		{
			/* lock target buffer first */
			buffer = ReadBuffer(relation, targetBlock);
			if (PageIsAllVisible(BufferGetPage(buffer)))
				visibilitymap_pin(relation, targetBlock, vmbuffer);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
		}

		/*
		 * We now have the target page (and the other buffer, if any) pinned
		 * and locked.  However, since our initial PageIsAllVisible checks
		 * were performed before acquiring the lock, the results might now be
		 * out of date, either for the selected victim buffer, or for the
		 * other buffer passed by the caller.  In that case, we'll need to
		 * give up our locks, go get the pin(s) we failed to get earlier, and
		 * re-lock.  That's pretty painful, but hopefully shouldn't happen
		 * often.
		 *
		 * Note that there's a small possibility that we didn't pin the page
		 * above but still have the correct page pinned anyway, either because
		 * we've already made a previous pass through this loop, or because
		 * caller passed us the right page anyway.
		 *
		 * Note also that it's possible that by the time we get the pin and
		 * retake the buffer locks, the visibility map bit will have been
		 * cleared by some other backend anyway.  In that case, we'll have
		 * done a bit of extra work for no gain, but there's no real harm
		 * done.
		 */
		if (otherBuffer == InvalidBuffer || targetBlock <= otherBlock)
			GetVisibilityMapPins(relation, buffer, otherBuffer,
								 targetBlock, otherBlock, vmbuffer,
								 vmbuffer_other);
		else
			GetVisibilityMapPins(relation, otherBuffer, buffer,
								 otherBlock, targetBlock, vmbuffer_other,
								 vmbuffer);

		/*
		 * Now we can check to see if there's enough free space here.  If so,
		 * we're done.
		 */
		page = BufferGetPage(buffer);

		/*
		 * If necessary, initialize the page.  It'll be used soon.  We could avoid
		 * dirtying the buffer here, and rely on the caller to do so whenever
		 * it puts a tuple onto the page, but there seems not much benefit in
		 * doing so.
		 */
		if (PageIsNew(page))
		{
			PageInit(page, BufferGetPageSize(buffer), 0);
			MarkBufferDirty(buffer);
		}

		pageFreeSpace = PageGetHeapFreeSpace(page);
		if (len + saveFreeSpace <= pageFreeSpace)
		{
			/* use this page as future insert target, too */
			RelationSetTargetBlock(relation, targetBlock);
			return buffer;
		}

		/*
		 * Not enough space, so we must give up our page locks and pin (if
		 * any) and prepare to look elsewhere.  We don't care which order we
		 * unlock the two buffers in, so this can be slightly simpler than the
		 * code above.
		 */
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
		if (otherBuffer == InvalidBuffer)
			ReleaseBuffer(buffer);
		else if (otherBlock != targetBlock)
		{
			LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
			ReleaseBuffer(buffer);
		}

		/* Without FSM, always fall out of the loop and extend */
		if (!use_fsm)
			break;

		/*
		 * Update FSM as to condition of this page, and ask for another page
		 * to try.
		 */
		targetBlock = RecordAndGetPageWithFreeSpace(relation,
													targetBlock,
													pageFreeSpace,
													len + saveFreeSpace);
	}

	/*
	 * Have to extend the relation.
	 *
	 * We have to use a lock to ensure no one else is extending the rel at the
	 * same time, else we will both try to initialize the same new page.  We
	 * can skip locking for new or temp relations, however, since no one else
	 * could be accessing them.
	 */
	needLock = !RELATION_IS_LOCAL(relation);

	/*
	 * If we need the lock but are not able to acquire it immediately, we'll
	 * consider extending the relation by multiple blocks at a time to manage
	 * contention on the relation extension lock.  However, this only makes
	 * sense if we're using the FSM; otherwise, there's no point.
	 */
	if (needLock)
	{
		if (!use_fsm)
			LockRelationForExtension(relation, ExclusiveLock);
		else if (!ConditionalLockRelationForExtension(relation, ExclusiveLock))
		{
			/* Couldn't get the lock immediately; wait for it. */
			LockRelationForExtension(relation, ExclusiveLock);

			/*
			 * Check if some other backend has extended a block for us while
			 * we were waiting on the lock.
			 */
			targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);

			/*
			 * If some other waiter has already extended the relation, we
			 * don't need to do so; just use the existing freespace.
			 */
			if (targetBlock != InvalidBlockNumber)
			{
				UnlockRelationForExtension(relation, ExclusiveLock);
				goto loop;
			}

			/* Time to bulk-extend. */
			RelationAddExtraBlocks(relation, bistate);
		}
	}

	/*
	 * In addition to whatever extension we performed above, we always add at
	 * least one block to satisfy our own request.
	 *
	 * XXX This does an lseek - rather expensive - but at the moment it is the
	 * only way to accurately determine how many blocks are in a relation.  Is
	 * it worth keeping an accurate file length in shared memory someplace,
	 * rather than relying on the kernel to do it for us?
	 */
	buffer = ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate);

	/*
	 * We need to initialize the empty new page.  Double-check that it really
	 * is empty (this should never happen, but if it does we don't want to
	 * risk wiping out valid data).
	 */
	page = BufferGetPage(buffer);

	if (!PageIsNew(page))
		elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
			 BufferGetBlockNumber(buffer),
			 RelationGetRelationName(relation));

	PageInit(page, BufferGetPageSize(buffer), 0);
	MarkBufferDirty(buffer);

	/*
	 * Release the file-extension lock; it's now OK for someone else to extend
	 * the relation some more.
	 */
	if (needLock)
		UnlockRelationForExtension(relation, ExclusiveLock);

	/*
	 * Lock the other buffer.  It's guaranteed to be of a lower page number
	 * than the new page.  To conform with the deadlock prevention rules, we
	 * ought to lock otherBuffer first, but that would give other backends a
	 * chance to put tuples on our page.  To reduce the likelihood of that,
	 * attempt to lock the other buffer conditionally; that's very likely to
	 * work.  Otherwise we need to lock the buffers in the correct order, and
	 * retry if the space has been used up in the meantime.
	 *
	 * Alternatively, we could acquire the lock on otherBuffer before
	 * extending the relation, but that'd require holding the lock while
	 * performing IO, which seems worse than an unlikely retry.
	 */
	if (otherBuffer != InvalidBuffer)
	{
		Assert(otherBuffer != buffer);

		if (unlikely(!ConditionalLockBuffer(otherBuffer)))
		{
			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

			/*
			 * Because the buffer was unlocked for a while, it's possible,
			 * although unlikely, that the page was filled.  If so, just retry
			 * from start.
			 */
			if (len > PageGetHeapFreeSpace(page))
			{
				LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
				UnlockReleaseBuffer(buffer);

				goto loop;
			}
		}
	}

	if (len > PageGetHeapFreeSpace(page))
	{
		/* We should not get here given the test at the top */
		elog(PANIC, "tuple is too big: size %zu", len);
	}

	/*
	 * Remember the new page as our target for future insertions.
	 *
	 * XXX should we enter the new page into the free space map immediately,
	 * or just keep it for this backend's exclusive use in the short run
	 * (until VACUUM sees it)?  Seems to depend on whether you expect the
	 * current backend to make more insertions or not, which is probably a
	 * good bet most of the time.  So for now, don't add it to FSM yet.
	 */
	RelationSetTargetBlock(relation, BufferGetBlockNumber(buffer));

	return buffer;
}