/*-------------------------------------------------------------------------
 *
 * hashinsert.c
 *    Item insertion in hash tables for Postgres.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/access/hash/hashinsert.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/hash.h"
#include "access/hash_xlog.h"
#include "miscadmin.h"
#include "utils/rel.h"
#include "storage/lwlock.h"
#include "storage/buf_internals.h"
#include "storage/predicate.h"

static void _hash_vacuum_one_page(Relation rel, Relation hrel,
                                  Buffer metabuf, Buffer buf);

/*
 * _hash_doinsert() -- Handle insertion of a single index tuple.
 *
 *    This routine is called by the public interface routines, hashbuild
 *    and hashinsert.  By here, itup is completely filled in.
 */
void
_hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel)
{
    Buffer      buf = InvalidBuffer;
    Buffer      bucket_buf;
    Buffer      metabuf;
    HashMetaPage metap;
    HashMetaPage usedmetap = NULL;
    Page        metapage;
    Page        page;
    HashPageOpaque pageopaque;
    Size        itemsz;
    bool        do_expand;
    uint32      hashkey;
    Bucket      bucket;
    OffsetNumber itup_off;

    /*
     * Get the hash key for the item (it's stored in the index tuple itself).
     */
    hashkey = _hash_get_indextuple_hashkey(itup);

    /* compute item size too */
    itemsz = IndexTupleSize(itup);
    itemsz = MAXALIGN(itemsz);  /* be safe, PageAddItem will do this but we
                                 * need to be consistent */

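    /*
     * We may have to come back here and retry the insertion from scratch if
     * we release our locks to finish an in-progress split of the target
     * bucket (see below).
     */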
restart_insert:

    /*
     * Read the metapage.  We don't lock it yet; HashMaxItemSize() will
     * examine pd_pagesize_version, but that can't change so we can examine
     * it without a lock.
     */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
    metapage = BufferGetPage(metabuf);

    /*
     * Check whether the item can fit on a hash page at all.  (Eventually, we
     * ought to try to apply TOAST methods if not.)  Note that at this point,
     * itemsz doesn't include the ItemId.
     *
     * XXX this is useless code if we are only storing hash keys.
     */
    if (itemsz > HashMaxItemSize(metapage))
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("index row size %zu exceeds hash maximum %zu",
                        itemsz, HashMaxItemSize(metapage)),
                 errhint("Values larger than a buffer page cannot be indexed.")));

    /* Lock the primary bucket page for the target bucket. */
    buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE,
                                          &usedmetap);
    Assert(usedmetap != NULL);

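    /*
     * Check for serialization conflicts: under SSI, predicate locks on a
     * hash index are taken on the primary bucket page, so an insertion into
     * this bucket must be reported as a rw-conflict against any concurrent
     * serializable reader of the bucket.
     */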
    CheckForSerializableConflictIn(rel, NULL, buf);

    /* remember the primary bucket buffer so we can release its pin at the end */
    bucket_buf = buf;

    page = BufferGetPage(buf);
    pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
    bucket = pageopaque->hasho_bucket;

    /*
     * If this bucket is in the process of being split, try to finish the
     * split before inserting, because that might create room for the
     * insertion to proceed without allocating an additional overflow page.
     * It's only interesting to finish the split if we're trying to insert
     * into the bucket from which we're removing tuples (the "old" bucket),
     * not if we're trying to insert into the bucket into which tuples are
     * being moved (the "new" bucket).
     */
    if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf))
    {
        /* release the lock on bucket buffer, before completing the split. */
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);

        _hash_finish_split(rel, metabuf, buf, bucket,
                           usedmetap->hashm_maxbucket,
                           usedmetap->hashm_highmask,
                           usedmetap->hashm_lowmask);

        /* release the pin on old and meta buffer.  retry for insert. */
        _hash_dropbuf(rel, buf);
        _hash_dropbuf(rel, metabuf);
        goto restart_insert;
    }

    /*
     * Find a page in the bucket chain with enough space for the new tuple;
     * the insertion itself happens below, once the metapage is locked.
     */
    while (PageGetFreeSpace(page) < itemsz)
    {
        BlockNumber nextblkno;

        /*
         * Check if the current page has any DEAD tuples.  If yes, delete
         * those tuples and see if that frees enough space for the new item
         * before moving on to the next page in the bucket chain.
         */
        if (H_HAS_DEAD_TUPLES(pageopaque))
        {
            if (IsBufferCleanupOK(buf))
            {
                _hash_vacuum_one_page(rel, heapRel, metabuf, buf);

                if (PageGetFreeSpace(page) >= itemsz)
                    break;      /* OK, now we have enough space */
            }
        }

        /*
         * no space on this page; check for an overflow page
         */
        nextblkno = pageopaque->hasho_nextblkno;

        if (BlockNumberIsValid(nextblkno))
        {
            /*
             * ovfl page exists; go get it.  if it doesn't have room, we'll
             * find out next pass through the loop test above.  we always
             * release both the lock and pin if this is an overflow page, but
             * only the lock if this is the primary bucket page, since the
             * pin on the primary bucket must be retained throughout the
             * scan.
             */
            if (buf != bucket_buf)
                _hash_relbuf(rel, buf);
            else
                LockBuffer(buf, BUFFER_LOCK_UNLOCK);
            buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
            page = BufferGetPage(buf);
        }
        else
        {
            /*
             * we're at the end of the bucket chain and we haven't found a
             * page with enough room.  allocate a new overflow page.
             */

            /* release our write lock without modifying buffer */
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);

            /* chain to a new overflow page */
            buf = _hash_addovflpage(rel, metabuf, buf, (buf == bucket_buf));
            page = BufferGetPage(buf);

            /* should fit now, given test above */
            Assert(PageGetFreeSpace(page) >= itemsz);
        }
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
        Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE);
        Assert(pageopaque->hasho_bucket == bucket);
    }

    /*
     * Write-lock the metapage so we can increment the tuple count.  After
     * incrementing it, check to see if it's time for a split.
     */
    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

    /* Do the update.  No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /* found page with enough space, so add the item here */
    itup_off = _hash_pgaddtup(rel, buf, itemsz, itup);
    MarkBufferDirty(buf);

    /* metapage operations */
    metap = HashPageGetMeta(metapage);
    metap->hashm_ntuples += 1;

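    /*
     * Decide whether the table needs to be expanded: a split is wanted once
     * the total tuple count exceeds the fill factor times the current number
     * of buckets, i.e. once the average bucket has become overfull.
     */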
    /* Make sure this stays in sync with _hash_expandtable() */
    do_expand = metap->hashm_ntuples >
        (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);

    MarkBufferDirty(metabuf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        xl_hash_insert xlrec;
        XLogRecPtr  recptr;

        xlrec.offnum = itup_off;

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashInsert);

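        /*
         * Register the metapage as well, so that replay of this record also
         * reflects the incremented tuple count.
         */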
        XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT);

        PageSetLSN(BufferGetPage(buf), recptr);
        PageSetLSN(BufferGetPage(metabuf), recptr);
    }

    END_CRIT_SECTION();

    /* drop lock on metapage, but keep pin */
    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

    /*
     * Release the page we inserted into; if it was an overflow page, also
     * drop the pin we still hold on the primary bucket page.
     */
    _hash_relbuf(rel, buf);
    if (buf != bucket_buf)
        _hash_dropbuf(rel, bucket_buf);

    /* Attempt to split if a split is needed */
    if (do_expand)
        _hash_expandtable(rel, metabuf);

    /* Finally drop our pin on the metapage */
    _hash_dropbuf(rel, metabuf);
}

/*
 * _hash_pgaddtup() -- add a tuple to a particular page in the index.
 *
 * This routine adds the tuple to the page as requested; it does not write out
 * the page.  It is an error to call _hash_pgaddtup() without a pin and write
 * lock on the target buffer.
 *
 * Returns the offset number at which the tuple was inserted.  This function
 * is responsible for preserving the condition that tuples in a hash index
 * page are sorted by hashkey value.
 */
OffsetNumber
_hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup)
{
    OffsetNumber itup_off;
    Page        page;
    uint32      hashkey;

    _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
    page = BufferGetPage(buf);

    /* Find where to insert the tuple (preserving page's hashkey ordering) */
    hashkey = _hash_get_indextuple_hashkey(itup);
    itup_off = _hash_binsearch(page, hashkey);

    if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false)
        == InvalidOffsetNumber)
        elog(ERROR, "failed to add index item to \"%s\"",
             RelationGetRelationName(rel));

    return itup_off;
}

/*
 * _hash_pgaddmultitup() -- add a tuple vector to a particular page in the
 *                          index.
 *
 * This routine has the same requirements for locking and tuple ordering as
 * _hash_pgaddtup().
 *
 * The offset numbers at which the tuples were inserted are returned in the
 * itup_offsets array.
 */
void
_hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups,
                    OffsetNumber *itup_offsets, uint16 nitups)
{
    OffsetNumber itup_off;
    Page        page;
    uint32      hashkey;
    int         i;

    _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
    page = BufferGetPage(buf);

    for (i = 0; i < nitups; i++)
    {
        Size        itemsize;

        itemsize = IndexTupleSize(itups[i]);
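        /* be safe, PageAddItem will do this but we need to be consistent */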
        itemsize = MAXALIGN(itemsize);

        /* Find where to insert the tuple (preserving page's hashkey ordering) */
        hashkey = _hash_get_indextuple_hashkey(itups[i]);
        itup_off = _hash_binsearch(page, hashkey);

        itup_offsets[i] = itup_off;

        if (PageAddItem(page, (Item) itups[i], itemsize, itup_off, false, false)
            == InvalidOffsetNumber)
            elog(ERROR, "failed to add index item to \"%s\"",
                 RelationGetRelationName(rel));
    }
}

/*
 * _hash_vacuum_one_page - vacuum just one index page.
 *
 * Try to remove LP_DEAD items from the given page.  The caller must hold a
 * cleanup lock on the page being modified before calling this function.
 */
static void
_hash_vacuum_one_page(Relation rel, Relation hrel, Buffer metabuf, Buffer buf)
{
    OffsetNumber deletable[MaxOffsetNumber];
    int         ndeletable = 0;
    OffsetNumber offnum,
                maxoff;
    Page        page = BufferGetPage(buf);
    HashPageOpaque pageopaque;
    HashMetaPage metap;

    /* Scan each tuple on the page to see if it is marked as LP_DEAD */
    maxoff = PageGetMaxOffsetNumber(page);
    for (offnum = FirstOffsetNumber;
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        ItemId      itemId = PageGetItemId(page, offnum);

        if (ItemIdIsDead(itemId))
            deletable[ndeletable++] = offnum;
    }

    if (ndeletable > 0)
    {
        TransactionId latestRemovedXid;

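        /*
         * Find the newest xid among the heap tuples referenced by the index
         * entries being removed; replay uses it to resolve conflicts with
         * queries running on a hot standby.
         */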
        latestRemovedXid =
            index_compute_xid_horizon_for_tuples(rel, hrel, buf,
                                                 deletable, ndeletable);

        /*
         * Write-lock the meta page so that we can decrement the tuple count.
         */
        LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

        /* No ereport(ERROR) until changes are logged */
        START_CRIT_SECTION();

        PageIndexMultiDelete(page, deletable, ndeletable);

        /*
         * Mark the page as not containing any LP_DEAD items.  This is not
         * necessarily true (there might be some that have recently been
         * marked, but weren't included in our target-item list), but it will
         * almost always be true and it doesn't seem worth an additional page
         * scan to check it.  Remember that LH_PAGE_HAS_DEAD_TUPLES is only a
         * hint anyway.
         */
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
        pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;

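        /*
         * The hash metapage keeps a running tuple count that drives the
         * bucket-split decision, so account for the removed tuples here.
         */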
        metap = HashPageGetMeta(BufferGetPage(metabuf));
        metap->hashm_ntuples -= ndeletable;

        MarkBufferDirty(buf);
        MarkBufferDirty(metabuf);

        /* XLOG stuff */
        if (RelationNeedsWAL(rel))
        {
            xl_hash_vacuum_one_page xlrec;
            XLogRecPtr  recptr;

            xlrec.latestRemovedXid = latestRemovedXid;
            xlrec.ntuples = ndeletable;

            XLogBeginInsert();
            XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
            XLogRegisterData((char *) &xlrec, SizeOfHashVacuumOnePage);

            /*
             * We need the target-offsets array whether or not we store the
             * whole buffer, to allow us to find the latestRemovedXid on a
             * standby server.
             */
            XLogRegisterData((char *) deletable,
                             ndeletable * sizeof(OffsetNumber));

            XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

            recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_VACUUM_ONE_PAGE);

            PageSetLSN(BufferGetPage(buf), recptr);
            PageSetLSN(BufferGetPage(metabuf), recptr);
        }

        END_CRIT_SECTION();

        /*
         * Release the write lock on the meta page now that the tuple count
         * has been updated.
         */
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
    }
}