/*-------------------------------------------------------------------------
 *
 * hashutil.c
 *	  Utility code for Postgres hash implementation.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/hash/hashutil.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/hash.h"
#include "access/reloptions.h"
#include "access/relscan.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "storage/buf_internals.h"

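/*
 * Compute the bucket that a split of old_bucket populates: OR in the bit
 * just above lowmask, i.e. the most significant bit of the table half that
 * the split is creating.
 */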
#define CALC_NEW_BUCKET(old_bucket, lowmask) \
			old_bucket | (lowmask + 1)

/*
 * _hash_checkqual -- does the index tuple satisfy the scan conditions?
 */
bool
_hash_checkqual(IndexScanDesc scan, IndexTuple itup)
{
	/*
	 * Currently, we can't check any of the scan conditions since we do not
	 * have the original index entry value to supply to the sk_func. Always
	 * return true; we expect that hashgettuple already set the recheck flag
	 * to make the main indexscan code do it.
	 */
#ifdef NOT_USED
	TupleDesc	tupdesc = RelationGetDescr(scan->indexRelation);
	ScanKey		key = scan->keyData;
	int			scanKeySize = scan->numberOfKeys;

	while (scanKeySize > 0)
	{
		Datum		datum;
		bool		isNull;
		Datum		test;

		datum = index_getattr(itup,
							  key->sk_attno,
							  tupdesc,
							  &isNull);

		/* assume sk_func is strict */
		if (isNull)
			return false;
		if (key->sk_flags & SK_ISNULL)
			return false;

		test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
								 datum, key->sk_argument);

		if (!DatumGetBool(test))
			return false;

		key++;
		scanKeySize--;
	}
#endif

	return true;
}

/*
 * _hash_datum2hashkey -- given a Datum, call the index's hash function
 *
 * The Datum is assumed to be of the index's column type, so we can use the
 * "primary" hash function that's tracked for us by the generic index code.
 */
uint32
_hash_datum2hashkey(Relation rel, Datum key)
{
	FmgrInfo   *procinfo;
	Oid			collation;

	/* XXX assumes index has only one attribute */
	procinfo = index_getprocinfo(rel, 1, HASHSTANDARD_PROC);
	collation = rel->rd_indcollation[0];

	return DatumGetUInt32(FunctionCall1Coll(procinfo, collation, key));
}

/*
 * _hash_datum2hashkey_type -- given a Datum of a specified type,
 *			hash it in a fashion compatible with this index
 *
 * This is much more expensive than _hash_datum2hashkey, so use it only in
 * cross-type situations.
 */
uint32
_hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype)
{
	RegProcedure hash_proc;
	Oid			collation;

	/* XXX assumes index has only one attribute */
	hash_proc = get_opfamily_proc(rel->rd_opfamily[0],
								  keytype,
								  keytype,
								  HASHSTANDARD_PROC);
	if (!RegProcedureIsValid(hash_proc))
		elog(ERROR, "missing support function %d(%u,%u) for index \"%s\"",
			 HASHSTANDARD_PROC, keytype, keytype,
			 RelationGetRelationName(rel));
	collation = rel->rd_indcollation[0];

	return DatumGetUInt32(OidFunctionCall1Coll(hash_proc, collation, key));
}

/*
 * _hash_hashkey2bucket -- determine which bucket the hashkey maps to.
 */
Bucket
_hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
					 uint32 highmask, uint32 lowmask)
{
	Bucket		bucket;

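	/*
	 * highmask spans the doubled table that splits are currently filling.
	 * If the resulting bucket number has not been created yet (it exceeds
	 * maxbucket), the key still belongs in the not-yet-split bucket in the
	 * lower half, so mask with lowmask instead.
	 */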
	bucket = hashkey & highmask;
	if (bucket > maxbucket)
		bucket = bucket & lowmask;

	return bucket;
}

/*
 * _hash_log2 -- returns ceil(lg2(num))
 */
uint32
_hash_log2(uint32 num)
{
	uint32		i,
				limit;

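	/* count how many doublings of limit (a power of two) are needed to reach num */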
	limit = 1;
	for (i = 0; limit < num; limit <<= 1, i++)
		;
	return i;
}

/*
 * _hash_spareindex -- returns spare index / global splitpoint phase of the
 *					   bucket
 */
uint32
_hash_spareindex(uint32 num_bucket)
{
	uint32		splitpoint_group;
	uint32		splitpoint_phases;

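	/*
	 * Buckets are allocated in splitpoint groups, each of which doubles the
	 * table size.  The first few groups are allocated in a single phase;
	 * larger groups are carved into several equal-sized phases, and the
	 * spare index advances once per phase.
	 */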
	splitpoint_group = _hash_log2(num_bucket);

	if (splitpoint_group < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
		return splitpoint_group;

	/* account for single-phase groups */
	splitpoint_phases = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;

	/* account for multi-phase groups before splitpoint_group */
	splitpoint_phases +=
		((splitpoint_group - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) <<
		 HASH_SPLITPOINT_PHASE_BITS);

	/* account for phases within current group */
	splitpoint_phases +=
		(((num_bucket - 1) >>
		  (splitpoint_group - (HASH_SPLITPOINT_PHASE_BITS + 1))) &
		 HASH_SPLITPOINT_PHASE_MASK);	/* to 0-based value. */

	return splitpoint_phases;
}

/*
 *	_hash_get_totalbuckets -- returns total number of buckets allocated till
 *							the given splitpoint phase.
 */
uint32
_hash_get_totalbuckets(uint32 splitpoint_phase)
{
	uint32		splitpoint_group;
	uint32		total_buckets;
	uint32		phases_within_splitpoint_group;

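	/* single-phase splitpoints allocate their entire group (a power of two) at once */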
	if (splitpoint_phase < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
		return (1 << splitpoint_phase);

	/* get splitpoint's group */
	splitpoint_group = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;
	splitpoint_group +=
		((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) >>
		 HASH_SPLITPOINT_PHASE_BITS);

	/* account for buckets before splitpoint_group */
	total_buckets = (1 << (splitpoint_group - 1));

	/* account for buckets within splitpoint_group */
	phases_within_splitpoint_group =
		(((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) &
		  HASH_SPLITPOINT_PHASE_MASK) + 1); /* from 0-based to 1-based */
	total_buckets +=
		(((1 << (splitpoint_group - 1)) >> HASH_SPLITPOINT_PHASE_BITS) *
		 phases_within_splitpoint_group);

	return total_buckets;
}

/*
 * _hash_checkpage -- sanity checks on the format of all hash pages
 *
 * If flags is not zero, it is a bitwise OR of the acceptable page types
 * (values of hasho_flag & LH_PAGE_TYPE).
 */
void
_hash_checkpage(Relation rel, Buffer buf, int flags)
{
	Page		page = BufferGetPage(buf);

	/*
	 * ReadBuffer verifies that every newly-read page passes
	 * PageHeaderIsValid, which means it either contains a reasonably sane
	 * page header or is all-zero.  We have to defend against the all-zero
	 * case, however.
	 */
	if (PageIsNew(page))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" contains unexpected zero page at block %u",
						RelationGetRelationName(rel),
						BufferGetBlockNumber(buf)),
				 errhint("Please REINDEX it.")));

	/*
	 * Additionally check that the special area looks sane.
	 */
	if (PageGetSpecialSize(page) != MAXALIGN(sizeof(HashPageOpaqueData)))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" contains corrupted page at block %u",
						RelationGetRelationName(rel),
						BufferGetBlockNumber(buf)),
				 errhint("Please REINDEX it.")));

	if (flags)
	{
		HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);

		if ((opaque->hasho_flag & flags) == 0)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" contains corrupted page at block %u",
							RelationGetRelationName(rel),
							BufferGetBlockNumber(buf)),
					 errhint("Please REINDEX it.")));
	}

	/*
	 * When checking the metapage, also verify magic number and version.
	 */
	if (flags == LH_META_PAGE)
	{
		HashMetaPage metap = HashPageGetMeta(page);

		if (metap->hashm_magic != HASH_MAGIC)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" is not a hash index",
							RelationGetRelationName(rel))));

		if (metap->hashm_version != HASH_VERSION)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg("index \"%s\" has wrong hash version",
							RelationGetRelationName(rel)),
					 errhint("Please REINDEX it.")));
	}
}

bytea *
hashoptions(Datum reloptions, bool validate)
{
	return default_reloptions(reloptions, validate, RELOPT_KIND_HASH);
}

/*
 * _hash_get_indextuple_hashkey - get the hash index tuple's hash key value
 */
uint32
_hash_get_indextuple_hashkey(IndexTuple itup)
{
	char	   *attp;

	/*
	 * We assume the hash key is the first attribute and can't be null, so
	 * this can be done crudely but very cheaply ...
	 */
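	/* the stored key is the uint32 hash value itself, at the start of the tuple's data area */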
	attp = (char *) itup + IndexInfoFindDataOffset(itup->t_info);
	return *((uint32 *) attp);
}

/*
 * _hash_convert_tuple - convert raw index data to hash key
 *
 * Inputs: values and isnull arrays for the user data column(s)
 * Outputs: values and isnull arrays for the index tuple, suitable for
 *		passing to index_form_tuple().
 *
 * Returns true if successful, false if not (because there are null values).
 * On a false result, the given data need not be indexed.
 *
 * Note: callers know that the index-column arrays are always of length 1.
 * In principle, there could be more than one input column, though we do not
 * currently support that.
 */
bool
_hash_convert_tuple(Relation index,
					Datum *user_values, bool *user_isnull,
					Datum *index_values, bool *index_isnull)
{
	uint32		hashkey;

	/*
	 * We do not insert null values into hash indexes.  This is okay because
	 * the only supported search operator is '=', and we assume it is strict.
	 */
	if (user_isnull[0])
		return false;

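	/* store the 32-bit hash of the user datum as the index tuple's only column */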
	hashkey = _hash_datum2hashkey(index, user_values[0]);
	index_values[0] = UInt32GetDatum(hashkey);
	index_isnull[0] = false;
	return true;
}

/*
 * _hash_binsearch - Return the offset number in the page where the
 *					 specified hash value should be sought or inserted.
 *
 * We use binary search, relying on the assumption that the existing entries
 * are ordered by hash key.
 *
 * Returns the offset of the first index entry having hashkey >= hash_value,
 * or the page's max offset plus one if hash_value is greater than all
 * existing hash keys in the page.  This is the appropriate place to start
 * a search, or to insert a new item.
 */
OffsetNumber
_hash_binsearch(Page page, uint32 hash_value)
{
	OffsetNumber upper;
	OffsetNumber lower;

	/* Loop invariant: lower <= desired place <= upper */
	upper = PageGetMaxOffsetNumber(page) + 1;
	lower = FirstOffsetNumber;

	while (upper > lower)
	{
		OffsetNumber off;
		IndexTuple	itup;
		uint32		hashkey;

		off = (upper + lower) / 2;
		Assert(OffsetNumberIsValid(off));
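		/* off rounds down, so lower <= off < upper; each branch strictly narrows the range */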

		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
		hashkey = _hash_get_indextuple_hashkey(itup);
		if (hashkey < hash_value)
			lower = off + 1;
		else
			upper = off;
	}

	return lower;
}

/*
 * _hash_binsearch_last
 *
 * Same as above, except that if there are multiple matching items in the
 * page, we return the offset of the last one instead of the first one,
 * and the possible range of outputs is 0..maxoffset not 1..maxoffset+1.
 * This is handy for starting a new page in a backwards scan.
 */
OffsetNumber
_hash_binsearch_last(Page page, uint32 hash_value)
{
	OffsetNumber upper;
	OffsetNumber lower;

	/* Loop invariant: lower <= desired place <= upper */
	upper = PageGetMaxOffsetNumber(page);
	lower = FirstOffsetNumber - 1;

	while (upper > lower)
	{
		IndexTuple	itup;
		OffsetNumber off;
		uint32		hashkey;

		off = (upper + lower + 1) / 2;
		Assert(OffsetNumberIsValid(off));
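		/* off rounds up, so lower < off <= upper; each branch strictly narrows the range */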

		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
		hashkey = _hash_get_indextuple_hashkey(itup);
		if (hashkey > hash_value)
			upper = off - 1;
		else
			lower = off;
	}

	return lower;
}

/*
 *	_hash_get_oldblock_from_newbucket() -- get the block number of a bucket
 *			from which current (new) bucket is being split.
 */
BlockNumber
_hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket)
{
	Bucket		old_bucket;
	uint32		mask;
	Buffer		metabuf;
	HashMetaPage metap;
	BlockNumber blkno;

	/*
	 * To get the old bucket from the current bucket, we need a mask to modulo
	 * into lower half of table.  This mask is stored in meta page as
	 * hashm_lowmask, but here we can't rely on the same, because we need a
	 * value of lowmask that was prevalent at the time when bucket split was
	 * started.  Masking the most significant bit of new bucket would give us
	 * old bucket.
	 */
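	/* fls() returns the 1-based position of the most significant set bit */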
	mask = (((uint32) 1) << (fls(new_bucket) - 1)) - 1;
	old_bucket = new_bucket & mask;

	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	blkno = BUCKET_TO_BLKNO(metap, old_bucket);

	_hash_relbuf(rel, metabuf);

	return blkno;
}

/*
 *	_hash_get_newblock_from_oldbucket() -- get the block number of a bucket
 *			that will be generated after split from old bucket.
 *
 * This is used to find the new bucket from old bucket based on current table
 * half.  It is mainly required to finish the incomplete splits where we are
 * sure that not more than one bucket could have split in progress from old
 * bucket.
 */
BlockNumber
_hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket)
{
	Bucket		new_bucket;
	Buffer		metabuf;
	HashMetaPage metap;
	BlockNumber blkno;

	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	new_bucket = _hash_get_newbucket_from_oldbucket(rel, old_bucket,
													metap->hashm_lowmask,
													metap->hashm_maxbucket);
	blkno = BUCKET_TO_BLKNO(metap, new_bucket);

	_hash_relbuf(rel, metabuf);

	return blkno;
}

/*
 *	_hash_get_newbucket_from_oldbucket() -- get the new bucket that will be
 *			generated after split from current (old) bucket.
 *
 * This is used to find the new bucket from old bucket.  New bucket can be
 * obtained by OR'ing old bucket with most significant bit of current table
 * half (lowmask passed in this function can be used to identify msb of
 * current table half).  There could be multiple buckets that could have
 * been split from current bucket.  We need the first such bucket that exists.
 * Caller must ensure that no more than one split has happened from old
 * bucket.
 */
Bucket
_hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
								   uint32 lowmask, uint32 maxbucket)
{
	Bucket		new_bucket;

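	/*
	 * Try the split target in the current table half first; if that bucket
	 * does not exist yet, the split in question must have used the previous
	 * (smaller) table half, so retry with the corresponding lowmask.
	 */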
	new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
	if (new_bucket > maxbucket)
	{
		lowmask = lowmask >> 1;
		new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
	}

	return new_bucket;
}

/*
 * _hash_kill_items - set LP_DEAD state for items an indexscan caller has
 * told us were killed.
 *
 * scan->opaque, referenced locally through so, contains information about the
 * current page and killed tuples thereon (generally, this should only be
 * called if so->numKilled > 0).
 *
 * The caller does not have a lock on the page and may or may not have the
 * page pinned in a buffer.  Note that read-lock is sufficient for setting
 * LP_DEAD status (which is only a hint).
 *
 * The caller must have pin on bucket buffer, but may or may not have pin
 * on overflow buffer, as indicated by HashScanPosIsPinned(so->currPos).
 *
 * We match items by heap TID before assuming they are the right ones to
 * delete.
 *
 * There are never any scans active in a bucket at the time VACUUM begins,
 * because VACUUM takes a cleanup lock on the primary bucket page and scans
 * hold a pin.  A scan can begin after VACUUM leaves the primary bucket page
 * but before it finishes the entire bucket, but it can never pass VACUUM,
 * because VACUUM always locks the next page before releasing the lock on
 * the previous one.  Therefore, we don't have to worry about accidentally
 * killing a TID that has been reused for an unrelated tuple.
 */
void
_hash_kill_items(IndexScanDesc scan)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;
	BlockNumber blkno;
	Buffer		buf;
	Page		page;
	HashPageOpaque opaque;
	OffsetNumber offnum,
				maxoff;
	int			numKilled = so->numKilled;
	int			i;
	bool		killedsomething = false;
	bool		havePin = false;

	Assert(so->numKilled > 0);
	Assert(so->killedItems != NULL);
	Assert(HashScanPosIsValid(so->currPos));

	/*
	 * Always reset the scan state, so we don't look for same items on other
	 * pages.
	 */
	so->numKilled = 0;

	blkno = so->currPos.currPage;
	if (HashScanPosIsPinned(so->currPos))
	{
		/*
		 * We already have pin on this buffer, so, all we need to do is
		 * acquire lock on it.
		 */
		havePin = true;
		buf = so->currPos.buf;
		LockBuffer(buf, BUFFER_LOCK_SHARE);
	}
	else
		buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);

	page = BufferGetPage(buf);
	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
	maxoff = PageGetMaxOffsetNumber(page);

	for (i = 0; i < numKilled; i++)
	{
		int			itemIndex = so->killedItems[i];
		HashScanPosItem *currItem = &so->currPos.items[itemIndex];

		offnum = currItem->indexOffset;

		Assert(itemIndex >= so->currPos.firstItem &&
			   itemIndex <= so->currPos.lastItem);

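		/*
		 * The tuple may no longer be at exactly the remembered offset if
		 * other items have since been added to the page, so scan forward
		 * from there and confirm the match by heap TID.
		 */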
		while (offnum <= maxoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	ituple = (IndexTuple) PageGetItem(page, iid);

			if (ItemPointerEquals(&ituple->t_tid, &currItem->heapTid))
			{
				/* found the item */
				ItemIdMarkDead(iid);
				killedsomething = true;
				break;			/* out of inner search loop */
			}
			offnum = OffsetNumberNext(offnum);
		}
	}

	/*
	 * Since this can be redone later if needed, mark as dirty hint. Whenever
	 * we mark anything LP_DEAD, we also set the page's
	 * LH_PAGE_HAS_DEAD_TUPLES flag, which is likewise just a hint.
	 */
	if (killedsomething)
	{
		opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES;
		MarkBufferDirtyHint(buf, true);
	}

	if (so->hashso_bucket_buf == so->currPos.buf ||
		havePin)
		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
	else
		_hash_relbuf(rel, buf);
}