hashfn.c source code [PostgreSQL/src/backend/utils/hash/hashfn.c]

1	/-------------------------------------------------------------------------*
2	*
3	* hashfn.c
4	* Generic hashing functions, and hash functions for use in dynahash.c
5	* hashtables
6	*
7	*
8	* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
9	* Portions Copyright (c) 1994, Regents of the University of California
10	*
11	*
12	* IDENTIFICATION
13	* src/backend/utils/hash/hashfn.c
14	*
15	* NOTES
16	* It is expected that every bit of a hash function's 32-bit result is
17	* as random as every other; failure to ensure this is likely to lead
18	* to poor performance of hash tables. In most cases a hash
19	* function should use hash_any() or its variant hash_uint32().
20	*
21	*-------------------------------------------------------------------------
22	*/
23	#include "postgres.h"
24
25	#include "fmgr.h"
26	#include "nodes/bitmapset.h"
27	#include "utils/hashutils.h"
28	#include "utils/hsearch.h"
29
30
31	/*
32	* This hash function was written by Bob Jenkins
33	* (bob_jenkins@burtleburtle.net), and superficially adapted
34	* for PostgreSQL by Neil Conway. For more information on this
35	* hash function, see http://burtleburtle.net/bob/hash/doobs.html,
36	* or Bob's article in Dr. Dobb's Journal, Sept. 1997.
37	*
38	* In the current code, we have adopted Bob's 2006 update of his hash
39	* function to fetch the data a word at a time when it is suitably aligned.
40	* This makes for a useful speedup, at the cost of having to maintain
41	* four code paths (aligned vs unaligned, and little-endian vs big-endian).
42	* It also uses two separate mixing functions mix() and final(), instead
43	* of a slower multi-purpose function.
44	*/
45
46	/ Get a bit mask of the bits set in non-uint32 aligned addresses /
47	#define UINT32_ALIGN_MASK (sizeof(uint32) - 1)
48
49	/ Rotate a uint32 value left by k bits - note multiple evaluation! /
50	#define rot(x,k) (((x)<<(k)) \| ((x)>>(32-(k))))
51
52	/----------*
53	* mix -- mix 3 32-bit values reversibly.
54	*
55	* This is reversible, so any information in (a,b,c) before mix() is
56	* still in (a,b,c) after mix().
57	*
58	* If four pairs of (a,b,c) inputs are run through mix(), or through
59	* mix() in reverse, there are at least 32 bits of the output that
60	* are sometimes the same for one pair and different for another pair.
61	* This was tested for:
62	* * pairs that differed by one bit, by two bits, in any combination
63	* of top bits of (a,b,c), or in any combination of bottom bits of
64	* (a,b,c).
65	* * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
66	* the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
67	* is commonly produced by subtraction) look like a single 1-bit
68	* difference.
69	* * the base values were pseudorandom, all zero but one bit set, or
70	* all zero plus a counter that starts at zero.
71	*
72	* This does not achieve avalanche. There are input bits of (a,b,c)
73	* that fail to affect some output bits of (a,b,c), especially of a. The
74	* most thoroughly mixed value is c, but it doesn't really even achieve
75	* avalanche in c.
76	*
77	* This allows some parallelism. Read-after-writes are good at doubling
78	* the number of bits affected, so the goal of mixing pulls in the opposite
79	* direction from the goal of parallelism. I did what I could. Rotates
80	* seem to cost as much as shifts on every machine I could lay my hands on,
81	* and rotates are much kinder to the top and bottom bits, so I used rotates.
82	*----------
83	*/
84	#define mix(a,b,c) \
85	{ \
86	a -= c; a ^= rot(c, 4); c += b; \
87	b -= a; b ^= rot(a, 6); a += c; \
88	c -= b; c ^= rot(b, 8); b += a; \
89	a -= c; a ^= rot(c,16); c += b; \
90	b -= a; b ^= rot(a,19); a += c; \
91	c -= b; c ^= rot(b, 4); b += a; \
92	}
93
94	/----------*
95	* final -- final mixing of 3 32-bit values (a,b,c) into c
96	*
97	* Pairs of (a,b,c) values differing in only a few bits will usually
98	* produce values of c that look totally different. This was tested for
99	* * pairs that differed by one bit, by two bits, in any combination
100	* of top bits of (a,b,c), or in any combination of bottom bits of
101	* (a,b,c).
102	* * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
103	* the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
104	* is commonly produced by subtraction) look like a single 1-bit
105	* difference.
106	* * the base values were pseudorandom, all zero but one bit set, or
107	* all zero plus a counter that starts at zero.
108	*
109	* The use of separate functions for mix() and final() allow for a
110	* substantial performance increase since final() does not need to
111	* do well in reverse, but is does need to affect all output bits.
112	* mix(), on the other hand, does not need to affect all output
113	* bits (affecting 32 bits is enough). The original hash function had
114	* a single mixing operation that had to satisfy both sets of requirements
115	* and was slower as a result.
116	*----------
117	*/
118	#define final(a,b,c) \
119	{ \
120	c ^= b; c -= rot(b,14); \
121	a ^= c; a -= rot(c,11); \
122	b ^= a; b -= rot(a,25); \
123	c ^= b; c -= rot(b,16); \
124	a ^= c; a -= rot(c, 4); \
125	b ^= a; b -= rot(a,14); \
126	c ^= b; c -= rot(b,24); \
127	}
128
129	/*
130	* hash_any() -- hash a variable-length key into a 32-bit value
131	* k : the key (the unaligned variable-length array of bytes)
132	* len : the length of the key, counting by bytes
133	*
134	* Returns a uint32 value. Every bit of the key affects every bit of
135	* the return value. Every 1-bit and 2-bit delta achieves avalanche.
136	* About 6*len+35 instructions. The best hash table sizes are powers
137	* of 2. There is no need to do mod a prime (mod is sooo slow!).
138	* If you need less than 32 bits, use a bitmask.
139	*
140	* This procedure must never throw elog(ERROR); the ResourceOwner code
141	* relies on this not to fail.
142	*
143	* Note: we could easily change this function to return a 64-bit hash value
144	* by using the final values of both b and c. b is perhaps a little less
145	* well mixed than c, however.
146	*/
147	Datum
148	hash_any(register const unsigned char k, register* int keylen)
149	{
150	register uint32 a,
151	b,
152	c,
153	len;
154
155	/ Set up the internal state /
156	len = keylen;
157	a = b = c = `0x9e3779b9` + len + `3923095`;
158
159	/ If the source pointer is word-aligned, we use word-wide fetches /
160	if (((uintptr_t) k & UINT32_ALIGN_MASK) == `0`)
161	{
162	/ Code path for aligned source data /
163	register const uint32 ka = (const* uint32 *) k;
164
165	/ handle most of the key /
166	while (len >= `12`)
167	{
168	a += ka[`0`];
169	b += ka[`1`];
170	c += ka[`2`];
171	mix(a, b, c);
172	ka += `3`;
173	len -= `12`;
174	}
175
176	/ handle the last 11 bytes /
177	k = (const unsigned char *) ka;
178	#ifdef WORDS_BIGENDIAN
179	switch (len)
180	{
181	case `11`:
182	c += ((uint32) k[`10`] << `8`);
183	/ fall through /
184	case `10`:
185	c += ((uint32) k[`9`] << `16`);
186	/ fall through /
187	case `9`:
188	c += ((uint32) k[`8`] << `24`);
189	/ fall through /
190	case `8`:
191	/ the lowest byte of c is reserved for the length /
192	b += ka[`1`];
193	a += ka[`0`];
194	break;
195	case `7`:
196	b += ((uint32) k[`6`] << `8`);
197	/ fall through /
198	case `6`:
199	b += ((uint32) k[`5`] << `16`);
200	/ fall through /
201	case `5`:
202	b += ((uint32) k[`4`] << `24`);
203	/ fall through /
204	case `4`:
205	a += ka[`0`];
206	break;
207	case `3`:
208	a += ((uint32) k[`2`] << `8`);
209	/ fall through /
210	case `2`:
211	a += ((uint32) k[`1`] << `16`);
212	/ fall through /
213	case `1`:
214	a += ((uint32) k[`0`] << `24`);
215	/ case 0: nothing left to add /
216	}
217	#else /* !WORDS_BIGENDIAN */
218	switch (len)
219	{
220	case `11`:
221	c += ((uint32) k[`10`] << `24`);
222	/ fall through /
223	case `10`:
224	c += ((uint32) k[`9`] << `16`);
225	/ fall through /
226	case `9`:
227	c += ((uint32) k[`8`] << `8`);
228	/ fall through /
229	case `8`:
230	/ the lowest byte of c is reserved for the length /
231	b += ka[`1`];
232	a += ka[`0`];
233	break;
234	case `7`:
235	b += ((uint32) k[`6`] << `16`);
236	/ fall through /
237	case `6`:
238	b += ((uint32) k[`5`] << `8`);
239	/ fall through /
240	case `5`:
241	b += k[`4`];
242	/ fall through /
243	case `4`:
244	a += ka[`0`];
245	break;
246	case `3`:
247	a += ((uint32) k[`2`] << `16`);
248	/ fall through /
249	case `2`:
250	a += ((uint32) k[`1`] << `8`);
251	/ fall through /
252	case `1`:
253	a += k[`0`];
254	/ case 0: nothing left to add /
255	}
256	#endif /* WORDS_BIGENDIAN */
257	}
258	else
259	{
260	/ Code path for non-aligned source data /
261
262	/ handle most of the key /
263	while (len >= `12`)
264	{
265	#ifdef WORDS_BIGENDIAN
266	a += (k[`3`] + ((uint32) k[`2`] << `8`) + ((uint32) k[`1`] << `16`) + ((uint32) k[`0`] << `24`));
267	b += (k[`7`] + ((uint32) k[`6`] << `8`) + ((uint32) k[`5`] << `16`) + ((uint32) k[`4`] << `24`));
268	c += (k[`11`] + ((uint32) k[`10`] << `8`) + ((uint32) k[`9`] << `16`) + ((uint32) k[`8`] << `24`));
269	#else /* !WORDS_BIGENDIAN */
270	a += (k[`0`] + ((uint32) k[`1`] << `8`) + ((uint32) k[`2`] << `16`) + ((uint32) k[`3`] << `24`));
271	b += (k[`4`] + ((uint32) k[`5`] << `8`) + ((uint32) k[`6`] << `16`) + ((uint32) k[`7`] << `24`));
272	c += (k[`8`] + ((uint32) k[`9`] << `8`) + ((uint32) k[`10`] << `16`) + ((uint32) k[`11`] << `24`));
273	#endif /* WORDS_BIGENDIAN */
274	mix(a, b, c);
275	k += `12`;
276	len -= `12`;
277	}
278
279	/ handle the last 11 bytes /
280	#ifdef WORDS_BIGENDIAN
281	switch (len)
282	{
283	case `11`:
284	c += ((uint32) k[`10`] << `8`);
285	/ fall through /
286	case `10`:
287	c += ((uint32) k[`9`] << `16`);
288	/ fall through /
289	case `9`:
290	c += ((uint32) k[`8`] << `24`);
291	/ fall through /
292	case `8`:
293	/ the lowest byte of c is reserved for the length /
294	b += k[`7`];
295	/ fall through /
296	case `7`:
297	b += ((uint32) k[`6`] << `8`);
298	/ fall through /
299	case `6`:
300	b += ((uint32) k[`5`] << `16`);
301	/ fall through /
302	case `5`:
303	b += ((uint32) k[`4`] << `24`);
304	/ fall through /
305	case `4`:
306	a += k[`3`];
307	/ fall through /
308	case `3`:
309	a += ((uint32) k[`2`] << `8`);
310	/ fall through /
311	case `2`:
312	a += ((uint32) k[`1`] << `16`);
313	/ fall through /
314	case `1`:
315	a += ((uint32) k[`0`] << `24`);
316	/ case 0: nothing left to add /
317	}
318	#else /* !WORDS_BIGENDIAN */
319	switch (len)
320	{
321	case `11`:
322	c += ((uint32) k[`10`] << `24`);
323	/ fall through /
324	case `10`:
325	c += ((uint32) k[`9`] << `16`);
326	/ fall through /
327	case `9`:
328	c += ((uint32) k[`8`] << `8`);
329	/ fall through /
330	case `8`:
331	/ the lowest byte of c is reserved for the length /
332	b += ((uint32) k[`7`] << `24`);
333	/ fall through /
334	case `7`:
335	b += ((uint32) k[`6`] << `16`);
336	/ fall through /
337	case `6`:
338	b += ((uint32) k[`5`] << `8`);
339	/ fall through /
340	case `5`:
341	b += k[`4`];
342	/ fall through /
343	case `4`:
344	a += ((uint32) k[`3`] << `24`);
345	/ fall through /
346	case `3`:
347	a += ((uint32) k[`2`] << `16`);
348	/ fall through /
349	case `2`:
350	a += ((uint32) k[`1`] << `8`);
351	/ fall through /
352	case `1`:
353	a += k[`0`];
354	/ case 0: nothing left to add /
355	}
356	#endif /* WORDS_BIGENDIAN */
357	}
358
359	final(a, b, c);
360
361	/ report the result /
362	return UInt32GetDatum(c);
363	}
364
365	/*
366	* hash_any_extended() -- hash into a 64-bit value, using an optional seed
367	* k : the key (the unaligned variable-length array of bytes)
368	* len : the length of the key, counting by bytes
369	* seed : a 64-bit seed (0 means no seed)
370	*
371	* Returns a uint64 value. Otherwise similar to hash_any.
372	*/
373	Datum
374	hash_any_extended(register const unsigned char k, register* int keylen,
375	uint64 seed)
376	{
377	register uint32 a,
378	b,
379	c,
380	len;
381
382	/ Set up the internal state /
383	len = keylen;
384	a = b = c = `0x9e3779b9` + len + `3923095`;
385
386	/ If the seed is non-zero, use it to perturb the internal state. /
387	if (seed != `0`)
388	{
389	/*
390	* In essence, the seed is treated as part of the data being hashed,
391	* but for simplicity, we pretend that it's padded with four bytes of
392	* zeroes so that the seed constitutes a 12-byte chunk.
393	*/
394	a += (uint32) (seed >> `32`);
395	b += (uint32) seed;
396	mix(a, b, c);
397	}
398
399	/ If the source pointer is word-aligned, we use word-wide fetches /
400	if (((uintptr_t) k & UINT32_ALIGN_MASK) == `0`)
401	{
402	/ Code path for aligned source data /
403	register const uint32 ka = (const* uint32 *) k;
404
405	/ handle most of the key /
406	while (len >= `12`)
407	{
408	a += ka[`0`];
409	b += ka[`1`];
410	c += ka[`2`];
411	mix(a, b, c);
412	ka += `3`;
413	len -= `12`;
414	}
415
416	/ handle the last 11 bytes /
417	k = (const unsigned char *) ka;
418	#ifdef WORDS_BIGENDIAN
419	switch (len)
420	{
421	case `11`:
422	c += ((uint32) k[`10`] << `8`);
423	/ fall through /
424	case `10`:
425	c += ((uint32) k[`9`] << `16`);
426	/ fall through /
427	case `9`:
428	c += ((uint32) k[`8`] << `24`);
429	/ fall through /
430	case `8`:
431	/ the lowest byte of c is reserved for the length /
432	b += ka[`1`];
433	a += ka[`0`];
434	break;
435	case `7`:
436	b += ((uint32) k[`6`] << `8`);
437	/ fall through /
438	case `6`:
439	b += ((uint32) k[`5`] << `16`);
440	/ fall through /
441	case `5`:
442	b += ((uint32) k[`4`] << `24`);
443	/ fall through /
444	case `4`:
445	a += ka[`0`];
446	break;
447	case `3`:
448	a += ((uint32) k[`2`] << `8`);
449	/ fall through /
450	case `2`:
451	a += ((uint32) k[`1`] << `16`);
452	/ fall through /
453	case `1`:
454	a += ((uint32) k[`0`] << `24`);
455	/ case 0: nothing left to add /
456	}
457	#else /* !WORDS_BIGENDIAN */
458	switch (len)
459	{
460	case `11`:
461	c += ((uint32) k[`10`] << `24`);
462	/ fall through /
463	case `10`:
464	c += ((uint32) k[`9`] << `16`);
465	/ fall through /
466	case `9`:
467	c += ((uint32) k[`8`] << `8`);
468	/ fall through /
469	case `8`:
470	/ the lowest byte of c is reserved for the length /
471	b += ka[`1`];
472	a += ka[`0`];
473	break;
474	case `7`:
475	b += ((uint32) k[`6`] << `16`);
476	/ fall through /
477	case `6`:
478	b += ((uint32) k[`5`] << `8`);
479	/ fall through /
480	case `5`:
481	b += k[`4`];
482	/ fall through /
483	case `4`:
484	a += ka[`0`];
485	break;
486	case `3`:
487	a += ((uint32) k[`2`] << `16`);
488	/ fall through /
489	case `2`:
490	a += ((uint32) k[`1`] << `8`);
491	/ fall through /
492	case `1`:
493	a += k[`0`];
494	/ case 0: nothing left to add /
495	}
496	#endif /* WORDS_BIGENDIAN */
497	}
498	else
499	{
500	/ Code path for non-aligned source data /
501
502	/ handle most of the key /
503	while (len >= `12`)
504	{
505	#ifdef WORDS_BIGENDIAN
506	a += (k[`3`] + ((uint32) k[`2`] << `8`) + ((uint32) k[`1`] << `16`) + ((uint32) k[`0`] << `24`));
507	b += (k[`7`] + ((uint32) k[`6`] << `8`) + ((uint32) k[`5`] << `16`) + ((uint32) k[`4`] << `24`));
508	c += (k[`11`] + ((uint32) k[`10`] << `8`) + ((uint32) k[`9`] << `16`) + ((uint32) k[`8`] << `24`));
509	#else /* !WORDS_BIGENDIAN */
510	a += (k[`0`] + ((uint32) k[`1`] << `8`) + ((uint32) k[`2`] << `16`) + ((uint32) k[`3`] << `24`));
511	b += (k[`4`] + ((uint32) k[`5`] << `8`) + ((uint32) k[`6`] << `16`) + ((uint32) k[`7`] << `24`));
512	c += (k[`8`] + ((uint32) k[`9`] << `8`) + ((uint32) k[`10`] << `16`) + ((uint32) k[`11`] << `24`));
513	#endif /* WORDS_BIGENDIAN */
514	mix(a, b, c);
515	k += `12`;
516	len -= `12`;
517	}
518
519	/ handle the last 11 bytes /
520	#ifdef WORDS_BIGENDIAN
521	switch (len)
522	{
523	case `11`:
524	c += ((uint32) k[`10`] << `8`);
525	/ fall through /
526	case `10`:
527	c += ((uint32) k[`9`] << `16`);
528	/ fall through /
529	case `9`:
530	c += ((uint32) k[`8`] << `24`);
531	/ fall through /
532	case `8`:
533	/ the lowest byte of c is reserved for the length /
534	b += k[`7`];
535	/ fall through /
536	case `7`:
537	b += ((uint32) k[`6`] << `8`);
538	/ fall through /
539	case `6`:
540	b += ((uint32) k[`5`] << `16`);
541	/ fall through /
542	case `5`:
543	b += ((uint32) k[`4`] << `24`);
544	/ fall through /
545	case `4`:
546	a += k[`3`];
547	/ fall through /
548	case `3`:
549	a += ((uint32) k[`2`] << `8`);
550	/ fall through /
551	case `2`:
552	a += ((uint32) k[`1`] << `16`);
553	/ fall through /
554	case `1`:
555	a += ((uint32) k[`0`] << `24`);
556	/ case 0: nothing left to add /
557	}
558	#else /* !WORDS_BIGENDIAN */
559	switch (len)
560	{
561	case `11`:
562	c += ((uint32) k[`10`] << `24`);
563	/ fall through /
564	case `10`:
565	c += ((uint32) k[`9`] << `16`);
566	/ fall through /
567	case `9`:
568	c += ((uint32) k[`8`] << `8`);
569	/ fall through /
570	case `8`:
571	/ the lowest byte of c is reserved for the length /
572	b += ((uint32) k[`7`] << `24`);
573	/ fall through /
574	case `7`:
575	b += ((uint32) k[`6`] << `16`);
576	/ fall through /
577	case `6`:
578	b += ((uint32) k[`5`] << `8`);
579	/ fall through /
580	case `5`:
581	b += k[`4`];
582	/ fall through /
583	case `4`:
584	a += ((uint32) k[`3`] << `24`);
585	/ fall through /
586	case `3`:
587	a += ((uint32) k[`2`] << `16`);
588	/ fall through /
589	case `2`:
590	a += ((uint32) k[`1`] << `8`);
591	/ fall through /
592	case `1`:
593	a += k[`0`];
594	/ case 0: nothing left to add /
595	}
596	#endif /* WORDS_BIGENDIAN */
597	}
598
599	final(a, b, c);
600
601	/ report the result /
602	PG_RETURN_UINT64(((uint64) b << `32`) \| c);
603	}
604
605	/*
606	* hash_uint32() -- hash a 32-bit value to a 32-bit value
607	*
608	* This has the same result as
609	* hash_any(&k, sizeof(uint32))
610	* but is faster and doesn't force the caller to store k into memory.
611	*/
612	Datum
613	hash_uint32(uint32 k)
614	{
615	register uint32 a,
616	b,
617	c;
618
619	a = b = c = `0x9e3779b9` + (uint32) sizeof(uint32) + `3923095`;
620	a += k;
621
622	final(a, b, c);
623
624	/ report the result /
625	return UInt32GetDatum(c);
626	}
627
628	/*
629	* hash_uint32_extended() -- hash a 32-bit value to a 64-bit value, with a seed
630	*
631	* Like hash_uint32, this is a convenience function.
632	*/
633	Datum
634	hash_uint32_extended(uint32 k, uint64 seed)
635	{
636	register uint32 a,
637	b,
638	c;
639
640	a = b = c = `0x9e3779b9` + (uint32) sizeof(uint32) + `3923095`;
641
642	if (seed != `0`)
643	{
644	a += (uint32) (seed >> `32`);
645	b += (uint32) seed;
646	mix(a, b, c);
647	}
648
649	a += k;
650
651	final(a, b, c);
652
653	/ report the result /
654	PG_RETURN_UINT64(((uint64) b << `32`) \| c);
655	}
656
657	/*
658	* string_hash: hash function for keys that are NUL-terminated strings.
659	*
660	* NOTE: this is the default hash function if none is specified.
661	*/
662	uint32
663	string_hash(const void *key, Size keysize)
664	{
665	/*
666	* If the string exceeds keysize-1 bytes, we want to hash only that many,
667	* because when it is copied into the hash table it will be truncated at
668	* that length.
669	*/
670	Size s_len = strlen((const char *) key);
671
672	s_len = Min(s_len, keysize - `1`);
673	return DatumGetUInt32(hash_any((const unsigned char *) key,
674	(int) s_len));
675	}
676
677	/*
678	* tag_hash: hash function for fixed-size tag values
679	*/
680	uint32
681	tag_hash(const void *key, Size keysize)
682	{
683	return DatumGetUInt32(hash_any((const unsigned char *) key,
684	(int) keysize));
685	}
686
687	/*
688	* uint32_hash: hash function for keys that are uint32 or int32
689	*
690	* (tag_hash works for this case too, but is slower)
691	*/
692	uint32
693	uint32_hash(const void *key, Size keysize)
694	{
695	Assert(keysize == sizeof(uint32));
696	return DatumGetUInt32(hash_uint32(((const* uint32 *) key)));
697	}
698
699	/*
700	* bitmap_hash: hash function for keys that are (pointers to) Bitmapsets
701	*
702	* Note: don't forget to specify bitmap_match as the match function!
703	*/
704	uint32
705	bitmap_hash(const void *key, Size keysize)
706	{
707	Assert(keysize == sizeof(Bitmapset *));
708	return bms_hash_value(((const* Bitmapset *const *) key));
709	}
710
711	/*
712	* bitmap_match: match function to use with bitmap_hash
713	*/
714	int
715	bitmap_match(const void key1, const* void *key2, Size keysize)
716	{
717	Assert(keysize == sizeof(Bitmapset *));
718	return !bms_equal(((const* Bitmapset *const *) key1),
719	((const* Bitmapset *const *) key2));
720	}
721

Browse the source code of PostgreSQL/src/backend/utils/hash/hashfn.c