1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * hashfunc.c |
4 | * Support functions for hash access method. |
5 | * |
6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
7 | * Portions Copyright (c) 1994, Regents of the University of California |
8 | * |
9 | * |
10 | * IDENTIFICATION |
11 | * src/backend/access/hash/hashfunc.c |
12 | * |
13 | * NOTES |
14 | * These functions are stored in pg_amproc. For each operator class |
15 | * defined for hash indexes, they compute the hash value of the argument. |
16 | * |
17 | * Additional hash functions appear in /utils/adt/ files for various |
18 | * specialized datatypes. |
19 | * |
20 | * It is expected that every bit of a hash function's 32-bit result is |
21 | * as random as every other; failure to ensure this is likely to lead |
22 | * to poor performance of hash joins, for example. In most cases a hash |
23 | * function should use hash_any() or its variant hash_uint32(). |
24 | *------------------------------------------------------------------------- |
25 | */ |
26 | |
27 | #include "postgres.h" |
28 | |
29 | #include "access/hash.h" |
30 | #include "catalog/pg_collation.h" |
31 | #include "utils/builtins.h" |
32 | #include "utils/hashutils.h" |
33 | #include "utils/pg_locale.h" |
34 | |
35 | /* |
36 | * Datatype-specific hash functions. |
37 | * |
38 | * These support both hash indexes and hash joins. |
39 | * |
40 | * NOTE: some of these are also used by catcache operations, without |
41 | * any direct connection to hash indexes. Also, the common hash_any |
42 | * routine is also used by dynahash tables. |
43 | */ |
44 | |
45 | /* Note: this is used for both "char" and boolean datatypes */ |
46 | Datum |
47 | hashchar(PG_FUNCTION_ARGS) |
48 | { |
49 | return hash_uint32((int32) PG_GETARG_CHAR(0)); |
50 | } |
51 | |
52 | Datum |
53 | hashcharextended(PG_FUNCTION_ARGS) |
54 | { |
55 | return hash_uint32_extended((int32) PG_GETARG_CHAR(0), PG_GETARG_INT64(1)); |
56 | } |
57 | |
58 | Datum |
59 | hashint2(PG_FUNCTION_ARGS) |
60 | { |
61 | return hash_uint32((int32) PG_GETARG_INT16(0)); |
62 | } |
63 | |
64 | Datum |
65 | hashint2extended(PG_FUNCTION_ARGS) |
66 | { |
67 | return hash_uint32_extended((int32) PG_GETARG_INT16(0), PG_GETARG_INT64(1)); |
68 | } |
69 | |
70 | Datum |
71 | hashint4(PG_FUNCTION_ARGS) |
72 | { |
73 | return hash_uint32(PG_GETARG_INT32(0)); |
74 | } |
75 | |
76 | Datum |
77 | hashint4extended(PG_FUNCTION_ARGS) |
78 | { |
79 | return hash_uint32_extended(PG_GETARG_INT32(0), PG_GETARG_INT64(1)); |
80 | } |
81 | |
82 | Datum |
83 | hashint8(PG_FUNCTION_ARGS) |
84 | { |
85 | /* |
86 | * The idea here is to produce a hash value compatible with the values |
87 | * produced by hashint4 and hashint2 for logically equal inputs; this is |
88 | * necessary to support cross-type hash joins across these input types. |
89 | * Since all three types are signed, we can xor the high half of the int8 |
90 | * value if the sign is positive, or the complement of the high half when |
91 | * the sign is negative. |
92 | */ |
93 | int64 val = PG_GETARG_INT64(0); |
94 | uint32 lohalf = (uint32) val; |
95 | uint32 hihalf = (uint32) (val >> 32); |
96 | |
97 | lohalf ^= (val >= 0) ? hihalf : ~hihalf; |
98 | |
99 | return hash_uint32(lohalf); |
100 | } |
101 | |
102 | Datum |
103 | hashint8extended(PG_FUNCTION_ARGS) |
104 | { |
105 | /* Same approach as hashint8 */ |
106 | int64 val = PG_GETARG_INT64(0); |
107 | uint32 lohalf = (uint32) val; |
108 | uint32 hihalf = (uint32) (val >> 32); |
109 | |
110 | lohalf ^= (val >= 0) ? hihalf : ~hihalf; |
111 | |
112 | return hash_uint32_extended(lohalf, PG_GETARG_INT64(1)); |
113 | } |
114 | |
115 | Datum |
116 | hashoid(PG_FUNCTION_ARGS) |
117 | { |
118 | return hash_uint32((uint32) PG_GETARG_OID(0)); |
119 | } |
120 | |
121 | Datum |
122 | hashoidextended(PG_FUNCTION_ARGS) |
123 | { |
124 | return hash_uint32_extended((uint32) PG_GETARG_OID(0), PG_GETARG_INT64(1)); |
125 | } |
126 | |
127 | Datum |
128 | hashenum(PG_FUNCTION_ARGS) |
129 | { |
130 | return hash_uint32((uint32) PG_GETARG_OID(0)); |
131 | } |
132 | |
133 | Datum |
134 | hashenumextended(PG_FUNCTION_ARGS) |
135 | { |
136 | return hash_uint32_extended((uint32) PG_GETARG_OID(0), PG_GETARG_INT64(1)); |
137 | } |
138 | |
139 | Datum |
140 | hashfloat4(PG_FUNCTION_ARGS) |
141 | { |
142 | float4 key = PG_GETARG_FLOAT4(0); |
143 | float8 key8; |
144 | |
145 | /* |
146 | * On IEEE-float machines, minus zero and zero have different bit patterns |
147 | * but should compare as equal. We must ensure that they have the same |
148 | * hash value, which is most reliably done this way: |
149 | */ |
150 | if (key == (float4) 0) |
151 | PG_RETURN_UINT32(0); |
152 | |
153 | /* |
154 | * To support cross-type hashing of float8 and float4, we want to return |
155 | * the same hash value hashfloat8 would produce for an equal float8 value. |
156 | * So, widen the value to float8 and hash that. (We must do this rather |
157 | * than have hashfloat8 try to narrow its value to float4; that could fail |
158 | * on overflow.) |
159 | */ |
160 | key8 = key; |
161 | |
162 | return hash_any((unsigned char *) &key8, sizeof(key8)); |
163 | } |
164 | |
165 | Datum |
166 | hashfloat4extended(PG_FUNCTION_ARGS) |
167 | { |
168 | float4 key = PG_GETARG_FLOAT4(0); |
169 | uint64 seed = PG_GETARG_INT64(1); |
170 | float8 key8; |
171 | |
172 | /* Same approach as hashfloat4 */ |
173 | if (key == (float4) 0) |
174 | PG_RETURN_UINT64(seed); |
175 | key8 = key; |
176 | |
177 | return hash_any_extended((unsigned char *) &key8, sizeof(key8), seed); |
178 | } |
179 | |
180 | Datum |
181 | hashfloat8(PG_FUNCTION_ARGS) |
182 | { |
183 | float8 key = PG_GETARG_FLOAT8(0); |
184 | |
185 | /* |
186 | * On IEEE-float machines, minus zero and zero have different bit patterns |
187 | * but should compare as equal. We must ensure that they have the same |
188 | * hash value, which is most reliably done this way: |
189 | */ |
190 | if (key == (float8) 0) |
191 | PG_RETURN_UINT32(0); |
192 | |
193 | return hash_any((unsigned char *) &key, sizeof(key)); |
194 | } |
195 | |
196 | Datum |
197 | hashfloat8extended(PG_FUNCTION_ARGS) |
198 | { |
199 | float8 key = PG_GETARG_FLOAT8(0); |
200 | uint64 seed = PG_GETARG_INT64(1); |
201 | |
202 | /* Same approach as hashfloat8 */ |
203 | if (key == (float8) 0) |
204 | PG_RETURN_UINT64(seed); |
205 | |
206 | return hash_any_extended((unsigned char *) &key, sizeof(key), seed); |
207 | } |
208 | |
209 | Datum |
210 | hashoidvector(PG_FUNCTION_ARGS) |
211 | { |
212 | oidvector *key = (oidvector *) PG_GETARG_POINTER(0); |
213 | |
214 | return hash_any((unsigned char *) key->values, key->dim1 * sizeof(Oid)); |
215 | } |
216 | |
217 | Datum |
218 | hashoidvectorextended(PG_FUNCTION_ARGS) |
219 | { |
220 | oidvector *key = (oidvector *) PG_GETARG_POINTER(0); |
221 | |
222 | return hash_any_extended((unsigned char *) key->values, |
223 | key->dim1 * sizeof(Oid), |
224 | PG_GETARG_INT64(1)); |
225 | } |
226 | |
227 | Datum |
228 | hashname(PG_FUNCTION_ARGS) |
229 | { |
230 | char *key = NameStr(*PG_GETARG_NAME(0)); |
231 | |
232 | return hash_any((unsigned char *) key, strlen(key)); |
233 | } |
234 | |
235 | Datum |
236 | hashnameextended(PG_FUNCTION_ARGS) |
237 | { |
238 | char *key = NameStr(*PG_GETARG_NAME(0)); |
239 | |
240 | return hash_any_extended((unsigned char *) key, strlen(key), |
241 | PG_GETARG_INT64(1)); |
242 | } |
243 | |
244 | Datum |
245 | hashtext(PG_FUNCTION_ARGS) |
246 | { |
247 | text *key = PG_GETARG_TEXT_PP(0); |
248 | Oid collid = PG_GET_COLLATION(); |
249 | pg_locale_t mylocale = 0; |
250 | Datum result; |
251 | |
252 | if (!collid) |
253 | ereport(ERROR, |
254 | (errcode(ERRCODE_INDETERMINATE_COLLATION), |
255 | errmsg("could not determine which collation to use for string hashing" ), |
256 | errhint("Use the COLLATE clause to set the collation explicitly." ))); |
257 | |
258 | if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID) |
259 | mylocale = pg_newlocale_from_collation(collid); |
260 | |
261 | if (!mylocale || mylocale->deterministic) |
262 | { |
263 | result = hash_any((unsigned char *) VARDATA_ANY(key), |
264 | VARSIZE_ANY_EXHDR(key)); |
265 | } |
266 | else |
267 | { |
268 | #ifdef USE_ICU |
269 | if (mylocale->provider == COLLPROVIDER_ICU) |
270 | { |
271 | int32_t ulen = -1; |
272 | UChar *uchar = NULL; |
273 | Size bsize; |
274 | uint8_t *buf; |
275 | |
276 | ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key)); |
277 | |
278 | bsize = ucol_getSortKey(mylocale->info.icu.ucol, |
279 | uchar, ulen, NULL, 0); |
280 | buf = palloc(bsize); |
281 | ucol_getSortKey(mylocale->info.icu.ucol, |
282 | uchar, ulen, buf, bsize); |
283 | |
284 | result = hash_any(buf, bsize); |
285 | |
286 | pfree(buf); |
287 | } |
288 | else |
289 | #endif |
290 | /* shouldn't happen */ |
291 | elog(ERROR, "unsupported collprovider: %c" , mylocale->provider); |
292 | } |
293 | |
294 | /* Avoid leaking memory for toasted inputs */ |
295 | PG_FREE_IF_COPY(key, 0); |
296 | |
297 | return result; |
298 | } |
299 | |
300 | Datum |
301 | hashtextextended(PG_FUNCTION_ARGS) |
302 | { |
303 | text *key = PG_GETARG_TEXT_PP(0); |
304 | Oid collid = PG_GET_COLLATION(); |
305 | pg_locale_t mylocale = 0; |
306 | Datum result; |
307 | |
308 | if (!collid) |
309 | ereport(ERROR, |
310 | (errcode(ERRCODE_INDETERMINATE_COLLATION), |
311 | errmsg("could not determine which collation to use for string hashing" ), |
312 | errhint("Use the COLLATE clause to set the collation explicitly." ))); |
313 | |
314 | if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID) |
315 | mylocale = pg_newlocale_from_collation(collid); |
316 | |
317 | if (!mylocale || mylocale->deterministic) |
318 | { |
319 | result = hash_any_extended((unsigned char *) VARDATA_ANY(key), |
320 | VARSIZE_ANY_EXHDR(key), |
321 | PG_GETARG_INT64(1)); |
322 | } |
323 | else |
324 | { |
325 | #ifdef USE_ICU |
326 | if (mylocale->provider == COLLPROVIDER_ICU) |
327 | { |
328 | int32_t ulen = -1; |
329 | UChar *uchar = NULL; |
330 | Size bsize; |
331 | uint8_t *buf; |
332 | |
333 | ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key)); |
334 | |
335 | bsize = ucol_getSortKey(mylocale->info.icu.ucol, |
336 | uchar, ulen, NULL, 0); |
337 | buf = palloc(bsize); |
338 | ucol_getSortKey(mylocale->info.icu.ucol, |
339 | uchar, ulen, buf, bsize); |
340 | |
341 | result = hash_any_extended(buf, bsize, PG_GETARG_INT64(1)); |
342 | |
343 | pfree(buf); |
344 | } |
345 | else |
346 | #endif |
347 | /* shouldn't happen */ |
348 | elog(ERROR, "unsupported collprovider: %c" , mylocale->provider); |
349 | } |
350 | |
351 | PG_FREE_IF_COPY(key, 0); |
352 | |
353 | return result; |
354 | } |
355 | |
356 | /* |
357 | * hashvarlena() can be used for any varlena datatype in which there are |
358 | * no non-significant bits, ie, distinct bitpatterns never compare as equal. |
359 | */ |
360 | Datum |
361 | hashvarlena(PG_FUNCTION_ARGS) |
362 | { |
363 | struct varlena *key = PG_GETARG_VARLENA_PP(0); |
364 | Datum result; |
365 | |
366 | result = hash_any((unsigned char *) VARDATA_ANY(key), |
367 | VARSIZE_ANY_EXHDR(key)); |
368 | |
369 | /* Avoid leaking memory for toasted inputs */ |
370 | PG_FREE_IF_COPY(key, 0); |
371 | |
372 | return result; |
373 | } |
374 | |
375 | Datum |
376 | hashvarlenaextended(PG_FUNCTION_ARGS) |
377 | { |
378 | struct varlena *key = PG_GETARG_VARLENA_PP(0); |
379 | Datum result; |
380 | |
381 | result = hash_any_extended((unsigned char *) VARDATA_ANY(key), |
382 | VARSIZE_ANY_EXHDR(key), |
383 | PG_GETARG_INT64(1)); |
384 | |
385 | PG_FREE_IF_COPY(key, 0); |
386 | |
387 | return result; |
388 | } |
389 | |