| 1 | /*------------------------------------------------------------------------- | 
|---|
| 2 | * | 
|---|
| 3 | * hashfunc.c | 
|---|
| 4 | *	  Support functions for hash access method. | 
|---|
| 5 | * | 
|---|
| 6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group | 
|---|
| 7 | * Portions Copyright (c) 1994, Regents of the University of California | 
|---|
| 8 | * | 
|---|
| 9 | * | 
|---|
| 10 | * IDENTIFICATION | 
|---|
| 11 | *	  src/backend/access/hash/hashfunc.c | 
|---|
| 12 | * | 
|---|
| 13 | * NOTES | 
|---|
| 14 | *	  These functions are stored in pg_amproc.  For each operator class | 
|---|
| 15 | *	  defined for hash indexes, they compute the hash value of the argument. | 
|---|
| 16 | * | 
|---|
| 17 | *	  Additional hash functions appear in /utils/adt/ files for various | 
|---|
| 18 | *	  specialized datatypes. | 
|---|
| 19 | * | 
|---|
| 20 | *	  It is expected that every bit of a hash function's 32-bit result is | 
|---|
| 21 | *	  as random as every other; failure to ensure this is likely to lead | 
|---|
| 22 | *	  to poor performance of hash joins, for example.  In most cases a hash | 
|---|
| 23 | *	  function should use hash_any() or its variant hash_uint32(). | 
|---|
| 24 | *------------------------------------------------------------------------- | 
|---|
| 25 | */ | 
|---|
| 26 |  | 
|---|
| 27 | #include "postgres.h" | 
|---|
| 28 |  | 
|---|
| 29 | #include "access/hash.h" | 
|---|
| 30 | #include "catalog/pg_collation.h" | 
|---|
| 31 | #include "utils/builtins.h" | 
|---|
| 32 | #include "utils/hashutils.h" | 
|---|
| 33 | #include "utils/pg_locale.h" | 
|---|
| 34 |  | 
|---|
| 35 | /* | 
|---|
| 36 | * Datatype-specific hash functions. | 
|---|
| 37 | * | 
|---|
| 38 | * These support both hash indexes and hash joins. | 
|---|
| 39 | * | 
|---|
| 40 | * NOTE: some of these are also used by catcache operations, without | 
|---|
| 41 | * any direct connection to hash indexes.  Also, the common hash_any | 
|---|
| 42 | * routine is also used by dynahash tables. | 
|---|
| 43 | */ | 
|---|
| 44 |  | 
|---|
| 45 | /* Note: this is used for both "char" and boolean datatypes */ | 
|---|
| 46 | Datum | 
|---|
| 47 | hashchar(PG_FUNCTION_ARGS) | 
|---|
| 48 | { | 
|---|
| 49 | return hash_uint32((int32) PG_GETARG_CHAR(0)); | 
|---|
| 50 | } | 
|---|
| 51 |  | 
|---|
| 52 | Datum | 
|---|
| 53 | hashcharextended(PG_FUNCTION_ARGS) | 
|---|
| 54 | { | 
|---|
| 55 | return hash_uint32_extended((int32) PG_GETARG_CHAR(0), PG_GETARG_INT64(1)); | 
|---|
| 56 | } | 
|---|
| 57 |  | 
|---|
| 58 | Datum | 
|---|
| 59 | hashint2(PG_FUNCTION_ARGS) | 
|---|
| 60 | { | 
|---|
| 61 | return hash_uint32((int32) PG_GETARG_INT16(0)); | 
|---|
| 62 | } | 
|---|
| 63 |  | 
|---|
| 64 | Datum | 
|---|
| 65 | hashint2extended(PG_FUNCTION_ARGS) | 
|---|
| 66 | { | 
|---|
| 67 | return hash_uint32_extended((int32) PG_GETARG_INT16(0), PG_GETARG_INT64(1)); | 
|---|
| 68 | } | 
|---|
| 69 |  | 
|---|
| 70 | Datum | 
|---|
| 71 | hashint4(PG_FUNCTION_ARGS) | 
|---|
| 72 | { | 
|---|
| 73 | return hash_uint32(PG_GETARG_INT32(0)); | 
|---|
| 74 | } | 
|---|
| 75 |  | 
|---|
| 76 | Datum | 
|---|
| 77 | hashint4extended(PG_FUNCTION_ARGS) | 
|---|
| 78 | { | 
|---|
| 79 | return hash_uint32_extended(PG_GETARG_INT32(0), PG_GETARG_INT64(1)); | 
|---|
| 80 | } | 
|---|
| 81 |  | 
|---|
| 82 | Datum | 
|---|
| 83 | hashint8(PG_FUNCTION_ARGS) | 
|---|
| 84 | { | 
|---|
| 85 | /* | 
|---|
| 86 | * The idea here is to produce a hash value compatible with the values | 
|---|
| 87 | * produced by hashint4 and hashint2 for logically equal inputs; this is | 
|---|
| 88 | * necessary to support cross-type hash joins across these input types. | 
|---|
| 89 | * Since all three types are signed, we can xor the high half of the int8 | 
|---|
| 90 | * value if the sign is positive, or the complement of the high half when | 
|---|
| 91 | * the sign is negative. | 
|---|
| 92 | */ | 
|---|
| 93 | int64		val = PG_GETARG_INT64(0); | 
|---|
| 94 | uint32		lohalf = (uint32) val; | 
|---|
| 95 | uint32		hihalf = (uint32) (val >> 32); | 
|---|
| 96 |  | 
|---|
| 97 | lohalf ^= (val >= 0) ? hihalf : ~hihalf; | 
|---|
| 98 |  | 
|---|
| 99 | return hash_uint32(lohalf); | 
|---|
| 100 | } | 
|---|
| 101 |  | 
|---|
| 102 | Datum | 
|---|
| 103 | hashint8extended(PG_FUNCTION_ARGS) | 
|---|
| 104 | { | 
|---|
| 105 | /* Same approach as hashint8 */ | 
|---|
| 106 | int64		val = PG_GETARG_INT64(0); | 
|---|
| 107 | uint32		lohalf = (uint32) val; | 
|---|
| 108 | uint32		hihalf = (uint32) (val >> 32); | 
|---|
| 109 |  | 
|---|
| 110 | lohalf ^= (val >= 0) ? hihalf : ~hihalf; | 
|---|
| 111 |  | 
|---|
| 112 | return hash_uint32_extended(lohalf, PG_GETARG_INT64(1)); | 
|---|
| 113 | } | 
|---|
| 114 |  | 
|---|
| 115 | Datum | 
|---|
| 116 | hashoid(PG_FUNCTION_ARGS) | 
|---|
| 117 | { | 
|---|
| 118 | return hash_uint32((uint32) PG_GETARG_OID(0)); | 
|---|
| 119 | } | 
|---|
| 120 |  | 
|---|
| 121 | Datum | 
|---|
| 122 | hashoidextended(PG_FUNCTION_ARGS) | 
|---|
| 123 | { | 
|---|
| 124 | return hash_uint32_extended((uint32) PG_GETARG_OID(0), PG_GETARG_INT64(1)); | 
|---|
| 125 | } | 
|---|
| 126 |  | 
|---|
| 127 | Datum | 
|---|
| 128 | hashenum(PG_FUNCTION_ARGS) | 
|---|
| 129 | { | 
|---|
| 130 | return hash_uint32((uint32) PG_GETARG_OID(0)); | 
|---|
| 131 | } | 
|---|
| 132 |  | 
|---|
| 133 | Datum | 
|---|
| 134 | hashenumextended(PG_FUNCTION_ARGS) | 
|---|
| 135 | { | 
|---|
| 136 | return hash_uint32_extended((uint32) PG_GETARG_OID(0), PG_GETARG_INT64(1)); | 
|---|
| 137 | } | 
|---|
| 138 |  | 
|---|
| 139 | Datum | 
|---|
| 140 | hashfloat4(PG_FUNCTION_ARGS) | 
|---|
| 141 | { | 
|---|
| 142 | float4		key = PG_GETARG_FLOAT4(0); | 
|---|
| 143 | float8		key8; | 
|---|
| 144 |  | 
|---|
| 145 | /* | 
|---|
| 146 | * On IEEE-float machines, minus zero and zero have different bit patterns | 
|---|
| 147 | * but should compare as equal.  We must ensure that they have the same | 
|---|
| 148 | * hash value, which is most reliably done this way: | 
|---|
| 149 | */ | 
|---|
| 150 | if (key == (float4) 0) | 
|---|
| 151 | PG_RETURN_UINT32(0); | 
|---|
| 152 |  | 
|---|
| 153 | /* | 
|---|
| 154 | * To support cross-type hashing of float8 and float4, we want to return | 
|---|
| 155 | * the same hash value hashfloat8 would produce for an equal float8 value. | 
|---|
| 156 | * So, widen the value to float8 and hash that.  (We must do this rather | 
|---|
| 157 | * than have hashfloat8 try to narrow its value to float4; that could fail | 
|---|
| 158 | * on overflow.) | 
|---|
| 159 | */ | 
|---|
| 160 | key8 = key; | 
|---|
| 161 |  | 
|---|
| 162 | return hash_any((unsigned char *) &key8, sizeof(key8)); | 
|---|
| 163 | } | 
|---|
| 164 |  | 
|---|
| 165 | Datum | 
|---|
| 166 | hashfloat4extended(PG_FUNCTION_ARGS) | 
|---|
| 167 | { | 
|---|
| 168 | float4		key = PG_GETARG_FLOAT4(0); | 
|---|
| 169 | uint64		seed = PG_GETARG_INT64(1); | 
|---|
| 170 | float8		key8; | 
|---|
| 171 |  | 
|---|
| 172 | /* Same approach as hashfloat4 */ | 
|---|
| 173 | if (key == (float4) 0) | 
|---|
| 174 | PG_RETURN_UINT64(seed); | 
|---|
| 175 | key8 = key; | 
|---|
| 176 |  | 
|---|
| 177 | return hash_any_extended((unsigned char *) &key8, sizeof(key8), seed); | 
|---|
| 178 | } | 
|---|
| 179 |  | 
|---|
| 180 | Datum | 
|---|
| 181 | hashfloat8(PG_FUNCTION_ARGS) | 
|---|
| 182 | { | 
|---|
| 183 | float8		key = PG_GETARG_FLOAT8(0); | 
|---|
| 184 |  | 
|---|
| 185 | /* | 
|---|
| 186 | * On IEEE-float machines, minus zero and zero have different bit patterns | 
|---|
| 187 | * but should compare as equal.  We must ensure that they have the same | 
|---|
| 188 | * hash value, which is most reliably done this way: | 
|---|
| 189 | */ | 
|---|
| 190 | if (key == (float8) 0) | 
|---|
| 191 | PG_RETURN_UINT32(0); | 
|---|
| 192 |  | 
|---|
| 193 | return hash_any((unsigned char *) &key, sizeof(key)); | 
|---|
| 194 | } | 
|---|
| 195 |  | 
|---|
| 196 | Datum | 
|---|
| 197 | hashfloat8extended(PG_FUNCTION_ARGS) | 
|---|
| 198 | { | 
|---|
| 199 | float8		key = PG_GETARG_FLOAT8(0); | 
|---|
| 200 | uint64		seed = PG_GETARG_INT64(1); | 
|---|
| 201 |  | 
|---|
| 202 | /* Same approach as hashfloat8 */ | 
|---|
| 203 | if (key == (float8) 0) | 
|---|
| 204 | PG_RETURN_UINT64(seed); | 
|---|
| 205 |  | 
|---|
| 206 | return hash_any_extended((unsigned char *) &key, sizeof(key), seed); | 
|---|
| 207 | } | 
|---|
| 208 |  | 
|---|
| 209 | Datum | 
|---|
| 210 | hashoidvector(PG_FUNCTION_ARGS) | 
|---|
| 211 | { | 
|---|
| 212 | oidvector  *key = (oidvector *) PG_GETARG_POINTER(0); | 
|---|
| 213 |  | 
|---|
| 214 | return hash_any((unsigned char *) key->values, key->dim1 * sizeof(Oid)); | 
|---|
| 215 | } | 
|---|
| 216 |  | 
|---|
| 217 | Datum | 
|---|
| 218 | hashoidvectorextended(PG_FUNCTION_ARGS) | 
|---|
| 219 | { | 
|---|
| 220 | oidvector  *key = (oidvector *) PG_GETARG_POINTER(0); | 
|---|
| 221 |  | 
|---|
| 222 | return hash_any_extended((unsigned char *) key->values, | 
|---|
| 223 | key->dim1 * sizeof(Oid), | 
|---|
| 224 | PG_GETARG_INT64(1)); | 
|---|
| 225 | } | 
|---|
| 226 |  | 
|---|
| 227 | Datum | 
|---|
| 228 | hashname(PG_FUNCTION_ARGS) | 
|---|
| 229 | { | 
|---|
| 230 | char	   *key = NameStr(*PG_GETARG_NAME(0)); | 
|---|
| 231 |  | 
|---|
| 232 | return hash_any((unsigned char *) key, strlen(key)); | 
|---|
| 233 | } | 
|---|
| 234 |  | 
|---|
| 235 | Datum | 
|---|
| 236 | hashnameextended(PG_FUNCTION_ARGS) | 
|---|
| 237 | { | 
|---|
| 238 | char	   *key = NameStr(*PG_GETARG_NAME(0)); | 
|---|
| 239 |  | 
|---|
| 240 | return hash_any_extended((unsigned char *) key, strlen(key), | 
|---|
| 241 | PG_GETARG_INT64(1)); | 
|---|
| 242 | } | 
|---|
| 243 |  | 
|---|
| 244 | Datum | 
|---|
| 245 | hashtext(PG_FUNCTION_ARGS) | 
|---|
| 246 | { | 
|---|
| 247 | text	   *key = PG_GETARG_TEXT_PP(0); | 
|---|
| 248 | Oid			collid = PG_GET_COLLATION(); | 
|---|
| 249 | pg_locale_t mylocale = 0; | 
|---|
| 250 | Datum		result; | 
|---|
| 251 |  | 
|---|
| 252 | if (!collid) | 
|---|
| 253 | ereport(ERROR, | 
|---|
| 254 | (errcode(ERRCODE_INDETERMINATE_COLLATION), | 
|---|
| 255 | errmsg( "could not determine which collation to use for string hashing"), | 
|---|
| 256 | errhint( "Use the COLLATE clause to set the collation explicitly."))); | 
|---|
| 257 |  | 
|---|
| 258 | if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID) | 
|---|
| 259 | mylocale = pg_newlocale_from_collation(collid); | 
|---|
| 260 |  | 
|---|
| 261 | if (!mylocale || mylocale->deterministic) | 
|---|
| 262 | { | 
|---|
| 263 | result = hash_any((unsigned char *) VARDATA_ANY(key), | 
|---|
| 264 | VARSIZE_ANY_EXHDR(key)); | 
|---|
| 265 | } | 
|---|
| 266 | else | 
|---|
| 267 | { | 
|---|
| 268 | #ifdef USE_ICU | 
|---|
| 269 | if (mylocale->provider == COLLPROVIDER_ICU) | 
|---|
| 270 | { | 
|---|
| 271 | int32_t		ulen = -1; | 
|---|
| 272 | UChar	   *uchar = NULL; | 
|---|
| 273 | Size		bsize; | 
|---|
| 274 | uint8_t    *buf; | 
|---|
| 275 |  | 
|---|
| 276 | ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key)); | 
|---|
| 277 |  | 
|---|
| 278 | bsize = ucol_getSortKey(mylocale->info.icu.ucol, | 
|---|
| 279 | uchar, ulen, NULL, 0); | 
|---|
| 280 | buf = palloc(bsize); | 
|---|
| 281 | ucol_getSortKey(mylocale->info.icu.ucol, | 
|---|
| 282 | uchar, ulen, buf, bsize); | 
|---|
| 283 |  | 
|---|
| 284 | result = hash_any(buf, bsize); | 
|---|
| 285 |  | 
|---|
| 286 | pfree(buf); | 
|---|
| 287 | } | 
|---|
| 288 | else | 
|---|
| 289 | #endif | 
|---|
| 290 | /* shouldn't happen */ | 
|---|
| 291 | elog(ERROR, "unsupported collprovider: %c", mylocale->provider); | 
|---|
| 292 | } | 
|---|
| 293 |  | 
|---|
| 294 | /* Avoid leaking memory for toasted inputs */ | 
|---|
| 295 | PG_FREE_IF_COPY(key, 0); | 
|---|
| 296 |  | 
|---|
| 297 | return result; | 
|---|
| 298 | } | 
|---|
| 299 |  | 
|---|
| 300 | Datum | 
|---|
| 301 | hashtextextended(PG_FUNCTION_ARGS) | 
|---|
| 302 | { | 
|---|
| 303 | text	   *key = PG_GETARG_TEXT_PP(0); | 
|---|
| 304 | Oid			collid = PG_GET_COLLATION(); | 
|---|
| 305 | pg_locale_t mylocale = 0; | 
|---|
| 306 | Datum		result; | 
|---|
| 307 |  | 
|---|
| 308 | if (!collid) | 
|---|
| 309 | ereport(ERROR, | 
|---|
| 310 | (errcode(ERRCODE_INDETERMINATE_COLLATION), | 
|---|
| 311 | errmsg( "could not determine which collation to use for string hashing"), | 
|---|
| 312 | errhint( "Use the COLLATE clause to set the collation explicitly."))); | 
|---|
| 313 |  | 
|---|
| 314 | if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID) | 
|---|
| 315 | mylocale = pg_newlocale_from_collation(collid); | 
|---|
| 316 |  | 
|---|
| 317 | if (!mylocale || mylocale->deterministic) | 
|---|
| 318 | { | 
|---|
| 319 | result = hash_any_extended((unsigned char *) VARDATA_ANY(key), | 
|---|
| 320 | VARSIZE_ANY_EXHDR(key), | 
|---|
| 321 | PG_GETARG_INT64(1)); | 
|---|
| 322 | } | 
|---|
| 323 | else | 
|---|
| 324 | { | 
|---|
| 325 | #ifdef USE_ICU | 
|---|
| 326 | if (mylocale->provider == COLLPROVIDER_ICU) | 
|---|
| 327 | { | 
|---|
| 328 | int32_t		ulen = -1; | 
|---|
| 329 | UChar	   *uchar = NULL; | 
|---|
| 330 | Size		bsize; | 
|---|
| 331 | uint8_t    *buf; | 
|---|
| 332 |  | 
|---|
| 333 | ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key)); | 
|---|
| 334 |  | 
|---|
| 335 | bsize = ucol_getSortKey(mylocale->info.icu.ucol, | 
|---|
| 336 | uchar, ulen, NULL, 0); | 
|---|
| 337 | buf = palloc(bsize); | 
|---|
| 338 | ucol_getSortKey(mylocale->info.icu.ucol, | 
|---|
| 339 | uchar, ulen, buf, bsize); | 
|---|
| 340 |  | 
|---|
| 341 | result = hash_any_extended(buf, bsize, PG_GETARG_INT64(1)); | 
|---|
| 342 |  | 
|---|
| 343 | pfree(buf); | 
|---|
| 344 | } | 
|---|
| 345 | else | 
|---|
| 346 | #endif | 
|---|
| 347 | /* shouldn't happen */ | 
|---|
| 348 | elog(ERROR, "unsupported collprovider: %c", mylocale->provider); | 
|---|
| 349 | } | 
|---|
| 350 |  | 
|---|
| 351 | PG_FREE_IF_COPY(key, 0); | 
|---|
| 352 |  | 
|---|
| 353 | return result; | 
|---|
| 354 | } | 
|---|
| 355 |  | 
|---|
| 356 | /* | 
|---|
| 357 | * hashvarlena() can be used for any varlena datatype in which there are | 
|---|
| 358 | * no non-significant bits, ie, distinct bitpatterns never compare as equal. | 
|---|
| 359 | */ | 
|---|
| 360 | Datum | 
|---|
| 361 | hashvarlena(PG_FUNCTION_ARGS) | 
|---|
| 362 | { | 
|---|
| 363 | struct varlena *key = PG_GETARG_VARLENA_PP(0); | 
|---|
| 364 | Datum		result; | 
|---|
| 365 |  | 
|---|
| 366 | result = hash_any((unsigned char *) VARDATA_ANY(key), | 
|---|
| 367 | VARSIZE_ANY_EXHDR(key)); | 
|---|
| 368 |  | 
|---|
| 369 | /* Avoid leaking memory for toasted inputs */ | 
|---|
| 370 | PG_FREE_IF_COPY(key, 0); | 
|---|
| 371 |  | 
|---|
| 372 | return result; | 
|---|
| 373 | } | 
|---|
| 374 |  | 
|---|
| 375 | Datum | 
|---|
| 376 | hashvarlenaextended(PG_FUNCTION_ARGS) | 
|---|
| 377 | { | 
|---|
| 378 | struct varlena *key = PG_GETARG_VARLENA_PP(0); | 
|---|
| 379 | Datum		result; | 
|---|
| 380 |  | 
|---|
| 381 | result = hash_any_extended((unsigned char *) VARDATA_ANY(key), | 
|---|
| 382 | VARSIZE_ANY_EXHDR(key), | 
|---|
| 383 | PG_GETARG_INT64(1)); | 
|---|
| 384 |  | 
|---|
| 385 | PG_FREE_IF_COPY(key, 0); | 
|---|
| 386 |  | 
|---|
| 387 | return result; | 
|---|
| 388 | } | 
|---|
| 389 |  | 
|---|