| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * execGrouping.c |
| 4 | * executor utility routines for grouping, hashing, and aggregation |
| 5 | * |
| 6 | * Note: we currently assume that equality and hashing functions are not |
| 7 | * collation-sensitive, so the code in this file has no support for passing |
| 8 | * collation settings through from callers. That may have to change someday. |
| 9 | * |
| 10 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 11 | * Portions Copyright (c) 1994, Regents of the University of California |
| 12 | * |
| 13 | * |
| 14 | * IDENTIFICATION |
| 15 | * src/backend/executor/execGrouping.c |
| 16 | * |
| 17 | *------------------------------------------------------------------------- |
| 18 | */ |
| 19 | #include "postgres.h" |
| 20 | |
| 21 | #include "access/parallel.h" |
| 22 | #include "executor/executor.h" |
| 23 | #include "miscadmin.h" |
| 24 | #include "utils/lsyscache.h" |
| 25 | #include "utils/hashutils.h" |
| 26 | #include "utils/memutils.h" |
| 27 | |
| 28 | static uint32 TupleHashTableHash(struct tuplehash_hash *tb, const MinimalTuple tuple); |
| 29 | static int TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2); |
| 30 | |
| 31 | /* |
| 32 | * Define parameters for tuple hash table code generation. The interface is |
| 33 | * *also* declared in execnodes.h (to generate the types, which are externally |
| 34 | * visible). |
| 35 | */ |
| 36 | #define SH_PREFIX tuplehash |
| 37 | #define SH_ELEMENT_TYPE TupleHashEntryData |
| 38 | #define SH_KEY_TYPE MinimalTuple |
| 39 | #define SH_KEY firstTuple |
| 40 | #define SH_HASH_KEY(tb, key) TupleHashTableHash(tb, key) |
| 41 | #define SH_EQUAL(tb, a, b) TupleHashTableMatch(tb, a, b) == 0 |
| 42 | #define SH_SCOPE extern |
| 43 | #define SH_STORE_HASH |
| 44 | #define SH_GET_HASH(tb, a) a->hash |
| 45 | #define SH_DEFINE |
| 46 | #include "lib/simplehash.h" |
| 47 | |
| 48 | |
| 49 | /***************************************************************************** |
| 50 | * Utility routines for grouping tuples together |
| 51 | *****************************************************************************/ |
| 52 | |
| 53 | /* |
| 54 | * execTuplesMatchPrepare |
| 55 | * Build expression that can be evaluated using ExecQual(), returning |
| 56 | * whether an ExprContext's inner/outer tuples are NOT DISTINCT |
| 57 | */ |
| 58 | ExprState * |
| 59 | execTuplesMatchPrepare(TupleDesc desc, |
| 60 | int numCols, |
| 61 | const AttrNumber *keyColIdx, |
| 62 | const Oid *eqOperators, |
| 63 | const Oid *collations, |
| 64 | PlanState *parent) |
| 65 | { |
| 66 | Oid *eqFunctions = (Oid *) palloc(numCols * sizeof(Oid)); |
| 67 | int i; |
| 68 | ExprState *expr; |
| 69 | |
| 70 | if (numCols == 0) |
| 71 | return NULL; |
| 72 | |
| 73 | /* lookup equality functions */ |
| 74 | for (i = 0; i < numCols; i++) |
| 75 | eqFunctions[i] = get_opcode(eqOperators[i]); |
| 76 | |
| 77 | /* build actual expression */ |
| 78 | expr = ExecBuildGroupingEqual(desc, desc, NULL, NULL, |
| 79 | numCols, keyColIdx, eqFunctions, collations, |
| 80 | parent); |
| 81 | |
| 82 | return expr; |
| 83 | } |
| 84 | |
| 85 | /* |
| 86 | * execTuplesHashPrepare |
| 87 | * Look up the equality and hashing functions needed for a TupleHashTable. |
| 88 | * |
| 89 | * This is similar to execTuplesMatchPrepare, but we also need to find the |
| 90 | * hash functions associated with the equality operators. *eqFunctions and |
| 91 | * *hashFunctions receive the palloc'd result arrays. |
| 92 | * |
| 93 | * Note: we expect that the given operators are not cross-type comparisons. |
| 94 | */ |
| 95 | void |
| 96 | execTuplesHashPrepare(int numCols, |
| 97 | const Oid *eqOperators, |
| 98 | Oid **eqFuncOids, |
| 99 | FmgrInfo **hashFunctions) |
| 100 | { |
| 101 | int i; |
| 102 | |
| 103 | *eqFuncOids = (Oid *) palloc(numCols * sizeof(Oid)); |
| 104 | *hashFunctions = (FmgrInfo *) palloc(numCols * sizeof(FmgrInfo)); |
| 105 | |
| 106 | for (i = 0; i < numCols; i++) |
| 107 | { |
| 108 | Oid eq_opr = eqOperators[i]; |
| 109 | Oid eq_function; |
| 110 | Oid left_hash_function; |
| 111 | Oid right_hash_function; |
| 112 | |
| 113 | eq_function = get_opcode(eq_opr); |
| 114 | if (!get_op_hash_functions(eq_opr, |
| 115 | &left_hash_function, &right_hash_function)) |
| 116 | elog(ERROR, "could not find hash function for hash operator %u" , |
| 117 | eq_opr); |
| 118 | /* We're not supporting cross-type cases here */ |
| 119 | Assert(left_hash_function == right_hash_function); |
| 120 | (*eqFuncOids)[i] = eq_function; |
| 121 | fmgr_info(right_hash_function, &(*hashFunctions)[i]); |
| 122 | } |
| 123 | } |
| 124 | |
| 125 | |
| 126 | /***************************************************************************** |
| 127 | * Utility routines for all-in-memory hash tables |
| 128 | * |
| 129 | * These routines build hash tables for grouping tuples together (eg, for |
| 130 | * hash aggregation). There is one entry for each not-distinct set of tuples |
| 131 | * presented. |
| 132 | *****************************************************************************/ |
| 133 | |
| 134 | /* |
| 135 | * Construct an empty TupleHashTable |
| 136 | * |
| 137 | * numCols, keyColIdx: identify the tuple fields to use as lookup key |
| 138 | * eqfunctions: equality comparison functions to use |
| 139 | * hashfunctions: datatype-specific hashing functions to use |
| 140 | * nbuckets: initial estimate of hashtable size |
| 141 | * additionalsize: size of data stored in ->additional |
| 142 | * metacxt: memory context for long-lived allocation, but not per-entry data |
| 143 | * tablecxt: memory context in which to store table entries |
| 144 | * tempcxt: short-lived context for evaluation hash and comparison functions |
| 145 | * |
| 146 | * The function arrays may be made with execTuplesHashPrepare(). Note they |
| 147 | * are not cross-type functions, but expect to see the table datatype(s) |
| 148 | * on both sides. |
| 149 | * |
| 150 | * Note that keyColIdx, eqfunctions, and hashfunctions must be allocated in |
| 151 | * storage that will live as long as the hashtable does. |
| 152 | */ |
| 153 | TupleHashTable |
| 154 | BuildTupleHashTableExt(PlanState *parent, |
| 155 | TupleDesc inputDesc, |
| 156 | int numCols, AttrNumber *keyColIdx, |
| 157 | const Oid *eqfuncoids, |
| 158 | FmgrInfo *hashfunctions, |
| 159 | Oid *collations, |
| 160 | long nbuckets, Size additionalsize, |
| 161 | MemoryContext metacxt, |
| 162 | MemoryContext tablecxt, |
| 163 | MemoryContext tempcxt, |
| 164 | bool use_variable_hash_iv) |
| 165 | { |
| 166 | TupleHashTable hashtable; |
| 167 | Size entrysize = sizeof(TupleHashEntryData) + additionalsize; |
| 168 | MemoryContext oldcontext; |
| 169 | bool allow_jit; |
| 170 | |
| 171 | Assert(nbuckets > 0); |
| 172 | |
| 173 | /* Limit initial table size request to not more than work_mem */ |
| 174 | nbuckets = Min(nbuckets, (long) ((work_mem * 1024L) / entrysize)); |
| 175 | |
| 176 | oldcontext = MemoryContextSwitchTo(metacxt); |
| 177 | |
| 178 | hashtable = (TupleHashTable) palloc(sizeof(TupleHashTableData)); |
| 179 | |
| 180 | hashtable->numCols = numCols; |
| 181 | hashtable->keyColIdx = keyColIdx; |
| 182 | hashtable->tab_hash_funcs = hashfunctions; |
| 183 | hashtable->tab_collations = collations; |
| 184 | hashtable->tablecxt = tablecxt; |
| 185 | hashtable->tempcxt = tempcxt; |
| 186 | hashtable->entrysize = entrysize; |
| 187 | hashtable->tableslot = NULL; /* will be made on first lookup */ |
| 188 | hashtable->inputslot = NULL; |
| 189 | hashtable->in_hash_funcs = NULL; |
| 190 | hashtable->cur_eq_func = NULL; |
| 191 | |
| 192 | /* |
| 193 | * If parallelism is in use, even if the master backend is performing the |
| 194 | * scan itself, we don't want to create the hashtable exactly the same way |
| 195 | * in all workers. As hashtables are iterated over in keyspace-order, |
| 196 | * doing so in all processes in the same way is likely to lead to |
| 197 | * "unbalanced" hashtables when the table size initially is |
| 198 | * underestimated. |
| 199 | */ |
| 200 | if (use_variable_hash_iv) |
| 201 | hashtable->hash_iv = murmurhash32(ParallelWorkerNumber); |
| 202 | else |
| 203 | hashtable->hash_iv = 0; |
| 204 | |
| 205 | hashtable->hashtab = tuplehash_create(metacxt, nbuckets, hashtable); |
| 206 | |
| 207 | /* |
| 208 | * We copy the input tuple descriptor just for safety --- we assume all |
| 209 | * input tuples will have equivalent descriptors. |
| 210 | */ |
| 211 | hashtable->tableslot = MakeSingleTupleTableSlot(CreateTupleDescCopy(inputDesc), |
| 212 | &TTSOpsMinimalTuple); |
| 213 | |
| 214 | /* |
| 215 | * If the old reset interface is used (i.e. BuildTupleHashTable, rather |
| 216 | * than BuildTupleHashTableExt), allowing JIT would lead to the generated |
| 217 | * functions to a) live longer than the query b) be re-generated each time |
| 218 | * the table is being reset. Therefore prevent JIT from being used in that |
| 219 | * case, by not providing a parent node (which prevents accessing the |
| 220 | * JitContext in the EState). |
| 221 | */ |
| 222 | allow_jit = metacxt != tablecxt; |
| 223 | |
| 224 | /* build comparator for all columns */ |
| 225 | /* XXX: should we support non-minimal tuples for the inputslot? */ |
| 226 | hashtable->tab_eq_func = ExecBuildGroupingEqual(inputDesc, inputDesc, |
| 227 | &TTSOpsMinimalTuple, &TTSOpsMinimalTuple, |
| 228 | numCols, |
| 229 | keyColIdx, eqfuncoids, collations, |
| 230 | allow_jit ? parent : NULL); |
| 231 | |
| 232 | /* |
| 233 | * While not pretty, it's ok to not shut down this context, but instead |
| 234 | * rely on the containing memory context being reset, as |
| 235 | * ExecBuildGroupingEqual() only builds a very simple expression calling |
| 236 | * functions (i.e. nothing that'd employ RegisterExprContextCallback()). |
| 237 | */ |
| 238 | hashtable->exprcontext = CreateStandaloneExprContext(); |
| 239 | |
| 240 | MemoryContextSwitchTo(oldcontext); |
| 241 | |
| 242 | return hashtable; |
| 243 | } |
| 244 | |
| 245 | /* |
| 246 | * BuildTupleHashTable is a backwards-compatibilty wrapper for |
| 247 | * BuildTupleHashTableExt(), that allocates the hashtable's metadata in |
| 248 | * tablecxt. Note that hashtables created this way cannot be reset leak-free |
| 249 | * with ResetTupleHashTable(). |
| 250 | */ |
| 251 | TupleHashTable |
| 252 | BuildTupleHashTable(PlanState *parent, |
| 253 | TupleDesc inputDesc, |
| 254 | int numCols, AttrNumber *keyColIdx, |
| 255 | const Oid *eqfuncoids, |
| 256 | FmgrInfo *hashfunctions, |
| 257 | Oid *collations, |
| 258 | long nbuckets, Size additionalsize, |
| 259 | MemoryContext tablecxt, |
| 260 | MemoryContext tempcxt, |
| 261 | bool use_variable_hash_iv) |
| 262 | { |
| 263 | return BuildTupleHashTableExt(parent, |
| 264 | inputDesc, |
| 265 | numCols, keyColIdx, |
| 266 | eqfuncoids, |
| 267 | hashfunctions, |
| 268 | collations, |
| 269 | nbuckets, additionalsize, |
| 270 | tablecxt, |
| 271 | tablecxt, |
| 272 | tempcxt, |
| 273 | use_variable_hash_iv); |
| 274 | } |
| 275 | |
| 276 | /* |
| 277 | * Reset contents of the hashtable to be empty, preserving all the non-content |
| 278 | * state. Note that the tablecxt passed to BuildTupleHashTableExt() should |
| 279 | * also be reset, otherwise there will be leaks. |
| 280 | */ |
| 281 | void |
| 282 | ResetTupleHashTable(TupleHashTable hashtable) |
| 283 | { |
| 284 | tuplehash_reset(hashtable->hashtab); |
| 285 | } |
| 286 | |
| 287 | /* |
| 288 | * Find or create a hashtable entry for the tuple group containing the |
| 289 | * given tuple. The tuple must be the same type as the hashtable entries. |
| 290 | * |
| 291 | * If isnew is NULL, we do not create new entries; we return NULL if no |
| 292 | * match is found. |
| 293 | * |
| 294 | * If isnew isn't NULL, then a new entry is created if no existing entry |
| 295 | * matches. On return, *isnew is true if the entry is newly created, |
| 296 | * false if it existed already. ->additional_data in the new entry has |
| 297 | * been zeroed. |
| 298 | */ |
| 299 | TupleHashEntry |
| 300 | LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, |
| 301 | bool *isnew) |
| 302 | { |
| 303 | TupleHashEntryData *entry; |
| 304 | MemoryContext oldContext; |
| 305 | bool found; |
| 306 | MinimalTuple key; |
| 307 | |
| 308 | /* Need to run the hash functions in short-lived context */ |
| 309 | oldContext = MemoryContextSwitchTo(hashtable->tempcxt); |
| 310 | |
| 311 | /* set up data needed by hash and match functions */ |
| 312 | hashtable->inputslot = slot; |
| 313 | hashtable->in_hash_funcs = hashtable->tab_hash_funcs; |
| 314 | hashtable->cur_eq_func = hashtable->tab_eq_func; |
| 315 | |
| 316 | key = NULL; /* flag to reference inputslot */ |
| 317 | |
| 318 | if (isnew) |
| 319 | { |
| 320 | entry = tuplehash_insert(hashtable->hashtab, key, &found); |
| 321 | |
| 322 | if (found) |
| 323 | { |
| 324 | /* found pre-existing entry */ |
| 325 | *isnew = false; |
| 326 | } |
| 327 | else |
| 328 | { |
| 329 | /* created new entry */ |
| 330 | *isnew = true; |
| 331 | /* zero caller data */ |
| 332 | entry->additional = NULL; |
| 333 | MemoryContextSwitchTo(hashtable->tablecxt); |
| 334 | /* Copy the first tuple into the table context */ |
| 335 | entry->firstTuple = ExecCopySlotMinimalTuple(slot); |
| 336 | } |
| 337 | } |
| 338 | else |
| 339 | { |
| 340 | entry = tuplehash_lookup(hashtable->hashtab, key); |
| 341 | } |
| 342 | |
| 343 | MemoryContextSwitchTo(oldContext); |
| 344 | |
| 345 | return entry; |
| 346 | } |
| 347 | |
| 348 | /* |
| 349 | * Search for a hashtable entry matching the given tuple. No entry is |
| 350 | * created if there's not a match. This is similar to the non-creating |
| 351 | * case of LookupTupleHashEntry, except that it supports cross-type |
| 352 | * comparisons, in which the given tuple is not of the same type as the |
| 353 | * table entries. The caller must provide the hash functions to use for |
| 354 | * the input tuple, as well as the equality functions, since these may be |
| 355 | * different from the table's internal functions. |
| 356 | */ |
| 357 | TupleHashEntry |
| 358 | FindTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, |
| 359 | ExprState *eqcomp, |
| 360 | FmgrInfo *hashfunctions) |
| 361 | { |
| 362 | TupleHashEntry entry; |
| 363 | MemoryContext oldContext; |
| 364 | MinimalTuple key; |
| 365 | |
| 366 | /* Need to run the hash functions in short-lived context */ |
| 367 | oldContext = MemoryContextSwitchTo(hashtable->tempcxt); |
| 368 | |
| 369 | /* Set up data needed by hash and match functions */ |
| 370 | hashtable->inputslot = slot; |
| 371 | hashtable->in_hash_funcs = hashfunctions; |
| 372 | hashtable->cur_eq_func = eqcomp; |
| 373 | |
| 374 | /* Search the hash table */ |
| 375 | key = NULL; /* flag to reference inputslot */ |
| 376 | entry = tuplehash_lookup(hashtable->hashtab, key); |
| 377 | MemoryContextSwitchTo(oldContext); |
| 378 | |
| 379 | return entry; |
| 380 | } |
| 381 | |
| 382 | /* |
| 383 | * Compute the hash value for a tuple |
| 384 | * |
| 385 | * The passed-in key is a pointer to TupleHashEntryData. In an actual hash |
| 386 | * table entry, the firstTuple field points to a tuple (in MinimalTuple |
| 387 | * format). LookupTupleHashEntry sets up a dummy TupleHashEntryData with a |
| 388 | * NULL firstTuple field --- that cues us to look at the inputslot instead. |
| 389 | * This convention avoids the need to materialize virtual input tuples unless |
| 390 | * they actually need to get copied into the table. |
| 391 | * |
| 392 | * Also, the caller must select an appropriate memory context for running |
| 393 | * the hash functions. (dynahash.c doesn't change CurrentMemoryContext.) |
| 394 | */ |
| 395 | static uint32 |
| 396 | TupleHashTableHash(struct tuplehash_hash *tb, const MinimalTuple tuple) |
| 397 | { |
| 398 | TupleHashTable hashtable = (TupleHashTable) tb->private_data; |
| 399 | int numCols = hashtable->numCols; |
| 400 | AttrNumber *keyColIdx = hashtable->keyColIdx; |
| 401 | uint32 hashkey = hashtable->hash_iv; |
| 402 | TupleTableSlot *slot; |
| 403 | FmgrInfo *hashfunctions; |
| 404 | int i; |
| 405 | |
| 406 | if (tuple == NULL) |
| 407 | { |
| 408 | /* Process the current input tuple for the table */ |
| 409 | slot = hashtable->inputslot; |
| 410 | hashfunctions = hashtable->in_hash_funcs; |
| 411 | } |
| 412 | else |
| 413 | { |
| 414 | /* |
| 415 | * Process a tuple already stored in the table. |
| 416 | * |
| 417 | * (this case never actually occurs due to the way simplehash.h is |
| 418 | * used, as the hash-value is stored in the entries) |
| 419 | */ |
| 420 | slot = hashtable->tableslot; |
| 421 | ExecStoreMinimalTuple(tuple, slot, false); |
| 422 | hashfunctions = hashtable->tab_hash_funcs; |
| 423 | } |
| 424 | |
| 425 | for (i = 0; i < numCols; i++) |
| 426 | { |
| 427 | AttrNumber att = keyColIdx[i]; |
| 428 | Datum attr; |
| 429 | bool isNull; |
| 430 | |
| 431 | /* rotate hashkey left 1 bit at each step */ |
| 432 | hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0); |
| 433 | |
| 434 | attr = slot_getattr(slot, att, &isNull); |
| 435 | |
| 436 | if (!isNull) /* treat nulls as having hash key 0 */ |
| 437 | { |
| 438 | uint32 hkey; |
| 439 | |
| 440 | hkey = DatumGetUInt32(FunctionCall1Coll(&hashfunctions[i], |
| 441 | hashtable->tab_collations[i], |
| 442 | attr)); |
| 443 | hashkey ^= hkey; |
| 444 | } |
| 445 | } |
| 446 | |
| 447 | /* |
| 448 | * The way hashes are combined above, among each other and with the IV, |
| 449 | * doesn't lead to good bit perturbation. As the IV's goal is to lead to |
| 450 | * achieve that, perform a round of hashing of the combined hash - |
| 451 | * resulting in near perfect perturbation. |
| 452 | */ |
| 453 | return murmurhash32(hashkey); |
| 454 | } |
| 455 | |
| 456 | /* |
| 457 | * See whether two tuples (presumably of the same hash value) match |
| 458 | * |
| 459 | * As above, the passed pointers are pointers to TupleHashEntryData. |
| 460 | */ |
| 461 | static int |
| 462 | TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2) |
| 463 | { |
| 464 | TupleTableSlot *slot1; |
| 465 | TupleTableSlot *slot2; |
| 466 | TupleHashTable hashtable = (TupleHashTable) tb->private_data; |
| 467 | ExprContext *econtext = hashtable->exprcontext; |
| 468 | |
| 469 | /* |
| 470 | * We assume that simplehash.h will only ever call us with the first |
| 471 | * argument being an actual table entry, and the second argument being |
| 472 | * LookupTupleHashEntry's dummy TupleHashEntryData. The other direction |
| 473 | * could be supported too, but is not currently required. |
| 474 | */ |
| 475 | Assert(tuple1 != NULL); |
| 476 | slot1 = hashtable->tableslot; |
| 477 | ExecStoreMinimalTuple(tuple1, slot1, false); |
| 478 | Assert(tuple2 == NULL); |
| 479 | slot2 = hashtable->inputslot; |
| 480 | |
| 481 | /* For crosstype comparisons, the inputslot must be first */ |
| 482 | econtext->ecxt_innertuple = slot2; |
| 483 | econtext->ecxt_outertuple = slot1; |
| 484 | return !ExecQualAndReset(hashtable->cur_eq_func, econtext); |
| 485 | } |
| 486 | |