1/*-------------------------------------------------------------------------
2 *
3 * costsize.c
4 * Routines to compute (and set) relation sizes and path costs
5 *
6 * Path costs are measured in arbitrary units established by these basic
7 * parameters:
8 *
9 * seq_page_cost Cost of a sequential page fetch
10 * random_page_cost Cost of a non-sequential page fetch
11 * cpu_tuple_cost Cost of typical CPU time to process a tuple
12 * cpu_index_tuple_cost Cost of typical CPU time to process an index tuple
13 * cpu_operator_cost Cost of CPU time to execute an operator or function
14 * parallel_tuple_cost Cost of CPU time to pass a tuple from worker to master backend
15 * parallel_setup_cost Cost of setting up shared memory for parallelism
16 *
17 * We expect that the kernel will typically do some amount of read-ahead
18 * optimization; this in conjunction with seek costs means that seq_page_cost
19 * is normally considerably less than random_page_cost. (However, if the
20 * database is fully cached in RAM, it is reasonable to set them equal.)
21 *
22 * We also use a rough estimate "effective_cache_size" of the number of
23 * disk pages in Postgres + OS-level disk cache. (We can't simply use
24 * NBuffers for this purpose because that would ignore the effects of
25 * the kernel's disk cache.)
26 *
27 * Obviously, taking constants for these values is an oversimplification,
28 * but it's tough enough to get any useful estimates even at this level of
29 * detail. Note that all of these parameters are user-settable, in case
30 * the default values are drastically off for a particular platform.
31 *
32 * seq_page_cost and random_page_cost can also be overridden for an individual
33 * tablespace, in case some data is on a fast disk and other data is on a slow
34 * disk. Per-tablespace overrides never apply to temporary work files such as
35 * an external sort or a materialize node that overflows work_mem.
36 *
37 * We compute two separate costs for each path:
38 * total_cost: total estimated cost to fetch all tuples
39 * startup_cost: cost that is expended before first tuple is fetched
40 * In some scenarios, such as when there is a LIMIT or we are implementing
41 * an EXISTS(...) sub-select, it is not necessary to fetch all tuples of the
42 * path's result. A caller can estimate the cost of fetching a partial
43 * result by interpolating between startup_cost and total_cost. In detail:
44 * actual_cost = startup_cost +
45 * (total_cost - startup_cost) * tuples_to_fetch / path->rows;
46 * Note that a base relation's rows count (and, by extension, plan_rows for
47 * plan nodes below the LIMIT node) are set without regard to any LIMIT, so
48 * that this equation works properly. (Note: while path->rows is never zero
49 * for ordinary relations, it is zero for paths for provably-empty relations,
50 * so beware of division-by-zero.) The LIMIT is applied as a top-level
51 * plan node.
52 *
53 * For largely historical reasons, most of the routines in this module use
54 * the passed result Path only to store their results (rows, startup_cost and
55 * total_cost) into. All the input data they need is passed as separate
56 * parameters, even though much of it could be extracted from the Path.
57 * An exception is made for the cost_XXXjoin() routines, which expect all
58 * the other fields of the passed XXXPath to be filled in, and similarly
59 * cost_index() assumes the passed IndexPath is valid except for its output
60 * values.
61 *
62 *
63 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
64 * Portions Copyright (c) 1994, Regents of the University of California
65 *
66 * IDENTIFICATION
67 * src/backend/optimizer/path/costsize.c
68 *
69 *-------------------------------------------------------------------------
70 */
71
72#include "postgres.h"
73
74#include <math.h>
75
76#include "access/amapi.h"
77#include "access/htup_details.h"
78#include "access/tsmapi.h"
79#include "executor/executor.h"
80#include "executor/nodeHash.h"
81#include "miscadmin.h"
82#include "nodes/makefuncs.h"
83#include "nodes/nodeFuncs.h"
84#include "optimizer/clauses.h"
85#include "optimizer/cost.h"
86#include "optimizer/optimizer.h"
87#include "optimizer/pathnode.h"
88#include "optimizer/paths.h"
89#include "optimizer/placeholder.h"
90#include "optimizer/plancat.h"
91#include "optimizer/planmain.h"
92#include "optimizer/restrictinfo.h"
93#include "parser/parsetree.h"
94#include "utils/lsyscache.h"
95#include "utils/selfuncs.h"
96#include "utils/spccache.h"
97#include "utils/tuplesort.h"
98
99
100#define LOG2(x) (log(x) / 0.693147180559945)
101
102/*
103 * Append and MergeAppend nodes are less expensive than some other operations
104 * which use cpu_tuple_cost; instead of adding a separate GUC, estimate the
105 * per-tuple cost as cpu_tuple_cost multiplied by this value.
106 */
107#define APPEND_CPU_COST_MULTIPLIER 0.5
108
109
110double seq_page_cost = DEFAULT_SEQ_PAGE_COST;
111double random_page_cost = DEFAULT_RANDOM_PAGE_COST;
112double cpu_tuple_cost = DEFAULT_CPU_TUPLE_COST;
113double cpu_index_tuple_cost = DEFAULT_CPU_INDEX_TUPLE_COST;
114double cpu_operator_cost = DEFAULT_CPU_OPERATOR_COST;
115double parallel_tuple_cost = DEFAULT_PARALLEL_TUPLE_COST;
116double parallel_setup_cost = DEFAULT_PARALLEL_SETUP_COST;
117
118int effective_cache_size = DEFAULT_EFFECTIVE_CACHE_SIZE;
119
120Cost disable_cost = 1.0e10;
121
122int max_parallel_workers_per_gather = 2;
123
124bool enable_seqscan = true;
125bool enable_indexscan = true;
126bool enable_indexonlyscan = true;
127bool enable_bitmapscan = true;
128bool enable_tidscan = true;
129bool enable_sort = true;
130bool enable_hashagg = true;
131bool enable_nestloop = true;
132bool enable_material = true;
133bool enable_mergejoin = true;
134bool enable_hashjoin = true;
135bool enable_gathermerge = true;
136bool enable_partitionwise_join = false;
137bool enable_partitionwise_aggregate = false;
138bool enable_parallel_append = true;
139bool enable_parallel_hash = true;
140bool enable_partition_pruning = true;
141
142typedef struct
143{
144 PlannerInfo *root;
145 QualCost total;
146} cost_qual_eval_context;
147
148static List *extract_nonindex_conditions(List *qual_clauses, List *indexclauses);
149static MergeScanSelCache *cached_scansel(PlannerInfo *root,
150 RestrictInfo *rinfo,
151 PathKey *pathkey);
152static void cost_rescan(PlannerInfo *root, Path *path,
153 Cost *rescan_startup_cost, Cost *rescan_total_cost);
154static bool cost_qual_eval_walker(Node *node, cost_qual_eval_context *context);
155static void get_restriction_qual_cost(PlannerInfo *root, RelOptInfo *baserel,
156 ParamPathInfo *param_info,
157 QualCost *qpqual_cost);
158static bool has_indexed_join_quals(NestPath *joinpath);
159static double approx_tuple_count(PlannerInfo *root, JoinPath *path,
160 List *quals);
161static double calc_joinrel_size_estimate(PlannerInfo *root,
162 RelOptInfo *joinrel,
163 RelOptInfo *outer_rel,
164 RelOptInfo *inner_rel,
165 double outer_rows,
166 double inner_rows,
167 SpecialJoinInfo *sjinfo,
168 List *restrictlist);
169static Selectivity get_foreign_key_join_selectivity(PlannerInfo *root,
170 Relids outer_relids,
171 Relids inner_relids,
172 SpecialJoinInfo *sjinfo,
173 List **restrictlist);
174static Cost append_nonpartial_cost(List *subpaths, int numpaths,
175 int parallel_workers);
176static void set_rel_width(PlannerInfo *root, RelOptInfo *rel);
177static double relation_byte_size(double tuples, int width);
178static double page_size(double tuples, int width);
179static double get_parallel_divisor(Path *path);
180
181
182/*
183 * clamp_row_est
184 * Force a row-count estimate to a sane value.
185 */
186double
187clamp_row_est(double nrows)
188{
189 /*
190 * Force estimate to be at least one row, to make explain output look
191 * better and to avoid possible divide-by-zero when interpolating costs.
192 * Make it an integer, too.
193 */
194 if (nrows <= 1.0)
195 nrows = 1.0;
196 else
197 nrows = rint(nrows);
198
199 return nrows;
200}
201
202
203/*
204 * cost_seqscan
205 * Determines and returns the cost of scanning a relation sequentially.
206 *
207 * 'baserel' is the relation to be scanned
208 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
209 */
210void
211cost_seqscan(Path *path, PlannerInfo *root,
212 RelOptInfo *baserel, ParamPathInfo *param_info)
213{
214 Cost startup_cost = 0;
215 Cost cpu_run_cost;
216 Cost disk_run_cost;
217 double spc_seq_page_cost;
218 QualCost qpqual_cost;
219 Cost cpu_per_tuple;
220
221 /* Should only be applied to base relations */
222 Assert(baserel->relid > 0);
223 Assert(baserel->rtekind == RTE_RELATION);
224
225 /* Mark the path with the correct row estimate */
226 if (param_info)
227 path->rows = param_info->ppi_rows;
228 else
229 path->rows = baserel->rows;
230
231 if (!enable_seqscan)
232 startup_cost += disable_cost;
233
234 /* fetch estimated page cost for tablespace containing table */
235 get_tablespace_page_costs(baserel->reltablespace,
236 NULL,
237 &spc_seq_page_cost);
238
239 /*
240 * disk costs
241 */
242 disk_run_cost = spc_seq_page_cost * baserel->pages;
243
244 /* CPU costs */
245 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
246
247 startup_cost += qpqual_cost.startup;
248 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
249 cpu_run_cost = cpu_per_tuple * baserel->tuples;
250 /* tlist eval costs are paid per output row, not per tuple scanned */
251 startup_cost += path->pathtarget->cost.startup;
252 cpu_run_cost += path->pathtarget->cost.per_tuple * path->rows;
253
254 /* Adjust costing for parallelism, if used. */
255 if (path->parallel_workers > 0)
256 {
257 double parallel_divisor = get_parallel_divisor(path);
258
259 /* The CPU cost is divided among all the workers. */
260 cpu_run_cost /= parallel_divisor;
261
262 /*
263 * It may be possible to amortize some of the I/O cost, but probably
264 * not very much, because most operating systems already do aggressive
265 * prefetching. For now, we assume that the disk run cost can't be
266 * amortized at all.
267 */
268
269 /*
270 * In the case of a parallel plan, the row count needs to represent
271 * the number of tuples processed per worker.
272 */
273 path->rows = clamp_row_est(path->rows / parallel_divisor);
274 }
275
276 path->startup_cost = startup_cost;
277 path->total_cost = startup_cost + cpu_run_cost + disk_run_cost;
278}
279
280/*
281 * cost_samplescan
282 * Determines and returns the cost of scanning a relation using sampling.
283 *
284 * 'baserel' is the relation to be scanned
285 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
286 */
287void
288cost_samplescan(Path *path, PlannerInfo *root,
289 RelOptInfo *baserel, ParamPathInfo *param_info)
290{
291 Cost startup_cost = 0;
292 Cost run_cost = 0;
293 RangeTblEntry *rte;
294 TableSampleClause *tsc;
295 TsmRoutine *tsm;
296 double spc_seq_page_cost,
297 spc_random_page_cost,
298 spc_page_cost;
299 QualCost qpqual_cost;
300 Cost cpu_per_tuple;
301
302 /* Should only be applied to base relations with tablesample clauses */
303 Assert(baserel->relid > 0);
304 rte = planner_rt_fetch(baserel->relid, root);
305 Assert(rte->rtekind == RTE_RELATION);
306 tsc = rte->tablesample;
307 Assert(tsc != NULL);
308 tsm = GetTsmRoutine(tsc->tsmhandler);
309
310 /* Mark the path with the correct row estimate */
311 if (param_info)
312 path->rows = param_info->ppi_rows;
313 else
314 path->rows = baserel->rows;
315
316 /* fetch estimated page cost for tablespace containing table */
317 get_tablespace_page_costs(baserel->reltablespace,
318 &spc_random_page_cost,
319 &spc_seq_page_cost);
320
321 /* if NextSampleBlock is used, assume random access, else sequential */
322 spc_page_cost = (tsm->NextSampleBlock != NULL) ?
323 spc_random_page_cost : spc_seq_page_cost;
324
325 /*
326 * disk costs (recall that baserel->pages has already been set to the
327 * number of pages the sampling method will visit)
328 */
329 run_cost += spc_page_cost * baserel->pages;
330
331 /*
332 * CPU costs (recall that baserel->tuples has already been set to the
333 * number of tuples the sampling method will select). Note that we ignore
334 * execution cost of the TABLESAMPLE parameter expressions; they will be
335 * evaluated only once per scan, and in most usages they'll likely be
336 * simple constants anyway. We also don't charge anything for the
337 * calculations the sampling method might do internally.
338 */
339 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
340
341 startup_cost += qpqual_cost.startup;
342 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
343 run_cost += cpu_per_tuple * baserel->tuples;
344 /* tlist eval costs are paid per output row, not per tuple scanned */
345 startup_cost += path->pathtarget->cost.startup;
346 run_cost += path->pathtarget->cost.per_tuple * path->rows;
347
348 path->startup_cost = startup_cost;
349 path->total_cost = startup_cost + run_cost;
350}
351
352/*
353 * cost_gather
354 * Determines and returns the cost of gather path.
355 *
356 * 'rel' is the relation to be operated upon
357 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
358 * 'rows' may be used to point to a row estimate; if non-NULL, it overrides
359 * both 'rel' and 'param_info'. This is useful when the path doesn't exactly
360 * correspond to any particular RelOptInfo.
361 */
362void
363cost_gather(GatherPath *path, PlannerInfo *root,
364 RelOptInfo *rel, ParamPathInfo *param_info,
365 double *rows)
366{
367 Cost startup_cost = 0;
368 Cost run_cost = 0;
369
370 /* Mark the path with the correct row estimate */
371 if (rows)
372 path->path.rows = *rows;
373 else if (param_info)
374 path->path.rows = param_info->ppi_rows;
375 else
376 path->path.rows = rel->rows;
377
378 startup_cost = path->subpath->startup_cost;
379
380 run_cost = path->subpath->total_cost - path->subpath->startup_cost;
381
382 /* Parallel setup and communication cost. */
383 startup_cost += parallel_setup_cost;
384 run_cost += parallel_tuple_cost * path->path.rows;
385
386 path->path.startup_cost = startup_cost;
387 path->path.total_cost = (startup_cost + run_cost);
388}
389
390/*
391 * cost_gather_merge
392 * Determines and returns the cost of gather merge path.
393 *
394 * GatherMerge merges several pre-sorted input streams, using a heap that at
395 * any given instant holds the next tuple from each stream. If there are N
396 * streams, we need about N*log2(N) tuple comparisons to construct the heap at
397 * startup, and then for each output tuple, about log2(N) comparisons to
398 * replace the top heap entry with the next tuple from the same stream.
399 */
400void
401cost_gather_merge(GatherMergePath *path, PlannerInfo *root,
402 RelOptInfo *rel, ParamPathInfo *param_info,
403 Cost input_startup_cost, Cost input_total_cost,
404 double *rows)
405{
406 Cost startup_cost = 0;
407 Cost run_cost = 0;
408 Cost comparison_cost;
409 double N;
410 double logN;
411
412 /* Mark the path with the correct row estimate */
413 if (rows)
414 path->path.rows = *rows;
415 else if (param_info)
416 path->path.rows = param_info->ppi_rows;
417 else
418 path->path.rows = rel->rows;
419
420 if (!enable_gathermerge)
421 startup_cost += disable_cost;
422
423 /*
424 * Add one to the number of workers to account for the leader. This might
425 * be overgenerous since the leader will do less work than other workers
426 * in typical cases, but we'll go with it for now.
427 */
428 Assert(path->num_workers > 0);
429 N = (double) path->num_workers + 1;
430 logN = LOG2(N);
431
432 /* Assumed cost per tuple comparison */
433 comparison_cost = 2.0 * cpu_operator_cost;
434
435 /* Heap creation cost */
436 startup_cost += comparison_cost * N * logN;
437
438 /* Per-tuple heap maintenance cost */
439 run_cost += path->path.rows * comparison_cost * logN;
440
441 /* small cost for heap management, like cost_merge_append */
442 run_cost += cpu_operator_cost * path->path.rows;
443
444 /*
445 * Parallel setup and communication cost. Since Gather Merge, unlike
446 * Gather, requires us to block until a tuple is available from every
447 * worker, we bump the IPC cost up a little bit as compared with Gather.
448 * For lack of a better idea, charge an extra 5%.
449 */
450 startup_cost += parallel_setup_cost;
451 run_cost += parallel_tuple_cost * path->path.rows * 1.05;
452
453 path->path.startup_cost = startup_cost + input_startup_cost;
454 path->path.total_cost = (startup_cost + run_cost + input_total_cost);
455}
456
457/*
458 * cost_index
459 * Determines and returns the cost of scanning a relation using an index.
460 *
461 * 'path' describes the indexscan under consideration, and is complete
462 * except for the fields to be set by this routine
463 * 'loop_count' is the number of repetitions of the indexscan to factor into
464 * estimates of caching behavior
465 *
466 * In addition to rows, startup_cost and total_cost, cost_index() sets the
467 * path's indextotalcost and indexselectivity fields. These values will be
468 * needed if the IndexPath is used in a BitmapIndexScan.
469 *
470 * NOTE: path->indexquals must contain only clauses usable as index
471 * restrictions. Any additional quals evaluated as qpquals may reduce the
472 * number of returned tuples, but they won't reduce the number of tuples
473 * we have to fetch from the table, so they don't reduce the scan cost.
474 */
475void
476cost_index(IndexPath *path, PlannerInfo *root, double loop_count,
477 bool partial_path)
478{
479 IndexOptInfo *index = path->indexinfo;
480 RelOptInfo *baserel = index->rel;
481 bool indexonly = (path->path.pathtype == T_IndexOnlyScan);
482 amcostestimate_function amcostestimate;
483 List *qpquals;
484 Cost startup_cost = 0;
485 Cost run_cost = 0;
486 Cost cpu_run_cost = 0;
487 Cost indexStartupCost;
488 Cost indexTotalCost;
489 Selectivity indexSelectivity;
490 double indexCorrelation,
491 csquared;
492 double spc_seq_page_cost,
493 spc_random_page_cost;
494 Cost min_IO_cost,
495 max_IO_cost;
496 QualCost qpqual_cost;
497 Cost cpu_per_tuple;
498 double tuples_fetched;
499 double pages_fetched;
500 double rand_heap_pages;
501 double index_pages;
502
503 /* Should only be applied to base relations */
504 Assert(IsA(baserel, RelOptInfo) &&
505 IsA(index, IndexOptInfo));
506 Assert(baserel->relid > 0);
507 Assert(baserel->rtekind == RTE_RELATION);
508
509 /*
510 * Mark the path with the correct row estimate, and identify which quals
511 * will need to be enforced as qpquals. We need not check any quals that
512 * are implied by the index's predicate, so we can use indrestrictinfo not
513 * baserestrictinfo as the list of relevant restriction clauses for the
514 * rel.
515 */
516 if (path->path.param_info)
517 {
518 path->path.rows = path->path.param_info->ppi_rows;
519 /* qpquals come from the rel's restriction clauses and ppi_clauses */
520 qpquals = list_concat(extract_nonindex_conditions(path->indexinfo->indrestrictinfo,
521 path->indexclauses),
522 extract_nonindex_conditions(path->path.param_info->ppi_clauses,
523 path->indexclauses));
524 }
525 else
526 {
527 path->path.rows = baserel->rows;
528 /* qpquals come from just the rel's restriction clauses */
529 qpquals = extract_nonindex_conditions(path->indexinfo->indrestrictinfo,
530 path->indexclauses);
531 }
532
533 if (!enable_indexscan)
534 startup_cost += disable_cost;
535 /* we don't need to check enable_indexonlyscan; indxpath.c does that */
536
537 /*
538 * Call index-access-method-specific code to estimate the processing cost
539 * for scanning the index, as well as the selectivity of the index (ie,
540 * the fraction of main-table tuples we will have to retrieve) and its
541 * correlation to the main-table tuple order. We need a cast here because
542 * pathnodes.h uses a weak function type to avoid including amapi.h.
543 */
544 amcostestimate = (amcostestimate_function) index->amcostestimate;
545 amcostestimate(root, path, loop_count,
546 &indexStartupCost, &indexTotalCost,
547 &indexSelectivity, &indexCorrelation,
548 &index_pages);
549
550 /*
551 * Save amcostestimate's results for possible use in bitmap scan planning.
552 * We don't bother to save indexStartupCost or indexCorrelation, because a
553 * bitmap scan doesn't care about either.
554 */
555 path->indextotalcost = indexTotalCost;
556 path->indexselectivity = indexSelectivity;
557
558 /* all costs for touching index itself included here */
559 startup_cost += indexStartupCost;
560 run_cost += indexTotalCost - indexStartupCost;
561
562 /* estimate number of main-table tuples fetched */
563 tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples);
564
565 /* fetch estimated page costs for tablespace containing table */
566 get_tablespace_page_costs(baserel->reltablespace,
567 &spc_random_page_cost,
568 &spc_seq_page_cost);
569
570 /*----------
571 * Estimate number of main-table pages fetched, and compute I/O cost.
572 *
573 * When the index ordering is uncorrelated with the table ordering,
574 * we use an approximation proposed by Mackert and Lohman (see
575 * index_pages_fetched() for details) to compute the number of pages
576 * fetched, and then charge spc_random_page_cost per page fetched.
577 *
578 * When the index ordering is exactly correlated with the table ordering
579 * (just after a CLUSTER, for example), the number of pages fetched should
580 * be exactly selectivity * table_size. What's more, all but the first
581 * will be sequential fetches, not the random fetches that occur in the
582 * uncorrelated case. So if the number of pages is more than 1, we
583 * ought to charge
584 * spc_random_page_cost + (pages_fetched - 1) * spc_seq_page_cost
585 * For partially-correlated indexes, we ought to charge somewhere between
586 * these two estimates. We currently interpolate linearly between the
587 * estimates based on the correlation squared (XXX is that appropriate?).
588 *
589 * If it's an index-only scan, then we will not need to fetch any heap
590 * pages for which the visibility map shows all tuples are visible.
591 * Hence, reduce the estimated number of heap fetches accordingly.
592 * We use the measured fraction of the entire heap that is all-visible,
593 * which might not be particularly relevant to the subset of the heap
594 * that this query will fetch; but it's not clear how to do better.
595 *----------
596 */
597 if (loop_count > 1)
598 {
599 /*
600 * For repeated indexscans, the appropriate estimate for the
601 * uncorrelated case is to scale up the number of tuples fetched in
602 * the Mackert and Lohman formula by the number of scans, so that we
603 * estimate the number of pages fetched by all the scans; then
604 * pro-rate the costs for one scan. In this case we assume all the
605 * fetches are random accesses.
606 */
607 pages_fetched = index_pages_fetched(tuples_fetched * loop_count,
608 baserel->pages,
609 (double) index->pages,
610 root);
611
612 if (indexonly)
613 pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
614
615 rand_heap_pages = pages_fetched;
616
617 max_IO_cost = (pages_fetched * spc_random_page_cost) / loop_count;
618
619 /*
620 * In the perfectly correlated case, the number of pages touched by
621 * each scan is selectivity * table_size, and we can use the Mackert
622 * and Lohman formula at the page level to estimate how much work is
623 * saved by caching across scans. We still assume all the fetches are
624 * random, though, which is an overestimate that's hard to correct for
625 * without double-counting the cache effects. (But in most cases
626 * where such a plan is actually interesting, only one page would get
627 * fetched per scan anyway, so it shouldn't matter much.)
628 */
629 pages_fetched = ceil(indexSelectivity * (double) baserel->pages);
630
631 pages_fetched = index_pages_fetched(pages_fetched * loop_count,
632 baserel->pages,
633 (double) index->pages,
634 root);
635
636 if (indexonly)
637 pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
638
639 min_IO_cost = (pages_fetched * spc_random_page_cost) / loop_count;
640 }
641 else
642 {
643 /*
644 * Normal case: apply the Mackert and Lohman formula, and then
645 * interpolate between that and the correlation-derived result.
646 */
647 pages_fetched = index_pages_fetched(tuples_fetched,
648 baserel->pages,
649 (double) index->pages,
650 root);
651
652 if (indexonly)
653 pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
654
655 rand_heap_pages = pages_fetched;
656
657 /* max_IO_cost is for the perfectly uncorrelated case (csquared=0) */
658 max_IO_cost = pages_fetched * spc_random_page_cost;
659
660 /* min_IO_cost is for the perfectly correlated case (csquared=1) */
661 pages_fetched = ceil(indexSelectivity * (double) baserel->pages);
662
663 if (indexonly)
664 pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
665
666 if (pages_fetched > 0)
667 {
668 min_IO_cost = spc_random_page_cost;
669 if (pages_fetched > 1)
670 min_IO_cost += (pages_fetched - 1) * spc_seq_page_cost;
671 }
672 else
673 min_IO_cost = 0;
674 }
675
676 if (partial_path)
677 {
678 /*
679 * For index only scans compute workers based on number of index pages
680 * fetched; the number of heap pages we fetch might be so small as to
681 * effectively rule out parallelism, which we don't want to do.
682 */
683 if (indexonly)
684 rand_heap_pages = -1;
685
686 /*
687 * Estimate the number of parallel workers required to scan index. Use
688 * the number of heap pages computed considering heap fetches won't be
689 * sequential as for parallel scans the pages are accessed in random
690 * order.
691 */
692 path->path.parallel_workers = compute_parallel_worker(baserel,
693 rand_heap_pages,
694 index_pages,
695 max_parallel_workers_per_gather);
696
697 /*
698 * Fall out if workers can't be assigned for parallel scan, because in
699 * such a case this path will be rejected. So there is no benefit in
700 * doing extra computation.
701 */
702 if (path->path.parallel_workers <= 0)
703 return;
704
705 path->path.parallel_aware = true;
706 }
707
708 /*
709 * Now interpolate based on estimated index order correlation to get total
710 * disk I/O cost for main table accesses.
711 */
712 csquared = indexCorrelation * indexCorrelation;
713
714 run_cost += max_IO_cost + csquared * (min_IO_cost - max_IO_cost);
715
716 /*
717 * Estimate CPU costs per tuple.
718 *
719 * What we want here is cpu_tuple_cost plus the evaluation costs of any
720 * qual clauses that we have to evaluate as qpquals.
721 */
722 cost_qual_eval(&qpqual_cost, qpquals, root);
723
724 startup_cost += qpqual_cost.startup;
725 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
726
727 cpu_run_cost += cpu_per_tuple * tuples_fetched;
728
729 /* tlist eval costs are paid per output row, not per tuple scanned */
730 startup_cost += path->path.pathtarget->cost.startup;
731 cpu_run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;
732
733 /* Adjust costing for parallelism, if used. */
734 if (path->path.parallel_workers > 0)
735 {
736 double parallel_divisor = get_parallel_divisor(&path->path);
737
738 path->path.rows = clamp_row_est(path->path.rows / parallel_divisor);
739
740 /* The CPU cost is divided among all the workers. */
741 cpu_run_cost /= parallel_divisor;
742 }
743
744 run_cost += cpu_run_cost;
745
746 path->path.startup_cost = startup_cost;
747 path->path.total_cost = startup_cost + run_cost;
748}
749
750/*
751 * extract_nonindex_conditions
752 *
753 * Given a list of quals to be enforced in an indexscan, extract the ones that
754 * will have to be applied as qpquals (ie, the index machinery won't handle
755 * them). Here we detect only whether a qual clause is directly redundant
756 * with some indexclause. If the index path is chosen for use, createplan.c
757 * will try a bit harder to get rid of redundant qual conditions; specifically
758 * it will see if quals can be proven to be implied by the indexquals. But
759 * it does not seem worth the cycles to try to factor that in at this stage,
760 * since we're only trying to estimate qual eval costs. Otherwise this must
761 * match the logic in create_indexscan_plan().
762 *
763 * qual_clauses, and the result, are lists of RestrictInfos.
764 * indexclauses is a list of IndexClauses.
765 */
766static List *
767extract_nonindex_conditions(List *qual_clauses, List *indexclauses)
768{
769 List *result = NIL;
770 ListCell *lc;
771
772 foreach(lc, qual_clauses)
773 {
774 RestrictInfo *rinfo = lfirst_node(RestrictInfo, lc);
775
776 if (rinfo->pseudoconstant)
777 continue; /* we may drop pseudoconstants here */
778 if (is_redundant_with_indexclauses(rinfo, indexclauses))
779 continue; /* dup or derived from same EquivalenceClass */
780 /* ... skip the predicate proof attempt createplan.c will try ... */
781 result = lappend(result, rinfo);
782 }
783 return result;
784}
785
786/*
787 * index_pages_fetched
788 * Estimate the number of pages actually fetched after accounting for
789 * cache effects.
790 *
791 * We use an approximation proposed by Mackert and Lohman, "Index Scans
792 * Using a Finite LRU Buffer: A Validated I/O Model", ACM Transactions
793 * on Database Systems, Vol. 14, No. 3, September 1989, Pages 401-424.
794 * The Mackert and Lohman approximation is that the number of pages
795 * fetched is
796 * PF =
797 * min(2TNs/(2T+Ns), T) when T <= b
798 * 2TNs/(2T+Ns) when T > b and Ns <= 2Tb/(2T-b)
799 * b + (Ns - 2Tb/(2T-b))*(T-b)/T when T > b and Ns > 2Tb/(2T-b)
800 * where
801 * T = # pages in table
802 * N = # tuples in table
803 * s = selectivity = fraction of table to be scanned
804 * b = # buffer pages available (we include kernel space here)
805 *
806 * We assume that effective_cache_size is the total number of buffer pages
807 * available for the whole query, and pro-rate that space across all the
808 * tables in the query and the index currently under consideration. (This
809 * ignores space needed for other indexes used by the query, but since we
810 * don't know which indexes will get used, we can't estimate that very well;
811 * and in any case counting all the tables may well be an overestimate, since
812 * depending on the join plan not all the tables may be scanned concurrently.)
813 *
814 * The product Ns is the number of tuples fetched; we pass in that
815 * product rather than calculating it here. "pages" is the number of pages
816 * in the object under consideration (either an index or a table).
817 * "index_pages" is the amount to add to the total table space, which was
818 * computed for us by make_one_rel.
819 *
820 * Caller is expected to have ensured that tuples_fetched is greater than zero
821 * and rounded to integer (see clamp_row_est). The result will likewise be
822 * greater than zero and integral.
823 */
824double
825index_pages_fetched(double tuples_fetched, BlockNumber pages,
826 double index_pages, PlannerInfo *root)
827{
828 double pages_fetched;
829 double total_pages;
830 double T,
831 b;
832
833 /* T is # pages in table, but don't allow it to be zero */
834 T = (pages > 1) ? (double) pages : 1.0;
835
836 /* Compute number of pages assumed to be competing for cache space */
837 total_pages = root->total_table_pages + index_pages;
838 total_pages = Max(total_pages, 1.0);
839 Assert(T <= total_pages);
840
841 /* b is pro-rated share of effective_cache_size */
842 b = (double) effective_cache_size * T / total_pages;
843
844 /* force it positive and integral */
845 if (b <= 1.0)
846 b = 1.0;
847 else
848 b = ceil(b);
849
850 /* This part is the Mackert and Lohman formula */
851 if (T <= b)
852 {
853 pages_fetched =
854 (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
855 if (pages_fetched >= T)
856 pages_fetched = T;
857 else
858 pages_fetched = ceil(pages_fetched);
859 }
860 else
861 {
862 double lim;
863
864 lim = (2.0 * T * b) / (2.0 * T - b);
865 if (tuples_fetched <= lim)
866 {
867 pages_fetched =
868 (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
869 }
870 else
871 {
872 pages_fetched =
873 b + (tuples_fetched - lim) * (T - b) / T;
874 }
875 pages_fetched = ceil(pages_fetched);
876 }
877 return pages_fetched;
878}
879
880/*
881 * get_indexpath_pages
882 * Determine the total size of the indexes used in a bitmap index path.
883 *
884 * Note: if the same index is used more than once in a bitmap tree, we will
885 * count it multiple times, which perhaps is the wrong thing ... but it's
886 * not completely clear, and detecting duplicates is difficult, so ignore it
887 * for now.
888 */
889static double
890get_indexpath_pages(Path *bitmapqual)
891{
892 double result = 0;
893 ListCell *l;
894
895 if (IsA(bitmapqual, BitmapAndPath))
896 {
897 BitmapAndPath *apath = (BitmapAndPath *) bitmapqual;
898
899 foreach(l, apath->bitmapquals)
900 {
901 result += get_indexpath_pages((Path *) lfirst(l));
902 }
903 }
904 else if (IsA(bitmapqual, BitmapOrPath))
905 {
906 BitmapOrPath *opath = (BitmapOrPath *) bitmapqual;
907
908 foreach(l, opath->bitmapquals)
909 {
910 result += get_indexpath_pages((Path *) lfirst(l));
911 }
912 }
913 else if (IsA(bitmapqual, IndexPath))
914 {
915 IndexPath *ipath = (IndexPath *) bitmapqual;
916
917 result = (double) ipath->indexinfo->pages;
918 }
919 else
920 elog(ERROR, "unrecognized node type: %d", nodeTag(bitmapqual));
921
922 return result;
923}
924
925/*
926 * cost_bitmap_heap_scan
927 * Determines and returns the cost of scanning a relation using a bitmap
928 * index-then-heap plan.
929 *
930 * 'baserel' is the relation to be scanned
931 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
932 * 'bitmapqual' is a tree of IndexPaths, BitmapAndPaths, and BitmapOrPaths
933 * 'loop_count' is the number of repetitions of the indexscan to factor into
934 * estimates of caching behavior
935 *
936 * Note: the component IndexPaths in bitmapqual should have been costed
937 * using the same loop_count.
938 */
939void
940cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel,
941 ParamPathInfo *param_info,
942 Path *bitmapqual, double loop_count)
943{
944 Cost startup_cost = 0;
945 Cost run_cost = 0;
946 Cost indexTotalCost;
947 QualCost qpqual_cost;
948 Cost cpu_per_tuple;
949 Cost cost_per_page;
950 Cost cpu_run_cost;
951 double tuples_fetched;
952 double pages_fetched;
953 double spc_seq_page_cost,
954 spc_random_page_cost;
955 double T;
956
957 /* Should only be applied to base relations */
958 Assert(IsA(baserel, RelOptInfo));
959 Assert(baserel->relid > 0);
960 Assert(baserel->rtekind == RTE_RELATION);
961
962 /* Mark the path with the correct row estimate */
963 if (param_info)
964 path->rows = param_info->ppi_rows;
965 else
966 path->rows = baserel->rows;
967
968 if (!enable_bitmapscan)
969 startup_cost += disable_cost;
970
971 pages_fetched = compute_bitmap_pages(root, baserel, bitmapqual,
972 loop_count, &indexTotalCost,
973 &tuples_fetched);
974
975 startup_cost += indexTotalCost;
976 T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;
977
978 /* Fetch estimated page costs for tablespace containing table. */
979 get_tablespace_page_costs(baserel->reltablespace,
980 &spc_random_page_cost,
981 &spc_seq_page_cost);
982
983 /*
984 * For small numbers of pages we should charge spc_random_page_cost
985 * apiece, while if nearly all the table's pages are being read, it's more
986 * appropriate to charge spc_seq_page_cost apiece. The effect is
987 * nonlinear, too. For lack of a better idea, interpolate like this to
988 * determine the cost per page.
989 */
990 if (pages_fetched >= 2.0)
991 cost_per_page = spc_random_page_cost -
992 (spc_random_page_cost - spc_seq_page_cost)
993 * sqrt(pages_fetched / T);
994 else
995 cost_per_page = spc_random_page_cost;
996
997 run_cost += pages_fetched * cost_per_page;
998
999 /*
1000 * Estimate CPU costs per tuple.
1001 *
1002 * Often the indexquals don't need to be rechecked at each tuple ... but
1003 * not always, especially not if there are enough tuples involved that the
1004 * bitmaps become lossy. For the moment, just assume they will be
1005 * rechecked always. This means we charge the full freight for all the
1006 * scan clauses.
1007 */
1008 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1009
1010 startup_cost += qpqual_cost.startup;
1011 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1012 cpu_run_cost = cpu_per_tuple * tuples_fetched;
1013
1014 /* Adjust costing for parallelism, if used. */
1015 if (path->parallel_workers > 0)
1016 {
1017 double parallel_divisor = get_parallel_divisor(path);
1018
1019 /* The CPU cost is divided among all the workers. */
1020 cpu_run_cost /= parallel_divisor;
1021
1022 path->rows = clamp_row_est(path->rows / parallel_divisor);
1023 }
1024
1025
1026 run_cost += cpu_run_cost;
1027
1028 /* tlist eval costs are paid per output row, not per tuple scanned */
1029 startup_cost += path->pathtarget->cost.startup;
1030 run_cost += path->pathtarget->cost.per_tuple * path->rows;
1031
1032 path->startup_cost = startup_cost;
1033 path->total_cost = startup_cost + run_cost;
1034}
1035
1036/*
1037 * cost_bitmap_tree_node
1038 * Extract cost and selectivity from a bitmap tree node (index/and/or)
1039 */
1040void
1041cost_bitmap_tree_node(Path *path, Cost *cost, Selectivity *selec)
1042{
1043 if (IsA(path, IndexPath))
1044 {
1045 *cost = ((IndexPath *) path)->indextotalcost;
1046 *selec = ((IndexPath *) path)->indexselectivity;
1047
1048 /*
1049 * Charge a small amount per retrieved tuple to reflect the costs of
1050 * manipulating the bitmap. This is mostly to make sure that a bitmap
1051 * scan doesn't look to be the same cost as an indexscan to retrieve a
1052 * single tuple.
1053 */
1054 *cost += 0.1 * cpu_operator_cost * path->rows;
1055 }
1056 else if (IsA(path, BitmapAndPath))
1057 {
1058 *cost = path->total_cost;
1059 *selec = ((BitmapAndPath *) path)->bitmapselectivity;
1060 }
1061 else if (IsA(path, BitmapOrPath))
1062 {
1063 *cost = path->total_cost;
1064 *selec = ((BitmapOrPath *) path)->bitmapselectivity;
1065 }
1066 else
1067 {
1068 elog(ERROR, "unrecognized node type: %d", nodeTag(path));
1069 *cost = *selec = 0; /* keep compiler quiet */
1070 }
1071}
1072
1073/*
1074 * cost_bitmap_and_node
1075 * Estimate the cost of a BitmapAnd node
1076 *
1077 * Note that this considers only the costs of index scanning and bitmap
1078 * creation, not the eventual heap access. In that sense the object isn't
1079 * truly a Path, but it has enough path-like properties (costs in particular)
1080 * to warrant treating it as one. We don't bother to set the path rows field,
1081 * however.
1082 */
1083void
1084cost_bitmap_and_node(BitmapAndPath *path, PlannerInfo *root)
1085{
1086 Cost totalCost;
1087 Selectivity selec;
1088 ListCell *l;
1089
1090 /*
1091 * We estimate AND selectivity on the assumption that the inputs are
1092 * independent. This is probably often wrong, but we don't have the info
1093 * to do better.
1094 *
1095 * The runtime cost of the BitmapAnd itself is estimated at 100x
1096 * cpu_operator_cost for each tbm_intersect needed. Probably too small,
1097 * definitely too simplistic?
1098 */
1099 totalCost = 0.0;
1100 selec = 1.0;
1101 foreach(l, path->bitmapquals)
1102 {
1103 Path *subpath = (Path *) lfirst(l);
1104 Cost subCost;
1105 Selectivity subselec;
1106
1107 cost_bitmap_tree_node(subpath, &subCost, &subselec);
1108
1109 selec *= subselec;
1110
1111 totalCost += subCost;
1112 if (l != list_head(path->bitmapquals))
1113 totalCost += 100.0 * cpu_operator_cost;
1114 }
1115 path->bitmapselectivity = selec;
1116 path->path.rows = 0; /* per above, not used */
1117 path->path.startup_cost = totalCost;
1118 path->path.total_cost = totalCost;
1119}
1120
1121/*
1122 * cost_bitmap_or_node
1123 * Estimate the cost of a BitmapOr node
1124 *
1125 * See comments for cost_bitmap_and_node.
1126 */
1127void
1128cost_bitmap_or_node(BitmapOrPath *path, PlannerInfo *root)
1129{
1130 Cost totalCost;
1131 Selectivity selec;
1132 ListCell *l;
1133
1134 /*
1135 * We estimate OR selectivity on the assumption that the inputs are
1136 * non-overlapping, since that's often the case in "x IN (list)" type
1137 * situations. Of course, we clamp to 1.0 at the end.
1138 *
1139 * The runtime cost of the BitmapOr itself is estimated at 100x
1140 * cpu_operator_cost for each tbm_union needed. Probably too small,
1141 * definitely too simplistic? We are aware that the tbm_unions are
1142 * optimized out when the inputs are BitmapIndexScans.
1143 */
1144 totalCost = 0.0;
1145 selec = 0.0;
1146 foreach(l, path->bitmapquals)
1147 {
1148 Path *subpath = (Path *) lfirst(l);
1149 Cost subCost;
1150 Selectivity subselec;
1151
1152 cost_bitmap_tree_node(subpath, &subCost, &subselec);
1153
1154 selec += subselec;
1155
1156 totalCost += subCost;
1157 if (l != list_head(path->bitmapquals) &&
1158 !IsA(subpath, IndexPath))
1159 totalCost += 100.0 * cpu_operator_cost;
1160 }
1161 path->bitmapselectivity = Min(selec, 1.0);
1162 path->path.rows = 0; /* per above, not used */
1163 path->path.startup_cost = totalCost;
1164 path->path.total_cost = totalCost;
1165}
1166
1167/*
1168 * cost_tidscan
1169 * Determines and returns the cost of scanning a relation using TIDs.
1170 *
1171 * 'baserel' is the relation to be scanned
1172 * 'tidquals' is the list of TID-checkable quals
1173 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1174 */
1175void
1176cost_tidscan(Path *path, PlannerInfo *root,
1177 RelOptInfo *baserel, List *tidquals, ParamPathInfo *param_info)
1178{
1179 Cost startup_cost = 0;
1180 Cost run_cost = 0;
1181 bool isCurrentOf = false;
1182 QualCost qpqual_cost;
1183 Cost cpu_per_tuple;
1184 QualCost tid_qual_cost;
1185 int ntuples;
1186 ListCell *l;
1187 double spc_random_page_cost;
1188
1189 /* Should only be applied to base relations */
1190 Assert(baserel->relid > 0);
1191 Assert(baserel->rtekind == RTE_RELATION);
1192
1193 /* Mark the path with the correct row estimate */
1194 if (param_info)
1195 path->rows = param_info->ppi_rows;
1196 else
1197 path->rows = baserel->rows;
1198
1199 /* Count how many tuples we expect to retrieve */
1200 ntuples = 0;
1201 foreach(l, tidquals)
1202 {
1203 RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
1204 Expr *qual = rinfo->clause;
1205
1206 if (IsA(qual, ScalarArrayOpExpr))
1207 {
1208 /* Each element of the array yields 1 tuple */
1209 ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) qual;
1210 Node *arraynode = (Node *) lsecond(saop->args);
1211
1212 ntuples += estimate_array_length(arraynode);
1213 }
1214 else if (IsA(qual, CurrentOfExpr))
1215 {
1216 /* CURRENT OF yields 1 tuple */
1217 isCurrentOf = true;
1218 ntuples++;
1219 }
1220 else
1221 {
1222 /* It's just CTID = something, count 1 tuple */
1223 ntuples++;
1224 }
1225 }
1226
1227 /*
1228 * We must force TID scan for WHERE CURRENT OF, because only nodeTidscan.c
1229 * understands how to do it correctly. Therefore, honor enable_tidscan
1230 * only when CURRENT OF isn't present. Also note that cost_qual_eval
1231 * counts a CurrentOfExpr as having startup cost disable_cost, which we
1232 * subtract off here; that's to prevent other plan types such as seqscan
1233 * from winning.
1234 */
1235 if (isCurrentOf)
1236 {
1237 Assert(baserel->baserestrictcost.startup >= disable_cost);
1238 startup_cost -= disable_cost;
1239 }
1240 else if (!enable_tidscan)
1241 startup_cost += disable_cost;
1242
1243 /*
1244 * The TID qual expressions will be computed once, any other baserestrict
1245 * quals once per retrieved tuple.
1246 */
1247 cost_qual_eval(&tid_qual_cost, tidquals, root);
1248
1249 /* fetch estimated page cost for tablespace containing table */
1250 get_tablespace_page_costs(baserel->reltablespace,
1251 &spc_random_page_cost,
1252 NULL);
1253
1254 /* disk costs --- assume each tuple on a different page */
1255 run_cost += spc_random_page_cost * ntuples;
1256
1257 /* Add scanning CPU costs */
1258 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1259
1260 /* XXX currently we assume TID quals are a subset of qpquals */
1261 startup_cost += qpqual_cost.startup + tid_qual_cost.per_tuple;
1262 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple -
1263 tid_qual_cost.per_tuple;
1264 run_cost += cpu_per_tuple * ntuples;
1265
1266 /* tlist eval costs are paid per output row, not per tuple scanned */
1267 startup_cost += path->pathtarget->cost.startup;
1268 run_cost += path->pathtarget->cost.per_tuple * path->rows;
1269
1270 path->startup_cost = startup_cost;
1271 path->total_cost = startup_cost + run_cost;
1272}
1273
1274/*
1275 * cost_subqueryscan
1276 * Determines and returns the cost of scanning a subquery RTE.
1277 *
1278 * 'baserel' is the relation to be scanned
1279 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1280 */
1281void
1282cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root,
1283 RelOptInfo *baserel, ParamPathInfo *param_info)
1284{
1285 Cost startup_cost;
1286 Cost run_cost;
1287 QualCost qpqual_cost;
1288 Cost cpu_per_tuple;
1289
1290 /* Should only be applied to base relations that are subqueries */
1291 Assert(baserel->relid > 0);
1292 Assert(baserel->rtekind == RTE_SUBQUERY);
1293
1294 /* Mark the path with the correct row estimate */
1295 if (param_info)
1296 path->path.rows = param_info->ppi_rows;
1297 else
1298 path->path.rows = baserel->rows;
1299
1300 /*
1301 * Cost of path is cost of evaluating the subplan, plus cost of evaluating
1302 * any restriction clauses and tlist that will be attached to the
1303 * SubqueryScan node, plus cpu_tuple_cost to account for selection and
1304 * projection overhead.
1305 */
1306 path->path.startup_cost = path->subpath->startup_cost;
1307 path->path.total_cost = path->subpath->total_cost;
1308
1309 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1310
1311 startup_cost = qpqual_cost.startup;
1312 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1313 run_cost = cpu_per_tuple * baserel->tuples;
1314
1315 /* tlist eval costs are paid per output row, not per tuple scanned */
1316 startup_cost += path->path.pathtarget->cost.startup;
1317 run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;
1318
1319 path->path.startup_cost += startup_cost;
1320 path->path.total_cost += startup_cost + run_cost;
1321}
1322
1323/*
1324 * cost_functionscan
1325 * Determines and returns the cost of scanning a function RTE.
1326 *
1327 * 'baserel' is the relation to be scanned
1328 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1329 */
1330void
1331cost_functionscan(Path *path, PlannerInfo *root,
1332 RelOptInfo *baserel, ParamPathInfo *param_info)
1333{
1334 Cost startup_cost = 0;
1335 Cost run_cost = 0;
1336 QualCost qpqual_cost;
1337 Cost cpu_per_tuple;
1338 RangeTblEntry *rte;
1339 QualCost exprcost;
1340
1341 /* Should only be applied to base relations that are functions */
1342 Assert(baserel->relid > 0);
1343 rte = planner_rt_fetch(baserel->relid, root);
1344 Assert(rte->rtekind == RTE_FUNCTION);
1345
1346 /* Mark the path with the correct row estimate */
1347 if (param_info)
1348 path->rows = param_info->ppi_rows;
1349 else
1350 path->rows = baserel->rows;
1351
1352 /*
1353 * Estimate costs of executing the function expression(s).
1354 *
1355 * Currently, nodeFunctionscan.c always executes the functions to
1356 * completion before returning any rows, and caches the results in a
1357 * tuplestore. So the function eval cost is all startup cost, and per-row
1358 * costs are minimal.
1359 *
1360 * XXX in principle we ought to charge tuplestore spill costs if the
1361 * number of rows is large. However, given how phony our rowcount
1362 * estimates for functions tend to be, there's not a lot of point in that
1363 * refinement right now.
1364 */
1365 cost_qual_eval_node(&exprcost, (Node *) rte->functions, root);
1366
1367 startup_cost += exprcost.startup + exprcost.per_tuple;
1368
1369 /* Add scanning CPU costs */
1370 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1371
1372 startup_cost += qpqual_cost.startup;
1373 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1374 run_cost += cpu_per_tuple * baserel->tuples;
1375
1376 /* tlist eval costs are paid per output row, not per tuple scanned */
1377 startup_cost += path->pathtarget->cost.startup;
1378 run_cost += path->pathtarget->cost.per_tuple * path->rows;
1379
1380 path->startup_cost = startup_cost;
1381 path->total_cost = startup_cost + run_cost;
1382}
1383
1384/*
1385 * cost_tablefuncscan
1386 * Determines and returns the cost of scanning a table function.
1387 *
1388 * 'baserel' is the relation to be scanned
1389 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1390 */
1391void
1392cost_tablefuncscan(Path *path, PlannerInfo *root,
1393 RelOptInfo *baserel, ParamPathInfo *param_info)
1394{
1395 Cost startup_cost = 0;
1396 Cost run_cost = 0;
1397 QualCost qpqual_cost;
1398 Cost cpu_per_tuple;
1399 RangeTblEntry *rte;
1400 QualCost exprcost;
1401
1402 /* Should only be applied to base relations that are functions */
1403 Assert(baserel->relid > 0);
1404 rte = planner_rt_fetch(baserel->relid, root);
1405 Assert(rte->rtekind == RTE_TABLEFUNC);
1406
1407 /* Mark the path with the correct row estimate */
1408 if (param_info)
1409 path->rows = param_info->ppi_rows;
1410 else
1411 path->rows = baserel->rows;
1412
1413 /*
1414 * Estimate costs of executing the table func expression(s).
1415 *
1416 * XXX in principle we ought to charge tuplestore spill costs if the
1417 * number of rows is large. However, given how phony our rowcount
1418 * estimates for tablefuncs tend to be, there's not a lot of point in that
1419 * refinement right now.
1420 */
1421 cost_qual_eval_node(&exprcost, (Node *) rte->tablefunc, root);
1422
1423 startup_cost += exprcost.startup + exprcost.per_tuple;
1424
1425 /* Add scanning CPU costs */
1426 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1427
1428 startup_cost += qpqual_cost.startup;
1429 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1430 run_cost += cpu_per_tuple * baserel->tuples;
1431
1432 /* tlist eval costs are paid per output row, not per tuple scanned */
1433 startup_cost += path->pathtarget->cost.startup;
1434 run_cost += path->pathtarget->cost.per_tuple * path->rows;
1435
1436 path->startup_cost = startup_cost;
1437 path->total_cost = startup_cost + run_cost;
1438}
1439
1440/*
1441 * cost_valuesscan
1442 * Determines and returns the cost of scanning a VALUES RTE.
1443 *
1444 * 'baserel' is the relation to be scanned
1445 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1446 */
1447void
1448cost_valuesscan(Path *path, PlannerInfo *root,
1449 RelOptInfo *baserel, ParamPathInfo *param_info)
1450{
1451 Cost startup_cost = 0;
1452 Cost run_cost = 0;
1453 QualCost qpqual_cost;
1454 Cost cpu_per_tuple;
1455
1456 /* Should only be applied to base relations that are values lists */
1457 Assert(baserel->relid > 0);
1458 Assert(baserel->rtekind == RTE_VALUES);
1459
1460 /* Mark the path with the correct row estimate */
1461 if (param_info)
1462 path->rows = param_info->ppi_rows;
1463 else
1464 path->rows = baserel->rows;
1465
1466 /*
1467 * For now, estimate list evaluation cost at one operator eval per list
1468 * (probably pretty bogus, but is it worth being smarter?)
1469 */
1470 cpu_per_tuple = cpu_operator_cost;
1471
1472 /* Add scanning CPU costs */
1473 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1474
1475 startup_cost += qpqual_cost.startup;
1476 cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1477 run_cost += cpu_per_tuple * baserel->tuples;
1478
1479 /* tlist eval costs are paid per output row, not per tuple scanned */
1480 startup_cost += path->pathtarget->cost.startup;
1481 run_cost += path->pathtarget->cost.per_tuple * path->rows;
1482
1483 path->startup_cost = startup_cost;
1484 path->total_cost = startup_cost + run_cost;
1485}
1486
1487/*
1488 * cost_ctescan
1489 * Determines and returns the cost of scanning a CTE RTE.
1490 *
1491 * Note: this is used for both self-reference and regular CTEs; the
1492 * possible cost differences are below the threshold of what we could
1493 * estimate accurately anyway. Note that the costs of evaluating the
1494 * referenced CTE query are added into the final plan as initplan costs,
1495 * and should NOT be counted here.
1496 */
1497void
1498cost_ctescan(Path *path, PlannerInfo *root,
1499 RelOptInfo *baserel, ParamPathInfo *param_info)
1500{
1501 Cost startup_cost = 0;
1502 Cost run_cost = 0;
1503 QualCost qpqual_cost;
1504 Cost cpu_per_tuple;
1505
1506 /* Should only be applied to base relations that are CTEs */
1507 Assert(baserel->relid > 0);
1508 Assert(baserel->rtekind == RTE_CTE);
1509
1510 /* Mark the path with the correct row estimate */
1511 if (param_info)
1512 path->rows = param_info->ppi_rows;
1513 else
1514 path->rows = baserel->rows;
1515
1516 /* Charge one CPU tuple cost per row for tuplestore manipulation */
1517 cpu_per_tuple = cpu_tuple_cost;
1518
1519 /* Add scanning CPU costs */
1520 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1521
1522 startup_cost += qpqual_cost.startup;
1523 cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1524 run_cost += cpu_per_tuple * baserel->tuples;
1525
1526 /* tlist eval costs are paid per output row, not per tuple scanned */
1527 startup_cost += path->pathtarget->cost.startup;
1528 run_cost += path->pathtarget->cost.per_tuple * path->rows;
1529
1530 path->startup_cost = startup_cost;
1531 path->total_cost = startup_cost + run_cost;
1532}
1533
1534/*
1535 * cost_namedtuplestorescan
1536 * Determines and returns the cost of scanning a named tuplestore.
1537 */
1538void
1539cost_namedtuplestorescan(Path *path, PlannerInfo *root,
1540 RelOptInfo *baserel, ParamPathInfo *param_info)
1541{
1542 Cost startup_cost = 0;
1543 Cost run_cost = 0;
1544 QualCost qpqual_cost;
1545 Cost cpu_per_tuple;
1546
1547 /* Should only be applied to base relations that are Tuplestores */
1548 Assert(baserel->relid > 0);
1549 Assert(baserel->rtekind == RTE_NAMEDTUPLESTORE);
1550
1551 /* Mark the path with the correct row estimate */
1552 if (param_info)
1553 path->rows = param_info->ppi_rows;
1554 else
1555 path->rows = baserel->rows;
1556
1557 /* Charge one CPU tuple cost per row for tuplestore manipulation */
1558 cpu_per_tuple = cpu_tuple_cost;
1559
1560 /* Add scanning CPU costs */
1561 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1562
1563 startup_cost += qpqual_cost.startup;
1564 cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1565 run_cost += cpu_per_tuple * baserel->tuples;
1566
1567 path->startup_cost = startup_cost;
1568 path->total_cost = startup_cost + run_cost;
1569}
1570
1571/*
1572 * cost_resultscan
1573 * Determines and returns the cost of scanning an RTE_RESULT relation.
1574 */
1575void
1576cost_resultscan(Path *path, PlannerInfo *root,
1577 RelOptInfo *baserel, ParamPathInfo *param_info)
1578{
1579 Cost startup_cost = 0;
1580 Cost run_cost = 0;
1581 QualCost qpqual_cost;
1582 Cost cpu_per_tuple;
1583
1584 /* Should only be applied to RTE_RESULT base relations */
1585 Assert(baserel->relid > 0);
1586 Assert(baserel->rtekind == RTE_RESULT);
1587
1588 /* Mark the path with the correct row estimate */
1589 if (param_info)
1590 path->rows = param_info->ppi_rows;
1591 else
1592 path->rows = baserel->rows;
1593
1594 /* We charge qual cost plus cpu_tuple_cost */
1595 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1596
1597 startup_cost += qpqual_cost.startup;
1598 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1599 run_cost += cpu_per_tuple * baserel->tuples;
1600
1601 path->startup_cost = startup_cost;
1602 path->total_cost = startup_cost + run_cost;
1603}
1604
1605/*
1606 * cost_recursive_union
1607 * Determines and returns the cost of performing a recursive union,
1608 * and also the estimated output size.
1609 *
1610 * We are given Paths for the nonrecursive and recursive terms.
1611 */
1612void
1613cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
1614{
1615 Cost startup_cost;
1616 Cost total_cost;
1617 double total_rows;
1618
1619 /* We probably have decent estimates for the non-recursive term */
1620 startup_cost = nrterm->startup_cost;
1621 total_cost = nrterm->total_cost;
1622 total_rows = nrterm->rows;
1623
1624 /*
1625 * We arbitrarily assume that about 10 recursive iterations will be
1626 * needed, and that we've managed to get a good fix on the cost and output
1627 * size of each one of them. These are mighty shaky assumptions but it's
1628 * hard to see how to do better.
1629 */
1630 total_cost += 10 * rterm->total_cost;
1631 total_rows += 10 * rterm->rows;
1632
1633 /*
1634 * Also charge cpu_tuple_cost per row to account for the costs of
1635 * manipulating the tuplestores. (We don't worry about possible
1636 * spill-to-disk costs.)
1637 */
1638 total_cost += cpu_tuple_cost * total_rows;
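	/*
	 * Rough illustration of the arithmetic above, using made-up numbers
	 * rather than anything taken from a real plan: if the nonrecursive term
	 * costs 100 and yields 10 rows, and the recursive term costs 20 and
	 * yields 5 rows per iteration, we estimate total_rows = 10 + 10*5 = 60
	 * and total_cost = 100 + 10*20 + cpu_tuple_cost * 60.
	 */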
1639
1640 runion->startup_cost = startup_cost;
1641 runion->total_cost = total_cost;
1642 runion->rows = total_rows;
1643 runion->pathtarget->width = Max(nrterm->pathtarget->width,
1644 rterm->pathtarget->width);
1645}
1646
1647/*
1648 * cost_sort
1649 * Determines and returns the cost of sorting a relation, including
1650 * the cost of reading the input data.
1651 *
1652 * If the total volume of data to sort is less than sort_mem, we will do
1653 * an in-memory sort, which requires no I/O and about t*log2(t) tuple
1654 * comparisons for t tuples.
1655 *
1656 * If the total volume exceeds sort_mem, we switch to a tape-style merge
1657 * algorithm. There will still be about t*log2(t) tuple comparisons in
1658 * total, but we will also need to write and read each tuple once per
1659 * merge pass. We expect about ceil(logM(r)) merge passes where r is the
1660 * number of initial runs formed and M is the merge order used by tuplesort.c.
1661 * Since the average initial run should be about sort_mem, we have
1662 * disk traffic = 2 * relsize * ceil(logM(relsize / sort_mem))
1663 * cpu = comparison_cost * t * log2(t)
1664 *
1665 * If the sort is bounded (i.e., only the first k result tuples are needed)
1666 * and k tuples can fit into sort_mem, we use a heap method that keeps only
1667 * k tuples in the heap; this will require about t*log2(k) tuple comparisons.
1668 *
1669 * The disk traffic is assumed to be 3/4ths sequential and 1/4th random
1670 * accesses (XXX can't we refine that guess?)
1671 *
1672 * By default, we charge two operator evals per tuple comparison, which should
1673 * be in the right ballpark in most cases. The caller can tweak this by
1674 * specifying nonzero comparison_cost; typically that's used for any extra
1675 * work that has to be done to prepare the inputs to the comparison operators.
1676 *
1677 * 'pathkeys' is a list of sort keys
1678 * 'input_cost' is the total cost for reading the input data
1679 * 'tuples' is the number of tuples in the relation
1680 * 'width' is the average tuple width in bytes
1681 * 'comparison_cost' is the extra cost per comparison, if any
1682 * 'sort_mem' is the number of kilobytes of work memory allowed for the sort
1683 * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
1684 *
1685 * NOTE: some callers currently pass NIL for pathkeys because they
1686 * can't conveniently supply the sort keys. Since this routine doesn't
1687 * currently do anything with pathkeys anyway, that doesn't matter...
1688 * but if it ever does, it should react gracefully to lack of key data.
1689 * (Actually, the thing we'd most likely be interested in is just the number
1690 * of sort keys, which all callers *could* supply.)
1691 */
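/*
 * A worked example of the formulas above, with purely hypothetical numbers:
 * sorting relsize = 1 GB of data with sort_mem = 64 MB produces roughly
 * 1 GB / 64 MB = 16 initial runs.  The merge order M for that much memory is
 * typically much larger than 16, so a single merge pass suffices and the
 * disk traffic estimate is about 2 * relsize (each tuple written and read
 * once), charged below as 3/4 sequential and 1/4 random page accesses.  The
 * CPU charge stays comparison_cost * t * log2(t) regardless.
 */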
1692void
1693cost_sort(Path *path, PlannerInfo *root,
1694 List *pathkeys, Cost input_cost, double tuples, int width,
1695 Cost comparison_cost, int sort_mem,
1696 double limit_tuples)
1697{
1698 Cost startup_cost = input_cost;
1699 Cost run_cost = 0;
1700 double input_bytes = relation_byte_size(tuples, width);
1701 double output_bytes;
1702 double output_tuples;
1703 long sort_mem_bytes = sort_mem * 1024L;
1704
1705 if (!enable_sort)
1706 startup_cost += disable_cost;
1707
1708 path->rows = tuples;
1709
1710 /*
1711 * We want to be sure the cost of a sort is never estimated as zero, even
1712 * if passed-in tuple count is zero. Besides, mustn't do log(0)...
1713 */
1714 if (tuples < 2.0)
1715 tuples = 2.0;
1716
1717 /* Include the default cost-per-comparison */
1718 comparison_cost += 2.0 * cpu_operator_cost;
1719
1720 /* Do we have a useful LIMIT? */
1721 if (limit_tuples > 0 && limit_tuples < tuples)
1722 {
1723 output_tuples = limit_tuples;
1724 output_bytes = relation_byte_size(output_tuples, width);
1725 }
1726 else
1727 {
1728 output_tuples = tuples;
1729 output_bytes = input_bytes;
1730 }
1731
1732 if (output_bytes > sort_mem_bytes)
1733 {
1734 /*
1735 * We'll have to use a disk-based sort of all the tuples
1736 */
1737 double npages = ceil(input_bytes / BLCKSZ);
1738 double nruns = input_bytes / sort_mem_bytes;
1739 double mergeorder = tuplesort_merge_order(sort_mem_bytes);
1740 double log_runs;
1741 double npageaccesses;
1742
1743 /*
1744 * CPU costs
1745 *
1746 * Assume about N log2 N comparisons
1747 */
1748 startup_cost += comparison_cost * tuples * LOG2(tuples);
1749
1750 /* Disk costs */
1751
1752 /* Compute logM(r) as log(r) / log(M) */
1753 if (nruns > mergeorder)
1754 log_runs = ceil(log(nruns) / log(mergeorder));
1755 else
1756 log_runs = 1.0;
1757 npageaccesses = 2.0 * npages * log_runs;
1758 /* Assume 3/4ths of accesses are sequential, 1/4th are not */
1759 startup_cost += npageaccesses *
1760 (seq_page_cost * 0.75 + random_page_cost * 0.25);
1761 }
1762 else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes)
1763 {
1764 /*
1765 * We'll use a bounded heap-sort keeping just K tuples in memory, for
1766 * a total number of tuple comparisons of N log2 K; but the constant
1767 * factor is a bit higher than for quicksort. Tweak it so that the
1768 * cost curve is continuous at the crossover point: at tuples == 2 *
 * output_tuples, LOG2(2.0 * output_tuples) equals LOG2(tuples), matching
 * the plain-quicksort formula below.
1769 */
1770 startup_cost += comparison_cost * tuples * LOG2(2.0 * output_tuples);
1771 }
1772 else
1773 {
1774 /* We'll use plain quicksort on all the input tuples */
1775 startup_cost += comparison_cost * tuples * LOG2(tuples);
1776 }
1777
1778 /*
1779 * Also charge a small amount (arbitrarily set equal to operator cost) per
1780 * extracted tuple. We don't charge cpu_tuple_cost because a Sort node
1781 * doesn't do qual-checking or projection, so it has less overhead than
1782 * most plan nodes. Note it's correct to use tuples not output_tuples
1783 * here --- the upper LIMIT will pro-rate the run cost so we'd be double
1784 * counting the LIMIT otherwise.
1785 */
1786 run_cost += cpu_operator_cost * tuples;
1787
1788 path->startup_cost = startup_cost;
1789 path->total_cost = startup_cost + run_cost;
1790}
1791
1792/*
1793 * append_nonpartial_cost
1794 * Estimate the cost of the non-partial paths in a Parallel Append.
1795 * The non-partial paths are assumed to be the first "numpaths" paths
1796 * from the subpaths list, and to be in order of decreasing cost.
1797 */
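/*
 * Illustrative walk-through with invented costs: given parallel_workers = 2
 * and three non-partial subpaths of total_cost 8, 6 and 4 (already sorted by
 * decreasing cost), the first two are claimed by separate workers, giving
 * costarr = {8, 6}; the third is greedily added to the currently cheapest
 * slot, giving {8, 10}; and the returned estimate is the maximum, 10, i.e.
 * the time until the most-loaded worker finishes its non-partial subpaths.
 */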
1798static Cost
1799append_nonpartial_cost(List *subpaths, int numpaths, int parallel_workers)
1800{
1801 Cost *costarr;
1802 int arrlen;
1803 ListCell *l;
1804 ListCell *cell;
1805 int i;
1806 int path_index;
1807 int min_index;
1808 int max_index;
1809
1810 if (numpaths == 0)
1811 return 0;
1812
1813 /*
1814 * Array length is number of workers or number of relevant paths,
1815 * whichever is less.
1816 */
1817 arrlen = Min(parallel_workers, numpaths);
1818 costarr = (Cost *) palloc(sizeof(Cost) * arrlen);
1819
1820 /* The first few paths will each be claimed by a different worker. */
1821 path_index = 0;
1822 foreach(cell, subpaths)
1823 {
1824 Path *subpath = (Path *) lfirst(cell);
1825
1826 if (path_index == arrlen)
1827 break;
1828 costarr[path_index++] = subpath->total_cost;
1829 }
1830
1831 /*
1832 * Since subpaths are sorted by decreasing cost, the last one will have
1833 * the minimum cost.
1834 */
1835 min_index = arrlen - 1;
1836
1837 /*
1838 * For each of the remaining subpaths, add its cost to the array element
1839 * with minimum cost.
1840 */
1841 for_each_cell(l, cell)
1842 {
1843 Path *subpath = (Path *) lfirst(l);
1844 int i;
1845
1846 /* Consider only the non-partial paths */
1847 if (path_index++ == numpaths)
1848 break;
1849
1850 costarr[min_index] += subpath->total_cost;
1851
1852 /* Update the new min cost array index */
1853 for (min_index = i = 0; i < arrlen; i++)
1854 {
1855 if (costarr[i] < costarr[min_index])
1856 min_index = i;
1857 }
1858 }
1859
1860 /* Return the highest cost from the array */
1861 for (max_index = i = 0; i < arrlen; i++)
1862 {
1863 if (costarr[i] > costarr[max_index])
1864 max_index = i;
1865 }
1866
1867 return costarr[max_index];
1868}
1869
1870/*
1871 * cost_append
1872 * Determines and returns the cost of an Append node.
1873 */
1874void
1875cost_append(AppendPath *apath)
1876{
1877 ListCell *l;
1878
1879 apath->path.startup_cost = 0;
1880 apath->path.total_cost = 0;
1881 apath->path.rows = 0;
1882
1883 if (apath->subpaths == NIL)
1884 return;
1885
1886 if (!apath->path.parallel_aware)
1887 {
1888 List *pathkeys = apath->path.pathkeys;
1889
1890 if (pathkeys == NIL)
1891 {
1892 Path *subpath = (Path *) linitial(apath->subpaths);
1893
1894 /*
1895 * For an unordered, non-parallel-aware Append we take the startup
1896 * cost as the startup cost of the first subpath.
1897 */
1898 apath->path.startup_cost = subpath->startup_cost;
1899
1900 /* Compute rows and costs as sums of subplan rows and costs. */
1901 foreach(l, apath->subpaths)
1902 {
1903 Path *subpath = (Path *) lfirst(l);
1904
1905 apath->path.rows += subpath->rows;
1906 apath->path.total_cost += subpath->total_cost;
1907 }
1908 }
1909 else
1910 {
1911 /*
1912 * For an ordered, non-parallel-aware Append we take the startup
1913 * cost as the sum of the subpath startup costs. This ensures
1914 * that we don't underestimate the startup cost when a query's
1915 * LIMIT is such that several of the children have to be run to
1916 * satisfy it. This might be overkill --- another plausible hack
1917 * would be to take the Append's startup cost as the maximum of
1918 * the child startup costs. But we don't want to risk believing
1919 * that an ORDER BY LIMIT query can be satisfied at small cost
1920 * when the first child has small startup cost but later ones
1921 * don't. (If we had the ability to deal with nonlinear cost
1922 * interpolation for partial retrievals, we would not need to be
1923 * so conservative about this.)
1924 *
1925 * This case is also different from the above in that we have to
1926 * account for possibly injecting sorts into subpaths that aren't
1927 * natively ordered.
1928 */
1929 foreach(l, apath->subpaths)
1930 {
1931 Path *subpath = (Path *) lfirst(l);
1932 Path sort_path; /* dummy for result of cost_sort */
1933
1934 if (!pathkeys_contained_in(pathkeys, subpath->pathkeys))
1935 {
1936 /*
1937 * We'll need to insert a Sort node, so include costs for
1938 * that. We can use the parent's LIMIT if any, since we
1939 * certainly won't pull more than that many tuples from
1940 * any child.
1941 */
1942 cost_sort(&sort_path,
1943 NULL, /* doesn't currently need root */
1944 pathkeys,
1945 subpath->total_cost,
1946 subpath->rows,
1947 subpath->pathtarget->width,
1948 0.0,
1949 work_mem,
1950 apath->limit_tuples);
1951 subpath = &sort_path;
1952 }
1953
1954 apath->path.rows += subpath->rows;
1955 apath->path.startup_cost += subpath->startup_cost;
1956 apath->path.total_cost += subpath->total_cost;
1957 }
1958 }
1959 }
1960 else /* parallel-aware */
1961 {
1962 int i = 0;
1963 double parallel_divisor = get_parallel_divisor(&apath->path);
1964
1965 /* Parallel-aware Append never produces ordered output. */
1966 Assert(apath->path.pathkeys == NIL);
1967
1968 /* Calculate startup cost. */
1969 foreach(l, apath->subpaths)
1970 {
1971 Path *subpath = (Path *) lfirst(l);
1972
1973 /*
1974 * Append will start returning tuples when the child node having
1975 * lowest startup cost is done setting up. We consider only the
1976 * first few subplans that immediately get a worker assigned.
1977 */
1978 if (i == 0)
1979 apath->path.startup_cost = subpath->startup_cost;
1980 else if (i < apath->path.parallel_workers)
1981 apath->path.startup_cost = Min(apath->path.startup_cost,
1982 subpath->startup_cost);
1983
1984 /*
1985 * Apply parallel divisor to subpaths. Scale the number of rows
1986 * for each partial subpath based on the ratio of the parallel
1987 * divisor originally used for the subpath to the one we adopted.
1988 * Also add the cost of partial paths to the total cost, but
1989 * ignore non-partial paths for now.
1990 */
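			/*
			 * For instance (hypothetical figures, assuming default leader
			 * participation): a partial subpath planned for 2 workers has a
			 * divisor of about 2.4; if this Append runs with 4 workers
			 * (divisor 4.0), that subpath's per-worker row estimate is
			 * scaled by 2.4 / 4.0.
			 */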
1991 if (i < apath->first_partial_path)
1992 apath->path.rows += subpath->rows / parallel_divisor;
1993 else
1994 {
1995 double subpath_parallel_divisor;
1996
1997 subpath_parallel_divisor = get_parallel_divisor(subpath);
1998 apath->path.rows += subpath->rows * (subpath_parallel_divisor /
1999 parallel_divisor);
2000 apath->path.total_cost += subpath->total_cost;
2001 }
2002
2003 apath->path.rows = clamp_row_est(apath->path.rows);
2004
2005 i++;
2006 }
2007
2008 /* Add cost for non-partial subpaths. */
2009 apath->path.total_cost +=
2010 append_nonpartial_cost(apath->subpaths,
2011 apath->first_partial_path,
2012 apath->path.parallel_workers);
2013 }
2014
2015 /*
2016 * Although Append does not do any selection or projection, it's not free;
2017 * add a small per-tuple overhead.
2018 */
2019 apath->path.total_cost +=
2020 cpu_tuple_cost * APPEND_CPU_COST_MULTIPLIER * apath->path.rows;
2021}
2022
2023/*
2024 * cost_merge_append
2025 * Determines and returns the cost of a MergeAppend node.
2026 *
2027 * MergeAppend merges several pre-sorted input streams, using a heap that
2028 * at any given instant holds the next tuple from each stream. If there
2029 * are N streams, we need about N*log2(N) tuple comparisons to construct
2030 * the heap at startup, and then for each output tuple, about log2(N)
2031 * comparisons to replace the top entry.
2032 *
2033 * (The effective value of N will drop once some of the input streams are
2034 * exhausted, but it seems unlikely to be worth trying to account for that.)
2035 *
2036 * The heap is never spilled to disk, since we assume N is not very large.
2037 * So this is much simpler than cost_sort.
2038 *
2039 * As in cost_sort, we charge two operator evals per tuple comparison.
2040 *
2041 * 'pathkeys' is a list of sort keys
2042 * 'n_streams' is the number of input streams
2043 * 'input_startup_cost' is the sum of the input streams' startup costs
2044 * 'input_total_cost' is the sum of the input streams' total costs
2045 * 'tuples' is the number of tuples in all the streams
2046 */
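/*
 * For example (illustrative only): merging N = 4 presorted streams costs
 * about 4 * log2(4) = 8 comparisons to build the heap (startup), plus about
 * log2(4) = 2 comparisons per output tuple (run), each comparison being
 * charged at 2 * cpu_operator_cost as in cost_sort.
 */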
2047void
2048cost_merge_append(Path *path, PlannerInfo *root,
2049 List *pathkeys, int n_streams,
2050 Cost input_startup_cost, Cost input_total_cost,
2051 double tuples)
2052{
2053 Cost startup_cost = 0;
2054 Cost run_cost = 0;
2055 Cost comparison_cost;
2056 double N;
2057 double logN;
2058
2059 /*
2060 * Avoid log(0)...
2061 */
2062 N = (n_streams < 2) ? 2.0 : (double) n_streams;
2063 logN = LOG2(N);
2064
2065 /* Assumed cost per tuple comparison */
2066 comparison_cost = 2.0 * cpu_operator_cost;
2067
2068 /* Heap creation cost */
2069 startup_cost += comparison_cost * N * logN;
2070
2071 /* Per-tuple heap maintenance cost */
2072 run_cost += tuples * comparison_cost * logN;
2073
2074 /*
2075 * Although MergeAppend does not do any selection or projection, it's not
2076 * free; add a small per-tuple overhead.
2077 */
2078 run_cost += cpu_tuple_cost * APPEND_CPU_COST_MULTIPLIER * tuples;
2079
2080 path->startup_cost = startup_cost + input_startup_cost;
2081 path->total_cost = startup_cost + run_cost + input_total_cost;
2082}
2083
2084/*
2085 * cost_material
2086 * Determines and returns the cost of materializing a relation, including
2087 * the cost of reading the input data.
2088 *
2089 * If the total volume of data to materialize exceeds work_mem, we will need
2090 * to write it to disk, so the cost is much higher in that case.
2091 *
2092 * Note that here we are estimating the costs for the first scan of the
2093 * relation, so the materialization is all overhead --- any savings will
2094 * occur only on rescan, which is estimated in cost_rescan.
2095 */
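/*
 * As a rough illustration (hypothetical sizes): materializing an estimated
 * 200 MB of tuples under work_mem = 64 MB is expected to spill, so besides
 * the 2 * cpu_operator_cost per-tuple bookkeeping charge we also add
 * seq_page_cost for each of the ceil(nbytes / BLCKSZ) pages written out.
 */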
2096void
2097cost_material(Path *path,
2098 Cost input_startup_cost, Cost input_total_cost,
2099 double tuples, int width)
2100{
2101 Cost startup_cost = input_startup_cost;
2102 Cost run_cost = input_total_cost - input_startup_cost;
2103 double nbytes = relation_byte_size(tuples, width);
2104 long work_mem_bytes = work_mem * 1024L;
2105
2106 path->rows = tuples;
2107
2108 /*
2109 * Whether spilling or not, charge 2x cpu_operator_cost per tuple to
2110 * reflect bookkeeping overhead. (This rate must be more than what
2111 * cost_rescan charges for materialize, ie, cpu_operator_cost per tuple;
2112 * if it is exactly the same then there will be a cost tie between
2113 * nestloop with A outer, materialized B inner and nestloop with B outer,
2114 * materialized A inner. The extra cost ensures we'll prefer
2115 * materializing the smaller rel.) Note that this is normally a good deal
2116 * less than cpu_tuple_cost; which is OK because a Material plan node
2117 * doesn't do qual-checking or projection, so it's got less overhead than
2118 * most plan nodes.
2119 */
2120 run_cost += 2 * cpu_operator_cost * tuples;
2121
2122 /*
2123 * If we will spill to disk, charge at the rate of seq_page_cost per page.
2124 * This cost is assumed to be evenly spread through the plan run phase,
2125 * which isn't exactly accurate but our cost model doesn't allow for
2126 * nonuniform costs within the run phase.
2127 */
2128 if (nbytes > work_mem_bytes)
2129 {
2130 double npages = ceil(nbytes / BLCKSZ);
2131
2132 run_cost += seq_page_cost * npages;
2133 }
2134
2135 path->startup_cost = startup_cost;
2136 path->total_cost = startup_cost + run_cost;
2137}
2138
2139/*
2140 * cost_agg
2141 * Determines and returns the cost of performing an Agg plan node,
2142 * including the cost of its input.
2143 *
2144 * aggcosts can be NULL when there are no actual aggregate functions (i.e.,
2145 * we are using a hashed Agg node just to do grouping).
2146 *
2147 * Note: when aggstrategy == AGG_SORTED, caller must ensure that input costs
2148 * are for appropriately-sorted input.
2149 */
2150void
2151cost_agg(Path *path, PlannerInfo *root,
2152 AggStrategy aggstrategy, const AggClauseCosts *aggcosts,
2153 int numGroupCols, double numGroups,
2154 List *quals,
2155 Cost input_startup_cost, Cost input_total_cost,
2156 double input_tuples)
2157{
2158 double output_tuples;
2159 Cost startup_cost;
2160 Cost total_cost;
2161 AggClauseCosts dummy_aggcosts;
2162
2163 /* Use all-zero per-aggregate costs if NULL is passed */
2164 if (aggcosts == NULL)
2165 {
2166 Assert(aggstrategy == AGG_HASHED);
2167 MemSet(&dummy_aggcosts, 0, sizeof(AggClauseCosts));
2168 aggcosts = &dummy_aggcosts;
2169 }
2170
2171 /*
2172 * The transCost.per_tuple component of aggcosts should be charged once
2173 * per input tuple, corresponding to the costs of evaluating the aggregate
2174 * transfns and their input expressions. The finalCost.per_tuple component
2175 * is charged once per output tuple, corresponding to the costs of
2176 * evaluating the finalfns. Startup costs are of course charged but once.
2177 *
2178 * If we are grouping, we charge an additional cpu_operator_cost per
2179 * grouping column per input tuple for grouping comparisons.
2180 *
2181 * We will produce a single output tuple if not grouping, and a tuple per
2182 * group otherwise. We charge cpu_tuple_cost for each output tuple.
2183 *
2184 * Note: in this cost model, AGG_SORTED and AGG_HASHED have exactly the
2185 * same total CPU cost, but AGG_SORTED has lower startup cost. If the
2186 * input path is already sorted appropriately, AGG_SORTED should be
2187 * preferred (since it has no risk of memory overflow). This will happen
2188 * as long as the computed total costs are indeed exactly equal --- but if
2189 * there's roundoff error we might do the wrong thing. So be sure that
2190 * the computations below form the same intermediate values in the same
2191 * order.
2192 */
2193 if (aggstrategy == AGG_PLAIN)
2194 {
2195 startup_cost = input_total_cost;
2196 startup_cost += aggcosts->transCost.startup;
2197 startup_cost += aggcosts->transCost.per_tuple * input_tuples;
2198 startup_cost += aggcosts->finalCost.startup;
2199 startup_cost += aggcosts->finalCost.per_tuple;
2200 /* we aren't grouping */
2201 total_cost = startup_cost + cpu_tuple_cost;
2202 output_tuples = 1;
2203 }
2204 else if (aggstrategy == AGG_SORTED || aggstrategy == AGG_MIXED)
2205 {
2206 /* Here we are able to deliver output on-the-fly */
2207 startup_cost = input_startup_cost;
2208 total_cost = input_total_cost;
2209 if (aggstrategy == AGG_MIXED && !enable_hashagg)
2210 {
2211 startup_cost += disable_cost;
2212 total_cost += disable_cost;
2213 }
2214 /* calcs phrased this way to match HASHED case, see note above */
2215 total_cost += aggcosts->transCost.startup;
2216 total_cost += aggcosts->transCost.per_tuple * input_tuples;
2217 total_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
2218 total_cost += aggcosts->finalCost.startup;
2219 total_cost += aggcosts->finalCost.per_tuple * numGroups;
2220 total_cost += cpu_tuple_cost * numGroups;
2221 output_tuples = numGroups;
2222 }
2223 else
2224 {
2225 /* must be AGG_HASHED */
2226 startup_cost = input_total_cost;
2227 if (!enable_hashagg)
2228 startup_cost += disable_cost;
2229 startup_cost += aggcosts->transCost.startup;
2230 startup_cost += aggcosts->transCost.per_tuple * input_tuples;
2231 startup_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
2232 startup_cost += aggcosts->finalCost.startup;
2233 total_cost = startup_cost;
2234 total_cost += aggcosts->finalCost.per_tuple * numGroups;
2235 total_cost += cpu_tuple_cost * numGroups;
2236 output_tuples = numGroups;
2237 }
2238
2239 /*
2240 * If there are quals (HAVING quals), account for their cost and
2241 * selectivity.
2242 */
2243 if (quals)
2244 {
2245 QualCost qual_cost;
2246
2247 cost_qual_eval(&qual_cost, quals, root);
2248 startup_cost += qual_cost.startup;
2249 total_cost += qual_cost.startup + output_tuples * qual_cost.per_tuple;
2250
2251 output_tuples = clamp_row_est(output_tuples *
2252 clauselist_selectivity(root,
2253 quals,
2254 0,
2255 JOIN_INNER,
2256 NULL));
2257 }
2258
2259 path->rows = output_tuples;
2260 path->startup_cost = startup_cost;
2261 path->total_cost = total_cost;
2262}
2263
2264/*
2265 * cost_windowagg
2266 * Determines and returns the cost of performing a WindowAgg plan node,
2267 * including the cost of its input.
2268 *
2269 * Input is assumed already properly sorted.
2270 */
2271void
2272cost_windowagg(Path *path, PlannerInfo *root,
2273 List *windowFuncs, int numPartCols, int numOrderCols,
2274 Cost input_startup_cost, Cost input_total_cost,
2275 double input_tuples)
2276{
2277 Cost startup_cost;
2278 Cost total_cost;
2279 ListCell *lc;
2280
2281 startup_cost = input_startup_cost;
2282 total_cost = input_total_cost;
2283
2284 /*
2285 * Window functions are assumed to cost their stated execution cost, plus
2286 * the cost of evaluating their input expressions, per tuple. Since they
2287 * may in fact evaluate their inputs at multiple rows during each cycle,
2288 * this could be a drastic underestimate; but without a way to know how
2289 * many rows the window function will fetch, it's hard to do better. In
2290 * any case, it's a good estimate for all the built-in window functions,
2291 * so we'll just do this for now.
2292 */
2293 foreach(lc, windowFuncs)
2294 {
2295 WindowFunc *wfunc = lfirst_node(WindowFunc, lc);
2296 Cost wfunccost;
2297 QualCost argcosts;
2298
2299 argcosts.startup = argcosts.per_tuple = 0;
2300 add_function_cost(root, wfunc->winfnoid, (Node *) wfunc,
2301 &argcosts);
2302 startup_cost += argcosts.startup;
2303 wfunccost = argcosts.per_tuple;
2304
2305 /* also add the input expressions' cost to per-input-row costs */
2306 cost_qual_eval_node(&argcosts, (Node *) wfunc->args, root);
2307 startup_cost += argcosts.startup;
2308 wfunccost += argcosts.per_tuple;
2309
2310 /*
2311 * Add the filter's cost to per-input-row costs. XXX We should reduce
2312 * input expression costs according to filter selectivity.
2313 */
2314 cost_qual_eval_node(&argcosts, (Node *) wfunc->aggfilter, root);
2315 startup_cost += argcosts.startup;
2316 wfunccost += argcosts.per_tuple;
2317
2318 total_cost += wfunccost * input_tuples;
2319 }
2320
2321 /*
2322 * We also charge cpu_operator_cost per grouping column per tuple for
2323 * grouping comparisons, plus cpu_tuple_cost per tuple for general
2324 * overhead.
2325 *
2326 * XXX this neglects costs of spooling the data to disk when it overflows
2327 * work_mem. Sooner or later that should get accounted for.
2328 */
2329 total_cost += cpu_operator_cost * (numPartCols + numOrderCols) * input_tuples;
2330 total_cost += cpu_tuple_cost * input_tuples;
2331
2332 path->rows = input_tuples;
2333 path->startup_cost = startup_cost;
2334 path->total_cost = total_cost;
2335}
2336
2337/*
2338 * cost_group
2339 * Determines and returns the cost of performing a Group plan node,
2340 * including the cost of its input.
2341 *
2342 * Note: caller must ensure that input costs are for appropriately-sorted
2343 * input.
2344 */
2345void
2346cost_group(Path *path, PlannerInfo *root,
2347 int numGroupCols, double numGroups,
2348 List *quals,
2349 Cost input_startup_cost, Cost input_total_cost,
2350 double input_tuples)
2351{
2352 double output_tuples;
2353 Cost startup_cost;
2354 Cost total_cost;
2355
2356 output_tuples = numGroups;
2357 startup_cost = input_startup_cost;
2358 total_cost = input_total_cost;
2359
2360 /*
2361 * Charge one cpu_operator_cost per comparison per input tuple. We assume
2362 * all columns get compared for most of the tuples.
2363 */
2364 total_cost += cpu_operator_cost * input_tuples * numGroupCols;
2365
2366 /*
2367 * If there are quals (HAVING quals), account for their cost and
2368 * selectivity.
2369 */
2370 if (quals)
2371 {
2372 QualCost qual_cost;
2373
2374 cost_qual_eval(&qual_cost, quals, root);
2375 startup_cost += qual_cost.startup;
2376 total_cost += qual_cost.startup + output_tuples * qual_cost.per_tuple;
2377
2378 output_tuples = clamp_row_est(output_tuples *
2379 clauselist_selectivity(root,
2380 quals,
2381 0,
2382 JOIN_INNER,
2383 NULL));
2384 }
2385
2386 path->rows = output_tuples;
2387 path->startup_cost = startup_cost;
2388 path->total_cost = total_cost;
2389}
2390
2391/*
2392 * initial_cost_nestloop
2393 * Preliminary estimate of the cost of a nestloop join path.
2394 *
2395 * This must quickly produce lower-bound estimates of the path's startup and
2396 * total costs. If we are unable to eliminate the proposed path from
2397 * consideration using the lower bounds, final_cost_nestloop will be called
2398 * to obtain the final estimates.
2399 *
2400 * The exact division of labor between this function and final_cost_nestloop
2401 * is private to them, and represents a tradeoff between speed of the initial
2402 * estimate and getting a tight lower bound. We choose to not examine the
2403 * join quals here, since that's by far the most expensive part of the
2404 * calculations. The end result is that CPU-cost considerations must be
2405 * left for the second phase; and for SEMI/ANTI joins, we must also postpone
2406 * incorporation of the inner path's run cost.
2407 *
2408 * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
2409 * other data to be used by final_cost_nestloop
2410 * 'jointype' is the type of join to be performed
2411 * 'outer_path' is the outer input to the join
2412 * 'inner_path' is the inner input to the join
2413 * 'extra' contains miscellaneous information about the join
2414 */
2415void
2416initial_cost_nestloop(PlannerInfo *root, JoinCostWorkspace *workspace,
2417 JoinType jointype,
2418 Path *outer_path, Path *inner_path,
2419 JoinPathExtraData *extra)
2420{
2421 Cost startup_cost = 0;
2422 Cost run_cost = 0;
2423 double outer_path_rows = outer_path->rows;
2424 Cost inner_rescan_start_cost;
2425 Cost inner_rescan_total_cost;
2426 Cost inner_run_cost;
2427 Cost inner_rescan_run_cost;
2428
2429 /* estimate costs to rescan the inner relation */
2430 cost_rescan(root, inner_path,
2431 &inner_rescan_start_cost,
2432 &inner_rescan_total_cost);
2433
2434 /* cost of source data */
2435
2436 /*
2437 * NOTE: clearly, we must pay both outer and inner paths' startup_cost
2438 * before we can start returning tuples, so the join's startup cost is
2439 * their sum. We'll also pay the inner path's rescan startup cost
2440 * multiple times.
2441 */
2442 startup_cost += outer_path->startup_cost + inner_path->startup_cost;
2443 run_cost += outer_path->total_cost - outer_path->startup_cost;
2444 if (outer_path_rows > 1)
2445 run_cost += (outer_path_rows - 1) * inner_rescan_start_cost;
2446
2447 inner_run_cost = inner_path->total_cost - inner_path->startup_cost;
2448 inner_rescan_run_cost = inner_rescan_total_cost - inner_rescan_start_cost;
2449
2450 if (jointype == JOIN_SEMI || jointype == JOIN_ANTI ||
2451 extra->inner_unique)
2452 {
2453 /*
2454 * With a SEMI or ANTI join, or if the innerrel is known unique, the
2455 * executor will stop after the first match.
2456 *
2457 * Getting decent estimates requires inspection of the join quals,
2458 * which we choose to postpone to final_cost_nestloop.
2459 */
2460
2461 /* Save private data for final_cost_nestloop */
2462 workspace->inner_run_cost = inner_run_cost;
2463 workspace->inner_rescan_run_cost = inner_rescan_run_cost;
2464 }
2465 else
2466 {
2467 /* Normal case; we'll scan whole input rel for each outer row */
2468 run_cost += inner_run_cost;
2469 if (outer_path_rows > 1)
2470 run_cost += (outer_path_rows - 1) * inner_rescan_run_cost;
2471 }
2472
2473 /* CPU costs left for later */
2474
2475 /* Public result fields */
2476 workspace->startup_cost = startup_cost;
2477 workspace->total_cost = startup_cost + run_cost;
2478 /* Save private data for final_cost_nestloop */
2479 workspace->run_cost = run_cost;
2480}
2481
2482/*
2483 * final_cost_nestloop
2484 * Final estimate of the cost and result size of a nestloop join path.
2485 *
2486 * 'path' is already filled in except for the rows and cost fields
2487 * 'workspace' is the result from initial_cost_nestloop
2488 * 'extra' contains miscellaneous information about the join
2489 */
2490void
2491final_cost_nestloop(PlannerInfo *root, NestPath *path,
2492 JoinCostWorkspace *workspace,
2493 JoinPathExtraData *extra)
2494{
2495 Path *outer_path = path->outerjoinpath;
2496 Path *inner_path = path->innerjoinpath;
2497 double outer_path_rows = outer_path->rows;
2498 double inner_path_rows = inner_path->rows;
2499 Cost startup_cost = workspace->startup_cost;
2500 Cost run_cost = workspace->run_cost;
2501 Cost cpu_per_tuple;
2502 QualCost restrict_qual_cost;
2503 double ntuples;
2504
2505 /* Protect some assumptions below that rowcounts aren't zero or NaN */
2506 if (outer_path_rows <= 0 || isnan(outer_path_rows))
2507 outer_path_rows = 1;
2508 if (inner_path_rows <= 0 || isnan(inner_path_rows))
2509 inner_path_rows = 1;
2510
2511 /* Mark the path with the correct row estimate */
2512 if (path->path.param_info)
2513 path->path.rows = path->path.param_info->ppi_rows;
2514 else
2515 path->path.rows = path->path.parent->rows;
2516
2517 /* For partial paths, scale row estimate. */
2518 if (path->path.parallel_workers > 0)
2519 {
2520 double parallel_divisor = get_parallel_divisor(&path->path);
2521
2522 path->path.rows =
2523 clamp_row_est(path->path.rows / parallel_divisor);
2524 }
2525
2526 /*
2527 * We could include disable_cost in the preliminary estimate, but that
2528 * would amount to optimizing for the case where the join method is
2529 * disabled, which doesn't seem like the way to bet.
2530 */
2531 if (!enable_nestloop)
2532 startup_cost += disable_cost;
2533
2534 /* cost of inner-relation source data (we already dealt with outer rel) */
2535
2536 if (path->jointype == JOIN_SEMI || path->jointype == JOIN_ANTI ||
2537 extra->inner_unique)
2538 {
2539 /*
2540 * With a SEMI or ANTI join, or if the innerrel is known unique, the
2541 * executor will stop after the first match.
2542 */
2543 Cost inner_run_cost = workspace->inner_run_cost;
2544 Cost inner_rescan_run_cost = workspace->inner_rescan_run_cost;
2545 double outer_matched_rows;
2546 double outer_unmatched_rows;
2547 Selectivity inner_scan_frac;
2548
2549 /*
2550 * For an outer-rel row that has at least one match, we can expect the
2551 * inner scan to stop after a fraction 1/(match_count+1) of the inner
2552 * rows, if the matches are evenly distributed. Since they probably
2553 * aren't quite evenly distributed, we apply a fuzz factor of 2.0 to
2554 * that fraction. (If we used a larger fuzz factor, we'd have to
2555 * clamp inner_scan_frac to at most 1.0; but since match_count is at
2556 * least 1, no such clamp is needed now.)
2557 */
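		/*
		 * For example, with an (assumed) match_count of 3 this gives
		 * inner_scan_frac = 2.0 / 4.0 = 0.5, i.e. we expect each matched
		 * outer row to scan about half of the inner relation.
		 */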
2558 outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac);
2559 outer_unmatched_rows = outer_path_rows - outer_matched_rows;
2560 inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0);
2561
2562 /*
2563 * Compute number of tuples processed (not number emitted!). First,
2564 * account for successfully-matched outer rows.
2565 */
2566 ntuples = outer_matched_rows * inner_path_rows * inner_scan_frac;
2567
2568 /*
2569 * Now we need to estimate the actual costs of scanning the inner
2570 * relation, which may be quite a bit less than N times inner_run_cost
2571 * due to early scan stops. We consider two cases. If the inner path
2572 * is an indexscan using all the joinquals as indexquals, then an
2573 * unmatched outer row results in an indexscan returning no rows,
2574 * which is probably quite cheap. Otherwise, the executor will have
2575 * to scan the whole inner rel for an unmatched row; not so cheap.
2576 */
2577 if (has_indexed_join_quals(path))
2578 {
2579 /*
2580 * Successfully-matched outer rows will only require scanning
2581 * inner_scan_frac of the inner relation. In this case, we don't
2582 * need to charge the full inner_run_cost even when that's more
2583 * than inner_rescan_run_cost, because we can assume that none of
2584 * the inner scans ever scan the whole inner relation. So it's
2585 * okay to assume that all the inner scan executions can be
2586 * fractions of the full cost, even if materialization is reducing
2587 * the rescan cost. At this writing, it's impossible to get here
2588 * for a materialized inner scan, so inner_run_cost and
2589 * inner_rescan_run_cost will be the same anyway; but just in
2590 * case, use inner_run_cost for the first matched tuple and
2591 * inner_rescan_run_cost for additional ones.
2592 */
2593 run_cost += inner_run_cost * inner_scan_frac;
2594 if (outer_matched_rows > 1)
2595 run_cost += (outer_matched_rows - 1) * inner_rescan_run_cost * inner_scan_frac;
2596
2597 /*
2598 * Add the cost of inner-scan executions for unmatched outer rows.
2599 * We estimate this as the same cost as returning the first tuple
2600 * of a nonempty scan. We consider that these are all rescans,
2601 * since we used inner_run_cost once already.
2602 */
2603 run_cost += outer_unmatched_rows *
2604 inner_rescan_run_cost / inner_path_rows;
2605
2606 /*
2607 * We won't be evaluating any quals at all for unmatched rows, so
2608 * don't add them to ntuples.
2609 */
2610 }
2611 else
2612 {
2613 /*
2614 * Here, a complicating factor is that rescans may be cheaper than
2615 * first scans. If we never scan all the way to the end of the
2616 * inner rel, it might be (depending on the plan type) that we'd
2617 * never pay the whole inner first-scan run cost. However it is
2618 * difficult to estimate whether that will happen (and it could
2619 * not happen if there are any unmatched outer rows!), so be
2620 * conservative and always charge the whole first-scan cost once.
2621 * We consider this charge to correspond to the first unmatched
2622 * outer row, unless there isn't one in our estimate, in which
2623 * case blame it on the first matched row.
2624 */
2625
2626 /* First, count all unmatched join tuples as being processed */
2627 ntuples += outer_unmatched_rows * inner_path_rows;
2628
2629 /* Now add the forced full scan, and decrement appropriate count */
2630 run_cost += inner_run_cost;
2631 if (outer_unmatched_rows >= 1)
2632 outer_unmatched_rows -= 1;
2633 else
2634 outer_matched_rows -= 1;
2635
2636 /* Add inner run cost for additional outer tuples having matches */
2637 if (outer_matched_rows > 0)
2638 run_cost += outer_matched_rows * inner_rescan_run_cost * inner_scan_frac;
2639
2640 /* Add inner run cost for additional unmatched outer tuples */
2641 if (outer_unmatched_rows > 0)
2642 run_cost += outer_unmatched_rows * inner_rescan_run_cost;
2643 }
2644 }
2645 else
2646 {
2647 /* Normal-case source costs were included in preliminary estimate */
2648
2649 /* Compute number of tuples processed (not number emitted!) */
2650 ntuples = outer_path_rows * inner_path_rows;
2651 }
2652
2653 /* CPU costs */
2654 cost_qual_eval(&restrict_qual_cost, path->joinrestrictinfo, root);
2655 startup_cost += restrict_qual_cost.startup;
2656 cpu_per_tuple = cpu_tuple_cost + restrict_qual_cost.per_tuple;
2657 run_cost += cpu_per_tuple * ntuples;
2658
2659 /* tlist eval costs are paid per output row, not per tuple scanned */
2660 startup_cost += path->path.pathtarget->cost.startup;
2661 run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;
2662
2663 path->path.startup_cost = startup_cost;
2664 path->path.total_cost = startup_cost + run_cost;
2665}
2666
2667/*
2668 * initial_cost_mergejoin
2669 * Preliminary estimate of the cost of a mergejoin path.
2670 *
2671 * This must quickly produce lower-bound estimates of the path's startup and
2672 * total costs. If we are unable to eliminate the proposed path from
2673 * consideration using the lower bounds, final_cost_mergejoin will be called
2674 * to obtain the final estimates.
2675 *
2676 * The exact division of labor between this function and final_cost_mergejoin
2677 * is private to them, and represents a tradeoff between speed of the initial
2678 * estimate and getting a tight lower bound. We choose to not examine the
2679 * join quals here, except for obtaining the scan selectivity estimate which
2680 * is really essential (but fortunately, use of caching keeps the cost of
2681 * getting that down to something reasonable).
2682 * We also assume that cost_sort is cheap enough to use here.
2683 *
2684 * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
2685 * other data to be used by final_cost_mergejoin
2686 * 'jointype' is the type of join to be performed
2687 * 'mergeclauses' is the list of joinclauses to be used as merge clauses
2688 * 'outer_path' is the outer input to the join
2689 * 'inner_path' is the inner input to the join
2690 * 'outersortkeys' is the list of sort keys for the outer path
2691 * 'innersortkeys' is the list of sort keys for the inner path
2692 * 'extra' contains miscellaneous information about the join
2693 *
2694 * Note: outersortkeys and innersortkeys should be NIL if no explicit
2695 * sort is needed because the respective source path is already ordered.
2696 */
2697void
2698initial_cost_mergejoin(PlannerInfo *root, JoinCostWorkspace *workspace,
2699 JoinType jointype,
2700 List *mergeclauses,
2701 Path *outer_path, Path *inner_path,
2702 List *outersortkeys, List *innersortkeys,
2703 JoinPathExtraData *extra)
2704{
2705 Cost startup_cost = 0;
2706 Cost run_cost = 0;
2707 double outer_path_rows = outer_path->rows;
2708 double inner_path_rows = inner_path->rows;
2709 Cost inner_run_cost;
2710 double outer_rows,
2711 inner_rows,
2712 outer_skip_rows,
2713 inner_skip_rows;
2714 Selectivity outerstartsel,
2715 outerendsel,
2716 innerstartsel,
2717 innerendsel;
2718 Path sort_path; /* dummy for result of cost_sort */
2719
2720 /* Protect some assumptions below that rowcounts aren't zero or NaN */
2721 if (outer_path_rows <= 0 || isnan(outer_path_rows))
2722 outer_path_rows = 1;
2723 if (inner_path_rows <= 0 || isnan(inner_path_rows))
2724 inner_path_rows = 1;
2725
2726 /*
2727 * A merge join will stop as soon as it exhausts either input stream
2728 * (unless it's an outer join, in which case the outer side has to be
2729 * scanned all the way anyway). Estimate fraction of the left and right
2730 * inputs that will actually need to be scanned. Likewise, we can
2731 * estimate the number of rows that will be skipped before the first join
2732 * pair is found, which should be factored into startup cost. We use only
2733 * the first (most significant) merge clause for this purpose. Since
2734 * mergejoinscansel() is a fairly expensive computation, we cache the
2735 * results in the merge clause RestrictInfo.
2736 */
2737 if (mergeclauses && jointype != JOIN_FULL)
2738 {
2739 RestrictInfo *firstclause = (RestrictInfo *) linitial(mergeclauses);
2740 List *opathkeys;
2741 List *ipathkeys;
2742 PathKey *opathkey;
2743 PathKey *ipathkey;
2744 MergeScanSelCache *cache;
2745
2746 /* Get the input pathkeys to determine the sort-order details */
2747 opathkeys = outersortkeys ? outersortkeys : outer_path->pathkeys;
2748 ipathkeys = innersortkeys ? innersortkeys : inner_path->pathkeys;
2749 Assert(opathkeys);
2750 Assert(ipathkeys);
2751 opathkey = (PathKey *) linitial(opathkeys);
2752 ipathkey = (PathKey *) linitial(ipathkeys);
2753 /* debugging check */
2754 if (opathkey->pk_opfamily != ipathkey->pk_opfamily ||
2755 opathkey->pk_eclass->ec_collation != ipathkey->pk_eclass->ec_collation ||
2756 opathkey->pk_strategy != ipathkey->pk_strategy ||
2757 opathkey->pk_nulls_first != ipathkey->pk_nulls_first)
2758 elog(ERROR, "left and right pathkeys do not match in mergejoin");
2759
2760 /* Get the selectivity with caching */
2761 cache = cached_scansel(root, firstclause, opathkey);
2762
2763 if (bms_is_subset(firstclause->left_relids,
2764 outer_path->parent->relids))
2765 {
2766 /* left side of clause is outer */
2767 outerstartsel = cache->leftstartsel;
2768 outerendsel = cache->leftendsel;
2769 innerstartsel = cache->rightstartsel;
2770 innerendsel = cache->rightendsel;
2771 }
2772 else
2773 {
2774 /* left side of clause is inner */
2775 outerstartsel = cache->rightstartsel;
2776 outerendsel = cache->rightendsel;
2777 innerstartsel = cache->leftstartsel;
2778 innerendsel = cache->leftendsel;
2779 }
2780 if (jointype == JOIN_LEFT ||
2781 jointype == JOIN_ANTI)
2782 {
2783 outerstartsel = 0.0;
2784 outerendsel = 1.0;
2785 }
2786 else if (jointype == JOIN_RIGHT)
2787 {
2788 innerstartsel = 0.0;
2789 innerendsel = 1.0;
2790 }
2791 }
2792 else
2793 {
2794 /* cope with clauseless or full mergejoin */
2795 outerstartsel = innerstartsel = 0.0;
2796 outerendsel = innerendsel = 1.0;
2797 }
2798
2799 /*
2800 * Convert selectivities to row counts. We force outer_rows and
2801 * inner_rows to be at least 1, but the skip_rows estimates can be zero.
2802 */
2803 outer_skip_rows = rint(outer_path_rows * outerstartsel);
2804 inner_skip_rows = rint(inner_path_rows * innerstartsel);
2805 outer_rows = clamp_row_est(outer_path_rows * outerendsel);
2806 inner_rows = clamp_row_est(inner_path_rows * innerendsel);
2807
2808 Assert(outer_skip_rows <= outer_rows);
2809 Assert(inner_skip_rows <= inner_rows);
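	/*
	 * For instance (hypothetical selectivities): with outer_path_rows = 100,
	 * outerstartsel = 0.1 and outerendsel = 0.6, about 10 outer rows are
	 * expected to be skipped before the first join pair (that 10% of the
	 * outer run cost is charged to startup below) and scanning stops after
	 * about 60 rows, so only 50% of the outer run cost ends up in run_cost.
	 */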
2810
2811 /*
2812 * Readjust scan selectivities to account for above rounding. This is
2813 * normally an insignificant effect, but when there are only a few rows in
2814 * the inputs, failing to do this makes for a large percentage error.
2815 */
2816 outerstartsel = outer_skip_rows / outer_path_rows;
2817 innerstartsel = inner_skip_rows / inner_path_rows;
2818 outerendsel = outer_rows / outer_path_rows;
2819 innerendsel = inner_rows / inner_path_rows;
2820
2821 Assert(outerstartsel <= outerendsel);
2822 Assert(innerstartsel <= innerendsel);
2823
2824 /* cost of source data */
2825
2826 if (outersortkeys) /* do we need to sort outer? */
2827 {
2828 cost_sort(&sort_path,
2829 root,
2830 outersortkeys,
2831 outer_path->total_cost,
2832 outer_path_rows,
2833 outer_path->pathtarget->width,
2834 0.0,
2835 work_mem,
2836 -1.0);
2837 startup_cost += sort_path.startup_cost;
2838 startup_cost += (sort_path.total_cost - sort_path.startup_cost)
2839 * outerstartsel;
2840 run_cost += (sort_path.total_cost - sort_path.startup_cost)
2841 * (outerendsel - outerstartsel);
2842 }
2843 else
2844 {
2845 startup_cost += outer_path->startup_cost;
2846 startup_cost += (outer_path->total_cost - outer_path->startup_cost)
2847 * outerstartsel;
2848 run_cost += (outer_path->total_cost - outer_path->startup_cost)
2849 * (outerendsel - outerstartsel);
2850 }
2851
2852 if (innersortkeys) /* do we need to sort inner? */
2853 {
2854 cost_sort(&sort_path,
2855 root,
2856 innersortkeys,
2857 inner_path->total_cost,
2858 inner_path_rows,
2859 inner_path->pathtarget->width,
2860 0.0,
2861 work_mem,
2862 -1.0);
2863 startup_cost += sort_path.startup_cost;
2864 startup_cost += (sort_path.total_cost - sort_path.startup_cost)
2865 * innerstartsel;
2866 inner_run_cost = (sort_path.total_cost - sort_path.startup_cost)
2867 * (innerendsel - innerstartsel);
2868 }
2869 else
2870 {
2871 startup_cost += inner_path->startup_cost;
2872 startup_cost += (inner_path->total_cost - inner_path->startup_cost)
2873 * innerstartsel;
2874 inner_run_cost = (inner_path->total_cost - inner_path->startup_cost)
2875 * (innerendsel - innerstartsel);
2876 }
2877
2878 /*
2879 * We can't yet determine whether rescanning occurs, or whether
2880 * materialization of the inner input should be done. The minimum
2881 * possible inner input cost, regardless of rescan and materialization
2882 * considerations, is inner_run_cost. We include that in
2883 * workspace->total_cost, but not yet in run_cost.
2884 */
2885
2886 /* CPU costs left for later */
2887
2888 /* Public result fields */
2889 workspace->startup_cost = startup_cost;
2890 workspace->total_cost = startup_cost + run_cost + inner_run_cost;
2891 /* Save private data for final_cost_mergejoin */
2892 workspace->run_cost = run_cost;
2893 workspace->inner_run_cost = inner_run_cost;
2894 workspace->outer_rows = outer_rows;
2895 workspace->inner_rows = inner_rows;
2896 workspace->outer_skip_rows = outer_skip_rows;
2897 workspace->inner_skip_rows = inner_skip_rows;
2898}
2899
2900/*
2901 * final_cost_mergejoin
2902 * Final estimate of the cost and result size of a mergejoin path.
2903 *
2904 * Unlike other costsize functions, this routine makes two actual decisions:
2905 * whether the executor will need to do mark/restore, and whether we should
2906 * materialize the inner path. It would be logically cleaner to build
2907 * separate paths testing these alternatives, but that would require repeating
2908 * most of the cost calculations, which are not all that cheap. Since the
2909 * choice will not affect output pathkeys or startup cost, only total cost,
2910 * there is no possibility of wanting to keep more than one path. So it seems
2911 * best to make the decisions here and record them in the path's
2912 * skip_mark_restore and materialize_inner fields.
2913 *
2914 * Mark/restore overhead is usually required, but can be skipped if we know
2915 * that the executor need find only one match per outer tuple, and that the
2916 * mergeclauses are sufficient to identify a match.
2917 *
2918 * We materialize the inner path if we need mark/restore and either the inner
2919 * path can't support mark/restore, or it's cheaper to use an interposed
2920 * Material node to handle mark/restore.
2921 *
2922 * 'path' is already filled in except for the rows and cost fields and
2923 * skip_mark_restore and materialize_inner
2924 * 'workspace' is the result from initial_cost_mergejoin
2925 * 'extra' contains miscellaneous information about the join
2926 */
2927void
2928final_cost_mergejoin(PlannerInfo *root, MergePath *path,
2929 JoinCostWorkspace *workspace,
2930 JoinPathExtraData *extra)
2931{
2932 Path *outer_path = path->jpath.outerjoinpath;
2933 Path *inner_path = path->jpath.innerjoinpath;
2934 double inner_path_rows = inner_path->rows;
2935 List *mergeclauses = path->path_mergeclauses;
2936 List *innersortkeys = path->innersortkeys;
2937 Cost startup_cost = workspace->startup_cost;
2938 Cost run_cost = workspace->run_cost;
2939 Cost inner_run_cost = workspace->inner_run_cost;
2940 double outer_rows = workspace->outer_rows;
2941 double inner_rows = workspace->inner_rows;
2942 double outer_skip_rows = workspace->outer_skip_rows;
2943 double inner_skip_rows = workspace->inner_skip_rows;
2944 Cost cpu_per_tuple,
2945 bare_inner_cost,
2946 mat_inner_cost;
2947 QualCost merge_qual_cost;
2948 QualCost qp_qual_cost;
2949 double mergejointuples,
2950 rescannedtuples;
2951 double rescanratio;
2952
2953 /* Protect some assumptions below that rowcounts aren't zero or NaN */
2954 if (inner_path_rows <= 0 || isnan(inner_path_rows))
2955 inner_path_rows = 1;
2956
2957 /* Mark the path with the correct row estimate */
2958 if (path->jpath.path.param_info)
2959 path->jpath.path.rows = path->jpath.path.param_info->ppi_rows;
2960 else
2961 path->jpath.path.rows = path->jpath.path.parent->rows;
2962
2963 /* For partial paths, scale row estimate. */
2964 if (path->jpath.path.parallel_workers > 0)
2965 {
2966 double parallel_divisor = get_parallel_divisor(&path->jpath.path);
2967
2968 path->jpath.path.rows =
2969 clamp_row_est(path->jpath.path.rows / parallel_divisor);
2970 }
2971
2972 /*
2973 * We could include disable_cost in the preliminary estimate, but that
2974 * would amount to optimizing for the case where the join method is
2975 * disabled, which doesn't seem like the way to bet.
2976 */
2977 if (!enable_mergejoin)
2978 startup_cost += disable_cost;
2979
2980 /*
2981 * Compute cost of the mergequals and qpquals (other restriction clauses)
2982 * separately.
2983 */
2984 cost_qual_eval(&merge_qual_cost, mergeclauses, root);
2985 cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root);
2986 qp_qual_cost.startup -= merge_qual_cost.startup;
2987 qp_qual_cost.per_tuple -= merge_qual_cost.per_tuple;
2988
2989 /*
2990 * With a SEMI or ANTI join, or if the innerrel is known unique, the
2991 * executor will stop scanning for matches after the first match. When
2992 * all the joinclauses are merge clauses, this means we don't ever need to
2993 * back up the merge, and so we can skip mark/restore overhead.
2994 */
2995 if ((path->jpath.jointype == JOIN_SEMI ||
2996 path->jpath.jointype == JOIN_ANTI ||
2997 extra->inner_unique) &&
2998 (list_length(path->jpath.joinrestrictinfo) ==
2999 list_length(path->path_mergeclauses)))
3000 path->skip_mark_restore = true;
3001 else
3002 path->skip_mark_restore = false;
3003
3004 /*
3005 * Get approx # tuples passing the mergequals. We use approx_tuple_count
3006 * here because we need an estimate done with JOIN_INNER semantics.
3007 */
3008 mergejointuples = approx_tuple_count(root, &path->jpath, mergeclauses);
3009
3010 /*
3011 * When there are equal merge keys in the outer relation, the mergejoin
3012 * must rescan any matching tuples in the inner relation. This means
3013 * re-fetching inner tuples; we have to estimate how often that happens.
3014 *
3015 * For regular inner and outer joins, the number of re-fetches can be
3016 * estimated approximately as size of merge join output minus size of
3017 * inner relation. Assume that the distinct key values are 1, 2, ..., and
3018 * denote the number of values of each key in the outer relation as m1,
3019 * m2, ...; in the inner relation, n1, n2, ... Then we have
3020 *
3021 * size of join = m1 * n1 + m2 * n2 + ...
3022 *
3023 * number of rescanned tuples = (m1 - 1) * n1 + (m2 - 1) * n2 + ... = m1 *
3024 * n1 + m2 * n2 + ... - (n1 + n2 + ...) = size of join - size of inner
3025 * relation
3026 *
3027 * This equation works correctly for outer tuples having no inner match
3028 * (nk = 0), but not for inner tuples having no outer match (mk = 0); we
3029 * are effectively subtracting those from the number of rescanned tuples,
3030 * when we should not. Can we do better without expensive selectivity
3031 * computations?
3032 *
3033 * The whole issue is moot if we are working from a unique-ified outer
3034 * input, or if we know we don't need to mark/restore at all.
3035 */
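	/*
	 * Small numeric illustration (made-up key frequencies): with outer
	 * multiplicities m = (2, 1) and inner multiplicities n = (3, 2), the
	 * join yields 2*3 + 1*2 = 8 tuples while the inner relation has 5, so
	 * we'd estimate 8 - 5 = 3 rescanned tuples, matching (2-1)*3 + (1-1)*2.
	 */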
3036 if (IsA(outer_path, UniquePath) || path->skip_mark_restore)
3037 rescannedtuples = 0;
3038 else
3039 {
3040 rescannedtuples = mergejointuples - inner_path_rows;
3041 /* Must clamp because of possible underestimate */
3042 if (rescannedtuples < 0)
3043 rescannedtuples = 0;
3044 }
3045
3046 /*
3047 * We'll inflate various costs this much to account for rescanning. Note
3048 * that this is to be multiplied by something involving inner_rows, or
3049 * another number related to the portion of the inner rel we'll scan.
3050 */
3051 rescanratio = 1.0 + (rescannedtuples / inner_rows);
3052
3053 /*
3054 * Decide whether we want to materialize the inner input to shield it from
3055 * mark/restore and performing re-fetches. Our cost model for regular
3056 * re-fetches is that a re-fetch costs the same as an original fetch,
3057 * which is probably an overestimate; but on the other hand we ignore the
3058 * bookkeeping costs of mark/restore. Not clear if it's worth developing
3059 * a more refined model. So we just need to inflate the inner run cost by
3060 * rescanratio.
3061 */
3062 bare_inner_cost = inner_run_cost * rescanratio;
3063
3064 /*
3065 * When we interpose a Material node the re-fetch cost is assumed to be
3066 * just cpu_operator_cost per tuple, independently of the underlying
3067 * plan's cost; and we charge an extra cpu_operator_cost per original
3068 * fetch as well. Note that we're assuming the materialize node will
3069 * never spill to disk, since it only has to remember tuples back to the
3070 * last mark. (If there are a huge number of duplicates, our other cost
3071 * factors will make the path so expensive that it probably won't get
3072 * chosen anyway.) So we don't use cost_rescan here.
3073 *
3074 * Note: keep this estimate in sync with create_mergejoin_plan's labeling
3075 * of the generated Material node.
3076 */
3077 mat_inner_cost = inner_run_cost +
3078 cpu_operator_cost * inner_rows * rescanratio;
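	/*
	 * To illustrate with invented numbers: if inner_run_cost = 1000,
	 * inner_rows = 10000 and rescanratio = 1.5, then bare_inner_cost = 1500
	 * while mat_inner_cost = 1000 + cpu_operator_cost * 15000, which at the
	 * default cpu_operator_cost of 0.0025 is about 1037, so materializing
	 * would look cheaper below.
	 */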
3079
3080 /*
3081 * If we don't need mark/restore at all, we don't need materialization.
3082 */
3083 if (path->skip_mark_restore)
3084 path->materialize_inner = false;
3085
3086 /*
3087 * Prefer materializing if it looks cheaper, unless the user has asked to
3088 * suppress materialization.
3089 */
3090 else if (enable_material && mat_inner_cost < bare_inner_cost)
3091 path->materialize_inner = true;
3092
3093 /*
3094 * Even if materializing doesn't look cheaper, we *must* do it if the
3095 * inner path is to be used directly (without sorting) and it doesn't
3096 * support mark/restore.
3097 *
3098 * Since the inner side must be ordered, and only Sorts and IndexScans can
3099 * create order to begin with, and they both support mark/restore, you
3100 * might think there's no problem --- but you'd be wrong. Nestloop and
3101 * merge joins can *preserve* the order of their inputs, so they can be
3102 * selected as the input of a mergejoin, and they don't support
3103 * mark/restore at present.
3104 *
3105 * We don't test the value of enable_material here, because
3106 * materialization is required for correctness in this case, and turning
3107 * it off does not entitle us to deliver an invalid plan.
3108 */
3109 else if (innersortkeys == NIL &&
3110 !ExecSupportsMarkRestore(inner_path))
3111 path->materialize_inner = true;
3112
3113 /*
3114 * Also, force materializing if the inner path is to be sorted and the
3115 * sort is expected to spill to disk. This is because the final merge
3116 * pass can be done on-the-fly if it doesn't have to support mark/restore.
3117 * We don't try to adjust the cost estimates for this consideration,
3118 * though.
3119 *
3120 * Since materialization is a performance optimization in this case,
3121 * rather than necessary for correctness, we skip it if enable_material is
3122 * off.
3123 */
3124 else if (enable_material && innersortkeys != NIL &&
3125 relation_byte_size(inner_path_rows,
3126 inner_path->pathtarget->width) >
3127 (work_mem * 1024L))
3128 path->materialize_inner = true;
3129 else
3130 path->materialize_inner = false;
3131
3132 /* Charge the right incremental cost for the chosen case */
3133 if (path->materialize_inner)
3134 run_cost += mat_inner_cost;
3135 else
3136 run_cost += bare_inner_cost;
3137
3138 /* CPU costs */
3139
3140 /*
3141 * The number of tuple comparisons needed is approximately number of outer
3142 * rows plus number of inner rows plus number of rescanned tuples (can we
3143 * refine this?). At each one, we need to evaluate the mergejoin quals.
3144 */
3145 startup_cost += merge_qual_cost.startup;
3146 startup_cost += merge_qual_cost.per_tuple *
3147 (outer_skip_rows + inner_skip_rows * rescanratio);
3148 run_cost += merge_qual_cost.per_tuple *
3149 ((outer_rows - outer_skip_rows) +
3150 (inner_rows - inner_skip_rows) * rescanratio);
3151
3152 /*
3153 * For each tuple that gets through the mergejoin proper, we charge
3154 * cpu_tuple_cost plus the cost of evaluating additional restriction
3155 * clauses that are to be applied at the join. (This is pessimistic since
3156 * not all of the quals may get evaluated at each tuple.)
3157 *
3158 * Note: we could adjust for SEMI/ANTI joins skipping some qual
3159 * evaluations here, but it's probably not worth the trouble.
3160 */
3161 startup_cost += qp_qual_cost.startup;
3162 cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple;
3163 run_cost += cpu_per_tuple * mergejointuples;
3164
3165 /* tlist eval costs are paid per output row, not per tuple scanned */
3166 startup_cost += path->jpath.path.pathtarget->cost.startup;
3167 run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows;
3168
3169 path->jpath.path.startup_cost = startup_cost;
3170 path->jpath.path.total_cost = startup_cost + run_cost;
3171}
3172
3173/*
3174 * run mergejoinscansel() with caching
3175 */
3176static MergeScanSelCache *
3177cached_scansel(PlannerInfo *root, RestrictInfo *rinfo, PathKey *pathkey)
3178{
3179 MergeScanSelCache *cache;
3180 ListCell *lc;
3181 Selectivity leftstartsel,
3182 leftendsel,
3183 rightstartsel,
3184 rightendsel;
3185 MemoryContext oldcontext;
3186
3187 /* Do we have this result already? */
3188 foreach(lc, rinfo->scansel_cache)
3189 {
3190 cache = (MergeScanSelCache *) lfirst(lc);
3191 if (cache->opfamily == pathkey->pk_opfamily &&
3192 cache->collation == pathkey->pk_eclass->ec_collation &&
3193 cache->strategy == pathkey->pk_strategy &&
3194 cache->nulls_first == pathkey->pk_nulls_first)
3195 return cache;
3196 }
3197
3198 /* Nope, do the computation */
3199 mergejoinscansel(root,
3200 (Node *) rinfo->clause,
3201 pathkey->pk_opfamily,
3202 pathkey->pk_strategy,
3203 pathkey->pk_nulls_first,
3204 &leftstartsel,
3205 &leftendsel,
3206 &rightstartsel,
3207 &rightendsel);
3208
3209 /* Cache the result in suitably long-lived workspace */
3210 oldcontext = MemoryContextSwitchTo(root->planner_cxt);
3211
3212 cache = (MergeScanSelCache *) palloc(sizeof(MergeScanSelCache));
3213 cache->opfamily = pathkey->pk_opfamily;
3214 cache->collation = pathkey->pk_eclass->ec_collation;
3215 cache->strategy = pathkey->pk_strategy;
3216 cache->nulls_first = pathkey->pk_nulls_first;
3217 cache->leftstartsel = leftstartsel;
3218 cache->leftendsel = leftendsel;
3219 cache->rightstartsel = rightstartsel;
3220 cache->rightendsel = rightendsel;
3221
3222 rinfo->scansel_cache = lappend(rinfo->scansel_cache, cache);
3223
3224 MemoryContextSwitchTo(oldcontext);
3225
3226 return cache;
3227}
3228
3229/*
3230 * initial_cost_hashjoin
3231 * Preliminary estimate of the cost of a hashjoin path.
3232 *
3233 * This must quickly produce lower-bound estimates of the path's startup and
3234 * total costs. If we are unable to eliminate the proposed path from
3235 * consideration using the lower bounds, final_cost_hashjoin will be called
3236 * to obtain the final estimates.
3237 *
3238 * The exact division of labor between this function and final_cost_hashjoin
3239 * is private to them, and represents a tradeoff between speed of the initial
3240 * estimate and getting a tight lower bound. We choose not to examine the
3241 * join quals here (other than by counting the number of hash clauses),
3242 * so we can't do much with CPU costs. We do assume that
3243 * ExecChooseHashTableSize is cheap enough to use here.
3244 *
3245 * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
3246 * other data to be used by final_cost_hashjoin
3247 * 'jointype' is the type of join to be performed
3248 * 'hashclauses' is the list of joinclauses to be used as hash clauses
3249 * 'outer_path' is the outer input to the join
3250 * 'inner_path' is the inner input to the join
3251 * 'extra' contains miscellaneous information about the join
3252 * 'parallel_hash' indicates that inner_path is partial and that a shared
3253 * hash table will be built in parallel
3254 */
3255void
3256initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace,
3257 JoinType jointype,
3258 List *hashclauses,
3259 Path *outer_path, Path *inner_path,
3260 JoinPathExtraData *extra,
3261 bool parallel_hash)
3262{
3263 Cost startup_cost = 0;
3264 Cost run_cost = 0;
3265 double outer_path_rows = outer_path->rows;
3266 double inner_path_rows = inner_path->rows;
3267 double inner_path_rows_total = inner_path_rows;
3268 int num_hashclauses = list_length(hashclauses);
3269 int numbuckets;
3270 int numbatches;
3271 int num_skew_mcvs;
3272 size_t space_allowed; /* unused */
3273
3274 /* cost of source data */
3275 startup_cost += outer_path->startup_cost;
3276 run_cost += outer_path->total_cost - outer_path->startup_cost;
3277 startup_cost += inner_path->total_cost;
3278
3279 /*
3280 * Cost of computing hash function: must do it once per input tuple. We
3281 * charge one cpu_operator_cost for each column's hash function. Also,
3282 * tack on one cpu_tuple_cost per inner row, to model the costs of
3283 * inserting the row into the hashtable.
3284 *
3285 * XXX when a hashclause is more complex than a single operator, we really
3286 * should charge the extra eval costs of the left or right side, as
3287 * appropriate, here. This seems more work than it's worth at the moment.
3288 */
3289 startup_cost += (cpu_operator_cost * num_hashclauses + cpu_tuple_cost)
3290 * inner_path_rows;
3291 run_cost += cpu_operator_cost * num_hashclauses * outer_path_rows;
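	/*
	 * For illustration only (hypothetical inputs; default cpu_operator_cost
	 * of 0.0025 and cpu_tuple_cost of 0.01): with 2 hash clauses,
	 * inner_path_rows = 10000 and outer_path_rows = 100000, startup_cost
	 * gains (0.0025 * 2 + 0.01) * 10000 = 150 and run_cost gains
	 * 0.0025 * 2 * 100000 = 500.
	 */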
3292
3293 /*
3294 * If this is a parallel hash build, then the value we have for
3295 * inner_rows_total currently refers only to the rows returned by each
3296 * participant. For shared hash table size estimation, we need the total
3297 * number, so we need to undo the division.
3298 */
3299 if (parallel_hash)
3300 inner_path_rows_total *= get_parallel_divisor(inner_path);
3301
3302 /*
3303 * Get hash table size that executor would use for inner relation.
3304 *
3305 * XXX for the moment, always assume that skew optimization will be
3306 * performed. As long as SKEW_WORK_MEM_PERCENT is small, it's not worth
3307 * trying to determine that for sure.
3308 *
3309 * XXX at some point it might be interesting to try to account for skew
3310 * optimization in the cost estimate, but for now, we don't.
3311 */
3312 ExecChooseHashTableSize(inner_path_rows_total,
3313 inner_path->pathtarget->width,
3314 true, /* useskew */
3315 parallel_hash, /* try_combined_work_mem */
3316 outer_path->parallel_workers,
3317 &space_allowed,
3318 &numbuckets,
3319 &numbatches,
3320 &num_skew_mcvs);
3321
3322 /*
3323 * If inner relation is too big then we will need to "batch" the join,
3324 * which implies writing and reading most of the tuples to disk an extra
3325 * time. Charge seq_page_cost per page, since the I/O should be nice and
3326 * sequential. Writing the inner rel counts as startup cost, all the rest
3327 * as run cost.
3328 */
3329 if (numbatches > 1)
3330 {
3331 double outerpages = page_size(outer_path_rows,
3332 outer_path->pathtarget->width);
3333 double innerpages = page_size(inner_path_rows,
3334 inner_path->pathtarget->width);
3335
3336 startup_cost += seq_page_cost * innerpages;
3337 run_cost += seq_page_cost * (innerpages + 2 * outerpages);
3338 }
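	/*
	 * For illustration only (hypothetical inputs; default seq_page_cost of
	 * 1.0): with innerpages = 1000 and outerpages = 5000, batching adds
	 * 1000 to startup_cost and 1000 + 2 * 5000 = 11000 to run_cost.
	 */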
3339
3340 /* CPU costs left for later */
3341
3342 /* Public result fields */
3343 workspace->startup_cost = startup_cost;
3344 workspace->total_cost = startup_cost + run_cost;
3345 /* Save private data for final_cost_hashjoin */
3346 workspace->run_cost = run_cost;
3347 workspace->numbuckets = numbuckets;
3348 workspace->numbatches = numbatches;
3349 workspace->inner_rows_total = inner_path_rows_total;
3350}
3351
3352/*
3353 * final_cost_hashjoin
3354 * Final estimate of the cost and result size of a hashjoin path.
3355 *
3356 * Note: the numbatches estimate is also saved into 'path' for use later
3357 *
3358 * 'path' is already filled in except for the rows and cost fields and
3359 * num_batches
3360 * 'workspace' is the result from initial_cost_hashjoin
3361 * 'extra' contains miscellaneous information about the join
3362 */
3363void
3364final_cost_hashjoin(PlannerInfo *root, HashPath *path,
3365 JoinCostWorkspace *workspace,
3366 JoinPathExtraData *extra)
3367{
3368 Path *outer_path = path->jpath.outerjoinpath;
3369 Path *inner_path = path->jpath.innerjoinpath;
3370 double outer_path_rows = outer_path->rows;
3371 double inner_path_rows = inner_path->rows;
3372 double inner_path_rows_total = workspace->inner_rows_total;
3373 List *hashclauses = path->path_hashclauses;
3374 Cost startup_cost = workspace->startup_cost;
3375 Cost run_cost = workspace->run_cost;
3376 int numbuckets = workspace->numbuckets;
3377 int numbatches = workspace->numbatches;
3378 Cost cpu_per_tuple;
3379 QualCost hash_qual_cost;
3380 QualCost qp_qual_cost;
3381 double hashjointuples;
3382 double virtualbuckets;
3383 Selectivity innerbucketsize;
3384 Selectivity innermcvfreq;
3385 ListCell *hcl;
3386
3387 /* Mark the path with the correct row estimate */
3388 if (path->jpath.path.param_info)
3389 path->jpath.path.rows = path->jpath.path.param_info->ppi_rows;
3390 else
3391 path->jpath.path.rows = path->jpath.path.parent->rows;
3392
3393 /* For partial paths, scale row estimate. */
3394 if (path->jpath.path.parallel_workers > 0)
3395 {
3396 double parallel_divisor = get_parallel_divisor(&path->jpath.path);
3397
3398 path->jpath.path.rows =
3399 clamp_row_est(path->jpath.path.rows / parallel_divisor);
3400 }
3401
3402 /*
3403 * We could include disable_cost in the preliminary estimate, but that
3404 * would amount to optimizing for the case where the join method is
3405 * disabled, which doesn't seem like the way to bet.
3406 */
3407 if (!enable_hashjoin)
3408 startup_cost += disable_cost;
3409
3410 /* mark the path with estimated # of batches */
3411 path->num_batches = numbatches;
3412
3413 /* store the total number of tuples (sum of partial row estimates) */
3414 path->inner_rows_total = inner_path_rows_total;
3415
3416 /* and compute the number of "virtual" buckets in the whole join */
3417 virtualbuckets = (double) numbuckets * (double) numbatches;
3418
3419 /*
3420 * Determine bucketsize fraction and MCV frequency for the inner relation.
3421 * We use the smallest bucketsize or MCV frequency estimated for any
3422 * individual hashclause; this is undoubtedly conservative.
3423 *
3424 * BUT: if inner relation has been unique-ified, we can assume it's good
3425 * for hashing. This is important both because it's the right answer, and
3426 * because we avoid contaminating the cache with a value that's wrong for
3427 * non-unique-ified paths.
3428 */
3429 if (IsA(inner_path, UniquePath))
3430 {
3431 innerbucketsize = 1.0 / virtualbuckets;
3432 innermcvfreq = 0.0;
3433 }
3434 else
3435 {
3436 innerbucketsize = 1.0;
3437 innermcvfreq = 1.0;
3438 foreach(hcl, hashclauses)
3439 {
3440 RestrictInfo *restrictinfo = lfirst_node(RestrictInfo, hcl);
3441 Selectivity thisbucketsize;
3442 Selectivity thismcvfreq;
3443
3444 /*
3445 * First we have to figure out which side of the hashjoin clause
3446 * is the inner side.
3447 *
3448 * Since we tend to visit the same clauses over and over when
3449 * planning a large query, we cache the bucket stats estimates in
3450 * the RestrictInfo node to avoid repeated lookups of statistics.
3451 */
3452 if (bms_is_subset(restrictinfo->right_relids,
3453 inner_path->parent->relids))
3454 {
3455 /* righthand side is inner */
3456 thisbucketsize = restrictinfo->right_bucketsize;
3457 if (thisbucketsize < 0)
3458 {
3459 /* not cached yet */
3460 estimate_hash_bucket_stats(root,
3461 get_rightop(restrictinfo->clause),
3462 virtualbuckets,
3463 &restrictinfo->right_mcvfreq,
3464 &restrictinfo->right_bucketsize);
3465 thisbucketsize = restrictinfo->right_bucketsize;
3466 }
3467 thismcvfreq = restrictinfo->right_mcvfreq;
3468 }
3469 else
3470 {
3471 Assert(bms_is_subset(restrictinfo->left_relids,
3472 inner_path->parent->relids));
3473 /* lefthand side is inner */
3474 thisbucketsize = restrictinfo->left_bucketsize;
3475 if (thisbucketsize < 0)
3476 {
3477 /* not cached yet */
3478 estimate_hash_bucket_stats(root,
3479 get_leftop(restrictinfo->clause),
3480 virtualbuckets,
3481 &restrictinfo->left_mcvfreq,
3482 &restrictinfo->left_bucketsize);
3483 thisbucketsize = restrictinfo->left_bucketsize;
3484 }
3485 thismcvfreq = restrictinfo->left_mcvfreq;
3486 }
3487
3488 if (innerbucketsize > thisbucketsize)
3489 innerbucketsize = thisbucketsize;
3490 if (innermcvfreq > thismcvfreq)
3491 innermcvfreq = thismcvfreq;
3492 }
3493 }
3494
3495 /*
3496 * If the bucket holding the inner MCV would exceed work_mem, we don't
3497 * want to hash unless there is really no other alternative, so apply
3498 * disable_cost. (The executor normally copes with excessive memory usage
3499 * by splitting batches, but obviously it cannot separate equal values
3500 * that way, so it will be unable to drive the batch size below work_mem
3501 * when this is true.)
3502 */
3503 if (relation_byte_size(clamp_row_est(inner_path_rows * innermcvfreq),
3504 inner_path->pathtarget->width) >
3505 (work_mem * 1024L))
3506 startup_cost += disable_cost;
3507
3508 /*
3509 * Compute cost of the hashquals and qpquals (other restriction clauses)
3510 * separately.
3511 */
3512 cost_qual_eval(&hash_qual_cost, hashclauses, root);
3513 cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root);
3514 qp_qual_cost.startup -= hash_qual_cost.startup;
3515 qp_qual_cost.per_tuple -= hash_qual_cost.per_tuple;
3516
3517 /* CPU costs */
3518
3519 if (path->jpath.jointype == JOIN_SEMI ||
3520 path->jpath.jointype == JOIN_ANTI ||
3521 extra->inner_unique)
3522 {
3523 double outer_matched_rows;
3524 Selectivity inner_scan_frac;
3525
3526 /*
3527 * With a SEMI or ANTI join, or if the innerrel is known unique, the
3528 * executor will stop after the first match.
3529 *
3530 * For an outer-rel row that has at least one match, we can expect the
3531 * bucket scan to stop after a fraction 1/(match_count+1) of the
3532 * bucket's rows, if the matches are evenly distributed. Since they
3533 * probably aren't quite evenly distributed, we apply a fuzz factor of
3534 * 2.0 to that fraction. (If we used a larger fuzz factor, we'd have
3535 * to clamp inner_scan_frac to at most 1.0; but since match_count is
3536 * at least 1, no such clamp is needed now.)
3537 */
3538 outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac);
3539 inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0);
3540
3541 startup_cost += hash_qual_cost.startup;
3542 run_cost += hash_qual_cost.per_tuple * outer_matched_rows *
3543 clamp_row_est(inner_path_rows * innerbucketsize * inner_scan_frac) * 0.5;
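		/*
		 * For illustration only (hypothetical inputs): with match_count = 3,
		 * inner_scan_frac = 2.0 / 4.0 = 0.5.  Then with
		 * hash_qual_cost.per_tuple = 0.0025, outer_matched_rows = 1000,
		 * inner_path_rows = 10000 and innerbucketsize = 0.01, run_cost gains
		 * 0.0025 * 1000 * clamp_row_est(10000 * 0.01 * 0.5) * 0.5 = 62.5.
		 */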
3544
3545 /*
3546 * For unmatched outer-rel rows, the picture is quite a lot different.
3547 * In the first place, there is no reason to assume that these rows
3548 * preferentially hit heavily-populated buckets; instead assume they
3549 * are uncorrelated with the inner distribution and so they see an
3550 * average bucket size of inner_path_rows / virtualbuckets. In the
3551 * second place, it seems likely that they will have few if any exact
3552 * hash-code matches and so very few of the tuples in the bucket will
3553 * actually require eval of the hash quals. We don't have any good
3554 * way to estimate how many will, but for the moment assume that the
3555 * effective cost per bucket entry is one-tenth what it is for
3556 * matchable tuples.
3557 */
3558 run_cost += hash_qual_cost.per_tuple *
3559 (outer_path_rows - outer_matched_rows) *
3560 clamp_row_est(inner_path_rows / virtualbuckets) * 0.05;
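		/*
		 * For illustration only (hypothetical inputs): continuing the example
		 * above with outer_path_rows = 2000 and virtualbuckets = 1024, each of
		 * the 1000 unmatched outer rows sees about 10000 / 1024, i.e. roughly
		 * 10 bucket entries, so run_cost gains about
		 * 0.0025 * 1000 * 10 * 0.05 = 1.25.
		 */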
3561
3562 /* Get # of tuples that will pass the basic join */
3563 if (path->jpath.jointype == JOIN_ANTI)
3564 hashjointuples = outer_path_rows - outer_matched_rows;
3565 else
3566 hashjointuples = outer_matched_rows;
3567 }
3568 else
3569 {
3570 /*
3571 * The number of tuple comparisons needed is the number of outer
3572 * tuples times the typical number of tuples in a hash bucket, which
3573 * is the inner relation size times its bucketsize fraction. At each
3574 * one, we need to evaluate the hashjoin quals. But actually,
3575 * charging the full qual eval cost at each tuple is pessimistic,
3576 * since we don't evaluate the quals unless the hash values match
3577 * exactly. For lack of a better idea, halve the cost estimate to
3578 * allow for that.
3579 */
3580 startup_cost += hash_qual_cost.startup;
3581 run_cost += hash_qual_cost.per_tuple * outer_path_rows *
3582 clamp_row_est(inner_path_rows * innerbucketsize) * 0.5;
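		/*
		 * For illustration only (hypothetical inputs): with
		 * hash_qual_cost.per_tuple = 0.0025, outer_path_rows = 100000,
		 * inner_path_rows = 10000 and innerbucketsize = 0.001, run_cost gains
		 * 0.0025 * 100000 * clamp_row_est(10000 * 0.001) * 0.5 = 1250.
		 */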
3583
3584 /*
3585 * Get approx # tuples passing the hashquals. We use
3586 * approx_tuple_count here because we need an estimate done with
3587 * JOIN_INNER semantics.
3588 */
3589 hashjointuples = approx_tuple_count(root, &path->jpath, hashclauses);
3590 }
3591
3592 /*
3593 * For each tuple that gets through the hashjoin proper, we charge
3594 * cpu_tuple_cost plus the cost of evaluating additional restriction
3595 * clauses that are to be applied at the join. (This is pessimistic since
3596 * not all of the quals may get evaluated at each tuple.)
3597 */
3598 startup_cost += qp_qual_cost.startup;
3599 cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple;
3600 run_cost += cpu_per_tuple * hashjointuples;
3601
3602 /* tlist eval costs are paid per output row, not per tuple scanned */
3603 startup_cost += path->jpath.path.pathtarget->cost.startup;
3604 run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows;
3605
3606 path->jpath.path.startup_cost = startup_cost;
3607 path->jpath.path.total_cost = startup_cost + run_cost;
3608}
3609
3610
3611/*
3612 * cost_subplan
3613 * Figure the costs for a SubPlan (or initplan).
3614 *
3615 * Note: we could dig the subplan's Plan out of the root list, but in practice
3616 * all callers have it handy already, so we make them pass it.
3617 */
3618void
3619cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan)
3620{
3621 QualCost sp_cost;
3622
3623 /* Figure any cost for evaluating the testexpr */
3624 cost_qual_eval(&sp_cost,
3625 make_ands_implicit((Expr *) subplan->testexpr),
3626 root);
3627
3628 if (subplan->useHashTable)
3629 {
3630 /*
3631 * If we are using a hash table for the subquery outputs, then the
3632 * cost of evaluating the query is a one-time cost. We charge one
3633 * cpu_operator_cost per tuple for the work of loading the hashtable,
3634 * too.
3635 */
3636 sp_cost.startup += plan->total_cost +
3637 cpu_operator_cost * plan->plan_rows;
3638
3639 /*
3640 * The per-tuple costs include the cost of evaluating the lefthand
3641 * expressions, plus the cost of probing the hashtable. We already
3642 * accounted for the lefthand expressions as part of the testexpr, and
3643 * will also have counted one cpu_operator_cost for each comparison
3644 * operator. That is probably too low for the probing cost, but it's
3645 * hard to make a better estimate, so live with it for now.
3646 */
3647 }
3648 else
3649 {
3650 /*
3651 * Otherwise we will be rescanning the subplan output on each
3652 * evaluation. We need to estimate how much of the output we will
3653 * actually need to scan. NOTE: this logic should agree with the
3654 * tuple_fraction estimates used by make_subplan() in
3655 * plan/subselect.c.
3656 */
3657 Cost plan_run_cost = plan->total_cost - plan->startup_cost;
3658
3659 if (subplan->subLinkType == EXISTS_SUBLINK)
3660 {
3661 /* we only need to fetch 1 tuple; clamp to avoid zero divide */
3662 sp_cost.per_tuple += plan_run_cost / clamp_row_est(plan->plan_rows);
3663 }
3664 else if (subplan->subLinkType == ALL_SUBLINK ||
3665 subplan->subLinkType == ANY_SUBLINK)
3666 {
3667 /* assume we need 50% of the tuples */
3668 sp_cost.per_tuple += 0.50 * plan_run_cost;
3669 /* also charge a cpu_operator_cost per row examined */
3670 sp_cost.per_tuple += 0.50 * plan->plan_rows * cpu_operator_cost;
3671 }
3672 else
3673 {
3674 /* assume we need all tuples */
3675 sp_cost.per_tuple += plan_run_cost;
3676 }
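		/*
		 * For illustration only (hypothetical inputs; default
		 * cpu_operator_cost of 0.0025): with plan_run_cost = 200 and
		 * plan_rows = 1000, an EXISTS sublink adds 200 / 1000 = 0.2 per call,
		 * an ANY/ALL sublink adds 0.5 * 200 + 0.5 * 1000 * 0.0025 = 101.25
		 * per call, and other sublink types add the full 200 per call.
		 */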
3677
3678 /*
3679 * Also account for subplan's startup cost. If the subplan is
3680 * uncorrelated or undirect correlated, AND its topmost node is one
3681 * that materializes its output, assume that we'll only need to pay
3682 * its startup cost once; otherwise assume we pay the startup cost
3683 * every time.
3684 */
3685 if (subplan->parParam == NIL &&
3686 ExecMaterializesOutput(nodeTag(plan)))
3687 sp_cost.startup += plan->startup_cost;
3688 else
3689 sp_cost.per_tuple += plan->startup_cost;
3690 }
3691
3692 subplan->startup_cost = sp_cost.startup;
3693 subplan->per_call_cost = sp_cost.per_tuple;
3694}
3695
3696
3697/*
3698 * cost_rescan
3699 * Given a finished Path, estimate the costs of rescanning it after
3700 * having done so the first time. For some Path types a rescan is
3701 * cheaper than an original scan (if no parameters change), and this
3702 * function embodies knowledge about that. The default is to return
3703 * the same costs stored in the Path. (Note that the cost estimates
3704 * actually stored in Paths are always for first scans.)
3705 *
3706 * This function is not currently intended to model effects such as rescans
3707 * being cheaper due to disk block caching; what we are concerned with is
3708 * plan types wherein the executor caches results explicitly, or doesn't
3709 * redo startup calculations, etc.
3710 */
3711static void
3712cost_rescan(PlannerInfo *root, Path *path,
3713 Cost *rescan_startup_cost, /* output parameters */
3714 Cost *rescan_total_cost)
3715{
3716 switch (path->pathtype)
3717 {
3718 case T_FunctionScan:
3719
3720 /*
3721 * Currently, nodeFunctionscan.c always executes the function to
3722 * completion before returning any rows, and caches the results in
3723 * a tuplestore. So the function eval cost is all startup cost
3724 * and isn't paid over again on rescans. However, all run costs
3725 * will be paid over again.
3726 */
3727 *rescan_startup_cost = 0;
3728 *rescan_total_cost = path->total_cost - path->startup_cost;
3729 break;
3730 case T_HashJoin:
3731
3732 /*
3733 * If it's a single-batch join, we don't need to rebuild the hash
3734 * table during a rescan.
3735 */
3736 if (((HashPath *) path)->num_batches == 1)
3737 {
3738 /* Startup cost is exactly the cost of hash table building */
3739 *rescan_startup_cost = 0;
3740 *rescan_total_cost = path->total_cost - path->startup_cost;
3741 }
3742 else
3743 {
3744 /* Otherwise, no special treatment */
3745 *rescan_startup_cost = path->startup_cost;
3746 *rescan_total_cost = path->total_cost;
3747 }
3748 break;
3749 case T_CteScan:
3750 case T_WorkTableScan:
3751 {
3752 /*
3753 * These plan types materialize their final result in a
3754 * tuplestore or tuplesort object. So the rescan cost is only
3755 * cpu_tuple_cost per tuple, unless the result is large enough
3756 * to spill to disk.
3757 */
3758 Cost run_cost = cpu_tuple_cost * path->rows;
3759 double nbytes = relation_byte_size(path->rows,
3760 path->pathtarget->width);
3761 long work_mem_bytes = work_mem * 1024L;
3762
3763 if (nbytes > work_mem_bytes)
3764 {
3765 /* It will spill, so account for re-read cost */
3766 double npages = ceil(nbytes / BLCKSZ);
3767
3768 run_cost += seq_page_cost * npages;
3769 }
3770 *rescan_startup_cost = 0;
3771 *rescan_total_cost = run_cost;
3772 }
3773 break;
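			/*
			 * For illustration only (hypothetical inputs; default 8 kB
			 * BLCKSZ, seq_page_cost 1.0, cpu_tuple_cost 0.01): if a CTE
			 * result of 1 million rows works out to about 80 MB while
			 * work_mem is only 4 MB, it spills, so npages =
			 * ceil(80 MB / 8 kB) = 10240 and the rescan total cost is about
			 * 0.01 * 1000000 + 10240 = 20240.
			 */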
3774 case T_Material:
3775 case T_Sort:
3776 {
3777 /*
3778 * These plan types not only materialize their results, but do
3779 * not implement qual filtering or projection. So they are
3780 * even cheaper to rescan than the ones above. We charge only
3781 * cpu_operator_cost per tuple. (Note: keep that in sync with
3782 * the run_cost charge in cost_sort, and also see comments in
3783 * cost_material before you change it.)
3784 */
3785 Cost run_cost = cpu_operator_cost * path->rows;
3786 double nbytes = relation_byte_size(path->rows,
3787 path->pathtarget->width);
3788 long work_mem_bytes = work_mem * 1024L;
3789
3790 if (nbytes > work_mem_bytes)
3791 {
3792 /* It will spill, so account for re-read cost */
3793 double npages = ceil(nbytes / BLCKSZ);
3794
3795 run_cost += seq_page_cost * npages;
3796 }
3797 *rescan_startup_cost = 0;
3798 *rescan_total_cost = run_cost;
3799 }
3800 break;
3801 default:
3802 *rescan_startup_cost = path->startup_cost;
3803 *rescan_total_cost = path->total_cost;
3804 break;
3805 }
3806}
3807
3808
3809/*
3810 * cost_qual_eval
3811 * Estimate the CPU costs of evaluating a WHERE clause.
3812 * The input can be either an implicitly-ANDed list of boolean
3813 * expressions, or a list of RestrictInfo nodes. (The latter is
3814 * preferred since it allows caching of the results.)
3815 * The result includes both a one-time (startup) component,
3816 * and a per-evaluation component.
3817 */
3818void
3819cost_qual_eval(QualCost *cost, List *quals, PlannerInfo *root)
3820{
3821 cost_qual_eval_context context;
3822 ListCell *l;
3823
3824 context.root = root;
3825 context.total.startup = 0;
3826 context.total.per_tuple = 0;
3827
3828 /* We don't charge any cost for the implicit ANDing at top level ... */
3829
3830 foreach(l, quals)
3831 {
3832 Node *qual = (Node *) lfirst(l);
3833
3834 cost_qual_eval_walker(qual, &context);
3835 }
3836
3837 *cost = context.total;
3838}
3839
3840/*
3841 * cost_qual_eval_node
3842 * As above, for a single RestrictInfo or expression.
3843 */
3844void
3845cost_qual_eval_node(QualCost *cost, Node *qual, PlannerInfo *root)
3846{
3847 cost_qual_eval_context context;
3848
3849 context.root = root;
3850 context.total.startup = 0;
3851 context.total.per_tuple = 0;
3852
3853 cost_qual_eval_walker(qual, &context);
3854
3855 *cost = context.total;
3856}
3857
3858static bool
3859cost_qual_eval_walker(Node *node, cost_qual_eval_context *context)
3860{
3861 if (node == NULL)
3862 return false;
3863
3864 /*
3865 * RestrictInfo nodes contain an eval_cost field reserved for this
3866 * routine's use, so that it's not necessary to evaluate the qual clause's
3867 * cost more than once. If the clause's cost hasn't been computed yet,
3868 * the field's startup value will contain -1.
3869 */
3870 if (IsA(node, RestrictInfo))
3871 {
3872 RestrictInfo *rinfo = (RestrictInfo *) node;
3873
3874 if (rinfo->eval_cost.startup < 0)
3875 {
3876 cost_qual_eval_context locContext;
3877
3878 locContext.root = context->root;
3879 locContext.total.startup = 0;
3880 locContext.total.per_tuple = 0;
3881
3882 /*
3883 * For an OR clause, recurse into the marked-up tree so that we
3884 * set the eval_cost for contained RestrictInfos too.
3885 */
3886 if (rinfo->orclause)
3887 cost_qual_eval_walker((Node *) rinfo->orclause, &locContext);
3888 else
3889 cost_qual_eval_walker((Node *) rinfo->clause, &locContext);
3890
3891 /*
3892 * If the RestrictInfo is marked pseudoconstant, it will be tested
3893 * only once, so treat its cost as all startup cost.
3894 */
3895 if (rinfo->pseudoconstant)
3896 {
3897 /* count one execution during startup */
3898 locContext.total.startup += locContext.total.per_tuple;
3899 locContext.total.per_tuple = 0;
3900 }
3901 rinfo->eval_cost = locContext.total;
3902 }
3903 context->total.startup += rinfo->eval_cost.startup;
3904 context->total.per_tuple += rinfo->eval_cost.per_tuple;
3905 /* do NOT recurse into children */
3906 return false;
3907 }
3908
3909 /*
3910 * For each operator or function node in the given tree, we charge the
3911 * estimated execution cost given by pg_proc.procost (remember to multiply
3912 * this by cpu_operator_cost).
3913 *
3914 * Vars and Consts are charged zero, and so are boolean operators (AND,
3915 * OR, NOT). Simplistic, but a lot better than no model at all.
3916 *
3917 * Should we try to account for the possibility of short-circuit
3918 * evaluation of AND/OR? Probably *not*, because that would make the
3919 * results depend on the clause ordering, and we are not in any position
3920 * to expect that the current ordering of the clauses is the one that's
3921 * going to end up being used. The above per-RestrictInfo caching would
3922 * not mix well with trying to re-order clauses anyway.
3923 *
3924 * Another issue that is entirely ignored here is that if a set-returning
3925 * function is below top level in the tree, the functions/operators above
3926 * it will need to be evaluated multiple times. In practical use, such
3927 * cases arise so seldom as to not be worth the added complexity needed;
3928 * moreover, since our rowcount estimates for functions tend to be pretty
3929 * phony, the results of such refinement would be pretty unreliable anyway.
3930 */
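	/*
	 * For illustration only (using the default cpu_operator_cost of 0.0025):
	 * an operator or function with procost = 1 is charged 0.0025 per
	 * evaluation, while one with procost = 100 is charged 0.25.
	 */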
3931 if (IsA(node, FuncExpr))
3932 {
3933 add_function_cost(context->root, ((FuncExpr *) node)->funcid, node,
3934 &context->total);
3935 }
3936 else if (IsA(node, OpExpr) ||
3937 IsA(node, DistinctExpr) ||
3938 IsA(node, NullIfExpr))
3939 {
3940 /* rely on struct equivalence to treat these all alike */
3941 set_opfuncid((OpExpr *) node);
3942 add_function_cost(context->root, ((OpExpr *) node)->opfuncid, node,
3943 &context->total);
3944 }
3945 else if (IsA(node, ScalarArrayOpExpr))
3946 {
3947 /*
3948 * Estimate that the operator will be applied to about half of the
3949 * array elements before the answer is determined.
3950 */
3951 ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) node;
3952 Node *arraynode = (Node *) lsecond(saop->args);
3953 QualCost sacosts;
3954
3955 set_sa_opfuncid(saop);
3956 sacosts.startup = sacosts.per_tuple = 0;
3957 add_function_cost(context->root, saop->opfuncid, NULL,
3958 &sacosts);
3959 context->total.startup += sacosts.startup;
3960 context->total.per_tuple += sacosts.per_tuple *
3961 estimate_array_length(arraynode) * 0.5;
3962 }
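	/*
	 * For illustration only (hypothetical example; default cpu_operator_cost
	 * of 0.0025 and a procost-1 comparison operator): "x = ANY(<20-element
	 * array>)" is charged 0.0025 * 20 * 0.5 = 0.025 per tuple, i.e. the
	 * equivalent of 10 operator applications.
	 */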
3963 else if (IsA(node, Aggref) ||
3964 IsA(node, WindowFunc))
3965 {
3966 /*
3967 * Aggref and WindowFunc nodes are (and should be) treated like Vars,
3968 * ie, zero execution cost in the current model, because they behave
3969 * essentially like Vars at execution. We disregard the costs of
3970 * their input expressions for the same reason. The actual execution
3971 * costs of the aggregate/window functions and their arguments have to
3972 * be factored into plan-node-specific costing of the Agg or WindowAgg
3973 * plan node.
3974 */
3975 return false; /* don't recurse into children */
3976 }
3977 else if (IsA(node, CoerceViaIO))
3978 {
3979 CoerceViaIO *iocoerce = (CoerceViaIO *) node;
3980 Oid iofunc;
3981 Oid typioparam;
3982 bool typisvarlena;
3983
3984 /* check the result type's input function */
3985 getTypeInputInfo(iocoerce->resulttype,
3986 &iofunc, &typioparam);
3987 add_function_cost(context->root, iofunc, NULL,
3988 &context->total);
3989 /* check the input type's output function */
3990 getTypeOutputInfo(exprType((Node *) iocoerce->arg),
3991 &iofunc, &typisvarlena);
3992 add_function_cost(context->root, iofunc, NULL,
3993 &context->total);
3994 }
3995 else if (IsA(node, ArrayCoerceExpr))
3996 {
3997 ArrayCoerceExpr *acoerce = (ArrayCoerceExpr *) node;
3998 QualCost perelemcost;
3999
4000 cost_qual_eval_node(&perelemcost, (Node *) acoerce->elemexpr,
4001 context->root);
4002 context->total.startup += perelemcost.startup;
4003 if (perelemcost.per_tuple > 0)
4004 context->total.per_tuple += perelemcost.per_tuple *
4005 estimate_array_length((Node *) acoerce->arg);
4006 }
4007 else if (IsA(node, RowCompareExpr))
4008 {
4009 /* Conservatively assume we will check all the columns */
4010 RowCompareExpr *rcexpr = (RowCompareExpr *) node;
4011 ListCell *lc;
4012
4013 foreach(lc, rcexpr->opnos)
4014 {
4015 Oid opid = lfirst_oid(lc);
4016
4017 add_function_cost(context->root, get_opcode(opid), NULL,
4018 &context->total);
4019 }
4020 }
4021 else if (IsA(node, MinMaxExpr) ||
4022 IsA(node, SQLValueFunction) ||
4023 IsA(node, XmlExpr) ||
4024 IsA(node, CoerceToDomain) ||
4025 IsA(node, NextValueExpr))
4026 {
4027 /* Treat all these as having cost 1 */
4028 context->total.per_tuple += cpu_operator_cost;
4029 }
4030 else if (IsA(node, CurrentOfExpr))
4031 {
4032 /* Report high cost to prevent selection of anything but TID scan */
4033 context->total.startup += disable_cost;
4034 }
4035 else if (IsA(node, SubLink))
4036 {
4037 /* This routine should not be applied to un-planned expressions */
4038 elog(ERROR, "cannot handle unplanned sub-select");
4039 }
4040 else if (IsA(node, SubPlan))
4041 {
4042 /*
4043 * A subplan node in an expression typically indicates that the
4044 * subplan will be executed on each evaluation, so charge accordingly.
4045 * (Sub-selects that can be executed as InitPlans have already been
4046 * removed from the expression.)
4047 */
4048 SubPlan *subplan = (SubPlan *) node;
4049
4050 context->total.startup += subplan->startup_cost;
4051 context->total.per_tuple += subplan->per_call_cost;
4052
4053 /*
4054 * We don't want to recurse into the testexpr, because it was already
4055 * counted in the SubPlan node's costs. So we're done.
4056 */
4057 return false;
4058 }
4059 else if (IsA(node, AlternativeSubPlan))
4060 {
4061 /*
4062 * Arbitrarily use the first alternative plan for costing. (We should
4063 * certainly only include one alternative, and we don't yet have
4064 * enough information to know which one the executor is most likely to
4065 * use.)
4066 */
4067 AlternativeSubPlan *asplan = (AlternativeSubPlan *) node;
4068
4069 return cost_qual_eval_walker((Node *) linitial(asplan->subplans),
4070 context);
4071 }
4072 else if (IsA(node, PlaceHolderVar))
4073 {
4074 /*
4075 * A PlaceHolderVar should be given cost zero when considering general
4076 * expression evaluation costs. The expense of doing the contained
4077 * expression is charged as part of the tlist eval costs of the scan
4078 * or join where the PHV is first computed (see set_rel_width and
4079 * add_placeholders_to_joinrel). If we charged it again here, we'd be
4080 * double-counting the cost for each level of plan that the PHV
4081 * bubbles up through. Hence, return without recursing into the
4082 * phexpr.
4083 */
4084 return false;
4085 }
4086
4087 /* recurse into children */
4088 return expression_tree_walker(node, cost_qual_eval_walker,
4089 (void *) context);
4090}
4091
4092/*
4093 * get_restriction_qual_cost
4094 * Compute evaluation costs of a baserel's restriction quals, plus any
4095 * movable join quals that have been pushed down to the scan.
4096 * Results are returned into *qpqual_cost.
4097 *
4098 * This is a convenience subroutine that works for seqscans and other cases
4099 * where all the given quals will be evaluated the hard way. It's not useful
4100 * for cost_index(), for example, where the index machinery takes care of
4101 * some of the quals. We assume baserestrictcost was previously set by
4102 * set_baserel_size_estimates().
4103 */
4104static void
4105get_restriction_qual_cost(PlannerInfo *root, RelOptInfo *baserel,
4106 ParamPathInfo *param_info,
4107 QualCost *qpqual_cost)
4108{
4109 if (param_info)
4110 {
4111 /* Include costs of pushed-down clauses */
4112 cost_qual_eval(qpqual_cost, param_info->ppi_clauses, root);
4113
4114 qpqual_cost->startup += baserel->baserestrictcost.startup;
4115 qpqual_cost->per_tuple += baserel->baserestrictcost.per_tuple;
4116 }
4117 else
4118 *qpqual_cost = baserel->baserestrictcost;
4119}
4120
4121
4122/*
4123 * compute_semi_anti_join_factors
4124 * Estimate how much of the inner input a SEMI, ANTI, or inner_unique join
4125 * can be expected to scan.
4126 *
4127 * In a hash or nestloop SEMI/ANTI join, the executor will stop scanning
4128 * inner rows as soon as it finds a match to the current outer row.
4129 * The same happens if we have detected the inner rel is unique.
4130 * We should therefore adjust some of the cost components for this effect.
4131 * This function computes some estimates needed for these adjustments.
4132 * These estimates will be the same regardless of the particular paths used
4133 * for the outer and inner relation, so we compute these once and then pass
4134 * them to all the join cost estimation functions.
4135 *
4136 * Input parameters:
4137 * joinrel: join relation under consideration
4138 * outerrel: outer relation under consideration
4139 * innerrel: inner relation under consideration
4140 * jointype: if not JOIN_SEMI or JOIN_ANTI, we assume it's inner_unique
4141 * sjinfo: SpecialJoinInfo relevant to this join
4142 * restrictlist: join quals
4143 * Output parameters:
4144 * *semifactors is filled in (see pathnodes.h for field definitions)
4145 */
4146void
4147compute_semi_anti_join_factors(PlannerInfo *root,
4148 RelOptInfo *joinrel,
4149 RelOptInfo *outerrel,
4150 RelOptInfo *innerrel,
4151 JoinType jointype,
4152 SpecialJoinInfo *sjinfo,
4153 List *restrictlist,
4154 SemiAntiJoinFactors *semifactors)
4155{
4156 Selectivity jselec;
4157 Selectivity nselec;
4158 Selectivity avgmatch;
4159 SpecialJoinInfo norm_sjinfo;
4160 List *joinquals;
4161 ListCell *l;
4162
4163 /*
4164 * In an ANTI join, we must ignore clauses that are "pushed down", since
4165 * those won't affect the match logic. In a SEMI join, we do not
4166 * distinguish joinquals from "pushed down" quals, so just use the whole
4167 * restrictinfo list. For other outer join types, we should consider only
4168 * non-pushed-down quals, so that this devolves to an IS_OUTER_JOIN check.
4169 */
4170 if (IS_OUTER_JOIN(jointype))
4171 {
4172 joinquals = NIL;
4173 foreach(l, restrictlist)
4174 {
4175 RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
4176
4177 if (!RINFO_IS_PUSHED_DOWN(rinfo, joinrel->relids))
4178 joinquals = lappend(joinquals, rinfo);
4179 }
4180 }
4181 else
4182 joinquals = restrictlist;
4183
4184 /*
4185 * Get the JOIN_SEMI or JOIN_ANTI selectivity of the join clauses.
4186 */
4187 jselec = clauselist_selectivity(root,
4188 joinquals,
4189 0,
4190 (jointype == JOIN_ANTI) ? JOIN_ANTI : JOIN_SEMI,
4191 sjinfo);
4192
4193 /*
4194 * Also get the normal inner-join selectivity of the join clauses.
4195 */
4196 norm_sjinfo.type = T_SpecialJoinInfo;
4197 norm_sjinfo.min_lefthand = outerrel->relids;
4198 norm_sjinfo.min_righthand = innerrel->relids;
4199 norm_sjinfo.syn_lefthand = outerrel->relids;
4200 norm_sjinfo.syn_righthand = innerrel->relids;
4201 norm_sjinfo.jointype = JOIN_INNER;
4202 /* we don't bother trying to make the remaining fields valid */
4203 norm_sjinfo.lhs_strict = false;
4204 norm_sjinfo.delay_upper_joins = false;
4205 norm_sjinfo.semi_can_btree = false;
4206 norm_sjinfo.semi_can_hash = false;
4207 norm_sjinfo.semi_operators = NIL;
4208 norm_sjinfo.semi_rhs_exprs = NIL;
4209
4210 nselec = clauselist_selectivity(root,
4211 joinquals,
4212 0,
4213 JOIN_INNER,
4214 &norm_sjinfo);
4215
4216 /* Avoid leaking a lot of ListCells */
4217 if (IS_OUTER_JOIN(jointype))
4218 list_free(joinquals);
4219
4220 /*
4221 * jselec can be interpreted as the fraction of outer-rel rows that have
4222 * any matches (this is true for both SEMI and ANTI cases). And nselec is
4223 * the fraction of the Cartesian product that matches. So, the average
4224 * number of matches for each outer-rel row that has at least one match is
4225 * nselec * inner_rows / jselec.
4226 *
4227 * Note: it is correct to use the inner rel's "rows" count here, even
4228 * though we might later be considering a parameterized inner path with
4229 * fewer rows. This is because we have included all the join clauses in
4230 * the selectivity estimate.
4231 */
4232 if (jselec > 0) /* protect against zero divide */
4233 {
4234 avgmatch = nselec * innerrel->rows / jselec;
4235 /* Clamp to sane range */
4236 avgmatch = Max(1.0, avgmatch);
4237 }
4238 else
4239 avgmatch = 1.0;
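	/*
	 * For illustration only (hypothetical inputs): with jselec = 0.2,
	 * nselec = 0.001 and innerrel->rows = 5000, avgmatch =
	 * 0.001 * 5000 / 0.2 = 25, i.e. each outer row that has any match is
	 * expected to have about 25 matches.
	 */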
4240
4241 semifactors->outer_match_frac = jselec;
4242 semifactors->match_count = avgmatch;
4243}
4244
4245/*
4246 * has_indexed_join_quals
4247 * Check whether all the joinquals of a nestloop join are used as
4248 * inner index quals.
4249 *
4250 * If the inner path of a SEMI/ANTI join is an indexscan (including bitmap
4251 * indexscan) that uses all the joinquals as indexquals, we can assume that an
4252 * unmatched outer tuple is cheap to process, whereas otherwise it's probably
4253 * expensive.
4254 */
4255static bool
4256has_indexed_join_quals(NestPath *joinpath)
4257{
4258 Relids joinrelids = joinpath->path.parent->relids;
4259 Path *innerpath = joinpath->innerjoinpath;
4260 List *indexclauses;
4261 bool found_one;
4262 ListCell *lc;
4263
4264 /* If join still has quals to evaluate, it's not fast */
4265 if (joinpath->joinrestrictinfo != NIL)
4266 return false;
4267 /* Nor if the inner path isn't parameterized at all */
4268 if (innerpath->param_info == NULL)
4269 return false;
4270
4271 /* Find the indexclauses list for the inner scan */
4272 switch (innerpath->pathtype)
4273 {
4274 case T_IndexScan:
4275 case T_IndexOnlyScan:
4276 indexclauses = ((IndexPath *) innerpath)->indexclauses;
4277 break;
4278 case T_BitmapHeapScan:
4279 {
4280 /* Accept only a simple bitmap scan, not AND/OR cases */
4281 Path *bmqual = ((BitmapHeapPath *) innerpath)->bitmapqual;
4282
4283 if (IsA(bmqual, IndexPath))
4284 indexclauses = ((IndexPath *) bmqual)->indexclauses;
4285 else
4286 return false;
4287 break;
4288 }
4289 default:
4290
4291 /*
4292 * If it's not a simple indexscan, it probably doesn't run quickly
4293 * for zero rows out, even if it's a parameterized path using all
4294 * the joinquals.
4295 */
4296 return false;
4297 }
4298
4299 /*
4300 * Examine the inner path's param clauses. Any that are from the outer
4301 * path must be found in the indexclauses list, either exactly or in an
4302 * equivalent form generated by equivclass.c. Also, we must find at least
4303 * one such clause, else it's a clauseless join which isn't fast.
4304 */
4305 found_one = false;
4306 foreach(lc, innerpath->param_info->ppi_clauses)
4307 {
4308 RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc);
4309
4310 if (join_clause_is_movable_into(rinfo,
4311 innerpath->parent->relids,
4312 joinrelids))
4313 {
4314 if (!is_redundant_with_indexclauses(rinfo, indexclauses))
4315 return false;
4316 found_one = true;
4317 }
4318 }
4319 return found_one;
4320}
4321
4322
4323/*
4324 * approx_tuple_count
4325 * Quick-and-dirty estimation of the number of join rows passing
4326 * a set of qual conditions.
4327 *
4328 * The quals can be either an implicitly-ANDed list of boolean expressions,
4329 * or a list of RestrictInfo nodes (typically the latter).
4330 *
4331 * We intentionally compute the selectivity under JOIN_INNER rules, even
4332 * if it's some type of outer join. This is appropriate because we are
4333 * trying to figure out how many tuples pass the initial merge or hash
4334 * join step.
4335 *
4336 * This is quick-and-dirty because we bypass clauselist_selectivity, and
4337 * simply multiply the independent clause selectivities together. Now
4338 * clauselist_selectivity often can't do any better than that anyhow, but
4339 * for some situations (such as range constraints) it is smarter. However,
4340 * we can't effectively cache the results of clauselist_selectivity, whereas
4341 * the individual clause selectivities can be and are cached.
4342 *
4343 * Since we are only using the results to estimate how many potential
4344 * output tuples are generated and passed through qpqual checking, it
4345 * seems OK to live with the approximation.
4346 */
4347static double
4348approx_tuple_count(PlannerInfo *root, JoinPath *path, List *quals)
4349{
4350 double tuples;
4351 double outer_tuples = path->outerjoinpath->rows;
4352 double inner_tuples = path->innerjoinpath->rows;
4353 SpecialJoinInfo sjinfo;
4354 Selectivity selec = 1.0;
4355 ListCell *l;
4356
4357 /*
4358 * Make up a SpecialJoinInfo for JOIN_INNER semantics.
4359 */
4360 sjinfo.type = T_SpecialJoinInfo;
4361 sjinfo.min_lefthand = path->outerjoinpath->parent->relids;
4362 sjinfo.min_righthand = path->innerjoinpath->parent->relids;
4363 sjinfo.syn_lefthand = path->outerjoinpath->parent->relids;
4364 sjinfo.syn_righthand = path->innerjoinpath->parent->relids;
4365 sjinfo.jointype = JOIN_INNER;
4366 /* we don't bother trying to make the remaining fields valid */
4367 sjinfo.lhs_strict = false;
4368 sjinfo.delay_upper_joins = false;
4369 sjinfo.semi_can_btree = false;
4370 sjinfo.semi_can_hash = false;
4371 sjinfo.semi_operators = NIL;
4372 sjinfo.semi_rhs_exprs = NIL;
4373
4374 /* Get the approximate selectivity */
4375 foreach(l, quals)
4376 {
4377 Node *qual = (Node *) lfirst(l);
4378
4379 /* Note that clause_selectivity will be able to cache its result */
4380 selec *= clause_selectivity(root, qual, 0, JOIN_INNER, &sjinfo);
4381 }
4382
4383 /* Apply it to the input relation sizes */
4384 tuples = selec * outer_tuples * inner_tuples;
4385
4386 return clamp_row_est(tuples);
4387}
4388
4389
4390/*
4391 * set_baserel_size_estimates
4392 * Set the size estimates for the given base relation.
4393 *
4394 * The rel's targetlist and restrictinfo list must have been constructed
4395 * already, and rel->tuples must be set.
4396 *
4397 * We set the following fields of the rel node:
4398 * rows: the estimated number of output tuples (after applying
4399 * restriction clauses).
4400 * width: the estimated average output tuple width in bytes.
4401 * baserestrictcost: estimated cost of evaluating baserestrictinfo clauses.
4402 */
4403void
4404set_baserel_size_estimates(PlannerInfo *root, RelOptInfo *rel)
4405{
4406 double nrows;
4407
4408 /* Should only be applied to base relations */
4409 Assert(rel->relid > 0);
4410
4411 nrows = rel->tuples *
4412 clauselist_selectivity(root,
4413 rel->baserestrictinfo,
4414 0,
4415 JOIN_INNER,
4416 NULL);
4417
4418 rel->rows = clamp_row_est(nrows);
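	/*
	 * For illustration only (hypothetical inputs): with rel->tuples =
	 * 1000000 and a combined restriction selectivity of 0.05, rel->rows =
	 * clamp_row_est(50000) = 50000.
	 */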
4419
4420 cost_qual_eval(&rel->baserestrictcost, rel->baserestrictinfo, root);
4421
4422 set_rel_width(root, rel);
4423}
4424
4425/*
4426 * get_parameterized_baserel_size
4427 * Make a size estimate for a parameterized scan of a base relation.
4428 *
4429 * 'param_clauses' lists the additional join clauses to be used.
4430 *
4431 * set_baserel_size_estimates must have been applied already.
4432 */
4433double
4434get_parameterized_baserel_size(PlannerInfo *root, RelOptInfo *rel,
4435 List *param_clauses)
4436{
4437 List *allclauses;
4438 double nrows;
4439
4440 /*
4441 * Estimate the number of rows returned by the parameterized scan, knowing
4442 * that it will apply all the extra join clauses as well as the rel's own
4443 * restriction clauses. Note that we force the clauses to be treated as
4444 * non-join clauses during selectivity estimation.
4445 */
4446 allclauses = list_concat(list_copy(param_clauses),
4447 rel->baserestrictinfo);
4448 nrows = rel->tuples *
4449 clauselist_selectivity(root,
4450 allclauses,
4451 rel->relid, /* do not use 0! */
4452 JOIN_INNER,
4453 NULL);
4454 nrows = clamp_row_est(nrows);
4455 /* For safety, make sure result is not more than the base estimate */
4456 if (nrows > rel->rows)
4457 nrows = rel->rows;
4458 return nrows;
4459}
4460
4461/*
4462 * set_joinrel_size_estimates
4463 * Set the size estimates for the given join relation.
4464 *
4465 * The rel's targetlist must have been constructed already, and a
4466 * restriction clause list that matches the given component rels must
4467 * be provided.
4468 *
4469 * Since there is more than one way to make a joinrel for more than two
4470 * base relations, the results we get here could depend on which component
4471 * rel pair is provided. In theory we should get the same answers no matter
4472 * which pair is provided; in practice, since the selectivity estimation
4473 * routines don't handle all cases equally well, we might not. But there's
4474 * not much to be done about it. (Would it make sense to repeat the
4475 * calculations for each pair of input rels that's encountered, and somehow
4476 * average the results? Probably way more trouble than it's worth, and
4477 * anyway we must keep the rowcount estimate the same for all paths for the
4478 * joinrel.)
4479 *
4480 * We set only the rows field here. The reltarget field was already set by
4481 * build_joinrel_tlist, and baserestrictcost is not used for join rels.
4482 */
4483void
4484set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel,
4485 RelOptInfo *outer_rel,
4486 RelOptInfo *inner_rel,
4487 SpecialJoinInfo *sjinfo,
4488 List *restrictlist)
4489{
4490 rel->rows = calc_joinrel_size_estimate(root,
4491 rel,
4492 outer_rel,
4493 inner_rel,
4494 outer_rel->rows,
4495 inner_rel->rows,
4496 sjinfo,
4497 restrictlist);
4498}
4499
4500/*
4501 * get_parameterized_joinrel_size
4502 * Make a size estimate for a parameterized scan of a join relation.
4503 *
4504 * 'rel' is the joinrel under consideration.
4505 * 'outer_path', 'inner_path' are (probably also parameterized) Paths that
4506 * produce the relations being joined.
4507 * 'sjinfo' is any SpecialJoinInfo relevant to this join.
4508 * 'restrict_clauses' lists the join clauses that need to be applied at the
4509 * join node (including any movable clauses that were moved down to this join,
4510 * and not including any movable clauses that were pushed down into the
4511 * child paths).
4512 *
4513 * set_joinrel_size_estimates must have been applied already.
4514 */
4515double
4516get_parameterized_joinrel_size(PlannerInfo *root, RelOptInfo *rel,
4517 Path *outer_path,
4518 Path *inner_path,
4519 SpecialJoinInfo *sjinfo,
4520 List *restrict_clauses)
4521{
4522 double nrows;
4523
4524 /*
4525 * Estimate the number of rows returned by the parameterized join as the
4526 * sizes of the input paths times the selectivity of the clauses that have
4527 * ended up at this join node.
4528 *
4529 * As with set_joinrel_size_estimates, the rowcount estimate could depend
4530 * on the pair of input paths provided, though ideally we'd get the same
4531 * estimate for any pair with the same parameterization.
4532 */
4533 nrows = calc_joinrel_size_estimate(root,
4534 rel,
4535 outer_path->parent,
4536 inner_path->parent,
4537 outer_path->rows,
4538 inner_path->rows,
4539 sjinfo,
4540 restrict_clauses);
4541 /* For safety, make sure result is not more than the base estimate */
4542 if (nrows > rel->rows)
4543 nrows = rel->rows;
4544 return nrows;
4545}
4546
4547/*
4548 * calc_joinrel_size_estimate
4549 * Workhorse for set_joinrel_size_estimates and
4550 * get_parameterized_joinrel_size.
4551 *
4552 * outer_rel/inner_rel are the relations being joined, but they should be
4553 * assumed to have sizes outer_rows/inner_rows; those numbers might be less
4554 * than what rel->rows says, when we are considering parameterized paths.
4555 */
4556static double
4557calc_joinrel_size_estimate(PlannerInfo *root,
4558 RelOptInfo *joinrel,
4559 RelOptInfo *outer_rel,
4560 RelOptInfo *inner_rel,
4561 double outer_rows,
4562 double inner_rows,
4563 SpecialJoinInfo *sjinfo,
4564 List *restrictlist_in)
4565{
4566 /* This apparently-useless variable dodges a compiler bug in VS2013: */
4567 List *restrictlist = restrictlist_in;
4568 JoinType jointype = sjinfo->jointype;
4569 Selectivity fkselec;
4570 Selectivity jselec;
4571 Selectivity pselec;
4572 double nrows;
4573
4574 /*
4575 * Compute joinclause selectivity. Note that we are only considering
4576 * clauses that become restriction clauses at this join level; we are not
4577 * double-counting them because they were not considered in estimating the
4578 * sizes of the component rels.
4579 *
4580 * First, see whether any of the joinclauses can be matched to known FK
4581 * constraints. If so, drop those clauses from the restrictlist, and
4582 * instead estimate their selectivity using FK semantics. (We do this
4583 * without regard to whether said clauses are local or "pushed down".
4584 * Probably, an FK-matching clause could never be seen as pushed down at
4585 * an outer join, since it would be strict and hence would be grounds for
4586 * join strength reduction.) fkselec gets the net selectivity for
4587 * FK-matching clauses, or 1.0 if there are none.
4588 */
4589 fkselec = get_foreign_key_join_selectivity(root,
4590 outer_rel->relids,
4591 inner_rel->relids,
4592 sjinfo,
4593 &restrictlist);
4594
4595 /*
4596 * For an outer join, we have to distinguish the selectivity of the join's
4597 * own clauses (JOIN/ON conditions) from any clauses that were "pushed
4598 * down". For inner joins we just count them all as joinclauses.
4599 */
4600 if (IS_OUTER_JOIN(jointype))
4601 {
4602 List *joinquals = NIL;
4603 List *pushedquals = NIL;
4604 ListCell *l;
4605
4606 /* Grovel through the clauses to separate into two lists */
4607 foreach(l, restrictlist)
4608 {
4609 RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
4610
4611 if (RINFO_IS_PUSHED_DOWN(rinfo, joinrel->relids))
4612 pushedquals = lappend(pushedquals, rinfo);
4613 else
4614 joinquals = lappend(joinquals, rinfo);
4615 }
4616
4617 /* Get the separate selectivities */
4618 jselec = clauselist_selectivity(root,
4619 joinquals,
4620 0,
4621 jointype,
4622 sjinfo);
4623 pselec = clauselist_selectivity(root,
4624 pushedquals,
4625 0,
4626 jointype,
4627 sjinfo);
4628
4629 /* Avoid leaking a lot of ListCells */
4630 list_free(joinquals);
4631 list_free(pushedquals);
4632 }
4633 else
4634 {
4635 jselec = clauselist_selectivity(root,
4636 restrictlist,
4637 0,
4638 jointype,
4639 sjinfo);
4640 pselec = 0.0; /* not used, keep compiler quiet */
4641 }
4642
4643 /*
4644 * Basically, we multiply size of Cartesian product by selectivity.
4645 *
4646 * If we are doing an outer join, take that into account: the joinqual
4647 * selectivity has to be clamped using the knowledge that the output must
4648 * be at least as large as the non-nullable input. However, any
4649 * pushed-down quals are applied after the outer join, so their
4650 * selectivity applies fully.
4651 *
4652 * For JOIN_SEMI and JOIN_ANTI, the selectivity is defined as the fraction
4653 * of LHS rows that have matches, and we apply that straightforwardly.
4654 */
4655 switch (jointype)
4656 {
4657 case JOIN_INNER:
4658 nrows = outer_rows * inner_rows * fkselec * jselec;
4659 /* pselec not used */
4660 break;
4661 case JOIN_LEFT:
4662 nrows = outer_rows * inner_rows * fkselec * jselec;
4663 if (nrows < outer_rows)
4664 nrows = outer_rows;
4665 nrows *= pselec;
4666 break;
4667 case JOIN_FULL:
4668 nrows = outer_rows * inner_rows * fkselec * jselec;
4669 if (nrows < outer_rows)
4670 nrows = outer_rows;
4671 if (nrows < inner_rows)
4672 nrows = inner_rows;
4673 nrows *= pselec;
4674 break;
4675 case JOIN_SEMI:
4676 nrows = outer_rows * fkselec * jselec;
4677 /* pselec not used */
4678 break;
4679 case JOIN_ANTI:
4680 nrows = outer_rows * (1.0 - fkselec * jselec);
4681 nrows *= pselec;
4682 break;
4683 default:
4684 /* other values not expected here */
4685 elog(ERROR, "unrecognized join type: %d", (int) jointype);
4686 nrows = 0; /* keep compiler quiet */
4687 break;
4688 }
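	/*
	 * For illustration only (hypothetical inputs, fkselec = 1.0): with
	 * outer_rows = 1000, inner_rows = 5000 and jselec = 0.0002, JOIN_INNER
	 * yields 1000 * 5000 * 0.0002 = 1000 rows.  With jselec = 0.3, JOIN_SEMI
	 * yields 1000 * 0.3 = 300, and JOIN_ANTI yields 1000 * 0.7 = 700 before
	 * its pselec factor.  For JOIN_LEFT with jselec = 0.0001, the raw product
	 * is 500, which is clamped up to the 1000 outer rows before pselec is
	 * applied.
	 */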
4689
4690 return clamp_row_est(nrows);
4691}
4692
4693/*
4694 * get_foreign_key_join_selectivity
4695 * Estimate join selectivity for foreign-key-related clauses.
4696 *
4697 * Remove any clauses that can be matched to FK constraints from *restrictlist,
4698 * and return a substitute estimate of their selectivity. 1.0 is returned
4699 * when there are no such clauses.
4700 *
4701 * The reason for treating such clauses specially is that we can get better
4702 * estimates this way than by relying on clauselist_selectivity(), especially
4703 * for multi-column FKs where that function's assumption that the clauses are
4704 * independent falls down badly. But even with single-column FKs, we may be
4705 * able to get a better answer when the pg_statistic stats are missing or out
4706 * of date.
4707 */
4708static Selectivity
4709get_foreign_key_join_selectivity(PlannerInfo *root,
4710 Relids outer_relids,
4711 Relids inner_relids,
4712 SpecialJoinInfo *sjinfo,
4713 List **restrictlist)
4714{
4715 Selectivity fkselec = 1.0;
4716 JoinType jointype = sjinfo->jointype;
4717 List *worklist = *restrictlist;
4718 ListCell *lc;
4719
4720 /* Consider each FK constraint that is known to match the query */
4721 foreach(lc, root->fkey_list)
4722 {
4723 ForeignKeyOptInfo *fkinfo = (ForeignKeyOptInfo *) lfirst(lc);
4724 bool ref_is_outer;
4725 List *removedlist;
4726 ListCell *cell;
4727 ListCell *prev;
4728 ListCell *next;
4729
4730 /*
4731 * This FK is not relevant unless it connects a baserel on one side of
4732 * this join to a baserel on the other side.
4733 */
4734 if (bms_is_member(fkinfo->con_relid, outer_relids) &&
4735 bms_is_member(fkinfo->ref_relid, inner_relids))
4736 ref_is_outer = false;
4737 else if (bms_is_member(fkinfo->ref_relid, outer_relids) &&
4738 bms_is_member(fkinfo->con_relid, inner_relids))
4739 ref_is_outer = true;
4740 else
4741 continue;
4742
4743 /*
4744 * If we're dealing with a semi/anti join, and the FK's referenced
4745 * relation is on the outside, then knowledge of the FK doesn't help
4746 * us figure out what we need to know (which is the fraction of outer
4747 * rows that have matches). On the other hand, if the referenced rel
4748 * is on the inside, then all outer rows must have matches in the
4749 * referenced table (ignoring nulls). But any restriction or join
4750 * clauses that filter that table will reduce the fraction of matches.
4751 * We can account for restriction clauses, but it's too hard to guess
4752 * how many table rows would get through a join that's inside the RHS.
4753 * Hence, if either case applies, punt and ignore the FK.
4754 */
4755 if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI) &&
4756 (ref_is_outer || bms_membership(inner_relids) != BMS_SINGLETON))
4757 continue;
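		/*
		 * For example (hypothetically), in
		 *		... WHERE EXISTS (SELECT 1 FROM dim JOIN other ON ...)
		 * with an FK from the outer rel into "dim", the inner side is not a
		 * single baserel, so the test above skips the FK: the "dim" rows
		 * eliminated by its join to "other" are exactly what we cannot guess.
		 */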
4758
4759 /*
4760 * Modify the restrictlist by removing clauses that match the FK (and
4761 * putting them into removedlist instead). It seems unsafe to modify
4762 * the originally-passed List structure, so we make a shallow copy the
4763 * first time through.
4764 */
4765 if (worklist == *restrictlist)
4766 worklist = list_copy(worklist);
4767
4768 removedlist = NIL;
4769 prev = NULL;
4770 for (cell = list_head(worklist); cell; cell = next)
4771 {
4772 RestrictInfo *rinfo = (RestrictInfo *) lfirst(cell);
4773 bool remove_it = false;
4774 int i;
4775
4776 next = lnext(cell);
4777 /* Drop this clause if it matches any column of the FK */
4778 for (i = 0; i < fkinfo->nkeys; i++)
4779 {
4780 if (rinfo->parent_ec)
4781 {
4782 /*
4783 * EC-derived clauses can only match by EC. It is okay to
4784 * consider any clause derived from the same EC as
4785 * matching the FK: even if equivclass.c chose to generate
4786 * a clause equating some other pair of Vars, it could
4787 * have generated one equating the FK's Vars. So for
4788 * purposes of estimation, we can act as though it did so.
4789 *
4790 * Note: checking parent_ec is a bit of a cheat because
4791 * there are EC-derived clauses that don't have parent_ec
4792 * set; but such clauses must compare expressions that
4793 * aren't just Vars, so they cannot match the FK anyway.
4794 */
4795 if (fkinfo->eclass[i] == rinfo->parent_ec)
4796 {
4797 remove_it = true;
4798 break;
4799 }
4800 }
4801 else
4802 {
4803 /*
4804 * Otherwise, see if rinfo was previously matched to FK as
4805 * a "loose" clause.
4806 */
4807 if (list_member_ptr(fkinfo->rinfos[i], rinfo))
4808 {
4809 remove_it = true;
4810 break;
4811 }
4812 }
4813 }
4814 if (remove_it)
4815 {
4816 worklist = list_delete_cell(worklist, cell, prev);
4817 removedlist = lappend(removedlist, rinfo);
4818 }
4819 else
4820 prev = cell;
4821 }
4822
4823 /*
4824 * If we failed to remove all the matching clauses we expected to
4825 * find, chicken out and ignore this FK; applying its selectivity
4826 * might result in double-counting. Put any clauses we did manage to
4827 * remove back into the worklist.
4828 *
4829 * Since the matching clauses are known not outerjoin-delayed, they
4830 * should certainly have appeared in the initial joinclause list. If
4831 * we didn't find them, they must have been matched to, and removed
4832 * by, some other FK in a previous iteration of this loop. (A likely
4833 * case is that two FKs are matched to the same EC; there will be only
4834 * one EC-derived clause in the initial list, so the first FK will
4835 * consume it.) Applying both FKs' selectivity independently risks
4836 * underestimating the join size; in particular, this would undo one
4837 * of the main things that ECs were invented for, namely to avoid
4838 * double-counting the selectivity of redundant equality conditions.
4839 * Later we might think of a reasonable way to combine the estimates,
4840 * but for now, just punt, since this is a fairly uncommon situation.
4841 */
4842 if (list_length(removedlist) !=
4843 (fkinfo->nmatched_ec + fkinfo->nmatched_ri))
4844 {
4845 worklist = list_concat(worklist, removedlist);
4846 continue;
4847 }
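		/*
		 * Hypothetical example of the situation described above: FKs
		 * t1.x -> t3.x and t2.x -> t3.x, with a query joining t1, t2 and t3
		 * on x.  The equivalence class {t1.x, t2.x, t3.x} produces only one
		 * derived clause at the join to t3, so whichever FK is examined first
		 * consumes it, and the other finds nothing left to remove and bails
		 * out here instead of applying its selectivity a second time.
		 */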
4848
4849 /*
4850 * Finally we get to the payoff: estimate selectivity using the
4851 * knowledge that each referencing row will match exactly one row in
4852 * the referenced table.
4853 *
4854 * XXX that's not true in the presence of nulls in the referencing
4855 * column(s), so in principle we should derate the estimate for those.
4856 * However (1) if there are any strict restriction clauses for the
4857 * referencing column(s) elsewhere in the query, derating here would
4858 * be double-counting the null fraction, and (2) it's not very clear
4859 * how to combine null fractions for multiple referencing columns. So
4860 * we do nothing for now about correcting for nulls.
4861 *
4862 * XXX another point here is that if either side of an FK constraint
4863 * is an inheritance parent, we estimate as though the constraint
4864 * covers all its children as well. This is not an unreasonable
4865 * assumption for a referencing table, ie the user probably applied
4866 * identical constraints to all child tables (though perhaps we ought
4867 * to check that). But it's not possible to have done that for a
4868 * referenced table. Fortunately, precisely because that doesn't
4869 * work, it is uncommon in practice to have an FK referencing a parent
4870 * table. So, at least for now, disregard inheritance here.
4871 */
4872 if (jointype == JOIN_SEMI || jointype == JOIN_ANTI)
4873 {
4874 /*
4875 * For JOIN_SEMI and JOIN_ANTI, we only get here when the FK's
4876 * referenced table is exactly the inside of the join. The join
4877 * selectivity is defined as the fraction of LHS rows that have
4878 * matches. The FK implies that every LHS row has a match *in the
4879 * referenced table*; but any restriction clauses on it will
4880 * reduce the number of matches. Hence we take the join
4881 * selectivity as equal to the selectivity of the table's
4882 * restriction clauses, which is rows / tuples; but we must guard
4883 * against tuples == 0.
4884 */
4885 RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid);
4886 double ref_tuples = Max(ref_rel->tuples, 1.0);
4887
4888 fkselec *= ref_rel->rows / ref_tuples;
4889 }
4890 else
4891 {
4892 /*
4893 * Otherwise, selectivity is exactly 1/referenced-table-size; but
4894 * guard against tuples == 0. Note we should use the raw table
4895 * tuple count, not any estimate of its filtered or joined size.
4896 */
4897 RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid);
4898 double ref_tuples = Max(ref_rel->tuples, 1.0);
4899
4900 fkselec *= 1.0 / ref_tuples;
4901 }
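		/*
		 * Worked example with made-up numbers: if the referenced table has
		 * tuples = 1,000,000 and, for a semi/anti join, its restriction
		 * clauses leave rows = 200,000, the semi/anti branch above multiplies
		 * fkselec by 200000 / 1000000 = 0.2 (one fifth of the outer rows are
		 * expected to find a surviving match).  In the plain inner/outer join
		 * branch the factor is simply 1 / 1000000, reflecting the
		 * exactly-one-match guarantee against the raw table size.
		 */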
4902 }
4903
4904 *restrictlist = worklist;
4905 return fkselec;
4906}
4907
4908/*
4909 * set_subquery_size_estimates
4910 * Set the size estimates for a base relation that is a subquery.
4911 *
4912 * The rel's targetlist and restrictinfo list must have been constructed
4913 * already, and the Paths for the subquery must have been completed.
4914 * We look at the subquery's PlannerInfo to extract data.
4915 *
4916 * We set the same fields as set_baserel_size_estimates.
4917 */
4918void
4919set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel)
4920{
4921 PlannerInfo *subroot = rel->subroot;
4922 RelOptInfo *sub_final_rel;
4923 ListCell *lc;
4924
4925 /* Should only be applied to base relations that are subqueries */
4926 Assert(rel->relid > 0);
4927 Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_SUBQUERY);
4928
4929 /*
4930 * Copy raw number of output rows from subquery. All of its paths should
4931 * have the same output rowcount, so just look at cheapest-total.
4932 */
4933 sub_final_rel = fetch_upper_rel(subroot, UPPERREL_FINAL, NULL);
4934 rel->tuples = sub_final_rel->cheapest_total_path->rows;
4935
4936 /*
4937 * Compute per-output-column width estimates by examining the subquery's
4938 * targetlist. For any output that is a plain Var, get the width estimate
4939 * that was made while planning the subquery. Otherwise, we leave it to
4940 * set_rel_width to fill in a datatype-based default estimate.
4941 */
4942 foreach(lc, subroot->parse->targetList)
4943 {
4944 TargetEntry *te = lfirst_node(TargetEntry, lc);
4945 Node *texpr = (Node *) te->expr;
4946 int32 item_width = 0;
4947
4948 /* junk columns aren't visible to upper query */
4949 if (te->resjunk)
4950 continue;
4951
4952 /*
4953 * The subquery could be an expansion of a view that's had columns
4954 * added to it since the current query was parsed, so that there are
4955 * non-junk tlist columns in it that don't correspond to any column
4956 * visible at our query level. Ignore such columns.
4957 */
4958 if (te->resno < rel->min_attr || te->resno > rel->max_attr)
4959 continue;
4960
4961 /*
4962 * XXX This currently doesn't work for subqueries containing set
4963 * operations, because the Vars in their tlists are bogus references
4964 * to the first leaf subquery, which wouldn't give the right answer
4965 * even if we could still get to its PlannerInfo.
4966 *
4967 * Also, the subquery could be an appendrel for which all branches are
4968 * known empty due to constraint exclusion, in which case
4969 * set_append_rel_pathlist will have left the attr_widths set to zero.
4970 *
4971 * In either case, we just leave the width estimate zero until
4972 * set_rel_width fixes it.
4973 */
4974 if (IsA(texpr, Var) &&
4975 subroot->parse->setOperations == NULL)
4976 {
4977 Var *var = (Var *) texpr;
4978 RelOptInfo *subrel = find_base_rel(subroot, var->varno);
4979
4980 item_width = subrel->attr_widths[var->varattno - subrel->min_attr];
4981 }
4982 rel->attr_widths[te->resno - rel->min_attr] = item_width;
4983 }
4984
4985 /* Now estimate number of output rows, etc */
4986 set_baserel_size_estimates(root, rel);
4987}
4988
4989/*
4990 * set_function_size_estimates
4991 * Set the size estimates for a base relation that is a function call.
4992 *
4993 * The rel's targetlist and restrictinfo list must have been constructed
4994 * already.
4995 *
4996 * We set the same fields as set_baserel_size_estimates.
4997 */
4998void
4999set_function_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5000{
5001 RangeTblEntry *rte;
5002 ListCell *lc;
5003
5004 /* Should only be applied to base relations that are functions */
5005 Assert(rel->relid > 0);
5006 rte = planner_rt_fetch(rel->relid, root);
5007 Assert(rte->rtekind == RTE_FUNCTION);
5008
5009 /*
5010 * Estimate number of rows the functions will return. The rowcount of the
5011 * node is that of the largest function result.
5012 */
5013 rel->tuples = 0;
5014 foreach(lc, rte->functions)
5015 {
5016 RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc);
5017 double ntup = expression_returns_set_rows(root, rtfunc->funcexpr);
5018
5019 if (ntup > rel->tuples)
5020 rel->tuples = ntup;
5021 }
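	/*
	 * For instance (hypothetical functions), with ROWS FROM(f(), g()) where
	 * f() is declared ROWS 1000 and g() is declared ROWS 3,
	 * expression_returns_set_rows() reports 1000 and 3 respectively, and the
	 * loop above sets rel->tuples to the larger value, 1000.
	 */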
5022
5023 /* Now estimate number of output rows, etc */
5024 set_baserel_size_estimates(root, rel);
5025}
5026
5027/*
5028 * set_tablefunc_size_estimates
5029 * Set the size estimates for a base relation that is a table function.
5030 *
5031 * The rel's targetlist and restrictinfo list must have been constructed
5032 * already.
5033 *
5034 * We set the same fields as set_baserel_size_estimates.
5035 */
5036void
5037set_tablefunc_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5038{
5039 /* Should only be applied to base relations that are functions */
5040 Assert(rel->relid > 0);
5041 Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_TABLEFUNC);
5042
5043 rel->tuples = 100;
5044
5045 /* Now estimate number of output rows, etc */
5046 set_baserel_size_estimates(root, rel);
5047}
5048
5049/*
5050 * set_values_size_estimates
5051 * Set the size estimates for a base relation that is a values list.
5052 *
5053 * The rel's targetlist and restrictinfo list must have been constructed
5054 * already.
5055 *
5056 * We set the same fields as set_baserel_size_estimates.
5057 */
5058void
5059set_values_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5060{
5061 RangeTblEntry *rte;
5062
5063 /* Should only be applied to base relations that are values lists */
5064 Assert(rel->relid > 0);
5065 rte = planner_rt_fetch(rel->relid, root);
5066 Assert(rte->rtekind == RTE_VALUES);
5067
5068 /*
5069 * Estimate number of rows the values list will return. We know this
5070 * precisely based on the list length (well, barring set-returning
5071 * functions in list items, but that's a refinement not catered for
5072 * anywhere else either).
5073 */
5074 rel->tuples = list_length(rte->values_lists);
5075
5076 /* Now estimate number of output rows, etc */
5077 set_baserel_size_estimates(root, rel);
5078}
5079
5080/*
5081 * set_cte_size_estimates
5082 * Set the size estimates for a base relation that is a CTE reference.
5083 *
5084 * The rel's targetlist and restrictinfo list must have been constructed
5085 * already, and we need an estimate of the number of rows returned by the CTE
5086 * (if a regular CTE) or the non-recursive term (if a self-reference).
5087 *
5088 * We set the same fields as set_baserel_size_estimates.
5089 */
5090void
5091set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel, double cte_rows)
5092{
5093 RangeTblEntry *rte;
5094
5095 /* Should only be applied to base relations that are CTE references */
5096 Assert(rel->relid > 0);
5097 rte = planner_rt_fetch(rel->relid, root);
5098 Assert(rte->rtekind == RTE_CTE);
5099
5100 if (rte->self_reference)
5101 {
5102 /*
5103 * In a self-reference, arbitrarily assume the average worktable size
5104 * is about 10 times the nonrecursive term's size.
5105 */
5106 rel->tuples = 10 * cte_rows;
5107 }
5108 else
5109 {
5110 /* Otherwise just believe the CTE's rowcount estimate */
5111 rel->tuples = cte_rows;
5112 }
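	/*
	 * Example with made-up numbers: if the non-recursive term of a recursive
	 * CTE is estimated at 50 rows, a self-reference scan of its worktable is
	 * costed as though the average worktable held about 500 rows.
	 */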
5113
5114 /* Now estimate number of output rows, etc */
5115 set_baserel_size_estimates(root, rel);
5116}
5117
5118/*
5119 * set_namedtuplestore_size_estimates
5120 * Set the size estimates for a base relation that is a tuplestore reference.
5121 *
5122 * The rel's targetlist and restrictinfo list must have been constructed
5123 * already.
5124 *
5125 * We set the same fields as set_baserel_size_estimates.
5126 */
5127void
5128set_namedtuplestore_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5129{
5130 RangeTblEntry *rte;
5131
5132 /* Should only be applied to base relations that are tuplestore references */
5133 Assert(rel->relid > 0);
5134 rte = planner_rt_fetch(rel->relid, root);
5135 Assert(rte->rtekind == RTE_NAMEDTUPLESTORE);
5136
5137 /*
5138 * Use the estimate provided by the code which is generating the named
5139 * tuplestore. In some cases, the actual number might be available; in
5140 * others the same plan will be re-used, so a "typical" value might be
5141 * estimated and used.
5142 */
5143 rel->tuples = rte->enrtuples;
5144 if (rel->tuples < 0)
5145 rel->tuples = 1000;
5146
5147 /* Now estimate number of output rows, etc */
5148 set_baserel_size_estimates(root, rel);
5149}
5150
5151/*
5152 * set_result_size_estimates
5153 * Set the size estimates for an RTE_RESULT base relation
5154 *
5155 * The rel's targetlist and restrictinfo list must have been constructed
5156 * already.
5157 *
5158 * We set the same fields as set_baserel_size_estimates.
5159 */
5160void
5161set_result_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5162{
5163 /* Should only be applied to RTE_RESULT base relations */
5164 Assert(rel->relid > 0);
5165 Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_RESULT);
5166
5167 /* RTE_RESULT always generates a single row, natively */
5168 rel->tuples = 1;
5169
5170 /* Now estimate number of output rows, etc */
5171 set_baserel_size_estimates(root, rel);
5172}
5173
5174/*
5175 * set_foreign_size_estimates
5176 * Set the size estimates for a base relation that is a foreign table.
5177 *
5178 * There is not a whole lot that we can do here; the foreign-data wrapper
5179 * is responsible for producing useful estimates. We can do a decent job
5180 * of estimating baserestrictcost, so we set that, and we also set up width
5181 * using what will be purely datatype-driven estimates from the targetlist.
5182 * There is no way to do anything sane with the rows value, so we just put
5183 * a default estimate and hope that the wrapper can improve on it. The
5184 * wrapper's GetForeignRelSize function will be called momentarily.
5185 *
5186 * The rel's targetlist and restrictinfo list must have been constructed
5187 * already.
5188 */
5189void
5190set_foreign_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5191{
5192 /* Should only be applied to base relations */
5193 Assert(rel->relid > 0);
5194
5195 rel->rows = 1000; /* entirely bogus default estimate */
5196
5197 cost_qual_eval(&rel->baserestrictcost, rel->baserestrictinfo, root);
5198
5199 set_rel_width(root, rel);
5200}
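/*
 * Illustrative sketch only (not used anywhere in the planner): a foreign-data
 * wrapper's GetForeignRelSize callback is where the bogus default rows
 * estimate set above would normally be replaced.  The callback name and the
 * remote_row_count value below are hypothetical; the clauselist_selectivity()
 * and clamp_row_est() calls show one plausible way such a wrapper might derive
 * a better estimate from a remotely-obtained raw row count.
 */
#ifdef NOT_USED
static void
example_GetForeignRelSize(PlannerInfo *root, RelOptInfo *baserel,
						  Oid foreigntableid)
{
	double		remote_row_count = 1000000.0;	/* hypothetical remote stat */
	Selectivity sel;

	/* Scale the raw remote row count by the local restriction clauses */
	sel = clauselist_selectivity(root, baserel->baserestrictinfo,
								 0, JOIN_INNER, NULL);
	baserel->rows = clamp_row_est(remote_row_count * sel);
}
#endif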
5201
5202
5203/*
5204 * set_rel_width
5205 * Set the estimated output width of a base relation.
5206 *
5207 * The estimated output width is the sum of the per-attribute width estimates
5208 * for the actually-referenced columns, plus any PHVs or other expressions
5209 * that have to be calculated at this relation. This is the amount of data
5210 * we'd need to pass upwards in case of a sort, hash, etc.
5211 *
5212 * This function also sets reltarget->cost, so it's a bit misnamed now.
5213 *
5214 * NB: this works best on plain relations because it prefers to look at
5215 * real Vars. For subqueries, set_subquery_size_estimates will already have
5216 * copied up whatever per-column estimates were made within the subquery,
5217 * and for other types of rels there isn't much we can do anyway. We fall
5218 * back on (fairly stupid) datatype-based width estimates if we can't get
5219 * any better number.
5220 *
5221 * The per-attribute width estimates are cached for possible re-use while
5222 * building join relations or post-scan/join pathtargets.
5223 */
5224static void
5225set_rel_width(PlannerInfo *root, RelOptInfo *rel)
5226{
5227 Oid reloid = planner_rt_fetch(rel->relid, root)->relid;
5228 int32 tuple_width = 0;
5229 bool have_wholerow_var = false;
5230 ListCell *lc;
5231
5232 /* Vars are assumed to have cost zero, but other exprs do not */
5233 rel->reltarget->cost.startup = 0;
5234 rel->reltarget->cost.per_tuple = 0;
5235
5236 foreach(lc, rel->reltarget->exprs)
5237 {
5238 Node *node = (Node *) lfirst(lc);
5239
5240 /*
5241 * Ordinarily, a Var in a rel's targetlist must belong to that rel;
5242 * but there are corner cases involving LATERAL references where that
5243 * isn't so. If the Var has the wrong varno, fall through to the
5244 * generic case (it doesn't seem worth the trouble to be any smarter).
5245 */
5246 if (IsA(node, Var) &&
5247 ((Var *) node)->varno == rel->relid)
5248 {
5249 Var *var = (Var *) node;
5250 int ndx;
5251 int32 item_width;
5252
5253 Assert(var->varattno >= rel->min_attr);
5254 Assert(var->varattno <= rel->max_attr);
5255
5256 ndx = var->varattno - rel->min_attr;
5257
5258 /*
5259 * If it's a whole-row Var, we'll deal with it below after we have
5260 * already cached as many attr widths as possible.
5261 */
5262 if (var->varattno == 0)
5263 {
5264 have_wholerow_var = true;
5265 continue;
5266 }
5267
5268 /*
5269 * The width may have been cached already (especially if it's a
5270 * subquery), so don't duplicate effort.
5271 */
5272 if (rel->attr_widths[ndx] > 0)
5273 {
5274 tuple_width += rel->attr_widths[ndx];
5275 continue;
5276 }
5277
5278 /* Try to get column width from statistics */
5279 if (reloid != InvalidOid && var->varattno > 0)
5280 {
5281 item_width = get_attavgwidth(reloid, var->varattno);
5282 if (item_width > 0)
5283 {
5284 rel->attr_widths[ndx] = item_width;
5285 tuple_width += item_width;
5286 continue;
5287 }
5288 }
5289
5290 /*
5291 * Not a plain relation, or can't find statistics for it. Estimate
5292 * using just the type info.
5293 */
5294 item_width = get_typavgwidth(var->vartype, var->vartypmod);
5295 Assert(item_width > 0);
5296 rel->attr_widths[ndx] = item_width;
5297 tuple_width += item_width;
5298 }
5299 else if (IsA(node, PlaceHolderVar))
5300 {
5301 /*
5302 * We will need to evaluate the PHV's contained expression while
5303 * scanning this rel, so be sure to include it in reltarget->cost.
5304 */
5305 PlaceHolderVar *phv = (PlaceHolderVar *) node;
5306 PlaceHolderInfo *phinfo = find_placeholder_info(root, phv, false);
5307 QualCost cost;
5308
5309 tuple_width += phinfo->ph_width;
5310 cost_qual_eval_node(&cost, (Node *) phv->phexpr, root);
5311 rel->reltarget->cost.startup += cost.startup;
5312 rel->reltarget->cost.per_tuple += cost.per_tuple;
5313 }
5314 else
5315 {
5316 /*
5317 * We could be looking at an expression pulled up from a subquery,
5318 * or a ROW() representing a whole-row child Var, etc. Do what we
5319 * can using the expression type information.
5320 */
5321 int32 item_width;
5322 QualCost cost;
5323
5324 item_width = get_typavgwidth(exprType(node), exprTypmod(node));
5325 Assert(item_width > 0);
5326 tuple_width += item_width;
5327 /* Not entirely clear if we need to account for cost, but do so */
5328 cost_qual_eval_node(&cost, node, root);
5329 rel->reltarget->cost.startup += cost.startup;
5330 rel->reltarget->cost.per_tuple += cost.per_tuple;
5331 }
5332 }
5333
5334 /*
5335 * If we have a whole-row reference, estimate its width as the sum of
5336 * per-column widths plus heap tuple header overhead.
5337 */
5338 if (have_wholerow_var)
5339 {
5340 int32 wholerow_width = MAXALIGN(SizeofHeapTupleHeader);
5341
5342 if (reloid != InvalidOid)
5343 {
5344 /* Real relation, so estimate true tuple width */
5345 wholerow_width += get_relation_data_width(reloid,
5346 rel->attr_widths - rel->min_attr);
5347 }
5348 else
5349 {
5350 /* Do what we can with info for a phony rel */
5351 AttrNumber i;
5352
5353 for (i = 1; i <= rel->max_attr; i++)
5354 wholerow_width += rel->attr_widths[i - rel->min_attr];
5355 }
5356
5357 rel->attr_widths[0 - rel->min_attr] = wholerow_width;
5358
5359 /*
5360 * Include the whole-row Var as part of the output tuple. Yes, that
5361 * really is what happens at runtime.
5362 */
5363 tuple_width += wholerow_width;
5364 }
5365
5366 Assert(tuple_width >= 0);
5367 rel->reltarget->width = tuple_width;
5368}
5369
5370/*
5371 * set_pathtarget_cost_width
5372 * Set the estimated eval cost and output width of a PathTarget tlist.
5373 *
5374 * As a notational convenience, returns the same PathTarget pointer passed in.
5375 *
5376 * Most, though not quite all, uses of this function occur after we've run
5377 * set_rel_width() for base relations; so we can usually obtain cached width
5378 * estimates for Vars. If we can't, fall back on datatype-based width
5379 * estimates. Present early-planning uses of PathTargets don't need accurate
5380 * widths badly enough to justify going to the catalogs for better data.
5381 */
5382PathTarget *
5383set_pathtarget_cost_width(PlannerInfo *root, PathTarget *target)
5384{
5385 int32 tuple_width = 0;
5386 ListCell *lc;
5387
5388 /* Vars are assumed to have cost zero, but other exprs do not */
5389 target->cost.startup = 0;
5390 target->cost.per_tuple = 0;
5391
5392 foreach(lc, target->exprs)
5393 {
5394 Node *node = (Node *) lfirst(lc);
5395
5396 if (IsA(node, Var))
5397 {
5398 Var *var = (Var *) node;
5399 int32 item_width;
5400
5401 /* We should not see any upper-level Vars here */
5402 Assert(var->varlevelsup == 0);
5403
5404 /* Try to get data from RelOptInfo cache */
5405 if (var->varno < root->simple_rel_array_size)
5406 {
5407 RelOptInfo *rel = root->simple_rel_array[var->varno];
5408
5409 if (rel != NULL &&
5410 var->varattno >= rel->min_attr &&
5411 var->varattno <= rel->max_attr)
5412 {
5413 int ndx = var->varattno - rel->min_attr;
5414
5415 if (rel->attr_widths[ndx] > 0)
5416 {
5417 tuple_width += rel->attr_widths[ndx];
5418 continue;
5419 }
5420 }
5421 }
5422
5423 /*
5424 * No cached data available, so estimate using just the type info.
5425 */
5426 item_width = get_typavgwidth(var->vartype, var->vartypmod);
5427 Assert(item_width > 0);
5428 tuple_width += item_width;
5429 }
5430 else
5431 {
5432 /*
5433 * Handle general expressions using type info.
5434 */
5435 int32 item_width;
5436 QualCost cost;
5437
5438 item_width = get_typavgwidth(exprType(node), exprTypmod(node));
5439 Assert(item_width > 0);
5440 tuple_width += item_width;
5441
5442 /* Account for cost, too */
5443 cost_qual_eval_node(&cost, node, root);
5444 target->cost.startup += cost.startup;
5445 target->cost.per_tuple += cost.per_tuple;
5446 }
5447 }
5448
5449 Assert(tuple_width >= 0);
5450 target->width = tuple_width;
5451
5452 return target;
5453}
5454
5455/*
5456 * relation_byte_size
5457 * Estimate the storage space in bytes for a given number of tuples
5458 * of a given width (size in bytes).
5459 */
5460static double
5461relation_byte_size(double tuples, int width)
5462{
5463 return tuples * (MAXALIGN(width) + MAXALIGN(SizeofHeapTupleHeader));
5464}
5465
5466/*
5467 * page_size
5468 * Returns an estimate of the number of pages covered by a given
5469 * number of tuples of a given width (size in bytes).
5470 */
5471static double
5472page_size(double tuples, int width)
5473{
5474 return ceil(relation_byte_size(tuples, width) / BLCKSZ);
5475}
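/*
 * Worked example for the two helpers above (assuming 8-byte MAXALIGN, so
 * MAXALIGN(SizeofHeapTupleHeader) = 24, and the default BLCKSZ of 8192): for
 * 100,000 tuples of width 28, relation_byte_size() gives
 * 100000 * (MAXALIGN(28) + 24) = 100000 * (32 + 24) = 5,600,000 bytes, and
 * page_size() rounds that up to ceil(5600000 / 8192) = 684 pages.
 */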
5476
5477/*
5478 * Estimate the fraction of the work that each worker will do given the
5479 * number of workers budgeted for the path.
5480 */
5481static double
5482get_parallel_divisor(Path *path)
5483{
5484 double parallel_divisor = path->parallel_workers;
5485
5486 /*
5487 * Early experience with parallel query suggests that when there is only
5488 * one worker, the leader often makes a very substantial contribution to
5489 * executing the parallel portion of the plan, but as more workers are
5490 * added, it does less and less, because it's busy reading tuples from the
5491 * workers and doing whatever non-parallel post-processing is needed. By
5492 * the time we reach 4 workers, the leader no longer makes a meaningful
5493 * contribution. Thus, for now, estimate that the leader spends 30% of
5494 * its time servicing each worker, and the remainder executing the
5495 * parallel plan.
5496 */
5497 if (parallel_leader_participation)
5498 {
5499 double leader_contribution;
5500
5501 leader_contribution = 1.0 - (0.3 * path->parallel_workers);
5502 if (leader_contribution > 0)
5503 parallel_divisor += leader_contribution;
5504 }
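	/*
	 * For example, with parallel_leader_participation enabled and 2 budgeted
	 * workers the divisor is 2 + (1.0 - 0.3 * 2) = 2.4; with 4 or more
	 * workers the leader contribution goes nonpositive and the divisor is
	 * just the worker count.
	 */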
5505
5506 return parallel_divisor;
5507}
5508
5509/*
5510 * compute_bitmap_pages
5511 *
5512 * Compute the number of heap pages fetched in a bitmap heap scan.
5513 */
5514double
5515compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual,
5516 int loop_count, Cost *cost, double *tuple)
5517{
5518 Cost indexTotalCost;
5519 Selectivity indexSelectivity;
5520 double T;
5521 double pages_fetched;
5522 double tuples_fetched;
5523 double heap_pages;
5524 long maxentries;
5525
5526 /*
5527 * Fetch total cost of obtaining the bitmap, as well as its total
5528 * selectivity.
5529 */
5530 cost_bitmap_tree_node(bitmapqual, &indexTotalCost, &indexSelectivity);
5531
5532 /*
5533 * Estimate number of main-table pages fetched.
5534 */
5535 tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples);
5536
5537 T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;
5538
5539 /*
5540 * For a single scan, the number of heap pages that need to be fetched is
5541 * the same as the Mackert and Lohman formula for the case T <= b (ie, no
5542 * re-reads needed).
5543 */
5544 pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
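	/*
	 * For instance, with T = 10000 heap pages and tuples_fetched = 1000 this
	 * gives 2 * 10000 * 1000 / (2 * 10000 + 1000) = 952.4 pages, i.e. nearly
	 * one page per tuple, since few tuples share a page at this selectivity.
	 */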
5545
5546 /*
5547 * Calculate the number of pages fetched from the heap. Then based on
5548 * current work_mem estimate get the estimated maxentries in the bitmap.
5549 * (Note that we always do this calculation based on the number of pages
5550 * that would be fetched in a single iteration, even if loop_count > 1.
5551 * That's correct, because only that number of entries will be stored in
5552 * the bitmap at one time.)
5553 */
5554 heap_pages = Min(pages_fetched, baserel->pages);
5555 maxentries = tbm_calculate_entries(work_mem * 1024L);
5556
5557 if (loop_count > 1)
5558 {
5559 /*
5560 * For repeated bitmap scans, scale up the number of tuples fetched in
5561 * the Mackert and Lohman formula by the number of scans, so that we
5562 * estimate the number of pages fetched by all the scans. Then
5563 * pro-rate for one scan.
5564 */
5565 pages_fetched = index_pages_fetched(tuples_fetched * loop_count,
5566 baserel->pages,
5567 get_indexpath_pages(bitmapqual),
5568 root);
5569 pages_fetched /= loop_count;
5570 }
5571
5572 if (pages_fetched >= T)
5573 pages_fetched = T;
5574 else
5575 pages_fetched = ceil(pages_fetched);
5576
5577 if (maxentries < heap_pages)
5578 {
5579 double exact_pages;
5580 double lossy_pages;
5581
5582 /*
5583 * Crude approximation of the number of lossy pages. Because of the
5584 * way tbm_lossify() is coded, the number of lossy pages increases
5585 * very sharply as soon as we run short of memory; this formula has
5586 * that property and seems to perform adequately in testing, but it's
5587 * possible we could do better somehow.
5588 */
5589 lossy_pages = Max(0, heap_pages - maxentries / 2);
5590 exact_pages = heap_pages - lossy_pages;
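		/*
		 * Example: with heap_pages = 1000 and room for only maxentries = 800
		 * exact entries, this yields lossy_pages = Max(0, 1000 - 400) = 600
		 * and exact_pages = 400.
		 */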
5591
5592 /*
5593 * If there are lossy pages then recompute the number of tuples
5594 * processed by the bitmap heap node. We assume here that the chance
5595 * of a given tuple coming from an exact page is the same as the
5596 * chance that a given page is exact. This might not be true, but
5597 * it's not clear how we can do any better.
5598 */
5599 if (lossy_pages > 0)
5600 tuples_fetched =
5601 clamp_row_est(indexSelectivity *
5602 (exact_pages / heap_pages) * baserel->tuples +
5603 (lossy_pages / heap_pages) * baserel->tuples);
5604 }
5605
5606 if (cost)
5607 *cost = indexTotalCost;
5608 if (tuple)
5609 *tuple = tuples_fetched;
5610
5611 return pages_fetched;
5612}
5613