| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * costsize.c |
| 4 | * Routines to compute (and set) relation sizes and path costs |
| 5 | * |
| 6 | * Path costs are measured in arbitrary units established by these basic |
| 7 | * parameters: |
| 8 | * |
| 9 | * seq_page_cost Cost of a sequential page fetch |
| 10 | * random_page_cost Cost of a non-sequential page fetch |
| 11 | * cpu_tuple_cost Cost of typical CPU time to process a tuple |
| 12 | * cpu_index_tuple_cost Cost of typical CPU time to process an index tuple |
| 13 | * cpu_operator_cost Cost of CPU time to execute an operator or function |
| 14 | * parallel_tuple_cost Cost of CPU time to pass a tuple from worker to master backend |
| 15 | * parallel_setup_cost Cost of setting up shared memory for parallelism |
| 16 | * |
| 17 | * We expect that the kernel will typically do some amount of read-ahead |
| 18 | * optimization; this in conjunction with seek costs means that seq_page_cost |
| 19 | * is normally considerably less than random_page_cost. (However, if the |
| 20 | * database is fully cached in RAM, it is reasonable to set them equal.) |
| 21 | * |
| 22 | * We also use a rough estimate "effective_cache_size" of the number of |
| 23 | * disk pages in Postgres + OS-level disk cache. (We can't simply use |
| 24 | * NBuffers for this purpose because that would ignore the effects of |
| 25 | * the kernel's disk cache.) |
| 26 | * |
| 27 | * Obviously, taking constants for these values is an oversimplification, |
| 28 | * but it's tough enough to get any useful estimates even at this level of |
| 29 | * detail. Note that all of these parameters are user-settable, in case |
| 30 | * the default values are drastically off for a particular platform. |
| 31 | * |
| 32 | * seq_page_cost and random_page_cost can also be overridden for an individual |
| 33 | * tablespace, in case some data is on a fast disk and other data is on a slow |
| 34 | * disk. Per-tablespace overrides never apply to temporary work files such as |
| 35 | * an external sort or a materialize node that overflows work_mem. |
| 36 | * |
| 37 | * We compute two separate costs for each path: |
| 38 | * total_cost: total estimated cost to fetch all tuples |
| 39 | * startup_cost: cost that is expended before first tuple is fetched |
| 40 | * In some scenarios, such as when there is a LIMIT or we are implementing |
| 41 | * an EXISTS(...) sub-select, it is not necessary to fetch all tuples of the |
| 42 | * path's result. A caller can estimate the cost of fetching a partial |
| 43 | * result by interpolating between startup_cost and total_cost. In detail: |
| 44 | * actual_cost = startup_cost + |
| 45 | * (total_cost - startup_cost) * tuples_to_fetch / path->rows; |
| 46 | * Note that a base relation's rows count (and, by extension, plan_rows for |
 * plan nodes below the LIMIT node) is set without regard to any LIMIT, so
| 48 | * that this equation works properly. (Note: while path->rows is never zero |
| 49 | * for ordinary relations, it is zero for paths for provably-empty relations, |
| 50 | * so beware of division-by-zero.) The LIMIT is applied as a top-level |
| 51 | * plan node. |
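 *
 * For example (with purely illustrative numbers): given startup_cost = 10,
 * total_cost = 110 and path->rows = 1000, fetching the first 100 tuples is
 * estimated to cost 10 + (110 - 10) * 100 / 1000 = 20.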
| 52 | * |
| 53 | * For largely historical reasons, most of the routines in this module use |
| 54 | * the passed result Path only to store their results (rows, startup_cost and |
| 55 | * total_cost) into. All the input data they need is passed as separate |
| 56 | * parameters, even though much of it could be extracted from the Path. |
| 57 | * An exception is made for the cost_XXXjoin() routines, which expect all |
| 58 | * the other fields of the passed XXXPath to be filled in, and similarly |
| 59 | * cost_index() assumes the passed IndexPath is valid except for its output |
| 60 | * values. |
| 61 | * |
| 62 | * |
| 63 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 64 | * Portions Copyright (c) 1994, Regents of the University of California |
| 65 | * |
| 66 | * IDENTIFICATION |
| 67 | * src/backend/optimizer/path/costsize.c |
| 68 | * |
| 69 | *------------------------------------------------------------------------- |
| 70 | */ |
| 71 | |
| 72 | #include "postgres.h" |
| 73 | |
| 74 | #include <math.h> |
| 75 | |
| 76 | #include "access/amapi.h" |
| 77 | #include "access/htup_details.h" |
| 78 | #include "access/tsmapi.h" |
| 79 | #include "executor/executor.h" |
| 80 | #include "executor/nodeHash.h" |
| 81 | #include "miscadmin.h" |
| 82 | #include "nodes/makefuncs.h" |
| 83 | #include "nodes/nodeFuncs.h" |
| 84 | #include "optimizer/clauses.h" |
| 85 | #include "optimizer/cost.h" |
| 86 | #include "optimizer/optimizer.h" |
| 87 | #include "optimizer/pathnode.h" |
| 88 | #include "optimizer/paths.h" |
| 89 | #include "optimizer/placeholder.h" |
| 90 | #include "optimizer/plancat.h" |
| 91 | #include "optimizer/planmain.h" |
| 92 | #include "optimizer/restrictinfo.h" |
| 93 | #include "parser/parsetree.h" |
| 94 | #include "utils/lsyscache.h" |
| 95 | #include "utils/selfuncs.h" |
| 96 | #include "utils/spccache.h" |
| 97 | #include "utils/tuplesort.h" |
| 98 | |
| 99 | |
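/*
 * log() is the natural logarithm; dividing by ln(2) (about
 * 0.693147180559945) converts it into a base-2 logarithm.
 */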
| 100 | #define LOG2(x) (log(x) / 0.693147180559945) |
| 101 | |
| 102 | /* |
| 103 | * Append and MergeAppend nodes are less expensive than some other operations |
| 104 | * which use cpu_tuple_cost; instead of adding a separate GUC, estimate the |
| 105 | * per-tuple cost as cpu_tuple_cost multiplied by this value. |
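 * (For instance, with the default cpu_tuple_cost of 0.01, each tuple passing
 * through an Append or MergeAppend node is charged 0.005.)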
| 106 | */ |
| 107 | #define APPEND_CPU_COST_MULTIPLIER 0.5 |
| 108 | |
| 109 | |
| 110 | double seq_page_cost = DEFAULT_SEQ_PAGE_COST; |
| 111 | double random_page_cost = DEFAULT_RANDOM_PAGE_COST; |
| 112 | double cpu_tuple_cost = DEFAULT_CPU_TUPLE_COST; |
| 113 | double cpu_index_tuple_cost = DEFAULT_CPU_INDEX_TUPLE_COST; |
| 114 | double cpu_operator_cost = DEFAULT_CPU_OPERATOR_COST; |
| 115 | double parallel_tuple_cost = DEFAULT_PARALLEL_TUPLE_COST; |
| 116 | double parallel_setup_cost = DEFAULT_PARALLEL_SETUP_COST; |
| 117 | |
| 118 | int effective_cache_size = DEFAULT_EFFECTIVE_CACHE_SIZE; |
| 119 | |
| 120 | Cost disable_cost = 1.0e10; |
| 121 | |
| 122 | int max_parallel_workers_per_gather = 2; |
| 123 | |
| 124 | bool enable_seqscan = true; |
| 125 | bool enable_indexscan = true; |
| 126 | bool enable_indexonlyscan = true; |
| 127 | bool enable_bitmapscan = true; |
| 128 | bool enable_tidscan = true; |
| 129 | bool enable_sort = true; |
| 130 | bool enable_hashagg = true; |
| 131 | bool enable_nestloop = true; |
| 132 | bool enable_material = true; |
| 133 | bool enable_mergejoin = true; |
| 134 | bool enable_hashjoin = true; |
| 135 | bool enable_gathermerge = true; |
| 136 | bool enable_partitionwise_join = false; |
| 137 | bool enable_partitionwise_aggregate = false; |
| 138 | bool enable_parallel_append = true; |
| 139 | bool enable_parallel_hash = true; |
| 140 | bool enable_partition_pruning = true; |
| 141 | |
| 142 | typedef struct |
| 143 | { |
| 144 | PlannerInfo *root; |
| 145 | QualCost total; |
| 146 | } cost_qual_eval_context; |
| 147 | |
| 148 | static List *extract_nonindex_conditions(List *qual_clauses, List *indexclauses); |
| 149 | static MergeScanSelCache *cached_scansel(PlannerInfo *root, |
| 150 | RestrictInfo *rinfo, |
| 151 | PathKey *pathkey); |
| 152 | static void cost_rescan(PlannerInfo *root, Path *path, |
| 153 | Cost *rescan_startup_cost, Cost *rescan_total_cost); |
| 154 | static bool cost_qual_eval_walker(Node *node, cost_qual_eval_context *context); |
| 155 | static void get_restriction_qual_cost(PlannerInfo *root, RelOptInfo *baserel, |
| 156 | ParamPathInfo *param_info, |
| 157 | QualCost *qpqual_cost); |
| 158 | static bool has_indexed_join_quals(NestPath *joinpath); |
| 159 | static double approx_tuple_count(PlannerInfo *root, JoinPath *path, |
| 160 | List *quals); |
| 161 | static double calc_joinrel_size_estimate(PlannerInfo *root, |
| 162 | RelOptInfo *joinrel, |
| 163 | RelOptInfo *outer_rel, |
| 164 | RelOptInfo *inner_rel, |
| 165 | double outer_rows, |
| 166 | double inner_rows, |
| 167 | SpecialJoinInfo *sjinfo, |
| 168 | List *restrictlist); |
| 169 | static Selectivity get_foreign_key_join_selectivity(PlannerInfo *root, |
| 170 | Relids outer_relids, |
| 171 | Relids inner_relids, |
| 172 | SpecialJoinInfo *sjinfo, |
| 173 | List **restrictlist); |
| 174 | static Cost append_nonpartial_cost(List *subpaths, int numpaths, |
| 175 | int parallel_workers); |
| 176 | static void set_rel_width(PlannerInfo *root, RelOptInfo *rel); |
| 177 | static double relation_byte_size(double tuples, int width); |
| 178 | static double page_size(double tuples, int width); |
| 179 | static double get_parallel_divisor(Path *path); |
| 180 | |
| 181 | |
| 182 | /* |
| 183 | * clamp_row_est |
| 184 | * Force a row-count estimate to a sane value. |
| 185 | */ |
| 186 | double |
| 187 | clamp_row_est(double nrows) |
| 188 | { |
| 189 | /* |
| 190 | * Force estimate to be at least one row, to make explain output look |
| 191 | * better and to avoid possible divide-by-zero when interpolating costs. |
| 192 | * Make it an integer, too. |
| 193 | */ |
| 194 | if (nrows <= 1.0) |
| 195 | nrows = 1.0; |
| 196 | else |
| 197 | nrows = rint(nrows); |
| 198 | |
| 199 | return nrows; |
| 200 | } |
| 201 | |
| 202 | |
| 203 | /* |
| 204 | * cost_seqscan |
| 205 | * Determines and returns the cost of scanning a relation sequentially. |
| 206 | * |
| 207 | * 'baserel' is the relation to be scanned |
| 208 | * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL |
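 *
 * In rough terms, the estimate below is pages * spc_seq_page_cost for disk
 * plus tuples * (cpu_tuple_cost + per-tuple qual cost) for CPU, with tlist
 * evaluation charged per output row and the CPU portion divided among
 * workers for parallel scans.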
| 209 | */ |
| 210 | void |
| 211 | cost_seqscan(Path *path, PlannerInfo *root, |
| 212 | RelOptInfo *baserel, ParamPathInfo *param_info) |
| 213 | { |
| 214 | Cost startup_cost = 0; |
| 215 | Cost cpu_run_cost; |
| 216 | Cost disk_run_cost; |
| 217 | double spc_seq_page_cost; |
| 218 | QualCost qpqual_cost; |
| 219 | Cost cpu_per_tuple; |
| 220 | |
| 221 | /* Should only be applied to base relations */ |
| 222 | Assert(baserel->relid > 0); |
| 223 | Assert(baserel->rtekind == RTE_RELATION); |
| 224 | |
| 225 | /* Mark the path with the correct row estimate */ |
| 226 | if (param_info) |
| 227 | path->rows = param_info->ppi_rows; |
| 228 | else |
| 229 | path->rows = baserel->rows; |
| 230 | |
| 231 | if (!enable_seqscan) |
| 232 | startup_cost += disable_cost; |
| 233 | |
| 234 | /* fetch estimated page cost for tablespace containing table */ |
| 235 | get_tablespace_page_costs(baserel->reltablespace, |
| 236 | NULL, |
| 237 | &spc_seq_page_cost); |
| 238 | |
| 239 | /* |
| 240 | * disk costs |
| 241 | */ |
| 242 | disk_run_cost = spc_seq_page_cost * baserel->pages; |
| 243 | |
| 244 | /* CPU costs */ |
| 245 | get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); |
| 246 | |
| 247 | startup_cost += qpqual_cost.startup; |
| 248 | cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; |
| 249 | cpu_run_cost = cpu_per_tuple * baserel->tuples; |
| 250 | /* tlist eval costs are paid per output row, not per tuple scanned */ |
| 251 | startup_cost += path->pathtarget->cost.startup; |
| 252 | cpu_run_cost += path->pathtarget->cost.per_tuple * path->rows; |
| 253 | |
| 254 | /* Adjust costing for parallelism, if used. */ |
| 255 | if (path->parallel_workers > 0) |
| 256 | { |
| 257 | double parallel_divisor = get_parallel_divisor(path); |
| 258 | |
| 259 | /* The CPU cost is divided among all the workers. */ |
| 260 | cpu_run_cost /= parallel_divisor; |
| 261 | |
| 262 | /* |
| 263 | * It may be possible to amortize some of the I/O cost, but probably |
| 264 | * not very much, because most operating systems already do aggressive |
| 265 | * prefetching. For now, we assume that the disk run cost can't be |
| 266 | * amortized at all. |
| 267 | */ |
| 268 | |
| 269 | /* |
| 270 | * In the case of a parallel plan, the row count needs to represent |
| 271 | * the number of tuples processed per worker. |
| 272 | */ |
| 273 | path->rows = clamp_row_est(path->rows / parallel_divisor); |
| 274 | } |
| 275 | |
| 276 | path->startup_cost = startup_cost; |
| 277 | path->total_cost = startup_cost + cpu_run_cost + disk_run_cost; |
| 278 | } |
| 279 | |
| 280 | /* |
| 281 | * cost_samplescan |
| 282 | * Determines and returns the cost of scanning a relation using sampling. |
| 283 | * |
| 284 | * 'baserel' is the relation to be scanned |
| 285 | * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL |
| 286 | */ |
| 287 | void |
| 288 | cost_samplescan(Path *path, PlannerInfo *root, |
| 289 | RelOptInfo *baserel, ParamPathInfo *param_info) |
| 290 | { |
| 291 | Cost startup_cost = 0; |
| 292 | Cost run_cost = 0; |
| 293 | RangeTblEntry *rte; |
| 294 | TableSampleClause *tsc; |
| 295 | TsmRoutine *tsm; |
| 296 | double spc_seq_page_cost, |
| 297 | spc_random_page_cost, |
| 298 | spc_page_cost; |
| 299 | QualCost qpqual_cost; |
| 300 | Cost cpu_per_tuple; |
| 301 | |
| 302 | /* Should only be applied to base relations with tablesample clauses */ |
| 303 | Assert(baserel->relid > 0); |
| 304 | rte = planner_rt_fetch(baserel->relid, root); |
| 305 | Assert(rte->rtekind == RTE_RELATION); |
| 306 | tsc = rte->tablesample; |
| 307 | Assert(tsc != NULL); |
| 308 | tsm = GetTsmRoutine(tsc->tsmhandler); |
| 309 | |
| 310 | /* Mark the path with the correct row estimate */ |
| 311 | if (param_info) |
| 312 | path->rows = param_info->ppi_rows; |
| 313 | else |
| 314 | path->rows = baserel->rows; |
| 315 | |
| 316 | /* fetch estimated page cost for tablespace containing table */ |
| 317 | get_tablespace_page_costs(baserel->reltablespace, |
| 318 | &spc_random_page_cost, |
| 319 | &spc_seq_page_cost); |
| 320 | |
| 321 | /* if NextSampleBlock is used, assume random access, else sequential */ |
| 322 | spc_page_cost = (tsm->NextSampleBlock != NULL) ? |
| 323 | spc_random_page_cost : spc_seq_page_cost; |
| 324 | |
| 325 | /* |
| 326 | * disk costs (recall that baserel->pages has already been set to the |
| 327 | * number of pages the sampling method will visit) |
| 328 | */ |
| 329 | run_cost += spc_page_cost * baserel->pages; |
| 330 | |
| 331 | /* |
| 332 | * CPU costs (recall that baserel->tuples has already been set to the |
| 333 | * number of tuples the sampling method will select). Note that we ignore |
| 334 | * execution cost of the TABLESAMPLE parameter expressions; they will be |
| 335 | * evaluated only once per scan, and in most usages they'll likely be |
| 336 | * simple constants anyway. We also don't charge anything for the |
| 337 | * calculations the sampling method might do internally. |
| 338 | */ |
| 339 | get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); |
| 340 | |
| 341 | startup_cost += qpqual_cost.startup; |
| 342 | cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; |
| 343 | run_cost += cpu_per_tuple * baserel->tuples; |
| 344 | /* tlist eval costs are paid per output row, not per tuple scanned */ |
| 345 | startup_cost += path->pathtarget->cost.startup; |
| 346 | run_cost += path->pathtarget->cost.per_tuple * path->rows; |
| 347 | |
| 348 | path->startup_cost = startup_cost; |
| 349 | path->total_cost = startup_cost + run_cost; |
| 350 | } |
| 351 | |
| 352 | /* |
| 353 | * cost_gather |
| 354 | * Determines and returns the cost of gather path. |
| 355 | * |
| 356 | * 'rel' is the relation to be operated upon |
| 357 | * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL |
| 358 | * 'rows' may be used to point to a row estimate; if non-NULL, it overrides |
| 359 | * both 'rel' and 'param_info'. This is useful when the path doesn't exactly |
| 360 | * correspond to any particular RelOptInfo. |
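 *
 * For example, with the default parallel_setup_cost of 1000 and
 * parallel_tuple_cost of 0.1, gathering 10000 rows adds 1000 to the startup
 * cost and 0.1 * 10000 = 1000 to the run cost, on top of the subpath's own
 * costs.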
| 361 | */ |
| 362 | void |
| 363 | cost_gather(GatherPath *path, PlannerInfo *root, |
| 364 | RelOptInfo *rel, ParamPathInfo *param_info, |
| 365 | double *rows) |
| 366 | { |
| 367 | Cost startup_cost = 0; |
| 368 | Cost run_cost = 0; |
| 369 | |
| 370 | /* Mark the path with the correct row estimate */ |
| 371 | if (rows) |
| 372 | path->path.rows = *rows; |
| 373 | else if (param_info) |
| 374 | path->path.rows = param_info->ppi_rows; |
| 375 | else |
| 376 | path->path.rows = rel->rows; |
| 377 | |
| 378 | startup_cost = path->subpath->startup_cost; |
| 379 | |
| 380 | run_cost = path->subpath->total_cost - path->subpath->startup_cost; |
| 381 | |
| 382 | /* Parallel setup and communication cost. */ |
| 383 | startup_cost += parallel_setup_cost; |
| 384 | run_cost += parallel_tuple_cost * path->path.rows; |
| 385 | |
| 386 | path->path.startup_cost = startup_cost; |
| 387 | path->path.total_cost = (startup_cost + run_cost); |
| 388 | } |
| 389 | |
| 390 | /* |
| 391 | * cost_gather_merge |
| 392 | * Determines and returns the cost of gather merge path. |
| 393 | * |
| 394 | * GatherMerge merges several pre-sorted input streams, using a heap that at |
| 395 | * any given instant holds the next tuple from each stream. If there are N |
| 396 | * streams, we need about N*log2(N) tuple comparisons to construct the heap at |
| 397 | * startup, and then for each output tuple, about log2(N) comparisons to |
| 398 | * replace the top heap entry with the next tuple from the same stream. |
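 *
 * For example, with 2 workers plus the leader there are N = 3 streams and
 * log2(3) is about 1.58, so heap construction costs roughly 3 * 1.58
 * comparisons and each output tuple costs about 1.58 comparisons thereafter.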
| 399 | */ |
| 400 | void |
| 401 | cost_gather_merge(GatherMergePath *path, PlannerInfo *root, |
| 402 | RelOptInfo *rel, ParamPathInfo *param_info, |
| 403 | Cost input_startup_cost, Cost input_total_cost, |
| 404 | double *rows) |
| 405 | { |
| 406 | Cost startup_cost = 0; |
| 407 | Cost run_cost = 0; |
| 408 | Cost comparison_cost; |
| 409 | double N; |
| 410 | double logN; |
| 411 | |
| 412 | /* Mark the path with the correct row estimate */ |
| 413 | if (rows) |
| 414 | path->path.rows = *rows; |
| 415 | else if (param_info) |
| 416 | path->path.rows = param_info->ppi_rows; |
| 417 | else |
| 418 | path->path.rows = rel->rows; |
| 419 | |
| 420 | if (!enable_gathermerge) |
| 421 | startup_cost += disable_cost; |
| 422 | |
| 423 | /* |
| 424 | * Add one to the number of workers to account for the leader. This might |
| 425 | * be overgenerous since the leader will do less work than other workers |
| 426 | * in typical cases, but we'll go with it for now. |
| 427 | */ |
| 428 | Assert(path->num_workers > 0); |
| 429 | N = (double) path->num_workers + 1; |
| 430 | logN = LOG2(N); |
| 431 | |
| 432 | /* Assumed cost per tuple comparison */ |
| 433 | comparison_cost = 2.0 * cpu_operator_cost; |
| 434 | |
| 435 | /* Heap creation cost */ |
| 436 | startup_cost += comparison_cost * N * logN; |
| 437 | |
| 438 | /* Per-tuple heap maintenance cost */ |
| 439 | run_cost += path->path.rows * comparison_cost * logN; |
| 440 | |
| 441 | /* small cost for heap management, like cost_merge_append */ |
| 442 | run_cost += cpu_operator_cost * path->path.rows; |
| 443 | |
| 444 | /* |
| 445 | * Parallel setup and communication cost. Since Gather Merge, unlike |
| 446 | * Gather, requires us to block until a tuple is available from every |
| 447 | * worker, we bump the IPC cost up a little bit as compared with Gather. |
| 448 | * For lack of a better idea, charge an extra 5%. |
| 449 | */ |
| 450 | startup_cost += parallel_setup_cost; |
| 451 | run_cost += parallel_tuple_cost * path->path.rows * 1.05; |
| 452 | |
| 453 | path->path.startup_cost = startup_cost + input_startup_cost; |
| 454 | path->path.total_cost = (startup_cost + run_cost + input_total_cost); |
| 455 | } |
| 456 | |
| 457 | /* |
| 458 | * cost_index |
| 459 | * Determines and returns the cost of scanning a relation using an index. |
| 460 | * |
| 461 | * 'path' describes the indexscan under consideration, and is complete |
| 462 | * except for the fields to be set by this routine |
| 463 | * 'loop_count' is the number of repetitions of the indexscan to factor into |
| 464 | * estimates of caching behavior |
| 465 | * |
| 466 | * In addition to rows, startup_cost and total_cost, cost_index() sets the |
| 467 | * path's indextotalcost and indexselectivity fields. These values will be |
| 468 | * needed if the IndexPath is used in a BitmapIndexScan. |
| 469 | * |
| 470 | * NOTE: path->indexquals must contain only clauses usable as index |
| 471 | * restrictions. Any additional quals evaluated as qpquals may reduce the |
| 472 | * number of returned tuples, but they won't reduce the number of tuples |
| 473 | * we have to fetch from the table, so they don't reduce the scan cost. |
| 474 | */ |
| 475 | void |
| 476 | cost_index(IndexPath *path, PlannerInfo *root, double loop_count, |
| 477 | bool partial_path) |
| 478 | { |
| 479 | IndexOptInfo *index = path->indexinfo; |
| 480 | RelOptInfo *baserel = index->rel; |
| 481 | bool indexonly = (path->path.pathtype == T_IndexOnlyScan); |
| 482 | amcostestimate_function amcostestimate; |
| 483 | List *qpquals; |
| 484 | Cost startup_cost = 0; |
| 485 | Cost run_cost = 0; |
| 486 | Cost cpu_run_cost = 0; |
| 487 | Cost indexStartupCost; |
| 488 | Cost indexTotalCost; |
| 489 | Selectivity indexSelectivity; |
| 490 | double indexCorrelation, |
| 491 | csquared; |
| 492 | double spc_seq_page_cost, |
| 493 | spc_random_page_cost; |
| 494 | Cost min_IO_cost, |
| 495 | max_IO_cost; |
| 496 | QualCost qpqual_cost; |
| 497 | Cost cpu_per_tuple; |
| 498 | double tuples_fetched; |
| 499 | double pages_fetched; |
| 500 | double rand_heap_pages; |
| 501 | double index_pages; |
| 502 | |
| 503 | /* Should only be applied to base relations */ |
| 504 | Assert(IsA(baserel, RelOptInfo) && |
| 505 | IsA(index, IndexOptInfo)); |
| 506 | Assert(baserel->relid > 0); |
| 507 | Assert(baserel->rtekind == RTE_RELATION); |
| 508 | |
| 509 | /* |
| 510 | * Mark the path with the correct row estimate, and identify which quals |
| 511 | * will need to be enforced as qpquals. We need not check any quals that |
| 512 | * are implied by the index's predicate, so we can use indrestrictinfo not |
| 513 | * baserestrictinfo as the list of relevant restriction clauses for the |
| 514 | * rel. |
| 515 | */ |
| 516 | if (path->path.param_info) |
| 517 | { |
| 518 | path->path.rows = path->path.param_info->ppi_rows; |
| 519 | /* qpquals come from the rel's restriction clauses and ppi_clauses */ |
| 520 | qpquals = list_concat(extract_nonindex_conditions(path->indexinfo->indrestrictinfo, |
| 521 | path->indexclauses), |
| 522 | extract_nonindex_conditions(path->path.param_info->ppi_clauses, |
| 523 | path->indexclauses)); |
| 524 | } |
| 525 | else |
| 526 | { |
| 527 | path->path.rows = baserel->rows; |
| 528 | /* qpquals come from just the rel's restriction clauses */ |
| 529 | qpquals = extract_nonindex_conditions(path->indexinfo->indrestrictinfo, |
| 530 | path->indexclauses); |
| 531 | } |
| 532 | |
| 533 | if (!enable_indexscan) |
| 534 | startup_cost += disable_cost; |
| 535 | /* we don't need to check enable_indexonlyscan; indxpath.c does that */ |
| 536 | |
| 537 | /* |
| 538 | * Call index-access-method-specific code to estimate the processing cost |
| 539 | * for scanning the index, as well as the selectivity of the index (ie, |
| 540 | * the fraction of main-table tuples we will have to retrieve) and its |
| 541 | * correlation to the main-table tuple order. We need a cast here because |
| 542 | * pathnodes.h uses a weak function type to avoid including amapi.h. |
| 543 | */ |
| 544 | amcostestimate = (amcostestimate_function) index->amcostestimate; |
| 545 | amcostestimate(root, path, loop_count, |
| 546 | &indexStartupCost, &indexTotalCost, |
| 547 | &indexSelectivity, &indexCorrelation, |
| 548 | &index_pages); |
| 549 | |
| 550 | /* |
| 551 | * Save amcostestimate's results for possible use in bitmap scan planning. |
| 552 | * We don't bother to save indexStartupCost or indexCorrelation, because a |
| 553 | * bitmap scan doesn't care about either. |
| 554 | */ |
| 555 | path->indextotalcost = indexTotalCost; |
| 556 | path->indexselectivity = indexSelectivity; |
| 557 | |
| 558 | /* all costs for touching index itself included here */ |
| 559 | startup_cost += indexStartupCost; |
| 560 | run_cost += indexTotalCost - indexStartupCost; |
| 561 | |
| 562 | /* estimate number of main-table tuples fetched */ |
| 563 | tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples); |
| 564 | |
| 565 | /* fetch estimated page costs for tablespace containing table */ |
| 566 | get_tablespace_page_costs(baserel->reltablespace, |
| 567 | &spc_random_page_cost, |
| 568 | &spc_seq_page_cost); |
| 569 | |
| 570 | /*---------- |
| 571 | * Estimate number of main-table pages fetched, and compute I/O cost. |
| 572 | * |
| 573 | * When the index ordering is uncorrelated with the table ordering, |
| 574 | * we use an approximation proposed by Mackert and Lohman (see |
| 575 | * index_pages_fetched() for details) to compute the number of pages |
| 576 | * fetched, and then charge spc_random_page_cost per page fetched. |
| 577 | * |
| 578 | * When the index ordering is exactly correlated with the table ordering |
| 579 | * (just after a CLUSTER, for example), the number of pages fetched should |
| 580 | * be exactly selectivity * table_size. What's more, all but the first |
| 581 | * will be sequential fetches, not the random fetches that occur in the |
| 582 | * uncorrelated case. So if the number of pages is more than 1, we |
| 583 | * ought to charge |
| 584 | * spc_random_page_cost + (pages_fetched - 1) * spc_seq_page_cost |
| 585 | * For partially-correlated indexes, we ought to charge somewhere between |
| 586 | * these two estimates. We currently interpolate linearly between the |
| 587 | * estimates based on the correlation squared (XXX is that appropriate?). |
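	 * For example, a correlation of 0.5 gives csquared = 0.25, so we charge
	 * max_IO_cost plus 25% of (min_IO_cost - max_IO_cost), i.e. a quarter of
	 * the way from the uncorrelated estimate toward the correlated one.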
| 588 | * |
| 589 | * If it's an index-only scan, then we will not need to fetch any heap |
| 590 | * pages for which the visibility map shows all tuples are visible. |
| 591 | * Hence, reduce the estimated number of heap fetches accordingly. |
| 592 | * We use the measured fraction of the entire heap that is all-visible, |
| 593 | * which might not be particularly relevant to the subset of the heap |
| 594 | * that this query will fetch; but it's not clear how to do better. |
| 595 | *---------- |
| 596 | */ |
| 597 | if (loop_count > 1) |
| 598 | { |
| 599 | /* |
| 600 | * For repeated indexscans, the appropriate estimate for the |
| 601 | * uncorrelated case is to scale up the number of tuples fetched in |
| 602 | * the Mackert and Lohman formula by the number of scans, so that we |
| 603 | * estimate the number of pages fetched by all the scans; then |
| 604 | * pro-rate the costs for one scan. In this case we assume all the |
| 605 | * fetches are random accesses. |
| 606 | */ |
| 607 | pages_fetched = index_pages_fetched(tuples_fetched * loop_count, |
| 608 | baserel->pages, |
| 609 | (double) index->pages, |
| 610 | root); |
| 611 | |
| 612 | if (indexonly) |
| 613 | pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac)); |
| 614 | |
| 615 | rand_heap_pages = pages_fetched; |
| 616 | |
| 617 | max_IO_cost = (pages_fetched * spc_random_page_cost) / loop_count; |
| 618 | |
| 619 | /* |
| 620 | * In the perfectly correlated case, the number of pages touched by |
| 621 | * each scan is selectivity * table_size, and we can use the Mackert |
| 622 | * and Lohman formula at the page level to estimate how much work is |
| 623 | * saved by caching across scans. We still assume all the fetches are |
| 624 | * random, though, which is an overestimate that's hard to correct for |
| 625 | * without double-counting the cache effects. (But in most cases |
| 626 | * where such a plan is actually interesting, only one page would get |
| 627 | * fetched per scan anyway, so it shouldn't matter much.) |
| 628 | */ |
| 629 | pages_fetched = ceil(indexSelectivity * (double) baserel->pages); |
| 630 | |
| 631 | pages_fetched = index_pages_fetched(pages_fetched * loop_count, |
| 632 | baserel->pages, |
| 633 | (double) index->pages, |
| 634 | root); |
| 635 | |
| 636 | if (indexonly) |
| 637 | pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac)); |
| 638 | |
| 639 | min_IO_cost = (pages_fetched * spc_random_page_cost) / loop_count; |
| 640 | } |
| 641 | else |
| 642 | { |
| 643 | /* |
| 644 | * Normal case: apply the Mackert and Lohman formula, and then |
| 645 | * interpolate between that and the correlation-derived result. |
| 646 | */ |
| 647 | pages_fetched = index_pages_fetched(tuples_fetched, |
| 648 | baserel->pages, |
| 649 | (double) index->pages, |
| 650 | root); |
| 651 | |
| 652 | if (indexonly) |
| 653 | pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac)); |
| 654 | |
| 655 | rand_heap_pages = pages_fetched; |
| 656 | |
| 657 | /* max_IO_cost is for the perfectly uncorrelated case (csquared=0) */ |
| 658 | max_IO_cost = pages_fetched * spc_random_page_cost; |
| 659 | |
| 660 | /* min_IO_cost is for the perfectly correlated case (csquared=1) */ |
| 661 | pages_fetched = ceil(indexSelectivity * (double) baserel->pages); |
| 662 | |
| 663 | if (indexonly) |
| 664 | pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac)); |
| 665 | |
| 666 | if (pages_fetched > 0) |
| 667 | { |
| 668 | min_IO_cost = spc_random_page_cost; |
| 669 | if (pages_fetched > 1) |
| 670 | min_IO_cost += (pages_fetched - 1) * spc_seq_page_cost; |
| 671 | } |
| 672 | else |
| 673 | min_IO_cost = 0; |
| 674 | } |
| 675 | |
| 676 | if (partial_path) |
| 677 | { |
| 678 | /* |
| 679 | * For index only scans compute workers based on number of index pages |
| 680 | * fetched; the number of heap pages we fetch might be so small as to |
| 681 | * effectively rule out parallelism, which we don't want to do. |
| 682 | */ |
| 683 | if (indexonly) |
| 684 | rand_heap_pages = -1; |
| 685 | |
| 686 | /* |
	 * Estimate the number of parallel workers required to scan the index.
	 * Use the number of heap pages computed above, keeping in mind that
	 * heap fetches won't be sequential: for parallel scans the pages are
	 * accessed in random order.
| 691 | */ |
| 692 | path->path.parallel_workers = compute_parallel_worker(baserel, |
| 693 | rand_heap_pages, |
| 694 | index_pages, |
| 695 | max_parallel_workers_per_gather); |
| 696 | |
| 697 | /* |
| 698 | * Fall out if workers can't be assigned for parallel scan, because in |
| 699 | * such a case this path will be rejected. So there is no benefit in |
| 700 | * doing extra computation. |
| 701 | */ |
| 702 | if (path->path.parallel_workers <= 0) |
| 703 | return; |
| 704 | |
| 705 | path->path.parallel_aware = true; |
| 706 | } |
| 707 | |
| 708 | /* |
| 709 | * Now interpolate based on estimated index order correlation to get total |
| 710 | * disk I/O cost for main table accesses. |
| 711 | */ |
| 712 | csquared = indexCorrelation * indexCorrelation; |
| 713 | |
| 714 | run_cost += max_IO_cost + csquared * (min_IO_cost - max_IO_cost); |
| 715 | |
| 716 | /* |
| 717 | * Estimate CPU costs per tuple. |
| 718 | * |
| 719 | * What we want here is cpu_tuple_cost plus the evaluation costs of any |
| 720 | * qual clauses that we have to evaluate as qpquals. |
| 721 | */ |
| 722 | cost_qual_eval(&qpqual_cost, qpquals, root); |
| 723 | |
| 724 | startup_cost += qpqual_cost.startup; |
| 725 | cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; |
| 726 | |
| 727 | cpu_run_cost += cpu_per_tuple * tuples_fetched; |
| 728 | |
| 729 | /* tlist eval costs are paid per output row, not per tuple scanned */ |
| 730 | startup_cost += path->path.pathtarget->cost.startup; |
| 731 | cpu_run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows; |
| 732 | |
| 733 | /* Adjust costing for parallelism, if used. */ |
| 734 | if (path->path.parallel_workers > 0) |
| 735 | { |
| 736 | double parallel_divisor = get_parallel_divisor(&path->path); |
| 737 | |
| 738 | path->path.rows = clamp_row_est(path->path.rows / parallel_divisor); |
| 739 | |
| 740 | /* The CPU cost is divided among all the workers. */ |
| 741 | cpu_run_cost /= parallel_divisor; |
| 742 | } |
| 743 | |
| 744 | run_cost += cpu_run_cost; |
| 745 | |
| 746 | path->path.startup_cost = startup_cost; |
| 747 | path->path.total_cost = startup_cost + run_cost; |
| 748 | } |
| 749 | |
| 750 | /* |
| 751 | * extract_nonindex_conditions |
| 752 | * |
| 753 | * Given a list of quals to be enforced in an indexscan, extract the ones that |
| 754 | * will have to be applied as qpquals (ie, the index machinery won't handle |
| 755 | * them). Here we detect only whether a qual clause is directly redundant |
| 756 | * with some indexclause. If the index path is chosen for use, createplan.c |
| 757 | * will try a bit harder to get rid of redundant qual conditions; specifically |
| 758 | * it will see if quals can be proven to be implied by the indexquals. But |
| 759 | * it does not seem worth the cycles to try to factor that in at this stage, |
| 760 | * since we're only trying to estimate qual eval costs. Otherwise this must |
| 761 | * match the logic in create_indexscan_plan(). |
| 762 | * |
| 763 | * qual_clauses, and the result, are lists of RestrictInfos. |
| 764 | * indexclauses is a list of IndexClauses. |
| 765 | */ |
| 766 | static List * |
extract_nonindex_conditions(List *qual_clauses, List *indexclauses)
| 768 | { |
| 769 | List *result = NIL; |
| 770 | ListCell *lc; |
| 771 | |
| 772 | foreach(lc, qual_clauses) |
| 773 | { |
| 774 | RestrictInfo *rinfo = lfirst_node(RestrictInfo, lc); |
| 775 | |
| 776 | if (rinfo->pseudoconstant) |
| 777 | continue; /* we may drop pseudoconstants here */ |
| 778 | if (is_redundant_with_indexclauses(rinfo, indexclauses)) |
| 779 | continue; /* dup or derived from same EquivalenceClass */ |
| 780 | /* ... skip the predicate proof attempt createplan.c will try ... */ |
| 781 | result = lappend(result, rinfo); |
| 782 | } |
| 783 | return result; |
| 784 | } |
| 785 | |
| 786 | /* |
| 787 | * index_pages_fetched |
| 788 | * Estimate the number of pages actually fetched after accounting for |
| 789 | * cache effects. |
| 790 | * |
| 791 | * We use an approximation proposed by Mackert and Lohman, "Index Scans |
| 792 | * Using a Finite LRU Buffer: A Validated I/O Model", ACM Transactions |
| 793 | * on Database Systems, Vol. 14, No. 3, September 1989, Pages 401-424. |
| 794 | * The Mackert and Lohman approximation is that the number of pages |
| 795 | * fetched is |
| 796 | * PF = |
| 797 | * min(2TNs/(2T+Ns), T) when T <= b |
| 798 | * 2TNs/(2T+Ns) when T > b and Ns <= 2Tb/(2T-b) |
| 799 | * b + (Ns - 2Tb/(2T-b))*(T-b)/T when T > b and Ns > 2Tb/(2T-b) |
| 800 | * where |
| 801 | * T = # pages in table |
| 802 | * N = # tuples in table |
| 803 | * s = selectivity = fraction of table to be scanned |
| 804 | * b = # buffer pages available (we include kernel space here) |
| 805 | * |
| 806 | * We assume that effective_cache_size is the total number of buffer pages |
| 807 | * available for the whole query, and pro-rate that space across all the |
| 808 | * tables in the query and the index currently under consideration. (This |
| 809 | * ignores space needed for other indexes used by the query, but since we |
| 810 | * don't know which indexes will get used, we can't estimate that very well; |
| 811 | * and in any case counting all the tables may well be an overestimate, since |
| 812 | * depending on the join plan not all the tables may be scanned concurrently.) |
| 813 | * |
| 814 | * The product Ns is the number of tuples fetched; we pass in that |
| 815 | * product rather than calculating it here. "pages" is the number of pages |
| 816 | * in the object under consideration (either an index or a table). |
| 817 | * "index_pages" is the amount to add to the total table space, which was |
| 818 | * computed for us by make_one_rel. |
| 819 | * |
| 820 | * Caller is expected to have ensured that tuples_fetched is greater than zero |
| 821 | * and rounded to integer (see clamp_row_est). The result will likewise be |
| 822 | * greater than zero and integral. |
| 823 | */ |
| 824 | double |
| 825 | index_pages_fetched(double tuples_fetched, BlockNumber pages, |
| 826 | double index_pages, PlannerInfo *root) |
| 827 | { |
| 828 | double pages_fetched; |
| 829 | double total_pages; |
| 830 | double T, |
| 831 | b; |
| 832 | |
| 833 | /* T is # pages in table, but don't allow it to be zero */ |
| 834 | T = (pages > 1) ? (double) pages : 1.0; |
| 835 | |
| 836 | /* Compute number of pages assumed to be competing for cache space */ |
| 837 | total_pages = root->total_table_pages + index_pages; |
| 838 | total_pages = Max(total_pages, 1.0); |
| 839 | Assert(T <= total_pages); |
| 840 | |
| 841 | /* b is pro-rated share of effective_cache_size */ |
| 842 | b = (double) effective_cache_size * T / total_pages; |
| 843 | |
| 844 | /* force it positive and integral */ |
| 845 | if (b <= 1.0) |
| 846 | b = 1.0; |
| 847 | else |
| 848 | b = ceil(b); |
| 849 | |
| 850 | /* This part is the Mackert and Lohman formula */ |
| 851 | if (T <= b) |
| 852 | { |
| 853 | pages_fetched = |
| 854 | (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched); |
| 855 | if (pages_fetched >= T) |
| 856 | pages_fetched = T; |
| 857 | else |
| 858 | pages_fetched = ceil(pages_fetched); |
| 859 | } |
| 860 | else |
| 861 | { |
| 862 | double lim; |
| 863 | |
| 864 | lim = (2.0 * T * b) / (2.0 * T - b); |
| 865 | if (tuples_fetched <= lim) |
| 866 | { |
| 867 | pages_fetched = |
| 868 | (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched); |
| 869 | } |
| 870 | else |
| 871 | { |
| 872 | pages_fetched = |
| 873 | b + (tuples_fetched - lim) * (T - b) / T; |
| 874 | } |
| 875 | pages_fetched = ceil(pages_fetched); |
| 876 | } |
| 877 | return pages_fetched; |
| 878 | } |
| 879 | |
| 880 | /* |
| 881 | * get_indexpath_pages |
| 882 | * Determine the total size of the indexes used in a bitmap index path. |
| 883 | * |
| 884 | * Note: if the same index is used more than once in a bitmap tree, we will |
| 885 | * count it multiple times, which perhaps is the wrong thing ... but it's |
| 886 | * not completely clear, and detecting duplicates is difficult, so ignore it |
| 887 | * for now. |
| 888 | */ |
| 889 | static double |
| 890 | get_indexpath_pages(Path *bitmapqual) |
| 891 | { |
| 892 | double result = 0; |
| 893 | ListCell *l; |
| 894 | |
| 895 | if (IsA(bitmapqual, BitmapAndPath)) |
| 896 | { |
| 897 | BitmapAndPath *apath = (BitmapAndPath *) bitmapqual; |
| 898 | |
| 899 | foreach(l, apath->bitmapquals) |
| 900 | { |
| 901 | result += get_indexpath_pages((Path *) lfirst(l)); |
| 902 | } |
| 903 | } |
| 904 | else if (IsA(bitmapqual, BitmapOrPath)) |
| 905 | { |
| 906 | BitmapOrPath *opath = (BitmapOrPath *) bitmapqual; |
| 907 | |
| 908 | foreach(l, opath->bitmapquals) |
| 909 | { |
| 910 | result += get_indexpath_pages((Path *) lfirst(l)); |
| 911 | } |
| 912 | } |
| 913 | else if (IsA(bitmapqual, IndexPath)) |
| 914 | { |
| 915 | IndexPath *ipath = (IndexPath *) bitmapqual; |
| 916 | |
| 917 | result = (double) ipath->indexinfo->pages; |
| 918 | } |
| 919 | else |
		elog(ERROR, "unrecognized node type: %d", nodeTag(bitmapqual));
| 921 | |
| 922 | return result; |
| 923 | } |
| 924 | |
| 925 | /* |
| 926 | * cost_bitmap_heap_scan |
| 927 | * Determines and returns the cost of scanning a relation using a bitmap |
| 928 | * index-then-heap plan. |
| 929 | * |
| 930 | * 'baserel' is the relation to be scanned |
| 931 | * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL |
| 932 | * 'bitmapqual' is a tree of IndexPaths, BitmapAndPaths, and BitmapOrPaths |
| 933 | * 'loop_count' is the number of repetitions of the indexscan to factor into |
| 934 | * estimates of caching behavior |
| 935 | * |
| 936 | * Note: the component IndexPaths in bitmapqual should have been costed |
| 937 | * using the same loop_count. |
| 938 | */ |
| 939 | void |
| 940 | cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel, |
| 941 | ParamPathInfo *param_info, |
| 942 | Path *bitmapqual, double loop_count) |
| 943 | { |
| 944 | Cost startup_cost = 0; |
| 945 | Cost run_cost = 0; |
| 946 | Cost indexTotalCost; |
| 947 | QualCost qpqual_cost; |
| 948 | Cost cpu_per_tuple; |
| 949 | Cost cost_per_page; |
| 950 | Cost cpu_run_cost; |
| 951 | double tuples_fetched; |
| 952 | double pages_fetched; |
| 953 | double spc_seq_page_cost, |
| 954 | spc_random_page_cost; |
| 955 | double T; |
| 956 | |
| 957 | /* Should only be applied to base relations */ |
| 958 | Assert(IsA(baserel, RelOptInfo)); |
| 959 | Assert(baserel->relid > 0); |
| 960 | Assert(baserel->rtekind == RTE_RELATION); |
| 961 | |
| 962 | /* Mark the path with the correct row estimate */ |
| 963 | if (param_info) |
| 964 | path->rows = param_info->ppi_rows; |
| 965 | else |
| 966 | path->rows = baserel->rows; |
| 967 | |
| 968 | if (!enable_bitmapscan) |
| 969 | startup_cost += disable_cost; |
| 970 | |
| 971 | pages_fetched = compute_bitmap_pages(root, baserel, bitmapqual, |
| 972 | loop_count, &indexTotalCost, |
| 973 | &tuples_fetched); |
| 974 | |
| 975 | startup_cost += indexTotalCost; |
| 976 | T = (baserel->pages > 1) ? (double) baserel->pages : 1.0; |
| 977 | |
| 978 | /* Fetch estimated page costs for tablespace containing table. */ |
| 979 | get_tablespace_page_costs(baserel->reltablespace, |
| 980 | &spc_random_page_cost, |
| 981 | &spc_seq_page_cost); |
| 982 | |
| 983 | /* |
| 984 | * For small numbers of pages we should charge spc_random_page_cost |
| 985 | * apiece, while if nearly all the table's pages are being read, it's more |
| 986 | * appropriate to charge spc_seq_page_cost apiece. The effect is |
| 987 | * nonlinear, too. For lack of a better idea, interpolate like this to |
| 988 | * determine the cost per page. |
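	 * For example, if a quarter of the table's pages will be fetched with
	 * spc_random_page_cost = 4.0 and spc_seq_page_cost = 1.0, the charge is
	 * 4.0 - (4.0 - 1.0) * sqrt(0.25) = 2.5 per page.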
| 989 | */ |
| 990 | if (pages_fetched >= 2.0) |
| 991 | cost_per_page = spc_random_page_cost - |
| 992 | (spc_random_page_cost - spc_seq_page_cost) |
| 993 | * sqrt(pages_fetched / T); |
| 994 | else |
| 995 | cost_per_page = spc_random_page_cost; |
| 996 | |
| 997 | run_cost += pages_fetched * cost_per_page; |
| 998 | |
| 999 | /* |
| 1000 | * Estimate CPU costs per tuple. |
| 1001 | * |
| 1002 | * Often the indexquals don't need to be rechecked at each tuple ... but |
| 1003 | * not always, especially not if there are enough tuples involved that the |
| 1004 | * bitmaps become lossy. For the moment, just assume they will be |
| 1005 | * rechecked always. This means we charge the full freight for all the |
| 1006 | * scan clauses. |
| 1007 | */ |
| 1008 | get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); |
| 1009 | |
| 1010 | startup_cost += qpqual_cost.startup; |
| 1011 | cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; |
| 1012 | cpu_run_cost = cpu_per_tuple * tuples_fetched; |
| 1013 | |
| 1014 | /* Adjust costing for parallelism, if used. */ |
| 1015 | if (path->parallel_workers > 0) |
| 1016 | { |
| 1017 | double parallel_divisor = get_parallel_divisor(path); |
| 1018 | |
| 1019 | /* The CPU cost is divided among all the workers. */ |
| 1020 | cpu_run_cost /= parallel_divisor; |
| 1021 | |
| 1022 | path->rows = clamp_row_est(path->rows / parallel_divisor); |
| 1023 | } |
| 1024 | |
| 1025 | |
| 1026 | run_cost += cpu_run_cost; |
| 1027 | |
| 1028 | /* tlist eval costs are paid per output row, not per tuple scanned */ |
| 1029 | startup_cost += path->pathtarget->cost.startup; |
| 1030 | run_cost += path->pathtarget->cost.per_tuple * path->rows; |
| 1031 | |
| 1032 | path->startup_cost = startup_cost; |
| 1033 | path->total_cost = startup_cost + run_cost; |
| 1034 | } |
| 1035 | |
| 1036 | /* |
| 1037 | * cost_bitmap_tree_node |
| 1038 | * Extract cost and selectivity from a bitmap tree node (index/and/or) |
| 1039 | */ |
| 1040 | void |
| 1041 | cost_bitmap_tree_node(Path *path, Cost *cost, Selectivity *selec) |
| 1042 | { |
| 1043 | if (IsA(path, IndexPath)) |
| 1044 | { |
| 1045 | *cost = ((IndexPath *) path)->indextotalcost; |
| 1046 | *selec = ((IndexPath *) path)->indexselectivity; |
| 1047 | |
| 1048 | /* |
| 1049 | * Charge a small amount per retrieved tuple to reflect the costs of |
| 1050 | * manipulating the bitmap. This is mostly to make sure that a bitmap |
| 1051 | * scan doesn't look to be the same cost as an indexscan to retrieve a |
| 1052 | * single tuple. |
| 1053 | */ |
| 1054 | *cost += 0.1 * cpu_operator_cost * path->rows; |
| 1055 | } |
| 1056 | else if (IsA(path, BitmapAndPath)) |
| 1057 | { |
| 1058 | *cost = path->total_cost; |
| 1059 | *selec = ((BitmapAndPath *) path)->bitmapselectivity; |
| 1060 | } |
| 1061 | else if (IsA(path, BitmapOrPath)) |
| 1062 | { |
| 1063 | *cost = path->total_cost; |
| 1064 | *selec = ((BitmapOrPath *) path)->bitmapselectivity; |
| 1065 | } |
| 1066 | else |
| 1067 | { |
		elog(ERROR, "unrecognized node type: %d", nodeTag(path));
| 1069 | *cost = *selec = 0; /* keep compiler quiet */ |
| 1070 | } |
| 1071 | } |
| 1072 | |
| 1073 | /* |
| 1074 | * cost_bitmap_and_node |
| 1075 | * Estimate the cost of a BitmapAnd node |
| 1076 | * |
| 1077 | * Note that this considers only the costs of index scanning and bitmap |
| 1078 | * creation, not the eventual heap access. In that sense the object isn't |
| 1079 | * truly a Path, but it has enough path-like properties (costs in particular) |
| 1080 | * to warrant treating it as one. We don't bother to set the path rows field, |
| 1081 | * however. |
| 1082 | */ |
| 1083 | void |
| 1084 | cost_bitmap_and_node(BitmapAndPath *path, PlannerInfo *root) |
| 1085 | { |
| 1086 | Cost totalCost; |
| 1087 | Selectivity selec; |
| 1088 | ListCell *l; |
| 1089 | |
| 1090 | /* |
| 1091 | * We estimate AND selectivity on the assumption that the inputs are |
| 1092 | * independent. This is probably often wrong, but we don't have the info |
| 1093 | * to do better. |
| 1094 | * |
| 1095 | * The runtime cost of the BitmapAnd itself is estimated at 100x |
| 1096 | * cpu_operator_cost for each tbm_intersect needed. Probably too small, |
| 1097 | * definitely too simplistic? |
| 1098 | */ |
| 1099 | totalCost = 0.0; |
| 1100 | selec = 1.0; |
| 1101 | foreach(l, path->bitmapquals) |
| 1102 | { |
| 1103 | Path *subpath = (Path *) lfirst(l); |
| 1104 | Cost subCost; |
| 1105 | Selectivity subselec; |
| 1106 | |
| 1107 | cost_bitmap_tree_node(subpath, &subCost, &subselec); |
| 1108 | |
| 1109 | selec *= subselec; |
| 1110 | |
| 1111 | totalCost += subCost; |
| 1112 | if (l != list_head(path->bitmapquals)) |
| 1113 | totalCost += 100.0 * cpu_operator_cost; |
| 1114 | } |
| 1115 | path->bitmapselectivity = selec; |
| 1116 | path->path.rows = 0; /* per above, not used */ |
| 1117 | path->path.startup_cost = totalCost; |
| 1118 | path->path.total_cost = totalCost; |
| 1119 | } |
| 1120 | |
| 1121 | /* |
| 1122 | * cost_bitmap_or_node |
| 1123 | * Estimate the cost of a BitmapOr node |
| 1124 | * |
| 1125 | * See comments for cost_bitmap_and_node. |
| 1126 | */ |
| 1127 | void |
| 1128 | cost_bitmap_or_node(BitmapOrPath *path, PlannerInfo *root) |
| 1129 | { |
| 1130 | Cost totalCost; |
| 1131 | Selectivity selec; |
| 1132 | ListCell *l; |
| 1133 | |
| 1134 | /* |
| 1135 | * We estimate OR selectivity on the assumption that the inputs are |
| 1136 | * non-overlapping, since that's often the case in "x IN (list)" type |
| 1137 | * situations. Of course, we clamp to 1.0 at the end. |
| 1138 | * |
| 1139 | * The runtime cost of the BitmapOr itself is estimated at 100x |
| 1140 | * cpu_operator_cost for each tbm_union needed. Probably too small, |
| 1141 | * definitely too simplistic? We are aware that the tbm_unions are |
| 1142 | * optimized out when the inputs are BitmapIndexScans. |
| 1143 | */ |
| 1144 | totalCost = 0.0; |
| 1145 | selec = 0.0; |
| 1146 | foreach(l, path->bitmapquals) |
| 1147 | { |
| 1148 | Path *subpath = (Path *) lfirst(l); |
| 1149 | Cost subCost; |
| 1150 | Selectivity subselec; |
| 1151 | |
| 1152 | cost_bitmap_tree_node(subpath, &subCost, &subselec); |
| 1153 | |
| 1154 | selec += subselec; |
| 1155 | |
| 1156 | totalCost += subCost; |
| 1157 | if (l != list_head(path->bitmapquals) && |
| 1158 | !IsA(subpath, IndexPath)) |
| 1159 | totalCost += 100.0 * cpu_operator_cost; |
| 1160 | } |
| 1161 | path->bitmapselectivity = Min(selec, 1.0); |
| 1162 | path->path.rows = 0; /* per above, not used */ |
| 1163 | path->path.startup_cost = totalCost; |
| 1164 | path->path.total_cost = totalCost; |
| 1165 | } |
| 1166 | |
| 1167 | /* |
| 1168 | * cost_tidscan |
| 1169 | * Determines and returns the cost of scanning a relation using TIDs. |
| 1170 | * |
| 1171 | * 'baserel' is the relation to be scanned |
| 1172 | * 'tidquals' is the list of TID-checkable quals |
| 1173 | * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL |
| 1174 | */ |
| 1175 | void |
| 1176 | cost_tidscan(Path *path, PlannerInfo *root, |
| 1177 | RelOptInfo *baserel, List *tidquals, ParamPathInfo *param_info) |
| 1178 | { |
| 1179 | Cost startup_cost = 0; |
| 1180 | Cost run_cost = 0; |
| 1181 | bool isCurrentOf = false; |
| 1182 | QualCost qpqual_cost; |
| 1183 | Cost cpu_per_tuple; |
| 1184 | QualCost tid_qual_cost; |
| 1185 | int ntuples; |
| 1186 | ListCell *l; |
| 1187 | double spc_random_page_cost; |
| 1188 | |
| 1189 | /* Should only be applied to base relations */ |
| 1190 | Assert(baserel->relid > 0); |
| 1191 | Assert(baserel->rtekind == RTE_RELATION); |
| 1192 | |
| 1193 | /* Mark the path with the correct row estimate */ |
| 1194 | if (param_info) |
| 1195 | path->rows = param_info->ppi_rows; |
| 1196 | else |
| 1197 | path->rows = baserel->rows; |
| 1198 | |
| 1199 | /* Count how many tuples we expect to retrieve */ |
| 1200 | ntuples = 0; |
| 1201 | foreach(l, tidquals) |
| 1202 | { |
| 1203 | RestrictInfo *rinfo = lfirst_node(RestrictInfo, l); |
| 1204 | Expr *qual = rinfo->clause; |
| 1205 | |
| 1206 | if (IsA(qual, ScalarArrayOpExpr)) |
| 1207 | { |
| 1208 | /* Each element of the array yields 1 tuple */ |
| 1209 | ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) qual; |
| 1210 | Node *arraynode = (Node *) lsecond(saop->args); |
| 1211 | |
| 1212 | ntuples += estimate_array_length(arraynode); |
| 1213 | } |
| 1214 | else if (IsA(qual, CurrentOfExpr)) |
| 1215 | { |
| 1216 | /* CURRENT OF yields 1 tuple */ |
| 1217 | isCurrentOf = true; |
| 1218 | ntuples++; |
| 1219 | } |
| 1220 | else |
| 1221 | { |
| 1222 | /* It's just CTID = something, count 1 tuple */ |
| 1223 | ntuples++; |
| 1224 | } |
| 1225 | } |
| 1226 | |
| 1227 | /* |
| 1228 | * We must force TID scan for WHERE CURRENT OF, because only nodeTidscan.c |
| 1229 | * understands how to do it correctly. Therefore, honor enable_tidscan |
| 1230 | * only when CURRENT OF isn't present. Also note that cost_qual_eval |
| 1231 | * counts a CurrentOfExpr as having startup cost disable_cost, which we |
| 1232 | * subtract off here; that's to prevent other plan types such as seqscan |
| 1233 | * from winning. |
| 1234 | */ |
| 1235 | if (isCurrentOf) |
| 1236 | { |
| 1237 | Assert(baserel->baserestrictcost.startup >= disable_cost); |
| 1238 | startup_cost -= disable_cost; |
| 1239 | } |
| 1240 | else if (!enable_tidscan) |
| 1241 | startup_cost += disable_cost; |
| 1242 | |
| 1243 | /* |
| 1244 | * The TID qual expressions will be computed once, any other baserestrict |
| 1245 | * quals once per retrieved tuple. |
| 1246 | */ |
| 1247 | cost_qual_eval(&tid_qual_cost, tidquals, root); |
| 1248 | |
| 1249 | /* fetch estimated page cost for tablespace containing table */ |
| 1250 | get_tablespace_page_costs(baserel->reltablespace, |
| 1251 | &spc_random_page_cost, |
| 1252 | NULL); |
| 1253 | |
| 1254 | /* disk costs --- assume each tuple on a different page */ |
| 1255 | run_cost += spc_random_page_cost * ntuples; |
| 1256 | |
| 1257 | /* Add scanning CPU costs */ |
| 1258 | get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); |
| 1259 | |
| 1260 | /* XXX currently we assume TID quals are a subset of qpquals */ |
| 1261 | startup_cost += qpqual_cost.startup + tid_qual_cost.per_tuple; |
| 1262 | cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple - |
| 1263 | tid_qual_cost.per_tuple; |
| 1264 | run_cost += cpu_per_tuple * ntuples; |
| 1265 | |
| 1266 | /* tlist eval costs are paid per output row, not per tuple scanned */ |
| 1267 | startup_cost += path->pathtarget->cost.startup; |
| 1268 | run_cost += path->pathtarget->cost.per_tuple * path->rows; |
| 1269 | |
| 1270 | path->startup_cost = startup_cost; |
| 1271 | path->total_cost = startup_cost + run_cost; |
| 1272 | } |
| 1273 | |
| 1274 | /* |
| 1275 | * cost_subqueryscan |
| 1276 | * Determines and returns the cost of scanning a subquery RTE. |
| 1277 | * |
| 1278 | * 'baserel' is the relation to be scanned |
| 1279 | * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL |
| 1280 | */ |
| 1281 | void |
| 1282 | cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root, |
| 1283 | RelOptInfo *baserel, ParamPathInfo *param_info) |
| 1284 | { |
| 1285 | Cost startup_cost; |
| 1286 | Cost run_cost; |
| 1287 | QualCost qpqual_cost; |
| 1288 | Cost cpu_per_tuple; |
| 1289 | |
| 1290 | /* Should only be applied to base relations that are subqueries */ |
| 1291 | Assert(baserel->relid > 0); |
| 1292 | Assert(baserel->rtekind == RTE_SUBQUERY); |
| 1293 | |
| 1294 | /* Mark the path with the correct row estimate */ |
| 1295 | if (param_info) |
| 1296 | path->path.rows = param_info->ppi_rows; |
| 1297 | else |
| 1298 | path->path.rows = baserel->rows; |
| 1299 | |
| 1300 | /* |
| 1301 | * Cost of path is cost of evaluating the subplan, plus cost of evaluating |
| 1302 | * any restriction clauses and tlist that will be attached to the |
| 1303 | * SubqueryScan node, plus cpu_tuple_cost to account for selection and |
| 1304 | * projection overhead. |
| 1305 | */ |
| 1306 | path->path.startup_cost = path->subpath->startup_cost; |
| 1307 | path->path.total_cost = path->subpath->total_cost; |
| 1308 | |
| 1309 | get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); |
| 1310 | |
| 1311 | startup_cost = qpqual_cost.startup; |
| 1312 | cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; |
| 1313 | run_cost = cpu_per_tuple * baserel->tuples; |
| 1314 | |
| 1315 | /* tlist eval costs are paid per output row, not per tuple scanned */ |
| 1316 | startup_cost += path->path.pathtarget->cost.startup; |
| 1317 | run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows; |
| 1318 | |
| 1319 | path->path.startup_cost += startup_cost; |
| 1320 | path->path.total_cost += startup_cost + run_cost; |
| 1321 | } |
| 1322 | |
| 1323 | /* |
| 1324 | * cost_functionscan |
| 1325 | * Determines and returns the cost of scanning a function RTE. |
| 1326 | * |
| 1327 | * 'baserel' is the relation to be scanned |
| 1328 | * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL |
| 1329 | */ |
| 1330 | void |
| 1331 | cost_functionscan(Path *path, PlannerInfo *root, |
| 1332 | RelOptInfo *baserel, ParamPathInfo *param_info) |
| 1333 | { |
| 1334 | Cost startup_cost = 0; |
| 1335 | Cost run_cost = 0; |
| 1336 | QualCost qpqual_cost; |
| 1337 | Cost cpu_per_tuple; |
| 1338 | RangeTblEntry *rte; |
| 1339 | QualCost exprcost; |
| 1340 | |
| 1341 | /* Should only be applied to base relations that are functions */ |
| 1342 | Assert(baserel->relid > 0); |
| 1343 | rte = planner_rt_fetch(baserel->relid, root); |
| 1344 | Assert(rte->rtekind == RTE_FUNCTION); |
| 1345 | |
| 1346 | /* Mark the path with the correct row estimate */ |
| 1347 | if (param_info) |
| 1348 | path->rows = param_info->ppi_rows; |
| 1349 | else |
| 1350 | path->rows = baserel->rows; |
| 1351 | |
| 1352 | /* |
| 1353 | * Estimate costs of executing the function expression(s). |
| 1354 | * |
| 1355 | * Currently, nodeFunctionscan.c always executes the functions to |
| 1356 | * completion before returning any rows, and caches the results in a |
| 1357 | * tuplestore. So the function eval cost is all startup cost, and per-row |
| 1358 | * costs are minimal. |
| 1359 | * |
| 1360 | * XXX in principle we ought to charge tuplestore spill costs if the |
| 1361 | * number of rows is large. However, given how phony our rowcount |
| 1362 | * estimates for functions tend to be, there's not a lot of point in that |
| 1363 | * refinement right now. |
| 1364 | */ |
| 1365 | cost_qual_eval_node(&exprcost, (Node *) rte->functions, root); |
| 1366 | |
| 1367 | startup_cost += exprcost.startup + exprcost.per_tuple; |
| 1368 | |
| 1369 | /* Add scanning CPU costs */ |
| 1370 | get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); |
| 1371 | |
| 1372 | startup_cost += qpqual_cost.startup; |
| 1373 | cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; |
| 1374 | run_cost += cpu_per_tuple * baserel->tuples; |
| 1375 | |
| 1376 | /* tlist eval costs are paid per output row, not per tuple scanned */ |
| 1377 | startup_cost += path->pathtarget->cost.startup; |
| 1378 | run_cost += path->pathtarget->cost.per_tuple * path->rows; |
| 1379 | |
| 1380 | path->startup_cost = startup_cost; |
| 1381 | path->total_cost = startup_cost + run_cost; |
| 1382 | } |
| 1383 | |
| 1384 | /* |
| 1385 | * cost_tablefuncscan |
| 1386 | * Determines and returns the cost of scanning a table function. |
| 1387 | * |
| 1388 | * 'baserel' is the relation to be scanned |
| 1389 | * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL |
| 1390 | */ |
| 1391 | void |
| 1392 | cost_tablefuncscan(Path *path, PlannerInfo *root, |
| 1393 | RelOptInfo *baserel, ParamPathInfo *param_info) |
| 1394 | { |
| 1395 | Cost startup_cost = 0; |
| 1396 | Cost run_cost = 0; |
| 1397 | QualCost qpqual_cost; |
| 1398 | Cost cpu_per_tuple; |
| 1399 | RangeTblEntry *rte; |
| 1400 | QualCost exprcost; |
| 1401 | |
| 1402 | /* Should only be applied to base relations that are functions */ |
| 1403 | Assert(baserel->relid > 0); |
| 1404 | rte = planner_rt_fetch(baserel->relid, root); |
| 1405 | Assert(rte->rtekind == RTE_TABLEFUNC); |
| 1406 | |
| 1407 | /* Mark the path with the correct row estimate */ |
| 1408 | if (param_info) |
| 1409 | path->rows = param_info->ppi_rows; |
| 1410 | else |
| 1411 | path->rows = baserel->rows; |
| 1412 | |
| 1413 | /* |
| 1414 | * Estimate costs of executing the table func expression(s). |
| 1415 | * |
| 1416 | * XXX in principle we ought to charge tuplestore spill costs if the |
| 1417 | * number of rows is large. However, given how phony our rowcount |
| 1418 | * estimates for tablefuncs tend to be, there's not a lot of point in that |
| 1419 | * refinement right now. |
| 1420 | */ |
| 1421 | cost_qual_eval_node(&exprcost, (Node *) rte->tablefunc, root); |
| 1422 | |
| 1423 | startup_cost += exprcost.startup + exprcost.per_tuple; |
| 1424 | |
| 1425 | /* Add scanning CPU costs */ |
| 1426 | get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); |
| 1427 | |
| 1428 | startup_cost += qpqual_cost.startup; |
| 1429 | cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; |
| 1430 | run_cost += cpu_per_tuple * baserel->tuples; |
| 1431 | |
| 1432 | /* tlist eval costs are paid per output row, not per tuple scanned */ |
| 1433 | startup_cost += path->pathtarget->cost.startup; |
| 1434 | run_cost += path->pathtarget->cost.per_tuple * path->rows; |
| 1435 | |
| 1436 | path->startup_cost = startup_cost; |
| 1437 | path->total_cost = startup_cost + run_cost; |
| 1438 | } |
| 1439 | |
| 1440 | /* |
| 1441 | * cost_valuesscan |
| 1442 | * Determines and returns the cost of scanning a VALUES RTE. |
| 1443 | * |
| 1444 | * 'baserel' is the relation to be scanned |
| 1445 | * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL |
| 1446 | */ |
| 1447 | void |
| 1448 | cost_valuesscan(Path *path, PlannerInfo *root, |
| 1449 | RelOptInfo *baserel, ParamPathInfo *param_info) |
| 1450 | { |
| 1451 | Cost startup_cost = 0; |
| 1452 | Cost run_cost = 0; |
| 1453 | QualCost qpqual_cost; |
| 1454 | Cost cpu_per_tuple; |
| 1455 | |
| 1456 | /* Should only be applied to base relations that are values lists */ |
| 1457 | Assert(baserel->relid > 0); |
| 1458 | Assert(baserel->rtekind == RTE_VALUES); |
| 1459 | |
| 1460 | /* Mark the path with the correct row estimate */ |
| 1461 | if (param_info) |
| 1462 | path->rows = param_info->ppi_rows; |
| 1463 | else |
| 1464 | path->rows = baserel->rows; |
| 1465 | |
| 1466 | /* |
| 1467 | * For now, estimate list evaluation cost at one operator eval per list |
| 1468 | * (probably pretty bogus, but is it worth being smarter?) |
| 1469 | */ |
| 1470 | cpu_per_tuple = cpu_operator_cost; |
| 1471 | |
| 1472 | /* Add scanning CPU costs */ |
| 1473 | get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); |
| 1474 | |
| 1475 | startup_cost += qpqual_cost.startup; |
| 1476 | cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple; |
| 1477 | run_cost += cpu_per_tuple * baserel->tuples; |
| 1478 | |
| 1479 | /* tlist eval costs are paid per output row, not per tuple scanned */ |
| 1480 | startup_cost += path->pathtarget->cost.startup; |
| 1481 | run_cost += path->pathtarget->cost.per_tuple * path->rows; |
| 1482 | |
| 1483 | path->startup_cost = startup_cost; |
| 1484 | path->total_cost = startup_cost + run_cost; |
| 1485 | } |
| 1486 | |
| 1487 | /* |
| 1488 | * cost_ctescan |
| 1489 | * Determines and returns the cost of scanning a CTE RTE. |
| 1490 | * |
| 1491 | * Note: this is used for both self-reference and regular CTEs; the |
| 1492 | * possible cost differences are below the threshold of what we could |
| 1493 | * estimate accurately anyway. Note that the costs of evaluating the |
| 1494 | * referenced CTE query are added into the final plan as initplan costs, |
| 1495 | * and should NOT be counted here. |
| 1496 | */ |
| 1497 | void |
| 1498 | cost_ctescan(Path *path, PlannerInfo *root, |
| 1499 | RelOptInfo *baserel, ParamPathInfo *param_info) |
| 1500 | { |
| 1501 | Cost startup_cost = 0; |
| 1502 | Cost run_cost = 0; |
| 1503 | QualCost qpqual_cost; |
| 1504 | Cost cpu_per_tuple; |
| 1505 | |
| 1506 | /* Should only be applied to base relations that are CTEs */ |
| 1507 | Assert(baserel->relid > 0); |
| 1508 | Assert(baserel->rtekind == RTE_CTE); |
| 1509 | |
| 1510 | /* Mark the path with the correct row estimate */ |
| 1511 | if (param_info) |
| 1512 | path->rows = param_info->ppi_rows; |
| 1513 | else |
| 1514 | path->rows = baserel->rows; |
| 1515 | |
| 1516 | /* Charge one CPU tuple cost per row for tuplestore manipulation */ |
| 1517 | cpu_per_tuple = cpu_tuple_cost; |
| 1518 | |
| 1519 | /* Add scanning CPU costs */ |
| 1520 | get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); |
| 1521 | |
| 1522 | startup_cost += qpqual_cost.startup; |
| 1523 | cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple; |
| 1524 | run_cost += cpu_per_tuple * baserel->tuples; |
| 1525 | |
| 1526 | /* tlist eval costs are paid per output row, not per tuple scanned */ |
| 1527 | startup_cost += path->pathtarget->cost.startup; |
| 1528 | run_cost += path->pathtarget->cost.per_tuple * path->rows; |
| 1529 | |
| 1530 | path->startup_cost = startup_cost; |
| 1531 | path->total_cost = startup_cost + run_cost; |
| 1532 | } |
| 1533 | |
| 1534 | /* |
| 1535 | * cost_namedtuplestorescan |
| 1536 | * Determines and returns the cost of scanning a named tuplestore. |
| 1537 | */ |
| 1538 | void |
| 1539 | cost_namedtuplestorescan(Path *path, PlannerInfo *root, |
| 1540 | RelOptInfo *baserel, ParamPathInfo *param_info) |
| 1541 | { |
| 1542 | Cost startup_cost = 0; |
| 1543 | Cost run_cost = 0; |
| 1544 | QualCost qpqual_cost; |
| 1545 | Cost cpu_per_tuple; |
| 1546 | |
| 1547 | /* Should only be applied to base relations that are Tuplestores */ |
| 1548 | Assert(baserel->relid > 0); |
| 1549 | Assert(baserel->rtekind == RTE_NAMEDTUPLESTORE); |
| 1550 | |
| 1551 | /* Mark the path with the correct row estimate */ |
| 1552 | if (param_info) |
| 1553 | path->rows = param_info->ppi_rows; |
| 1554 | else |
| 1555 | path->rows = baserel->rows; |
| 1556 | |
| 1557 | /* Charge one CPU tuple cost per row for tuplestore manipulation */ |
| 1558 | cpu_per_tuple = cpu_tuple_cost; |
| 1559 | |
| 1560 | /* Add scanning CPU costs */ |
| 1561 | get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); |
| 1562 | |
| 1563 | startup_cost += qpqual_cost.startup; |
| 1564 | cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple; |
| 1565 | run_cost += cpu_per_tuple * baserel->tuples; |
| 1566 | |
| 1567 | path->startup_cost = startup_cost; |
| 1568 | path->total_cost = startup_cost + run_cost; |
| 1569 | } |
| 1570 | |
| 1571 | /* |
| 1572 | * cost_resultscan |
| 1573 | * Determines and returns the cost of scanning an RTE_RESULT relation. |
| 1574 | */ |
| 1575 | void |
| 1576 | cost_resultscan(Path *path, PlannerInfo *root, |
| 1577 | RelOptInfo *baserel, ParamPathInfo *param_info) |
| 1578 | { |
| 1579 | Cost startup_cost = 0; |
| 1580 | Cost run_cost = 0; |
| 1581 | QualCost qpqual_cost; |
| 1582 | Cost cpu_per_tuple; |
| 1583 | |
| 1584 | /* Should only be applied to RTE_RESULT base relations */ |
| 1585 | Assert(baserel->relid > 0); |
| 1586 | Assert(baserel->rtekind == RTE_RESULT); |
| 1587 | |
| 1588 | /* Mark the path with the correct row estimate */ |
| 1589 | if (param_info) |
| 1590 | path->rows = param_info->ppi_rows; |
| 1591 | else |
| 1592 | path->rows = baserel->rows; |
| 1593 | |
| 1594 | /* We charge qual cost plus cpu_tuple_cost */ |
| 1595 | get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); |
| 1596 | |
| 1597 | startup_cost += qpqual_cost.startup; |
| 1598 | cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; |
| 1599 | run_cost += cpu_per_tuple * baserel->tuples; |
| 1600 | |
| 1601 | path->startup_cost = startup_cost; |
| 1602 | path->total_cost = startup_cost + run_cost; |
| 1603 | } |
| 1604 | |
| 1605 | /* |
| 1606 | * cost_recursive_union |
| 1607 | * Determines and returns the cost of performing a recursive union, |
| 1608 | * and also the estimated output size. |
| 1609 | * |
| 1610 | * We are given Paths for the nonrecursive and recursive terms. |
| 1611 | */ |
| 1612 | void |
| 1613 | cost_recursive_union(Path *runion, Path *nrterm, Path *rterm) |
| 1614 | { |
| 1615 | Cost startup_cost; |
| 1616 | Cost total_cost; |
| 1617 | double total_rows; |
| 1618 | |
| 1619 | /* We probably have decent estimates for the non-recursive term */ |
| 1620 | startup_cost = nrterm->startup_cost; |
| 1621 | total_cost = nrterm->total_cost; |
| 1622 | total_rows = nrterm->rows; |
| 1623 | |
| 1624 | /* |
| 1625 | * We arbitrarily assume that about 10 recursive iterations will be |
| 1626 | * needed, and that we've managed to get a good fix on the cost and output |
| 1627 | * size of each one of them. These are mighty shaky assumptions but it's |
| 1628 | * hard to see how to do better. |
| 1629 | */ |
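| | /* |
| | * Illustrative numbers only (not used below): with a hypothetical |
| | * nonrecursive term of 100 rows at total cost 10 and a recursive term |
| | * of 50 rows at total cost 20, this gives total_rows = 100 + 10*50 = 600 |
| | * and total_cost = 10 + 10*20 + cpu_tuple_cost * 600, i.e. about 216 |
| | * at the default cpu_tuple_cost of 0.01. |
| | */ |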
| 1630 | total_cost += 10 * rterm->total_cost; |
| 1631 | total_rows += 10 * rterm->rows; |
| 1632 | |
| 1633 | /* |
| 1634 | * Also charge cpu_tuple_cost per row to account for the costs of |
| 1635 | * manipulating the tuplestores. (We don't worry about possible |
| 1636 | * spill-to-disk costs.) |
| 1637 | */ |
| 1638 | total_cost += cpu_tuple_cost * total_rows; |
| 1639 | |
| 1640 | runion->startup_cost = startup_cost; |
| 1641 | runion->total_cost = total_cost; |
| 1642 | runion->rows = total_rows; |
| 1643 | runion->pathtarget->width = Max(nrterm->pathtarget->width, |
| 1644 | rterm->pathtarget->width); |
| 1645 | } |
| 1646 | |
| 1647 | /* |
| 1648 | * cost_sort |
| 1649 | * Determines and returns the cost of sorting a relation, including |
| 1650 | * the cost of reading the input data. |
| 1651 | * |
| 1652 | * If the total volume of data to sort is less than sort_mem, we will do |
| 1653 | * an in-memory sort, which requires no I/O and about t*log2(t) tuple |
| 1654 | * comparisons for t tuples. |
| 1655 | * |
| 1656 | * If the total volume exceeds sort_mem, we switch to a tape-style merge |
| 1657 | * algorithm. There will still be about t*log2(t) tuple comparisons in |
| 1658 | * total, but we will also need to write and read each tuple once per |
| 1659 | * merge pass. We expect about ceil(logM(r)) merge passes where r is the |
| 1660 | * number of initial runs formed and M is the merge order used by tuplesort.c. |
| 1661 | * Since the average initial run should be about sort_mem, we have |
| 1662 | * disk traffic = 2 * relsize * ceil(logM(relsize / sort_mem)) |
| 1663 | * cpu = comparison_cost * t * log2(t) |
| 1664 | * |
| 1665 | * If the sort is bounded (i.e., only the first k result tuples are needed) |
| 1666 | * and k tuples can fit into sort_mem, we use a heap method that keeps only |
| 1667 | * k tuples in the heap; this will require about t*log2(k) tuple comparisons. |
| 1668 | * |
| 1669 | * The disk traffic is assumed to be 3/4ths sequential and 1/4th random |
| 1670 | * accesses (XXX can't we refine that guess?) |
| 1671 | * |
| 1672 | * By default, we charge two operator evals per tuple comparison, which should |
| 1673 | * be in the right ballpark in most cases. The caller can tweak this by |
| 1674 | * specifying nonzero comparison_cost; typically that's used for any extra |
| 1675 | * work that has to be done to prepare the inputs to the comparison operators. |
| 1676 | * |
| 1677 | * 'pathkeys' is a list of sort keys |
| 1678 | * 'input_cost' is the total cost for reading the input data |
| 1679 | * 'tuples' is the number of tuples in the relation |
| 1680 | * 'width' is the average tuple width in bytes |
| 1681 | * 'comparison_cost' is the extra cost per comparison, if any |
| 1682 | * 'sort_mem' is the number of kilobytes of work memory allowed for the sort |
| 1683 | * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound |
| 1684 | * |
| 1685 | * NOTE: some callers currently pass NIL for pathkeys because they |
| 1686 | * can't conveniently supply the sort keys. Since this routine doesn't |
| 1687 | * currently do anything with pathkeys anyway, that doesn't matter... |
| 1688 | * but if it ever does, it should react gracefully to lack of key data. |
| 1689 | * (Actually, the thing we'd most likely be interested in is just the number |
| 1690 | * of sort keys, which all callers *could* supply.) |
| 1691 | */ |
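| | /* |
| | * A rough worked example (illustrative only, assuming the default |
| | * cpu_operator_cost of 0.0025, hence comparison_cost = 0.005, and no |
| | * caller-supplied extra comparison cost): sorting t = 1,000,000 tuples |
| | * entirely in memory charges about 0.005 * 1e6 * log2(1e6) ~= 100,000 |
| | * cost units of startup cost, while a bounded sort keeping only the top |
| | * k = 100 tuples charges about 0.005 * 1e6 * log2(200) ~= 38,000. Both |
| | * also pay cpu_operator_cost per extracted tuple as run cost. |
| | */ |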
| 1692 | void |
| 1693 | cost_sort(Path *path, PlannerInfo *root, |
| 1694 | List *pathkeys, Cost input_cost, double tuples, int width, |
| 1695 | Cost comparison_cost, int sort_mem, |
| 1696 | double limit_tuples) |
| 1697 | { |
| 1698 | Cost startup_cost = input_cost; |
| 1699 | Cost run_cost = 0; |
| 1700 | double input_bytes = relation_byte_size(tuples, width); |
| 1701 | double output_bytes; |
| 1702 | double output_tuples; |
| 1703 | long sort_mem_bytes = sort_mem * 1024L; |
| 1704 | |
| 1705 | if (!enable_sort) |
| 1706 | startup_cost += disable_cost; |
| 1707 | |
| 1708 | path->rows = tuples; |
| 1709 | |
| 1710 | /* |
| 1711 | * We want to be sure the cost of a sort is never estimated as zero, even |
| 1712 | * if passed-in tuple count is zero. Besides, mustn't do log(0)... |
| 1713 | */ |
| 1714 | if (tuples < 2.0) |
| 1715 | tuples = 2.0; |
| 1716 | |
| 1717 | /* Include the default cost-per-comparison */ |
| 1718 | comparison_cost += 2.0 * cpu_operator_cost; |
| 1719 | |
| 1720 | /* Do we have a useful LIMIT? */ |
| 1721 | if (limit_tuples > 0 && limit_tuples < tuples) |
| 1722 | { |
| 1723 | output_tuples = limit_tuples; |
| 1724 | output_bytes = relation_byte_size(output_tuples, width); |
| 1725 | } |
| 1726 | else |
| 1727 | { |
| 1728 | output_tuples = tuples; |
| 1729 | output_bytes = input_bytes; |
| 1730 | } |
| 1731 | |
| 1732 | if (output_bytes > sort_mem_bytes) |
| 1733 | { |
| 1734 | /* |
| 1735 | * We'll have to use a disk-based sort of all the tuples |
| 1736 | */ |
| 1737 | double npages = ceil(input_bytes / BLCKSZ); |
| 1738 | double nruns = input_bytes / sort_mem_bytes; |
| 1739 | double mergeorder = tuplesort_merge_order(sort_mem_bytes); |
| 1740 | double log_runs; |
| 1741 | double npageaccesses; |
| 1742 | |
| 1743 | /* |
| 1744 | * CPU costs |
| 1745 | * |
| 1746 | * Assume about N log2 N comparisons |
| 1747 | */ |
| 1748 | startup_cost += comparison_cost * tuples * LOG2(tuples); |
| 1749 | |
| 1750 | /* Disk costs */ |
| 1751 | |
| 1752 | /* Compute logM(r) as log(r) / log(M) */ |
| 1753 | if (nruns > mergeorder) |
| 1754 | log_runs = ceil(log(nruns) / log(mergeorder)); |
| 1755 | else |
| 1756 | log_runs = 1.0; |
| 1757 | npageaccesses = 2.0 * npages * log_runs; |
| 1758 | /* Assume 3/4ths of accesses are sequential, 1/4th are not */ |
| 1759 | startup_cost += npageaccesses * |
| 1760 | (seq_page_cost * 0.75 + random_page_cost * 0.25); |
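| | /* |
| | * Illustrative numbers only: for 1GB of input with sort_mem = 64MB, the |
| | * default 8kB BLCKSZ, and default page costs (seq_page_cost = 1.0, |
| | * random_page_cost = 4.0), npages = 131072 and nruns = 16; assuming the |
| | * merge order exceeds 16, log_runs = 1 and the disk component is |
| | * 2 * 131072 * (1.0 * 0.75 + 4.0 * 0.25) = 458752 cost units. |
| | */ |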
| 1761 | } |
| 1762 | else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes) |
| 1763 | { |
| 1764 | /* |
| 1765 | * We'll use a bounded heap-sort keeping just K tuples in memory, for |
| 1766 | * a total number of tuple comparisons of N log2 K; but the constant |
| 1767 | * factor is a bit higher than for quicksort. Tweak it so that the |
| 1768 | * cost curve is continuous at the crossover point. |
| 1769 | */ |
| 1770 | startup_cost += comparison_cost * tuples * LOG2(2.0 * output_tuples); |
| 1771 | } |
| 1772 | else |
| 1773 | { |
| 1774 | /* We'll use plain quicksort on all the input tuples */ |
| 1775 | startup_cost += comparison_cost * tuples * LOG2(tuples); |
| 1776 | } |
| 1777 | |
| 1778 | /* |
| 1779 | * Also charge a small amount (arbitrarily set equal to operator cost) per |
| 1780 | * extracted tuple. We don't charge cpu_tuple_cost because a Sort node |
| 1781 | * doesn't do qual-checking or projection, so it has less overhead than |
| 1782 | * most plan nodes. Note it's correct to use tuples not output_tuples |
| 1783 | * here --- the upper LIMIT will pro-rate the run cost so we'd be double |
| 1784 | * counting the LIMIT otherwise. |
| 1785 | */ |
| 1786 | run_cost += cpu_operator_cost * tuples; |
| 1787 | |
| 1788 | path->startup_cost = startup_cost; |
| 1789 | path->total_cost = startup_cost + run_cost; |
| 1790 | } |
| 1791 | |
| 1792 | /* |
| 1793 | * append_nonpartial_cost |
| 1794 | * Estimate the cost of the non-partial paths in a Parallel Append. |
| 1795 | * The non-partial paths are assumed to be the first "numpaths" paths |
| 1796 | * from the subpaths list, and to be in order of decreasing cost. |
| 1797 | */ |
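| | /* |
| | * A hypothetical example of the greedy estimate below: with 3 workers |
| | * and 5 non-partial subpaths of costs 8, 5, 3, 2 and 1, the first three |
| | * paths seed the per-worker totals [8, 5, 3]; each remaining path is |
| | * added to the currently cheapest slot, giving [8, 6, 5]; the returned |
| | * estimate is the busiest worker's total, 8. |
| | */ |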
| 1798 | static Cost |
| 1799 | append_nonpartial_cost(List *subpaths, int numpaths, int parallel_workers) |
| 1800 | { |
| 1801 | Cost *costarr; |
| 1802 | int arrlen; |
| 1803 | ListCell *l; |
| 1804 | ListCell *cell; |
| 1805 | int i; |
| 1806 | int path_index; |
| 1807 | int min_index; |
| 1808 | int max_index; |
| 1809 | |
| 1810 | if (numpaths == 0) |
| 1811 | return 0; |
| 1812 | |
| 1813 | /* |
| 1814 | * Array length is number of workers or number of relevant paths, |
| 1815 | * whichever is less. |
| 1816 | */ |
| 1817 | arrlen = Min(parallel_workers, numpaths); |
| 1818 | costarr = (Cost *) palloc(sizeof(Cost) * arrlen); |
| 1819 | |
| 1820 | /* The first few paths will each be claimed by a different worker. */ |
| 1821 | path_index = 0; |
| 1822 | foreach(cell, subpaths) |
| 1823 | { |
| 1824 | Path *subpath = (Path *) lfirst(cell); |
| 1825 | |
| 1826 | if (path_index == arrlen) |
| 1827 | break; |
| 1828 | costarr[path_index++] = subpath->total_cost; |
| 1829 | } |
| 1830 | |
| 1831 | /* |
| 1832 | * Since subpaths are sorted by decreasing cost, the last one will have |
| 1833 | * the minimum cost. |
| 1834 | */ |
| 1835 | min_index = arrlen - 1; |
| 1836 | |
| 1837 | /* |
| 1838 | * For each of the remaining subpaths, add its cost to the array element |
| 1839 | * with minimum cost. |
| 1840 | */ |
| 1841 | for_each_cell(l, cell) |
| 1842 | { |
| 1843 | Path *subpath = (Path *) lfirst(l); |
| 1844 | int i; |
| 1845 | |
| 1846 | /* Consider only the non-partial paths */ |
| 1847 | if (path_index++ == numpaths) |
| 1848 | break; |
| 1849 | |
| 1850 | costarr[min_index] += subpath->total_cost; |
| 1851 | |
| 1852 | /* Update the new min cost array index */ |
| 1853 | for (min_index = i = 0; i < arrlen; i++) |
| 1854 | { |
| 1855 | if (costarr[i] < costarr[min_index]) |
| 1856 | min_index = i; |
| 1857 | } |
| 1858 | } |
| 1859 | |
| 1860 | /* Return the highest cost from the array */ |
| 1861 | for (max_index = i = 0; i < arrlen; i++) |
| 1862 | { |
| 1863 | if (costarr[i] > costarr[max_index]) |
| 1864 | max_index = i; |
| 1865 | } |
| 1866 | |
| 1867 | return costarr[max_index]; |
| 1868 | } |
| 1869 | |
| 1870 | /* |
| 1871 | * cost_append |
| 1872 | * Determines and returns the cost of an Append node. |
| 1873 | */ |
| 1874 | void |
| 1875 | cost_append(AppendPath *apath) |
| 1876 | { |
| 1877 | ListCell *l; |
| 1878 | |
| 1879 | apath->path.startup_cost = 0; |
| 1880 | apath->path.total_cost = 0; |
| 1881 | apath->path.rows = 0; |
| 1882 | |
| 1883 | if (apath->subpaths == NIL) |
| 1884 | return; |
| 1885 | |
| 1886 | if (!apath->path.parallel_aware) |
| 1887 | { |
| 1888 | List *pathkeys = apath->path.pathkeys; |
| 1889 | |
| 1890 | if (pathkeys == NIL) |
| 1891 | { |
| 1892 | Path *subpath = (Path *) linitial(apath->subpaths); |
| 1893 | |
| 1894 | /* |
| 1895 | * For an unordered, non-parallel-aware Append we take the startup |
| 1896 | * cost as the startup cost of the first subpath. |
| 1897 | */ |
| 1898 | apath->path.startup_cost = subpath->startup_cost; |
| 1899 | |
| 1900 | /* Compute rows and costs as sums of subplan rows and costs. */ |
| 1901 | foreach(l, apath->subpaths) |
| 1902 | { |
| 1903 | Path *subpath = (Path *) lfirst(l); |
| 1904 | |
| 1905 | apath->path.rows += subpath->rows; |
| 1906 | apath->path.total_cost += subpath->total_cost; |
| 1907 | } |
| 1908 | } |
| 1909 | else |
| 1910 | { |
| 1911 | /* |
| 1912 | * For an ordered, non-parallel-aware Append we take the startup |
| 1913 | * cost as the sum of the subpath startup costs. This ensures |
| 1914 | * that we don't underestimate the startup cost when a query's |
| 1915 | * LIMIT is such that several of the children have to be run to |
| 1916 | * satisfy it. This might be overkill --- another plausible hack |
| 1917 | * would be to take the Append's startup cost as the maximum of |
| 1918 | * the child startup costs. But we don't want to risk believing |
| 1919 | * that an ORDER BY LIMIT query can be satisfied at small cost |
| 1920 | * when the first child has small startup cost but later ones |
| 1921 | * don't. (If we had the ability to deal with nonlinear cost |
| 1922 | * interpolation for partial retrievals, we would not need to be |
| 1923 | * so conservative about this.) |
| 1924 | * |
| 1925 | * This case is also different from the above in that we have to |
| 1926 | * account for possibly injecting sorts into subpaths that aren't |
| 1927 | * natively ordered. |
| 1928 | */ |
| 1929 | foreach(l, apath->subpaths) |
| 1930 | { |
| 1931 | Path *subpath = (Path *) lfirst(l); |
| 1932 | Path sort_path; /* dummy for result of cost_sort */ |
| 1933 | |
| 1934 | if (!pathkeys_contained_in(pathkeys, subpath->pathkeys)) |
| 1935 | { |
| 1936 | /* |
| 1937 | * We'll need to insert a Sort node, so include costs for |
| 1938 | * that. We can use the parent's LIMIT if any, since we |
| 1939 | * certainly won't pull more than that many tuples from |
| 1940 | * any child. |
| 1941 | */ |
| 1942 | cost_sort(&sort_path, |
| 1943 | NULL, /* doesn't currently need root */ |
| 1944 | pathkeys, |
| 1945 | subpath->total_cost, |
| 1946 | subpath->rows, |
| 1947 | subpath->pathtarget->width, |
| 1948 | 0.0, |
| 1949 | work_mem, |
| 1950 | apath->limit_tuples); |
| 1951 | subpath = &sort_path; |
| 1952 | } |
| 1953 | |
| 1954 | apath->path.rows += subpath->rows; |
| 1955 | apath->path.startup_cost += subpath->startup_cost; |
| 1956 | apath->path.total_cost += subpath->total_cost; |
| 1957 | } |
| 1958 | } |
| 1959 | } |
| 1960 | else /* parallel-aware */ |
| 1961 | { |
| 1962 | int i = 0; |
| 1963 | double parallel_divisor = get_parallel_divisor(&apath->path); |
| 1964 | |
| 1965 | /* Parallel-aware Append never produces ordered output. */ |
| 1966 | Assert(apath->path.pathkeys == NIL); |
| 1967 | |
| 1968 | /* Calculate startup cost. */ |
| 1969 | foreach(l, apath->subpaths) |
| 1970 | { |
| 1971 | Path *subpath = (Path *) lfirst(l); |
| 1972 | |
| 1973 | /* |
| 1974 | * Append will start returning tuples when the child node having |
| 1975 | * lowest startup cost is done setting up. We consider only the |
| 1976 | * first few subplans that immediately get a worker assigned. |
| 1977 | */ |
| 1978 | if (i == 0) |
| 1979 | apath->path.startup_cost = subpath->startup_cost; |
| 1980 | else if (i < apath->path.parallel_workers) |
| 1981 | apath->path.startup_cost = Min(apath->path.startup_cost, |
| 1982 | subpath->startup_cost); |
| 1983 | |
| 1984 | /* |
| 1985 | * Apply parallel divisor to subpaths. Scale the number of rows |
| 1986 | * for each partial subpath based on the ratio of the parallel |
| 1987 | * divisor originally used for the subpath to the one we adopted. |
| 1988 | * Also add the cost of partial paths to the total cost, but |
| 1989 | * ignore non-partial paths for now. |
| 1990 | */ |
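| | /* |
| | * For example (illustrative only): a partial subpath planned for 2 |
| | * workers has, with parallel_leader_participation enabled, a divisor |
| | * of about 2.4, so its row count is already divided by 2.4; if this |
| | * Append adopts 4 workers (divisor 4.0), that child's contribution is |
| | * rescaled by 2.4 / 4.0. |
| | */ |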
| 1991 | if (i < apath->first_partial_path) |
| 1992 | apath->path.rows += subpath->rows / parallel_divisor; |
| 1993 | else |
| 1994 | { |
| 1995 | double subpath_parallel_divisor; |
| 1996 | |
| 1997 | subpath_parallel_divisor = get_parallel_divisor(subpath); |
| 1998 | apath->path.rows += subpath->rows * (subpath_parallel_divisor / |
| 1999 | parallel_divisor); |
| 2000 | apath->path.total_cost += subpath->total_cost; |
| 2001 | } |
| 2002 | |
| 2003 | apath->path.rows = clamp_row_est(apath->path.rows); |
| 2004 | |
| 2005 | i++; |
| 2006 | } |
| 2007 | |
| 2008 | /* Add cost for non-partial subpaths. */ |
| 2009 | apath->path.total_cost += |
| 2010 | append_nonpartial_cost(apath->subpaths, |
| 2011 | apath->first_partial_path, |
| 2012 | apath->path.parallel_workers); |
| 2013 | } |
| 2014 | |
| 2015 | /* |
| 2016 | * Although Append does not do any selection or projection, it's not free; |
| 2017 | * add a small per-tuple overhead. |
| 2018 | */ |
| 2019 | apath->path.total_cost += |
| 2020 | cpu_tuple_cost * APPEND_CPU_COST_MULTIPLIER * apath->path.rows; |
| 2021 | } |
| 2022 | |
| 2023 | /* |
| 2024 | * cost_merge_append |
| 2025 | * Determines and returns the cost of a MergeAppend node. |
| 2026 | * |
| 2027 | * MergeAppend merges several pre-sorted input streams, using a heap that |
| 2028 | * at any given instant holds the next tuple from each stream. If there |
| 2029 | * are N streams, we need about N*log2(N) tuple comparisons to construct |
| 2030 | * the heap at startup, and then for each output tuple, about log2(N) |
| 2031 | * comparisons to replace the top entry. |
| 2032 | * |
| 2033 | * (The effective value of N will drop once some of the input streams are |
| 2034 | * exhausted, but it seems unlikely to be worth trying to account for that.) |
| 2035 | * |
| 2036 | * The heap is never spilled to disk, since we assume N is not very large. |
| 2037 | * So this is much simpler than cost_sort. |
| 2038 | * |
| 2039 | * As in cost_sort, we charge two operator evals per tuple comparison. |
| 2040 | * |
| 2041 | * 'pathkeys' is a list of sort keys |
| 2042 | * 'n_streams' is the number of input streams |
| 2043 | * 'input_startup_cost' is the sum of the input streams' startup costs |
| 2044 | * 'input_total_cost' is the sum of the input streams' total costs |
| 2045 | * 'tuples' is the number of tuples in all the streams |
| 2046 | */ |
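| | /* |
| | * Rough illustration (assuming the default cpu_operator_cost of 0.0025, |
| | * hence comparison_cost = 0.005, and cpu_tuple_cost = 0.01): merging |
| | * N = 4 streams costs about 0.005 * 4 * 2 = 0.04 to build the heap and |
| | * then about 0.005 * 2 + 0.5 * 0.01 = 0.015 per output tuple, including |
| | * the APPEND_CPU_COST_MULTIPLIER overhead charged below. |
| | */ |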
| 2047 | void |
| 2048 | cost_merge_append(Path *path, PlannerInfo *root, |
| 2049 | List *pathkeys, int n_streams, |
| 2050 | Cost input_startup_cost, Cost input_total_cost, |
| 2051 | double tuples) |
| 2052 | { |
| 2053 | Cost startup_cost = 0; |
| 2054 | Cost run_cost = 0; |
| 2055 | Cost comparison_cost; |
| 2056 | double N; |
| 2057 | double logN; |
| 2058 | |
| 2059 | /* |
| 2060 | * Avoid log(0)... |
| 2061 | */ |
| 2062 | N = (n_streams < 2) ? 2.0 : (double) n_streams; |
| 2063 | logN = LOG2(N); |
| 2064 | |
| 2065 | /* Assumed cost per tuple comparison */ |
| 2066 | comparison_cost = 2.0 * cpu_operator_cost; |
| 2067 | |
| 2068 | /* Heap creation cost */ |
| 2069 | startup_cost += comparison_cost * N * logN; |
| 2070 | |
| 2071 | /* Per-tuple heap maintenance cost */ |
| 2072 | run_cost += tuples * comparison_cost * logN; |
| 2073 | |
| 2074 | /* |
| 2075 | * Although MergeAppend does not do any selection or projection, it's not |
| 2076 | * free; add a small per-tuple overhead. |
| 2077 | */ |
| 2078 | run_cost += cpu_tuple_cost * APPEND_CPU_COST_MULTIPLIER * tuples; |
| 2079 | |
| 2080 | path->startup_cost = startup_cost + input_startup_cost; |
| 2081 | path->total_cost = startup_cost + run_cost + input_total_cost; |
| 2082 | } |
| 2083 | |
| 2084 | /* |
| 2085 | * cost_material |
| 2086 | * Determines and returns the cost of materializing a relation, including |
| 2087 | * the cost of reading the input data. |
| 2088 | * |
| 2089 | * If the total volume of data to materialize exceeds work_mem, we will need |
| 2090 | * to write it to disk, so the cost is much higher in that case. |
| 2091 | * |
| 2092 | * Note that here we are estimating the costs for the first scan of the |
| 2093 | * relation, so the materialization is all overhead --- any savings will |
| 2094 | * occur only on rescan, which is estimated in cost_rescan. |
| 2095 | */ |
| 2096 | void |
| 2097 | cost_material(Path *path, |
| 2098 | Cost input_startup_cost, Cost input_total_cost, |
| 2099 | double tuples, int width) |
| 2100 | { |
| 2101 | Cost startup_cost = input_startup_cost; |
| 2102 | Cost run_cost = input_total_cost - input_startup_cost; |
| 2103 | double nbytes = relation_byte_size(tuples, width); |
| 2104 | long work_mem_bytes = work_mem * 1024L; |
| 2105 | |
| 2106 | path->rows = tuples; |
| 2107 | |
| 2108 | /* |
| 2109 | * Whether spilling or not, charge 2x cpu_operator_cost per tuple to |
| 2110 | * reflect bookkeeping overhead. (This rate must be more than what |
| 2111 | * cost_rescan charges for materialize, ie, cpu_operator_cost per tuple; |
| 2112 | * if it is exactly the same then there will be a cost tie between |
| 2113 | * nestloop with A outer, materialized B inner and nestloop with B outer, |
| 2114 | * materialized A inner. The extra cost ensures we'll prefer |
| 2115 | * materializing the smaller rel.) Note that this is normally a good deal |
| 2116 | * less than cpu_tuple_cost; which is OK because a Material plan node |
| 2117 | * doesn't do qual-checking or projection, so it's got less overhead than |
| 2118 | * most plan nodes. |
| 2119 | */ |
| 2120 | run_cost += 2 * cpu_operator_cost * tuples; |
| 2121 | |
| 2122 | /* |
| 2123 | * If we will spill to disk, charge at the rate of seq_page_cost per page. |
| 2124 | * This cost is assumed to be evenly spread through the plan run phase, |
| 2125 | * which isn't exactly accurate but our cost model doesn't allow for |
| 2126 | * nonuniform costs within the run phase. |
| 2127 | */ |
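| | /* |
| | * For instance, if the materialized data would occupy 100,000 pages, |
| | * this adds 100000 * seq_page_cost to the run cost (100,000 cost units |
| | * at the default seq_page_cost of 1.0). |
| | */ |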
| 2128 | if (nbytes > work_mem_bytes) |
| 2129 | { |
| 2130 | double npages = ceil(nbytes / BLCKSZ); |
| 2131 | |
| 2132 | run_cost += seq_page_cost * npages; |
| 2133 | } |
| 2134 | |
| 2135 | path->startup_cost = startup_cost; |
| 2136 | path->total_cost = startup_cost + run_cost; |
| 2137 | } |
| 2138 | |
| 2139 | /* |
| 2140 | * cost_agg |
| 2141 | * Determines and returns the cost of performing an Agg plan node, |
| 2142 | * including the cost of its input. |
| 2143 | * |
| 2144 | * aggcosts can be NULL when there are no actual aggregate functions (i.e., |
| 2145 | * we are using a hashed Agg node just to do grouping). |
| 2146 | * |
| 2147 | * Note: when aggstrategy == AGG_SORTED, caller must ensure that input costs |
| 2148 | * are for appropriately-sorted input. |
| 2149 | */ |
| 2150 | void |
| 2151 | cost_agg(Path *path, PlannerInfo *root, |
| 2152 | AggStrategy aggstrategy, const AggClauseCosts *aggcosts, |
| 2153 | int numGroupCols, double numGroups, |
| 2154 | List *quals, |
| 2155 | Cost input_startup_cost, Cost input_total_cost, |
| 2156 | double input_tuples) |
| 2157 | { |
| 2158 | double output_tuples; |
| 2159 | Cost startup_cost; |
| 2160 | Cost total_cost; |
| 2161 | AggClauseCosts dummy_aggcosts; |
| 2162 | |
| 2163 | /* Use all-zero per-aggregate costs if NULL is passed */ |
| 2164 | if (aggcosts == NULL) |
| 2165 | { |
| 2166 | Assert(aggstrategy == AGG_HASHED); |
| 2167 | MemSet(&dummy_aggcosts, 0, sizeof(AggClauseCosts)); |
| 2168 | aggcosts = &dummy_aggcosts; |
| 2169 | } |
| 2170 | |
| 2171 | /* |
| 2172 | * The transCost.per_tuple component of aggcosts should be charged once |
| 2173 | * per input tuple, corresponding to the costs of evaluating the aggregate |
| 2174 | * transfns and their input expressions. The finalCost.per_tuple component |
| 2175 | * is charged once per output tuple, corresponding to the costs of |
| 2176 | * evaluating the finalfns. Startup costs are of course charged but once. |
| 2177 | * |
| 2178 | * If we are grouping, we charge an additional cpu_operator_cost per |
| 2179 | * grouping column per input tuple for grouping comparisons. |
| 2180 | * |
| 2181 | * We will produce a single output tuple if not grouping, and a tuple per |
| 2182 | * group otherwise. We charge cpu_tuple_cost for each output tuple. |
| 2183 | * |
| 2184 | * Note: in this cost model, AGG_SORTED and AGG_HASHED have exactly the |
| 2185 | * same total CPU cost, but AGG_SORTED has lower startup cost. If the |
| 2186 | * input path is already sorted appropriately, AGG_SORTED should be |
| 2187 | * preferred (since it has no risk of memory overflow). This will happen |
| 2188 | * as long as the computed total costs are indeed exactly equal --- but if |
| 2189 | * there's roundoff error we might do the wrong thing. So be sure that |
| 2190 | * the computations below form the same intermediate values in the same |
| 2191 | * order. |
| 2192 | */ |
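| | /* |
| | * Illustrative magnitudes only (default cpu_operator_cost = 0.0025, |
| | * cpu_tuple_cost = 0.01): grouping 1,000,000 input tuples on 2 columns |
| | * into 1,000 groups charges 0.0025 * 2 * 1e6 = 5,000 for grouping |
| | * comparisons and 0.01 * 1000 = 10 for output tuples, on top of the |
| | * aggregate transition and final-function costs. |
| | */ |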
| 2193 | if (aggstrategy == AGG_PLAIN) |
| 2194 | { |
| 2195 | startup_cost = input_total_cost; |
| 2196 | startup_cost += aggcosts->transCost.startup; |
| 2197 | startup_cost += aggcosts->transCost.per_tuple * input_tuples; |
| 2198 | startup_cost += aggcosts->finalCost.startup; |
| 2199 | startup_cost += aggcosts->finalCost.per_tuple; |
| 2200 | /* we aren't grouping */ |
| 2201 | total_cost = startup_cost + cpu_tuple_cost; |
| 2202 | output_tuples = 1; |
| 2203 | } |
| 2204 | else if (aggstrategy == AGG_SORTED || aggstrategy == AGG_MIXED) |
| 2205 | { |
| 2206 | /* Here we are able to deliver output on-the-fly */ |
| 2207 | startup_cost = input_startup_cost; |
| 2208 | total_cost = input_total_cost; |
| 2209 | if (aggstrategy == AGG_MIXED && !enable_hashagg) |
| 2210 | { |
| 2211 | startup_cost += disable_cost; |
| 2212 | total_cost += disable_cost; |
| 2213 | } |
| 2214 | /* calcs phrased this way to match HASHED case, see note above */ |
| 2215 | total_cost += aggcosts->transCost.startup; |
| 2216 | total_cost += aggcosts->transCost.per_tuple * input_tuples; |
| 2217 | total_cost += (cpu_operator_cost * numGroupCols) * input_tuples; |
| 2218 | total_cost += aggcosts->finalCost.startup; |
| 2219 | total_cost += aggcosts->finalCost.per_tuple * numGroups; |
| 2220 | total_cost += cpu_tuple_cost * numGroups; |
| 2221 | output_tuples = numGroups; |
| 2222 | } |
| 2223 | else |
| 2224 | { |
| 2225 | /* must be AGG_HASHED */ |
| 2226 | startup_cost = input_total_cost; |
| 2227 | if (!enable_hashagg) |
| 2228 | startup_cost += disable_cost; |
| 2229 | startup_cost += aggcosts->transCost.startup; |
| 2230 | startup_cost += aggcosts->transCost.per_tuple * input_tuples; |
| 2231 | startup_cost += (cpu_operator_cost * numGroupCols) * input_tuples; |
| 2232 | startup_cost += aggcosts->finalCost.startup; |
| 2233 | total_cost = startup_cost; |
| 2234 | total_cost += aggcosts->finalCost.per_tuple * numGroups; |
| 2235 | total_cost += cpu_tuple_cost * numGroups; |
| 2236 | output_tuples = numGroups; |
| 2237 | } |
| 2238 | |
| 2239 | /* |
| 2240 | * If there are quals (HAVING quals), account for their cost and |
| 2241 | * selectivity. |
| 2242 | */ |
| 2243 | if (quals) |
| 2244 | { |
| 2245 | QualCost qual_cost; |
| 2246 | |
| 2247 | cost_qual_eval(&qual_cost, quals, root); |
| 2248 | startup_cost += qual_cost.startup; |
| 2249 | total_cost += qual_cost.startup + output_tuples * qual_cost.per_tuple; |
| 2250 | |
| 2251 | output_tuples = clamp_row_est(output_tuples * |
| 2252 | clauselist_selectivity(root, |
| 2253 | quals, |
| 2254 | 0, |
| 2255 | JOIN_INNER, |
| 2256 | NULL)); |
| 2257 | } |
| 2258 | |
| 2259 | path->rows = output_tuples; |
| 2260 | path->startup_cost = startup_cost; |
| 2261 | path->total_cost = total_cost; |
| 2262 | } |
| 2263 | |
| 2264 | /* |
| 2265 | * cost_windowagg |
| 2266 | * Determines and returns the cost of performing a WindowAgg plan node, |
| 2267 | * including the cost of its input. |
| 2268 | * |
| 2269 | * Input is assumed already properly sorted. |
| 2270 | */ |
| 2271 | void |
| 2272 | cost_windowagg(Path *path, PlannerInfo *root, |
| 2273 | List *windowFuncs, int numPartCols, int numOrderCols, |
| 2274 | Cost input_startup_cost, Cost input_total_cost, |
| 2275 | double input_tuples) |
| 2276 | { |
| 2277 | Cost startup_cost; |
| 2278 | Cost total_cost; |
| 2279 | ListCell *lc; |
| 2280 | |
| 2281 | startup_cost = input_startup_cost; |
| 2282 | total_cost = input_total_cost; |
| 2283 | |
| 2284 | /* |
| 2285 | * Window functions are assumed to cost their stated execution cost, plus |
| 2286 | * the cost of evaluating their input expressions, per tuple. Since they |
| 2287 | * may in fact evaluate their inputs at multiple rows during each cycle, |
| 2288 | * this could be a drastic underestimate; but without a way to know how |
| 2289 | * many rows the window function will fetch, it's hard to do better. In |
| 2290 | * any case, it's a good estimate for all the built-in window functions, |
| 2291 | * so we'll just do this for now. |
| 2292 | */ |
| 2293 | foreach(lc, windowFuncs) |
| 2294 | { |
| 2295 | WindowFunc *wfunc = lfirst_node(WindowFunc, lc); |
| 2296 | Cost wfunccost; |
| 2297 | QualCost argcosts; |
| 2298 | |
| 2299 | argcosts.startup = argcosts.per_tuple = 0; |
| 2300 | add_function_cost(root, wfunc->winfnoid, (Node *) wfunc, |
| 2301 | &argcosts); |
| 2302 | startup_cost += argcosts.startup; |
| 2303 | wfunccost = argcosts.per_tuple; |
| 2304 | |
| 2305 | /* also add the input expressions' cost to per-input-row costs */ |
| 2306 | cost_qual_eval_node(&argcosts, (Node *) wfunc->args, root); |
| 2307 | startup_cost += argcosts.startup; |
| 2308 | wfunccost += argcosts.per_tuple; |
| 2309 | |
| 2310 | /* |
| 2311 | * Add the filter's cost to per-input-row costs. XXX We should reduce |
| 2312 | * input expression costs according to filter selectivity. |
| 2313 | */ |
| 2314 | cost_qual_eval_node(&argcosts, (Node *) wfunc->aggfilter, root); |
| 2315 | startup_cost += argcosts.startup; |
| 2316 | wfunccost += argcosts.per_tuple; |
| 2317 | |
| 2318 | total_cost += wfunccost * input_tuples; |
| 2319 | } |
| 2320 | |
| 2321 | /* |
| 2322 | * We also charge cpu_operator_cost per grouping column per tuple for |
| 2323 | * grouping comparisons, plus cpu_tuple_cost per tuple for general |
| 2324 | * overhead. |
| 2325 | * |
| 2326 | * XXX this neglects costs of spooling the data to disk when it overflows |
| 2327 | * work_mem. Sooner or later that should get accounted for. |
| 2328 | */ |
| 2329 | total_cost += cpu_operator_cost * (numPartCols + numOrderCols) * input_tuples; |
| 2330 | total_cost += cpu_tuple_cost * input_tuples; |
| 2331 | |
| 2332 | path->rows = input_tuples; |
| 2333 | path->startup_cost = startup_cost; |
| 2334 | path->total_cost = total_cost; |
| 2335 | } |
| 2336 | |
| 2337 | /* |
| 2338 | * cost_group |
| 2339 | * Determines and returns the cost of performing a Group plan node, |
| 2340 | * including the cost of its input. |
| 2341 | * |
| 2342 | * Note: caller must ensure that input costs are for appropriately-sorted |
| 2343 | * input. |
| 2344 | */ |
| 2345 | void |
| 2346 | cost_group(Path *path, PlannerInfo *root, |
| 2347 | int numGroupCols, double numGroups, |
| 2348 | List *quals, |
| 2349 | Cost input_startup_cost, Cost input_total_cost, |
| 2350 | double input_tuples) |
| 2351 | { |
| 2352 | double output_tuples; |
| 2353 | Cost startup_cost; |
| 2354 | Cost total_cost; |
| 2355 | |
| 2356 | output_tuples = numGroups; |
| 2357 | startup_cost = input_startup_cost; |
| 2358 | total_cost = input_total_cost; |
| 2359 | |
| 2360 | /* |
| 2361 | * Charge one cpu_operator_cost per comparison per input tuple. We assume |
| 2362 | * all columns get compared at most of the tuples. |
| 2363 | */ |
| 2364 | total_cost += cpu_operator_cost * input_tuples * numGroupCols; |
| 2365 | |
| 2366 | /* |
| 2367 | * If there are quals (HAVING quals), account for their cost and |
| 2368 | * selectivity. |
| 2369 | */ |
| 2370 | if (quals) |
| 2371 | { |
| 2372 | QualCost qual_cost; |
| 2373 | |
| 2374 | cost_qual_eval(&qual_cost, quals, root); |
| 2375 | startup_cost += qual_cost.startup; |
| 2376 | total_cost += qual_cost.startup + output_tuples * qual_cost.per_tuple; |
| 2377 | |
| 2378 | output_tuples = clamp_row_est(output_tuples * |
| 2379 | clauselist_selectivity(root, |
| 2380 | quals, |
| 2381 | 0, |
| 2382 | JOIN_INNER, |
| 2383 | NULL)); |
| 2384 | } |
| 2385 | |
| 2386 | path->rows = output_tuples; |
| 2387 | path->startup_cost = startup_cost; |
| 2388 | path->total_cost = total_cost; |
| 2389 | } |
| 2390 | |
| 2391 | /* |
| 2392 | * initial_cost_nestloop |
| 2393 | * Preliminary estimate of the cost of a nestloop join path. |
| 2394 | * |
| 2395 | * This must quickly produce lower-bound estimates of the path's startup and |
| 2396 | * total costs. If we are unable to eliminate the proposed path from |
| 2397 | * consideration using the lower bounds, final_cost_nestloop will be called |
| 2398 | * to obtain the final estimates. |
| 2399 | * |
| 2400 | * The exact division of labor between this function and final_cost_nestloop |
| 2401 | * is private to them, and represents a tradeoff between speed of the initial |
| 2402 | * estimate and getting a tight lower bound. We choose to not examine the |
| 2403 | * join quals here, since that's by far the most expensive part of the |
| 2404 | * calculations. The end result is that CPU-cost considerations must be |
| 2405 | * left for the second phase; and for SEMI/ANTI joins, we must also postpone |
| 2406 | * incorporation of the inner path's run cost. |
| 2407 | * |
| 2408 | * 'workspace' is to be filled with startup_cost, total_cost, and perhaps |
| 2409 | * other data to be used by final_cost_nestloop |
| 2410 | * 'jointype' is the type of join to be performed |
| 2411 | * 'outer_path' is the outer input to the join |
| 2412 | * 'inner_path' is the inner input to the join |
| 2413 | * 'extra' contains miscellaneous information about the join |
| 2414 | */ |
| 2415 | void |
| 2416 | initial_cost_nestloop(PlannerInfo *root, JoinCostWorkspace *workspace, |
| 2417 | JoinType jointype, |
| 2418 | Path *outer_path, Path *inner_path, |
| 2419 | JoinPathExtraData *extra) |
| 2420 | { |
| 2421 | Cost startup_cost = 0; |
| 2422 | Cost run_cost = 0; |
| 2423 | double outer_path_rows = outer_path->rows; |
| 2424 | Cost inner_rescan_start_cost; |
| 2425 | Cost inner_rescan_total_cost; |
| 2426 | Cost inner_run_cost; |
| 2427 | Cost inner_rescan_run_cost; |
| 2428 | |
| 2429 | /* estimate costs to rescan the inner relation */ |
| 2430 | cost_rescan(root, inner_path, |
| 2431 | &inner_rescan_start_cost, |
| 2432 | &inner_rescan_total_cost); |
| 2433 | |
| 2434 | /* cost of source data */ |
| 2435 | |
| 2436 | /* |
| 2437 | * NOTE: clearly, we must pay both outer and inner paths' startup_cost |
| 2438 | * before we can start returning tuples, so the join's startup cost is |
| 2439 | * their sum. We'll also pay the inner path's rescan startup cost |
| 2440 | * multiple times. |
| 2441 | */ |
| 2442 | startup_cost += outer_path->startup_cost + inner_path->startup_cost; |
| 2443 | run_cost += outer_path->total_cost - outer_path->startup_cost; |
| 2444 | if (outer_path_rows > 1) |
| 2445 | run_cost += (outer_path_rows - 1) * inner_rescan_start_cost; |
| 2446 | |
| 2447 | inner_run_cost = inner_path->total_cost - inner_path->startup_cost; |
| 2448 | inner_rescan_run_cost = inner_rescan_total_cost - inner_rescan_start_cost; |
| 2449 | |
| 2450 | if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || |
| 2451 | extra->inner_unique) |
| 2452 | { |
| 2453 | /* |
| 2454 | * With a SEMI or ANTI join, or if the innerrel is known unique, the |
| 2455 | * executor will stop after the first match. |
| 2456 | * |
| 2457 | * Getting decent estimates requires inspection of the join quals, |
| 2458 | * which we choose to postpone to final_cost_nestloop. |
| 2459 | */ |
| 2460 | |
| 2461 | /* Save private data for final_cost_nestloop */ |
| 2462 | workspace->inner_run_cost = inner_run_cost; |
| 2463 | workspace->inner_rescan_run_cost = inner_rescan_run_cost; |
| 2464 | } |
| 2465 | else |
| 2466 | { |
| 2467 | /* Normal case; we'll scan whole input rel for each outer row */ |
| 2468 | run_cost += inner_run_cost; |
| 2469 | if (outer_path_rows > 1) |
| 2470 | run_cost += (outer_path_rows - 1) * inner_rescan_run_cost; |
| 2471 | } |
| 2472 | |
| 2473 | /* CPU costs left for later */ |
| 2474 | |
| 2475 | /* Public result fields */ |
| 2476 | workspace->startup_cost = startup_cost; |
| 2477 | workspace->total_cost = startup_cost + run_cost; |
| 2478 | /* Save private data for final_cost_nestloop */ |
| 2479 | workspace->run_cost = run_cost; |
| 2480 | } |
| 2481 | |
| 2482 | /* |
| 2483 | * final_cost_nestloop |
| 2484 | * Final estimate of the cost and result size of a nestloop join path. |
| 2485 | * |
| 2486 | * 'path' is already filled in except for the rows and cost fields |
| 2487 | * 'workspace' is the result from initial_cost_nestloop |
| 2488 | * 'extra' contains miscellaneous information about the join |
| 2489 | */ |
| 2490 | void |
| 2491 | final_cost_nestloop(PlannerInfo *root, NestPath *path, |
| 2492 | JoinCostWorkspace *workspace, |
| 2493 | JoinPathExtraData *extra) |
| 2494 | { |
| 2495 | Path *outer_path = path->outerjoinpath; |
| 2496 | Path *inner_path = path->innerjoinpath; |
| 2497 | double outer_path_rows = outer_path->rows; |
| 2498 | double inner_path_rows = inner_path->rows; |
| 2499 | Cost startup_cost = workspace->startup_cost; |
| 2500 | Cost run_cost = workspace->run_cost; |
| 2501 | Cost cpu_per_tuple; |
| 2502 | QualCost restrict_qual_cost; |
| 2503 | double ntuples; |
| 2504 | |
| 2505 | /* Protect some assumptions below that rowcounts aren't zero or NaN */ |
| 2506 | if (outer_path_rows <= 0 || isnan(outer_path_rows)) |
| 2507 | outer_path_rows = 1; |
| 2508 | if (inner_path_rows <= 0 || isnan(inner_path_rows)) |
| 2509 | inner_path_rows = 1; |
| 2510 | |
| 2511 | /* Mark the path with the correct row estimate */ |
| 2512 | if (path->path.param_info) |
| 2513 | path->path.rows = path->path.param_info->ppi_rows; |
| 2514 | else |
| 2515 | path->path.rows = path->path.parent->rows; |
| 2516 | |
| 2517 | /* For partial paths, scale row estimate. */ |
| 2518 | if (path->path.parallel_workers > 0) |
| 2519 | { |
| 2520 | double parallel_divisor = get_parallel_divisor(&path->path); |
| 2521 | |
| 2522 | path->path.rows = |
| 2523 | clamp_row_est(path->path.rows / parallel_divisor); |
| 2524 | } |
| 2525 | |
| 2526 | /* |
| 2527 | * We could include disable_cost in the preliminary estimate, but that |
| 2528 | * would amount to optimizing for the case where the join method is |
| 2529 | * disabled, which doesn't seem like the way to bet. |
| 2530 | */ |
| 2531 | if (!enable_nestloop) |
| 2532 | startup_cost += disable_cost; |
| 2533 | |
| 2534 | /* cost of inner-relation source data (we already dealt with outer rel) */ |
| 2535 | |
| 2536 | if (path->jointype == JOIN_SEMI || path->jointype == JOIN_ANTI || |
| 2537 | extra->inner_unique) |
| 2538 | { |
| 2539 | /* |
| 2540 | * With a SEMI or ANTI join, or if the innerrel is known unique, the |
| 2541 | * executor will stop after the first match. |
| 2542 | */ |
| 2543 | Cost inner_run_cost = workspace->inner_run_cost; |
| 2544 | Cost inner_rescan_run_cost = workspace->inner_rescan_run_cost; |
| 2545 | double outer_matched_rows; |
| 2546 | double outer_unmatched_rows; |
| 2547 | Selectivity inner_scan_frac; |
| 2548 | |
| 2549 | /* |
| 2550 | * For an outer-rel row that has at least one match, we can expect the |
| 2551 | * inner scan to stop after a fraction 1/(match_count+1) of the inner |
| 2552 | * rows, if the matches are evenly distributed. Since they probably |
| 2553 | * aren't quite evenly distributed, we apply a fuzz factor of 2.0 to |
| 2554 | * that fraction. (If we used a larger fuzz factor, we'd have to |
| 2555 | * clamp inner_scan_frac to at most 1.0; but since match_count is at |
| 2556 | * least 1, no such clamp is needed now.) |
| 2557 | */ |
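| | /* |
| | * For example, if the estimated match_count is 4, then |
| | * inner_scan_frac = 2.0 / 5.0 = 0.4, i.e. we expect to scan about 40% |
| | * of the inner rows for each outer row that has a match. |
| | */ |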
| 2558 | outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac); |
| 2559 | outer_unmatched_rows = outer_path_rows - outer_matched_rows; |
| 2560 | inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0); |
| 2561 | |
| 2562 | /* |
| 2563 | * Compute number of tuples processed (not number emitted!). First, |
| 2564 | * account for successfully-matched outer rows. |
| 2565 | */ |
| 2566 | ntuples = outer_matched_rows * inner_path_rows * inner_scan_frac; |
| 2567 | |
| 2568 | /* |
| 2569 | * Now we need to estimate the actual costs of scanning the inner |
| 2570 | * relation, which may be quite a bit less than N times inner_run_cost |
| 2571 | * due to early scan stops. We consider two cases. If the inner path |
| 2572 | * is an indexscan using all the joinquals as indexquals, then an |
| 2573 | * unmatched outer row results in an indexscan returning no rows, |
| 2574 | * which is probably quite cheap. Otherwise, the executor will have |
| 2575 | * to scan the whole inner rel for an unmatched row; not so cheap. |
| 2576 | */ |
| 2577 | if (has_indexed_join_quals(path)) |
| 2578 | { |
| 2579 | /* |
| 2580 | * Successfully-matched outer rows will only require scanning |
| 2581 | * inner_scan_frac of the inner relation. In this case, we don't |
| 2582 | * need to charge the full inner_run_cost even when that's more |
| 2583 | * than inner_rescan_run_cost, because we can assume that none of |
| 2584 | * the inner scans ever scan the whole inner relation. So it's |
| 2585 | * okay to assume that all the inner scan executions can be |
| 2586 | * fractions of the full cost, even if materialization is reducing |
| 2587 | * the rescan cost. At this writing, it's impossible to get here |
| 2588 | * for a materialized inner scan, so inner_run_cost and |
| 2589 | * inner_rescan_run_cost will be the same anyway; but just in |
| 2590 | * case, use inner_run_cost for the first matched tuple and |
| 2591 | * inner_rescan_run_cost for additional ones. |
| 2592 | */ |
| 2593 | run_cost += inner_run_cost * inner_scan_frac; |
| 2594 | if (outer_matched_rows > 1) |
| 2595 | run_cost += (outer_matched_rows - 1) * inner_rescan_run_cost * inner_scan_frac; |
| 2596 | |
| 2597 | /* |
| 2598 | * Add the cost of inner-scan executions for unmatched outer rows. |
| 2599 | * We estimate this as the same cost as returning the first tuple |
| 2600 | * of a nonempty scan. We consider that these are all rescans, |
| 2601 | * since we used inner_run_cost once already. |
| 2602 | */ |
| 2603 | run_cost += outer_unmatched_rows * |
| 2604 | inner_rescan_run_cost / inner_path_rows; |
| 2605 | |
| 2606 | /* |
| 2607 | * We won't be evaluating any quals at all for unmatched rows, so |
| 2608 | * don't add them to ntuples. |
| 2609 | */ |
| 2610 | } |
| 2611 | else |
| 2612 | { |
| 2613 | /* |
| 2614 | * Here, a complicating factor is that rescans may be cheaper than |
| 2615 | * first scans. If we never scan all the way to the end of the |
| 2616 | * inner rel, it might be (depending on the plan type) that we'd |
| 2617 | * never pay the whole inner first-scan run cost. However it is |
| 2618 | * difficult to estimate whether that will happen (and it could |
| 2619 | * not happen if there are any unmatched outer rows!), so be |
| 2620 | * conservative and always charge the whole first-scan cost once. |
| 2621 | * We consider this charge to correspond to the first unmatched |
| 2622 | * outer row, unless there isn't one in our estimate, in which |
| 2623 | * case blame it on the first matched row. |
| 2624 | */ |
| 2625 | |
| 2626 | /* First, count all unmatched join tuples as being processed */ |
| 2627 | ntuples += outer_unmatched_rows * inner_path_rows; |
| 2628 | |
| 2629 | /* Now add the forced full scan, and decrement appropriate count */ |
| 2630 | run_cost += inner_run_cost; |
| 2631 | if (outer_unmatched_rows >= 1) |
| 2632 | outer_unmatched_rows -= 1; |
| 2633 | else |
| 2634 | outer_matched_rows -= 1; |
| 2635 | |
| 2636 | /* Add inner run cost for additional outer tuples having matches */ |
| 2637 | if (outer_matched_rows > 0) |
| 2638 | run_cost += outer_matched_rows * inner_rescan_run_cost * inner_scan_frac; |
| 2639 | |
| 2640 | /* Add inner run cost for additional unmatched outer tuples */ |
| 2641 | if (outer_unmatched_rows > 0) |
| 2642 | run_cost += outer_unmatched_rows * inner_rescan_run_cost; |
| 2643 | } |
| 2644 | } |
| 2645 | else |
| 2646 | { |
| 2647 | /* Normal-case source costs were included in preliminary estimate */ |
| 2648 | |
| 2649 | /* Compute number of tuples processed (not number emitted!) */ |
| 2650 | ntuples = outer_path_rows * inner_path_rows; |
| 2651 | } |
| 2652 | |
| 2653 | /* CPU costs */ |
| 2654 | cost_qual_eval(&restrict_qual_cost, path->joinrestrictinfo, root); |
| 2655 | startup_cost += restrict_qual_cost.startup; |
| 2656 | cpu_per_tuple = cpu_tuple_cost + restrict_qual_cost.per_tuple; |
| 2657 | run_cost += cpu_per_tuple * ntuples; |
| 2658 | |
| 2659 | /* tlist eval costs are paid per output row, not per tuple scanned */ |
| 2660 | startup_cost += path->path.pathtarget->cost.startup; |
| 2661 | run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows; |
| 2662 | |
| 2663 | path->path.startup_cost = startup_cost; |
| 2664 | path->path.total_cost = startup_cost + run_cost; |
| 2665 | } |
| 2666 | |
| 2667 | /* |
| 2668 | * initial_cost_mergejoin |
| 2669 | * Preliminary estimate of the cost of a mergejoin path. |
| 2670 | * |
| 2671 | * This must quickly produce lower-bound estimates of the path's startup and |
| 2672 | * total costs. If we are unable to eliminate the proposed path from |
| 2673 | * consideration using the lower bounds, final_cost_mergejoin will be called |
| 2674 | * to obtain the final estimates. |
| 2675 | * |
| 2676 | * The exact division of labor between this function and final_cost_mergejoin |
| 2677 | * is private to them, and represents a tradeoff between speed of the initial |
| 2678 | * estimate and getting a tight lower bound. We choose to not examine the |
| 2679 | * join quals here, except for obtaining the scan selectivity estimate which |
| 2680 | * is really essential (but fortunately, use of caching keeps the cost of |
| 2681 | * getting that down to something reasonable). |
| 2682 | * We also assume that cost_sort is cheap enough to use here. |
| 2683 | * |
| 2684 | * 'workspace' is to be filled with startup_cost, total_cost, and perhaps |
| 2685 | * other data to be used by final_cost_mergejoin |
| 2686 | * 'jointype' is the type of join to be performed |
| 2687 | * 'mergeclauses' is the list of joinclauses to be used as merge clauses |
| 2688 | * 'outer_path' is the outer input to the join |
| 2689 | * 'inner_path' is the inner input to the join |
| 2690 | * 'outersortkeys' is the list of sort keys for the outer path |
| 2691 | * 'innersortkeys' is the list of sort keys for the inner path |
| 2692 | * 'extra' contains miscellaneous information about the join |
| 2693 | * |
| 2694 | * Note: outersortkeys and innersortkeys should be NIL if no explicit |
| 2695 | * sort is needed because the respective source path is already ordered. |
| 2696 | */ |
| 2697 | void |
| 2698 | initial_cost_mergejoin(PlannerInfo *root, JoinCostWorkspace *workspace, |
| 2699 | JoinType jointype, |
| 2700 | List *mergeclauses, |
| 2701 | Path *outer_path, Path *inner_path, |
| 2702 | List *outersortkeys, List *innersortkeys, |
| 2703 | JoinPathExtraData *extra) |
| 2704 | { |
| 2705 | Cost startup_cost = 0; |
| 2706 | Cost run_cost = 0; |
| 2707 | double outer_path_rows = outer_path->rows; |
| 2708 | double inner_path_rows = inner_path->rows; |
| 2709 | Cost inner_run_cost; |
| 2710 | double outer_rows, |
| 2711 | inner_rows, |
| 2712 | outer_skip_rows, |
| 2713 | inner_skip_rows; |
| 2714 | Selectivity outerstartsel, |
| 2715 | outerendsel, |
| 2716 | innerstartsel, |
| 2717 | innerendsel; |
| 2718 | Path sort_path; /* dummy for result of cost_sort */ |
| 2719 | |
| 2720 | /* Protect some assumptions below that rowcounts aren't zero or NaN */ |
| 2721 | if (outer_path_rows <= 0 || isnan(outer_path_rows)) |
| 2722 | outer_path_rows = 1; |
| 2723 | if (inner_path_rows <= 0 || isnan(inner_path_rows)) |
| 2724 | inner_path_rows = 1; |
| 2725 | |
| 2726 | /* |
| 2727 | * A merge join will stop as soon as it exhausts either input stream |
| 2728 | * (unless it's an outer join, in which case the outer side has to be |
| 2729 | * scanned all the way anyway). Estimate fraction of the left and right |
| 2730 | * inputs that will actually need to be scanned. Likewise, we can |
| 2731 | * estimate the number of rows that will be skipped before the first join |
| 2732 | * pair is found, which should be factored into startup cost. We use only |
| 2733 | * the first (most significant) merge clause for this purpose. Since |
| 2734 | * mergejoinscansel() is a fairly expensive computation, we cache the |
| 2735 | * results in the merge clause RestrictInfo. |
| 2736 | */ |
| 2737 | if (mergeclauses && jointype != JOIN_FULL) |
| 2738 | { |
| 2739 | RestrictInfo *firstclause = (RestrictInfo *) linitial(mergeclauses); |
| 2740 | List *opathkeys; |
| 2741 | List *ipathkeys; |
| 2742 | PathKey *opathkey; |
| 2743 | PathKey *ipathkey; |
| 2744 | MergeScanSelCache *cache; |
| 2745 | |
| 2746 | /* Get the input pathkeys to determine the sort-order details */ |
| 2747 | opathkeys = outersortkeys ? outersortkeys : outer_path->pathkeys; |
| 2748 | ipathkeys = innersortkeys ? innersortkeys : inner_path->pathkeys; |
| 2749 | Assert(opathkeys); |
| 2750 | Assert(ipathkeys); |
| 2751 | opathkey = (PathKey *) linitial(opathkeys); |
| 2752 | ipathkey = (PathKey *) linitial(ipathkeys); |
| 2753 | /* debugging check */ |
| 2754 | if (opathkey->pk_opfamily != ipathkey->pk_opfamily || |
| 2755 | opathkey->pk_eclass->ec_collation != ipathkey->pk_eclass->ec_collation || |
| 2756 | opathkey->pk_strategy != ipathkey->pk_strategy || |
| 2757 | opathkey->pk_nulls_first != ipathkey->pk_nulls_first) |
| 2758 | elog(ERROR, "left and right pathkeys do not match in mergejoin"); |
| 2759 | |
| 2760 | /* Get the selectivity with caching */ |
| 2761 | cache = cached_scansel(root, firstclause, opathkey); |
| 2762 | |
| 2763 | if (bms_is_subset(firstclause->left_relids, |
| 2764 | outer_path->parent->relids)) |
| 2765 | { |
| 2766 | /* left side of clause is outer */ |
| 2767 | outerstartsel = cache->leftstartsel; |
| 2768 | outerendsel = cache->leftendsel; |
| 2769 | innerstartsel = cache->rightstartsel; |
| 2770 | innerendsel = cache->rightendsel; |
| 2771 | } |
| 2772 | else |
| 2773 | { |
| 2774 | /* left side of clause is inner */ |
| 2775 | outerstartsel = cache->rightstartsel; |
| 2776 | outerendsel = cache->rightendsel; |
| 2777 | innerstartsel = cache->leftstartsel; |
| 2778 | innerendsel = cache->leftendsel; |
| 2779 | } |
| 2780 | if (jointype == JOIN_LEFT || |
| 2781 | jointype == JOIN_ANTI) |
| 2782 | { |
| 2783 | outerstartsel = 0.0; |
| 2784 | outerendsel = 1.0; |
| 2785 | } |
| 2786 | else if (jointype == JOIN_RIGHT) |
| 2787 | { |
| 2788 | innerstartsel = 0.0; |
| 2789 | innerendsel = 1.0; |
| 2790 | } |
| 2791 | } |
| 2792 | else |
| 2793 | { |
| 2794 | /* cope with clauseless or full mergejoin */ |
| 2795 | outerstartsel = innerstartsel = 0.0; |
| 2796 | outerendsel = innerendsel = 1.0; |
| 2797 | } |
| 2798 | |
| 2799 | /* |
| 2800 | * Convert selectivities to row counts. We force outer_rows and |
| 2801 | * inner_rows to be at least 1, but the skip_rows estimates can be zero. |
| 2802 | */ |
| 2803 | outer_skip_rows = rint(outer_path_rows * outerstartsel); |
| 2804 | inner_skip_rows = rint(inner_path_rows * innerstartsel); |
| 2805 | outer_rows = clamp_row_est(outer_path_rows * outerendsel); |
| 2806 | inner_rows = clamp_row_est(inner_path_rows * innerendsel); |
| 2807 | |
| 2808 | Assert(outer_skip_rows <= outer_rows); |
| 2809 | Assert(inner_skip_rows <= inner_rows); |
| 2810 | |
| 2811 | /* |
| 2812 | * Readjust scan selectivities to account for above rounding. This is |
| 2813 | * normally an insignificant effect, but when there are only a few rows in |
| 2814 | * the inputs, failing to do this makes for a large percentage error. |
| 2815 | */ |
| 2816 | outerstartsel = outer_skip_rows / outer_path_rows; |
| 2817 | innerstartsel = inner_skip_rows / inner_path_rows; |
| 2818 | outerendsel = outer_rows / outer_path_rows; |
| 2819 | innerendsel = inner_rows / inner_path_rows; |
| 2820 | |
| 2821 | Assert(outerstartsel <= outerendsel); |
| 2822 | Assert(innerstartsel <= innerendsel); |
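| | /* |
| | * Illustrative example (hypothetical numbers): with outer_path_rows = |
| | * 1000, outerstartsel = 0.12 and outerendsel = 0.70, we get |
| | * outer_skip_rows = rint(1000 * 0.12) = 120 and outer_rows = |
| | * clamp_row_est(1000 * 0.70) = 700, so the re-derived selectivities are |
| | * exactly 120/1000 = 0.12 and 700/1000 = 0.70.  With only a few rows, |
| | * say outer_path_rows = 3 and outerstartsel = 0.40, rounding gives |
| | * outer_skip_rows = 1 and the readjusted selectivity 1/3 rather than |
| | * 0.40, which is exactly the rounding effect the readjustment captures. |
| | */ |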
| 2823 | |
| 2824 | /* cost of source data */ |
| 2825 | |
| 2826 | if (outersortkeys) /* do we need to sort outer? */ |
| 2827 | { |
| 2828 | cost_sort(&sort_path, |
| 2829 | root, |
| 2830 | outersortkeys, |
| 2831 | outer_path->total_cost, |
| 2832 | outer_path_rows, |
| 2833 | outer_path->pathtarget->width, |
| 2834 | 0.0, |
| 2835 | work_mem, |
| 2836 | -1.0); |
| 2837 | startup_cost += sort_path.startup_cost; |
| 2838 | startup_cost += (sort_path.total_cost - sort_path.startup_cost) |
| 2839 | * outerstartsel; |
| 2840 | run_cost += (sort_path.total_cost - sort_path.startup_cost) |
| 2841 | * (outerendsel - outerstartsel); |
| 2842 | } |
| 2843 | else |
| 2844 | { |
| 2845 | startup_cost += outer_path->startup_cost; |
| 2846 | startup_cost += (outer_path->total_cost - outer_path->startup_cost) |
| 2847 | * outerstartsel; |
| 2848 | run_cost += (outer_path->total_cost - outer_path->startup_cost) |
| 2849 | * (outerendsel - outerstartsel); |
| 2850 | } |
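| | /* |
| | * For example (hypothetical numbers): if the outer input (or its sort) |
| | * has startup_cost = 100 and total_cost = 300, with outerstartsel = 0.1 |
| | * and outerendsel = 0.7, we charge 100 + 200 * 0.1 = 120 to startup_cost |
| | * and 200 * (0.7 - 0.1) = 120 to run_cost; the remaining 200 * 0.3 = 60 |
| | * of the input's run cost is never charged, because the merge is |
| | * expected to stop before exhausting that input. |
| | */ |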
| 2851 | |
| 2852 | if (innersortkeys) /* do we need to sort inner? */ |
| 2853 | { |
| 2854 | cost_sort(&sort_path, |
| 2855 | root, |
| 2856 | innersortkeys, |
| 2857 | inner_path->total_cost, |
| 2858 | inner_path_rows, |
| 2859 | inner_path->pathtarget->width, |
| 2860 | 0.0, |
| 2861 | work_mem, |
| 2862 | -1.0); |
| 2863 | startup_cost += sort_path.startup_cost; |
| 2864 | startup_cost += (sort_path.total_cost - sort_path.startup_cost) |
| 2865 | * innerstartsel; |
| 2866 | inner_run_cost = (sort_path.total_cost - sort_path.startup_cost) |
| 2867 | * (innerendsel - innerstartsel); |
| 2868 | } |
| 2869 | else |
| 2870 | { |
| 2871 | startup_cost += inner_path->startup_cost; |
| 2872 | startup_cost += (inner_path->total_cost - inner_path->startup_cost) |
| 2873 | * innerstartsel; |
| 2874 | inner_run_cost = (inner_path->total_cost - inner_path->startup_cost) |
| 2875 | * (innerendsel - innerstartsel); |
| 2876 | } |
| 2877 | |
| 2878 | /* |
| 2879 | * We can't yet determine whether rescanning occurs, or whether |
| 2880 | * materialization of the inner input should be done. The minimum |
| 2881 | * possible inner input cost, regardless of rescan and materialization |
| 2882 | * considerations, is inner_run_cost. We include that in |
| 2883 | * workspace->total_cost, but not yet in run_cost. |
| 2884 | */ |
| 2885 | |
| 2886 | /* CPU costs left for later */ |
| 2887 | |
| 2888 | /* Public result fields */ |
| 2889 | workspace->startup_cost = startup_cost; |
| 2890 | workspace->total_cost = startup_cost + run_cost + inner_run_cost; |
| 2891 | /* Save private data for final_cost_mergejoin */ |
| 2892 | workspace->run_cost = run_cost; |
| 2893 | workspace->inner_run_cost = inner_run_cost; |
| 2894 | workspace->outer_rows = outer_rows; |
| 2895 | workspace->inner_rows = inner_rows; |
| 2896 | workspace->outer_skip_rows = outer_skip_rows; |
| 2897 | workspace->inner_skip_rows = inner_skip_rows; |
| 2898 | } |
| 2899 | |
| 2900 | /* |
| 2901 | * final_cost_mergejoin |
| 2902 | * Final estimate of the cost and result size of a mergejoin path. |
| 2903 | * |
| 2904 | * Unlike other costsize functions, this routine makes two actual decisions: |
| 2905 | * whether the executor will need to do mark/restore, and whether we should |
| 2906 | * materialize the inner path. It would be logically cleaner to build |
| 2907 | * separate paths testing these alternatives, but that would require repeating |
| 2908 | * most of the cost calculations, which are not all that cheap. Since the |
| 2909 | * choice will not affect output pathkeys or startup cost, only total cost, |
| 2910 | * there is no possibility of wanting to keep more than one path. So it seems |
| 2911 | * best to make the decisions here and record them in the path's |
| 2912 | * skip_mark_restore and materialize_inner fields. |
| 2913 | * |
| 2914 | * Mark/restore overhead is usually required, but can be skipped if we know |
| 2915 | * that the executor need find only one match per outer tuple, and that the |
| 2916 | * mergeclauses are sufficient to identify a match. |
| 2917 | * |
| 2918 | * We materialize the inner path if we need mark/restore and either the inner |
| 2919 | * path can't support mark/restore, or it's cheaper to use an interposed |
| 2920 | * Material node to handle mark/restore. |
| 2921 | * |
| 2922 | * 'path' is already filled in except for the rows and cost fields and |
| 2923 | * skip_mark_restore and materialize_inner |
| 2924 | * 'workspace' is the result from initial_cost_mergejoin |
| 2925 | * 'extra' contains miscellaneous information about the join |
| 2926 | */ |
| 2927 | void |
| 2928 | final_cost_mergejoin(PlannerInfo *root, MergePath *path, |
| 2929 | JoinCostWorkspace *workspace, |
| 2930 | JoinPathExtraData *extra) |
| 2931 | { |
| 2932 | Path *outer_path = path->jpath.outerjoinpath; |
| 2933 | Path *inner_path = path->jpath.innerjoinpath; |
| 2934 | double inner_path_rows = inner_path->rows; |
| 2935 | List *mergeclauses = path->path_mergeclauses; |
| 2936 | List *innersortkeys = path->innersortkeys; |
| 2937 | Cost startup_cost = workspace->startup_cost; |
| 2938 | Cost run_cost = workspace->run_cost; |
| 2939 | Cost inner_run_cost = workspace->inner_run_cost; |
| 2940 | double outer_rows = workspace->outer_rows; |
| 2941 | double inner_rows = workspace->inner_rows; |
| 2942 | double outer_skip_rows = workspace->outer_skip_rows; |
| 2943 | double inner_skip_rows = workspace->inner_skip_rows; |
| 2944 | Cost cpu_per_tuple, |
| 2945 | bare_inner_cost, |
| 2946 | mat_inner_cost; |
| 2947 | QualCost merge_qual_cost; |
| 2948 | QualCost qp_qual_cost; |
| 2949 | double mergejointuples, |
| 2950 | rescannedtuples; |
| 2951 | double rescanratio; |
| 2952 | |
| 2953 | /* Protect some assumptions below that rowcounts aren't zero or NaN */ |
| 2954 | if (inner_path_rows <= 0 || isnan(inner_path_rows)) |
| 2955 | inner_path_rows = 1; |
| 2956 | |
| 2957 | /* Mark the path with the correct row estimate */ |
| 2958 | if (path->jpath.path.param_info) |
| 2959 | path->jpath.path.rows = path->jpath.path.param_info->ppi_rows; |
| 2960 | else |
| 2961 | path->jpath.path.rows = path->jpath.path.parent->rows; |
| 2962 | |
| 2963 | /* For partial paths, scale row estimate. */ |
| 2964 | if (path->jpath.path.parallel_workers > 0) |
| 2965 | { |
| 2966 | double parallel_divisor = get_parallel_divisor(&path->jpath.path); |
| 2967 | |
| 2968 | path->jpath.path.rows = |
| 2969 | clamp_row_est(path->jpath.path.rows / parallel_divisor); |
| 2970 | } |
| 2971 | |
| 2972 | /* |
| 2973 | * We could include disable_cost in the preliminary estimate, but that |
| 2974 | * would amount to optimizing for the case where the join method is |
| 2975 | * disabled, which doesn't seem like the way to bet. |
| 2976 | */ |
| 2977 | if (!enable_mergejoin) |
| 2978 | startup_cost += disable_cost; |
| 2979 | |
| 2980 | /* |
| 2981 | * Compute cost of the mergequals and qpquals (other restriction clauses) |
| 2982 | * separately. |
| 2983 | */ |
| 2984 | cost_qual_eval(&merge_qual_cost, mergeclauses, root); |
| 2985 | cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root); |
| 2986 | qp_qual_cost.startup -= merge_qual_cost.startup; |
| 2987 | qp_qual_cost.per_tuple -= merge_qual_cost.per_tuple; |
| 2988 | |
| 2989 | /* |
| 2990 | * With a SEMI or ANTI join, or if the innerrel is known unique, the |
| 2991 | * executor will stop scanning for matches after the first match. When |
| 2992 | * all the joinclauses are merge clauses, this means we don't ever need to |
| 2993 | * back up the merge, and so we can skip mark/restore overhead. |
| 2994 | */ |
| 2995 | if ((path->jpath.jointype == JOIN_SEMI || |
| 2996 | path->jpath.jointype == JOIN_ANTI || |
| 2997 | extra->inner_unique) && |
| 2998 | (list_length(path->jpath.joinrestrictinfo) == |
| 2999 | list_length(path->path_mergeclauses))) |
| 3000 | path->skip_mark_restore = true; |
| 3001 | else |
| 3002 | path->skip_mark_restore = false; |
| 3003 | |
| 3004 | /* |
| 3005 | * Get approx # tuples passing the mergequals. We use approx_tuple_count |
| 3006 | * here because we need an estimate done with JOIN_INNER semantics. |
| 3007 | */ |
| 3008 | mergejointuples = approx_tuple_count(root, &path->jpath, mergeclauses); |
| 3009 | |
| 3010 | /* |
| 3011 | * When there are equal merge keys in the outer relation, the mergejoin |
| 3012 | * must rescan any matching tuples in the inner relation. This means |
| 3013 | * re-fetching inner tuples; we have to estimate how often that happens. |
| 3014 | * |
| 3015 | * For regular inner and outer joins, the number of re-fetches can be |
| 3016 | * estimated approximately as size of merge join output minus size of |
| 3017 | * inner relation. Assume that the distinct key values are 1, 2, ..., and |
| 3018 | * denote the number of values of each key in the outer relation as m1, |
| 3019 | * m2, ...; in the inner relation, n1, n2, ... Then we have |
| 3020 | * |
| 3021 | * size of join = m1 * n1 + m2 * n2 + ... |
| 3022 | * |
| 3023 | * number of rescanned tuples = (m1 - 1) * n1 + (m2 - 1) * n2 + ... = m1 * |
| 3024 | * n1 + m2 * n2 + ... - (n1 + n2 + ...) = size of join - size of inner |
| 3025 | * relation |
| 3026 | * |
| 3027 | * This equation works correctly for outer tuples having no inner match |
| 3028 | * (nk = 0), but not for inner tuples having no outer match (mk = 0); we |
| 3029 | * are effectively subtracting those from the number of rescanned tuples, |
| 3030 | * when we should not. Can we do better without expensive selectivity |
| 3031 | * computations? |
| 3032 | * |
| 3033 | * The whole issue is moot if we are working from a unique-ified outer |
| 3034 | * input, or if we know we don't need to mark/restore at all. |
| 3035 | */ |
| 3036 | if (IsA(outer_path, UniquePath) || path->skip_mark_restore) |
| 3037 | rescannedtuples = 0; |
| 3038 | else |
| 3039 | { |
| 3040 | rescannedtuples = mergejointuples - inner_path_rows; |
| 3041 | /* Must clamp because of possible underestimate */ |
| 3042 | if (rescannedtuples < 0) |
| 3043 | rescannedtuples = 0; |
| 3044 | } |
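| | /* |
| | * Worked instance of the equation above (hypothetical numbers): with |
| | * outer key counts m = {2, 1} and inner key counts n = {3, 2}, the join |
| | * yields 2*3 + 1*2 = 8 rows while the inner relation has 5 rows, so |
| | * rescannedtuples = 8 - 5 = 3 = (2-1)*3 + (1-1)*2, as expected. |
| | */ |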
| 3045 | |
| 3046 | /* |
| 3047 | * We'll inflate various costs this much to account for rescanning. Note |
| 3048 | * that this is to be multiplied by something involving inner_rows, or |
| 3049 | * another number related to the portion of the inner rel we'll scan. |
| 3050 | */ |
| 3051 | rescanratio = 1.0 + (rescannedtuples / inner_rows); |
| 3052 | |
| 3053 | /* |
| 3054 | * Decide whether we want to materialize the inner input to shield it from |
| 3055 | * mark/restore and performing re-fetches. Our cost model for regular |
| 3056 | * re-fetches is that a re-fetch costs the same as an original fetch, |
| 3057 | * which is probably an overestimate; but on the other hand we ignore the |
| 3058 | * bookkeeping costs of mark/restore. Not clear if it's worth developing |
| 3059 | * a more refined model. So we just need to inflate the inner run cost by |
| 3060 | * rescanratio. |
| 3061 | */ |
| 3062 | bare_inner_cost = inner_run_cost * rescanratio; |
| 3063 | |
| 3064 | /* |
| 3065 | * When we interpose a Material node the re-fetch cost is assumed to be |
| 3066 | * just cpu_operator_cost per tuple, independently of the underlying |
| 3067 | * plan's cost; and we charge an extra cpu_operator_cost per original |
| 3068 | * fetch as well. Note that we're assuming the materialize node will |
| 3069 | * never spill to disk, since it only has to remember tuples back to the |
| 3070 | * last mark. (If there are a huge number of duplicates, our other cost |
| 3071 | * factors will make the path so expensive that it probably won't get |
| 3072 | * chosen anyway.) So we don't use cost_rescan here. |
| 3073 | * |
| 3074 | * Note: keep this estimate in sync with create_mergejoin_plan's labeling |
| 3075 | * of the generated Material node. |
| 3076 | */ |
| 3077 | mat_inner_cost = inner_run_cost + |
| 3078 | cpu_operator_cost * inner_rows * rescanratio; |
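| | /* |
| | * Illustration (hypothetical numbers, default cpu_operator_cost 0.0025): |
| | * if inner_run_cost = 1000, inner_rows = 10000 and rescanratio = 1.5, |
| | * then bare_inner_cost = 1500 while mat_inner_cost = 1000 + |
| | * 0.0025 * 10000 * 1.5 = 1037.5, so materializing would look cheaper. |
| | */ |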
| 3079 | |
| 3080 | /* |
| 3081 | * If we don't need mark/restore at all, we don't need materialization. |
| 3082 | */ |
| 3083 | if (path->skip_mark_restore) |
| 3084 | path->materialize_inner = false; |
| 3085 | |
| 3086 | /* |
| 3087 | * Prefer materializing if it looks cheaper, unless the user has asked to |
| 3088 | * suppress materialization. |
| 3089 | */ |
| 3090 | else if (enable_material && mat_inner_cost < bare_inner_cost) |
| 3091 | path->materialize_inner = true; |
| 3092 | |
| 3093 | /* |
| 3094 | * Even if materializing doesn't look cheaper, we *must* do it if the |
| 3095 | * inner path is to be used directly (without sorting) and it doesn't |
| 3096 | * support mark/restore. |
| 3097 | * |
| 3098 | * Since the inner side must be ordered, and only Sorts and IndexScans can |
| 3099 | * create order to begin with, and they both support mark/restore, you |
| 3100 | * might think there's no problem --- but you'd be wrong. Nestloop and |
| 3101 | * merge joins can *preserve* the order of their inputs, so they can be |
| 3102 | * selected as the input of a mergejoin, and they don't support |
| 3103 | * mark/restore at present. |
| 3104 | * |
| 3105 | * We don't test the value of enable_material here, because |
| 3106 | * materialization is required for correctness in this case, and turning |
| 3107 | * it off does not entitle us to deliver an invalid plan. |
| 3108 | */ |
| 3109 | else if (innersortkeys == NIL && |
| 3110 | !ExecSupportsMarkRestore(inner_path)) |
| 3111 | path->materialize_inner = true; |
| 3112 | |
| 3113 | /* |
| 3114 | * Also, force materializing if the inner path is to be sorted and the |
| 3115 | * sort is expected to spill to disk. This is because the final merge |
| 3116 | * pass can be done on-the-fly if it doesn't have to support mark/restore. |
| 3117 | * We don't try to adjust the cost estimates for this consideration, |
| 3118 | * though. |
| 3119 | * |
| 3120 | * Since materialization is a performance optimization in this case, |
| 3121 | * rather than necessary for correctness, we skip it if enable_material is |
| 3122 | * off. |
| 3123 | */ |
| 3124 | else if (enable_material && innersortkeys != NIL && |
| 3125 | relation_byte_size(inner_path_rows, |
| 3126 | inner_path->pathtarget->width) > |
| 3127 | (work_mem * 1024L)) |
| 3128 | path->materialize_inner = true; |
| 3129 | else |
| 3130 | path->materialize_inner = false; |
| 3131 | |
| 3132 | /* Charge the right incremental cost for the chosen case */ |
| 3133 | if (path->materialize_inner) |
| 3134 | run_cost += mat_inner_cost; |
| 3135 | else |
| 3136 | run_cost += bare_inner_cost; |
| 3137 | |
| 3138 | /* CPU costs */ |
| 3139 | |
| 3140 | /* |
| 3141 | * The number of tuple comparisons needed is approximately number of outer |
| 3142 | * rows plus number of inner rows plus number of rescanned tuples (can we |
| 3143 | * refine this?). At each one, we need to evaluate the mergejoin quals. |
| 3144 | */ |
| 3145 | startup_cost += merge_qual_cost.startup; |
| 3146 | startup_cost += merge_qual_cost.per_tuple * |
| 3147 | (outer_skip_rows + inner_skip_rows * rescanratio); |
| 3148 | run_cost += merge_qual_cost.per_tuple * |
| 3149 | ((outer_rows - outer_skip_rows) + |
| 3150 | (inner_rows - inner_skip_rows) * rescanratio); |
| 3151 | |
| 3152 | /* |
| 3153 | * For each tuple that gets through the mergejoin proper, we charge |
| 3154 | * cpu_tuple_cost plus the cost of evaluating additional restriction |
| 3155 | * clauses that are to be applied at the join. (This is pessimistic since |
| 3156 | * not all of the quals may get evaluated at each tuple.) |
| 3157 | * |
| 3158 | * Note: we could adjust for SEMI/ANTI joins skipping some qual |
| 3159 | * evaluations here, but it's probably not worth the trouble. |
| 3160 | */ |
| 3161 | startup_cost += qp_qual_cost.startup; |
| 3162 | cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple; |
| 3163 | run_cost += cpu_per_tuple * mergejointuples; |
| 3164 | |
| 3165 | /* tlist eval costs are paid per output row, not per tuple scanned */ |
| 3166 | startup_cost += path->jpath.path.pathtarget->cost.startup; |
| 3167 | run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows; |
| 3168 | |
| 3169 | path->jpath.path.startup_cost = startup_cost; |
| 3170 | path->jpath.path.total_cost = startup_cost + run_cost; |
| 3171 | } |
| 3172 | |
| 3173 | /* |
| 3174 | * run mergejoinscansel() with caching |
| 3175 | */ |
| 3176 | static MergeScanSelCache * |
| 3177 | cached_scansel(PlannerInfo *root, RestrictInfo *rinfo, PathKey *pathkey) |
| 3178 | { |
| 3179 | MergeScanSelCache *cache; |
| 3180 | ListCell *lc; |
| 3181 | Selectivity leftstartsel, |
| 3182 | leftendsel, |
| 3183 | rightstartsel, |
| 3184 | rightendsel; |
| 3185 | MemoryContext oldcontext; |
| 3186 | |
| 3187 | /* Do we have this result already? */ |
| 3188 | foreach(lc, rinfo->scansel_cache) |
| 3189 | { |
| 3190 | cache = (MergeScanSelCache *) lfirst(lc); |
| 3191 | if (cache->opfamily == pathkey->pk_opfamily && |
| 3192 | cache->collation == pathkey->pk_eclass->ec_collation && |
| 3193 | cache->strategy == pathkey->pk_strategy && |
| 3194 | cache->nulls_first == pathkey->pk_nulls_first) |
| 3195 | return cache; |
| 3196 | } |
| 3197 | |
| 3198 | /* Nope, do the computation */ |
| 3199 | mergejoinscansel(root, |
| 3200 | (Node *) rinfo->clause, |
| 3201 | pathkey->pk_opfamily, |
| 3202 | pathkey->pk_strategy, |
| 3203 | pathkey->pk_nulls_first, |
| 3204 | &leftstartsel, |
| 3205 | &leftendsel, |
| 3206 | &rightstartsel, |
| 3207 | &rightendsel); |
| 3208 | |
| 3209 | /* Cache the result in suitably long-lived workspace */ |
| 3210 | oldcontext = MemoryContextSwitchTo(root->planner_cxt); |
| 3211 | |
| 3212 | cache = (MergeScanSelCache *) palloc(sizeof(MergeScanSelCache)); |
| 3213 | cache->opfamily = pathkey->pk_opfamily; |
| 3214 | cache->collation = pathkey->pk_eclass->ec_collation; |
| 3215 | cache->strategy = pathkey->pk_strategy; |
| 3216 | cache->nulls_first = pathkey->pk_nulls_first; |
| 3217 | cache->leftstartsel = leftstartsel; |
| 3218 | cache->leftendsel = leftendsel; |
| 3219 | cache->rightstartsel = rightstartsel; |
| 3220 | cache->rightendsel = rightendsel; |
| 3221 | |
| 3222 | rinfo->scansel_cache = lappend(rinfo->scansel_cache, cache); |
| 3223 | |
| 3224 | MemoryContextSwitchTo(oldcontext); |
| 3225 | |
| 3226 | return cache; |
| 3227 | } |
| 3228 | |
| 3229 | /* |
| 3230 | * initial_cost_hashjoin |
| 3231 | * Preliminary estimate of the cost of a hashjoin path. |
| 3232 | * |
| 3233 | * This must quickly produce lower-bound estimates of the path's startup and |
| 3234 | * total costs. If we are unable to eliminate the proposed path from |
| 3235 | * consideration using the lower bounds, final_cost_hashjoin will be called |
| 3236 | * to obtain the final estimates. |
| 3237 | * |
| 3238 | * The exact division of labor between this function and final_cost_hashjoin |
| 3239 | * is private to them, and represents a tradeoff between speed of the initial |
| 3240 | * estimate and getting a tight lower bound. We choose to not examine the |
| 3241 | * join quals here (other than by counting the number of hash clauses), |
| 3242 | * so we can't do much with CPU costs. We do assume that |
| 3243 | * ExecChooseHashTableSize is cheap enough to use here. |
| 3244 | * |
| 3245 | * 'workspace' is to be filled with startup_cost, total_cost, and perhaps |
| 3246 | * other data to be used by final_cost_hashjoin |
| 3247 | * 'jointype' is the type of join to be performed |
| 3248 | * 'hashclauses' is the list of joinclauses to be used as hash clauses |
| 3249 | * 'outer_path' is the outer input to the join |
| 3250 | * 'inner_path' is the inner input to the join |
| 3251 | * 'extra' contains miscellaneous information about the join |
| 3252 | * 'parallel_hash' indicates that inner_path is partial and that a shared |
| 3253 | * hash table will be built in parallel |
| 3254 | */ |
| 3255 | void |
| 3256 | initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace, |
| 3257 | JoinType jointype, |
| 3258 | List *hashclauses, |
| 3259 | Path *outer_path, Path *inner_path, |
| 3260 | JoinPathExtraData *extra, |
| 3261 | bool parallel_hash) |
| 3262 | { |
| 3263 | Cost startup_cost = 0; |
| 3264 | Cost run_cost = 0; |
| 3265 | double outer_path_rows = outer_path->rows; |
| 3266 | double inner_path_rows = inner_path->rows; |
| 3267 | double inner_path_rows_total = inner_path_rows; |
| 3268 | int num_hashclauses = list_length(hashclauses); |
| 3269 | int numbuckets; |
| 3270 | int numbatches; |
| 3271 | int num_skew_mcvs; |
| 3272 | size_t space_allowed; /* unused */ |
| 3273 | |
| 3274 | /* cost of source data */ |
| 3275 | startup_cost += outer_path->startup_cost; |
| 3276 | run_cost += outer_path->total_cost - outer_path->startup_cost; |
| 3277 | startup_cost += inner_path->total_cost; |
| 3278 | |
| 3279 | /* |
| 3280 | * Cost of computing hash function: must do it once per input tuple. We |
| 3281 | * charge one cpu_operator_cost for each column's hash function. Also, |
| 3282 | * tack on one cpu_tuple_cost per inner row, to model the costs of |
| 3283 | * inserting the row into the hashtable. |
| 3284 | * |
| 3285 | * XXX when a hashclause is more complex than a single operator, we really |
| 3286 | * should charge the extra eval costs of the left or right side, as |
| 3287 | * appropriate, here. This seems more work than it's worth at the moment. |
| 3288 | */ |
| 3289 | startup_cost += (cpu_operator_cost * num_hashclauses + cpu_tuple_cost) |
| 3290 | * inner_path_rows; |
| 3291 | run_cost += cpu_operator_cost * num_hashclauses * outer_path_rows; |
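| | /* |
| | * For instance (hypothetical numbers, default cpu_operator_cost 0.0025 |
| | * and cpu_tuple_cost 0.01): with two hash clauses, 10000 inner rows and |
| | * 100000 outer rows, this charges (2 * 0.0025 + 0.01) * 10000 = 150 to |
| | * startup_cost for building the hash table and 2 * 0.0025 * 100000 = 500 |
| | * to run_cost for hashing the outer tuples. |
| | */ |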
| 3292 | |
| 3293 | /* |
| 3294 | * If this is a parallel hash build, then the value we have for |
| 3295 | * inner_rows_total currently refers only to the rows returned by each |
| 3296 | * participant. For shared hash table size estimation, we need the total |
| 3297 | * number, so we need to undo the division. |
| 3298 | */ |
| 3299 | if (parallel_hash) |
| 3300 | inner_path_rows_total *= get_parallel_divisor(inner_path); |
| 3301 | |
| 3302 | /* |
| 3303 | * Get hash table size that executor would use for inner relation. |
| 3304 | * |
| 3305 | * XXX for the moment, always assume that skew optimization will be |
| 3306 | * performed. As long as SKEW_WORK_MEM_PERCENT is small, it's not worth |
| 3307 | * trying to determine that for sure. |
| 3308 | * |
| 3309 | * XXX at some point it might be interesting to try to account for skew |
| 3310 | * optimization in the cost estimate, but for now, we don't. |
| 3311 | */ |
| 3312 | ExecChooseHashTableSize(inner_path_rows_total, |
| 3313 | inner_path->pathtarget->width, |
| 3314 | true, /* useskew */ |
| 3315 | parallel_hash, /* try_combined_work_mem */ |
| 3316 | outer_path->parallel_workers, |
| 3317 | &space_allowed, |
| 3318 | &numbuckets, |
| 3319 | &numbatches, |
| 3320 | &num_skew_mcvs); |
| 3321 | |
| 3322 | /* |
| 3323 | * If inner relation is too big then we will need to "batch" the join, |
| 3324 | * which implies writing and reading most of the tuples to disk an extra |
| 3325 | * time. Charge seq_page_cost per page, since the I/O should be nice and |
| 3326 | * sequential. Writing the inner rel counts as startup cost, all the rest |
| 3327 | * as run cost. |
| 3328 | */ |
| 3329 | if (numbatches > 1) |
| 3330 | { |
| 3331 | double outerpages = page_size(outer_path_rows, |
| 3332 | outer_path->pathtarget->width); |
| 3333 | double innerpages = page_size(inner_path_rows, |
| 3334 | inner_path->pathtarget->width); |
| 3335 | |
| 3336 | startup_cost += seq_page_cost * innerpages; |
| 3337 | run_cost += seq_page_cost * (innerpages + 2 * outerpages); |
| 3338 | } |
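| | /* |
| | * Example (hypothetical numbers, seq_page_cost 1.0): with innerpages = |
| | * 1000 and outerpages = 5000, batching adds 1000 to startup_cost for |
| | * writing out the inner rel, and 1000 + 2 * 5000 = 11000 to run_cost for |
| | * re-reading it plus writing and re-reading the outer rel. |
| | */ |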
| 3339 | |
| 3340 | /* CPU costs left for later */ |
| 3341 | |
| 3342 | /* Public result fields */ |
| 3343 | workspace->startup_cost = startup_cost; |
| 3344 | workspace->total_cost = startup_cost + run_cost; |
| 3345 | /* Save private data for final_cost_hashjoin */ |
| 3346 | workspace->run_cost = run_cost; |
| 3347 | workspace->numbuckets = numbuckets; |
| 3348 | workspace->numbatches = numbatches; |
| 3349 | workspace->inner_rows_total = inner_path_rows_total; |
| 3350 | } |
| 3351 | |
| 3352 | /* |
| 3353 | * final_cost_hashjoin |
| 3354 | * Final estimate of the cost and result size of a hashjoin path. |
| 3355 | * |
| 3356 | * Note: the numbatches estimate is also saved into 'path' for use later |
| 3357 | * |
| 3358 | * 'path' is already filled in except for the rows and cost fields and |
| 3359 | * num_batches |
| 3360 | * 'workspace' is the result from initial_cost_hashjoin |
| 3361 | * 'extra' contains miscellaneous information about the join |
| 3362 | */ |
| 3363 | void |
| 3364 | final_cost_hashjoin(PlannerInfo *root, HashPath *path, |
| 3365 | JoinCostWorkspace *workspace, |
| 3366 | JoinPathExtraData *extra) |
| 3367 | { |
| 3368 | Path *outer_path = path->jpath.outerjoinpath; |
| 3369 | Path *inner_path = path->jpath.innerjoinpath; |
| 3370 | double outer_path_rows = outer_path->rows; |
| 3371 | double inner_path_rows = inner_path->rows; |
| 3372 | double inner_path_rows_total = workspace->inner_rows_total; |
| 3373 | List *hashclauses = path->path_hashclauses; |
| 3374 | Cost startup_cost = workspace->startup_cost; |
| 3375 | Cost run_cost = workspace->run_cost; |
| 3376 | int numbuckets = workspace->numbuckets; |
| 3377 | int numbatches = workspace->numbatches; |
| 3378 | Cost cpu_per_tuple; |
| 3379 | QualCost hash_qual_cost; |
| 3380 | QualCost qp_qual_cost; |
| 3381 | double hashjointuples; |
| 3382 | double virtualbuckets; |
| 3383 | Selectivity innerbucketsize; |
| 3384 | Selectivity innermcvfreq; |
| 3385 | ListCell *hcl; |
| 3386 | |
| 3387 | /* Mark the path with the correct row estimate */ |
| 3388 | if (path->jpath.path.param_info) |
| 3389 | path->jpath.path.rows = path->jpath.path.param_info->ppi_rows; |
| 3390 | else |
| 3391 | path->jpath.path.rows = path->jpath.path.parent->rows; |
| 3392 | |
| 3393 | /* For partial paths, scale row estimate. */ |
| 3394 | if (path->jpath.path.parallel_workers > 0) |
| 3395 | { |
| 3396 | double parallel_divisor = get_parallel_divisor(&path->jpath.path); |
| 3397 | |
| 3398 | path->jpath.path.rows = |
| 3399 | clamp_row_est(path->jpath.path.rows / parallel_divisor); |
| 3400 | } |
| 3401 | |
| 3402 | /* |
| 3403 | * We could include disable_cost in the preliminary estimate, but that |
| 3404 | * would amount to optimizing for the case where the join method is |
| 3405 | * disabled, which doesn't seem like the way to bet. |
| 3406 | */ |
| 3407 | if (!enable_hashjoin) |
| 3408 | startup_cost += disable_cost; |
| 3409 | |
| 3410 | /* mark the path with estimated # of batches */ |
| 3411 | path->num_batches = numbatches; |
| 3412 | |
| 3413 | /* store the total number of tuples (sum of partial row estimates) */ |
| 3414 | path->inner_rows_total = inner_path_rows_total; |
| 3415 | |
| 3416 | /* and compute the number of "virtual" buckets in the whole join */ |
| 3417 | virtualbuckets = (double) numbuckets * (double) numbatches; |
| 3418 | |
| 3419 | /* |
| 3420 | * Determine bucketsize fraction and MCV frequency for the inner relation. |
| 3421 | * We use the smallest bucketsize or MCV frequency estimated for any |
| 3422 | * individual hashclause; this is undoubtedly conservative. |
| 3423 | * |
| 3424 | * BUT: if inner relation has been unique-ified, we can assume it's good |
| 3425 | * for hashing. This is important both because it's the right answer, and |
| 3426 | * because we avoid contaminating the cache with a value that's wrong for |
| 3427 | * non-unique-ified paths. |
| 3428 | */ |
| 3429 | if (IsA(inner_path, UniquePath)) |
| 3430 | { |
| 3431 | innerbucketsize = 1.0 / virtualbuckets; |
| 3432 | innermcvfreq = 0.0; |
| 3433 | } |
| 3434 | else |
| 3435 | { |
| 3436 | innerbucketsize = 1.0; |
| 3437 | innermcvfreq = 1.0; |
| 3438 | foreach(hcl, hashclauses) |
| 3439 | { |
| 3440 | RestrictInfo *restrictinfo = lfirst_node(RestrictInfo, hcl); |
| 3441 | Selectivity thisbucketsize; |
| 3442 | Selectivity thismcvfreq; |
| 3443 | |
| 3444 | /* |
| 3445 | * First we have to figure out which side of the hashjoin clause |
| 3446 | * is the inner side. |
| 3447 | * |
| 3448 | * Since we tend to visit the same clauses over and over when |
| 3449 | * planning a large query, we cache the bucket stats estimates in |
| 3450 | * the RestrictInfo node to avoid repeated lookups of statistics. |
| 3451 | */ |
| 3452 | if (bms_is_subset(restrictinfo->right_relids, |
| 3453 | inner_path->parent->relids)) |
| 3454 | { |
| 3455 | /* righthand side is inner */ |
| 3456 | thisbucketsize = restrictinfo->right_bucketsize; |
| 3457 | if (thisbucketsize < 0) |
| 3458 | { |
| 3459 | /* not cached yet */ |
| 3460 | estimate_hash_bucket_stats(root, |
| 3461 | get_rightop(restrictinfo->clause), |
| 3462 | virtualbuckets, |
| 3463 | &restrictinfo->right_mcvfreq, |
| 3464 | &restrictinfo->right_bucketsize); |
| 3465 | thisbucketsize = restrictinfo->right_bucketsize; |
| 3466 | } |
| 3467 | thismcvfreq = restrictinfo->right_mcvfreq; |
| 3468 | } |
| 3469 | else |
| 3470 | { |
| 3471 | Assert(bms_is_subset(restrictinfo->left_relids, |
| 3472 | inner_path->parent->relids)); |
| 3473 | /* lefthand side is inner */ |
| 3474 | thisbucketsize = restrictinfo->left_bucketsize; |
| 3475 | if (thisbucketsize < 0) |
| 3476 | { |
| 3477 | /* not cached yet */ |
| 3478 | estimate_hash_bucket_stats(root, |
| 3479 | get_leftop(restrictinfo->clause), |
| 3480 | virtualbuckets, |
| 3481 | &restrictinfo->left_mcvfreq, |
| 3482 | &restrictinfo->left_bucketsize); |
| 3483 | thisbucketsize = restrictinfo->left_bucketsize; |
| 3484 | } |
| 3485 | thismcvfreq = restrictinfo->left_mcvfreq; |
| 3486 | } |
| 3487 | |
| 3488 | if (innerbucketsize > thisbucketsize) |
| 3489 | innerbucketsize = thisbucketsize; |
| 3490 | if (innermcvfreq > thismcvfreq) |
| 3491 | innermcvfreq = thismcvfreq; |
| 3492 | } |
| 3493 | } |
| 3494 | |
| 3495 | /* |
| 3496 | * If the bucket holding the inner MCV would exceed work_mem, we don't |
| 3497 | * want to hash unless there is really no other alternative, so apply |
| 3498 | * disable_cost. (The executor normally copes with excessive memory usage |
| 3499 | * by splitting batches, but obviously it cannot separate equal values |
| 3500 | * that way, so it will be unable to drive the batch size below work_mem |
| 3501 | * when this is true.) |
| 3502 | */ |
| 3503 | if (relation_byte_size(clamp_row_est(inner_path_rows * innermcvfreq), |
| 3504 | inner_path->pathtarget->width) > |
| 3505 | (work_mem * 1024L)) |
| 3506 | startup_cost += disable_cost; |
| 3507 | |
| 3508 | /* |
| 3509 | * Compute cost of the hashquals and qpquals (other restriction clauses) |
| 3510 | * separately. |
| 3511 | */ |
| 3512 | cost_qual_eval(&hash_qual_cost, hashclauses, root); |
| 3513 | cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root); |
| 3514 | qp_qual_cost.startup -= hash_qual_cost.startup; |
| 3515 | qp_qual_cost.per_tuple -= hash_qual_cost.per_tuple; |
| 3516 | |
| 3517 | /* CPU costs */ |
| 3518 | |
| 3519 | if (path->jpath.jointype == JOIN_SEMI || |
| 3520 | path->jpath.jointype == JOIN_ANTI || |
| 3521 | extra->inner_unique) |
| 3522 | { |
| 3523 | double outer_matched_rows; |
| 3524 | Selectivity inner_scan_frac; |
| 3525 | |
| 3526 | /* |
| 3527 | * With a SEMI or ANTI join, or if the innerrel is known unique, the |
| 3528 | * executor will stop after the first match. |
| 3529 | * |
| 3530 | * For an outer-rel row that has at least one match, we can expect the |
| 3531 | * bucket scan to stop after a fraction 1/(match_count+1) of the |
| 3532 | * bucket's rows, if the matches are evenly distributed. Since they |
| 3533 | * probably aren't quite evenly distributed, we apply a fuzz factor of |
| 3534 | * 2.0 to that fraction. (If we used a larger fuzz factor, we'd have |
| 3535 | * to clamp inner_scan_frac to at most 1.0; but since match_count is |
| 3536 | * at least 1, no such clamp is needed now.) |
| 3537 | */ |
| 3538 | outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac); |
| 3539 | inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0); |
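| | /* |
| | * For example, match_count = 3 gives inner_scan_frac = 2.0 / 4.0 = 0.5, |
| | * i.e. twice the idealized 1/(3+1) fraction. |
| | */ |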
| 3540 | |
| 3541 | startup_cost += hash_qual_cost.startup; |
| 3542 | run_cost += hash_qual_cost.per_tuple * outer_matched_rows * |
| 3543 | clamp_row_est(inner_path_rows * innerbucketsize * inner_scan_frac) * 0.5; |
| 3544 | |
| 3545 | /* |
| 3546 | * For unmatched outer-rel rows, the picture is quite a lot different. |
| 3547 | * In the first place, there is no reason to assume that these rows |
| 3548 | * preferentially hit heavily-populated buckets; instead assume they |
| 3549 | * are uncorrelated with the inner distribution and so they see an |
| 3550 | * average bucket size of inner_path_rows / virtualbuckets. In the |
| 3551 | * second place, it seems likely that they will have few if any exact |
| 3552 | * hash-code matches and so very few of the tuples in the bucket will |
| 3553 | * actually require eval of the hash quals. We don't have any good |
| 3554 | * way to estimate how many will, but for the moment assume that the |
| 3555 | * effective cost per bucket entry is one-tenth what it is for |
| 3556 | * matchable tuples. |
| 3557 | */ |
| 3558 | run_cost += hash_qual_cost.per_tuple * |
| 3559 | (outer_path_rows - outer_matched_rows) * |
| 3560 | clamp_row_est(inner_path_rows / virtualbuckets) * 0.05; |
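| | /* (0.05 is one-tenth of the 0.5 factor charged for matchable tuples above) */ |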
| 3561 | |
| 3562 | /* Get # of tuples that will pass the basic join */ |
| 3563 | if (path->jpath.jointype == JOIN_ANTI) |
| 3564 | hashjointuples = outer_path_rows - outer_matched_rows; |
| 3565 | else |
| 3566 | hashjointuples = outer_matched_rows; |
| 3567 | } |
| 3568 | else |
| 3569 | { |
| 3570 | /* |
| 3571 | * The number of tuple comparisons needed is the number of outer |
| 3572 | * tuples times the typical number of tuples in a hash bucket, which |
| 3573 | * is the inner relation size times its bucketsize fraction. At each |
| 3574 | * one, we need to evaluate the hashjoin quals. But actually, |
| 3575 | * charging the full qual eval cost at each tuple is pessimistic, |
| 3576 | * since we don't evaluate the quals unless the hash values match |
| 3577 | * exactly. For lack of a better idea, halve the cost estimate to |
| 3578 | * allow for that. |
| 3579 | */ |
| 3580 | startup_cost += hash_qual_cost.startup; |
| 3581 | run_cost += hash_qual_cost.per_tuple * outer_path_rows * |
| 3582 | clamp_row_est(inner_path_rows * innerbucketsize) * 0.5; |
| 3583 | |
| 3584 | /* |
| 3585 | * Get approx # tuples passing the hashquals. We use |
| 3586 | * approx_tuple_count here because we need an estimate done with |
| 3587 | * JOIN_INNER semantics. |
| 3588 | */ |
| 3589 | hashjointuples = approx_tuple_count(root, &path->jpath, hashclauses); |
| 3590 | } |
| 3591 | |
| 3592 | /* |
| 3593 | * For each tuple that gets through the hashjoin proper, we charge |
| 3594 | * cpu_tuple_cost plus the cost of evaluating additional restriction |
| 3595 | * clauses that are to be applied at the join. (This is pessimistic since |
| 3596 | * not all of the quals may get evaluated at each tuple.) |
| 3597 | */ |
| 3598 | startup_cost += qp_qual_cost.startup; |
| 3599 | cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple; |
| 3600 | run_cost += cpu_per_tuple * hashjointuples; |
| 3601 | |
| 3602 | /* tlist eval costs are paid per output row, not per tuple scanned */ |
| 3603 | startup_cost += path->jpath.path.pathtarget->cost.startup; |
| 3604 | run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows; |
| 3605 | |
| 3606 | path->jpath.path.startup_cost = startup_cost; |
| 3607 | path->jpath.path.total_cost = startup_cost + run_cost; |
| 3608 | } |
| 3609 | |
| 3610 | |
| 3611 | /* |
| 3612 | * cost_subplan |
| 3613 | * Figure the costs for a SubPlan (or initplan). |
| 3614 | * |
| 3615 | * Note: we could dig the subplan's Plan out of the root list, but in practice |
| 3616 | * all callers have it handy already, so we make them pass it. |
| 3617 | */ |
| 3618 | void |
| 3619 | cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan) |
| 3620 | { |
| 3621 | QualCost sp_cost; |
| 3622 | |
| 3623 | /* Figure any cost for evaluating the testexpr */ |
| 3624 | cost_qual_eval(&sp_cost, |
| 3625 | make_ands_implicit((Expr *) subplan->testexpr), |
| 3626 | root); |
| 3627 | |
| 3628 | if (subplan->useHashTable) |
| 3629 | { |
| 3630 | /* |
| 3631 | * If we are using a hash table for the subquery outputs, then the |
| 3632 | * cost of evaluating the query is a one-time cost. We charge one |
| 3633 | * cpu_operator_cost per tuple for the work of loading the hashtable, |
| 3634 | * too. |
| 3635 | */ |
| 3636 | sp_cost.startup += plan->total_cost + |
| 3637 | cpu_operator_cost * plan->plan_rows; |
| 3638 | |
| 3639 | /* |
| 3640 | * The per-tuple costs include the cost of evaluating the lefthand |
| 3641 | * expressions, plus the cost of probing the hashtable. We already |
| 3642 | * accounted for the lefthand expressions as part of the testexpr, and |
| 3643 | * will also have counted one cpu_operator_cost for each comparison |
| 3644 | * operator. That is probably too low for the probing cost, but it's |
| 3645 | * hard to make a better estimate, so live with it for now. |
| 3646 | */ |
| 3647 | } |
| 3648 | else |
| 3649 | { |
| 3650 | /* |
| 3651 | * Otherwise we will be rescanning the subplan output on each |
| 3652 | * evaluation. We need to estimate how much of the output we will |
| 3653 | * actually need to scan. NOTE: this logic should agree with the |
| 3654 | * tuple_fraction estimates used by make_subplan() in |
| 3655 | * plan/subselect.c. |
| 3656 | */ |
| 3657 | Cost plan_run_cost = plan->total_cost - plan->startup_cost; |
| 3658 | |
| 3659 | if (subplan->subLinkType == EXISTS_SUBLINK) |
| 3660 | { |
| 3661 | /* we only need to fetch 1 tuple; clamp to avoid zero divide */ |
| 3662 | sp_cost.per_tuple += plan_run_cost / clamp_row_est(plan->plan_rows); |
| 3663 | } |
| 3664 | else if (subplan->subLinkType == ALL_SUBLINK || |
| 3665 | subplan->subLinkType == ANY_SUBLINK) |
| 3666 | { |
| 3667 | /* assume we need 50% of the tuples */ |
| 3668 | sp_cost.per_tuple += 0.50 * plan_run_cost; |
| 3669 | /* also charge a cpu_operator_cost per row examined */ |
| 3670 | sp_cost.per_tuple += 0.50 * plan->plan_rows * cpu_operator_cost; |
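| | /* |
| | * E.g. (hypothetical numbers, default cpu_operator_cost 0.0025): with |
| | * plan_run_cost = 400 and plan_rows = 1000, this adds 0.50 * 400 + |
| | * 0.50 * 1000 * 0.0025 = 201.25 to the per-call cost. |
| | */ |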
| 3671 | } |
| 3672 | else |
| 3673 | { |
| 3674 | /* assume we need all tuples */ |
| 3675 | sp_cost.per_tuple += plan_run_cost; |
| 3676 | } |
| 3677 | |
| 3678 | /* |
| 3679 | * Also account for subplan's startup cost. If the subplan is |
| 3680 | * uncorrelated or undirect correlated, AND its topmost node is one |
| 3681 | * that materializes its output, assume that we'll only need to pay |
| 3682 | * its startup cost once; otherwise assume we pay the startup cost |
| 3683 | * every time. |
| 3684 | */ |
| 3685 | if (subplan->parParam == NIL && |
| 3686 | ExecMaterializesOutput(nodeTag(plan))) |
| 3687 | sp_cost.startup += plan->startup_cost; |
| 3688 | else |
| 3689 | sp_cost.per_tuple += plan->startup_cost; |
| 3690 | } |
| 3691 | |
| 3692 | subplan->startup_cost = sp_cost.startup; |
| 3693 | subplan->per_call_cost = sp_cost.per_tuple; |
| 3694 | } |
| 3695 | |
| 3696 | |
| 3697 | /* |
| 3698 | * cost_rescan |
| 3699 | * Given a finished Path, estimate the costs of rescanning it after |
| 3700 | * having done so the first time. For some Path types a rescan is |
| 3701 | * cheaper than an original scan (if no parameters change), and this |
| 3702 | * function embodies knowledge about that. The default is to return |
| 3703 | * the same costs stored in the Path. (Note that the cost estimates |
| 3704 | * actually stored in Paths are always for first scans.) |
| 3705 | * |
| 3706 | * This function is not currently intended to model effects such as rescans |
| 3707 | * being cheaper due to disk block caching; what we are concerned with is |
| 3708 | * plan types wherein the executor caches results explicitly, or doesn't |
| 3709 | * redo startup calculations, etc. |
| 3710 | */ |
| 3711 | static void |
| 3712 | cost_rescan(PlannerInfo *root, Path *path, |
| 3713 | Cost *rescan_startup_cost, /* output parameters */ |
| 3714 | Cost *rescan_total_cost) |
| 3715 | { |
| 3716 | switch (path->pathtype) |
| 3717 | { |
| 3718 | case T_FunctionScan: |
| 3719 | |
| 3720 | /* |
| 3721 | * Currently, nodeFunctionscan.c always executes the function to |
| 3722 | * completion before returning any rows, and caches the results in |
| 3723 | * a tuplestore. So the function eval cost is all startup cost |
| 3724 | * and isn't paid over again on rescans. However, all run costs |
| 3725 | * will be paid over again. |
| 3726 | */ |
| 3727 | *rescan_startup_cost = 0; |
| 3728 | *rescan_total_cost = path->total_cost - path->startup_cost; |
| 3729 | break; |
| 3730 | case T_HashJoin: |
| 3731 | |
| 3732 | /* |
| 3733 | * If it's a single-batch join, we don't need to rebuild the hash |
| 3734 | * table during a rescan. |
| 3735 | */ |
| 3736 | if (((HashPath *) path)->num_batches == 1) |
| 3737 | { |
| 3738 | /* Startup cost is exactly the cost of hash table building */ |
| 3739 | *rescan_startup_cost = 0; |
| 3740 | *rescan_total_cost = path->total_cost - path->startup_cost; |
| 3741 | } |
| 3742 | else |
| 3743 | { |
| 3744 | /* Otherwise, no special treatment */ |
| 3745 | *rescan_startup_cost = path->startup_cost; |
| 3746 | *rescan_total_cost = path->total_cost; |
| 3747 | } |
| 3748 | break; |
| 3749 | case T_CteScan: |
| 3750 | case T_WorkTableScan: |
| 3751 | { |
| 3752 | /* |
| 3753 | * These plan types materialize their final result in a |
| 3754 | * tuplestore or tuplesort object. So the rescan cost is only |
| 3755 | * cpu_tuple_cost per tuple, unless the result is large enough |
| 3756 | * to spill to disk. |
| 3757 | */ |
| 3758 | Cost run_cost = cpu_tuple_cost * path->rows; |
| 3759 | double nbytes = relation_byte_size(path->rows, |
| 3760 | path->pathtarget->width); |
| 3761 | long work_mem_bytes = work_mem * 1024L; |
| 3762 | |
| 3763 | if (nbytes > work_mem_bytes) |
| 3764 | { |
| 3765 | /* It will spill, so account for re-read cost */ |
| 3766 | double npages = ceil(nbytes / BLCKSZ); |
| 3767 | |
| 3768 | run_cost += seq_page_cost * npages; |
| 3769 | } |
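| | /* |
| | * For example (hypothetical numbers): a million 100-byte rows occupy on |
| | * the order of 100 MB plus per-tuple overhead, which exceeds a typical |
| | * work_mem setting of a few MB, so the spill charge above applies and |
| | * seq_page_cost * ceil(nbytes / BLCKSZ) is added for re-reading the |
| | * tuplestore from disk on each rescan. |
| | */ |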
| 3770 | *rescan_startup_cost = 0; |
| 3771 | *rescan_total_cost = run_cost; |
| 3772 | } |
| 3773 | break; |
| 3774 | case T_Material: |
| 3775 | case T_Sort: |
| 3776 | { |
| 3777 | /* |
| 3778 | * These plan types not only materialize their results, but do |
| 3779 | * not implement qual filtering or projection. So they are |
| 3780 | * even cheaper to rescan than the ones above. We charge only |
| 3781 | * cpu_operator_cost per tuple. (Note: keep that in sync with |
| 3782 | * the run_cost charge in cost_sort, and also see comments in |
| 3783 | * cost_material before you change it.) |
| 3784 | */ |
| 3785 | Cost run_cost = cpu_operator_cost * path->rows; |
| 3786 | double nbytes = relation_byte_size(path->rows, |
| 3787 | path->pathtarget->width); |
| 3788 | long work_mem_bytes = work_mem * 1024L; |
| 3789 | |
| 3790 | if (nbytes > work_mem_bytes) |
| 3791 | { |
| 3792 | /* It will spill, so account for re-read cost */ |
| 3793 | double npages = ceil(nbytes / BLCKSZ); |
| 3794 | |
| 3795 | run_cost += seq_page_cost * npages; |
| 3796 | } |
| 3797 | *rescan_startup_cost = 0; |
| 3798 | *rescan_total_cost = run_cost; |
| 3799 | } |
| 3800 | break; |
| 3801 | default: |
| 3802 | *rescan_startup_cost = path->startup_cost; |
| 3803 | *rescan_total_cost = path->total_cost; |
| 3804 | break; |
| 3805 | } |
| 3806 | } |
| 3807 | |
| 3808 | |
| 3809 | /* |
| 3810 | * cost_qual_eval |
| 3811 | * Estimate the CPU costs of evaluating a WHERE clause. |
| 3812 | * The input can be either an implicitly-ANDed list of boolean |
| 3813 | * expressions, or a list of RestrictInfo nodes. (The latter is |
| 3814 | * preferred since it allows caching of the results.) |
| 3815 | * The result includes both a one-time (startup) component, |
| 3816 | * and a per-evaluation component. |
| 3817 | */ |
| 3818 | void |
| 3819 | cost_qual_eval(QualCost *cost, List *quals, PlannerInfo *root) |
| 3820 | { |
| 3821 | cost_qual_eval_context context; |
| 3822 | ListCell *l; |
| 3823 | |
| 3824 | context.root = root; |
| 3825 | context.total.startup = 0; |
| 3826 | context.total.per_tuple = 0; |
| 3827 | |
| 3828 | /* We don't charge any cost for the implicit ANDing at top level ... */ |
| 3829 | |
| 3830 | foreach(l, quals) |
| 3831 | { |
| 3832 | Node *qual = (Node *) lfirst(l); |
| 3833 | |
| 3834 | cost_qual_eval_walker(qual, &context); |
| 3835 | } |
| 3836 | |
| 3837 | *cost = context.total; |
| 3838 | } |
| 3839 | |
| 3840 | /* |
| 3841 | * cost_qual_eval_node |
| 3842 | * As above, for a single RestrictInfo or expression. |
| 3843 | */ |
| 3844 | void |
| 3845 | cost_qual_eval_node(QualCost *cost, Node *qual, PlannerInfo *root) |
| 3846 | { |
| 3847 | cost_qual_eval_context context; |
| 3848 | |
| 3849 | context.root = root; |
| 3850 | context.total.startup = 0; |
| 3851 | context.total.per_tuple = 0; |
| 3852 | |
| 3853 | cost_qual_eval_walker(qual, &context); |
| 3854 | |
| 3855 | *cost = context.total; |
| 3856 | } |
| 3857 | |
| 3858 | static bool |
| 3859 | cost_qual_eval_walker(Node *node, cost_qual_eval_context *context) |
| 3860 | { |
| 3861 | if (node == NULL) |
| 3862 | return false; |
| 3863 | |
| 3864 | /* |
| 3865 | * RestrictInfo nodes contain an eval_cost field reserved for this |
| 3866 | * routine's use, so that it's not necessary to evaluate the qual clause's |
| 3867 | * cost more than once. If the clause's cost hasn't been computed yet, |
| 3868 | * the field's startup value will contain -1. |
| 3869 | */ |
| 3870 | if (IsA(node, RestrictInfo)) |
| 3871 | { |
| 3872 | RestrictInfo *rinfo = (RestrictInfo *) node; |
| 3873 | |
| 3874 | if (rinfo->eval_cost.startup < 0) |
| 3875 | { |
| 3876 | cost_qual_eval_context locContext; |
| 3877 | |
| 3878 | locContext.root = context->root; |
| 3879 | locContext.total.startup = 0; |
| 3880 | locContext.total.per_tuple = 0; |
| 3881 | |
| 3882 | /* |
| 3883 | * For an OR clause, recurse into the marked-up tree so that we |
| 3884 | * set the eval_cost for contained RestrictInfos too. |
| 3885 | */ |
| 3886 | if (rinfo->orclause) |
| 3887 | cost_qual_eval_walker((Node *) rinfo->orclause, &locContext); |
| 3888 | else |
| 3889 | cost_qual_eval_walker((Node *) rinfo->clause, &locContext); |
| 3890 | |
| 3891 | /* |
| 3892 | * If the RestrictInfo is marked pseudoconstant, it will be tested |
| 3893 | * only once, so treat its cost as all startup cost. |
| 3894 | */ |
| 3895 | if (rinfo->pseudoconstant) |
| 3896 | { |
| 3897 | /* count one execution during startup */ |
| 3898 | locContext.total.startup += locContext.total.per_tuple; |
| 3899 | locContext.total.per_tuple = 0; |
| 3900 | } |
| 3901 | rinfo->eval_cost = locContext.total; |
| 3902 | } |
| 3903 | context->total.startup += rinfo->eval_cost.startup; |
| 3904 | context->total.per_tuple += rinfo->eval_cost.per_tuple; |
| 3905 | /* do NOT recurse into children */ |
| 3906 | return false; |
| 3907 | } |
| 3908 | |
| 3909 | /* |
| 3910 | * For each operator or function node in the given tree, we charge the |
| 3911 | * estimated execution cost given by pg_proc.procost (remember to multiply |
| 3912 | * this by cpu_operator_cost). |
| 3913 | * |
| 3914 | * Vars and Consts are charged zero, and so are boolean operators (AND, |
| 3915 | * OR, NOT). Simplistic, but a lot better than no model at all. |
| 3916 | * |
| 3917 | * Should we try to account for the possibility of short-circuit |
| 3918 | * evaluation of AND/OR? Probably *not*, because that would make the |
| 3919 | * results depend on the clause ordering, and we are not in any position |
| 3920 | * to expect that the current ordering of the clauses is the one that's |
| 3921 | * going to end up being used. The above per-RestrictInfo caching would |
| 3922 | * not mix well with trying to re-order clauses anyway. |
| 3923 | * |
| 3924 | * Another issue that is entirely ignored here is that if a set-returning |
| 3925 | * function is below top level in the tree, the functions/operators above |
| 3926 | * it will need to be evaluated multiple times. In practical use, such |
| 3927 | * cases arise so seldom as to not be worth the added complexity needed; |
| 3928 | * moreover, since our rowcount estimates for functions tend to be pretty |
| 3929 | * phony, the results would also be pretty phony. |
| 3930 | */ |
| 3931 | if (IsA(node, FuncExpr)) |
| 3932 | { |
| 3933 | add_function_cost(context->root, ((FuncExpr *) node)->funcid, node, |
| 3934 | &context->total); |
| 3935 | } |
| 3936 | else if (IsA(node, OpExpr) || |
| 3937 | IsA(node, DistinctExpr) || |
| 3938 | IsA(node, NullIfExpr)) |
| 3939 | { |
| 3940 | /* rely on struct equivalence to treat these all alike */ |
| 3941 | set_opfuncid((OpExpr *) node); |
| 3942 | add_function_cost(context->root, ((OpExpr *) node)->opfuncid, node, |
| 3943 | &context->total); |
| 3944 | } |
| 3945 | else if (IsA(node, ScalarArrayOpExpr)) |
| 3946 | { |
| 3947 | /* |
| 3948 | * Estimate that the operator will be applied to about half of the |
| 3949 | * array elements before the answer is determined. |
| 3950 | */ |
| 3951 | ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) node; |
| 3952 | Node *arraynode = (Node *) lsecond(saop->args); |
| 3953 | QualCost sacosts; |
| 3954 | |
| 3955 | set_sa_opfuncid(saop); |
| 3956 | sacosts.startup = sacosts.per_tuple = 0; |
| 3957 | add_function_cost(context->root, saop->opfuncid, NULL, |
| 3958 | &sacosts); |
| 3959 | context->total.startup += sacosts.startup; |
| 3960 | context->total.per_tuple += sacosts.per_tuple * |
| 3961 | estimate_array_length(arraynode) * 0.5; |
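| | /* e.g. an array of 10 elements is charged as 5 operator applications */ |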
| 3962 | } |
| 3963 | else if (IsA(node, Aggref) || |
| 3964 | IsA(node, WindowFunc)) |
| 3965 | { |
| 3966 | /* |
| 3967 | * Aggref and WindowFunc nodes are (and should be) treated like Vars, |
| 3968 | * ie, zero execution cost in the current model, because they behave |
| 3969 | * essentially like Vars at execution. We disregard the costs of |
| 3970 | * their input expressions for the same reason. The actual execution |
| 3971 | * costs of the aggregate/window functions and their arguments have to |
| 3972 | * be factored into plan-node-specific costing of the Agg or WindowAgg |
| 3973 | * plan node. |
| 3974 | */ |
| 3975 | return false; /* don't recurse into children */ |
| 3976 | } |
| 3977 | else if (IsA(node, CoerceViaIO)) |
| 3978 | { |
| 3979 | CoerceViaIO *iocoerce = (CoerceViaIO *) node; |
| 3980 | Oid iofunc; |
| 3981 | Oid typioparam; |
| 3982 | bool typisvarlena; |
| 3983 | |
| 3984 | /* check the result type's input function */ |
| 3985 | getTypeInputInfo(iocoerce->resulttype, |
| 3986 | &iofunc, &typioparam); |
| 3987 | add_function_cost(context->root, iofunc, NULL, |
| 3988 | &context->total); |
| 3989 | /* check the input type's output function */ |
| 3990 | getTypeOutputInfo(exprType((Node *) iocoerce->arg), |
| 3991 | &iofunc, &typisvarlena); |
| 3992 | add_function_cost(context->root, iofunc, NULL, |
| 3993 | &context->total); |
| 3994 | } |
| 3995 | else if (IsA(node, ArrayCoerceExpr)) |
| 3996 | { |
| 3997 | ArrayCoerceExpr *acoerce = (ArrayCoerceExpr *) node; |
| 3998 | QualCost perelemcost; |
| 3999 | |
| 4000 | cost_qual_eval_node(&perelemcost, (Node *) acoerce->elemexpr, |
| 4001 | context->root); |
| 4002 | context->total.startup += perelemcost.startup; |
| 4003 | if (perelemcost.per_tuple > 0) |
| 4004 | context->total.per_tuple += perelemcost.per_tuple * |
| 4005 | estimate_array_length((Node *) acoerce->arg); |
| 4006 | } |
| 4007 | else if (IsA(node, RowCompareExpr)) |
| 4008 | { |
| 4009 | /* Conservatively assume we will check all the columns */ |
| 4010 | RowCompareExpr *rcexpr = (RowCompareExpr *) node; |
| 4011 | ListCell *lc; |
| 4012 | |
| 4013 | foreach(lc, rcexpr->opnos) |
| 4014 | { |
| 4015 | Oid opid = lfirst_oid(lc); |
| 4016 | |
| 4017 | add_function_cost(context->root, get_opcode(opid), NULL, |
| 4018 | &context->total); |
| 4019 | } |
| 4020 | } |
| 4021 | else if (IsA(node, MinMaxExpr) || |
| 4022 | IsA(node, SQLValueFunction) || |
| 4023 | IsA(node, XmlExpr) || |
| 4024 | IsA(node, CoerceToDomain) || |
| 4025 | IsA(node, NextValueExpr)) |
| 4026 | { |
| 4027 | /* Treat all these as having cost 1 */ |
| 4028 | context->total.per_tuple += cpu_operator_cost; |
| 4029 | } |
| 4030 | else if (IsA(node, CurrentOfExpr)) |
| 4031 | { |
| 4032 | /* Report high cost to prevent selection of anything but TID scan */ |
| 4033 | context->total.startup += disable_cost; |
| 4034 | } |
| 4035 | else if (IsA(node, SubLink)) |
| 4036 | { |
| 4037 | /* This routine should not be applied to un-planned expressions */ |
| 4038 | elog(ERROR, "cannot handle unplanned sub-select"); |
| 4039 | } |
| 4040 | else if (IsA(node, SubPlan)) |
| 4041 | { |
| 4042 | /* |
| 4043 | * A subplan node in an expression typically indicates that the |
| 4044 | * subplan will be executed on each evaluation, so charge accordingly. |
| 4045 | * (Sub-selects that can be executed as InitPlans have already been |
| 4046 | * removed from the expression.) |
| 4047 | */ |
| 4048 | SubPlan *subplan = (SubPlan *) node; |
| 4049 | |
| 4050 | context->total.startup += subplan->startup_cost; |
| 4051 | context->total.per_tuple += subplan->per_call_cost; |
| 4052 | |
| 4053 | /* |
| 4054 | * We don't want to recurse into the testexpr, because it was already |
| 4055 | * counted in the SubPlan node's costs. So we're done. |
| 4056 | */ |
| 4057 | return false; |
| 4058 | } |
| 4059 | else if (IsA(node, AlternativeSubPlan)) |
| 4060 | { |
| 4061 | /* |
| 4062 | * Arbitrarily use the first alternative plan for costing. (We should |
| 4063 | * certainly only include one alternative, and we don't yet have |
| 4064 | * enough information to know which one the executor is most likely to |
| 4065 | * use.) |
| 4066 | */ |
| 4067 | AlternativeSubPlan *asplan = (AlternativeSubPlan *) node; |
| 4068 | |
| 4069 | return cost_qual_eval_walker((Node *) linitial(asplan->subplans), |
| 4070 | context); |
| 4071 | } |
| 4072 | else if (IsA(node, PlaceHolderVar)) |
| 4073 | { |
| 4074 | /* |
| 4075 | * A PlaceHolderVar should be given cost zero when considering general |
| 4076 | * expression evaluation costs. The expense of doing the contained |
| 4077 | * expression is charged as part of the tlist eval costs of the scan |
| 4078 | * or join where the PHV is first computed (see set_rel_width and |
| 4079 | * add_placeholders_to_joinrel). If we charged it again here, we'd be |
| 4080 | * double-counting the cost for each level of plan that the PHV |
| 4081 | * bubbles up through. Hence, return without recursing into the |
| 4082 | * phexpr. |
| 4083 | */ |
| 4084 | return false; |
| 4085 | } |
| 4086 | |
| 4087 | /* recurse into children */ |
| 4088 | return expression_tree_walker(node, cost_qual_eval_walker, |
| 4089 | (void *) context); |
| 4090 | } |
| 4091 | |
| 4092 | /* |
| 4093 | * get_restriction_qual_cost |
| 4094 | * Compute evaluation costs of a baserel's restriction quals, plus any |
| 4095 | * movable join quals that have been pushed down to the scan. |
| 4096 | * Results are returned into *qpqual_cost. |
| 4097 | * |
| 4098 | * This is a convenience subroutine that works for seqscans and other cases |
| 4099 | * where all the given quals will be evaluated the hard way. It's not useful |
| 4100 | * for cost_index(), for example, where the index machinery takes care of |
| 4101 | * some of the quals. We assume baserestrictcost was previously set by |
| 4102 | * set_baserel_size_estimates(). |
| 4103 | */ |
| 4104 | static void |
| 4105 | get_restriction_qual_cost(PlannerInfo *root, RelOptInfo *baserel, |
| 4106 | ParamPathInfo *param_info, |
| 4107 | QualCost *qpqual_cost) |
| 4108 | { |
| 4109 | if (param_info) |
| 4110 | { |
| 4111 | /* Include costs of pushed-down clauses */ |
| 4112 | cost_qual_eval(qpqual_cost, param_info->ppi_clauses, root); |
| 4113 | |
| 4114 | qpqual_cost->startup += baserel->baserestrictcost.startup; |
| 4115 | qpqual_cost->per_tuple += baserel->baserestrictcost.per_tuple; |
| 4116 | } |
| 4117 | else |
| 4118 | *qpqual_cost = baserel->baserestrictcost; |
| 4119 | } |
| 4120 | |
| 4121 | |
| 4122 | /* |
| 4123 | * compute_semi_anti_join_factors |
| 4124 | * Estimate how much of the inner input a SEMI, ANTI, or inner_unique join |
| 4125 | * can be expected to scan. |
| 4126 | * |
| 4127 | * In a hash or nestloop SEMI/ANTI join, the executor will stop scanning |
| 4128 | * inner rows as soon as it finds a match to the current outer row. |
| 4129 | * The same happens if we have detected the inner rel is unique. |
| 4130 | * We should therefore adjust some of the cost components for this effect. |
| 4131 | * This function computes some estimates needed for these adjustments. |
| 4132 | * These estimates will be the same regardless of the particular paths used |
| 4133 | * for the outer and inner relation, so we compute these once and then pass |
| 4134 | * them to all the join cost estimation functions. |
| 4135 | * |
| 4136 | * Input parameters: |
| 4137 | * joinrel: join relation under consideration |
| 4138 | * outerrel: outer relation under consideration |
| 4139 | * innerrel: inner relation under consideration |
| 4140 | * jointype: if not JOIN_SEMI or JOIN_ANTI, we assume it's inner_unique |
| 4141 | * sjinfo: SpecialJoinInfo relevant to this join |
| 4142 | * restrictlist: join quals |
| 4143 | * Output parameters: |
| 4144 | * *semifactors is filled in (see pathnodes.h for field definitions) |
| 4145 | */ |
| 4146 | void |
| 4147 | compute_semi_anti_join_factors(PlannerInfo *root, |
| 4148 | RelOptInfo *joinrel, |
| 4149 | RelOptInfo *outerrel, |
| 4150 | RelOptInfo *innerrel, |
| 4151 | JoinType jointype, |
| 4152 | SpecialJoinInfo *sjinfo, |
| 4153 | List *restrictlist, |
| 4154 | SemiAntiJoinFactors *semifactors) |
| 4155 | { |
| 4156 | Selectivity jselec; |
| 4157 | Selectivity nselec; |
| 4158 | Selectivity avgmatch; |
| 4159 | SpecialJoinInfo norm_sjinfo; |
| 4160 | List *joinquals; |
| 4161 | ListCell *l; |
| 4162 | |
| 4163 | /* |
| 4164 | * In an ANTI join, we must ignore clauses that are "pushed down", since |
| 4165 | * those won't affect the match logic. In a SEMI join, we do not |
| 4166 | * distinguish joinquals from "pushed down" quals, so just use the whole |
| 4167 | * restrictinfo list. For other outer join types, we should consider only |
| 4168 | * non-pushed-down quals, so that this devolves to an IS_OUTER_JOIN check. |
| 4169 | */ |
| 4170 | if (IS_OUTER_JOIN(jointype)) |
| 4171 | { |
| 4172 | joinquals = NIL; |
| 4173 | foreach(l, restrictlist) |
| 4174 | { |
| 4175 | RestrictInfo *rinfo = lfirst_node(RestrictInfo, l); |
| 4176 | |
| 4177 | if (!RINFO_IS_PUSHED_DOWN(rinfo, joinrel->relids)) |
| 4178 | joinquals = lappend(joinquals, rinfo); |
| 4179 | } |
| 4180 | } |
| 4181 | else |
| 4182 | joinquals = restrictlist; |
| 4183 | |
| 4184 | /* |
| 4185 | * Get the JOIN_SEMI or JOIN_ANTI selectivity of the join clauses. |
| 4186 | */ |
| 4187 | jselec = clauselist_selectivity(root, |
| 4188 | joinquals, |
| 4189 | 0, |
| 4190 | (jointype == JOIN_ANTI) ? JOIN_ANTI : JOIN_SEMI, |
| 4191 | sjinfo); |
| 4192 | |
| 4193 | /* |
| 4194 | * Also get the normal inner-join selectivity of the join clauses. |
| 4195 | */ |
| 4196 | norm_sjinfo.type = T_SpecialJoinInfo; |
| 4197 | norm_sjinfo.min_lefthand = outerrel->relids; |
| 4198 | norm_sjinfo.min_righthand = innerrel->relids; |
| 4199 | norm_sjinfo.syn_lefthand = outerrel->relids; |
| 4200 | norm_sjinfo.syn_righthand = innerrel->relids; |
| 4201 | norm_sjinfo.jointype = JOIN_INNER; |
| 4202 | /* we don't bother trying to make the remaining fields valid */ |
| 4203 | norm_sjinfo.lhs_strict = false; |
| 4204 | norm_sjinfo.delay_upper_joins = false; |
| 4205 | norm_sjinfo.semi_can_btree = false; |
| 4206 | norm_sjinfo.semi_can_hash = false; |
| 4207 | norm_sjinfo.semi_operators = NIL; |
| 4208 | norm_sjinfo.semi_rhs_exprs = NIL; |
| 4209 | |
| 4210 | nselec = clauselist_selectivity(root, |
| 4211 | joinquals, |
| 4212 | 0, |
| 4213 | JOIN_INNER, |
| 4214 | &norm_sjinfo); |
| 4215 | |
| 4216 | /* Avoid leaking a lot of ListCells */ |
| 4217 | if (IS_OUTER_JOIN(jointype)) |
| 4218 | list_free(joinquals); |
| 4219 | |
| 4220 | /* |
| 4221 | * jselec can be interpreted as the fraction of outer-rel rows that have |
| 4222 | * any matches (this is true for both SEMI and ANTI cases). And nselec is |
| 4223 | * the fraction of the Cartesian product that matches. So, the average |
| 4224 | * number of matches for each outer-rel row that has at least one match is |
| 4225 | * nselec * inner_rows / jselec. |
| 4226 | * |
| 4227 | * Note: it is correct to use the inner rel's "rows" count here, even |
| 4228 | * though we might later be considering a parameterized inner path with |
| 4229 | * fewer rows. This is because we have included all the join clauses in |
| 4230 | * the selectivity estimate. |
| 4231 | */ |
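| | /* |
| | * Illustrative example (hypothetical numbers, not taken from the code |
| | * above): with nselec = 0.01, inner_rows = 100 and jselec = 0.2, the |
| | * formula gives avgmatch = 0.01 * 100 / 0.2 = 5, i.e. each outer row |
| | * that has any match is expected to find about five inner matches |
| | * before the executor can stop scanning. |
| | */ |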
| 4232 | if (jselec > 0) /* protect against zero divide */ |
| 4233 | { |
| 4234 | avgmatch = nselec * innerrel->rows / jselec; |
| 4235 | /* Clamp to sane range */ |
| 4236 | avgmatch = Max(1.0, avgmatch); |
| 4237 | } |
| 4238 | else |
| 4239 | avgmatch = 1.0; |
| 4240 | |
| 4241 | semifactors->outer_match_frac = jselec; |
| 4242 | semifactors->match_count = avgmatch; |
| 4243 | } |
| 4244 | |
| 4245 | /* |
| 4246 | * has_indexed_join_quals |
| 4247 | * Check whether all the joinquals of a nestloop join are used as |
| 4248 | * inner index quals. |
| 4249 | * |
| 4250 | * If the inner path of a SEMI/ANTI join is an indexscan (including bitmap |
| 4251 | * indexscan) that uses all the joinquals as indexquals, we can assume that an |
| 4252 | * unmatched outer tuple is cheap to process, whereas otherwise it's probably |
| 4253 | * expensive. |
| 4254 | */ |
| 4255 | static bool |
| 4256 | has_indexed_join_quals(NestPath *joinpath) |
| 4257 | { |
| 4258 | Relids joinrelids = joinpath->path.parent->relids; |
| 4259 | Path *innerpath = joinpath->innerjoinpath; |
| 4260 | List *indexclauses; |
| 4261 | bool found_one; |
| 4262 | ListCell *lc; |
| 4263 | |
| 4264 | /* If join still has quals to evaluate, it's not fast */ |
| 4265 | if (joinpath->joinrestrictinfo != NIL) |
| 4266 | return false; |
| 4267 | /* Nor if the inner path isn't parameterized at all */ |
| 4268 | if (innerpath->param_info == NULL) |
| 4269 | return false; |
| 4270 | |
| 4271 | /* Find the indexclauses list for the inner scan */ |
| 4272 | switch (innerpath->pathtype) |
| 4273 | { |
| 4274 | case T_IndexScan: |
| 4275 | case T_IndexOnlyScan: |
| 4276 | indexclauses = ((IndexPath *) innerpath)->indexclauses; |
| 4277 | break; |
| 4278 | case T_BitmapHeapScan: |
| 4279 | { |
| 4280 | /* Accept only a simple bitmap scan, not AND/OR cases */ |
| 4281 | Path *bmqual = ((BitmapHeapPath *) innerpath)->bitmapqual; |
| 4282 | |
| 4283 | if (IsA(bmqual, IndexPath)) |
| 4284 | indexclauses = ((IndexPath *) bmqual)->indexclauses; |
| 4285 | else |
| 4286 | return false; |
| 4287 | break; |
| 4288 | } |
| 4289 | default: |
| 4290 | |
| 4291 | /* |
| 4292 | * If it's not a simple indexscan, it probably doesn't run quickly |
| 4293 | * for zero rows out, even if it's a parameterized path using all |
| 4294 | * the joinquals. |
| 4295 | */ |
| 4296 | return false; |
| 4297 | } |
| 4298 | |
| 4299 | /* |
| 4300 | * Examine the inner path's param clauses. Any that are from the outer |
| 4301 | * path must be found in the indexclauses list, either exactly or in an |
| 4302 | * equivalent form generated by equivclass.c. Also, we must find at least |
| 4303 | * one such clause, else it's a clauseless join which isn't fast. |
| 4304 | */ |
| 4305 | found_one = false; |
| 4306 | foreach(lc, innerpath->param_info->ppi_clauses) |
| 4307 | { |
| 4308 | RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc); |
| 4309 | |
| 4310 | if (join_clause_is_movable_into(rinfo, |
| 4311 | innerpath->parent->relids, |
| 4312 | joinrelids)) |
| 4313 | { |
| 4314 | if (!is_redundant_with_indexclauses(rinfo, indexclauses)) |
| 4315 | return false; |
| 4316 | found_one = true; |
| 4317 | } |
| 4318 | } |
| 4319 | return found_one; |
| 4320 | } |
| 4321 | |
| 4322 | |
| 4323 | /* |
| 4324 | * approx_tuple_count |
| 4325 | * Quick-and-dirty estimation of the number of join rows passing |
| 4326 | * a set of qual conditions. |
| 4327 | * |
| 4328 | * The quals can be either an implicitly-ANDed list of boolean expressions, |
| 4329 | * or a list of RestrictInfo nodes (typically the latter). |
| 4330 | * |
| 4331 | * We intentionally compute the selectivity under JOIN_INNER rules, even |
| 4332 | * if it's some type of outer join. This is appropriate because we are |
| 4333 | * trying to figure out how many tuples pass the initial merge or hash |
| 4334 | * join step. |
| 4335 | * |
| 4336 | * This is quick-and-dirty because we bypass clauselist_selectivity, and |
| 4337 | * simply multiply the independent clause selectivities together. Now |
| 4338 | * clauselist_selectivity often can't do any better than that anyhow, but |
| 4339 | * for some situations (such as range constraints) it is smarter. However, |
| 4340 | * we can't effectively cache the results of clauselist_selectivity, whereas |
| 4341 | * the individual clause selectivities can be and are cached. |
| 4342 | * |
| 4343 | * Since we are only using the results to estimate how many potential |
| 4344 | * output tuples are generated and passed through qpqual checking, it |
| 4345 | * seems OK to live with the approximation. |
| 4346 | */ |
| 4347 | static double |
| 4348 | approx_tuple_count(PlannerInfo *root, JoinPath *path, List *quals) |
| 4349 | { |
| 4350 | double tuples; |
| 4351 | double outer_tuples = path->outerjoinpath->rows; |
| 4352 | double inner_tuples = path->innerjoinpath->rows; |
| 4353 | SpecialJoinInfo sjinfo; |
| 4354 | Selectivity selec = 1.0; |
| 4355 | ListCell *l; |
| 4356 | |
| 4357 | /* |
| 4358 | * Make up a SpecialJoinInfo for JOIN_INNER semantics. |
| 4359 | */ |
| 4360 | sjinfo.type = T_SpecialJoinInfo; |
| 4361 | sjinfo.min_lefthand = path->outerjoinpath->parent->relids; |
| 4362 | sjinfo.min_righthand = path->innerjoinpath->parent->relids; |
| 4363 | sjinfo.syn_lefthand = path->outerjoinpath->parent->relids; |
| 4364 | sjinfo.syn_righthand = path->innerjoinpath->parent->relids; |
| 4365 | sjinfo.jointype = JOIN_INNER; |
| 4366 | /* we don't bother trying to make the remaining fields valid */ |
| 4367 | sjinfo.lhs_strict = false; |
| 4368 | sjinfo.delay_upper_joins = false; |
| 4369 | sjinfo.semi_can_btree = false; |
| 4370 | sjinfo.semi_can_hash = false; |
| 4371 | sjinfo.semi_operators = NIL; |
| 4372 | sjinfo.semi_rhs_exprs = NIL; |
| 4373 | |
| 4374 | /* Get the approximate selectivity */ |
| 4375 | foreach(l, quals) |
| 4376 | { |
| 4377 | Node *qual = (Node *) lfirst(l); |
| 4378 | |
| 4379 | /* Note that clause_selectivity will be able to cache its result */ |
| 4380 | selec *= clause_selectivity(root, qual, 0, JOIN_INNER, &sjinfo); |
| 4381 | } |
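| | /* |
| | * For illustration (hypothetical selectivities): two cached clause |
| | * selectivities of 0.1 and 0.5 combine to selec = 0.05 here, so with |
| | * outer_tuples = 1000 and inner_tuples = 2000 the estimate below is |
| | * 0.05 * 1000 * 2000 = 100000 rows. |
| | */ |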
| 4382 | |
| 4383 | /* Apply it to the input relation sizes */ |
| 4384 | tuples = selec * outer_tuples * inner_tuples; |
| 4385 | |
| 4386 | return clamp_row_est(tuples); |
| 4387 | } |
| 4388 | |
| 4389 | |
| 4390 | /* |
| 4391 | * set_baserel_size_estimates |
| 4392 | * Set the size estimates for the given base relation. |
| 4393 | * |
| 4394 | * The rel's targetlist and restrictinfo list must have been constructed |
| 4395 | * already, and rel->tuples must be set. |
| 4396 | * |
| 4397 | * We set the following fields of the rel node: |
| 4398 | * rows: the estimated number of output tuples (after applying |
| 4399 | * restriction clauses). |
| 4400 | * width: the estimated average output tuple width in bytes. |
| 4401 | * baserestrictcost: estimated cost of evaluating baserestrictinfo clauses. |
| 4402 | */ |
| 4403 | void |
| 4404 | set_baserel_size_estimates(PlannerInfo *root, RelOptInfo *rel) |
| 4405 | { |
| 4406 | double nrows; |
| 4407 | |
| 4408 | /* Should only be applied to base relations */ |
| 4409 | Assert(rel->relid > 0); |
| 4410 | |
| 4411 | nrows = rel->tuples * |
| 4412 | clauselist_selectivity(root, |
| 4413 | rel->baserestrictinfo, |
| 4414 | 0, |
| 4415 | JOIN_INNER, |
| 4416 | NULL); |
| 4417 | |
| 4418 | rel->rows = clamp_row_est(nrows); |
| 4419 | |
| 4420 | cost_qual_eval(&rel->baserestrictcost, rel->baserestrictinfo, root); |
| 4421 | |
| 4422 | set_rel_width(root, rel); |
| 4423 | } |
| 4424 | |
| 4425 | /* |
| 4426 | * get_parameterized_baserel_size |
| 4427 | * Make a size estimate for a parameterized scan of a base relation. |
| 4428 | * |
| 4429 | * 'param_clauses' lists the additional join clauses to be used. |
| 4430 | * |
| 4431 | * set_baserel_size_estimates must have been applied already. |
| 4432 | */ |
| 4433 | double |
| 4434 | get_parameterized_baserel_size(PlannerInfo *root, RelOptInfo *rel, |
| 4435 | List *param_clauses) |
| 4436 | { |
| 4437 | List *allclauses; |
| 4438 | double nrows; |
| 4439 | |
| 4440 | /* |
| 4441 | * Estimate the number of rows returned by the parameterized scan, knowing |
| 4442 | * that it will apply all the extra join clauses as well as the rel's own |
| 4443 | * restriction clauses. Note that we force the clauses to be treated as |
| 4444 | * non-join clauses during selectivity estimation. |
| 4445 | */ |
| 4446 | allclauses = list_concat(list_copy(param_clauses), |
| 4447 | rel->baserestrictinfo); |
| 4448 | nrows = rel->tuples * |
| 4449 | clauselist_selectivity(root, |
| 4450 | allclauses, |
| 4451 | rel->relid, /* do not use 0! */ |
| 4452 | JOIN_INNER, |
| 4453 | NULL); |
| 4454 | nrows = clamp_row_est(nrows); |
| 4455 | /* For safety, make sure result is not more than the base estimate */ |
| 4456 | if (nrows > rel->rows) |
| 4457 | nrows = rel->rows; |
| 4458 | return nrows; |
| 4459 | } |
| 4460 | |
| 4461 | /* |
| 4462 | * set_joinrel_size_estimates |
| 4463 | * Set the size estimates for the given join relation. |
| 4464 | * |
| 4465 | * The rel's targetlist must have been constructed already, and a |
| 4466 | * restriction clause list that matches the given component rels must |
| 4467 | * be provided. |
| 4468 | * |
| 4469 | * Since there is more than one way to make a joinrel for more than two |
| 4470 | * base relations, the results we get here could depend on which component |
| 4471 | * rel pair is provided. In theory we should get the same answers no matter |
| 4472 | * which pair is provided; in practice, since the selectivity estimation |
| 4473 | * routines don't handle all cases equally well, we might not. But there's |
| 4474 | * not much to be done about it. (Would it make sense to repeat the |
| 4475 | * calculations for each pair of input rels that's encountered, and somehow |
| 4476 | * average the results? Probably way more trouble than it's worth, and |
| 4477 | * anyway we must keep the rowcount estimate the same for all paths for the |
| 4478 | * joinrel.) |
| 4479 | * |
| 4480 | * We set only the rows field here. The reltarget field was already set by |
| 4481 | * build_joinrel_tlist, and baserestrictcost is not used for join rels. |
| 4482 | */ |
| 4483 | void |
| 4484 | set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel, |
| 4485 | RelOptInfo *outer_rel, |
| 4486 | RelOptInfo *inner_rel, |
| 4487 | SpecialJoinInfo *sjinfo, |
| 4488 | List *restrictlist) |
| 4489 | { |
| 4490 | rel->rows = calc_joinrel_size_estimate(root, |
| 4491 | rel, |
| 4492 | outer_rel, |
| 4493 | inner_rel, |
| 4494 | outer_rel->rows, |
| 4495 | inner_rel->rows, |
| 4496 | sjinfo, |
| 4497 | restrictlist); |
| 4498 | } |
| 4499 | |
| 4500 | /* |
| 4501 | * get_parameterized_joinrel_size |
| 4502 | * Make a size estimate for a parameterized scan of a join relation. |
| 4503 | * |
| 4504 | * 'rel' is the joinrel under consideration. |
| 4505 | * 'outer_path', 'inner_path' are (probably also parameterized) Paths that |
| 4506 | * produce the relations being joined. |
| 4507 | * 'sjinfo' is any SpecialJoinInfo relevant to this join. |
| 4508 | * 'restrict_clauses' lists the join clauses that need to be applied at the |
| 4509 | * join node (including any movable clauses that were moved down to this join, |
| 4510 | * and not including any movable clauses that were pushed down into the |
| 4511 | * child paths). |
| 4512 | * |
| 4513 | * set_joinrel_size_estimates must have been applied already. |
| 4514 | */ |
| 4515 | double |
| 4516 | get_parameterized_joinrel_size(PlannerInfo *root, RelOptInfo *rel, |
| 4517 | Path *outer_path, |
| 4518 | Path *inner_path, |
| 4519 | SpecialJoinInfo *sjinfo, |
| 4520 | List *restrict_clauses) |
| 4521 | { |
| 4522 | double nrows; |
| 4523 | |
| 4524 | /* |
| 4525 | * Estimate the number of rows returned by the parameterized join as the |
| 4526 | * sizes of the input paths times the selectivity of the clauses that have |
| 4527 | * ended up at this join node. |
| 4528 | * |
| 4529 | * As with set_joinrel_size_estimates, the rowcount estimate could depend |
| 4530 | * on the pair of input paths provided, though ideally we'd get the same |
| 4531 | * estimate for any pair with the same parameterization. |
| 4532 | */ |
| 4533 | nrows = calc_joinrel_size_estimate(root, |
| 4534 | rel, |
| 4535 | outer_path->parent, |
| 4536 | inner_path->parent, |
| 4537 | outer_path->rows, |
| 4538 | inner_path->rows, |
| 4539 | sjinfo, |
| 4540 | restrict_clauses); |
| 4541 | /* For safety, make sure result is not more than the base estimate */ |
| 4542 | if (nrows > rel->rows) |
| 4543 | nrows = rel->rows; |
| 4544 | return nrows; |
| 4545 | } |
| 4546 | |
| 4547 | /* |
| 4548 | * calc_joinrel_size_estimate |
| 4549 | * Workhorse for set_joinrel_size_estimates and |
| 4550 | * get_parameterized_joinrel_size. |
| 4551 | * |
| 4552 | * outer_rel/inner_rel are the relations being joined, but they should be |
| 4553 | * assumed to have sizes outer_rows/inner_rows; those numbers might be less |
| 4554 | * than what rel->rows says, when we are considering parameterized paths. |
| 4555 | */ |
| 4556 | static double |
| 4557 | calc_joinrel_size_estimate(PlannerInfo *root, |
| 4558 | RelOptInfo *joinrel, |
| 4559 | RelOptInfo *outer_rel, |
| 4560 | RelOptInfo *inner_rel, |
| 4561 | double outer_rows, |
| 4562 | double inner_rows, |
| 4563 | SpecialJoinInfo *sjinfo, |
| 4564 | List *restrictlist_in) |
| 4565 | { |
| 4566 | /* This apparently-useless variable dodges a compiler bug in VS2013: */ |
| 4567 | List *restrictlist = restrictlist_in; |
| 4568 | JoinType jointype = sjinfo->jointype; |
| 4569 | Selectivity fkselec; |
| 4570 | Selectivity jselec; |
| 4571 | Selectivity pselec; |
| 4572 | double nrows; |
| 4573 | |
| 4574 | /* |
| 4575 | * Compute joinclause selectivity. Note that we are only considering |
| 4576 | * clauses that become restriction clauses at this join level; we are not |
| 4577 | * double-counting them because they were not considered in estimating the |
| 4578 | * sizes of the component rels. |
| 4579 | * |
| 4580 | * First, see whether any of the joinclauses can be matched to known FK |
| 4581 | * constraints. If so, drop those clauses from the restrictlist, and |
| 4582 | * instead estimate their selectivity using FK semantics. (We do this |
| 4583 | * without regard to whether said clauses are local or "pushed down". |
| 4584 | * Probably, an FK-matching clause could never be seen as pushed down at |
| 4585 | * an outer join, since it would be strict and hence would be grounds for |
| 4586 | * join strength reduction.) fkselec gets the net selectivity for |
| 4587 | * FK-matching clauses, or 1.0 if there are none. |
| 4588 | */ |
| 4589 | fkselec = get_foreign_key_join_selectivity(root, |
| 4590 | outer_rel->relids, |
| 4591 | inner_rel->relids, |
| 4592 | sjinfo, |
| 4593 | &restrictlist); |
| 4594 | |
| 4595 | /* |
| 4596 | * For an outer join, we have to distinguish the selectivity of the join's |
| 4597 | * own clauses (JOIN/ON conditions) from any clauses that were "pushed |
| 4598 | * down". For inner joins we just count them all as joinclauses. |
| 4599 | */ |
| 4600 | if (IS_OUTER_JOIN(jointype)) |
| 4601 | { |
| 4602 | List *joinquals = NIL; |
| 4603 | List *pushedquals = NIL; |
| 4604 | ListCell *l; |
| 4605 | |
| 4606 | /* Grovel through the clauses to separate into two lists */ |
| 4607 | foreach(l, restrictlist) |
| 4608 | { |
| 4609 | RestrictInfo *rinfo = lfirst_node(RestrictInfo, l); |
| 4610 | |
| 4611 | if (RINFO_IS_PUSHED_DOWN(rinfo, joinrel->relids)) |
| 4612 | pushedquals = lappend(pushedquals, rinfo); |
| 4613 | else |
| 4614 | joinquals = lappend(joinquals, rinfo); |
| 4615 | } |
| 4616 | |
| 4617 | /* Get the separate selectivities */ |
| 4618 | jselec = clauselist_selectivity(root, |
| 4619 | joinquals, |
| 4620 | 0, |
| 4621 | jointype, |
| 4622 | sjinfo); |
| 4623 | pselec = clauselist_selectivity(root, |
| 4624 | pushedquals, |
| 4625 | 0, |
| 4626 | jointype, |
| 4627 | sjinfo); |
| 4628 | |
| 4629 | /* Avoid leaking a lot of ListCells */ |
| 4630 | list_free(joinquals); |
| 4631 | list_free(pushedquals); |
| 4632 | } |
| 4633 | else |
| 4634 | { |
| 4635 | jselec = clauselist_selectivity(root, |
| 4636 | restrictlist, |
| 4637 | 0, |
| 4638 | jointype, |
| 4639 | sjinfo); |
| 4640 | pselec = 0.0; /* not used, keep compiler quiet */ |
| 4641 | } |
| 4642 | |
| 4643 | /* |
| 4644 | * Basically, we multiply size of Cartesian product by selectivity. |
| 4645 | * |
| 4646 | * If we are doing an outer join, take that into account: the joinqual |
| 4647 | * selectivity has to be clamped using the knowledge that the output must |
| 4648 | * be at least as large as the non-nullable input. However, any |
| 4649 | * pushed-down quals are applied after the outer join, so their |
| 4650 | * selectivity applies fully. |
| 4651 | * |
| 4652 | * For JOIN_SEMI and JOIN_ANTI, the selectivity is defined as the fraction |
| 4653 | * of LHS rows that have matches, and we apply that straightforwardly. |
| 4654 | */ |
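| | /* |
| | * Worked example for the JOIN_LEFT branch below (hypothetical numbers): |
| | * with outer_rows = 1000, inner_rows = 100 and fkselec * jselec = 5e-6, |
| | * the raw product is only 0.5 rows, so it is clamped up to the 1000 |
| | * non-nullable outer rows before pselec is applied. |
| | */ |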
| 4655 | switch (jointype) |
| 4656 | { |
| 4657 | case JOIN_INNER: |
| 4658 | nrows = outer_rows * inner_rows * fkselec * jselec; |
| 4659 | /* pselec not used */ |
| 4660 | break; |
| 4661 | case JOIN_LEFT: |
| 4662 | nrows = outer_rows * inner_rows * fkselec * jselec; |
| 4663 | if (nrows < outer_rows) |
| 4664 | nrows = outer_rows; |
| 4665 | nrows *= pselec; |
| 4666 | break; |
| 4667 | case JOIN_FULL: |
| 4668 | nrows = outer_rows * inner_rows * fkselec * jselec; |
| 4669 | if (nrows < outer_rows) |
| 4670 | nrows = outer_rows; |
| 4671 | if (nrows < inner_rows) |
| 4672 | nrows = inner_rows; |
| 4673 | nrows *= pselec; |
| 4674 | break; |
| 4675 | case JOIN_SEMI: |
| 4676 | nrows = outer_rows * fkselec * jselec; |
| 4677 | /* pselec not used */ |
| 4678 | break; |
| 4679 | case JOIN_ANTI: |
| 4680 | nrows = outer_rows * (1.0 - fkselec * jselec); |
| 4681 | nrows *= pselec; |
| 4682 | break; |
| 4683 | default: |
| 4684 | /* other values not expected here */ |
| 4685 | elog(ERROR, "unrecognized join type: %d", (int) jointype); |
| 4686 | nrows = 0; /* keep compiler quiet */ |
| 4687 | break; |
| 4688 | } |
| 4689 | |
| 4690 | return clamp_row_est(nrows); |
| 4691 | } |
| 4692 | |
| 4693 | /* |
| 4694 | * get_foreign_key_join_selectivity |
| 4695 | * Estimate join selectivity for foreign-key-related clauses. |
| 4696 | * |
| 4697 | * Remove any clauses that can be matched to FK constraints from *restrictlist, |
| 4698 | * and return a substitute estimate of their selectivity. 1.0 is returned |
| 4699 | * when there are no such clauses. |
| 4700 | * |
| 4701 | * The reason for treating such clauses specially is that we can get better |
| 4702 | * estimates this way than by relying on clauselist_selectivity(), especially |
| 4703 | * for multi-column FKs where that function's assumption that the clauses are |
| 4704 | * independent falls down badly. But even with single-column FKs, we may be |
| 4705 | * able to get a better answer when the pg_statistic stats are missing or out |
| 4706 | * of date. |
| 4707 | */ |
| 4708 | static Selectivity |
| 4709 | get_foreign_key_join_selectivity(PlannerInfo *root, |
| 4710 | Relids outer_relids, |
| 4711 | Relids inner_relids, |
| 4712 | SpecialJoinInfo *sjinfo, |
| 4713 | List **restrictlist) |
| 4714 | { |
| 4715 | Selectivity fkselec = 1.0; |
| 4716 | JoinType jointype = sjinfo->jointype; |
| 4717 | List *worklist = *restrictlist; |
| 4718 | ListCell *lc; |
| 4719 | |
| 4720 | /* Consider each FK constraint that is known to match the query */ |
| 4721 | foreach(lc, root->fkey_list) |
| 4722 | { |
| 4723 | ForeignKeyOptInfo *fkinfo = (ForeignKeyOptInfo *) lfirst(lc); |
| 4724 | bool ref_is_outer; |
| 4725 | List *removedlist; |
| 4726 | ListCell *cell; |
| 4727 | ListCell *prev; |
| 4728 | ListCell *next; |
| 4729 | |
| 4730 | /* |
| 4731 | * This FK is not relevant unless it connects a baserel on one side of |
| 4732 | * this join to a baserel on the other side. |
| 4733 | */ |
| 4734 | if (bms_is_member(fkinfo->con_relid, outer_relids) && |
| 4735 | bms_is_member(fkinfo->ref_relid, inner_relids)) |
| 4736 | ref_is_outer = false; |
| 4737 | else if (bms_is_member(fkinfo->ref_relid, outer_relids) && |
| 4738 | bms_is_member(fkinfo->con_relid, inner_relids)) |
| 4739 | ref_is_outer = true; |
| 4740 | else |
| 4741 | continue; |
| 4742 | |
| 4743 | /* |
| 4744 | * If we're dealing with a semi/anti join, and the FK's referenced |
| 4745 | * relation is on the outside, then knowledge of the FK doesn't help |
| 4746 | * us figure out what we need to know (which is the fraction of outer |
| 4747 | * rows that have matches). On the other hand, if the referenced rel |
| 4748 | * is on the inside, then all outer rows must have matches in the |
| 4749 | * referenced table (ignoring nulls). But any restriction or join |
| 4750 | * clauses that filter that table will reduce the fraction of matches. |
| 4751 | * We can account for restriction clauses, but it's too hard to guess |
| 4752 | * how many table rows would get through a join that's inside the RHS. |
| 4753 | * Hence, if either case applies, punt and ignore the FK. |
| 4754 | */ |
| 4755 | if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI) && |
| 4756 | (ref_is_outer || bms_membership(inner_relids) != BMS_SINGLETON)) |
| 4757 | continue; |
| 4758 | |
| 4759 | /* |
| 4760 | * Modify the restrictlist by removing clauses that match the FK (and |
| 4761 | * putting them into removedlist instead). It seems unsafe to modify |
| 4762 | * the originally-passed List structure, so we make a shallow copy the |
| 4763 | * first time through. |
| 4764 | */ |
| 4765 | if (worklist == *restrictlist) |
| 4766 | worklist = list_copy(worklist); |
| 4767 | |
| 4768 | removedlist = NIL; |
| 4769 | prev = NULL; |
| 4770 | for (cell = list_head(worklist); cell; cell = next) |
| 4771 | { |
| 4772 | RestrictInfo *rinfo = (RestrictInfo *) lfirst(cell); |
| 4773 | bool remove_it = false; |
| 4774 | int i; |
| 4775 | |
| 4776 | next = lnext(cell); |
| 4777 | /* Drop this clause if it matches any column of the FK */ |
| 4778 | for (i = 0; i < fkinfo->nkeys; i++) |
| 4779 | { |
| 4780 | if (rinfo->parent_ec) |
| 4781 | { |
| 4782 | /* |
| 4783 | * EC-derived clauses can only match by EC. It is okay to |
| 4784 | * consider any clause derived from the same EC as |
| 4785 | * matching the FK: even if equivclass.c chose to generate |
| 4786 | * a clause equating some other pair of Vars, it could |
| 4787 | * have generated one equating the FK's Vars. So for |
| 4788 | * purposes of estimation, we can act as though it did so. |
| 4789 | * |
| 4790 | * Note: checking parent_ec is a bit of a cheat because |
| 4791 | * there are EC-derived clauses that don't have parent_ec |
| 4792 | * set; but such clauses must compare expressions that |
| 4793 | * aren't just Vars, so they cannot match the FK anyway. |
| 4794 | */ |
| 4795 | if (fkinfo->eclass[i] == rinfo->parent_ec) |
| 4796 | { |
| 4797 | remove_it = true; |
| 4798 | break; |
| 4799 | } |
| 4800 | } |
| 4801 | else |
| 4802 | { |
| 4803 | /* |
| 4804 | * Otherwise, see if rinfo was previously matched to FK as |
| 4805 | * a "loose" clause. |
| 4806 | */ |
| 4807 | if (list_member_ptr(fkinfo->rinfos[i], rinfo)) |
| 4808 | { |
| 4809 | remove_it = true; |
| 4810 | break; |
| 4811 | } |
| 4812 | } |
| 4813 | } |
| 4814 | if (remove_it) |
| 4815 | { |
| 4816 | worklist = list_delete_cell(worklist, cell, prev); |
| 4817 | removedlist = lappend(removedlist, rinfo); |
| 4818 | } |
| 4819 | else |
| 4820 | prev = cell; |
| 4821 | } |
| 4822 | |
| 4823 | /* |
| 4824 | * If we failed to remove all the matching clauses we expected to |
| 4825 | * find, chicken out and ignore this FK; applying its selectivity |
| 4826 | * might result in double-counting. Put any clauses we did manage to |
| 4827 | * remove back into the worklist. |
| 4828 | * |
| 4829 | * Since the matching clauses are known not outerjoin-delayed, they |
| 4830 | * should certainly have appeared in the initial joinclause list. If |
| 4831 | * we didn't find them, they must have been matched to, and removed |
| 4832 | * by, some other FK in a previous iteration of this loop. (A likely |
| 4833 | * case is that two FKs are matched to the same EC; there will be only |
| 4834 | * one EC-derived clause in the initial list, so the first FK will |
| 4835 | * consume it.) Applying both FKs' selectivity independently risks |
| 4836 | * underestimating the join size; in particular, this would undo one |
| 4837 | * of the main things that ECs were invented for, namely to avoid |
| 4838 | * double-counting the selectivity of redundant equality conditions. |
| 4839 | * Later we might think of a reasonable way to combine the estimates, |
| 4840 | * but for now, just punt, since this is a fairly uncommon situation. |
| 4841 | */ |
| 4842 | if (list_length(removedlist) != |
| 4843 | (fkinfo->nmatched_ec + fkinfo->nmatched_ri)) |
| 4844 | { |
| 4845 | worklist = list_concat(worklist, removedlist); |
| 4846 | continue; |
| 4847 | } |
| 4848 | |
| 4849 | /* |
| 4850 | * Finally we get to the payoff: estimate selectivity using the |
| 4851 | * knowledge that each referencing row will match exactly one row in |
| 4852 | * the referenced table. |
| 4853 | * |
| 4854 | * XXX that's not true in the presence of nulls in the referencing |
| 4855 | * column(s), so in principle we should derate the estimate for those. |
| 4856 | * However (1) if there are any strict restriction clauses for the |
| 4857 | * referencing column(s) elsewhere in the query, derating here would |
| 4858 | * be double-counting the null fraction, and (2) it's not very clear |
| 4859 | * how to combine null fractions for multiple referencing columns. So |
| 4860 | * we do nothing for now about correcting for nulls. |
| 4861 | * |
| 4862 | * XXX another point here is that if either side of an FK constraint |
| 4863 | * is an inheritance parent, we estimate as though the constraint |
| 4864 | * covers all its children as well. This is not an unreasonable |
| 4865 | * assumption for a referencing table, ie the user probably applied |
| 4866 | * identical constraints to all child tables (though perhaps we ought |
| 4867 | * to check that). But it's not possible to have done that for a |
| 4868 | * referenced table. Fortunately, precisely because that doesn't |
| 4869 | * work, it is uncommon in practice to have an FK referencing a parent |
| 4870 | * table. So, at least for now, disregard inheritance here. |
| 4871 | */ |
| 4872 | if (jointype == JOIN_SEMI || jointype == JOIN_ANTI) |
| 4873 | { |
| 4874 | /* |
| 4875 | * For JOIN_SEMI and JOIN_ANTI, we only get here when the FK's |
| 4876 | * referenced table is exactly the inside of the join. The join |
| 4877 | * selectivity is defined as the fraction of LHS rows that have |
| 4878 | * matches. The FK implies that every LHS row has a match *in the |
| 4879 | * referenced table*; but any restriction clauses on it will |
| 4880 | * reduce the number of matches. Hence we take the join |
| 4881 | * selectivity as equal to the selectivity of the table's |
| 4882 | * restriction clauses, which is rows / tuples; but we must guard |
| 4883 | * against tuples == 0. |
| 4884 | */ |
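| | /* |
| | * Hypothetical example: if the referenced rel has tuples = 1000 and its |
| | * restriction clauses leave rows = 200, the factor applied below is |
| | * 200 / 1000 = 0.2, i.e. about 20% of outer rows are expected to still |
| | * have a match after the referenced table is filtered. |
| | */ |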
| 4885 | RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid); |
| 4886 | double ref_tuples = Max(ref_rel->tuples, 1.0); |
| 4887 | |
| 4888 | fkselec *= ref_rel->rows / ref_tuples; |
| 4889 | } |
| 4890 | else |
| 4891 | { |
| 4892 | /* |
| 4893 | * Otherwise, selectivity is exactly 1/referenced-table-size; but |
| 4894 | * guard against tuples == 0. Note we should use the raw table |
| 4895 | * tuple count, not any estimate of its filtered or joined size. |
| 4896 | */ |
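| | /* |
| | * Hypothetical example: for a two-column FK referencing a table with |
| | * 1000000 tuples, the pair of matched clauses contributes a single |
| | * factor of 1/1000000 here, whereas multiplying two independently |
| | * estimated per-column selectivities could undershoot badly. |
| | */ |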
| 4897 | RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid); |
| 4898 | double ref_tuples = Max(ref_rel->tuples, 1.0); |
| 4899 | |
| 4900 | fkselec *= 1.0 / ref_tuples; |
| 4901 | } |
| 4902 | } |
| 4903 | |
| 4904 | *restrictlist = worklist; |
| 4905 | return fkselec; |
| 4906 | } |
| 4907 | |
| 4908 | /* |
| 4909 | * set_subquery_size_estimates |
| 4910 | * Set the size estimates for a base relation that is a subquery. |
| 4911 | * |
| 4912 | * The rel's targetlist and restrictinfo list must have been constructed |
| 4913 | * already, and the Paths for the subquery must have been completed. |
| 4914 | * We look at the subquery's PlannerInfo to extract data. |
| 4915 | * |
| 4916 | * We set the same fields as set_baserel_size_estimates. |
| 4917 | */ |
| 4918 | void |
| 4919 | set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel) |
| 4920 | { |
| 4921 | PlannerInfo *subroot = rel->subroot; |
| 4922 | RelOptInfo *sub_final_rel; |
| 4923 | ListCell *lc; |
| 4924 | |
| 4925 | /* Should only be applied to base relations that are subqueries */ |
| 4926 | Assert(rel->relid > 0); |
| 4927 | Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_SUBQUERY); |
| 4928 | |
| 4929 | /* |
| 4930 | * Copy raw number of output rows from subquery. All of its paths should |
| 4931 | * have the same output rowcount, so just look at cheapest-total. |
| 4932 | */ |
| 4933 | sub_final_rel = fetch_upper_rel(subroot, UPPERREL_FINAL, NULL); |
| 4934 | rel->tuples = sub_final_rel->cheapest_total_path->rows; |
| 4935 | |
| 4936 | /* |
| 4937 | * Compute per-output-column width estimates by examining the subquery's |
| 4938 | * targetlist. For any output that is a plain Var, get the width estimate |
| 4939 | * that was made while planning the subquery. Otherwise, we leave it to |
| 4940 | * set_rel_width to fill in a datatype-based default estimate. |
| 4941 | */ |
| 4942 | foreach(lc, subroot->parse->targetList) |
| 4943 | { |
| 4944 | TargetEntry *te = lfirst_node(TargetEntry, lc); |
| 4945 | Node *texpr = (Node *) te->expr; |
| 4946 | int32 item_width = 0; |
| 4947 | |
| 4948 | /* junk columns aren't visible to upper query */ |
| 4949 | if (te->resjunk) |
| 4950 | continue; |
| 4951 | |
| 4952 | /* |
| 4953 | * The subquery could be an expansion of a view that's had columns |
| 4954 | * added to it since the current query was parsed, so that there are |
| 4955 | * non-junk tlist columns in it that don't correspond to any column |
| 4956 | * visible at our query level. Ignore such columns. |
| 4957 | */ |
| 4958 | if (te->resno < rel->min_attr || te->resno > rel->max_attr) |
| 4959 | continue; |
| 4960 | |
| 4961 | /* |
| 4962 | * XXX This currently doesn't work for subqueries containing set |
| 4963 | * operations, because the Vars in their tlists are bogus references |
| 4964 | * to the first leaf subquery, which wouldn't give the right answer |
| 4965 | * even if we could still get to its PlannerInfo. |
| 4966 | * |
| 4967 | * Also, the subquery could be an appendrel for which all branches are |
| 4968 | * known empty due to constraint exclusion, in which case |
| 4969 | * set_append_rel_pathlist will have left the attr_widths set to zero. |
| 4970 | * |
| 4971 | * In either case, we just leave the width estimate zero until |
| 4972 | * set_rel_width fixes it. |
| 4973 | */ |
| 4974 | if (IsA(texpr, Var) && |
| 4975 | subroot->parse->setOperations == NULL) |
| 4976 | { |
| 4977 | Var *var = (Var *) texpr; |
| 4978 | RelOptInfo *subrel = find_base_rel(subroot, var->varno); |
| 4979 | |
| 4980 | item_width = subrel->attr_widths[var->varattno - subrel->min_attr]; |
| 4981 | } |
| 4982 | rel->attr_widths[te->resno - rel->min_attr] = item_width; |
| 4983 | } |
| 4984 | |
| 4985 | /* Now estimate number of output rows, etc */ |
| 4986 | set_baserel_size_estimates(root, rel); |
| 4987 | } |
| 4988 | |
| 4989 | /* |
| 4990 | * set_function_size_estimates |
| 4991 | * Set the size estimates for a base relation that is a function call. |
| 4992 | * |
| 4993 | * The rel's targetlist and restrictinfo list must have been constructed |
| 4994 | * already. |
| 4995 | * |
| 4996 | * We set the same fields as set_baserel_size_estimates. |
| 4997 | */ |
| 4998 | void |
| 4999 | set_function_size_estimates(PlannerInfo *root, RelOptInfo *rel) |
| 5000 | { |
| 5001 | RangeTblEntry *rte; |
| 5002 | ListCell *lc; |
| 5003 | |
| 5004 | /* Should only be applied to base relations that are functions */ |
| 5005 | Assert(rel->relid > 0); |
| 5006 | rte = planner_rt_fetch(rel->relid, root); |
| 5007 | Assert(rte->rtekind == RTE_FUNCTION); |
| 5008 | |
| 5009 | /* |
| 5010 | * Estimate number of rows the functions will return. The rowcount of the |
| 5011 | * node is that of the largest function result. |
| 5012 | */ |
| 5013 | rel->tuples = 0; |
| 5014 | foreach(lc, rte->functions) |
| 5015 | { |
| 5016 | RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc); |
| 5017 | double ntup = expression_returns_set_rows(root, rtfunc->funcexpr); |
| 5018 | |
| 5019 | if (ntup > rel->tuples) |
| 5020 | rel->tuples = ntup; |
| 5021 | } |
| 5022 | |
| 5023 | /* Now estimate number of output rows, etc */ |
| 5024 | set_baserel_size_estimates(root, rel); |
| 5025 | } |
| 5026 | |
| 5027 | /* |
| 5028 | * set_tablefunc_size_estimates |
| 5029 | * Set the size estimates for a base relation that is a table function. |
| 5030 | * |
| 5031 | * The rel's targetlist and restrictinfo list must have been constructed |
| 5032 | * already. |
| 5033 | * |
| 5034 | * We set the same fields as set_baserel_size_estimates. |
| 5035 | */ |
| 5036 | void |
| 5037 | set_tablefunc_size_estimates(PlannerInfo *root, RelOptInfo *rel) |
| 5038 | { |
| 5039 | /* Should only be applied to base relations that are functions */ |
| 5040 | Assert(rel->relid > 0); |
| 5041 | Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_TABLEFUNC); |
| 5042 | |
| 5043 | rel->tuples = 100; |
| 5044 | |
| 5045 | /* Now estimate number of output rows, etc */ |
| 5046 | set_baserel_size_estimates(root, rel); |
| 5047 | } |
| 5048 | |
| 5049 | /* |
| 5050 | * set_values_size_estimates |
| 5051 | * Set the size estimates for a base relation that is a values list. |
| 5052 | * |
| 5053 | * The rel's targetlist and restrictinfo list must have been constructed |
| 5054 | * already. |
| 5055 | * |
| 5056 | * We set the same fields as set_baserel_size_estimates. |
| 5057 | */ |
| 5058 | void |
| 5059 | set_values_size_estimates(PlannerInfo *root, RelOptInfo *rel) |
| 5060 | { |
| 5061 | RangeTblEntry *rte; |
| 5062 | |
| 5063 | /* Should only be applied to base relations that are values lists */ |
| 5064 | Assert(rel->relid > 0); |
| 5065 | rte = planner_rt_fetch(rel->relid, root); |
| 5066 | Assert(rte->rtekind == RTE_VALUES); |
| 5067 | |
| 5068 | /* |
| 5069 | * Estimate number of rows the values list will return. We know this |
| 5070 | * precisely based on the list length (well, barring set-returning |
| 5071 | * functions in list items, but that's a refinement not catered for |
| 5072 | * anywhere else either). |
| 5073 | */ |
| 5074 | rel->tuples = list_length(rte->values_lists); |
| 5075 | |
| 5076 | /* Now estimate number of output rows, etc */ |
| 5077 | set_baserel_size_estimates(root, rel); |
| 5078 | } |
| 5079 | |
| 5080 | /* |
| 5081 | * set_cte_size_estimates |
| 5082 | * Set the size estimates for a base relation that is a CTE reference. |
| 5083 | * |
| 5084 | * The rel's targetlist and restrictinfo list must have been constructed |
| 5085 | * already, and we need an estimate of the number of rows returned by the CTE |
| 5086 | * (if a regular CTE) or the non-recursive term (if a self-reference). |
| 5087 | * |
| 5088 | * We set the same fields as set_baserel_size_estimates. |
| 5089 | */ |
| 5090 | void |
| 5091 | set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel, double cte_rows) |
| 5092 | { |
| 5093 | RangeTblEntry *rte; |
| 5094 | |
| 5095 | /* Should only be applied to base relations that are CTE references */ |
| 5096 | Assert(rel->relid > 0); |
| 5097 | rte = planner_rt_fetch(rel->relid, root); |
| 5098 | Assert(rte->rtekind == RTE_CTE); |
| 5099 | |
| 5100 | if (rte->self_reference) |
| 5101 | { |
| 5102 | /* |
| 5103 | * In a self-reference, arbitrarily assume the average worktable size |
| 5104 | * is about 10 times the nonrecursive term's size. |
| 5105 | */ |
| 5106 | rel->tuples = 10 * cte_rows; |
| 5107 | } |
| 5108 | else |
| 5109 | { |
| 5110 | /* Otherwise just believe the CTE's rowcount estimate */ |
| 5111 | rel->tuples = cte_rows; |
| 5112 | } |
| 5113 | |
| 5114 | /* Now estimate number of output rows, etc */ |
| 5115 | set_baserel_size_estimates(root, rel); |
| 5116 | } |
| 5117 | |
| 5118 | /* |
| 5119 | * set_namedtuplestore_size_estimates |
| 5120 | * Set the size estimates for a base relation that is a tuplestore reference. |
| 5121 | * |
| 5122 | * The rel's targetlist and restrictinfo list must have been constructed |
| 5123 | * already. |
| 5124 | * |
| 5125 | * We set the same fields as set_baserel_size_estimates. |
| 5126 | */ |
| 5127 | void |
| 5128 | set_namedtuplestore_size_estimates(PlannerInfo *root, RelOptInfo *rel) |
| 5129 | { |
| 5130 | RangeTblEntry *rte; |
| 5131 | |
| 5132 | /* Should only be applied to base relations that are tuplestore references */ |
| 5133 | Assert(rel->relid > 0); |
| 5134 | rte = planner_rt_fetch(rel->relid, root); |
| 5135 | Assert(rte->rtekind == RTE_NAMEDTUPLESTORE); |
| 5136 | |
| 5137 | /* |
| 5138 | * Use the estimate provided by the code which is generating the named |
| 5139 | * tuplestore. In some cases, the actual number might be available; in |
| 5140 | * others the same plan will be re-used, so a "typical" value might be |
| 5141 | * estimated and used. |
| 5142 | */ |
| 5143 | rel->tuples = rte->enrtuples; |
| 5144 | if (rel->tuples < 0) |
| 5145 | rel->tuples = 1000; |
| 5146 | |
| 5147 | /* Now estimate number of output rows, etc */ |
| 5148 | set_baserel_size_estimates(root, rel); |
| 5149 | } |
| 5150 | |
| 5151 | /* |
| 5152 | * set_result_size_estimates |
| 5153 | * Set the size estimates for an RTE_RESULT base relation |
| 5154 | * |
| 5155 | * The rel's targetlist and restrictinfo list must have been constructed |
| 5156 | * already. |
| 5157 | * |
| 5158 | * We set the same fields as set_baserel_size_estimates. |
| 5159 | */ |
| 5160 | void |
| 5161 | set_result_size_estimates(PlannerInfo *root, RelOptInfo *rel) |
| 5162 | { |
| 5163 | /* Should only be applied to RTE_RESULT base relations */ |
| 5164 | Assert(rel->relid > 0); |
| 5165 | Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_RESULT); |
| 5166 | |
| 5167 | /* RTE_RESULT always generates a single row, natively */ |
| 5168 | rel->tuples = 1; |
| 5169 | |
| 5170 | /* Now estimate number of output rows, etc */ |
| 5171 | set_baserel_size_estimates(root, rel); |
| 5172 | } |
| 5173 | |
| 5174 | /* |
| 5175 | * set_foreign_size_estimates |
| 5176 | * Set the size estimates for a base relation that is a foreign table. |
| 5177 | * |
| 5178 | * There is not a whole lot that we can do here; the foreign-data wrapper |
| 5179 | * is responsible for producing useful estimates. We can do a decent job |
| 5180 | * of estimating baserestrictcost, so we set that, and we also set up width |
| 5181 | * using what will be purely datatype-driven estimates from the targetlist. |
| 5182 | * There is no way to do anything sane with the rows value, so we just put |
| 5183 | * a default estimate and hope that the wrapper can improve on it. The |
| 5184 | * wrapper's GetForeignRelSize function will be called momentarily. |
| 5185 | * |
| 5186 | * The rel's targetlist and restrictinfo list must have been constructed |
| 5187 | * already. |
| 5188 | */ |
| 5189 | void |
| 5190 | set_foreign_size_estimates(PlannerInfo *root, RelOptInfo *rel) |
| 5191 | { |
| 5192 | /* Should only be applied to base relations */ |
| 5193 | Assert(rel->relid > 0); |
| 5194 | |
| 5195 | rel->rows = 1000; /* entirely bogus default estimate */ |
| 5196 | |
| 5197 | cost_qual_eval(&rel->baserestrictcost, rel->baserestrictinfo, root); |
| 5198 | |
| 5199 | set_rel_width(root, rel); |
| 5200 | } |
| 5201 | |
| 5202 | |
| 5203 | /* |
| 5204 | * set_rel_width |
| 5205 | * Set the estimated output width of a base relation. |
| 5206 | * |
| 5207 | * The estimated output width is the sum of the per-attribute width estimates |
| 5208 | * for the actually-referenced columns, plus any PHVs or other expressions |
| 5209 | * that have to be calculated at this relation. This is the amount of data |
| 5210 | * we'd need to pass upwards in case of a sort, hash, etc. |
| 5211 | * |
| 5212 | * This function also sets reltarget->cost, so it's a bit misnamed now. |
| 5213 | * |
| 5214 | * NB: this works best on plain relations because it prefers to look at |
| 5215 | * real Vars. For subqueries, set_subquery_size_estimates will already have |
| 5216 | * copied up whatever per-column estimates were made within the subquery, |
| 5217 | * and for other types of rels there isn't much we can do anyway. We fall |
| 5218 | * back on (fairly stupid) datatype-based width estimates if we can't get |
| 5219 | * any better number. |
| 5220 | * |
| 5221 | * The per-attribute width estimates are cached for possible re-use while |
| 5222 | * building join relations or post-scan/join pathtargets. |
| 5223 | */ |
| 5224 | static void |
| 5225 | set_rel_width(PlannerInfo *root, RelOptInfo *rel) |
| 5226 | { |
| 5227 | Oid reloid = planner_rt_fetch(rel->relid, root)->relid; |
| 5228 | int32 tuple_width = 0; |
| 5229 | bool have_wholerow_var = false; |
| 5230 | ListCell *lc; |
| 5231 | |
| 5232 | /* Vars are assumed to have cost zero, but other exprs do not */ |
| 5233 | rel->reltarget->cost.startup = 0; |
| 5234 | rel->reltarget->cost.per_tuple = 0; |
| 5235 | |
| 5236 | foreach(lc, rel->reltarget->exprs) |
| 5237 | { |
| 5238 | Node *node = (Node *) lfirst(lc); |
| 5239 | |
| 5240 | /* |
| 5241 | * Ordinarily, a Var in a rel's targetlist must belong to that rel; |
| 5242 | * but there are corner cases involving LATERAL references where that |
| 5243 | * isn't so. If the Var has the wrong varno, fall through to the |
| 5244 | * generic case (it doesn't seem worth the trouble to be any smarter). |
| 5245 | */ |
| 5246 | if (IsA(node, Var) && |
| 5247 | ((Var *) node)->varno == rel->relid) |
| 5248 | { |
| 5249 | Var *var = (Var *) node; |
| 5250 | int ndx; |
| 5251 | int32 item_width; |
| 5252 | |
| 5253 | Assert(var->varattno >= rel->min_attr); |
| 5254 | Assert(var->varattno <= rel->max_attr); |
| 5255 | |
| 5256 | ndx = var->varattno - rel->min_attr; |
| 5257 | |
| 5258 | /* |
| 5259 | * If it's a whole-row Var, we'll deal with it below after we have |
| 5260 | * already cached as many attr widths as possible. |
| 5261 | */ |
| 5262 | if (var->varattno == 0) |
| 5263 | { |
| 5264 | have_wholerow_var = true; |
| 5265 | continue; |
| 5266 | } |
| 5267 | |
| 5268 | /* |
| 5269 | * The width may have been cached already (especially if it's a |
| 5270 | * subquery), so don't duplicate effort. |
| 5271 | */ |
| 5272 | if (rel->attr_widths[ndx] > 0) |
| 5273 | { |
| 5274 | tuple_width += rel->attr_widths[ndx]; |
| 5275 | continue; |
| 5276 | } |
| 5277 | |
| 5278 | /* Try to get column width from statistics */ |
| 5279 | if (reloid != InvalidOid && var->varattno > 0) |
| 5280 | { |
| 5281 | item_width = get_attavgwidth(reloid, var->varattno); |
| 5282 | if (item_width > 0) |
| 5283 | { |
| 5284 | rel->attr_widths[ndx] = item_width; |
| 5285 | tuple_width += item_width; |
| 5286 | continue; |
| 5287 | } |
| 5288 | } |
| 5289 | |
| 5290 | /* |
| 5291 | * Not a plain relation, or can't find statistics for it. Estimate |
| 5292 | * using just the type info. |
| 5293 | */ |
| 5294 | item_width = get_typavgwidth(var->vartype, var->vartypmod); |
| 5295 | Assert(item_width > 0); |
| 5296 | rel->attr_widths[ndx] = item_width; |
| 5297 | tuple_width += item_width; |
| 5298 | } |
| 5299 | else if (IsA(node, PlaceHolderVar)) |
| 5300 | { |
| 5301 | /* |
| 5302 | * We will need to evaluate the PHV's contained expression while |
| 5303 | * scanning this rel, so be sure to include it in reltarget->cost. |
| 5304 | */ |
| 5305 | PlaceHolderVar *phv = (PlaceHolderVar *) node; |
| 5306 | PlaceHolderInfo *phinfo = find_placeholder_info(root, phv, false); |
| 5307 | QualCost cost; |
| 5308 | |
| 5309 | tuple_width += phinfo->ph_width; |
| 5310 | cost_qual_eval_node(&cost, (Node *) phv->phexpr, root); |
| 5311 | rel->reltarget->cost.startup += cost.startup; |
| 5312 | rel->reltarget->cost.per_tuple += cost.per_tuple; |
| 5313 | } |
| 5314 | else |
| 5315 | { |
| 5316 | /* |
| 5317 | * We could be looking at an expression pulled up from a subquery, |
| 5318 | * or a ROW() representing a whole-row child Var, etc. Do what we |
| 5319 | * can using the expression type information. |
| 5320 | */ |
| 5321 | int32 item_width; |
| 5322 | QualCost cost; |
| 5323 | |
| 5324 | item_width = get_typavgwidth(exprType(node), exprTypmod(node)); |
| 5325 | Assert(item_width > 0); |
| 5326 | tuple_width += item_width; |
| 5327 | /* Not entirely clear if we need to account for cost, but do so */ |
| 5328 | cost_qual_eval_node(&cost, node, root); |
| 5329 | rel->reltarget->cost.startup += cost.startup; |
| 5330 | rel->reltarget->cost.per_tuple += cost.per_tuple; |
| 5331 | } |
| 5332 | } |
| 5333 | |
| 5334 | /* |
| 5335 | * If we have a whole-row reference, estimate its width as the sum of |
| 5336 | * per-column widths plus heap tuple header overhead. |
| 5337 | */ |
| 5338 | if (have_wholerow_var) |
| 5339 | { |
| 5340 | int32 wholerow_width = MAXALIGN(SizeofHeapTupleHeader); |
| 5341 | |
| 5342 | if (reloid != InvalidOid) |
| 5343 | { |
| 5344 | /* Real relation, so estimate true tuple width */ |
| 5345 | wholerow_width += get_relation_data_width(reloid, |
| 5346 | rel->attr_widths - rel->min_attr); |
| 5347 | } |
| 5348 | else |
| 5349 | { |
| 5350 | /* Do what we can with info for a phony rel */ |
| 5351 | AttrNumber i; |
| 5352 | |
| 5353 | for (i = 1; i <= rel->max_attr; i++) |
| 5354 | wholerow_width += rel->attr_widths[i - rel->min_attr]; |
| 5355 | } |
| 5356 | |
| 5357 | rel->attr_widths[0 - rel->min_attr] = wholerow_width; |
| 5358 | |
| 5359 | /* |
| 5360 | * Include the whole-row Var as part of the output tuple. Yes, that |
| 5361 | * really is what happens at runtime. |
| 5362 | */ |
| 5363 | tuple_width += wholerow_width; |
| 5364 | } |
| 5365 | |
| 5366 | Assert(tuple_width >= 0); |
| 5367 | rel->reltarget->width = tuple_width; |
| 5368 | } |
| 5369 | |
| 5370 | /* |
| 5371 | * set_pathtarget_cost_width |
| 5372 | * Set the estimated eval cost and output width of a PathTarget tlist. |
| 5373 | * |
| 5374 | * As a notational convenience, returns the same PathTarget pointer passed in. |
| 5375 | * |
| 5376 | * Most, though not quite all, uses of this function occur after we've run |
| 5377 | * set_rel_width() for base relations; so we can usually obtain cached width |
| 5378 | * estimates for Vars. If we can't, fall back on datatype-based width |
| 5379 | * estimates. Present early-planning uses of PathTargets don't need accurate |
| 5380 | * widths badly enough to justify going to the catalogs for better data. |
| 5381 | */ |
| 5382 | PathTarget * |
| 5383 | set_pathtarget_cost_width(PlannerInfo *root, PathTarget *target) |
| 5384 | { |
| 5385 | int32 tuple_width = 0; |
| 5386 | ListCell *lc; |
| 5387 | |
| 5388 | /* Vars are assumed to have cost zero, but other exprs do not */ |
| 5389 | target->cost.startup = 0; |
| 5390 | target->cost.per_tuple = 0; |
| 5391 | |
| 5392 | foreach(lc, target->exprs) |
| 5393 | { |
| 5394 | Node *node = (Node *) lfirst(lc); |
| 5395 | |
| 5396 | if (IsA(node, Var)) |
| 5397 | { |
| 5398 | Var *var = (Var *) node; |
| 5399 | int32 item_width; |
| 5400 | |
| 5401 | /* We should not see any upper-level Vars here */ |
| 5402 | Assert(var->varlevelsup == 0); |
| 5403 | |
| 5404 | /* Try to get data from RelOptInfo cache */ |
| 5405 | if (var->varno < root->simple_rel_array_size) |
| 5406 | { |
| 5407 | RelOptInfo *rel = root->simple_rel_array[var->varno]; |
| 5408 | |
| 5409 | if (rel != NULL && |
| 5410 | var->varattno >= rel->min_attr && |
| 5411 | var->varattno <= rel->max_attr) |
| 5412 | { |
| 5413 | int ndx = var->varattno - rel->min_attr; |
| 5414 | |
| 5415 | if (rel->attr_widths[ndx] > 0) |
| 5416 | { |
| 5417 | tuple_width += rel->attr_widths[ndx]; |
| 5418 | continue; |
| 5419 | } |
| 5420 | } |
| 5421 | } |
| 5422 | |
| 5423 | /* |
| 5424 | * No cached data available, so estimate using just the type info. |
| 5425 | */ |
| 5426 | item_width = get_typavgwidth(var->vartype, var->vartypmod); |
| 5427 | Assert(item_width > 0); |
| 5428 | tuple_width += item_width; |
| 5429 | } |
| 5430 | else |
| 5431 | { |
| 5432 | /* |
| 5433 | * Handle general expressions using type info. |
| 5434 | */ |
| 5435 | int32 item_width; |
| 5436 | QualCost cost; |
| 5437 | |
| 5438 | item_width = get_typavgwidth(exprType(node), exprTypmod(node)); |
| 5439 | Assert(item_width > 0); |
| 5440 | tuple_width += item_width; |
| 5441 | |
| 5442 | /* Account for cost, too */ |
| 5443 | cost_qual_eval_node(&cost, node, root); |
| 5444 | target->cost.startup += cost.startup; |
| 5445 | target->cost.per_tuple += cost.per_tuple; |
| 5446 | } |
| 5447 | } |
| 5448 | |
| 5449 | Assert(tuple_width >= 0); |
| 5450 | target->width = tuple_width; |
| 5451 | |
| 5452 | return target; |
| 5453 | } |
| 5454 | |
| 5455 | /* |
| 5456 | * relation_byte_size |
| 5457 | * Estimate the storage space in bytes for a given number of tuples |
| 5458 | * of a given width (size in bytes). |
| 5459 | */ |
| 5460 | static double |
| 5461 | relation_byte_size(double tuples, int width) |
| 5462 | { |
| 5463 | return tuples * (MAXALIGN(width) + MAXALIGN(SizeofHeapTupleHeader)); |
| 5464 | } |
| 5465 | |
| 5466 | /* |
| 5467 | * page_size |
| 5468 | * Returns an estimate of the number of pages covered by a given |
| 5469 | * number of tuples of a given width (size in bytes). |
| 5470 | */ |
| 5471 | static double |
| 5472 | page_size(double tuples, int width) |
| 5473 | { |
| 5474 | return ceil(relation_byte_size(tuples, width) / BLCKSZ); |
| 5475 | } |
| 5476 | |
| 5477 | /* |
 * Estimate the fraction of the work that each worker will do given the
 * number of workers budgeted for the path.  The result is expressed as a
 * divisor to apply to the total amount of work; each worker is assumed to
 * perform 1/divisor of it.
| 5480 | */ |
| 5481 | static double |
| 5482 | get_parallel_divisor(Path *path) |
| 5483 | { |
| 5484 | double parallel_divisor = path->parallel_workers; |
| 5485 | |
| 5486 | /* |
| 5487 | * Early experience with parallel query suggests that when there is only |
| 5488 | * one worker, the leader often makes a very substantial contribution to |
| 5489 | * executing the parallel portion of the plan, but as more workers are |
| 5490 | * added, it does less and less, because it's busy reading tuples from the |
| 5491 | * workers and doing whatever non-parallel post-processing is needed. By |
| 5492 | * the time we reach 4 workers, the leader no longer makes a meaningful |
| 5493 | * contribution. Thus, for now, estimate that the leader spends 30% of |
| 5494 | * its time servicing each worker, and the remainder executing the |
| 5495 | * parallel plan. |
| 5496 | */ |
| 5497 | if (parallel_leader_participation) |
| 5498 | { |
| 5499 | double leader_contribution; |
| 5500 | |
| 5501 | leader_contribution = 1.0 - (0.3 * path->parallel_workers); |
| 5502 | if (leader_contribution > 0) |
| 5503 | parallel_divisor += leader_contribution; |
| 5504 | } |
| 5505 | |
| 5506 | return parallel_divisor; |
| 5507 | } |
| 5508 | |
| 5509 | /* |
| 5510 | * compute_bitmap_pages |
| 5511 | * |
 * Compute the number of pages fetched from the heap in a bitmap heap scan.
| 5513 | */ |
| 5514 | double |
| 5515 | compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual, |
| 5516 | int loop_count, Cost *cost, double *tuple) |
| 5517 | { |
| 5518 | Cost indexTotalCost; |
| 5519 | Selectivity indexSelectivity; |
| 5520 | double T; |
| 5521 | double pages_fetched; |
| 5522 | double tuples_fetched; |
| 5523 | double heap_pages; |
| 5524 | long maxentries; |
| 5525 | |
| 5526 | /* |
| 5527 | * Fetch total cost of obtaining the bitmap, as well as its total |
| 5528 | * selectivity. |
| 5529 | */ |
| 5530 | cost_bitmap_tree_node(bitmapqual, &indexTotalCost, &indexSelectivity); |
| 5531 | |
| 5532 | /* |
| 5533 | * Estimate number of main-table pages fetched. |
| 5534 | */ |
| 5535 | tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples); |
| 5536 | |
| 5537 | T = (baserel->pages > 1) ? (double) baserel->pages : 1.0; |
| 5538 | |
| 5539 | /* |
| 5540 | * For a single scan, the number of heap pages that need to be fetched is |
| 5541 | * the same as the Mackert and Lohman formula for the case T <= b (ie, no |
| 5542 | * re-reads needed). |
| 5543 | */ |
| 5544 | pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched); |
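	/*
	 * For example (illustrative numbers only): with T = 1000 pages and
	 * tuples_fetched = 500, this gives 1000000 / 2500 = 400 pages.
	 */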
| 5545 | |
| 5546 | /* |
	 * Calculate the number of pages fetched from the heap.  Then, based on
	 * the current work_mem setting, estimate the maximum number of entries
	 * (maxentries) that the bitmap can hold.
| 5549 | * (Note that we always do this calculation based on the number of pages |
| 5550 | * that would be fetched in a single iteration, even if loop_count > 1. |
| 5551 | * That's correct, because only that number of entries will be stored in |
| 5552 | * the bitmap at one time.) |
| 5553 | */ |
| 5554 | heap_pages = Min(pages_fetched, baserel->pages); |
| 5555 | maxentries = tbm_calculate_entries(work_mem * 1024L); |
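	/*
	 * Illustration only: if tbm_calculate_entries() reports room for at
	 * least as many entries as heap_pages, the lossy-page adjustment below
	 * is skipped entirely; lossiness only matters once the bitmap cannot
	 * hold one entry per fetched heap page.
	 */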
| 5556 | |
| 5557 | if (loop_count > 1) |
| 5558 | { |
| 5559 | /* |
| 5560 | * For repeated bitmap scans, scale up the number of tuples fetched in |
| 5561 | * the Mackert and Lohman formula by the number of scans, so that we |
| 5562 | * estimate the number of pages fetched by all the scans. Then |
| 5563 | * pro-rate for one scan. |
| 5564 | */ |
| 5565 | pages_fetched = index_pages_fetched(tuples_fetched * loop_count, |
| 5566 | baserel->pages, |
| 5567 | get_indexpath_pages(bitmapqual), |
| 5568 | root); |
| 5569 | pages_fetched /= loop_count; |
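		/*
		 * Illustration only: if the formula above predicted 3000 page
		 * fetches in total across loop_count = 10 scans, we would charge
		 * 300 pages to this one scan.
		 */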
| 5570 | } |
| 5571 | |
| 5572 | if (pages_fetched >= T) |
| 5573 | pages_fetched = T; |
| 5574 | else |
| 5575 | pages_fetched = ceil(pages_fetched); |
| 5576 | |
| 5577 | if (maxentries < heap_pages) |
| 5578 | { |
| 5579 | double exact_pages; |
| 5580 | double lossy_pages; |
| 5581 | |
| 5582 | /* |
| 5583 | * Crude approximation of the number of lossy pages. Because of the |
| 5584 | * way tbm_lossify() is coded, the number of lossy pages increases |
| 5585 | * very sharply as soon as we run short of memory; this formula has |
| 5586 | * that property and seems to perform adequately in testing, but it's |
| 5587 | * possible we could do better somehow. |
| 5588 | */ |
| 5589 | lossy_pages = Max(0, heap_pages - maxentries / 2); |
| 5590 | exact_pages = heap_pages - lossy_pages; |
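		/*
		 * Illustrative numbers only: with heap_pages = 1000 and
		 * maxentries = 800, lossy_pages = Max(0, 1000 - 400) = 600 and
		 * exact_pages = 400.
		 */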
| 5591 | |
| 5592 | /* |
| 5593 | * If there are lossy pages then recompute the number of tuples |
| 5594 | * processed by the bitmap heap node. We assume here that the chance |
| 5595 | * of a given tuple coming from an exact page is the same as the |
| 5596 | * chance that a given page is exact. This might not be true, but |
| 5597 | * it's not clear how we can do any better. |
| 5598 | */ |
| 5599 | if (lossy_pages > 0) |
| 5600 | tuples_fetched = |
| 5601 | clamp_row_est(indexSelectivity * |
| 5602 | (exact_pages / heap_pages) * baserel->tuples + |
| 5603 | (lossy_pages / heap_pages) * baserel->tuples); |
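		/*
		 * Continuing the illustration above: with indexSelectivity = 0.1
		 * and baserel->tuples = 100000, that is 0.1 * 0.4 * 100000 +
		 * 0.6 * 100000 = 64000 tuples, versus 10000 if every page had
		 * been exact.
		 */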
| 5604 | } |
| 5605 | |
| 5606 | if (cost) |
| 5607 | *cost = indexTotalCost; |
| 5608 | if (tuple) |
| 5609 | *tuple = tuples_fetched; |
| 5610 | |
| 5611 | return pages_fetched; |
| 5612 | } |
| 5613 | |