network_selfuncs.c source code [PostgreSQL/src/backend/utils/adt/network_selfuncs.c]

1	/-------------------------------------------------------------------------*
2	*
3	* network_selfuncs.c
4	* Functions for selectivity estimation of inet/cidr operators
5	*
6	* This module provides estimators for the subnet inclusion and overlap
7	* operators. Estimates are based on null fraction, most common values,
8	* and histogram of inet/cidr columns.
9	*
10	* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
11	* Portions Copyright (c) 1994, Regents of the University of California
12	*
13	*
14	* IDENTIFICATION
15	* src/backend/utils/adt/network_selfuncs.c
16	*
17	*-------------------------------------------------------------------------
18	*/
19	#include "postgres.h"
20
21	#include <math.h>
22
23	#include "access/htup_details.h"
24	#include "catalog/pg_operator.h"
25	#include "catalog/pg_statistic.h"
26	#include "utils/builtins.h"
27	#include "utils/inet.h"
28	#include "utils/lsyscache.h"
29	#include "utils/selfuncs.h"
30
31
32	/ Default selectivity for the inet overlap operator /
33	#define DEFAULT_OVERLAP_SEL 0.01
34
35	/ Default selectivity for the various inclusion operators /
36	#define DEFAULT_INCLUSION_SEL 0.005
37
38	/ Default selectivity for specified operator /
39	#define DEFAULT_SEL(operator) \
40	((operator) == OID_INET_OVERLAP_OP ? \
41	DEFAULT_OVERLAP_SEL : DEFAULT_INCLUSION_SEL)
42
43	/ Maximum number of items to consider in join selectivity calculations /
44	#define MAX_CONSIDERED_ELEMS 1024
45
46	static Selectivity networkjoinsel_inner(Oid operator,
47	VariableStatData vardata1, VariableStatData vardata2);
48	static Selectivity networkjoinsel_semi(Oid operator,
49	VariableStatData vardata1, VariableStatData vardata2);
50	static Selectivity mcv_population(float4 mcv_numbers, int* mcv_nvalues);
51	static Selectivity inet_hist_value_sel(Datum values, int* nvalues,
52	Datum constvalue, int opr_codenum);
53	static Selectivity inet_mcv_join_sel(Datum *mcv1_values,
54	float4 mcv1_numbers, int* mcv1_nvalues, Datum *mcv2_values,
55	float4 mcv2_numbers, int* mcv2_nvalues, Oid operator);
56	static Selectivity inet_mcv_hist_sel(Datum mcv_values, float4 mcv_numbers,
57	int mcv_nvalues, Datum hist_values, int* hist_nvalues,
58	int opr_codenum);
59	static Selectivity inet_hist_inclusion_join_sel(Datum *hist1_values,
60	int hist1_nvalues,
61	Datum hist2_values, int* hist2_nvalues,
62	int opr_codenum);
63	static Selectivity inet_semi_join_sel(Datum lhs_value,
64	bool mcv_exists, Datum mcv_values, int* mcv_nvalues,
65	bool hist_exists, Datum hist_values, int* hist_nvalues,
66	double hist_weight,
67	FmgrInfo proc, int* opr_codenum);
68	static int inet_opr_codenum(Oid operator);
69	static int inet_inclusion_cmp(inet left, inet right, int opr_codenum);
70	static int inet_masklen_inclusion_cmp(inet left, inet right,
71	int opr_codenum);
72	static int inet_hist_match_divider(inet boundary, inet query,
73	int opr_codenum);
74
75	/*
76	* Selectivity estimation for the subnet inclusion/overlap operators
77	*/
78	Datum
79	networksel(PG_FUNCTION_ARGS)
80	{
81	PlannerInfo root = (PlannerInfo ) PG_GETARG_POINTER(`0`);
82	Oid operator = PG_GETARG_OID(`1`);
83	List args = (List ) PG_GETARG_POINTER(`2`);
84	int varRelid = PG_GETARG_INT32(`3`);
85	VariableStatData vardata;
86	Node *other;
87	bool varonleft;
88	Selectivity selec,
89	mcv_selec,
90	non_mcv_selec;
91	Datum constvalue;
92	Form_pg_statistic stats;
93	AttStatsSlot hslot;
94	double sumcommon,
95	nullfrac;
96	FmgrInfo proc;
97
98	/*
99	* If expression is not (variable op something) or (something op
100	* variable), then punt and return a default estimate.
101	*/
102	if (!get_restriction_variable(root, args, varRelid,
103	&vardata, &other, &varonleft))
104	PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
105
106	/*
107	* Can't do anything useful if the something is not a constant, either.
108	*/
109	if (!IsA(other, Const))
110	{
111	ReleaseVariableStats(vardata);
112	PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
113	}
114
115	/ All of the operators handled here are strict. /
116	if (((Const *) other)->constisnull)
117	{
118	ReleaseVariableStats(vardata);
119	PG_RETURN_FLOAT8(`0.0`);
120	}
121	constvalue = ((Const *) other)->constvalue;
122
123	/ Otherwise, we need stats in order to produce a non-default estimate. /
124	if (!HeapTupleIsValid(vardata.statsTuple))
125	{
126	ReleaseVariableStats(vardata);
127	PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
128	}
129
130	stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
131	nullfrac = stats->stanullfrac;
132
133	/*
134	* If we have most-common-values info, add up the fractions of the MCV
135	* entries that satisfy MCV OP CONST. These fractions contribute directly
136	* to the result selectivity. Also add up the total fraction represented
137	* by MCV entries.
138	*/
139	fmgr_info(get_opcode(operator), &proc);
140	mcv_selec = mcv_selectivity(&vardata, &proc, constvalue, varonleft,
141	&sumcommon);
142
143	/*
144	* If we have a histogram, use it to estimate the proportion of the
145	* non-MCV population that satisfies the clause. If we don't, apply the
146	* default selectivity to that population.
147	*/
148	if (get_attstatsslot(&hslot, vardata.statsTuple,
149	STATISTIC_KIND_HISTOGRAM, InvalidOid,
150	ATTSTATSSLOT_VALUES))
151	{
152	int opr_codenum = inet_opr_codenum(operator);
153
154	/ Commute if needed, so we can consider histogram to be on the left /
155	if (!varonleft)
156	opr_codenum = -opr_codenum;
157	non_mcv_selec = inet_hist_value_sel(hslot.values, hslot.nvalues,
158	constvalue, opr_codenum);
159
160	free_attstatsslot(&hslot);
161	}
162	else
163	non_mcv_selec = DEFAULT_SEL(operator);
164
165	/ Combine selectivities for MCV and non-MCV populations /
166	selec = mcv_selec + (`1.0` - nullfrac - sumcommon) * non_mcv_selec;
167
168	/ Result should be in range, but make sure... /
169	CLAMP_PROBABILITY(selec);
170
171	ReleaseVariableStats(vardata);
172
173	PG_RETURN_FLOAT8(selec);
174	}
175
176	/*
177	* Join selectivity estimation for the subnet inclusion/overlap operators
178	*
179	* This function has the same structure as eqjoinsel() in selfuncs.c.
180	*
181	* Throughout networkjoinsel and its subroutines, we have a performance issue
182	* in that the amount of work to be done is O(N^2) in the length of the MCV
183	* and histogram arrays. To keep the runtime from getting out of hand when
184	* large statistics targets have been set, we arbitrarily limit the number of
185	* values considered to 1024 (MAX_CONSIDERED_ELEMS). For the MCV arrays, this
186	* is easy: just consider at most the first N elements. (Since the MCVs are
187	* sorted by decreasing frequency, this correctly gets us the first N MCVs.)
188	* For the histogram arrays, we decimate; that is consider only every k'th
189	* element, where k is chosen so that no more than MAX_CONSIDERED_ELEMS
190	* elements are considered. This should still give us a good random sample of
191	* the non-MCV population. Decimation is done on-the-fly in the loops that
192	* iterate over the histogram arrays.
193	*/
194	Datum
195	networkjoinsel(PG_FUNCTION_ARGS)
196	{
197	PlannerInfo root = (PlannerInfo ) PG_GETARG_POINTER(`0`);
198	Oid operator = PG_GETARG_OID(`1`);
199	List args = (List ) PG_GETARG_POINTER(`2`);
200	#ifdef NOT_USED
201	JoinType jointype = (JoinType) PG_GETARG_INT16(`3`);
202	#endif
203	SpecialJoinInfo sjinfo = (SpecialJoinInfo ) PG_GETARG_POINTER(`4`);
204	double selec;
205	VariableStatData vardata1;
206	VariableStatData vardata2;
207	bool join_is_reversed;
208
209	get_join_variables(root, args, sjinfo,
210	&vardata1, &vardata2, &join_is_reversed);
211
212	switch (sjinfo->jointype)
213	{
214	case JOIN_INNER:
215	case JOIN_LEFT:
216	case JOIN_FULL:
217
218	/*
219	* Selectivity for left/full join is not exactly the same as inner
220	* join, but we neglect the difference, as eqjoinsel does.
221	*/
222	selec = networkjoinsel_inner(operator, &vardata1, &vardata2);
223	break;
224	case JOIN_SEMI:
225	case JOIN_ANTI:
226	/ Here, it's important that we pass the outer var on the left. /
227	if (!join_is_reversed)
228	selec = networkjoinsel_semi(operator, &vardata1, &vardata2);
229	else
230	selec = networkjoinsel_semi(get_commutator(operator),
231	&vardata2, &vardata1);
232	break;
233	default:
234	/ other values not expected here /
235	elog(ERROR, "unrecognized join type: %d",
236	(int) sjinfo->jointype);
237	selec = `0`; / keep compiler quiet /
238	break;
239	}
240
241	ReleaseVariableStats(vardata1);
242	ReleaseVariableStats(vardata2);
243
244	CLAMP_PROBABILITY(selec);
245
246	PG_RETURN_FLOAT8((float8) selec);
247	}
248
249	/*
250	* Inner join selectivity estimation for subnet inclusion/overlap operators
251	*
252	* Calculates MCV vs MCV, MCV vs histogram and histogram vs histogram
253	* selectivity for join using the subnet inclusion operators. Unlike the
254	* join selectivity function for the equality operator, eqjoinsel_inner(),
255	* one to one matching of the values is not enough. Network inclusion
256	* operators are likely to match many to many, so we must check all pairs.
257	* (Note: it might be possible to exploit understanding of the histogram's
258	* btree ordering to reduce the work needed, but we don't currently try.)
259	* Also, MCV vs histogram selectivity is not neglected as in eqjoinsel_inner().
260	*/
261	static Selectivity
262	networkjoinsel_inner(Oid operator,
263	VariableStatData vardata1, VariableStatData vardata2)
264	{
265	Form_pg_statistic stats;
266	double nullfrac1 = `0.0`,
267	nullfrac2 = `0.0`;
268	Selectivity selec = `0.0`,
269	sumcommon1 = `0.0`,
270	sumcommon2 = `0.0`;
271	bool mcv1_exists = false,
272	mcv2_exists = false,
273	hist1_exists = false,
274	hist2_exists = false;
275	int opr_codenum;
276	int mcv1_length = `0`,
277	mcv2_length = `0`;
278	AttStatsSlot mcv1_slot;
279	AttStatsSlot mcv2_slot;
280	AttStatsSlot hist1_slot;
281	AttStatsSlot hist2_slot;
282
283	if (HeapTupleIsValid(vardata1->statsTuple))
284	{
285	stats = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
286	nullfrac1 = stats->stanullfrac;
287
288	mcv1_exists = get_attstatsslot(&mcv1_slot, vardata1->statsTuple,
289	STATISTIC_KIND_MCV, InvalidOid,
290	ATTSTATSSLOT_VALUES \| ATTSTATSSLOT_NUMBERS);
291	hist1_exists = get_attstatsslot(&hist1_slot, vardata1->statsTuple,
292	STATISTIC_KIND_HISTOGRAM, InvalidOid,
293	ATTSTATSSLOT_VALUES);
294	/ Arbitrarily limit number of MCVs considered /
295	mcv1_length = Min(mcv1_slot.nvalues, MAX_CONSIDERED_ELEMS);
296	if (mcv1_exists)
297	sumcommon1 = mcv_population(mcv1_slot.numbers, mcv1_length);
298	}
299	else
300	{
301	memset(&mcv1_slot, `0`, sizeof(mcv1_slot));
302	memset(&hist1_slot, `0`, sizeof(hist1_slot));
303	}
304
305	if (HeapTupleIsValid(vardata2->statsTuple))
306	{
307	stats = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple);
308	nullfrac2 = stats->stanullfrac;
309
310	mcv2_exists = get_attstatsslot(&mcv2_slot, vardata2->statsTuple,
311	STATISTIC_KIND_MCV, InvalidOid,
312	ATTSTATSSLOT_VALUES \| ATTSTATSSLOT_NUMBERS);
313	hist2_exists = get_attstatsslot(&hist2_slot, vardata2->statsTuple,
314	STATISTIC_KIND_HISTOGRAM, InvalidOid,
315	ATTSTATSSLOT_VALUES);
316	/ Arbitrarily limit number of MCVs considered /
317	mcv2_length = Min(mcv2_slot.nvalues, MAX_CONSIDERED_ELEMS);
318	if (mcv2_exists)
319	sumcommon2 = mcv_population(mcv2_slot.numbers, mcv2_length);
320	}
321	else
322	{
323	memset(&mcv2_slot, `0`, sizeof(mcv2_slot));
324	memset(&hist2_slot, `0`, sizeof(hist2_slot));
325	}
326
327	opr_codenum = inet_opr_codenum(operator);
328
329	/*
330	* Calculate selectivity for MCV vs MCV matches.
331	*/
332	if (mcv1_exists && mcv2_exists)
333	selec += inet_mcv_join_sel(mcv1_slot.values, mcv1_slot.numbers,
334	mcv1_length,
335	mcv2_slot.values, mcv2_slot.numbers,
336	mcv2_length,
337	operator);
338
339	/*
340	* Add in selectivities for MCV vs histogram matches, scaling according to
341	* the fractions of the populations represented by the histograms. Note
342	* that the second case needs to commute the operator.
343	*/
344	if (mcv1_exists && hist2_exists)
345	selec += (`1.0` - nullfrac2 - sumcommon2) *
346	inet_mcv_hist_sel(mcv1_slot.values, mcv1_slot.numbers, mcv1_length,
347	hist2_slot.values, hist2_slot.nvalues,
348	opr_codenum);
349	if (mcv2_exists && hist1_exists)
350	selec += (`1.0` - nullfrac1 - sumcommon1) *
351	inet_mcv_hist_sel(mcv2_slot.values, mcv2_slot.numbers, mcv2_length,
352	hist1_slot.values, hist1_slot.nvalues,
353	-opr_codenum);
354
355	/*
356	* Add in selectivity for histogram vs histogram matches, again scaling
357	* appropriately.
358	*/
359	if (hist1_exists && hist2_exists)
360	selec += (`1.0` - nullfrac1 - sumcommon1) *
361	(`1.0` - nullfrac2 - sumcommon2) *
362	inet_hist_inclusion_join_sel(hist1_slot.values, hist1_slot.nvalues,
363	hist2_slot.values, hist2_slot.nvalues,
364	opr_codenum);
365
366	/*
367	* If useful statistics are not available then use the default estimate.
368	* We can apply null fractions if known, though.
369	*/
370	if ((!mcv1_exists && !hist1_exists) \|\| (!mcv2_exists && !hist2_exists))
371	selec = (`1.0` - nullfrac1) * (`1.0` - nullfrac2) * DEFAULT_SEL(operator);
372
373	/ Release stats. /
374	free_attstatsslot(&mcv1_slot);
375	free_attstatsslot(&mcv2_slot);
376	free_attstatsslot(&hist1_slot);
377	free_attstatsslot(&hist2_slot);
378
379	return selec;
380	}
381
382	/*
383	* Semi join selectivity estimation for subnet inclusion/overlap operators
384	*
385	* Calculates MCV vs MCV, MCV vs histogram, histogram vs MCV, and histogram vs
386	* histogram selectivity for semi/anti join cases.
387	*/
388	static Selectivity
389	networkjoinsel_semi(Oid operator,
390	VariableStatData vardata1, VariableStatData vardata2)
391	{
392	Form_pg_statistic stats;
393	Selectivity selec = `0.0`,
394	sumcommon1 = `0.0`,
395	sumcommon2 = `0.0`;
396	double nullfrac1 = `0.0`,
397	nullfrac2 = `0.0`,
398	hist2_weight = `0.0`;
399	bool mcv1_exists = false,
400	mcv2_exists = false,
401	hist1_exists = false,
402	hist2_exists = false;
403	int opr_codenum;
404	FmgrInfo proc;
405	int i,
406	mcv1_length = `0`,
407	mcv2_length = `0`;
408	AttStatsSlot mcv1_slot;
409	AttStatsSlot mcv2_slot;
410	AttStatsSlot hist1_slot;
411	AttStatsSlot hist2_slot;
412
413	if (HeapTupleIsValid(vardata1->statsTuple))
414	{
415	stats = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
416	nullfrac1 = stats->stanullfrac;
417
418	mcv1_exists = get_attstatsslot(&mcv1_slot, vardata1->statsTuple,
419	STATISTIC_KIND_MCV, InvalidOid,
420	ATTSTATSSLOT_VALUES \| ATTSTATSSLOT_NUMBERS);
421	hist1_exists = get_attstatsslot(&hist1_slot, vardata1->statsTuple,
422	STATISTIC_KIND_HISTOGRAM, InvalidOid,
423	ATTSTATSSLOT_VALUES);
424	/ Arbitrarily limit number of MCVs considered /
425	mcv1_length = Min(mcv1_slot.nvalues, MAX_CONSIDERED_ELEMS);
426	if (mcv1_exists)
427	sumcommon1 = mcv_population(mcv1_slot.numbers, mcv1_length);
428	}
429	else
430	{
431	memset(&mcv1_slot, `0`, sizeof(mcv1_slot));
432	memset(&hist1_slot, `0`, sizeof(hist1_slot));
433	}
434
435	if (HeapTupleIsValid(vardata2->statsTuple))
436	{
437	stats = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple);
438	nullfrac2 = stats->stanullfrac;
439
440	mcv2_exists = get_attstatsslot(&mcv2_slot, vardata2->statsTuple,
441	STATISTIC_KIND_MCV, InvalidOid,
442	ATTSTATSSLOT_VALUES \| ATTSTATSSLOT_NUMBERS);
443	hist2_exists = get_attstatsslot(&hist2_slot, vardata2->statsTuple,
444	STATISTIC_KIND_HISTOGRAM, InvalidOid,
445	ATTSTATSSLOT_VALUES);
446	/ Arbitrarily limit number of MCVs considered /
447	mcv2_length = Min(mcv2_slot.nvalues, MAX_CONSIDERED_ELEMS);
448	if (mcv2_exists)
449	sumcommon2 = mcv_population(mcv2_slot.numbers, mcv2_length);
450	}
451	else
452	{
453	memset(&mcv2_slot, `0`, sizeof(mcv2_slot));
454	memset(&hist2_slot, `0`, sizeof(hist2_slot));
455	}
456
457	opr_codenum = inet_opr_codenum(operator);
458	fmgr_info(get_opcode(operator), &proc);
459
460	/ Estimate number of input rows represented by RHS histogram. /
461	if (hist2_exists && vardata2->rel)
462	hist2_weight = (`1.0` - nullfrac2 - sumcommon2) * vardata2->rel->rows;
463
464	/*
465	* Consider each element of the LHS MCV list, matching it to whatever RHS
466	* stats we have. Scale according to the known frequency of the MCV.
467	*/
468	if (mcv1_exists && (mcv2_exists \|\| hist2_exists))
469	{
470	for (i = `0`; i < mcv1_length; i++)
471	{
472	selec += mcv1_slot.numbers[i] *
473	inet_semi_join_sel(mcv1_slot.values[i],
474	mcv2_exists, mcv2_slot.values, mcv2_length,
475	hist2_exists,
476	hist2_slot.values, hist2_slot.nvalues,
477	hist2_weight,
478	&proc, opr_codenum);
479	}
480	}
481
482	/*
483	* Consider each element of the LHS histogram, except for the first and
484	* last elements, which we exclude on the grounds that they're outliers
485	* and thus not very representative. Scale on the assumption that each
486	* such histogram element represents an equal share of the LHS histogram
487	* population (which is a bit bogus, because the members of its bucket may
488	* not all act the same with respect to the join clause, but it's hard to
489	* do better).
490	*
491	* If there are too many histogram elements, decimate to limit runtime.
492	*/
493	if (hist1_exists && hist1_slot.nvalues > `2` && (mcv2_exists \|\| hist2_exists))
494	{
495	double hist_selec_sum = `0.0`;
496	int k,
497	n;
498
499	k = (hist1_slot.nvalues - `3`) / MAX_CONSIDERED_ELEMS + `1`;
500
501	n = `0`;
502	for (i = `1`; i < hist1_slot.nvalues - `1`; i += k)
503	{
504	hist_selec_sum +=
505	inet_semi_join_sel(hist1_slot.values[i],
506	mcv2_exists, mcv2_slot.values, mcv2_length,
507	hist2_exists,
508	hist2_slot.values, hist2_slot.nvalues,
509	hist2_weight,
510	&proc, opr_codenum);
511	n++;
512	}
513
514	selec += (`1.0` - nullfrac1 - sumcommon1) * hist_selec_sum / n;
515	}
516
517	/*
518	* If useful statistics are not available then use the default estimate.
519	* We can apply null fractions if known, though.
520	*/
521	if ((!mcv1_exists && !hist1_exists) \|\| (!mcv2_exists && !hist2_exists))
522	selec = (`1.0` - nullfrac1) * (`1.0` - nullfrac2) * DEFAULT_SEL(operator);
523
524	/ Release stats. /
525	free_attstatsslot(&mcv1_slot);
526	free_attstatsslot(&mcv2_slot);
527	free_attstatsslot(&hist1_slot);
528	free_attstatsslot(&hist2_slot);
529
530	return selec;
531	}
532
533	/*
534	* Compute the fraction of a relation's population that is represented
535	* by the MCV list.
536	*/
537	static Selectivity
538	mcv_population(float4 mcv_numbers, int* mcv_nvalues)
539	{
540	Selectivity sumcommon = `0.0`;
541	int i;
542
543	for (i = `0`; i < mcv_nvalues; i++)
544	{
545	sumcommon += mcv_numbers[i];
546	}
547
548	return sumcommon;
549	}
550
551	/*
552	* Inet histogram vs single value selectivity estimation
553	*
554	* Estimate the fraction of the histogram population that satisfies
555	* "value OPR CONST". (The result needs to be scaled to reflect the
556	* proportion of the total population represented by the histogram.)
557	*
558	* The histogram is originally for the inet btree comparison operators.
559	* Only the common bits of the network part and the length of the network part
560	* (masklen) are interesting for the subnet inclusion operators. Fortunately,
561	* btree comparison treats the network part as the major sort key. Even so,
562	* the length of the network part would not really be significant in the
563	* histogram. This would lead to big mistakes for data sets with uneven
564	* masklen distribution. To reduce this problem, comparisons with the left
565	* and the right sides of the buckets are used together.
566	*
567	* Histogram bucket matches are calculated in two forms. If the constant
568	* matches both bucket endpoints the bucket is considered as fully matched.
569	* The second form is to match the bucket partially; we recognize this when
570	* the constant matches just one endpoint, or the two endpoints fall on
571	* opposite sides of the constant. (Note that when the constant matches an
572	* interior histogram element, it gets credit for partial matches to the
573	* buckets on both sides, while a match to a histogram endpoint gets credit
574	* for only one partial match. This is desirable.)
575	*
576	* The divider in the partial bucket match is imagined as the distance
577	* between the decisive bits and the common bits of the addresses. It will
578	* be used as a power of two as it is the natural scale for the IP network
579	* inclusion. This partial bucket match divider calculation is an empirical
580	* formula and subject to change with more experiment.
581	*
582	* For a partial match, we try to calculate dividers for both of the
583	* boundaries. If the address family of a boundary value does not match the
584	* constant or comparison of the length of the network parts is not correct
585	* for the operator, the divider for that boundary will not be taken into
586	* account. If both of the dividers are valid, the greater one will be used
587	* to minimize the mistake in buckets that have disparate masklens. This
588	* calculation is unfair when dividers can be calculated for both of the
589	* boundaries but they are far from each other; but it is not a common
590	* situation as the boundaries are expected to share most of their significant
591	* bits of their masklens. The mistake would be greater, if we would use the
592	* minimum instead of the maximum, and we don't know a sensible way to combine
593	* them.
594	*
595	* For partial match in buckets that have different address families on the
596	* left and right sides, only the boundary with the same address family is
597	* taken into consideration. This can cause more mistakes for these buckets
598	* if the masklens of their boundaries are also disparate. But this can only
599	* happen in one bucket, since only two address families exist. It seems a
600	* better option than not considering these buckets at all.
601	*/
602	static Selectivity
603	inet_hist_value_sel(Datum values, int* nvalues, Datum constvalue,
604	int opr_codenum)
605	{
606	Selectivity match = `0.0`;
607	inet *query,
608	*left,
609	*right;
610	int i,
611	k,
612	n;
613	int left_order,
614	right_order,
615	left_divider,
616	right_divider;
617
618	/ guard against zero-divide below /
619	if (nvalues <= `1`)
620	return `0.0`;
621
622	/ if there are too many histogram elements, decimate to limit runtime /
623	k = (nvalues - `2`) / MAX_CONSIDERED_ELEMS + `1`;
624
625	query = DatumGetInetPP(constvalue);
626
627	/ "left" is the left boundary value of the current bucket ... /
628	left = DatumGetInetPP(values[`0`]);
629	left_order = inet_inclusion_cmp(left, query, opr_codenum);
630
631	n = `0`;
632	for (i = k; i < nvalues; i += k)
633	{
634	/ ... and "right" is the right boundary value /
635	right = DatumGetInetPP(values[i]);
636	right_order = inet_inclusion_cmp(right, query, opr_codenum);
637
638	if (left_order == `0` && right_order == `0`)
639	{
640	/ The whole bucket matches, since both endpoints do. /
641	match += `1.0`;
642	}
643	else if ((left_order <= `0` && right_order >= `0`) \|\|
644	(left_order >= `0` && right_order <= `0`))
645	{
646	/ Partial bucket match. /
647	left_divider = inet_hist_match_divider(left, query, opr_codenum);
648	right_divider = inet_hist_match_divider(right, query, opr_codenum);
649
650	if (left_divider >= `0` \|\| right_divider >= `0`)
651	match += `1.0` / pow(`2.0`, Max(left_divider, right_divider));
652	}
653
654	/ Shift the variables. /
655	left = right;
656	left_order = right_order;
657
658	/ Count the number of buckets considered. /
659	n++;
660	}
661
662	return match / n;
663	}
664
665	/*
666	* Inet MCV vs MCV join selectivity estimation
667	*
668	* We simply add up the fractions of the populations that satisfy the clause.
669	* The result is exact and does not need to be scaled further.
670	*/
671	static Selectivity
672	inet_mcv_join_sel(Datum mcv1_values, float4 mcv1_numbers, int mcv1_nvalues,
673	Datum mcv2_values, float4 mcv2_numbers, int mcv2_nvalues,
674	Oid operator)
675	{
676	Selectivity selec = `0.0`;
677	FmgrInfo proc;
678	int i,
679	j;
680
681	fmgr_info(get_opcode(operator), &proc);
682
683	for (i = `0`; i < mcv1_nvalues; i++)
684	{
685	for (j = `0`; j < mcv2_nvalues; j++)
686	if (DatumGetBool(FunctionCall2(&proc,
687	mcv1_values[i],
688	mcv2_values[j])))
689	selec += mcv1_numbers[i] * mcv2_numbers[j];
690	}
691	return selec;
692	}
693
694	/*
695	* Inet MCV vs histogram join selectivity estimation
696	*
697	* For each MCV on the lefthand side, estimate the fraction of the righthand's
698	* histogram population that satisfies the join clause, and add those up,
699	* scaling by the MCV's frequency. The result still needs to be scaled
700	* according to the fraction of the righthand's population represented by
701	* the histogram.
702	*/
703	static Selectivity
704	inet_mcv_hist_sel(Datum mcv_values, float4 mcv_numbers, int mcv_nvalues,
705	Datum hist_values, int* hist_nvalues,
706	int opr_codenum)
707	{
708	Selectivity selec = `0.0`;
709	int i;
710
711	/*
712	* We'll call inet_hist_value_selec with the histogram on the left, so we
713	* must commute the operator.
714	*/
715	opr_codenum = -opr_codenum;
716
717	for (i = `0`; i < mcv_nvalues; i++)
718	{
719	selec += mcv_numbers[i] *
720	inet_hist_value_sel(hist_values, hist_nvalues, mcv_values[i],
721	opr_codenum);
722	}
723	return selec;
724	}
725
726	/*
727	* Inet histogram vs histogram join selectivity estimation
728	*
729	* Here, we take all values listed in the second histogram (except for the
730	* first and last elements, which are excluded on the grounds of possibly
731	* not being very representative) and treat them as a uniform sample of
732	* the non-MCV population for that relation. For each one, we apply
733	* inet_hist_value_selec to see what fraction of the first histogram
734	* it matches.
735	*
736	* We could alternatively do this the other way around using the operator's
737	* commutator. XXX would it be worthwhile to do it both ways and take the
738	* average? That would at least avoid non-commutative estimation results.
739	*/
740	static Selectivity
741	inet_hist_inclusion_join_sel(Datum hist1_values, int* hist1_nvalues,
742	Datum hist2_values, int* hist2_nvalues,
743	int opr_codenum)
744	{
745	double match = `0.0`;
746	int i,
747	k,
748	n;
749
750	if (hist2_nvalues <= `2`)
751	return `0.0`; / no interior histogram elements /
752
753	/ if there are too many histogram elements, decimate to limit runtime /
754	k = (hist2_nvalues - `3`) / MAX_CONSIDERED_ELEMS + `1`;
755
756	n = `0`;
757	for (i = `1`; i < hist2_nvalues - `1`; i += k)
758	{
759	match += inet_hist_value_sel(hist1_values, hist1_nvalues,
760	hist2_values[i], opr_codenum);
761	n++;
762	}
763
764	return match / n;
765	}
766
767	/*
768	* Inet semi join selectivity estimation for one value
769	*
770	* The function calculates the probability that there is at least one row
771	* in the RHS table that satisfies the "lhs_value op column" condition.
772	* It is used in semi join estimation to check a sample from the left hand
773	* side table.
774	*
775	* The MCV and histogram from the right hand side table should be provided as
776	* arguments with the lhs_value from the left hand side table for the join.
777	* hist_weight is the total number of rows represented by the histogram.
778	* For example, if the table has 1000 rows, and 10% of the rows are in the MCV
779	* list, and another 10% are NULLs, hist_weight would be 800.
780	*
781	* First, the lhs_value will be matched to the most common values. If it
782	* matches any of them, 1.0 will be returned, because then there is surely
783	* a match.
784	*
785	* Otherwise, the histogram will be used to estimate the number of rows in
786	* the second table that match the condition. If the estimate is greater
787	* than 1.0, 1.0 will be returned, because it means there is a greater chance
788	* that the lhs_value will match more than one row in the table. If it is
789	* between 0.0 and 1.0, it will be returned as the probability.
790	*/
791	static Selectivity
792	inet_semi_join_sel(Datum lhs_value,
793	bool mcv_exists, Datum mcv_values, int* mcv_nvalues,
794	bool hist_exists, Datum hist_values, int* hist_nvalues,
795	double hist_weight,
796	FmgrInfo proc, int* opr_codenum)
797	{
798	if (mcv_exists)
799	{
800	int i;
801
802	for (i = `0`; i < mcv_nvalues; i++)
803	{
804	if (DatumGetBool(FunctionCall2(proc,
805	lhs_value,
806	mcv_values[i])))
807	return `1.0`;
808	}
809	}
810
811	if (hist_exists && hist_weight > `0`)
812	{
813	Selectivity hist_selec;
814
815	/ Commute operator, since we're passing lhs_value on the right /
816	hist_selec = inet_hist_value_sel(hist_values, hist_nvalues,
817	lhs_value, -opr_codenum);
818
819	if (hist_selec > `0`)
820	return Min(`1.0`, hist_weight * hist_selec);
821	}
822
823	return `0.0`;
824	}
825
826	/*
827	* Assign useful code numbers for the subnet inclusion/overlap operators
828	*
829	* Only inet_masklen_inclusion_cmp() and inet_hist_match_divider() depend
830	* on the exact codes assigned here; but many other places in this file
831	* know that they can negate a code to obtain the code for the commutator
832	* operator.
833	*/
834	static int
835	inet_opr_codenum(Oid operator)
836	{
837	switch (operator)
838	{
839	case OID_INET_SUP_OP:
840	return -`2`;
841	case OID_INET_SUPEQ_OP:
842	return -`1`;
843	case OID_INET_OVERLAP_OP:
844	return `0`;
845	case OID_INET_SUBEQ_OP:
846	return `1`;
847	case OID_INET_SUB_OP:
848	return `2`;
849	default:
850	elog(ERROR, "unrecognized operator %u for inet selectivity",
851	operator);
852	}
853	return `0`; / unreached, but keep compiler quiet /
854	}
855
856	/*
857	* Comparison function for the subnet inclusion/overlap operators
858	*
859	* If the comparison is okay for the specified inclusion operator, the return
860	* value will be 0. Otherwise the return value will be less than or greater
861	* than 0 as appropriate for the operator.
862	*
863	* Comparison is compatible with the basic comparison function for the inet
864	* type. See network_cmp_internal() in network.c for the original. Basic
865	* comparison operators are implemented with the network_cmp_internal()
866	* function. It is possible to implement the subnet inclusion operators with
867	* this function.
868	*
869	* Comparison is first on the common bits of the network part, then on the
870	* length of the network part (masklen) as in the network_cmp_internal()
871	* function. Only the first part is in this function. The second part is
872	* separated to another function for reusability. The difference between the
873	* second part and the original network_cmp_internal() is that the inclusion
874	* operator is considered while comparing the lengths of the network parts.
875	* See the inet_masklen_inclusion_cmp() function below.
876	*/
877	static int
878	inet_inclusion_cmp(inet left, inet right, int opr_codenum)
879	{
880	if (ip_family(left) == ip_family(right))
881	{
882	int order;
883
884	order = bitncmp(ip_addr(left), ip_addr(right),
885	Min(ip_bits(left), ip_bits(right)));
886	if (order != `0`)
887	return order;
888
889	return inet_masklen_inclusion_cmp(left, right, opr_codenum);
890	}
891
892	return ip_family(left) - ip_family(right);
893	}
894
895	/*
896	* Masklen comparison function for the subnet inclusion/overlap operators
897	*
898	* Compares the lengths of the network parts of the inputs. If the comparison
899	* is okay for the specified inclusion operator, the return value will be 0.
900	* Otherwise the return value will be less than or greater than 0 as
901	* appropriate for the operator.
902	*/
903	static int
904	inet_masklen_inclusion_cmp(inet left, inet right, int opr_codenum)
905	{
906	int order;
907
908	order = (int) ip_bits(left) - (int) ip_bits(right);
909
910	/*
911	* Return 0 if the operator would accept this combination of masklens.
912	* Note that opr_codenum zero (overlaps) will accept all cases.
913	*/
914	if ((order > `0` && opr_codenum >= `0`) \|\|
915	(order == `0` && opr_codenum >= -`1` && opr_codenum <= `1`) \|\|
916	(order < `0` && opr_codenum <= `0`))
917	return `0`;
918
919	/*
920	* Otherwise, return a negative value for sup/supeq (notionally, the RHS
921	* needs to have a larger masklen than it has, which would make it sort
922	* later), or a positive value for sub/subeq (vice versa).
923	*/
924	return opr_codenum;
925	}
926
927	/*
928	* Inet histogram partial match divider calculation
929	*
930	* First the families and the lengths of the network parts are compared using
931	* the subnet inclusion operator. If those are acceptable for the operator,
932	* the divider will be calculated using the masklens and the common bits of
933	* the addresses. -1 will be returned if it cannot be calculated.
934	*
935	* See commentary for inet_hist_value_sel() for some rationale for this.
936	*/
937	static int
938	inet_hist_match_divider(inet boundary, inet query, int opr_codenum)
939	{
940	if (ip_family(boundary) == ip_family(query) &&
941	inet_masklen_inclusion_cmp(boundary, query, opr_codenum) == `0`)
942	{
943	int min_bits,
944	decisive_bits;
945
946	min_bits = Min(ip_bits(boundary), ip_bits(query));
947
948	/*
949	* Set decisive_bits to the masklen of the one that should contain the
950	* other according to the operator.
951	*/
952	if (opr_codenum < `0`)
953	decisive_bits = ip_bits(boundary);
954	else if (opr_codenum > `0`)
955	decisive_bits = ip_bits(query);
956	else
957	decisive_bits = min_bits;
958
959	/*
960	* Now return the number of non-common decisive bits. (This will be
961	* zero if the boundary and query in fact match, else positive.)
962	*/
963	if (min_bits > `0`)
964	return decisive_bits - bitncommon(ip_addr(boundary),
965	ip_addr(query),
966	min_bits);
967	return decisive_bits;
968	}
969
970	return -`1`;
971	}
972

Browse the source code of PostgreSQL/src/backend/utils/adt/network_selfuncs.c