ts_selfuncs.c source code [PostgreSQL/src/backend/tsearch/ts_selfuncs.c]

1	/-------------------------------------------------------------------------*
2	*
3	* ts_selfuncs.c
4	* Selectivity estimation functions for text search operators.
5	*
6	* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7	*
8	*
9	* IDENTIFICATION
10	* src/backend/tsearch/ts_selfuncs.c
11	*
12	*-------------------------------------------------------------------------
13	*/
14	#include "postgres.h"
15
16	#include "access/htup_details.h"
17	#include "catalog/pg_statistic.h"
18	#include "catalog/pg_type.h"
19	#include "miscadmin.h"
20	#include "nodes/nodes.h"
21	#include "tsearch/ts_type.h"
22	#include "utils/builtins.h"
23	#include "utils/lsyscache.h"
24	#include "utils/selfuncs.h"
25	#include "utils/syscache.h"
26
27
28	/*
29	* The default text search selectivity is chosen to be small enough to
30	* encourage indexscans for typical table densities. See selfuncs.h and
31	* DEFAULT_EQ_SEL for details.
32	*/
33	#define DEFAULT_TS_MATCH_SEL 0.005
34
35	/ lookup table type for binary searching through MCELEMs /
36	typedef struct
37	{
38	text *element;
39	float4 frequency;
40	} TextFreq;
41
42	/ type of keys for bsearch'ing through an array of TextFreqs /
43	typedef struct
44	{
45	char *lexeme;
46	int length;
47	} LexemeKey;
48
49	static Selectivity tsquerysel(VariableStatData *vardata, Datum constval);
50	static Selectivity mcelem_tsquery_selec(TSQuery query,
51	Datum mcelem, int* nmcelem,
52	float4 numbers, int* nnumbers);
53	static Selectivity tsquery_opr_selec(QueryItem item, char* *operand,
54	TextFreq lookup, int* length, float4 minfreq);
55	static int compare_lexeme_textfreq(const void e1, const* void *e2);
56
57	#define tsquery_opr_selec_no_stats(query) \
58	tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), NULL, 0, 0)
59
60
61	/*
62	* tsmatchsel -- Selectivity of "@@"
63	*
64	* restriction selectivity function for tsvector @@ tsquery and
65	* tsquery @@ tsvector
66	*/
67	Datum
68	tsmatchsel(PG_FUNCTION_ARGS)
69	{
70	PlannerInfo root = (PlannerInfo ) PG_GETARG_POINTER(`0`);
71
72	#ifdef NOT_USED
73	Oid operator = PG_GETARG_OID(`1`);
74	#endif
75	List args = (List ) PG_GETARG_POINTER(`2`);
76	int varRelid = PG_GETARG_INT32(`3`);
77	VariableStatData vardata;
78	Node *other;
79	bool varonleft;
80	Selectivity selec;
81
82	/*
83	* If expression is not variable = something or something = variable, then
84	* punt and return a default estimate.
85	*/
86	if (!get_restriction_variable(root, args, varRelid,
87	&vardata, &other, &varonleft))
88	PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
89
90	/*
91	* Can't do anything useful if the something is not a constant, either.
92	*/
93	if (!IsA(other, Const))
94	{
95	ReleaseVariableStats(vardata);
96	PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
97	}
98
99	/*
100	* The "@@" operator is strict, so we can cope with NULL right away
101	*/
102	if (((Const *) other)->constisnull)
103	{
104	ReleaseVariableStats(vardata);
105	PG_RETURN_FLOAT8(`0.0`);
106	}
107
108	/*
109	* OK, there's a Var and a Const we're dealing with here. We need the
110	* Const to be a TSQuery, else we can't do anything useful. We have to
111	* check this because the Var might be the TSQuery not the TSVector.
112	*/
113	if (((Const *) other)->consttype == TSQUERYOID)
114	{
115	/ tsvector @@ tsquery or the other way around /
116	Assert(vardata.vartype == TSVECTOROID);
117
118	selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
119	}
120	else
121	{
122	/ If we can't see the query structure, must punt /
123	selec = DEFAULT_TS_MATCH_SEL;
124	}
125
126	ReleaseVariableStats(vardata);
127
128	CLAMP_PROBABILITY(selec);
129
130	PG_RETURN_FLOAT8((float8) selec);
131	}
132
133
134	/*
135	* tsmatchjoinsel -- join selectivity of "@@"
136	*
137	* join selectivity function for tsvector @@ tsquery and tsquery @@ tsvector
138	*/
139	Datum
140	tsmatchjoinsel(PG_FUNCTION_ARGS)
141	{
142	/ for the moment we just punt /
143	PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
144	}
145
146
147	/*
148	* @@ selectivity for tsvector var vs tsquery constant
149	*/
150	static Selectivity
151	tsquerysel(VariableStatData *vardata, Datum constval)
152	{
153	Selectivity selec;
154	TSQuery query;
155
156	/ The caller made sure the const is a TSQuery, so get it now /
157	query = DatumGetTSQuery(constval);
158
159	/ Empty query matches nothing /
160	if (query->size == `0`)
161	return (Selectivity) `0.0`;
162
163	if (HeapTupleIsValid(vardata->statsTuple))
164	{
165	Form_pg_statistic stats;
166	AttStatsSlot sslot;
167
168	stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
169
170	/ MCELEM will be an array of TEXT elements for a tsvector column /
171	if (get_attstatsslot(&sslot, vardata->statsTuple,
172	STATISTIC_KIND_MCELEM, InvalidOid,
173	ATTSTATSSLOT_VALUES \| ATTSTATSSLOT_NUMBERS))
174	{
175	/*
176	* There is a most-common-elements slot for the tsvector Var, so
177	* use that.
178	*/
179	selec = mcelem_tsquery_selec(query, sslot.values, sslot.nvalues,
180	sslot.numbers, sslot.nnumbers);
181	free_attstatsslot(&sslot);
182	}
183	else
184	{
185	/ No most-common-elements info, so do without /
186	selec = tsquery_opr_selec_no_stats(query);
187	}
188
189	/*
190	* MCE stats count only non-null rows, so adjust for null rows.
191	*/
192	selec *= (`1.0` - stats->stanullfrac);
193	}
194	else
195	{
196	/ No stats at all, so do without /
197	selec = tsquery_opr_selec_no_stats(query);
198	/ we assume no nulls here, so no stanullfrac correction /
199	}
200
201	return selec;
202	}
203
204	/*
205	* Extract data from the pg_statistic arrays into useful format.
206	*/
207	static Selectivity
208	mcelem_tsquery_selec(TSQuery query, Datum mcelem, int* nmcelem,
209	float4 numbers, int* nnumbers)
210	{
211	float4 minfreq;
212	TextFreq *lookup;
213	Selectivity selec;
214	int i;
215
216	/*
217	* There should be two more Numbers than Values, because the last two
218	* cells are taken for minimal and maximal frequency. Punt if not.
219	*
220	* (Note: the MCELEM statistics slot definition allows for a third extra
221	* number containing the frequency of nulls, but we're not expecting that
222	* to appear for a tsvector column.)
223	*/
224	if (nnumbers != nmcelem + `2`)
225	return tsquery_opr_selec_no_stats(query);
226
227	/*
228	* Transpose the data into a single array so we can use bsearch().
229	*/
230	lookup = (TextFreq ) palloc(sizeof(TextFreq) nmcelem);
231	for (i = `0`; i < nmcelem; i++)
232	{
233	/*
234	* The text Datums came from an array, so it cannot be compressed or
235	* stored out-of-line -- it's safe to use VARSIZE_ANY*.
236	*/
237	Assert(!VARATT_IS_COMPRESSED(mcelem[i]) && !VARATT_IS_EXTERNAL(mcelem[i]));
238	lookup[i].element = (text *) DatumGetPointer(mcelem[i]);
239	lookup[i].frequency = numbers[i];
240	}
241
242	/*
243	* Grab the lowest frequency. compute_tsvector_stats() stored it for us in
244	* the one before the last cell of the Numbers array. See ts_typanalyze.c
245	*/
246	minfreq = numbers[nnumbers - `2`];
247
248	selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
249	nmcelem, minfreq);
250
251	pfree(lookup);
252
253	return selec;
254	}
255
256	/*
257	* Traverse the tsquery in preorder, calculating selectivity as:
258	*
259	* selec(left_oper) * selec(right_oper) in AND & PHRASE nodes,
260	*
261	* selec(left_oper) + selec(right_oper) -
262	* selec(left_oper) * selec(right_oper) in OR nodes,
263	*
264	* 1 - select(oper) in NOT nodes
265	*
266	* histogram-based estimation in prefix VAL nodes
267	*
268	* freq[val] in exact VAL nodes, if the value is in MCELEM
269	* min(freq[MCELEM]) / 2 in VAL nodes, if it is not
270	*
271	* The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
272	* binary search for determining freq[MCELEM].
273	*
274	* If we don't have stats for the tsvector, we still use this logic,
275	* except we use default estimates for VAL nodes. This case is signaled
276	* by lookup == NULL.
277	*/
278	static Selectivity
279	tsquery_opr_selec(QueryItem item, char* *operand,
280	TextFreq lookup, int* length, float4 minfreq)
281	{
282	Selectivity selec;
283
284	/ since this function recurses, it could be driven to stack overflow /
285	check_stack_depth();
286
287	if (item->type == QI_VAL)
288	{
289	QueryOperand oper = (QueryOperand ) item;
290	LexemeKey key;
291
292	/*
293	* Prepare the key for bsearch().
294	*/
295	key.lexeme = operand + oper->distance;
296	key.length = oper->length;
297
298	if (oper->prefix)
299	{
300	/ Prefix match, ie the query item is lexeme:* /
301	Selectivity matched,
302	allmces;
303	int i,
304	n_matched;
305
306	/*
307	* Our strategy is to scan through the MCELEM list and combine the
308	* frequencies of the ones that match the prefix. We then
309	* extrapolate the fraction of matching MCELEMs to the remaining
310	* rows, assuming that the MCELEMs are representative of the whole
311	* lexeme population in this respect. (Compare
312	* histogram_selectivity().) Note that these are most common
313	* elements not most common values, so they're not mutually
314	* exclusive. We treat occurrences as independent events.
315	*
316	* This is only a good plan if we have a pretty fair number of
317	* MCELEMs available; we set the threshold at 100. If no stats or
318	* insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
319	*/
320	if (lookup == NULL \|\| length < `100`)
321	return (Selectivity) (DEFAULT_TS_MATCH_SEL * `4`);
322
323	matched = allmces = `0`;
324	n_matched = `0`;
325	for (i = `0`; i < length; i++)
326	{
327	TextFreq *t = lookup + i;
328	int tlen = VARSIZE_ANY_EXHDR(t->element);
329
330	if (tlen >= key.length &&
331	strncmp(key.lexeme, VARDATA_ANY(t->element),
332	key.length) == `0`)
333	{
334	matched += t->frequency - matched * t->frequency;
335	n_matched++;
336	}
337	allmces += t->frequency - allmces * t->frequency;
338	}
339
340	/ Clamp to ensure sanity in the face of roundoff error /
341	CLAMP_PROBABILITY(matched);
342	CLAMP_PROBABILITY(allmces);
343
344	selec = matched + (`1.0` - allmces) * ((double) n_matched / length);
345
346	/*
347	* In any case, never believe that a prefix match has selectivity
348	* less than we would assign for a non-MCELEM lexeme. This
349	* preserves the property that "word:*" should be estimated to
350	* match at least as many rows as "word" would be.
351	*/
352	selec = Max(Min(DEFAULT_TS_MATCH_SEL, minfreq / `2`), selec);
353	}
354	else
355	{
356	/ Regular exact lexeme match /
357	TextFreq *searchres;
358
359	/ If no stats for the variable, use DEFAULT_TS_MATCH_SEL /
360	if (lookup == NULL)
361	return (Selectivity) DEFAULT_TS_MATCH_SEL;
362
363	searchres = (TextFreq *) bsearch(&key, lookup, length,
364	sizeof(TextFreq),
365	compare_lexeme_textfreq);
366
367	if (searchres)
368	{
369	/*
370	* The element is in MCELEM. Return precise selectivity (or
371	* at least as precise as ANALYZE could find out).
372	*/
373	selec = searchres->frequency;
374	}
375	else
376	{
377	/*
378	* The element is not in MCELEM. Punt, but assume that the
379	* selectivity cannot be more than minfreq / 2.
380	*/
381	selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / `2`);
382	}
383	}
384	}
385	else
386	{
387	/ Current TSQuery node is an operator /
388	Selectivity s1,
389	s2;
390
391	switch (item->qoperator.oper)
392	{
393	case OP_NOT:
394	selec = `1.0` - tsquery_opr_selec(item + `1`, operand,
395	lookup, length, minfreq);
396	break;
397
398	case OP_PHRASE:
399	case OP_AND:
400	s1 = tsquery_opr_selec(item + `1`, operand,
401	lookup, length, minfreq);
402	s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
403	lookup, length, minfreq);
404	selec = s1 * s2;
405	break;
406
407	case OP_OR:
408	s1 = tsquery_opr_selec(item + `1`, operand,
409	lookup, length, minfreq);
410	s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
411	lookup, length, minfreq);
412	selec = s1 + s2 - s1 * s2;
413	break;
414
415	default:
416	elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
417	selec = `0`; / keep compiler quiet /
418	break;
419	}
420	}
421
422	/ Clamp intermediate results to stay sane despite roundoff error /
423	CLAMP_PROBABILITY(selec);
424
425	return selec;
426	}
427
428	/*
429	* bsearch() comparator for a lexeme (non-NULL terminated string with length)
430	* and a TextFreq. Use length, then byte-for-byte comparison, because that's
431	* how ANALYZE code sorted data before storing it in a statistic tuple.
432	* See ts_typanalyze.c for details.
433	*/
434	static int
435	compare_lexeme_textfreq(const void e1, const* void *e2)
436	{
437	const LexemeKey key = (const* LexemeKey *) e1;
438	const TextFreq t = (const* TextFreq *) e2;
439	int len1,
440	len2;
441
442	len1 = key->length;
443	len2 = VARSIZE_ANY_EXHDR(t->element);
444
445	/ Compare lengths first, possibly avoiding a strncmp call /
446	if (len1 > len2)
447	return `1`;
448	else if (len1 < len2)
449	return -`1`;
450
451	/ Fall back on byte-for-byte comparison /
452	return strncmp(key->lexeme, VARDATA_ANY(t->element), len1);
453	}
454

Browse the source code of PostgreSQL/src/backend/tsearch/ts_selfuncs.c