1/*-------------------------------------------------------------------------
2 *
3 * tsquery_cleanup.c
4 * Cleanup query from NOT values and/or stopword
5 * Utility functions to correct work.
6 *
7 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/utils/adt/tsquery_cleanup.c
12 *
13 *-------------------------------------------------------------------------
14 */
15
16#include "postgres.h"
17
18#include "tsearch/ts_utils.h"
19#include "miscadmin.h"
20
21typedef struct NODE
22{
23 struct NODE *left;
24 struct NODE *right;
25 QueryItem *valnode;
26} NODE;
27
28/*
29 * make query tree from plain view of query
30 */
31static NODE *
32maketree(QueryItem *in)
33{
34 NODE *node = (NODE *) palloc(sizeof(NODE));
35
36 /* since this function recurses, it could be driven to stack overflow. */
37 check_stack_depth();
38
39 node->valnode = in;
40 node->right = node->left = NULL;
41 if (in->type == QI_OPR)
42 {
43 node->right = maketree(in + 1);
44 if (in->qoperator.oper != OP_NOT)
45 node->left = maketree(in + in->qoperator.left);
46 }
47 return node;
48}
49
50/*
51 * Internal state for plaintree and plainnode
52 */
53typedef struct
54{
55 QueryItem *ptr;
56 int len; /* allocated size of ptr */
57 int cur; /* number of elements in ptr */
58} PLAINTREE;
59
60static void
61plainnode(PLAINTREE *state, NODE *node)
62{
63 /* since this function recurses, it could be driven to stack overflow. */
64 check_stack_depth();
65
66 if (state->cur == state->len)
67 {
68 state->len *= 2;
69 state->ptr = (QueryItem *) repalloc((void *) state->ptr, state->len * sizeof(QueryItem));
70 }
71 memcpy((void *) &(state->ptr[state->cur]), (void *) node->valnode, sizeof(QueryItem));
72 if (node->valnode->type == QI_VAL)
73 state->cur++;
74 else if (node->valnode->qoperator.oper == OP_NOT)
75 {
76 state->ptr[state->cur].qoperator.left = 1;
77 state->cur++;
78 plainnode(state, node->right);
79 }
80 else
81 {
82 int cur = state->cur;
83
84 state->cur++;
85 plainnode(state, node->right);
86 state->ptr[cur].qoperator.left = state->cur - cur;
87 plainnode(state, node->left);
88 }
89 pfree(node);
90}
91
92/*
93 * make plain view of tree from a NODE-tree representation
94 */
95static QueryItem *
96plaintree(NODE *root, int *len)
97{
98 PLAINTREE pl;
99
100 pl.cur = 0;
101 pl.len = 16;
102 if (root && (root->valnode->type == QI_VAL || root->valnode->type == QI_OPR))
103 {
104 pl.ptr = (QueryItem *) palloc(pl.len * sizeof(QueryItem));
105 plainnode(&pl, root);
106 }
107 else
108 pl.ptr = NULL;
109 *len = pl.cur;
110 return pl.ptr;
111}
112
113static void
114freetree(NODE *node)
115{
116 /* since this function recurses, it could be driven to stack overflow. */
117 check_stack_depth();
118
119 if (!node)
120 return;
121 if (node->left)
122 freetree(node->left);
123 if (node->right)
124 freetree(node->right);
125 pfree(node);
126}
127
128/*
129 * clean tree for ! operator.
130 * It's useful for debug, but in
131 * other case, such view is used with search in index.
132 * Operator ! always return TRUE
133 */
134static NODE *
135clean_NOT_intree(NODE *node)
136{
137 /* since this function recurses, it could be driven to stack overflow. */
138 check_stack_depth();
139
140 if (node->valnode->type == QI_VAL)
141 return node;
142
143 if (node->valnode->qoperator.oper == OP_NOT)
144 {
145 freetree(node);
146 return NULL;
147 }
148
149 /* operator & or | */
150 if (node->valnode->qoperator.oper == OP_OR)
151 {
152 if ((node->left = clean_NOT_intree(node->left)) == NULL ||
153 (node->right = clean_NOT_intree(node->right)) == NULL)
154 {
155 freetree(node);
156 return NULL;
157 }
158 }
159 else
160 {
161 NODE *res = node;
162
163 Assert(node->valnode->qoperator.oper == OP_AND ||
164 node->valnode->qoperator.oper == OP_PHRASE);
165
166 node->left = clean_NOT_intree(node->left);
167 node->right = clean_NOT_intree(node->right);
168 if (node->left == NULL && node->right == NULL)
169 {
170 pfree(node);
171 res = NULL;
172 }
173 else if (node->left == NULL)
174 {
175 res = node->right;
176 pfree(node);
177 }
178 else if (node->right == NULL)
179 {
180 res = node->left;
181 pfree(node);
182 }
183 return res;
184 }
185 return node;
186}
187
188QueryItem *
189clean_NOT(QueryItem *ptr, int *len)
190{
191 NODE *root = maketree(ptr);
192
193 return plaintree(clean_NOT_intree(root), len);
194}
195
196
197/*
198 * Remove QI_VALSTOP (stopword) nodes from query tree.
199 *
200 * Returns NULL if the query degenerates to nothing. Input must not be NULL.
201 *
202 * When we remove a phrase operator due to removing one or both of its
203 * arguments, we might need to adjust the distance of a parent phrase
204 * operator. For example, 'a' is a stopword, so:
205 * (b <-> a) <-> c should become b <2> c
206 * b <-> (a <-> c) should become b <2> c
207 * (b <-> (a <-> a)) <-> c should become b <3> c
208 * b <-> ((a <-> a) <-> c) should become b <3> c
209 * To handle that, we define two output parameters:
210 * ladd: amount to add to a phrase distance to the left of this node
211 * radd: amount to add to a phrase distance to the right of this node
212 * We need two outputs because we could need to bubble up adjustments to two
213 * different parent phrase operators. Consider
214 * w <-> (((a <-> x) <2> (y <3> a)) <-> z)
215 * After we've removed the two a's and are considering the <2> node (which is
216 * now just x <2> y), we have an ladd distance of 1 that needs to propagate
217 * up to the topmost (leftmost) <->, and an radd distance of 3 that needs to
218 * propagate to the rightmost <->, so that we'll end up with
219 * w <2> ((x <2> y) <4> z)
220 * Near the bottom of the tree, we may have subtrees consisting only of
221 * stopwords. The distances of any phrase operators within such a subtree are
222 * summed and propagated to both ladd and radd, since we don't know which side
223 * of the lowest surviving phrase operator we are in. The rule is that any
224 * subtree that degenerates to NULL must return equal values of ladd and radd,
225 * and the parent node dealing with it should incorporate only one of those.
226 *
227 * Currently, we only implement this adjustment for adjacent phrase operators.
228 * Thus for example 'x <-> ((a <-> y) | z)' will become 'x <-> (y | z)', which
229 * isn't ideal, but there is no way to represent the really desired semantics
230 * without some redesign of the tsquery structure. Certainly it would not be
231 * any better to convert that to 'x <2> (y | z)'. Since this is such a weird
232 * corner case, let it go for now. But we can fix it in cases where the
233 * intervening non-phrase operator also gets removed, for example
234 * '((x <-> a) | a) <-> y' will become 'x <2> y'.
235 */
236static NODE *
237clean_stopword_intree(NODE *node, int *ladd, int *radd)
238{
239 /* since this function recurses, it could be driven to stack overflow. */
240 check_stack_depth();
241
242 /* default output parameters indicate no change in parent distance */
243 *ladd = *radd = 0;
244
245 if (node->valnode->type == QI_VAL)
246 return node;
247 else if (node->valnode->type == QI_VALSTOP)
248 {
249 pfree(node);
250 return NULL;
251 }
252
253 Assert(node->valnode->type == QI_OPR);
254
255 if (node->valnode->qoperator.oper == OP_NOT)
256 {
257 /* NOT doesn't change pattern width, so just report child distances */
258 node->right = clean_stopword_intree(node->right, ladd, radd);
259 if (!node->right)
260 {
261 freetree(node);
262 return NULL;
263 }
264 }
265 else
266 {
267 NODE *res = node;
268 bool isphrase;
269 int ndistance,
270 lladd,
271 lradd,
272 rladd,
273 rradd;
274
275 /* First, recurse */
276 node->left = clean_stopword_intree(node->left, &lladd, &lradd);
277 node->right = clean_stopword_intree(node->right, &rladd, &rradd);
278
279 /* Check if current node is OP_PHRASE, get its distance */
280 isphrase = (node->valnode->qoperator.oper == OP_PHRASE);
281 ndistance = isphrase ? node->valnode->qoperator.distance : 0;
282
283 if (node->left == NULL && node->right == NULL)
284 {
285 /*
286 * When we collapse out a phrase node entirely, propagate its own
287 * distance into both *ladd and *radd; it is the responsibility of
288 * the parent node to count it only once. Also, for a phrase
289 * node, distances coming from children are summed and propagated
290 * up to parent (we assume lladd == lradd and rladd == rradd, else
291 * rule was broken at a lower level). But if this isn't a phrase
292 * node, take the larger of the two child distances; that
293 * corresponds to what TS_execute will do in non-stopword cases.
294 */
295 if (isphrase)
296 *ladd = *radd = lladd + ndistance + rladd;
297 else
298 *ladd = *radd = Max(lladd, rladd);
299 freetree(node);
300 return NULL;
301 }
302 else if (node->left == NULL)
303 {
304 /* Removing this operator and left subnode */
305 /* lladd and lradd are equal/redundant, don't count both */
306 if (isphrase)
307 {
308 /* operator's own distance must propagate to left */
309 *ladd = lladd + ndistance + rladd;
310 *radd = rradd;
311 }
312 else
313 {
314 /* at non-phrase op, just forget the left subnode entirely */
315 *ladd = rladd;
316 *radd = rradd;
317 }
318 res = node->right;
319 pfree(node);
320 }
321 else if (node->right == NULL)
322 {
323 /* Removing this operator and right subnode */
324 /* rladd and rradd are equal/redundant, don't count both */
325 if (isphrase)
326 {
327 /* operator's own distance must propagate to right */
328 *ladd = lladd;
329 *radd = lradd + ndistance + rradd;
330 }
331 else
332 {
333 /* at non-phrase op, just forget the right subnode entirely */
334 *ladd = lladd;
335 *radd = lradd;
336 }
337 res = node->left;
338 pfree(node);
339 }
340 else if (isphrase)
341 {
342 /* Absorb appropriate corrections at this level */
343 node->valnode->qoperator.distance += lradd + rladd;
344 /* Propagate up any unaccounted-for corrections */
345 *ladd = lladd;
346 *radd = rradd;
347 }
348 else
349 {
350 /* We're keeping a non-phrase operator, so ladd/radd remain 0 */
351 }
352
353 return res;
354 }
355 return node;
356}
357
358/*
359 * Number of elements in query tree
360 */
361static int32
362calcstrlen(NODE *node)
363{
364 int32 size = 0;
365
366 if (node->valnode->type == QI_VAL)
367 {
368 size = node->valnode->qoperand.length + 1;
369 }
370 else
371 {
372 Assert(node->valnode->type == QI_OPR);
373
374 size = calcstrlen(node->right);
375 if (node->valnode->qoperator.oper != OP_NOT)
376 size += calcstrlen(node->left);
377 }
378
379 return size;
380}
381
382/*
383 * Remove QI_VALSTOP (stopword) nodes from TSQuery.
384 */
385TSQuery
386cleanup_tsquery_stopwords(TSQuery in)
387{
388 int32 len,
389 lenstr,
390 commonlen,
391 i;
392 NODE *root;
393 int ladd,
394 radd;
395 TSQuery out;
396 QueryItem *items;
397 char *operands;
398
399 if (in->size == 0)
400 return in;
401
402 /* eliminate stop words */
403 root = clean_stopword_intree(maketree(GETQUERY(in)), &ladd, &radd);
404 if (root == NULL)
405 {
406 ereport(NOTICE,
407 (errmsg("text-search query contains only stop words or doesn't contain lexemes, ignored")));
408 out = palloc(HDRSIZETQ);
409 out->size = 0;
410 SET_VARSIZE(out, HDRSIZETQ);
411 return out;
412 }
413
414 /*
415 * Build TSQuery from plain view
416 */
417
418 lenstr = calcstrlen(root);
419 items = plaintree(root, &len);
420 commonlen = COMPUTESIZE(len, lenstr);
421
422 out = palloc(commonlen);
423 SET_VARSIZE(out, commonlen);
424 out->size = len;
425
426 memcpy(GETQUERY(out), items, len * sizeof(QueryItem));
427
428 items = GETQUERY(out);
429 operands = GETOPERAND(out);
430 for (i = 0; i < out->size; i++)
431 {
432 QueryOperand *op = (QueryOperand *) &items[i];
433
434 if (op->type != QI_VAL)
435 continue;
436
437 memcpy(operands, GETOPERAND(in) + op->distance, op->length);
438 operands[op->length] = '\0';
439 op->distance = operands - GETOPERAND(out);
440 operands += op->length + 1;
441 }
442
443 return out;
444}
445