1/*****************************************************************************
2
3Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
4
5This program is free software; you can redistribute it and/or modify it under
6the terms of the GNU General Public License as published by the Free Software
7Foundation; version 2 of the License.
8
9This program is distributed in the hope that it will be useful, but WITHOUT
10ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12
13You should have received a copy of the GNU General Public License along with
14this program; if not, write to the Free Software Foundation, Inc.,
1551 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
16
17*****************************************************************************/
18
19/**************************************************//**
20@file fts/fts0ast.cc
21Full Text Search parser helper file.
22
23Created 2007/3/16 Sunny Bains.
24***********************************************************************/
25
26#include "ha_prototypes.h"
27
28#include "fts0ast.h"
29#include "fts0pars.h"
30#include "fts0fts.h"
31
32/* The FTS ast visit pass. */
33enum fts_ast_visit_pass_t {
34 FTS_PASS_FIRST, /*!< First visit pass,
35 process operators excluding
36 FTS_EXIST and FTS_IGNORE */
37 FTS_PASS_EXIST, /*!< Exist visit pass,
38 process operator FTS_EXIST */
39 FTS_PASS_IGNORE /*!< Ignore visit pass,
40 process operator FTS_IGNORE */
41};
42
43/******************************************************************//**
44Create an empty fts_ast_node_t.
45@return Create a new node */
46static
47fts_ast_node_t*
48fts_ast_node_create(void)
49/*=====================*/
50{
51 fts_ast_node_t* node;
52
53 node = (fts_ast_node_t*) ut_zalloc_nokey(sizeof(*node));
54
55 return(node);
56}
57
58/** Track node allocations, in case there is an error during parsing. */
59static
60void
61fts_ast_state_add_node(
62 fts_ast_state_t*state, /*!< in: ast instance */
63 fts_ast_node_t* node) /*!< in: node to add to ast */
64{
65 if (!state->list.head) {
66 ut_a(!state->list.tail);
67
68 state->list.head = state->list.tail = node;
69 } else {
70 state->list.tail->next_alloc = node;
71 state->list.tail = node;
72 }
73}
74
75/******************************************************************//**
76Create a operator fts_ast_node_t.
77@return new node */
78fts_ast_node_t*
79fts_ast_create_node_oper(
80/*=====================*/
81 void* arg, /*!< in: ast state instance */
82 fts_ast_oper_t oper) /*!< in: ast operator */
83{
84 fts_ast_node_t* node = fts_ast_node_create();
85
86 node->type = FTS_AST_OPER;
87 node->oper = oper;
88
89 fts_ast_state_add_node((fts_ast_state_t*) arg, node);
90
91 return(node);
92}
93
94/******************************************************************//**
95This function takes ownership of the ptr and is responsible
96for free'ing it
97@return new node or a node list with tokenized words */
98fts_ast_node_t*
99fts_ast_create_node_term(
100/*=====================*/
101 void* arg, /*!< in: ast state instance */
102 const fts_ast_string_t* ptr) /*!< in: ast term string */
103{
104 fts_ast_state_t* state = static_cast<fts_ast_state_t*>(arg);
105 ulint len = ptr->len;
106 ulint cur_pos = 0;
107 fts_ast_node_t* node = NULL;
108 fts_ast_node_t* node_list = NULL;
109 fts_ast_node_t* first_node = NULL;
110
111 /* Scan the incoming string and filter out any "non-word" characters */
112 while (cur_pos < len) {
113 fts_string_t str;
114 ulint cur_len;
115
116 cur_len = innobase_mysql_fts_get_token(
117 state->charset,
118 reinterpret_cast<const byte*>(ptr->str) + cur_pos,
119 reinterpret_cast<const byte*>(ptr->str) + len, &str);
120
121 if (cur_len == 0) {
122 break;
123 }
124
125 cur_pos += cur_len;
126
127 if (str.f_n_char > 0) {
128 /* If the subsequent term (after the first one)'s size
129 is less than fts_min_token_size or the term is greater
130 than fts_max_token_size, we shall ignore that. This is
131 to make consistent with MyISAM behavior */
132 if ((first_node && (str.f_n_char < fts_min_token_size))
133 || str.f_n_char > fts_max_token_size) {
134 continue;
135 }
136
137 node = fts_ast_node_create();
138
139 node->type = FTS_AST_TERM;
140
141 node->term.ptr = fts_ast_string_create(
142 str.f_str, str.f_len);
143
144 fts_ast_state_add_node(
145 static_cast<fts_ast_state_t*>(arg), node);
146
147 if (first_node) {
148 /* There is more than one word, create
149 a list to organize them */
150 if (!node_list) {
151 node_list = fts_ast_create_node_list(
152 static_cast<fts_ast_state_t*>(
153 arg),
154 first_node);
155 }
156
157 fts_ast_add_node(node_list, node);
158 } else {
159 first_node = node;
160 }
161 }
162 }
163
164 return((node_list != NULL) ? node_list : first_node);
165}
166
167/******************************************************************//**
168Create an AST term node, makes a copy of ptr for plugin parser
169@return node */
170fts_ast_node_t*
171fts_ast_create_node_term_for_parser(
172/*================================*/
173 void* arg, /*!< in: ast state */
174 const char* ptr, /*!< in: term string */
175 const ulint len) /*!< in: term string length */
176{
177 fts_ast_node_t* node = NULL;
178
179 /* '%' as first char is forbidden for LIKE in internal SQL parser;
180 '%' as last char is reserved for wildcard search;*/
181 if (len == 0 || len > FTS_MAX_WORD_LEN
182 || ptr[0] == '%' || ptr[len - 1] == '%') {
183 return(NULL);
184 }
185
186 node = fts_ast_node_create();
187
188 node->type = FTS_AST_TERM;
189
190 node->term.ptr = fts_ast_string_create(
191 reinterpret_cast<const byte*>(ptr), len);
192
193 fts_ast_state_add_node(static_cast<fts_ast_state_t*>(arg), node);
194
195 return(node);
196}
197
198/******************************************************************//**
199This function takes ownership of the ptr and is responsible
200for free'ing it.
201@return new node */
202fts_ast_node_t*
203fts_ast_create_node_text(
204/*=====================*/
205 void* arg, /*!< in: ast state instance */
206 const fts_ast_string_t* ptr) /*!< in: ast text string */
207{
208 ulint len = ptr->len;
209 fts_ast_node_t* node = NULL;
210
211 /* Once we come here, the string must have at least 2 quotes ""
212 around the query string, which could be empty. Also the query
213 string may contain 0x00 in it, we don't treat it as null-terminated. */
214 ut_ad(len >= 2);
215 ut_ad(ptr->str[0] == '\"' && ptr->str[len - 1] == '\"');
216
217 if (len == 2) {
218 /* If the query string contains nothing except quotes,
219 it's obviously an invalid query. */
220 return(NULL);
221 }
222
223 node = fts_ast_node_create();
224
225 /*!< We ignore the actual quotes "" */
226 len -= 2;
227
228 node->type = FTS_AST_TEXT;
229 /*!< Skip copying the first quote */
230 node->text.ptr = fts_ast_string_create(
231 reinterpret_cast<const byte*>(ptr->str + 1), len);
232 node->text.distance = ULINT_UNDEFINED;
233
234 fts_ast_state_add_node((fts_ast_state_t*) arg, node);
235
236 return(node);
237}
238
239/******************************************************************//**
240Create an AST phrase list node for plugin parser
241@return node */
242fts_ast_node_t*
243fts_ast_create_node_phrase_list(
244/*============================*/
245 void* arg) /*!< in: ast state */
246{
247 fts_ast_node_t* node = fts_ast_node_create();
248
249 node->type = FTS_AST_PARSER_PHRASE_LIST;
250
251 node->text.distance = ULINT_UNDEFINED;
252 node->list.head = node->list.tail = NULL;
253
254 fts_ast_state_add_node(static_cast<fts_ast_state_t*>(arg), node);
255
256 return(node);
257}
258
259/******************************************************************//**
260This function takes ownership of the expr and is responsible
261for free'ing it.
262@return new node */
263fts_ast_node_t*
264fts_ast_create_node_list(
265/*=====================*/
266 void* arg, /*!< in: ast state instance */
267 fts_ast_node_t* expr) /*!< in: ast expr instance */
268{
269 fts_ast_node_t* node = fts_ast_node_create();
270
271 node->type = FTS_AST_LIST;
272 node->list.head = node->list.tail = expr;
273
274 fts_ast_state_add_node((fts_ast_state_t*) arg, node);
275
276 return(node);
277}
278
279/******************************************************************//**
280Create a sub-expression list node. This function takes ownership of
281expr and is responsible for deleting it.
282@return new node */
283fts_ast_node_t*
284fts_ast_create_node_subexp_list(
285/*============================*/
286 void* arg, /*!< in: ast state instance */
287 fts_ast_node_t* expr) /*!< in: ast expr instance */
288{
289 fts_ast_node_t* node = fts_ast_node_create();
290
291 node->type = FTS_AST_SUBEXP_LIST;
292 node->list.head = node->list.tail = expr;
293
294 fts_ast_state_add_node((fts_ast_state_t*) arg, node);
295
296 return(node);
297}
298
299/******************************************************************//**
300Free an expr list node elements. */
301static
302void
303fts_ast_free_list(
304/*==============*/
305 fts_ast_node_t* node) /*!< in: ast node to free */
306{
307 ut_a(node->type == FTS_AST_LIST
308 || node->type == FTS_AST_SUBEXP_LIST
309 || node->type == FTS_AST_PARSER_PHRASE_LIST);
310
311 for (node = node->list.head;
312 node != NULL;
313 node = fts_ast_free_node(node)) {
314
315 /*!< No op */
316 }
317}
318
319/********************************************************************//**
320Free a fts_ast_node_t instance.
321@return next node to free */
322fts_ast_node_t*
323fts_ast_free_node(
324/*==============*/
325 fts_ast_node_t* node) /*!< in: the node to free */
326{
327 fts_ast_node_t* next_node;
328
329 switch (node->type) {
330 case FTS_AST_TEXT:
331 if (node->text.ptr) {
332 fts_ast_string_free(node->text.ptr);
333 node->text.ptr = NULL;
334 }
335 break;
336
337 case FTS_AST_TERM:
338 if (node->term.ptr) {
339 fts_ast_string_free(node->term.ptr);
340 node->term.ptr = NULL;
341 }
342 break;
343
344 case FTS_AST_LIST:
345 case FTS_AST_SUBEXP_LIST:
346 case FTS_AST_PARSER_PHRASE_LIST:
347 fts_ast_free_list(node);
348 node->list.head = node->list.tail = NULL;
349 break;
350
351 case FTS_AST_OPER:
352 break;
353
354 default:
355 ut_error;
356 }
357
358 /*!< Get next node before freeing the node itself */
359 next_node = node->next;
360
361 ut_free(node);
362
363 return(next_node);
364}
365
366/******************************************************************//**
367This AST takes ownership of the expr and is responsible
368for free'ing it.
369@return in param "list" */
370fts_ast_node_t*
371fts_ast_add_node(
372/*=============*/
373 fts_ast_node_t* node, /*!< in: list instance */
374 fts_ast_node_t* elem) /*!< in: node to add to list */
375{
376 if (!elem) {
377 return(NULL);
378 }
379
380 ut_a(!elem->next);
381 ut_a(node->type == FTS_AST_LIST
382 || node->type == FTS_AST_SUBEXP_LIST
383 || node->type == FTS_AST_PARSER_PHRASE_LIST);
384
385 if (!node->list.head) {
386 ut_a(!node->list.tail);
387
388 node->list.head = node->list.tail = elem;
389 } else {
390 ut_a(node->list.tail);
391
392 node->list.tail->next = elem;
393 node->list.tail = elem;
394 }
395
396 return(node);
397}
398
399/******************************************************************//**
400Set the wildcard attribute of a term. */
401void
402fts_ast_term_set_wildcard(
403/*======================*/
404 fts_ast_node_t* node) /*!< in/out: set attribute of
405 a term node */
406{
407 if (!node) {
408 return;
409 }
410
411 /* If it's a node list, the wildcard should be set to the tail node*/
412 if (node->type == FTS_AST_LIST) {
413 ut_ad(node->list.tail != NULL);
414 node = node->list.tail;
415 }
416
417 ut_a(node->type == FTS_AST_TERM);
418 ut_a(!node->term.wildcard);
419
420 node->term.wildcard = TRUE;
421}
422
423/******************************************************************//**
424Set the proximity attribute of a text node. */
425void
426fts_ast_text_set_distance(
427/*======================*/
428 fts_ast_node_t* node, /*!< in/out: text node */
429 ulint distance) /*!< in: the text proximity
430 distance */
431{
432 if (node == NULL) {
433 return;
434 }
435
436 ut_a(node->type == FTS_AST_TEXT);
437 ut_a(node->text.distance == ULINT_UNDEFINED);
438
439 node->text.distance = distance;
440}
441
442/******************************************************************//**
443Free node and expr allocations. */
444void
445fts_ast_state_free(
446/*===============*/
447 fts_ast_state_t*state) /*!< in: ast state to free */
448{
449 fts_ast_node_t* node = state->list.head;
450
451 /* Free the nodes that were allocated during parsing. */
452 while (node) {
453 fts_ast_node_t* next = node->next_alloc;
454
455 if (node->type == FTS_AST_TEXT && node->text.ptr) {
456 fts_ast_string_free(node->text.ptr);
457 node->text.ptr = NULL;
458 } else if (node->type == FTS_AST_TERM && node->term.ptr) {
459 fts_ast_string_free(node->term.ptr);
460 node->term.ptr = NULL;
461 }
462
463 ut_free(node);
464 node = next;
465 }
466
467 state->root = state->list.head = state->list.tail = NULL;
468}
469
470/** Print the ast string
471@param[in] str string to print */
472static
473void
474fts_ast_string_print(
475 const fts_ast_string_t* ast_str)
476{
477 for (ulint i = 0; i < ast_str->len; ++i) {
478 printf("%c", ast_str->str[i]);
479 }
480
481 printf("\n");
482}
483
484/******************************************************************//**
485Print an ast node recursively. */
486static
487void
488fts_ast_node_print_recursive(
489/*=========================*/
490 fts_ast_node_t* node, /*!< in: ast node to print */
491 ulint level) /*!< in: recursive level */
492{
493 /* Print alignment blank */
494 for (ulint i = 0; i < level; i++) {
495 printf(" ");
496 }
497
498 switch (node->type) {
499 case FTS_AST_TEXT:
500 printf("TEXT: ");
501 fts_ast_string_print(node->text.ptr);
502 break;
503
504 case FTS_AST_TERM:
505 printf("TERM: ");
506 fts_ast_string_print(node->term.ptr);
507 break;
508
509 case FTS_AST_LIST:
510 printf("LIST: \n");
511
512 for (node = node->list.head; node; node = node->next) {
513 fts_ast_node_print_recursive(node, level + 1);
514 }
515 break;
516
517 case FTS_AST_SUBEXP_LIST:
518 printf("SUBEXP_LIST: \n");
519
520 for (node = node->list.head; node; node = node->next) {
521 fts_ast_node_print_recursive(node, level + 1);
522 }
523 break;
524
525 case FTS_AST_OPER:
526 printf("OPER: %d\n", node->oper);
527 break;
528
529 case FTS_AST_PARSER_PHRASE_LIST:
530 printf("PARSER_PHRASE_LIST: \n");
531
532 for (node = node->list.head; node; node = node->next) {
533 fts_ast_node_print_recursive(node, level + 1);
534 }
535 break;
536
537 default:
538 ut_error;
539 }
540}
541
542/******************************************************************//**
543Print an ast node */
544void
545fts_ast_node_print(
546/*===============*/
547 fts_ast_node_t* node) /*!< in: ast node to print */
548{
549 fts_ast_node_print_recursive(node, 0);
550}
551
552/** Check only union operation involved in the node
553@param[in] node ast node to check
554@return true if the node contains only union else false. */
555bool
556fts_ast_node_check_union(
557 fts_ast_node_t* node)
558{
559 if (node->type == FTS_AST_LIST
560 || node->type == FTS_AST_SUBEXP_LIST
561 || node->type == FTS_AST_PARSER_PHRASE_LIST) {
562
563 for (node = node->list.head; node; node = node->next) {
564 if (!fts_ast_node_check_union(node)) {
565 return(false);
566 }
567 }
568
569 } else if (node->type == FTS_AST_OPER
570 && (node->oper == FTS_IGNORE
571 || node->oper == FTS_EXIST)) {
572
573 return(false);
574 } else if (node->type == FTS_AST_TEXT) {
575 /* Distance or phrase search query. */
576 return(false);
577 }
578
579 return(true);
580}
581
582/******************************************************************//**
583Traverse the AST - in-order traversal, except for the FTX_EXIST and FTS_IGNORE
584nodes, which will be ignored in the first pass of each level, and visited in a
585second and third pass after all other nodes in the same level are visited.
586@return DB_SUCCESS if all went well */
587dberr_t
588fts_ast_visit(
589/*==========*/
590 fts_ast_oper_t oper, /*!< in: current operator */
591 fts_ast_node_t* node, /*!< in: current root node */
592 fts_ast_callback visitor, /*!< in: callback function */
593 void* arg, /*!< in: arg for callback */
594 bool* has_ignore) /*!< out: true, if the operator
595 was ignored during processing,
596 currently we ignore FTS_EXIST
597 and FTS_IGNORE operators */
598{
599 dberr_t error = DB_SUCCESS;
600 fts_ast_node_t* oper_node = NULL;
601 fts_ast_node_t* start_node;
602 bool revisit = false;
603 bool will_be_ignored = false;
604 fts_ast_visit_pass_t visit_pass = FTS_PASS_FIRST;
605
606 start_node = node->list.head;
607
608 ut_a(node->type == FTS_AST_LIST
609 || node->type == FTS_AST_SUBEXP_LIST);
610
611 if (oper == FTS_EXIST_SKIP) {
612 visit_pass = FTS_PASS_EXIST;
613 } else if (oper == FTS_IGNORE_SKIP) {
614 visit_pass = FTS_PASS_IGNORE;
615 }
616
617 /* In the first pass of the tree, at the leaf level of the
618 tree, FTS_EXIST and FTS_IGNORE operation will be ignored.
619 It will be repeated at the level above the leaf level.
620
621 The basic idea here is that when we encounter FTS_EXIST or
622 FTS_IGNORE, we will change the operator node into FTS_EXIST_SKIP
623 or FTS_IGNORE_SKIP, and term node & text node with the operators
624 is ignored in the first pass. We have two passes during the revisit:
625 We process nodes with FTS_EXIST_SKIP in the exist pass, and then
626 process nodes with FTS_IGNORE_SKIP in the ignore pass.
627
628 The order should be restrictly followed, or we will get wrong results.
629 For example, we have a query 'a +b -c d +e -f'.
630 first pass: process 'a' and 'd' by union;
631 exist pass: process '+b' and '+e' by intersection;
632 ignore pass: process '-c' and '-f' by difference. */
633
634 for (node = node->list.head;
635 node && (error == DB_SUCCESS);
636 node = node->next) {
637
638 switch (node->type) {
639 case FTS_AST_LIST:
640 if (visit_pass != FTS_PASS_FIRST) {
641 break;
642 }
643
644 error = fts_ast_visit(oper, node, visitor,
645 arg, &will_be_ignored);
646
647 /* If will_be_ignored is set to true, then
648 we encountered and ignored a FTS_EXIST or FTS_IGNORE
649 operator. */
650 if (will_be_ignored) {
651 revisit = true;
652 /* Remember oper for list in case '-abc&def',
653 ignored oper is from previous node of list.*/
654 node->oper = oper;
655 }
656
657 break;
658
659 case FTS_AST_OPER:
660 oper = node->oper;
661 oper_node = node;
662
663 /* Change the operator for revisit */
664 if (oper == FTS_EXIST) {
665 oper_node->oper = FTS_EXIST_SKIP;
666 } else if (oper == FTS_IGNORE) {
667 oper_node->oper = FTS_IGNORE_SKIP;
668 }
669
670 break;
671
672 default:
673 if (node->visited) {
674 continue;
675 }
676
677 ut_a(oper == FTS_NONE || !oper_node
678 || oper_node->oper == oper
679 || oper_node->oper == FTS_EXIST_SKIP
680 || oper_node->oper == FTS_IGNORE_SKIP);
681
682 if (oper== FTS_EXIST || oper == FTS_IGNORE) {
683 *has_ignore = true;
684 continue;
685 }
686
687 /* Process leaf node accroding to its pass.*/
688 if (oper == FTS_EXIST_SKIP
689 && visit_pass == FTS_PASS_EXIST) {
690 error = visitor(FTS_EXIST, node, arg);
691 node->visited = true;
692 } else if (oper == FTS_IGNORE_SKIP
693 && visit_pass == FTS_PASS_IGNORE) {
694 error = visitor(FTS_IGNORE, node, arg);
695 node->visited = true;
696 } else if (visit_pass == FTS_PASS_FIRST) {
697 error = visitor(oper, node, arg);
698 node->visited = true;
699 }
700 }
701 }
702
703 if (revisit) {
704 /* Exist pass processes the skipped FTS_EXIST operation. */
705 for (node = start_node;
706 node && error == DB_SUCCESS;
707 node = node->next) {
708
709 if (node->type == FTS_AST_LIST
710 && node->oper != FTS_IGNORE) {
711 error = fts_ast_visit(FTS_EXIST_SKIP, node,
712 visitor, arg, &will_be_ignored);
713 }
714 }
715
716 /* Ignore pass processes the skipped FTS_IGNORE operation. */
717 for (node = start_node;
718 node && error == DB_SUCCESS;
719 node = node->next) {
720
721 if (node->type == FTS_AST_LIST) {
722 error = fts_ast_visit(FTS_IGNORE_SKIP, node,
723 visitor, arg, &will_be_ignored);
724 }
725 }
726 }
727
728 return(error);
729}
730
731/**
732Create an ast string object, with NUL-terminator, so the string
733has one more byte than len
734@param[in] str pointer to string
735@param[in] len length of the string
736@return ast string with NUL-terminator */
737fts_ast_string_t*
738fts_ast_string_create(
739 const byte* str,
740 ulint len)
741{
742 fts_ast_string_t* ast_str;
743
744 ut_ad(len > 0);
745
746 ast_str = static_cast<fts_ast_string_t*>(
747 ut_malloc_nokey(sizeof(fts_ast_string_t)));
748
749 ast_str->str = static_cast<byte*>(ut_malloc_nokey(len + 1));
750
751 ast_str->len = len;
752 memcpy(ast_str->str, str, len);
753 ast_str->str[len] = '\0';
754
755 return(ast_str);
756}
757
758/**
759Free an ast string instance
760@param[in,out] ast_str string to free */
761void
762fts_ast_string_free(
763 fts_ast_string_t* ast_str)
764{
765 if (ast_str != NULL) {
766 ut_free(ast_str->str);
767 ut_free(ast_str);
768 }
769}
770
771/**
772Translate ast string of type FTS_AST_NUMB to unsigned long by strtoul
773@param[in] str string to translate
774@param[in] base the base
775@return translated number */
776ulint
777fts_ast_string_to_ul(
778 const fts_ast_string_t* ast_str,
779 int base)
780{
781 return(strtoul(reinterpret_cast<const char*>(ast_str->str),
782 NULL, base));
783}
784
785#ifdef UNIV_DEBUG
786const char*
787fts_ast_node_type_get(fts_ast_type_t type)
788{
789 switch (type) {
790 case FTS_AST_OPER:
791 return("FTS_AST_OPER");
792 case FTS_AST_NUMB:
793 return("FTS_AST_NUMB");
794 case FTS_AST_TERM:
795 return("FTS_AST_TERM");
796 case FTS_AST_TEXT:
797 return("FTS_AST_TEXT");
798 case FTS_AST_LIST:
799 return("FTS_AST_LIST");
800 case FTS_AST_SUBEXP_LIST:
801 return("FTS_AST_SUBEXP_LIST");
802 case FTS_AST_PARSER_PHRASE_LIST:
803 return("FTS_AST_PARSER_PHRASE_LIST");
804 }
805 ut_ad(0);
806 return("FTS_UNKNOWN");
807}
808#endif /* UNIV_DEBUG */
809