1/*****************************************************************************
2
3Copyright (c) 2007, 2017, Oracle and/or its affiliates. All Rights Reserved.
4Copyright (c) 2017, 2018, MariaDB Corporation.
5
6This program is free software; you can redistribute it and/or modify it under
7the terms of the GNU General Public License as published by the Free Software
8Foundation; version 2 of the License.
9
10This program is distributed in the hope that it will be useful, but WITHOUT
11ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
14You should have received a copy of the GNU General Public License along with
15this program; if not, write to the Free Software Foundation, Inc.,
1651 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
17
18*****************************************************************************/
19
20/**************************************************//**
21@file fts/fts0que.cc
22Full Text Search functionality.
23
24Created 2007/03/27 Sunny Bains
25Completed 2011/7/10 Sunny and Jimmy Yang
26*******************************************************/
27
28#include "ha_prototypes.h"
29
30#include "dict0dict.h"
31#include "ut0rbt.h"
32#include "row0sel.h"
33#include "fts0fts.h"
34#include "fts0priv.h"
35#include "fts0ast.h"
36#include "fts0pars.h"
37#include "fts0types.h"
38#include "fts0plugin.h"
39#include "ut0new.h"
40
41#include <iomanip>
42#include <vector>
43
44#define FTS_ELEM(t, n, i, j) (t[(i) * n + (j)])
45
46#define RANK_DOWNGRADE (-1.0F)
47#define RANK_UPGRADE (1.0F)
48
49/* Maximum number of words supported in a phrase or proximity search. */
50#define MAX_PROXIMITY_ITEM 128
51
52/* Memory used by rbt itself for create and node add */
53#define SIZEOF_RBT_CREATE sizeof(ib_rbt_t) + sizeof(ib_rbt_node_t) * 2
54#define SIZEOF_RBT_NODE_ADD sizeof(ib_rbt_node_t)
55
56/*Initial byte length for 'words' in fts_ranking_t */
57#define RANKING_WORDS_INIT_LEN 4
58
59// FIXME: Need to have a generic iterator that traverses the ilist.
60
61typedef std::vector<fts_string_t, ut_allocator<fts_string_t> > word_vector_t;
62
63struct fts_word_freq_t;
64
65/** State of an FTS query. */
66struct fts_query_t {
67 mem_heap_t* heap; /*!< Heap to use for allocations */
68
69 trx_t* trx; /*!< The query transaction */
70
71 dict_index_t* index; /*!< The FTS index to search */
72 /*!< FTS auxiliary common table def */
73
74 fts_table_t fts_common_table;
75
76 fts_table_t fts_index_table;/*!< FTS auxiliary index table def */
77
78 ulint total_size; /*!< total memory size used by query */
79
80 fts_doc_ids_t* deleted; /*!< Deleted doc ids that need to be
81 filtered from the output */
82
83 fts_ast_node_t* root; /*!< Abstract syntax tree */
84
85 fts_ast_node_t* cur_node; /*!< Current tree node */
86
87 ib_rbt_t* word_map; /*!< Matched word map for
88 searching by word*/
89
90 word_vector_t* word_vector; /*!< Matched word vector for
91 searching by index */
92
93 ib_rbt_t* doc_ids; /*!< The current set of matching
94 doc ids, elements are of
95 type fts_ranking_t */
96
97 ib_rbt_t* intersection; /*!< The doc ids that were found in
98 doc_ids, this tree will become
99 the new doc_ids, elements are of type
100 fts_ranking_t */
101
102 /*!< Prepared statement to read the
103 nodes from the FTS INDEX */
104 que_t* read_nodes_graph;
105
106 fts_ast_oper_t oper; /*!< Current boolean mode operator */
107
108 /*!< TRUE if we want to collect the
109 word positions within the document */
110 ibool collect_positions;
111
112 ulint flags; /*!< Specify the full text search type,
113 such as boolean search, phrase
114 search, proximity search etc. */
115
116 ulint distance; /*!< The proximity distance of a
117 phrase search. */
118
119 /*!< These doc ids are used as a
120 boundary condition when searching the
121 FTS index rows */
122
123 doc_id_t lower_doc_id; /*!< Lowest doc id in doc_ids */
124
125 doc_id_t upper_doc_id; /*!< Highest doc id in doc_ids */
126
127 bool boolean_mode; /*!< TRUE if boolean mode query */
128
129 ib_vector_t* matched; /*!< Array of matching documents
130 (fts_match_t) to search for a phrase */
131
132 ib_vector_t** match_array; /*!< Used for proximity search, contains
133 position info for each matched word
134 in the word list */
135
136 ib_uint64_t total_docs; /*!< The total number of documents */
137
138 ulint total_words; /*!< The total number of words */
139
140 dberr_t error; /*!< Error code if any, that is
141 encountered during query processing */
142
143 ib_rbt_t* word_freqs; /*!< RB tree of word frequencies per
144 document, its elements are of type
145 fts_word_freq_t */
146
147 ib_rbt_t* wildcard_words; /*!< words with wildcard */
148
149 bool multi_exist; /*!< multiple FTS_EXIST oper */
150
151 st_mysql_ftparser* parser; /*!< fts plugin parser */
152};
153
154/** For phrase matching, first we collect the documents and the positions
155then we match. */
156struct fts_match_t {
157 doc_id_t doc_id; /*!< Document id */
158
159 ulint start; /*!< Start the phrase match from
160 this offset within the positions
161 vector. */
162
163 ib_vector_t* positions; /*!< Offsets of a word in a
164 document */
165};
166
167/** For matching tokens in a phrase search. We use this data structure in
168the callback that determines whether a document should be accepted or
169rejected for a phrase search. */
170struct fts_select_t {
171 doc_id_t doc_id; /*!< The document id to match */
172
173 ulint min_pos; /*!< For found to be TRUE at least
174 one position must be greater than
175 min_pos. */
176
177 ibool found; /*!< TRUE if found */
178
179 fts_word_freq_t*
180 word_freq; /*!< Word frequency instance of the
181 current word being looked up in
182 the FTS index */
183};
184
185typedef std::vector<ulint, ut_allocator<ulint> > pos_vector_t;
186
187/** structure defines a set of ranges for original documents, each of which
188has a minimum position and maximum position. Text in such range should
189contain all words in the proximity search. We will need to count the
190words in such range to make sure it is less than the specified distance
191of the proximity search */
192struct fts_proximity_t {
193 ulint n_pos; /*!< number of position set, defines
194 a range (min to max) containing all
195 matching words */
196 pos_vector_t min_pos; /*!< the minimum position (in bytes)
197 of the range */
198 pos_vector_t max_pos; /*!< the maximum position (in bytes)
199 of the range */
200};
201
202/** The match positions and tokesn to match */
203struct fts_phrase_t {
204 fts_phrase_t(const dict_table_t* table)
205 :
206 found(false),
207 match(NULL),
208 tokens(NULL),
209 distance(0),
210 charset(NULL),
211 heap(NULL),
212 page_size(dict_table_page_size(table)),
213 proximity_pos(NULL),
214 parser(NULL)
215 {
216 }
217
218 /** Match result */
219 ibool found;
220
221 /** Positions within text */
222 const fts_match_t* match;
223
224 /** Tokens to match */
225 const ib_vector_t* tokens;
226
227 /** For matching on proximity distance. Can be 0 for exact match */
228 ulint distance;
229
230 /** Phrase match charset */
231 CHARSET_INFO* charset;
232
233 /** Heap for word processing */
234 mem_heap_t* heap;
235
236 /** Row page size */
237 const page_size_t page_size;
238
239 /** Position info for proximity search verification. Records the
240 min and max position of words matched */
241 fts_proximity_t* proximity_pos;
242
243 /** FTS plugin parser */
244 st_mysql_ftparser* parser;
245};
246
247/** Paramter passed to fts phrase match by parser */
248struct fts_phrase_param_t {
249 fts_phrase_t* phrase; /*!< Match phrase instance */
250 ulint token_index; /*!< Index of token to match next */
251 mem_heap_t* heap; /*!< Heap for word processing */
252};
253
254/** For storing the frequncy of a word/term in a document */
255struct fts_doc_freq_t {
256 doc_id_t doc_id; /*!< Document id */
257 ulint freq; /*!< Frequency of a word in a document */
258};
259
260/** To determine the word frequency per document. */
261struct fts_word_freq_t {
262 fts_string_t word; /*!< Word for which we need the freq,
263 it's allocated on the query heap */
264
265 ib_rbt_t* doc_freqs; /*!< RB Tree for storing per document
266 word frequencies. The elements are
267 of type fts_doc_freq_t */
268 ib_uint64_t doc_count; /*!< Total number of documents that
269 contain this word */
270 double idf; /*!< Inverse document frequency */
271};
272
273/********************************************************************
274Callback function to fetch the rows in an FTS INDEX record.
275@return always TRUE */
276static
277ibool
278fts_query_index_fetch_nodes(
279/*========================*/
280 void* row, /*!< in: sel_node_t* */
281 void* user_arg); /*!< in: pointer to ib_vector_t */
282
283/********************************************************************
284Read and filter nodes.
285@return fts_node_t instance */
286static
287dberr_t
288fts_query_filter_doc_ids(
289/*=====================*/
290 fts_query_t* query, /*!< in: query instance */
291 const fts_string_t* word, /*!< in: the current word */
292 fts_word_freq_t* word_freq, /*!< in/out: word frequency */
293 const fts_node_t* node, /*!< in: current FTS node */
294 void* data, /*!< in: doc id ilist */
295 ulint len, /*!< in: doc id ilist size */
296 ibool calc_doc_count);/*!< in: whether to remember doc
297 count */
298
299/** Process (nested) sub-expression, create a new result set to store the
300sub-expression result by processing nodes under current sub-expression
301list. Merge the sub-expression result with that of parent expression list.
302@param[in,out] node current root node
303@param[in,out] visitor callback function
304@param[in,out] arg argument for callback
305@return DB_SUCCESS if all go well */
306static
307dberr_t
308fts_ast_visit_sub_exp(
309 fts_ast_node_t* node,
310 fts_ast_callback visitor,
311 void* arg);
312
313#if 0
314/*****************************************************************//***
315Find a doc_id in a word's ilist.
316@return TRUE if found. */
317static
318ibool
319fts_query_find_doc_id(
320/*==================*/
321 fts_select_t* select, /*!< in/out: search the doc id selected,
322 update the frequency if found. */
323 void* data, /*!< in: doc id ilist */
324 ulint len); /*!< in: doc id ilist size */
325#endif
326
327/*************************************************************//**
328This function implements a simple "blind" query expansion search:
329words in documents found in the first search pass will be used as
330search arguments to search the document again, thus "expand"
331the search result set.
332@return DB_SUCCESS if success, otherwise the error code */
333static
334dberr_t
335fts_expand_query(
336/*=============*/
337 dict_index_t* index, /*!< in: FTS index to search */
338 fts_query_t* query) /*!< in: query result, to be freed
339 by the client */
340 MY_ATTRIBUTE((nonnull, warn_unused_result));
341/*************************************************************//**
342This function finds documents that contain all words in a
343phrase or proximity search. And if proximity search, verify
344the words are close enough to each other, as in specified distance.
345This function is called for phrase and proximity search.
346@return TRUE if documents are found, FALSE if otherwise */
347static
348ibool
349fts_phrase_or_proximity_search(
350/*===========================*/
351 fts_query_t* query, /*!< in/out: query instance
352 query->doc_ids might be instantiated
353 with qualified doc IDs */
354 ib_vector_t* tokens); /*!< in: Tokens contain words */
355/*************************************************************//**
356This function checks whether words in result documents are close to
357each other (within proximity range as specified by "distance").
358If "distance" is MAX_ULINT, then it will find all combinations of
359positions of matching words and store min and max positions
360in the "qualified_pos" for later verification.
361@return true if words are close to each other, false if otherwise */
362static
363bool
364fts_proximity_get_positions(
365/*========================*/
366 fts_match_t** match, /*!< in: query instance */
367 ulint num_match, /*!< in: number of matching
368 items */
369 ulint distance, /*!< in: distance value
370 for proximity search */
371 fts_proximity_t* qualified_pos); /*!< out: the position info
372 records ranges containing
373 all matching words. */
374#if 0
375/********************************************************************
376Get the total number of words in a documents. */
377static
378ulint
379fts_query_terms_in_document(
380/*========================*/
381 /*!< out: DB_SUCCESS if all go well
382 else error code */
383 fts_query_t* query, /*!< in: FTS query state */
384 doc_id_t doc_id, /*!< in: the word to check */
385 ulint* total); /*!< out: total words in document */
386#endif
387
388/********************************************************************
389Compare two fts_doc_freq_t doc_ids.
390@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
391UNIV_INLINE
392int
393fts_freq_doc_id_cmp(
394/*================*/
395 const void* p1, /*!< in: id1 */
396 const void* p2) /*!< in: id2 */
397{
398 const fts_doc_freq_t* fq1 = (const fts_doc_freq_t*) p1;
399 const fts_doc_freq_t* fq2 = (const fts_doc_freq_t*) p2;
400
401 return((int) (fq1->doc_id - fq2->doc_id));
402}
403
404#if 0
405/*******************************************************************//**
406Print the table used for calculating LCS. */
407static
408void
409fts_print_lcs_table(
410/*================*/
411 const ulint* table, /*!< in: array to print */
412 ulint n_rows, /*!< in: total no. of rows */
413 ulint n_cols) /*!< in: total no. of cols */
414{
415 ulint i;
416
417 for (i = 0; i < n_rows; ++i) {
418 ulint j;
419
420 printf("\n");
421
422 for (j = 0; j < n_cols; ++j) {
423
424 printf("%2lu ", FTS_ELEM(table, n_cols, i, j));
425 }
426 }
427}
428
429/********************************************************************
430Find the longest common subsequence between the query string and
431the document. */
432static
433ulint
434fts_query_lcs(
435/*==========*/
436 /*!< out: LCS (length) between
437 two ilists */
438 const ulint* p1, /*!< in: word positions of query */
439 ulint len_p1, /*!< in: no. of elements in p1 */
440 const ulint* p2, /*!< in: word positions within document */
441 ulint len_p2) /*!< in: no. of elements in p2 */
442{
443 int i;
444 ulint len = 0;
445 ulint r = len_p1;
446 ulint c = len_p2;
447 ulint size = (r + 1) * (c + 1) * sizeof(ulint);
448 ulint* table = (ulint*) ut_malloc_nokey(size);
449
450 /* Traverse the table backwards, from the last row to the first and
451 also from the last column to the first. We compute the smaller
452 common subsequeces first, then use the caluclated values to determine
453 the longest common subsequence. The result will be in TABLE[0][0]. */
454 for (i = r; i >= 0; --i) {
455 int j;
456
457 for (j = c; j >= 0; --j) {
458
459 if (p1[i] == (ulint) -1 || p2[j] == (ulint) -1) {
460
461 FTS_ELEM(table, c, i, j) = 0;
462
463 } else if (p1[i] == p2[j]) {
464
465 FTS_ELEM(table, c, i, j) = FTS_ELEM(
466 table, c, i + 1, j + 1) + 1;
467
468 } else {
469
470 ulint value;
471
472 value = ut_max(
473 FTS_ELEM(table, c, i + 1, j),
474 FTS_ELEM(table, c, i, j + 1));
475
476 FTS_ELEM(table, c, i, j) = value;
477 }
478 }
479 }
480
481 len = FTS_ELEM(table, c, 0, 0);
482
483 fts_print_lcs_table(table, r, c);
484 printf("\nLen=" ULINTPF "\n", len);
485
486 ut_free(table);
487
488 return(len);
489}
490#endif
491
492/*******************************************************************//**
493Compare two fts_ranking_t instance on their rank value and doc ids in
494descending order on the rank and ascending order on doc id.
495@return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */
496static
497int
498fts_query_compare_rank(
499/*===================*/
500 const void* p1, /*!< in: pointer to elem */
501 const void* p2) /*!< in: pointer to elem */
502{
503 const fts_ranking_t* r1 = (const fts_ranking_t*) p1;
504 const fts_ranking_t* r2 = (const fts_ranking_t*) p2;
505
506 if (r2->rank < r1->rank) {
507 return(-1);
508 } else if (r2->rank == r1->rank) {
509
510 if (r1->doc_id < r2->doc_id) {
511 return(1);
512 } else if (r1->doc_id > r2->doc_id) {
513 return(1);
514 }
515
516 return(0);
517 }
518
519 return(1);
520}
521
522/*******************************************************************//**
523Create words in ranking */
524static
525void
526fts_ranking_words_create(
527/*=====================*/
528 fts_query_t* query, /*!< in: query instance */
529 fts_ranking_t* ranking) /*!< in: ranking instance */
530{
531 ranking->words = static_cast<byte*>(
532 mem_heap_zalloc(query->heap, RANKING_WORDS_INIT_LEN));
533 ranking->words_len = RANKING_WORDS_INIT_LEN;
534}
535
536/*
537The optimization here is using a char array(bitmap) to replace words rb tree
538in fts_ranking_t.
539
540It can save lots of memory except in some cases of QUERY EXPANSION.
541
542'word_map' is used as a word dictionary, in which the key is a word, the value
543is a number. In 'fts_ranking_words_add', we first check if the word is in 'word_map'.
544if not, we add it into 'word_map', and give it a position(actually a number).
545then we set the corresponding bit to '1' at the position in the char array 'words'.
546
547'word_vector' is a useful backup of 'word_map', and we can get a word by its position,
548more quickly than searching by value in 'word_map'. we use 'word_vector'
549in 'fts_query_calculate_ranking' and 'fts_expand_query'. In the two functions, we need
550to scan the bitmap 'words', and get a word when a bit is '1', then we get word_freq
551by the word.
552*/
553
554/*******************************************************************//**
555Add a word into ranking */
556static
557void
558fts_ranking_words_add(
559/*==================*/
560 fts_query_t* query, /*!< in: query instance */
561 fts_ranking_t* ranking, /*!< in: ranking instance */
562 const fts_string_t* word) /*!< in: term/word to add */
563{
564 ulint pos;
565 ulint byte_offset;
566 ulint bit_offset;
567 ib_rbt_bound_t parent;
568
569 /* Note: we suppose the word map and vector are append-only. */
570 ut_ad(query->word_vector->size() == rbt_size(query->word_map));
571
572 /* We use ib_rbt to simulate a map, f_n_char means position. */
573 if (rbt_search(query->word_map, &parent, word) == 0) {
574 fts_string_t* result_word;
575
576 result_word = rbt_value(fts_string_t, parent.last);
577 pos = result_word->f_n_char;
578 ut_ad(pos < rbt_size(query->word_map));
579 } else {
580 /* Add the word to map. */
581 fts_string_t new_word;
582
583 pos = rbt_size(query->word_map);
584
585 fts_string_dup(&new_word, word, query->heap);
586 new_word.f_n_char = pos;
587
588 rbt_add_node(query->word_map, &parent, &new_word);
589 ut_ad(rbt_validate(query->word_map));
590 query->word_vector->push_back(new_word);
591 }
592
593 /* Check words len */
594 byte_offset = pos / CHAR_BIT;
595 if (byte_offset >= ranking->words_len) {
596 byte* words = ranking->words;
597 ulint words_len = ranking->words_len;
598
599 while (byte_offset >= words_len) {
600 words_len *= 2;
601 }
602
603 ranking->words = static_cast<byte*>(
604 mem_heap_zalloc(query->heap, words_len));
605 ut_memcpy(ranking->words, words, ranking->words_len);
606 ranking->words_len = words_len;
607 }
608
609 /* Set ranking words */
610 ut_ad(byte_offset < ranking->words_len);
611 bit_offset = pos % CHAR_BIT;
612 ranking->words[byte_offset] |= 1 << bit_offset;
613}
614
615/*******************************************************************//**
616Get a word from a ranking
617@return true if it's successful */
618static
619bool
620fts_ranking_words_get_next(
621/*=======================*/
622 const fts_query_t* query, /*!< in: query instance */
623 fts_ranking_t* ranking,/*!< in: ranking instance */
624 ulint* pos, /*!< in/out: word start pos */
625 fts_string_t* word) /*!< in/out: term/word to add */
626{
627 bool ret = false;
628 ulint max_pos = ranking->words_len * CHAR_BIT;
629
630 /* Search for next word */
631 while (*pos < max_pos) {
632 ulint byte_offset = *pos / CHAR_BIT;
633 ulint bit_offset = *pos % CHAR_BIT;
634
635 if (ranking->words[byte_offset] & (1 << bit_offset)) {
636 ret = true;
637 break;
638 }
639
640 *pos += 1;
641 };
642
643 /* Get next word from word vector */
644 if (ret) {
645 ut_ad(*pos < query->word_vector->size());
646 *word = query->word_vector->at((size_t)*pos);
647 *pos += 1;
648 }
649
650 return ret;
651}
652
653/*******************************************************************//**
654Add a word if it doesn't exist, to the term freq RB tree. We store
655a pointer to the word that is passed in as the argument.
656@return pointer to word */
657static
658fts_word_freq_t*
659fts_query_add_word_freq(
660/*====================*/
661 fts_query_t* query, /*!< in: query instance */
662 const fts_string_t* word) /*!< in: term/word to add */
663{
664 ib_rbt_bound_t parent;
665
666 /* Lookup the word in our rb tree and add if it doesn't exist. */
667 if (rbt_search(query->word_freqs, &parent, word) != 0) {
668 fts_word_freq_t word_freq;
669
670 memset(&word_freq, 0, sizeof(word_freq));
671
672 fts_string_dup(&word_freq.word, word, query->heap);
673
674 word_freq.doc_count = 0;
675
676 word_freq.doc_freqs = rbt_create(
677 sizeof(fts_doc_freq_t), fts_freq_doc_id_cmp);
678
679 parent.last = rbt_add_node(
680 query->word_freqs, &parent, &word_freq);
681
682 query->total_size += word->f_len
683 + SIZEOF_RBT_CREATE
684 + SIZEOF_RBT_NODE_ADD
685 + sizeof(fts_word_freq_t);
686 }
687
688 return(rbt_value(fts_word_freq_t, parent.last));
689}
690
691/*******************************************************************//**
692Add a doc id if it doesn't exist, to the doc freq RB tree.
693@return pointer to word */
694static
695fts_doc_freq_t*
696fts_query_add_doc_freq(
697/*===================*/
698 fts_query_t* query, /*!< in: query instance */
699 ib_rbt_t* doc_freqs, /*!< in: rb tree of fts_doc_freq_t */
700 doc_id_t doc_id) /*!< in: doc id to add */
701{
702 ib_rbt_bound_t parent;
703
704 /* Lookup the doc id in our rb tree and add if it doesn't exist. */
705 if (rbt_search(doc_freqs, &parent, &doc_id) != 0) {
706 fts_doc_freq_t doc_freq;
707
708 memset(&doc_freq, 0, sizeof(doc_freq));
709
710 doc_freq.freq = 0;
711 doc_freq.doc_id = doc_id;
712
713 parent.last = rbt_add_node(doc_freqs, &parent, &doc_freq);
714
715 query->total_size += SIZEOF_RBT_NODE_ADD
716 + sizeof(fts_doc_freq_t);
717 }
718
719 return(rbt_value(fts_doc_freq_t, parent.last));
720}
721
722/*******************************************************************//**
723Add the doc id to the query set only if it's not in the
724deleted array. */
725static
726void
727fts_query_union_doc_id(
728/*===================*/
729 fts_query_t* query, /*!< in: query instance */
730 doc_id_t doc_id, /*!< in: the doc id to add */
731 fts_rank_t rank) /*!< in: if non-zero, it is the
732 rank associated with the doc_id */
733{
734 ib_rbt_bound_t parent;
735 ulint size = ib_vector_size(query->deleted->doc_ids);
736 fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data;
737
738 /* Check if the doc id is deleted and it's not already in our set. */
739 if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0
740 && rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
741
742 fts_ranking_t ranking;
743
744 ranking.rank = rank;
745 ranking.doc_id = doc_id;
746 fts_ranking_words_create(query, &ranking);
747
748 rbt_add_node(query->doc_ids, &parent, &ranking);
749
750 query->total_size += SIZEOF_RBT_NODE_ADD
751 + sizeof(fts_ranking_t) + RANKING_WORDS_INIT_LEN;
752 }
753}
754
755/*******************************************************************//**
756Remove the doc id from the query set only if it's not in the
757deleted set. */
758static
759void
760fts_query_remove_doc_id(
761/*====================*/
762 fts_query_t* query, /*!< in: query instance */
763 doc_id_t doc_id) /*!< in: the doc id to add */
764{
765 ib_rbt_bound_t parent;
766 ulint size = ib_vector_size(query->deleted->doc_ids);
767 fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data;
768
769 /* Check if the doc id is deleted and it's in our set. */
770 if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0
771 && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
772 ut_free(rbt_remove_node(query->doc_ids, parent.last));
773
774 ut_ad(query->total_size >=
775 SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
776 query->total_size -= SIZEOF_RBT_NODE_ADD
777 + sizeof(fts_ranking_t);
778 }
779}
780
781/*******************************************************************//**
782Find the doc id in the query set but not in the deleted set, artificialy
783downgrade or upgrade its ranking by a value and make/initialize its ranking
784under or above its normal range 0 to 1. This is used for Boolean Search
785operator such as Negation operator, which makes word's contribution to the
786row's relevance to be negative */
787static
788void
789fts_query_change_ranking(
790/*====================*/
791 fts_query_t* query, /*!< in: query instance */
792 doc_id_t doc_id, /*!< in: the doc id to add */
793 ibool downgrade) /*!< in: Whether to downgrade ranking */
794{
795 ib_rbt_bound_t parent;
796 ulint size = ib_vector_size(query->deleted->doc_ids);
797 fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data;
798
799 /* Check if the doc id is deleted and it's in our set. */
800 if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0
801 && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
802
803 fts_ranking_t* ranking;
804
805 ranking = rbt_value(fts_ranking_t, parent.last);
806
807 ranking->rank += downgrade ? RANK_DOWNGRADE : RANK_UPGRADE;
808
809 /* Allow at most 2 adjustment by RANK_DOWNGRADE (-0.5)
810 and RANK_UPGRADE (0.5) */
811 if (ranking->rank >= 1.0F) {
812 ranking->rank = 1.0F;
813 } else if (ranking->rank <= -1.0F) {
814 ranking->rank = -1.0F;
815 }
816 }
817}
818
819/*******************************************************************//**
820Check the doc id in the query set only if it's not in the
821deleted array. The doc ids that were found are stored in
822another rb tree (fts_query_t::intersect). */
823static
824void
825fts_query_intersect_doc_id(
826/*=======================*/
827 fts_query_t* query, /*!< in: query instance */
828 doc_id_t doc_id, /*!< in: the doc id to add */
829 fts_rank_t rank) /*!< in: if non-zero, it is the
830 rank associated with the doc_id */
831{
832 ib_rbt_bound_t parent;
833 ulint size = ib_vector_size(query->deleted->doc_ids);
834 fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data;
835 fts_ranking_t* ranking= NULL;
836
837 /* There are three types of intersect:
838 1. '+a': doc_ids is empty, add doc into intersect if it matches 'a'.
839 2. 'a +b': docs match 'a' is in doc_ids, add doc into intersect
840 if it matches 'b'. if the doc is also in doc_ids, then change the
841 doc's rank, and add 'a' in doc's words.
842 3. '+a +b': docs matching '+a' is in doc_ids, add doc into intsersect
843 if it matches 'b' and it's in doc_ids.(multi_exist = true). */
844
845 /* Check if the doc id is deleted and it's in our set */
846 if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0) {
847 fts_ranking_t new_ranking;
848
849 if (rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
850 if (query->multi_exist) {
851 return;
852 } else {
853 new_ranking.words = NULL;
854 }
855 } else {
856 ranking = rbt_value(fts_ranking_t, parent.last);
857
858 /* We've just checked the doc id before */
859 if (ranking->words == NULL) {
860 ut_ad(rbt_search(query->intersection, &parent,
861 ranking) == 0);
862 return;
863 }
864
865 /* Merge rank */
866 rank += ranking->rank;
867 if (rank >= 1.0F) {
868 rank = 1.0F;
869 } else if (rank <= -1.0F) {
870 rank = -1.0F;
871 }
872
873 /* Take words */
874 new_ranking.words = ranking->words;
875 new_ranking.words_len = ranking->words_len;
876 }
877
878 new_ranking.rank = rank;
879 new_ranking.doc_id = doc_id;
880
881 if (rbt_search(query->intersection, &parent,
882 &new_ranking) != 0) {
883 if (new_ranking.words == NULL) {
884 fts_ranking_words_create(query, &new_ranking);
885
886 query->total_size += RANKING_WORDS_INIT_LEN;
887 } else {
888 /* Note that the intersection has taken
889 ownership of the ranking data. */
890 ranking->words = NULL;
891 }
892
893 rbt_add_node(query->intersection,
894 &parent, &new_ranking);
895
896 query->total_size += SIZEOF_RBT_NODE_ADD
897 + sizeof(fts_ranking_t);
898 }
899 }
900}
901
902/*******************************************************************//**
903Free the document ranking rb tree. */
904static
905void
906fts_query_free_doc_ids(
907/*===================*/
908 fts_query_t* query, /*!< in: query instance */
909 ib_rbt_t* doc_ids) /*!< in: rb tree to free */
910{
911 const ib_rbt_node_t* node;
912
913 for (node = rbt_first(doc_ids); node; node = rbt_first(doc_ids)) {
914
915 fts_ranking_t* ranking;
916
917 ranking = rbt_value(fts_ranking_t, node);
918
919 if (ranking->words) {
920 ranking->words = NULL;
921 }
922
923 ut_free(rbt_remove_node(doc_ids, node));
924
925 ut_ad(query->total_size >=
926 SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
927 query->total_size -= SIZEOF_RBT_NODE_ADD
928 + sizeof(fts_ranking_t);
929 }
930
931 rbt_free(doc_ids);
932
933 ut_ad(query->total_size >= SIZEOF_RBT_CREATE);
934 query->total_size -= SIZEOF_RBT_CREATE;
935}
936
937/*******************************************************************//**
938Add the word to the documents "list" of matching words from
939the query. We make a copy of the word from the query heap. */
940static
941void
942fts_query_add_word_to_document(
943/*===========================*/
944 fts_query_t* query, /*!< in: query to update */
945 doc_id_t doc_id, /*!< in: the document to update */
946 const fts_string_t* word) /*!< in: the token to add */
947{
948 ib_rbt_bound_t parent;
949 fts_ranking_t* ranking = NULL;
950
951 if (query->flags == FTS_OPT_RANKING) {
952 return;
953 }
954
955 /* First we search the intersection RB tree as it could have
956 taken ownership of the words rb tree instance. */
957 if (query->intersection
958 && rbt_search(query->intersection, &parent, &doc_id) == 0) {
959
960 ranking = rbt_value(fts_ranking_t, parent.last);
961 }
962
963 if (ranking == NULL
964 && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
965
966 ranking = rbt_value(fts_ranking_t, parent.last);
967 }
968
969 if (ranking != NULL) {
970 fts_ranking_words_add(query, ranking, word);
971 }
972}
973
974/*******************************************************************//**
975Check the node ilist. */
976static
977void
978fts_query_check_node(
979/*=================*/
980 fts_query_t* query, /*!< in: query to update */
981 const fts_string_t* token, /*!< in: the token to search */
982 const fts_node_t* node) /*!< in: node to check */
983{
984 /* Skip nodes whose doc ids are out range. */
985 if (query->oper == FTS_EXIST
986 && ((query->upper_doc_id > 0
987 && node->first_doc_id > query->upper_doc_id)
988 || (query->lower_doc_id > 0
989 && node->last_doc_id < query->lower_doc_id))) {
990
991 /* Ignore */
992
993 } else {
994 int ret;
995 ib_rbt_bound_t parent;
996 ulint ilist_size = node->ilist_size;
997 fts_word_freq_t*word_freqs;
998
999 /* The word must exist. */
1000 ret = rbt_search(query->word_freqs, &parent, token);
1001 ut_a(ret == 0);
1002
1003 word_freqs = rbt_value(fts_word_freq_t, parent.last);
1004
1005 query->error = fts_query_filter_doc_ids(
1006 query, token, word_freqs, node,
1007 node->ilist, ilist_size, TRUE);
1008 }
1009}
1010
1011/*****************************************************************//**
1012Search index cache for word with wildcard match.
1013@return number of words matched */
1014static
1015ulint
1016fts_cache_find_wildcard(
1017/*====================*/
1018 fts_query_t* query, /*!< in: query instance */
1019 const fts_index_cache_t*index_cache, /*!< in: cache to search */
1020 const fts_string_t* token) /*!< in: token to search */
1021{
1022 ib_rbt_bound_t parent;
1023 const ib_vector_t* nodes = NULL;
1024 fts_string_t srch_text;
1025 byte term[FTS_MAX_WORD_LEN + 1];
1026 ulint num_word = 0;
1027
1028 srch_text.f_len = (token->f_str[token->f_len - 1] == '%')
1029 ? token->f_len - 1
1030 : token->f_len;
1031
1032 strncpy((char*) term, (char*) token->f_str, srch_text.f_len);
1033 term[srch_text.f_len] = '\0';
1034 srch_text.f_str = term;
1035
1036 /* Lookup the word in the rb tree */
1037 if (rbt_search_cmp(index_cache->words, &parent, &srch_text, NULL,
1038 innobase_fts_text_cmp_prefix) == 0) {
1039 const fts_tokenizer_word_t* word;
1040 ulint i;
1041 const ib_rbt_node_t* cur_node;
1042 ibool forward = FALSE;
1043
1044 word = rbt_value(fts_tokenizer_word_t, parent.last);
1045 cur_node = parent.last;
1046
1047 while (innobase_fts_text_cmp_prefix(
1048 index_cache->charset, &srch_text, &word->text) == 0) {
1049
1050 nodes = word->nodes;
1051
1052 for (i = 0; nodes && i < ib_vector_size(nodes); ++i) {
1053 int ret;
1054 const fts_node_t* node;
1055 ib_rbt_bound_t freq_parent;
1056 fts_word_freq_t* word_freqs;
1057
1058 node = static_cast<const fts_node_t*>(
1059 ib_vector_get_const(nodes, i));
1060
1061 ret = rbt_search(query->word_freqs,
1062 &freq_parent,
1063 &srch_text);
1064
1065 ut_a(ret == 0);
1066
1067 word_freqs = rbt_value(
1068 fts_word_freq_t,
1069 freq_parent.last);
1070
1071 query->error = fts_query_filter_doc_ids(
1072 query, &srch_text,
1073 word_freqs, node,
1074 node->ilist, node->ilist_size, TRUE);
1075
1076 if (query->error != DB_SUCCESS) {
1077 return(0);
1078 }
1079 }
1080
1081 num_word++;
1082
1083 if (!forward) {
1084 cur_node = rbt_prev(
1085 index_cache->words, cur_node);
1086 } else {
1087cont_search:
1088 cur_node = rbt_next(
1089 index_cache->words, cur_node);
1090 }
1091
1092 if (!cur_node) {
1093 break;
1094 }
1095
1096 word = rbt_value(fts_tokenizer_word_t, cur_node);
1097 }
1098
1099 if (!forward) {
1100 forward = TRUE;
1101 cur_node = parent.last;
1102 goto cont_search;
1103 }
1104 }
1105
1106 return(num_word);
1107}
1108
1109/*****************************************************************//**
1110Set difference.
1111@return DB_SUCCESS if all go well */
1112static MY_ATTRIBUTE((nonnull, warn_unused_result))
1113dberr_t
1114fts_query_difference(
1115/*=================*/
1116 fts_query_t* query, /*!< in: query instance */
1117 const fts_string_t* token) /*!< in: token to search */
1118{
1119 ulint n_doc_ids= 0;
1120 trx_t* trx = query->trx;
1121 dict_table_t* table = query->index->table;
1122
1123 ut_a(query->oper == FTS_IGNORE);
1124
1125#ifdef FTS_INTERNAL_DIAG_PRINT
1126 {
1127 ib::info out;
1128 out << "DIFFERENCE: Searching: '";
1129 out.write(token->f_str, token->f_len);
1130 out << "'";
1131 }
1132#endif
1133
1134 if (query->doc_ids) {
1135 n_doc_ids = rbt_size(query->doc_ids);
1136 }
1137
1138 /* There is nothing we can substract from an empty set. */
1139 if (query->doc_ids && !rbt_empty(query->doc_ids)) {
1140 ulint i;
1141 fts_fetch_t fetch;
1142 const ib_vector_t* nodes;
1143 const fts_index_cache_t*index_cache;
1144 que_t* graph = NULL;
1145 fts_cache_t* cache = table->fts->cache;
1146 dberr_t error;
1147
1148 rw_lock_x_lock(&cache->lock);
1149
1150 index_cache = fts_find_index_cache(cache, query->index);
1151
1152 /* Must find the index cache */
1153 ut_a(index_cache != NULL);
1154
1155 /* Search the cache for a matching word first. */
1156 if (query->cur_node->term.wildcard
1157 && query->flags != FTS_PROXIMITY
1158 && query->flags != FTS_PHRASE) {
1159 fts_cache_find_wildcard(query, index_cache, token);
1160 } else {
1161 nodes = fts_cache_find_word(index_cache, token);
1162
1163 for (i = 0; nodes && i < ib_vector_size(nodes)
1164 && query->error == DB_SUCCESS; ++i) {
1165 const fts_node_t* node;
1166
1167 node = static_cast<const fts_node_t*>(
1168 ib_vector_get_const(nodes, i));
1169
1170 fts_query_check_node(query, token, node);
1171 }
1172 }
1173
1174 rw_lock_x_unlock(&cache->lock);
1175
1176 /* error is passed by 'query->error' */
1177 if (query->error != DB_SUCCESS) {
1178 ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1179 return(query->error);
1180 }
1181
1182 /* Setup the callback args for filtering and
1183 consolidating the ilist. */
1184 fetch.read_arg = query;
1185 fetch.read_record = fts_query_index_fetch_nodes;
1186
1187 error = fts_index_fetch_nodes(
1188 trx, &graph, &query->fts_index_table, token, &fetch);
1189
1190 /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1191 ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1192 if (error != DB_SUCCESS) {
1193 query->error = error;
1194 }
1195
1196 fts_que_graph_free(graph);
1197 }
1198
1199 /* The size can't increase. */
1200 ut_a(rbt_size(query->doc_ids) <= n_doc_ids);
1201
1202 return(query->error);
1203}
1204
1205/*****************************************************************//**
1206Intersect the token doc ids with the current set.
1207@return DB_SUCCESS if all go well */
1208static MY_ATTRIBUTE((nonnull, warn_unused_result))
1209dberr_t
1210fts_query_intersect(
1211/*================*/
1212 fts_query_t* query, /*!< in: query instance */
1213 const fts_string_t* token) /*!< in: the token to search */
1214{
1215 trx_t* trx = query->trx;
1216 dict_table_t* table = query->index->table;
1217
1218 ut_a(query->oper == FTS_EXIST);
1219
1220#ifdef FTS_INTERNAL_DIAG_PRINT
1221 {
1222 ib::info out;
1223 out << "INTERSECT: Searching: '";
1224 out.write(token->f_str, token->f_len);
1225 out << "'";
1226 }
1227#endif
1228
1229 /* If the words set is not empty and multi exist is true,
1230 we know the intersection set is empty in advance. */
1231 if (!(rbt_empty(query->doc_ids) && query->multi_exist)) {
1232 ulint n_doc_ids = 0;
1233 ulint i;
1234 fts_fetch_t fetch;
1235 const ib_vector_t* nodes;
1236 const fts_index_cache_t*index_cache;
1237 que_t* graph = NULL;
1238 fts_cache_t* cache = table->fts->cache;
1239 dberr_t error;
1240
1241 ut_a(!query->intersection);
1242
1243 n_doc_ids = rbt_size(query->doc_ids);
1244
1245 /* Create the rb tree that will hold the doc ids of
1246 the intersection. */
1247 query->intersection = rbt_create(
1248 sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
1249
1250 query->total_size += SIZEOF_RBT_CREATE;
1251
1252 /* This is to avoid decompressing the ilist if the
1253 node's ilist doc ids are out of range. */
1254 if (!rbt_empty(query->doc_ids) && query->multi_exist) {
1255 const ib_rbt_node_t* node;
1256 doc_id_t* doc_id;
1257
1258 node = rbt_first(query->doc_ids);
1259 doc_id = rbt_value(doc_id_t, node);
1260 query->lower_doc_id = *doc_id;
1261
1262 node = rbt_last(query->doc_ids);
1263 doc_id = rbt_value(doc_id_t, node);
1264 query->upper_doc_id = *doc_id;
1265
1266 } else {
1267 query->lower_doc_id = 0;
1268 query->upper_doc_id = 0;
1269 }
1270
1271 /* Search the cache for a matching word first. */
1272
1273 rw_lock_x_lock(&cache->lock);
1274
1275 /* Search for the index specific cache. */
1276 index_cache = fts_find_index_cache(cache, query->index);
1277
1278 /* Must find the index cache. */
1279 ut_a(index_cache != NULL);
1280
1281 if (query->cur_node->term.wildcard) {
1282 /* Wildcard search the index cache */
1283 fts_cache_find_wildcard(query, index_cache, token);
1284 } else {
1285 nodes = fts_cache_find_word(index_cache, token);
1286
1287 for (i = 0; nodes && i < ib_vector_size(nodes)
1288 && query->error == DB_SUCCESS; ++i) {
1289 const fts_node_t* node;
1290
1291 node = static_cast<const fts_node_t*>(
1292 ib_vector_get_const(nodes, i));
1293
1294 fts_query_check_node(query, token, node);
1295 }
1296 }
1297
1298 rw_lock_x_unlock(&cache->lock);
1299
1300 /* error is passed by 'query->error' */
1301 if (query->error != DB_SUCCESS) {
1302 ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1303 return(query->error);
1304 }
1305
1306 /* Setup the callback args for filtering and
1307 consolidating the ilist. */
1308 fetch.read_arg = query;
1309 fetch.read_record = fts_query_index_fetch_nodes;
1310
1311 error = fts_index_fetch_nodes(
1312 trx, &graph, &query->fts_index_table, token, &fetch);
1313
1314 /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1315 ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1316 if (error != DB_SUCCESS) {
1317 query->error = error;
1318 }
1319
1320 fts_que_graph_free(graph);
1321
1322 if (query->error == DB_SUCCESS) {
1323 /* Make the intesection (rb tree) the current doc id
1324 set and free the old set. */
1325 fts_query_free_doc_ids(query, query->doc_ids);
1326 query->doc_ids = query->intersection;
1327 query->intersection = NULL;
1328
1329 ut_a(!query->multi_exist || (query->multi_exist
1330 && rbt_size(query->doc_ids) <= n_doc_ids));
1331 }
1332 }
1333
1334 return(query->error);
1335}
1336
1337/*****************************************************************//**
1338Query index cache.
1339@return DB_SUCCESS if all go well */
1340static
1341dberr_t
1342fts_query_cache(
1343/*============*/
1344 fts_query_t* query, /*!< in/out: query instance */
1345 const fts_string_t* token) /*!< in: token to search */
1346{
1347 const fts_index_cache_t*index_cache;
1348 dict_table_t* table = query->index->table;
1349 fts_cache_t* cache = table->fts->cache;
1350
1351 /* Search the cache for a matching word first. */
1352 rw_lock_x_lock(&cache->lock);
1353
1354 /* Search for the index specific cache. */
1355 index_cache = fts_find_index_cache(cache, query->index);
1356
1357 /* Must find the index cache. */
1358 ut_a(index_cache != NULL);
1359
1360 if (query->cur_node->term.wildcard
1361 && query->flags != FTS_PROXIMITY
1362 && query->flags != FTS_PHRASE) {
1363 /* Wildcard search the index cache */
1364 fts_cache_find_wildcard(query, index_cache, token);
1365 } else {
1366 const ib_vector_t* nodes;
1367 ulint i;
1368
1369 nodes = fts_cache_find_word(index_cache, token);
1370
1371 for (i = 0; nodes && i < ib_vector_size(nodes)
1372 && query->error == DB_SUCCESS; ++i) {
1373 const fts_node_t* node;
1374
1375 node = static_cast<const fts_node_t*>(
1376 ib_vector_get_const(nodes, i));
1377
1378 fts_query_check_node(query, token, node);
1379 }
1380 }
1381
1382 rw_lock_x_unlock(&cache->lock);
1383
1384 return(query->error);
1385}
1386
1387/*****************************************************************//**
1388Set union.
1389@return DB_SUCCESS if all go well */
1390static MY_ATTRIBUTE((nonnull, warn_unused_result))
1391dberr_t
1392fts_query_union(
1393/*============*/
1394 fts_query_t* query, /*!< in: query instance */
1395 fts_string_t* token) /*!< in: token to search */
1396{
1397 fts_fetch_t fetch;
1398 ulint n_doc_ids = 0;
1399 trx_t* trx = query->trx;
1400 que_t* graph = NULL;
1401 dberr_t error;
1402
1403 ut_a(query->oper == FTS_NONE || query->oper == FTS_DECR_RATING ||
1404 query->oper == FTS_NEGATE || query->oper == FTS_INCR_RATING);
1405
1406#ifdef FTS_INTERNAL_DIAG_PRINT
1407 {
1408 ib::info out;
1409 out << "UNION: Searching: '";
1410 out.write(token->f_str, token->f_len);
1411 out << "'";
1412 }
1413#endif
1414
1415 if (query->doc_ids) {
1416 n_doc_ids = rbt_size(query->doc_ids);
1417 }
1418
1419 if (token->f_len == 0) {
1420 return(query->error);
1421 }
1422
1423 fts_query_cache(query, token);
1424
1425 /* Setup the callback args for filtering and
1426 consolidating the ilist. */
1427 fetch.read_arg = query;
1428 fetch.read_record = fts_query_index_fetch_nodes;
1429
1430 /* Read the nodes from disk. */
1431 error = fts_index_fetch_nodes(
1432 trx, &graph, &query->fts_index_table, token, &fetch);
1433
1434 /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
1435 ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1436 if (error != DB_SUCCESS) {
1437 query->error = error;
1438 }
1439
1440 fts_que_graph_free(graph);
1441
1442 if (query->error == DB_SUCCESS) {
1443
1444 /* The size can't decrease. */
1445 ut_a(rbt_size(query->doc_ids) >= n_doc_ids);
1446
1447 /* Calulate the number of doc ids that were added to
1448 the current doc id set. */
1449 if (query->doc_ids) {
1450 n_doc_ids = rbt_size(query->doc_ids) - n_doc_ids;
1451 }
1452 }
1453
1454 return(query->error);
1455}
1456
1457/*****************************************************************//**
1458Depending upon the current query operator process the doc id.
1459return DB_SUCCESS if all go well
1460or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */
1461static
1462dberr_t
1463fts_query_process_doc_id(
1464/*=====================*/
1465 fts_query_t* query, /*!< in: query instance */
1466 doc_id_t doc_id, /*!< in: doc id to process */
1467 fts_rank_t rank) /*!< in: if non-zero, it is the
1468 rank associated with the doc_id */
1469{
1470 if (query->flags == FTS_OPT_RANKING) {
1471 return(DB_SUCCESS);
1472 }
1473
1474 switch (query->oper) {
1475 case FTS_NONE:
1476 fts_query_union_doc_id(query, doc_id, rank);
1477 break;
1478
1479 case FTS_EXIST:
1480 fts_query_intersect_doc_id(query, doc_id, rank);
1481 break;
1482
1483 case FTS_IGNORE:
1484 fts_query_remove_doc_id(query, doc_id);
1485 break;
1486
1487 case FTS_NEGATE:
1488 fts_query_change_ranking(query, doc_id, TRUE);
1489 break;
1490
1491 case FTS_DECR_RATING:
1492 fts_query_union_doc_id(query, doc_id, rank);
1493 fts_query_change_ranking(query, doc_id, TRUE);
1494 break;
1495
1496 case FTS_INCR_RATING:
1497 fts_query_union_doc_id(query, doc_id, rank);
1498 fts_query_change_ranking(query, doc_id, FALSE);
1499 break;
1500
1501 default:
1502 ut_error;
1503 }
1504
1505 if (query->total_size > fts_result_cache_limit) {
1506 return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1507 } else {
1508 return(DB_SUCCESS);
1509 }
1510}
1511
1512/*****************************************************************//**
1513Merge two result sets. */
1514static
1515dberr_t
1516fts_merge_doc_ids(
1517/*==============*/
1518 fts_query_t* query, /*!< in,out: query instance */
1519 const ib_rbt_t* doc_ids) /*!< in: result set to merge */
1520{
1521 const ib_rbt_node_t* node;
1522
1523 DBUG_ENTER("fts_merge_doc_ids");
1524
1525 ut_a(!query->intersection);
1526
1527 /* To process FTS_EXIST operation (intersection), we need
1528 to create a new result set for fts_query_intersect(). */
1529 if (query->oper == FTS_EXIST) {
1530
1531 query->intersection = rbt_create(
1532 sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
1533
1534 query->total_size += SIZEOF_RBT_CREATE;
1535 }
1536
1537 /* Merge the elements to the result set. */
1538 for (node = rbt_first(doc_ids); node; node = rbt_next(doc_ids, node)) {
1539 fts_ranking_t* ranking;
1540 ulint pos = 0;
1541 fts_string_t word;
1542
1543 ranking = rbt_value(fts_ranking_t, node);
1544
1545 query->error = fts_query_process_doc_id(
1546 query, ranking->doc_id, ranking->rank);
1547
1548 if (query->error != DB_SUCCESS) {
1549 DBUG_RETURN(query->error);
1550 }
1551
1552 /* Merge words. Don't need to take operator into account. */
1553 ut_a(ranking->words);
1554 while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
1555 fts_query_add_word_to_document(query, ranking->doc_id,
1556 &word);
1557 }
1558 }
1559
1560 /* If it is an intersection operation, reset query->doc_ids
1561 to query->intersection and free the old result list. */
1562 if (query->oper == FTS_EXIST && query->intersection != NULL) {
1563 fts_query_free_doc_ids(query, query->doc_ids);
1564 query->doc_ids = query->intersection;
1565 query->intersection = NULL;
1566 }
1567
1568 DBUG_RETURN(DB_SUCCESS);
1569}
1570
1571/*****************************************************************//**
1572Skip non-whitespace in a string. Move ptr to the next word boundary.
1573@return pointer to first whitespace character or end */
1574UNIV_INLINE
1575byte*
1576fts_query_skip_word(
1577/*================*/
1578 byte* ptr, /*!< in: start of scan */
1579 const byte* end) /*!< in: pointer to end of string */
1580{
1581 /* TODO: Does this have to be UTF-8 too ? */
1582 while (ptr < end && !(ispunct(*ptr) || isspace(*ptr))) {
1583 ++ptr;
1584 }
1585
1586 return(ptr);
1587}
1588
1589/*****************************************************************//**
1590Check whether the remaining terms in the phrase match the text.
1591@return TRUE if matched else FALSE */
1592static
1593ibool
1594fts_query_match_phrase_terms(
1595/*=========================*/
1596 fts_phrase_t* phrase, /*!< in: phrase to match */
1597 byte** start, /*!< in/out: text to search, we can't
1598 make this const becase we need to
1599 first convert the string to
1600 lowercase */
1601 const byte* end, /*!< in: pointer to the end of
1602 the string to search */
1603 mem_heap_t* heap) /*!< in: heap */
1604{
1605 ulint i;
1606 byte* ptr = *start;
1607 const ib_vector_t* tokens = phrase->tokens;
1608 ulint distance = phrase->distance;
1609
1610 /* We check only from the second term onwards, since the first
1611 must have matched otherwise we wouldn't be here. */
1612 for (i = 1; ptr < end && i < ib_vector_size(tokens); /* No op */) {
1613 fts_string_t match;
1614 fts_string_t cmp_str;
1615 const fts_string_t* token;
1616 int result;
1617 ulint ret;
1618
1619 ret = innobase_mysql_fts_get_token(
1620 phrase->charset, ptr,
1621 const_cast<byte*>(end), &match);
1622
1623 if (match.f_len > 0) {
1624 /* Get next token to match. */
1625 token = static_cast<const fts_string_t*>(
1626 ib_vector_get_const(tokens, i));
1627
1628 fts_string_dup(&cmp_str, &match, heap);
1629
1630 result = innobase_fts_text_case_cmp(
1631 phrase->charset, token, &cmp_str);
1632
1633 /* Skip the rest of the tokens if this one doesn't
1634 match and the proximity distance is exceeded. */
1635 if (result
1636 && (distance == ULINT_UNDEFINED
1637 || distance == 0)) {
1638
1639 break;
1640 }
1641
1642 /* This token matched move to the next token. */
1643 if (result == 0) {
1644 /* Advance the text to search by the length
1645 of the last token. */
1646 ptr += ret;
1647
1648 /* Advance to the next token. */
1649 ++i;
1650 } else {
1651
1652 ut_a(distance != ULINT_UNDEFINED);
1653
1654 ptr = fts_query_skip_word(ptr, end);
1655 }
1656
1657 /* Distance can be 0 for exact matches. */
1658 if (distance != ULINT_UNDEFINED && distance > 0) {
1659 --distance;
1660 }
1661 } else {
1662 ptr += ret;
1663 }
1664 }
1665
1666 *start = ptr;
1667
1668 /* Can't be greater than the number of elements. */
1669 ut_a(i <= ib_vector_size(tokens));
1670
1671 /* This is the case for multiple words. */
1672 if (i == ib_vector_size(tokens)) {
1673 phrase->found = TRUE;
1674 }
1675
1676 return(phrase->found);
1677}
1678
1679/*****************************************************************//**
1680Callback function to count the number of words in position ranges,
1681and see whether the word count is in specified "phrase->distance"
1682@return true if the number of characters is less than the "distance" */
1683static
1684bool
1685fts_proximity_is_word_in_range(
1686/*===========================*/
1687 const fts_phrase_t*
1688 phrase, /*!< in: phrase with the search info */
1689 byte* start, /*!< in: text to search */
1690 ulint total_len) /*!< in: length of text */
1691{
1692 fts_proximity_t* proximity_pos = phrase->proximity_pos;
1693
1694 ut_ad(proximity_pos->n_pos == proximity_pos->min_pos.size());
1695 ut_ad(proximity_pos->n_pos == proximity_pos->max_pos.size());
1696
1697 /* Search each matched position pair (with min and max positions)
1698 and count the number of words in the range */
1699 for (ulint i = 0; i < proximity_pos->n_pos; i++) {
1700 ulint cur_pos = proximity_pos->min_pos[i];
1701 ulint n_word = 0;
1702
1703 ut_ad(proximity_pos->max_pos[i] <= total_len);
1704
1705 /* Walk through words in the range and count them */
1706 while (cur_pos <= proximity_pos->max_pos[i]) {
1707 ulint len;
1708 fts_string_t str;
1709
1710 len = innobase_mysql_fts_get_token(
1711 phrase->charset,
1712 start + cur_pos,
1713 start + total_len, &str);
1714
1715 if (len == 0) {
1716 break;
1717 }
1718
1719 /* Advances position with "len" bytes */
1720 cur_pos += len;
1721
1722 /* Record the number of words */
1723 if (str.f_n_char > 0) {
1724 n_word++;
1725 }
1726
1727 if (n_word > phrase->distance) {
1728 break;
1729 }
1730 }
1731
1732 /* Check if the number of words is less than specified
1733 "distance" */
1734 if (n_word && n_word <= phrase->distance) {
1735 return(true);
1736 }
1737 }
1738
1739 return(false);
1740}
1741
1742/*****************************************************************//**
1743FTS plugin parser 'myql_add_word' callback function for phrase match
1744Refer to 'st_mysql_ftparser_param' for more detail.
1745@return 0 if match, or return non-zero */
1746static
1747int
1748fts_query_match_phrase_add_word_for_parser(
1749/*=======================================*/
1750 MYSQL_FTPARSER_PARAM* param, /*!< in: parser param */
1751 const char* word, /*!< in: token */
1752 int word_len, /*!< in: token length */
1753 MYSQL_FTPARSER_BOOLEAN_INFO*)
1754{
1755 fts_phrase_param_t* phrase_param;
1756 fts_phrase_t* phrase;
1757 const ib_vector_t* tokens;
1758 fts_string_t match;
1759 fts_string_t cmp_str;
1760 const fts_string_t* token;
1761 int result;
1762 mem_heap_t* heap;
1763
1764 phrase_param = static_cast<fts_phrase_param_t*>(param->mysql_ftparam);
1765 heap = phrase_param->heap;
1766 phrase = phrase_param->phrase;
1767 tokens = phrase->tokens;
1768
1769 /* In case plugin parser doesn't check return value */
1770 if (phrase_param->token_index == ib_vector_size(tokens)) {
1771 return(1);
1772 }
1773
1774 match.f_str = (uchar *)(word);
1775 match.f_len = ulint(word_len);
1776 match.f_n_char= fts_get_token_size(phrase->charset, word, match.f_len);
1777
1778 if (match.f_len > 0) {
1779 /* Get next token to match. */
1780 ut_a(phrase_param->token_index < ib_vector_size(tokens));
1781 token = static_cast<const fts_string_t*>(
1782 ib_vector_get_const(tokens, phrase_param->token_index));
1783
1784 fts_string_dup(&cmp_str, &match, heap);
1785
1786 result = innobase_fts_text_case_cmp(
1787 phrase->charset, token, &cmp_str);
1788
1789 if (result == 0) {
1790 phrase_param->token_index++;
1791 } else {
1792 return(1);
1793 }
1794 }
1795
1796 /* Can't be greater than the number of elements. */
1797 ut_a(phrase_param->token_index <= ib_vector_size(tokens));
1798
1799 /* This is the case for multiple words. */
1800 if (phrase_param->token_index == ib_vector_size(tokens)) {
1801 phrase->found = TRUE;
1802 }
1803
1804 return(static_cast<int>(phrase->found));
1805}
1806
1807/*****************************************************************//**
1808Check whether the terms in the phrase match the text.
1809@return TRUE if matched else FALSE */
1810static
1811ibool
1812fts_query_match_phrase_terms_by_parser(
1813/*===================================*/
1814 fts_phrase_param_t* phrase_param, /* in/out: phrase param */
1815 st_mysql_ftparser* parser, /* in: plugin fts parser */
1816 byte* text, /* in: text to check */
1817 ulint len) /* in: text length */
1818{
1819 MYSQL_FTPARSER_PARAM param;
1820
1821 ut_a(parser);
1822
1823 /* Set paramters for param */
1824 param.mysql_parse = fts_tokenize_document_internal;
1825 param.mysql_add_word = fts_query_match_phrase_add_word_for_parser;
1826 param.mysql_ftparam = phrase_param;
1827 param.cs = phrase_param->phrase->charset;
1828 param.doc = reinterpret_cast<char*>(text);
1829 param.length = static_cast<int>(len);
1830 param.mode= MYSQL_FTPARSER_WITH_STOPWORDS;
1831
1832 PARSER_INIT(parser, &param);
1833 parser->parse(&param);
1834 PARSER_DEINIT(parser, &param);
1835
1836 return(phrase_param->phrase->found);
1837}
1838
1839/*****************************************************************//**
1840Callback function to fetch and search the document.
1841@return TRUE if matched else FALSE */
1842static
1843ibool
1844fts_query_match_phrase(
1845/*===================*/
1846 fts_phrase_t* phrase, /*!< in: phrase to match */
1847 byte* start, /*!< in: text to search, we can't make
1848 this const becase we need to first
1849 convert the string to lowercase */
1850 ulint cur_len, /*!< in: length of text */
1851 ulint prev_len, /*!< in: total length for searched
1852 doc fields*/
1853 mem_heap_t* heap) /* heap */
1854{
1855 ulint i;
1856 const fts_string_t* first;
1857 const byte* end = start + cur_len;
1858 const ib_vector_t* tokens = phrase->tokens;
1859 const ib_vector_t* positions = phrase->match->positions;
1860
1861 ut_a(!phrase->found);
1862 ut_a(phrase->match->doc_id > 0);
1863 ut_a(ib_vector_size(tokens) > 0);
1864 ut_a(ib_vector_size(positions) > 0);
1865
1866 first = static_cast<const fts_string_t*>(
1867 ib_vector_get_const(tokens, 0));
1868
1869 ut_a(phrase->match->start < ib_vector_size(positions));
1870
1871 for (i = phrase->match->start; i < ib_vector_size(positions); ++i) {
1872 ulint pos;
1873 byte* ptr = start;
1874
1875 pos = *(ulint*) ib_vector_get_const(positions, i);
1876
1877 if (pos == ULINT_UNDEFINED) {
1878 break;
1879 }
1880
1881 if (pos < prev_len) {
1882 continue;
1883 }
1884
1885 /* Document positions are calculated from the beginning
1886 of the first field, need to save the length for each
1887 searched field to adjust the doc position when search
1888 phrases. */
1889 pos -= prev_len;
1890 ptr = start + pos;
1891
1892 /* Within limits ? */
1893 if (ptr >= end) {
1894 break;
1895 }
1896
1897 if (phrase->parser) {
1898 fts_phrase_param_t phrase_param;
1899
1900 phrase_param.phrase = phrase;
1901 phrase_param.token_index = 0;
1902 phrase_param.heap = heap;
1903
1904 if (fts_query_match_phrase_terms_by_parser(
1905 &phrase_param,
1906 phrase->parser,
1907 ptr,
1908 ulint(end - ptr))) {
1909 break;
1910 }
1911 } else {
1912 fts_string_t match;
1913 fts_string_t cmp_str;
1914 ulint ret;
1915
1916 match.f_str = ptr;
1917 ret = innobase_mysql_fts_get_token(
1918 phrase->charset, start + pos,
1919 const_cast<byte*>(end), &match);
1920
1921 if (match.f_len == 0) {
1922 break;
1923 }
1924
1925 fts_string_dup(&cmp_str, &match, heap);
1926
1927 if (innobase_fts_text_case_cmp(
1928 phrase->charset, first, &cmp_str) == 0) {
1929
1930 /* This is the case for the single word
1931 in the phrase. */
1932 if (ib_vector_size(phrase->tokens) == 1) {
1933 phrase->found = TRUE;
1934 break;
1935 }
1936
1937 ptr += ret;
1938
1939 /* Match the remaining terms in the phrase. */
1940 if (fts_query_match_phrase_terms(phrase, &ptr,
1941 end, heap)) {
1942 break;
1943 }
1944 }
1945 }
1946 }
1947
1948 return(phrase->found);
1949}
1950
1951/*****************************************************************//**
1952Callback function to fetch and search the document.
1953@return whether the phrase is found */
1954static
1955ibool
1956fts_query_fetch_document(
1957/*=====================*/
1958 void* row, /*!< in: sel_node_t* */
1959 void* user_arg) /*!< in: fts_doc_t* */
1960{
1961
1962 que_node_t* exp;
1963 sel_node_t* node = static_cast<sel_node_t*>(row);
1964 fts_phrase_t* phrase = static_cast<fts_phrase_t*>(user_arg);
1965 ulint prev_len = 0;
1966 ulint total_len = 0;
1967 byte* document_text = NULL;
1968
1969 exp = node->select_list;
1970
1971 phrase->found = FALSE;
1972
1973 /* For proximity search, we will need to get the whole document
1974 from all fields, so first count the total length of the document
1975 from all the fields */
1976 if (phrase->proximity_pos) {
1977 while (exp) {
1978 ulint field_len;
1979 dfield_t* dfield = que_node_get_val(exp);
1980 byte* data = static_cast<byte*>(
1981 dfield_get_data(dfield));
1982
1983 if (dfield_is_ext(dfield)) {
1984 ulint local_len = dfield_get_len(dfield);
1985
1986 local_len -= BTR_EXTERN_FIELD_REF_SIZE;
1987
1988 field_len = mach_read_from_4(
1989 data + local_len + BTR_EXTERN_LEN + 4);
1990 } else {
1991 field_len = dfield_get_len(dfield);
1992 }
1993
1994 if (field_len != UNIV_SQL_NULL) {
1995 total_len += field_len + 1;
1996 }
1997
1998 exp = que_node_get_next(exp);
1999 }
2000
2001 document_text = static_cast<byte*>(mem_heap_zalloc(
2002 phrase->heap, total_len));
2003
2004 if (!document_text) {
2005 return(FALSE);
2006 }
2007 }
2008
2009 exp = node->select_list;
2010
2011 while (exp) {
2012 dfield_t* dfield = que_node_get_val(exp);
2013 byte* data = static_cast<byte*>(
2014 dfield_get_data(dfield));
2015 ulint cur_len;
2016
2017 if (dfield_is_ext(dfield)) {
2018 data = btr_copy_externally_stored_field(
2019 &cur_len, data, phrase->page_size,
2020 dfield_get_len(dfield), phrase->heap);
2021 } else {
2022 cur_len = dfield_get_len(dfield);
2023 }
2024
2025 if (cur_len != UNIV_SQL_NULL && cur_len != 0) {
2026 if (phrase->proximity_pos) {
2027 ut_ad(prev_len + cur_len <= total_len);
2028 memcpy(document_text + prev_len, data, cur_len);
2029 } else {
2030 /* For phrase search */
2031 phrase->found =
2032 fts_query_match_phrase(
2033 phrase,
2034 static_cast<byte*>(data),
2035 cur_len, prev_len,
2036 phrase->heap);
2037 }
2038
2039 /* Document positions are calculated from the beginning
2040 of the first field, need to save the length for each
2041 searched field to adjust the doc position when search
2042 phrases. */
2043 prev_len += cur_len + 1;
2044 }
2045
2046 if (phrase->found) {
2047 break;
2048 }
2049
2050 exp = que_node_get_next(exp);
2051 }
2052
2053 if (phrase->proximity_pos) {
2054 ut_ad(prev_len <= total_len);
2055
2056 phrase->found = fts_proximity_is_word_in_range(
2057 phrase, document_text, total_len);
2058 }
2059
2060 return(phrase->found);
2061}
2062
2063#if 0
2064/********************************************************************
2065Callback function to check whether a record was found or not. */
2066static
2067ibool
2068fts_query_select(
2069/*=============*/
2070 void* row, /*!< in: sel_node_t* */
2071 void* user_arg) /*!< in: fts_doc_t* */
2072{
2073 int i;
2074 que_node_t* exp;
2075 sel_node_t* node = row;
2076 fts_select_t* select = user_arg;
2077
2078 ut_a(select->word_freq);
2079 ut_a(select->word_freq->doc_freqs);
2080
2081 exp = node->select_list;
2082
2083 for (i = 0; exp && !select->found; ++i) {
2084 dfield_t* dfield = que_node_get_val(exp);
2085 void* data = dfield_get_data(dfield);
2086 ulint len = dfield_get_len(dfield);
2087
2088 switch (i) {
2089 case 0: /* DOC_COUNT */
2090 if (len != UNIV_SQL_NULL && len != 0) {
2091
2092 select->word_freq->doc_count +=
2093 mach_read_from_4(data);
2094 }
2095 break;
2096
2097 case 1: /* ILIST */
2098 if (len != UNIV_SQL_NULL && len != 0) {
2099
2100 fts_query_find_doc_id(select, data, len);
2101 }
2102 break;
2103
2104 default:
2105 ut_error;
2106 }
2107
2108 exp = que_node_get_next(exp);
2109 }
2110
2111 return(FALSE);
2112}
2113
2114/********************************************************************
2115Read the rows from the FTS index, that match word and where the
2116doc id is between first and last doc id.
2117@return DB_SUCCESS if all go well else error code */
2118static MY_ATTRIBUTE((nonnull, warn_unused_result))
2119dberr_t
2120fts_query_find_term(
2121/*================*/
2122 fts_query_t* query, /*!< in: FTS query state */
2123 que_t** graph, /*!< in: prepared statement */
2124 const fts_string_t* word, /*!< in: the word to fetch */
2125 doc_id_t doc_id, /*!< in: doc id to match */
2126 ulint* min_pos,/*!< in/out: pos found must be
2127 greater than this minimum value. */
2128 ibool* found) /*!< out: TRUE if found else FALSE */
2129{
2130 pars_info_t* info;
2131 dberr_t error;
2132 fts_select_t select;
2133 doc_id_t match_doc_id;
2134 trx_t* trx = query->trx;
2135 char table_name[MAX_FULL_NAME_LEN];
2136
2137 trx->op_info = "fetching FTS index matching nodes";
2138
2139 if (*graph) {
2140 info = (*graph)->info;
2141 } else {
2142 ulint selected;
2143
2144 info = pars_info_create();
2145
2146 selected = fts_select_index(*word->f_str);
2147 query->fts_index_table.suffix = fts_get_suffix(selected);
2148
2149 fts_get_table_name(&query->fts_index_table, table_name);
2150 pars_info_bind_id(info, true, "index_table_name", table_name);
2151 }
2152
2153 select.found = FALSE;
2154 select.doc_id = doc_id;
2155 select.min_pos = *min_pos;
2156 select.word_freq = fts_query_add_word_freq(query, word->f_str);
2157
2158 pars_info_bind_function(info, "my_func", fts_query_select, &select);
2159 pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
2160
2161 /* Convert to "storage" byte order. */
2162 fts_write_doc_id((byte*) &match_doc_id, doc_id);
2163
2164 fts_bind_doc_id(info, "min_doc_id", &match_doc_id);
2165
2166 fts_bind_doc_id(info, "max_doc_id", &match_doc_id);
2167
2168 if (!*graph) {
2169
2170 *graph = fts_parse_sql(
2171 &query->fts_index_table,
2172 info,
2173 "DECLARE FUNCTION my_func;\n"
2174 "DECLARE CURSOR c IS"
2175 " SELECT doc_count, ilist\n"
2176 " FROM $index_table_name\n"
2177 " WHERE word LIKE :word AND"
2178 " first_doc_id <= :min_doc_id AND"
2179 " last_doc_id >= :max_doc_id\n"
2180 " ORDER BY first_doc_id;\n"
2181 "BEGIN\n"
2182 "\n"
2183 "OPEN c;\n"
2184 "WHILE 1 = 1 LOOP\n"
2185 " FETCH c INTO my_func();\n"
2186 " IF c % NOTFOUND THEN\n"
2187 " EXIT;\n"
2188 " END IF;\n"
2189 "END LOOP;\n"
2190 "CLOSE c;");
2191 }
2192
2193 for (;;) {
2194 error = fts_eval_sql(trx, *graph);
2195
2196 if (error == DB_SUCCESS) {
2197
2198 break; /* Exit the loop. */
2199 } else {
2200
2201 if (error == DB_LOCK_WAIT_TIMEOUT) {
2202 ib::warn() << "lock wait timeout reading FTS"
2203 " index. Retrying!";
2204
2205 trx->error_state = DB_SUCCESS;
2206 } else {
2207 ib::error() << error
2208 << " while reading FTS index.";
2209
2210 break; /* Exit the loop. */
2211 }
2212 }
2213 }
2214
2215 /* Value to return */
2216 *found = select.found;
2217
2218 if (*found) {
2219 *min_pos = select.min_pos;
2220 }
2221
2222 return(error);
2223}
2224
2225/********************************************************************
2226Callback aggregator for int columns. */
2227static
2228ibool
2229fts_query_sum(
2230/*==========*/
2231 /*!< out: always returns TRUE */
2232 void* row, /*!< in: sel_node_t* */
2233 void* user_arg) /*!< in: ulint* */
2234{
2235
2236 que_node_t* exp;
2237 sel_node_t* node = row;
2238 ulint* total = user_arg;
2239
2240 exp = node->select_list;
2241
2242 while (exp) {
2243 dfield_t* dfield = que_node_get_val(exp);
2244 void* data = dfield_get_data(dfield);
2245 ulint len = dfield_get_len(dfield);
2246
2247 if (len != UNIV_SQL_NULL && len != 0) {
2248 *total += mach_read_from_4(data);
2249 }
2250
2251 exp = que_node_get_next(exp);
2252 }
2253
2254 return(TRUE);
2255}
2256
2257/********************************************************************
2258Calculate the total documents that contain a particular word (term).
2259@return DB_SUCCESS if all go well else error code */
2260static MY_ATTRIBUTE((nonnull, warn_unused_result))
2261dberr_t
2262fts_query_total_docs_containing_term(
2263/*=================================*/
2264 fts_query_t* query, /*!< in: FTS query state */
2265 const fts_string_t* word, /*!< in: the word to check */
2266 ulint* total) /*!< out: documents containing word */
2267{
2268 pars_info_t* info;
2269 dberr_t error;
2270 que_t* graph;
2271 ulint selected;
2272 trx_t* trx = query->trx;
2273 char table_name[MAX_FULL_NAME_LEN]
2274
2275 trx->op_info = "fetching FTS index document count";
2276
2277 *total = 0;
2278
2279 info = pars_info_create();
2280
2281 pars_info_bind_function(info, "my_func", fts_query_sum, total);
2282 pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
2283
2284 selected = fts_select_index(*word->f_str);
2285
2286 query->fts_index_table.suffix = fts_get_suffix(selected);
2287
2288 fts_get_table_name(&query->fts_index_table, table_name);
2289
2290 pars_info_bind_id(info, true, "index_table_name", table_name);
2291
2292 graph = fts_parse_sql(
2293 &query->fts_index_table,
2294 info,
2295 "DECLARE FUNCTION my_func;\n"
2296 "DECLARE CURSOR c IS"
2297 " SELECT doc_count\n"
2298 " FROM $index_table_name\n"
2299 " WHERE word = :word"
2300 " ORDER BY first_doc_id;\n"
2301 "BEGIN\n"
2302 "\n"
2303 "OPEN c;\n"
2304 "WHILE 1 = 1 LOOP\n"
2305 " FETCH c INTO my_func();\n"
2306 " IF c % NOTFOUND THEN\n"
2307 " EXIT;\n"
2308 " END IF;\n"
2309 "END LOOP;\n"
2310 "CLOSE c;");
2311
2312 for (;;) {
2313 error = fts_eval_sql(trx, graph);
2314
2315 if (error == DB_SUCCESS) {
2316
2317 break; /* Exit the loop. */
2318 } else {
2319
2320 if (error == DB_LOCK_WAIT_TIMEOUT) {
2321 ib::warn() << "lock wait timeout reading FTS"
2322 " index. Retrying!";
2323
2324 trx->error_state = DB_SUCCESS;
2325 } else {
2326 ib::error() << error
2327 << " while reading FTS index.";
2328
2329 break; /* Exit the loop. */
2330 }
2331 }
2332 }
2333
2334 fts_que_graph_free(graph);
2335
2336 return(error);
2337}
2338
2339/********************************************************************
2340Get the total number of words in a documents.
2341@return DB_SUCCESS if all go well else error code */
2342static MY_ATTRIBUTE((nonnull, warn_unused_result))
2343dberr_t
2344fts_query_terms_in_document(
2345/*========================*/
2346 fts_query_t* query, /*!< in: FTS query state */
2347 doc_id_t doc_id, /*!< in: the word to check */
2348 ulint* total) /*!< out: total words in document */
2349{
2350 pars_info_t* info;
2351 dberr_t error;
2352 que_t* graph;
2353 doc_id_t read_doc_id;
2354 trx_t* trx = query->trx;
2355 char table_name[MAX_FULL_NAME_LEN];
2356
2357 trx->op_info = "fetching FTS document term count";
2358
2359 *total = 0;
2360
2361 info = pars_info_create();
2362
2363 pars_info_bind_function(info, "my_func", fts_query_sum, total);
2364
2365 /* Convert to "storage" byte order. */
2366 fts_write_doc_id((byte*) &read_doc_id, doc_id);
2367 fts_bind_doc_id(info, "doc_id", &read_doc_id);
2368
2369 query->fts_index_table.suffix = "DOC_ID";
2370
2371 fts_get_table_name(&query->fts_index_table, table_name);
2372
2373 pars_info_bind_id(info, true, "index_table_name", table_name);
2374
2375 graph = fts_parse_sql(
2376 &query->fts_index_table,
2377 info,
2378 "DECLARE FUNCTION my_func;\n"
2379 "DECLARE CURSOR c IS"
2380 " SELECT count\n"
2381 " FROM $index_table_name\n"
2382 " WHERE doc_id = :doc_id"
2383 " BEGIN\n"
2384 "\n"
2385 "OPEN c;\n"
2386 "WHILE 1 = 1 LOOP\n"
2387 " FETCH c INTO my_func();\n"
2388 " IF c % NOTFOUND THEN\n"
2389 " EXIT;\n"
2390 " END IF;\n"
2391 "END LOOP;\n"
2392 "CLOSE c;");
2393
2394 for (;;) {
2395 error = fts_eval_sql(trx, graph);
2396
2397 if (error == DB_SUCCESS) {
2398
2399 break; /* Exit the loop. */
2400 } else {
2401
2402 if (error == DB_LOCK_WAIT_TIMEOUT) {
2403 ib::warn() << "lock wait timeout reading FTS"
2404 " doc id table. Retrying!";
2405
2406 trx->error_state = DB_SUCCESS;
2407 } else {
2408 ib::error() << error << " while reading FTS"
2409 " doc id table.";
2410
2411 break; /* Exit the loop. */
2412 }
2413 }
2414 }
2415
2416 fts_que_graph_free(graph);
2417
2418 return(error);
2419}
2420#endif
2421
2422/*****************************************************************//**
2423Retrieve the document and match the phrase tokens.
2424@return DB_SUCCESS or error code */
2425MY_ATTRIBUTE((nonnull(1,2,3,6), warn_unused_result))
2426static
2427dberr_t
2428fts_query_match_document(
2429/*=====================*/
2430 ib_vector_t* tokens, /*!< in: phrase tokens */
2431 fts_get_doc_t* get_doc, /*!< in: table and prepared statements */
2432 fts_match_t* match, /*!< in: doc id and positions */
2433 ulint distance, /*!< in: proximity distance */
2434 st_mysql_ftparser* parser, /*!< in: fts plugin parser */
2435 ibool* found) /*!< out: TRUE if phrase found */
2436{
2437 dberr_t error;
2438 fts_phrase_t phrase(get_doc->index_cache->index->table);
2439
2440 phrase.match = match; /* Positions to match */
2441 phrase.tokens = tokens; /* Tokens to match */
2442 phrase.distance = distance;
2443 phrase.charset = get_doc->index_cache->charset;
2444 phrase.heap = mem_heap_create(512);
2445 phrase.parser = parser;
2446
2447 *found = phrase.found = FALSE;
2448
2449 error = fts_doc_fetch_by_doc_id(
2450 get_doc, match->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
2451 fts_query_fetch_document, &phrase);
2452
2453 if (error != DB_SUCCESS) {
2454 ib::error() << "(" << ut_strerr(error)
2455 << ") matching document.";
2456 } else {
2457 *found = phrase.found;
2458 }
2459
2460 mem_heap_free(phrase.heap);
2461
2462 return(error);
2463}
2464
2465/*****************************************************************//**
2466This function fetches the original documents and count the
2467words in between matching words to see that is in specified distance
2468@return DB_SUCCESS if all OK */
2469static MY_ATTRIBUTE((nonnull, warn_unused_result))
2470bool
2471fts_query_is_in_proximity_range(
2472/*============================*/
2473 const fts_query_t* query, /*!< in: query instance */
2474 fts_match_t** match, /*!< in: query instance */
2475 fts_proximity_t* qualified_pos) /*!< in: position info for
2476 qualified ranges */
2477{
2478 fts_get_doc_t get_doc;
2479 fts_cache_t* cache = query->index->table->fts->cache;
2480 dberr_t err;
2481
2482 memset(&get_doc, 0x0, sizeof(get_doc));
2483
2484 rw_lock_x_lock(&cache->lock);
2485 get_doc.index_cache = fts_find_index_cache(cache, query->index);
2486 rw_lock_x_unlock(&cache->lock);
2487 ut_a(get_doc.index_cache != NULL);
2488
2489 fts_phrase_t phrase(get_doc.index_cache->index->table);
2490
2491 phrase.distance = query->distance;
2492 phrase.charset = get_doc.index_cache->charset;
2493 phrase.heap = mem_heap_create(512);
2494 phrase.proximity_pos = qualified_pos;
2495 phrase.found = FALSE;
2496
2497 err = fts_doc_fetch_by_doc_id(
2498 &get_doc, match[0]->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
2499 fts_query_fetch_document, &phrase);
2500
2501 if (err != DB_SUCCESS) {
2502 ib::error() << "(" << ut_strerr(err) << ") in verification"
2503 " phase of proximity search";
2504 }
2505
2506 /* Free the prepared statement. */
2507 if (get_doc.get_document_graph) {
2508 fts_que_graph_free(get_doc.get_document_graph);
2509 get_doc.get_document_graph = NULL;
2510 }
2511
2512 mem_heap_free(phrase.heap);
2513
2514 return(err == DB_SUCCESS && phrase.found);
2515}
2516
2517/*****************************************************************//**
2518Iterate over the matched document ids and search the for the
2519actual phrase in the text.
2520@return DB_SUCCESS if all OK */
2521static MY_ATTRIBUTE((nonnull, warn_unused_result))
2522dberr_t
2523fts_query_search_phrase(
2524/*====================*/
2525 fts_query_t* query, /*!< in: query instance */
2526 ib_vector_t* orig_tokens, /*!< in: tokens to search,
2527 with any stopwords in the
2528 original phrase */
2529 ib_vector_t* tokens) /*!< in: tokens that does
2530 not include stopwords and
2531 can be used to calculate
2532 ranking */
2533{
2534 ulint i;
2535 fts_get_doc_t get_doc;
2536 ulint n_matched;
2537 fts_cache_t* cache = query->index->table->fts->cache;
2538
2539 n_matched = ib_vector_size(query->matched);
2540
2541 /* Setup the doc retrieval infrastructure. */
2542 memset(&get_doc, 0x0, sizeof(get_doc));
2543
2544 rw_lock_x_lock(&cache->lock);
2545
2546 get_doc.index_cache = fts_find_index_cache(cache, query->index);
2547
2548 /* Must find the index cache */
2549 ut_a(get_doc.index_cache != NULL);
2550
2551 rw_lock_x_unlock(&cache->lock);
2552
2553#ifdef FTS_INTERNAL_DIAG_PRINT
2554 ib::info() << "Start phrase search";
2555#endif
2556
2557 /* Read the document from disk and do the actual
2558 match, matching documents will be added to the current
2559 doc id set. */
2560 for (i = 0; i < n_matched && query->error == DB_SUCCESS; ++i) {
2561 fts_match_t* match;
2562 ibool found = FALSE;
2563
2564 match = static_cast<fts_match_t*>(
2565 ib_vector_get(query->matched, i));
2566
2567 /* Skip the document ids that were filtered out by
2568 an earlier pass. */
2569 if (match->doc_id != 0) {
2570
2571 query->error = fts_query_match_document(
2572 orig_tokens, &get_doc, match,
2573 query->distance, query->parser, &found);
2574
2575 if (query->error == DB_SUCCESS && found) {
2576 ulint z;
2577
2578 query->error = fts_query_process_doc_id(query,
2579 match->doc_id, 0);
2580 if (query->error != DB_SUCCESS) {
2581 goto func_exit;
2582 }
2583
2584 for (z = 0; z < ib_vector_size(tokens); z++) {
2585 fts_string_t* token;
2586 token = static_cast<fts_string_t*>(
2587 ib_vector_get(tokens, z));
2588 fts_query_add_word_to_document(
2589 query, match->doc_id, token);
2590 }
2591 }
2592 }
2593 }
2594
2595func_exit:
2596 /* Free the prepared statement. */
2597 if (get_doc.get_document_graph) {
2598 fts_que_graph_free(get_doc.get_document_graph);
2599 get_doc.get_document_graph = NULL;
2600 }
2601
2602 return(query->error);
2603}
2604
2605/** Split the phrase into tokens
2606@param[in,out] query query instance
2607@param[in] node query node to search
2608@param[in,out] tokens token vector
2609@param[in,out] orig_tokens original node tokens include stopword
2610@param[in,out] heap mem heap */
2611static
2612void
2613fts_query_phrase_split(
2614 fts_query_t* query,
2615 const fts_ast_node_t* node,
2616 ib_vector_t* tokens,
2617 ib_vector_t* orig_tokens,
2618 mem_heap_t* heap)
2619{
2620 fts_string_t phrase;
2621 ulint len = 0;
2622 ulint cur_pos = 0;
2623 fts_ast_node_t* term_node = NULL;
2624
2625 if (node->type == FTS_AST_TEXT) {
2626 phrase.f_str = node->text.ptr->str;
2627 phrase.f_len = node->text.ptr->len;
2628 len = phrase.f_len;
2629 } else {
2630 ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST);
2631 phrase.f_str = NULL;
2632 phrase.f_len = 0;
2633 term_node = node->list.head;
2634 }
2635
2636 while (true) {
2637 fts_cache_t* cache = query->index->table->fts->cache;
2638 ulint cur_len;
2639 fts_string_t result_str;
2640
2641 if (node->type == FTS_AST_TEXT) {
2642 if (cur_pos >= len) {
2643 break;
2644 }
2645
2646 cur_len = innobase_mysql_fts_get_token(
2647 query->fts_index_table.charset,
2648 reinterpret_cast<const byte*>(phrase.f_str)
2649 + cur_pos,
2650 reinterpret_cast<const byte*>(phrase.f_str)
2651 + len,
2652 &result_str);
2653
2654 if (cur_len == 0) {
2655 break;
2656 }
2657
2658 cur_pos += cur_len;
2659 } else {
2660 ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST);
2661 /* Term node in parser phrase list */
2662 if (term_node == NULL) {
2663 break;
2664 }
2665
2666 ut_a(term_node->type == FTS_AST_TERM);
2667 result_str.f_str = term_node->term.ptr->str;
2668 result_str.f_len = term_node->term.ptr->len;
2669 result_str.f_n_char = fts_get_token_size(
2670 query->fts_index_table.charset,
2671 reinterpret_cast<char*>(result_str.f_str),
2672 result_str.f_len);
2673
2674 term_node = term_node->next;
2675 }
2676
2677 if (result_str.f_n_char == 0) {
2678 continue;
2679 }
2680
2681 fts_string_t* token = static_cast<fts_string_t*>(
2682 ib_vector_push(tokens, NULL));
2683 fts_string_dup(token, &result_str, heap);
2684
2685 if (fts_check_token(
2686 &result_str,
2687 cache->stopword_info.cached_stopword,
2688 query->fts_index_table.charset)) {
2689 /* Add the word to the RB tree so that we can
2690 calculate it's frequencey within a document. */
2691 fts_query_add_word_freq(query, token);
2692 } else {
2693 ib_vector_pop(tokens);
2694 }
2695
2696 /* we will start to store all words including stopwords
2697 in the "orig_tokens" vector, but skip any leading words
2698 that are stopwords */
2699 if (!ib_vector_is_empty(tokens)) {
2700 fts_string_t* orig_token = static_cast<fts_string_t*>(
2701 ib_vector_push(orig_tokens, NULL));
2702
2703 orig_token->f_str = token->f_str;
2704 orig_token->f_len = token->f_len;
2705 }
2706 }
2707}
2708
2709/*****************************************************************//**
2710Text/Phrase search.
2711@return DB_SUCCESS or error code */
2712static MY_ATTRIBUTE((warn_unused_result))
2713dberr_t
2714fts_query_phrase_search(
2715/*====================*/
2716 fts_query_t* query, /*!< in: query instance */
2717 const fts_ast_node_t* node) /*!< in: node to search */
2718{
2719 ib_vector_t* tokens;
2720 ib_vector_t* orig_tokens;
2721 mem_heap_t* heap = mem_heap_create(sizeof(fts_string_t));
2722 ib_alloc_t* heap_alloc;
2723 ulint num_token;
2724
2725 heap_alloc = ib_heap_allocator_create(heap);
2726
2727 tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
2728 orig_tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
2729
2730 if (query->distance != ULINT_UNDEFINED && query->distance > 0) {
2731 query->flags = FTS_PROXIMITY;
2732 } else {
2733 query->flags = FTS_PHRASE;
2734 }
2735
2736 /* Split the phrase into tokens. */
2737 fts_query_phrase_split(query, node, tokens, orig_tokens, heap);
2738
2739 num_token = ib_vector_size(tokens);
2740 if (num_token > MAX_PROXIMITY_ITEM) {
2741 query->error = DB_FTS_TOO_MANY_WORDS_IN_PHRASE;
2742 goto func_exit;
2743 }
2744
2745 ut_ad(ib_vector_size(orig_tokens) >= num_token);
2746
2747 /* Ignore empty strings. */
2748 if (num_token > 0) {
2749 fts_string_t* token;
2750 fts_fetch_t fetch;
2751 trx_t* trx = query->trx;
2752 fts_ast_oper_t oper = query->oper;
2753 que_t* graph = NULL;
2754 ulint i;
2755 dberr_t error;
2756
2757 /* Create the vector for storing matching document ids
2758 and the positions of the first token of the phrase. */
2759 if (!query->matched) {
2760 ib_alloc_t* heap_alloc;
2761
2762 heap_alloc = ib_heap_allocator_create(heap);
2763
2764 if (!(query->flags & FTS_PROXIMITY)
2765 && !(query->flags & FTS_PHRASE)) {
2766 query->matched = ib_vector_create(
2767 heap_alloc, sizeof(fts_match_t),
2768 64);
2769 } else {
2770 ut_a(num_token <= MAX_PROXIMITY_ITEM);
2771 query->match_array =
2772 (ib_vector_t**) mem_heap_alloc(
2773 heap,
2774 num_token *
2775 sizeof(query->matched));
2776
2777 for (i = 0; i < num_token; i++) {
2778 query->match_array[i] =
2779 ib_vector_create(
2780 heap_alloc, sizeof(fts_match_t),
2781 64);
2782 }
2783
2784 query->matched = query->match_array[0];
2785 }
2786 }
2787
2788 /* Setup the callback args for filtering and consolidating
2789 the ilist. */
2790 fetch.read_arg = query;
2791 fetch.read_record = fts_query_index_fetch_nodes;
2792
2793 for (i = 0; i < num_token; i++) {
2794 /* Search for the first word from the phrase. */
2795 token = static_cast<fts_string_t*>(
2796 ib_vector_get(tokens, i));
2797
2798 if (query->flags & FTS_PROXIMITY
2799 || query->flags & FTS_PHRASE) {
2800 query->matched = query->match_array[i];
2801 }
2802
2803 error = fts_index_fetch_nodes(
2804 trx, &graph, &query->fts_index_table,
2805 token, &fetch);
2806
2807 /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
2808 ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
2809 if (error != DB_SUCCESS) {
2810 query->error = error;
2811 }
2812
2813 fts_que_graph_free(graph);
2814 graph = NULL;
2815
2816 fts_query_cache(query, token);
2817
2818 if (!(query->flags & FTS_PHRASE)
2819 && !(query->flags & FTS_PROXIMITY)) {
2820 break;
2821 }
2822
2823 /* If any of the token can't be found,
2824 no need to continue match */
2825 if (ib_vector_is_empty(query->match_array[i])
2826 || query->error != DB_SUCCESS) {
2827 goto func_exit;
2828 }
2829 }
2830
2831 /* Just a single word, no need to fetch the original
2832 documents to do phrase matching */
2833 if (ib_vector_size(orig_tokens) == 1
2834 && !ib_vector_is_empty(query->match_array[0])) {
2835 fts_match_t* match;
2836 ulint n_matched;
2837
2838 n_matched = ib_vector_size(query->match_array[0]);
2839
2840 for (i = 0; i < n_matched; i++) {
2841 match = static_cast<fts_match_t*>(
2842 ib_vector_get(
2843 query->match_array[0], i));
2844
2845 query->error = fts_query_process_doc_id(
2846 query, match->doc_id, 0);
2847 if (query->error != DB_SUCCESS) {
2848 goto func_exit;
2849 }
2850
2851 fts_query_add_word_to_document(
2852 query, match->doc_id, token);
2853 }
2854 query->oper = oper;
2855 goto func_exit;
2856 }
2857
2858 /* If we are doing proximity search, verify the distance
2859 between all words, and check they are in specified distance. */
2860 if (query->flags & FTS_PROXIMITY) {
2861 fts_phrase_or_proximity_search(query, tokens);
2862 } else {
2863 ibool matched;
2864
2865 /* Phrase Search case:
2866 We filter out the doc ids that don't contain
2867 all the tokens in the phrase. It's cheaper to
2868 search the ilist than bringing the documents in
2869 and then doing a search through the text. Isolated
2870 testing shows this also helps in mitigating disruption
2871 of the buffer cache. */
2872 matched = fts_phrase_or_proximity_search(query, tokens);
2873 query->matched = query->match_array[0];
2874
2875 /* Read the actual text in and search for the phrase. */
2876 if (matched) {
2877 ut_ad(query->error == DB_SUCCESS);
2878 query->error = fts_query_search_phrase(
2879 query, orig_tokens, tokens);
2880 }
2881 }
2882
2883 /* Restore original operation. */
2884 query->oper = oper;
2885
2886 if (query->error != DB_SUCCESS) {
2887 goto func_exit;
2888 }
2889 }
2890
2891func_exit:
2892 mem_heap_free(heap);
2893
2894 /* Don't need it anymore. */
2895 query->matched = NULL;
2896
2897 return(query->error);
2898}
2899
2900/*****************************************************************//**
2901Find the word and evaluate.
2902@return DB_SUCCESS if all go well */
2903static MY_ATTRIBUTE((nonnull, warn_unused_result))
2904dberr_t
2905fts_query_execute(
2906/*==============*/
2907 fts_query_t* query, /*!< in: query instance */
2908 fts_string_t* token) /*!< in: token to search */
2909{
2910 switch (query->oper) {
2911 case FTS_NONE:
2912 case FTS_NEGATE:
2913 case FTS_INCR_RATING:
2914 case FTS_DECR_RATING:
2915 query->error = fts_query_union(query, token);
2916 break;
2917
2918 case FTS_EXIST:
2919 query->error = fts_query_intersect(query, token);
2920 break;
2921
2922 case FTS_IGNORE:
2923 query->error = fts_query_difference(query, token);
2924 break;
2925
2926 default:
2927 ut_error;
2928 }
2929
2930 return(query->error);
2931}
2932
2933/*****************************************************************//**
2934Create a wildcard string. It's the responsibility of the caller to
2935free the byte* pointer. It's allocated using ut_malloc_nokey().
2936@return ptr to allocated memory */
2937static
2938byte*
2939fts_query_get_token(
2940/*================*/
2941 fts_ast_node_t* node, /*!< in: the current sub tree */
2942 fts_string_t* token) /*!< in: token to create */
2943{
2944 ulint str_len;
2945 byte* new_ptr = NULL;
2946
2947 str_len = node->term.ptr->len;
2948
2949 ut_a(node->type == FTS_AST_TERM);
2950
2951 token->f_len = str_len;
2952 token->f_str = node->term.ptr->str;
2953
2954 if (node->term.wildcard) {
2955
2956 token->f_str = static_cast<byte*>(ut_malloc_nokey(str_len + 2));
2957 token->f_len = str_len + 1;
2958
2959 memcpy(token->f_str, node->term.ptr->str, str_len);
2960
2961 token->f_str[str_len] = '%';
2962 token->f_str[token->f_len] = 0;
2963
2964 new_ptr = token->f_str;
2965 }
2966
2967 return(new_ptr);
2968}
2969
2970/*****************************************************************//**
2971Visit every node of the AST. */
2972static
2973dberr_t
2974fts_query_visitor(
2975/*==============*/
2976 fts_ast_oper_t oper, /*!< in: current operator */
2977 fts_ast_node_t* node, /*!< in: The root of the current subtree*/
2978 void* arg) /*!< in: callback arg*/
2979{
2980 byte* ptr;
2981 fts_string_t token;
2982 fts_query_t* query = static_cast<fts_query_t*>(arg);
2983
2984 ut_a(node);
2985 DBUG_ENTER("fts_query_visitor");
2986 DBUG_PRINT("fts", ("nodetype: %s", fts_ast_node_type_get(node->type)));
2987
2988 token.f_n_char = 0;
2989 query->oper = oper;
2990 query->cur_node = node;
2991
2992 switch (node->type) {
2993 case FTS_AST_TEXT:
2994 case FTS_AST_PARSER_PHRASE_LIST:
2995
2996 if (query->oper == FTS_EXIST) {
2997 ut_ad(query->intersection == NULL);
2998 query->intersection = rbt_create(
2999 sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
3000
3001 query->total_size += SIZEOF_RBT_CREATE;
3002 }
3003
3004 /* Set the current proximity distance. */
3005 query->distance = node->text.distance;
3006
3007 /* Force collection of doc ids and the positions. */
3008 query->collect_positions = TRUE;
3009
3010 query->error = fts_query_phrase_search(query, node);
3011
3012 query->collect_positions = FALSE;
3013
3014 if (query->oper == FTS_EXIST) {
3015 fts_query_free_doc_ids(query, query->doc_ids);
3016 query->doc_ids = query->intersection;
3017 query->intersection = NULL;
3018 }
3019
3020 break;
3021
3022 case FTS_AST_TERM:
3023 token.f_str = node->term.ptr->str;
3024 token.f_len = node->term.ptr->len;
3025
3026 /* Collect wildcard words for QUERY EXPANSION. */
3027 if (node->term.wildcard && query->wildcard_words != NULL) {
3028 ib_rbt_bound_t parent;
3029
3030 if (rbt_search(query->wildcard_words, &parent, &token)
3031 != 0) {
3032 fts_string_t word;
3033
3034 fts_string_dup(&word, &token, query->heap);
3035 rbt_add_node(query->wildcard_words, &parent,
3036 &word);
3037 }
3038 }
3039
3040 /* Add the word to our RB tree that will be used to
3041 calculate this terms per document frequency. */
3042 fts_query_add_word_freq(query, &token);
3043
3044 ptr = fts_query_get_token(node, &token);
3045 query->error = fts_query_execute(query, &token);
3046
3047 if (ptr) {
3048 ut_free(ptr);
3049 }
3050
3051 break;
3052
3053 case FTS_AST_SUBEXP_LIST:
3054 query->error = fts_ast_visit_sub_exp(node, fts_query_visitor, arg);
3055 break;
3056
3057 default:
3058 ut_error;
3059 }
3060
3061 if (query->oper == FTS_EXIST) {
3062 query->multi_exist = true;
3063 }
3064
3065 DBUG_RETURN(query->error);
3066}
3067
3068/** Process (nested) sub-expression, create a new result set to store the
3069sub-expression result by processing nodes under current sub-expression
3070list. Merge the sub-expression result with that of parent expression list.
3071@param[in,out] node current root node
3072@param[in,out] visitor callback function
3073@param[in,out] arg argument for callback
3074@return DB_SUCCESS if all go well */
3075static
3076dberr_t
3077fts_ast_visit_sub_exp(
3078 fts_ast_node_t* node,
3079 fts_ast_callback visitor,
3080 void* arg)
3081{
3082 fts_ast_oper_t cur_oper;
3083 fts_query_t* query = static_cast<fts_query_t*>(arg);
3084 ib_rbt_t* parent_doc_ids;
3085 ib_rbt_t* subexpr_doc_ids;
3086 dberr_t error = DB_SUCCESS;
3087 bool will_be_ignored = false;
3088 bool multi_exist;
3089
3090 DBUG_ENTER("fts_ast_visit_sub_exp");
3091
3092 ut_a(node->type == FTS_AST_SUBEXP_LIST);
3093
3094 cur_oper = query->oper;
3095
3096 /* Save current result set */
3097 parent_doc_ids = query->doc_ids;
3098
3099 /* Create new result set to store the sub-expression result. We
3100 will merge this result set with the parent after processing. */
3101 query->doc_ids = rbt_create(sizeof(fts_ranking_t),
3102 fts_ranking_doc_id_cmp);
3103
3104 query->total_size += SIZEOF_RBT_CREATE;
3105
3106 multi_exist = query->multi_exist;
3107 query->multi_exist = false;
3108 /* Process nodes in current sub-expression and store its
3109 result set in query->doc_ids we created above. */
3110 error = fts_ast_visit(FTS_NONE, node, visitor,
3111 arg, &will_be_ignored);
3112
3113 /* Reinstate parent node state */
3114 query->multi_exist = multi_exist;
3115 query->oper = cur_oper;
3116
3117 /* Merge the sub-expression result with the parent result set. */
3118 subexpr_doc_ids = query->doc_ids;
3119 query->doc_ids = parent_doc_ids;
3120 if (error == DB_SUCCESS) {
3121 error = fts_merge_doc_ids(query, subexpr_doc_ids);
3122 }
3123
3124 /* Free current result set. Result already merged into parent. */
3125 fts_query_free_doc_ids(query, subexpr_doc_ids);
3126
3127 DBUG_RETURN(error);
3128}
3129
3130#if 0
3131/*****************************************************************//***
3132Check if the doc id exists in the ilist.
3133@return TRUE if doc id found */
3134static
3135ulint
3136fts_query_find_doc_id(
3137/*==================*/
3138 fts_select_t* select, /*!< in/out: contains the doc id to
3139 find, we update the word freq if
3140 document found */
3141 void* data, /*!< in: doc id ilist */
3142 ulint len) /*!< in: doc id ilist size */
3143{
3144 byte* ptr = data;
3145 doc_id_t doc_id = 0;
3146 ulint decoded = 0;
3147
3148 /* Decode the ilist and search for selected doc_id. We also
3149 calculate the frequency of the word in the document if found. */
3150 while (decoded < len && !select->found) {
3151 ulint freq = 0;
3152 ulint min_pos = 0;
3153 ulint last_pos = 0;
3154 ulint pos = fts_decode_vlc(&ptr);
3155
3156 /* Add the delta. */
3157 doc_id += pos;
3158
3159 while (*ptr) {
3160 ++freq;
3161 last_pos += fts_decode_vlc(&ptr);
3162
3163 /* Only if min_pos is not set and the current
3164 term exists in a position greater than the
3165 min_pos of the previous term. */
3166 if (min_pos == 0 && last_pos > select->min_pos) {
3167 min_pos = last_pos;
3168 }
3169 }
3170
3171 /* Skip the end of word position marker. */
3172 ++ptr;
3173
3174 /* Bytes decoded so far. */
3175 decoded = ptr - (byte*) data;
3176
3177 /* A word may exist in the document but we only consider a
3178 match if it exists in a position that is greater than the
3179 position of the previous term. */
3180 if (doc_id == select->doc_id && min_pos > 0) {
3181 fts_doc_freq_t* doc_freq;
3182
3183 /* Add the doc id to the doc freq rb tree, if
3184 the doc id doesn't exist it will be created. */
3185 doc_freq = fts_query_add_doc_freq(
3186 select->word_freq->doc_freqs, doc_id);
3187
3188 /* Avoid duplicating the frequency tally */
3189 if (doc_freq->freq == 0) {
3190 doc_freq->freq = freq;
3191 }
3192
3193 select->found = TRUE;
3194 select->min_pos = min_pos;
3195 }
3196 }
3197
3198 return(select->found);
3199}
3200#endif
3201
3202/*****************************************************************//**
3203Read and filter nodes.
3204@return DB_SUCCESS if all go well,
3205or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */
3206static
3207dberr_t
3208fts_query_filter_doc_ids(
3209/*=====================*/
3210 fts_query_t* query, /*!< in: query instance */
3211 const fts_string_t* word, /*!< in: the current word */
3212 fts_word_freq_t* word_freq, /*!< in/out: word frequency */
3213 const fts_node_t* node, /*!< in: current FTS node */
3214 void* data, /*!< in: doc id ilist */
3215 ulint len, /*!< in: doc id ilist size */
3216 ibool calc_doc_count) /*!< in: whether to remember doc count */
3217{
3218 byte* ptr = static_cast<byte*>(data);
3219 doc_id_t doc_id = 0;
3220 ulint decoded = 0;
3221 ib_rbt_t* doc_freqs = word_freq->doc_freqs;
3222
3223 /* Decode the ilist and add the doc ids to the query doc_id set. */
3224 while (decoded < len) {
3225 ulint freq = 0;
3226 fts_doc_freq_t* doc_freq;
3227 fts_match_t* match = NULL;
3228 ulint last_pos = 0;
3229 ulint pos = fts_decode_vlc(&ptr);
3230
3231 /* Some sanity checks. */
3232 if (doc_id == 0) {
3233 ut_a(pos == node->first_doc_id);
3234 }
3235
3236 /* Add the delta. */
3237 doc_id += pos;
3238
3239 if (calc_doc_count) {
3240 word_freq->doc_count++;
3241 }
3242
3243 /* We simply collect the matching instances here. */
3244 if (query->collect_positions) {
3245 ib_alloc_t* heap_alloc;
3246
3247 /* Create a new fts_match_t instance. */
3248 match = static_cast<fts_match_t*>(
3249 ib_vector_push(query->matched, NULL));
3250
3251 match->start = 0;
3252 match->doc_id = doc_id;
3253 heap_alloc = ib_vector_allocator(query->matched);
3254
3255 /* Allocate from the same heap as the
3256 parent container. */
3257 match->positions = ib_vector_create(
3258 heap_alloc, sizeof(ulint), 64);
3259
3260 query->total_size += sizeof(fts_match_t)
3261 + sizeof(ib_vector_t)
3262 + sizeof(ulint) * 64;
3263 }
3264
3265 /* Unpack the positions within the document. */
3266 while (*ptr) {
3267 last_pos += fts_decode_vlc(&ptr);
3268
3269 /* Collect the matching word positions, for phrase
3270 matching later. */
3271 if (query->collect_positions) {
3272 ib_vector_push(match->positions, &last_pos);
3273 }
3274
3275 ++freq;
3276 }
3277
3278 /* End of list marker. */
3279 last_pos = (ulint) -1;
3280
3281 if (query->collect_positions) {
3282 ut_a(match != NULL);
3283 ib_vector_push(match->positions, &last_pos);
3284 }
3285
3286 /* Add the doc id to the doc freq rb tree, if the doc id
3287 doesn't exist it will be created. */
3288 doc_freq = fts_query_add_doc_freq(query, doc_freqs, doc_id);
3289
3290 /* Avoid duplicating frequency tally. */
3291 if (doc_freq->freq == 0) {
3292 doc_freq->freq = freq;
3293 }
3294
3295 /* Skip the end of word position marker. */
3296 ++ptr;
3297
3298 /* Bytes decoded so far */
3299 decoded = ulint(ptr - (byte*) data);
3300
3301 /* We simply collect the matching documents and the
3302 positions here and match later. */
3303 if (!query->collect_positions) {
3304 /* We ignore error here and will check it later */
3305 fts_query_process_doc_id(query, doc_id, 0);
3306
3307 /* Add the word to the document's matched RB tree. */
3308 fts_query_add_word_to_document(query, doc_id, word);
3309 }
3310 }
3311
3312 /* Some sanity checks. */
3313 ut_a(doc_id == node->last_doc_id);
3314
3315 if (query->total_size > fts_result_cache_limit) {
3316 return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
3317 } else {
3318 return(DB_SUCCESS);
3319 }
3320}
3321
3322/*****************************************************************//**
3323Read the FTS INDEX row.
3324@return DB_SUCCESS if all go well. */
3325static
3326dberr_t
3327fts_query_read_node(
3328/*================*/
3329 fts_query_t* query, /*!< in: query instance */
3330 const fts_string_t* word, /*!< in: current word */
3331 que_node_t* exp) /*!< in: query graph node */
3332{
3333 int i;
3334 int ret;
3335 fts_node_t node;
3336 ib_rbt_bound_t parent;
3337 fts_word_freq_t* word_freq;
3338 ibool skip = FALSE;
3339 fts_string_t term;
3340 byte buf[FTS_MAX_WORD_LEN + 1];
3341 dberr_t error = DB_SUCCESS;
3342
3343 ut_a(query->cur_node->type == FTS_AST_TERM
3344 || query->cur_node->type == FTS_AST_TEXT
3345 || query->cur_node->type == FTS_AST_PARSER_PHRASE_LIST);
3346
3347 memset(&node, 0, sizeof(node));
3348 term.f_str = buf;
3349
3350 /* Need to consider the wildcard search case, the word frequency
3351 is created on the search string not the actual word. So we need
3352 to assign the frequency on search string behalf. */
3353 if (query->cur_node->type == FTS_AST_TERM
3354 && query->cur_node->term.wildcard) {
3355
3356 term.f_len = query->cur_node->term.ptr->len;
3357 ut_ad(FTS_MAX_WORD_LEN >= term.f_len);
3358 memcpy(term.f_str, query->cur_node->term.ptr->str, term.f_len);
3359 } else {
3360 term.f_len = word->f_len;
3361 ut_ad(FTS_MAX_WORD_LEN >= word->f_len);
3362 memcpy(term.f_str, word->f_str, word->f_len);
3363 }
3364
3365 /* Lookup the word in our rb tree, it must exist. */
3366 ret = rbt_search(query->word_freqs, &parent, &term);
3367
3368 ut_a(ret == 0);
3369
3370 word_freq = rbt_value(fts_word_freq_t, parent.last);
3371
3372 /* Start from 1 since the first column has been read by the caller.
3373 Also, we rely on the order of the columns projected, to filter
3374 out ilists that are out of range and we always want to read
3375 the doc_count irrespective of the suitablility of the row. */
3376
3377 for (i = 1; exp && !skip; exp = que_node_get_next(exp), ++i) {
3378
3379 dfield_t* dfield = que_node_get_val(exp);
3380 byte* data = static_cast<byte*>(
3381 dfield_get_data(dfield));
3382 ulint len = dfield_get_len(dfield);
3383
3384 ut_a(len != UNIV_SQL_NULL);
3385
3386 /* Note: The column numbers below must match the SELECT. */
3387
3388 switch (i) {
3389 case 1: /* DOC_COUNT */
3390 word_freq->doc_count += mach_read_from_4(data);
3391 break;
3392
3393 case 2: /* FIRST_DOC_ID */
3394 node.first_doc_id = fts_read_doc_id(data);
3395
3396 /* Skip nodes whose doc ids are out range. */
3397 if (query->oper == FTS_EXIST
3398 && query->upper_doc_id > 0
3399 && node.first_doc_id > query->upper_doc_id) {
3400 skip = TRUE;
3401 }
3402 break;
3403
3404 case 3: /* LAST_DOC_ID */
3405 node.last_doc_id = fts_read_doc_id(data);
3406
3407 /* Skip nodes whose doc ids are out range. */
3408 if (query->oper == FTS_EXIST
3409 && query->lower_doc_id > 0
3410 && node.last_doc_id < query->lower_doc_id) {
3411 skip = TRUE;
3412 }
3413 break;
3414
3415 case 4: /* ILIST */
3416
3417 error = fts_query_filter_doc_ids(
3418 query, &word_freq->word, word_freq,
3419 &node, data, len, FALSE);
3420
3421 break;
3422
3423 default:
3424 ut_error;
3425 }
3426 }
3427
3428 if (!skip) {
3429 /* Make sure all columns were read. */
3430
3431 ut_a(i == 5);
3432 }
3433
3434 return error;
3435}
3436
3437/*****************************************************************//**
3438Callback function to fetch the rows in an FTS INDEX record.
3439@return always returns TRUE */
3440static
3441ibool
3442fts_query_index_fetch_nodes(
3443/*========================*/
3444 void* row, /*!< in: sel_node_t* */
3445 void* user_arg) /*!< in: pointer to fts_fetch_t */
3446{
3447 fts_string_t key;
3448 sel_node_t* sel_node = static_cast<sel_node_t*>(row);
3449 fts_fetch_t* fetch = static_cast<fts_fetch_t*>(user_arg);
3450 fts_query_t* query = static_cast<fts_query_t*>(fetch->read_arg);
3451 que_node_t* exp = sel_node->select_list;
3452 dfield_t* dfield = que_node_get_val(exp);
3453 void* data = dfield_get_data(dfield);
3454 ulint dfield_len = dfield_get_len(dfield);
3455
3456 key.f_str = static_cast<byte*>(data);
3457 key.f_len = dfield_len;
3458
3459 ut_a(dfield_len <= FTS_MAX_WORD_LEN);
3460
3461 /* Note: we pass error out by 'query->error' */
3462 query->error = fts_query_read_node(query, &key, que_node_get_next(exp));
3463
3464 if (query->error != DB_SUCCESS) {
3465 ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
3466 return(FALSE);
3467 } else {
3468 return(TRUE);
3469 }
3470}
3471
3472/*****************************************************************//**
3473Calculate the inverse document frequency (IDF) for all the terms. */
3474static
3475void
3476fts_query_calculate_idf(
3477/*====================*/
3478 fts_query_t* query) /*!< in: Query state */
3479{
3480 const ib_rbt_node_t* node;
3481 ib_uint64_t total_docs = query->total_docs;
3482
3483 /* We need to free any instances of fts_doc_freq_t that we
3484 may have allocated. */
3485 for (node = rbt_first(query->word_freqs);
3486 node;
3487 node = rbt_next(query->word_freqs, node)) {
3488
3489 fts_word_freq_t* word_freq;
3490
3491 word_freq = rbt_value(fts_word_freq_t, node);
3492
3493 if (word_freq->doc_count > 0) {
3494 if (total_docs == word_freq->doc_count) {
3495 /* QP assume ranking > 0 if we find
3496 a match. Since Log10(1) = 0, we cannot
3497 make IDF a zero value if do find a
3498 word in all documents. So let's make
3499 it an arbitrary very small number */
3500 word_freq->idf = log10(1.0001);
3501 } else {
3502 word_freq->idf = log10(
3503 total_docs
3504 / (double) word_freq->doc_count);
3505 }
3506 }
3507
3508 if (fts_enable_diag_print) {
3509 ib::info() << "'" << word_freq->word.f_str << "' -> "
3510 << query->total_docs << "/"
3511 << word_freq->doc_count << " "
3512 << std::setw(6) << std::setprecision(5)
3513 << word_freq->idf;
3514 }
3515 }
3516}
3517
3518/*****************************************************************//**
3519Calculate the ranking of the document. */
3520static
3521void
3522fts_query_calculate_ranking(
3523/*========================*/
3524 const fts_query_t* query, /*!< in: query state */
3525 fts_ranking_t* ranking) /*!< in: Document to rank */
3526{
3527 ulint pos = 0;
3528 fts_string_t word;
3529
3530 /* At this stage, ranking->rank should not exceed the 1.0
3531 bound */
3532 ut_ad(ranking->rank <= 1.0 && ranking->rank >= -1.0);
3533 ut_ad(rbt_size(query->word_map) == query->word_vector->size());
3534
3535 while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
3536 int ret;
3537 ib_rbt_bound_t parent;
3538 double weight;
3539 fts_doc_freq_t* doc_freq;
3540 fts_word_freq_t* word_freq;
3541
3542 ret = rbt_search(query->word_freqs, &parent, &word);
3543
3544 /* It must exist. */
3545 ut_a(ret == 0);
3546
3547 word_freq = rbt_value(fts_word_freq_t, parent.last);
3548
3549 ret = rbt_search(
3550 word_freq->doc_freqs, &parent, &ranking->doc_id);
3551
3552 /* It must exist. */
3553 ut_a(ret == 0);
3554
3555 doc_freq = rbt_value(fts_doc_freq_t, parent.last);
3556
3557 weight = (double) doc_freq->freq * word_freq->idf;
3558
3559 ranking->rank += (fts_rank_t) (weight * word_freq->idf);
3560 }
3561}
3562
3563/*****************************************************************//**
3564Add ranking to the result set. */
3565static
3566void
3567fts_query_add_ranking(
3568/*==================*/
3569 fts_query_t* query, /*!< in: query state */
3570 ib_rbt_t* ranking_tree, /*!< in: ranking tree */
3571 const fts_ranking_t* new_ranking) /*!< in: ranking of a document */
3572{
3573 ib_rbt_bound_t parent;
3574
3575 /* Lookup the ranking in our rb tree and add if it doesn't exist. */
3576 if (rbt_search(ranking_tree, &parent, new_ranking) == 0) {
3577 fts_ranking_t* ranking;
3578
3579 ranking = rbt_value(fts_ranking_t, parent.last);
3580
3581 ranking->rank += new_ranking->rank;
3582
3583 ut_a(ranking->words == NULL);
3584 } else {
3585 rbt_add_node(ranking_tree, &parent, new_ranking);
3586
3587 query->total_size += SIZEOF_RBT_NODE_ADD
3588 + sizeof(fts_ranking_t);
3589 }
3590}
3591
3592/*****************************************************************//**
3593Retrieve the FTS Relevance Ranking result for doc with doc_id
3594@return the relevance ranking value, 0 if no ranking value
3595present. */
3596float
3597fts_retrieve_ranking(
3598/*=================*/
3599 fts_result_t* result, /*!< in: FTS result structure */
3600 doc_id_t doc_id) /*!< in: doc_id of the item to retrieve */
3601{
3602 ib_rbt_bound_t parent;
3603 fts_ranking_t new_ranking;
3604
3605 DBUG_ENTER("fts_retrieve_ranking");
3606
3607 if (!result || !result->rankings_by_id) {
3608 DBUG_RETURN(0);
3609 }
3610
3611 new_ranking.doc_id = doc_id;
3612
3613 /* Lookup the ranking in our rb tree */
3614 if (rbt_search(result->rankings_by_id, &parent, &new_ranking) == 0) {
3615 fts_ranking_t* ranking;
3616
3617 ranking = rbt_value(fts_ranking_t, parent.last);
3618
3619 DBUG_RETURN(ranking->rank);
3620 }
3621
3622 DBUG_RETURN(0);
3623}
3624
3625/*****************************************************************//**
3626Create the result and copy the data to it. */
3627static
3628fts_result_t*
3629fts_query_prepare_result(
3630/*=====================*/
3631 fts_query_t* query, /*!< in: Query state */
3632 fts_result_t* result) /*!< in: result this can contain
3633 data from a previous search on
3634 another FTS index */
3635{
3636 const ib_rbt_node_t* node;
3637 bool result_is_null = false;
3638
3639 DBUG_ENTER("fts_query_prepare_result");
3640
3641 if (result == NULL) {
3642 result = static_cast<fts_result_t*>(
3643 ut_zalloc_nokey(sizeof(*result)));
3644
3645 result->rankings_by_id = rbt_create(
3646 sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
3647
3648 query->total_size += sizeof(fts_result_t) + SIZEOF_RBT_CREATE;
3649 result_is_null = true;
3650 }
3651
3652 if (query->flags == FTS_OPT_RANKING) {
3653 fts_word_freq_t* word_freq;
3654 ulint size = ib_vector_size(query->deleted->doc_ids);
3655 fts_update_t* array =
3656 (fts_update_t*) query->deleted->doc_ids->data;
3657
3658 node = rbt_first(query->word_freqs);
3659 ut_ad(node);
3660 word_freq = rbt_value(fts_word_freq_t, node);
3661
3662 for (node = rbt_first(word_freq->doc_freqs);
3663 node;
3664 node = rbt_next(word_freq->doc_freqs, node)) {
3665 fts_doc_freq_t* doc_freq;
3666 fts_ranking_t ranking;
3667
3668 doc_freq = rbt_value(fts_doc_freq_t, node);
3669
3670 /* Don't put deleted docs into result */
3671 if (fts_bsearch(array, 0, static_cast<int>(size),
3672 doc_freq->doc_id) >= 0) {
3673 /* one less matching doc count */
3674 --word_freq->doc_count;
3675 continue;
3676 }
3677
3678 ranking.doc_id = doc_freq->doc_id;
3679 ranking.rank = static_cast<fts_rank_t>(doc_freq->freq);
3680 ranking.words = NULL;
3681
3682 fts_query_add_ranking(query, result->rankings_by_id,
3683 &ranking);
3684
3685 if (query->total_size > fts_result_cache_limit) {
3686 query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
3687 fts_query_free_result(result);
3688 DBUG_RETURN(NULL);
3689 }
3690 }
3691
3692 /* Calculate IDF only after we exclude the deleted items */
3693 fts_query_calculate_idf(query);
3694
3695 node = rbt_first(query->word_freqs);
3696 word_freq = rbt_value(fts_word_freq_t, node);
3697
3698 /* Calculate the ranking for each doc */
3699 for (node = rbt_first(result->rankings_by_id);
3700 node != NULL;
3701 node = rbt_next(result->rankings_by_id, node)) {
3702
3703 fts_ranking_t* ranking;
3704
3705 ranking = rbt_value(fts_ranking_t, node);
3706
3707 ranking->rank = static_cast<fts_rank_t>(
3708 ranking->rank * word_freq->idf * word_freq->idf);
3709 }
3710
3711 DBUG_RETURN(result);
3712 }
3713
3714 ut_a(rbt_size(query->doc_ids) > 0);
3715
3716 for (node = rbt_first(query->doc_ids);
3717 node;
3718 node = rbt_next(query->doc_ids, node)) {
3719
3720 fts_ranking_t* ranking;
3721
3722 ranking = rbt_value(fts_ranking_t, node);
3723 fts_query_calculate_ranking(query, ranking);
3724
3725 // FIXME: I think we may requre this information to improve the
3726 // ranking of doc ids which have more word matches from
3727 // different FTS indexes.
3728
3729 /* We don't need these anymore free the resources. */
3730 ranking->words = NULL;
3731
3732 if (!result_is_null) {
3733 fts_query_add_ranking(query, result->rankings_by_id, ranking);
3734
3735 if (query->total_size > fts_result_cache_limit) {
3736 query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
3737 fts_query_free_result(result);
3738 DBUG_RETURN(NULL);
3739 }
3740 }
3741 }
3742
3743 if (result_is_null) {
3744 /* Use doc_ids directly */
3745 rbt_free(result->rankings_by_id);
3746 result->rankings_by_id = query->doc_ids;
3747 query->doc_ids = NULL;
3748 }
3749
3750 DBUG_RETURN(result);
3751}
3752
3753/*****************************************************************//**
3754Get the result of the query. Calculate the similarity coefficient. */
3755static
3756fts_result_t*
3757fts_query_get_result(
3758/*=================*/
3759 fts_query_t* query, /*!< in: query instance */
3760 fts_result_t* result) /*!< in: result */
3761{
3762 DBUG_ENTER("fts_query_get_result");
3763
3764 if (rbt_size(query->doc_ids) > 0 || query->flags == FTS_OPT_RANKING) {
3765 /* Copy the doc ids to the result. */
3766 result = fts_query_prepare_result(query, result);
3767 } else {
3768 /* Create an empty result instance. */
3769 result = static_cast<fts_result_t*>(
3770 ut_zalloc_nokey(sizeof(*result)));
3771 }
3772
3773 DBUG_RETURN(result);
3774}
3775
3776/*****************************************************************//**
3777FTS Query free resources and reset. */
3778static
3779void
3780fts_query_free(
3781/*===========*/
3782 fts_query_t* query) /*!< in: query instance to free*/
3783{
3784
3785 if (query->read_nodes_graph) {
3786 fts_que_graph_free(query->read_nodes_graph);
3787 }
3788
3789 if (query->root) {
3790 fts_ast_free_node(query->root);
3791 }
3792
3793 if (query->deleted) {
3794 fts_doc_ids_free(query->deleted);
3795 }
3796
3797 if (query->intersection) {
3798 fts_query_free_doc_ids(query, query->intersection);
3799 }
3800
3801 if (query->doc_ids) {
3802 fts_query_free_doc_ids(query, query->doc_ids);
3803 }
3804
3805 if (query->word_freqs) {
3806 const ib_rbt_node_t* node;
3807
3808 /* We need to free any instances of fts_doc_freq_t that we
3809 may have allocated. */
3810 for (node = rbt_first(query->word_freqs);
3811 node;
3812 node = rbt_next(query->word_freqs, node)) {
3813
3814 fts_word_freq_t* word_freq;
3815
3816 word_freq = rbt_value(fts_word_freq_t, node);
3817
3818 /* We need to cast away the const. */
3819 rbt_free(word_freq->doc_freqs);
3820 }
3821
3822 rbt_free(query->word_freqs);
3823 }
3824
3825 if (query->wildcard_words != NULL) {
3826 rbt_free(query->wildcard_words);
3827 }
3828
3829 ut_a(!query->intersection);
3830
3831 if (query->word_map) {
3832 rbt_free(query->word_map);
3833 }
3834
3835 if (query->word_vector != NULL) {
3836 UT_DELETE(query->word_vector);
3837 }
3838
3839 if (query->heap) {
3840 mem_heap_free(query->heap);
3841 }
3842
3843 memset(query, 0, sizeof(*query));
3844}
3845
3846/*****************************************************************//**
3847Parse the query using flex/bison or plugin parser.
3848@return parse tree node. */
3849static
3850fts_ast_node_t*
3851fts_query_parse(
3852/*============*/
3853 fts_query_t* query, /*!< in: query instance */
3854 byte* query_str, /*!< in: query string */
3855 ulint query_len) /*!< in: query string length */
3856{
3857 int error;
3858 fts_ast_state_t state;
3859 bool mode = query->boolean_mode;
3860 DBUG_ENTER("fts_query_parse");
3861
3862 memset(&state, 0x0, sizeof(state));
3863
3864 state.charset = query->fts_index_table.charset;
3865
3866 DBUG_EXECUTE_IF("fts_instrument_query_disable_parser",
3867 query->parser = NULL;);
3868
3869 if (query->parser) {
3870 state.root = state.cur_node =
3871 fts_ast_create_node_list(&state, NULL);
3872 error = fts_parse_by_parser(mode, query_str, query_len,
3873 query->parser, &state);
3874 } else {
3875 /* Setup the scanner to use, this depends on the mode flag. */
3876 state.lexer = fts_lexer_create(mode, query_str, query_len);
3877 state.charset = query->fts_index_table.charset;
3878 error = fts_parse(&state);
3879 fts_lexer_free(state.lexer);
3880 state.lexer = NULL;
3881 }
3882
3883 /* Error during parsing ? */
3884 if (error) {
3885 /* Free the nodes that were allocated during parsing. */
3886 fts_ast_state_free(&state);
3887 } else {
3888 query->root = state.root;
3889
3890 if (fts_enable_diag_print && query->root != NULL) {
3891 fts_ast_node_print(query->root);
3892 }
3893 }
3894
3895 DBUG_RETURN(state.root);
3896}
3897
3898/*******************************************************************//**
3899FTS Query optimization
3900Set FTS_OPT_RANKING if it is a simple term query */
3901static
3902void
3903fts_query_can_optimize(
3904/*===================*/
3905 fts_query_t* query, /*!< in/out: query instance */
3906 uint flags) /*!< In: FTS search mode */
3907{
3908 fts_ast_node_t* node = query->root;
3909
3910 if (flags & FTS_EXPAND) {
3911 return;
3912 }
3913
3914 /* Check if it has only a term without oper */
3915 ut_ad(node->type == FTS_AST_LIST);
3916 node = node->list.head;
3917 if (node != NULL && node->type == FTS_AST_TERM && node->next == NULL) {
3918 query->flags = FTS_OPT_RANKING;
3919 }
3920}
3921
3922/** FTS Query entry point.
3923@param[in] index fts index to search
3924@param[in] flags FTS search mode
3925@param[in] query_str FTS query
3926@param[in] query_len FTS query string len in bytes
3927@param[in,out] result result doc ids
3928@return DB_SUCCESS if successful otherwise error code */
3929dberr_t
3930fts_query(
3931 dict_index_t* index,
3932 uint flags,
3933 const byte* query_str,
3934 ulint query_len,
3935 fts_result_t** result)
3936{
3937 fts_query_t query;
3938 dberr_t error = DB_SUCCESS;
3939 byte* lc_query_str;
3940 ulint lc_query_str_len;
3941 ulint result_len;
3942 bool boolean_mode;
3943 trx_t* query_trx;
3944 CHARSET_INFO* charset;
3945 ulint start_time_ms;
3946 bool will_be_ignored = false;
3947
3948 boolean_mode = flags & FTS_BOOL;
3949
3950 *result = NULL;
3951 memset(&query, 0x0, sizeof(query));
3952 query_trx = trx_create();
3953 query_trx->op_info = "FTS query";
3954
3955 start_time_ms = ut_time_ms();
3956
3957 query.trx = query_trx;
3958 query.index = index;
3959 query.boolean_mode = boolean_mode;
3960 query.deleted = fts_doc_ids_create();
3961 query.cur_node = NULL;
3962
3963 query.fts_common_table.type = FTS_COMMON_TABLE;
3964 query.fts_common_table.table_id = index->table->id;
3965 query.fts_common_table.parent = index->table->name.m_name;
3966 query.fts_common_table.table = index->table;
3967
3968 charset = fts_index_get_charset(index);
3969
3970 query.fts_index_table.type = FTS_INDEX_TABLE;
3971 query.fts_index_table.index_id = index->id;
3972 query.fts_index_table.table_id = index->table->id;
3973 query.fts_index_table.parent = index->table->name.m_name;
3974 query.fts_index_table.charset = charset;
3975 query.fts_index_table.table = index->table;
3976
3977 query.word_map = rbt_create_arg_cmp(
3978 sizeof(fts_string_t), innobase_fts_text_cmp, (void*)charset);
3979 query.word_vector = UT_NEW_NOKEY(word_vector_t());
3980 query.error = DB_SUCCESS;
3981
3982 /* Setup the RB tree that will be used to collect per term
3983 statistics. */
3984 query.word_freqs = rbt_create_arg_cmp(
3985 sizeof(fts_word_freq_t), innobase_fts_text_cmp,
3986 (void*) charset);
3987
3988 if (flags & FTS_EXPAND) {
3989 query.wildcard_words = rbt_create_arg_cmp(
3990 sizeof(fts_string_t), innobase_fts_text_cmp, (void *)charset);
3991 }
3992
3993 query.total_size += SIZEOF_RBT_CREATE;
3994
3995 query.total_docs = dict_table_get_n_rows(index->table);
3996
3997 query.fts_common_table.suffix = "DELETED";
3998
3999 /* Read the deleted doc_ids, we need these for filtering. */
4000 error = fts_table_fetch_doc_ids(
4001 NULL, &query.fts_common_table, query.deleted);
4002
4003 if (error != DB_SUCCESS) {
4004 goto func_exit;
4005 }
4006
4007 query.fts_common_table.suffix = "DELETED_CACHE";
4008
4009 error = fts_table_fetch_doc_ids(
4010 NULL, &query.fts_common_table, query.deleted);
4011
4012 if (error != DB_SUCCESS) {
4013 goto func_exit;
4014 }
4015
4016 /* Get the deleted doc ids that are in the cache. */
4017 fts_cache_append_deleted_doc_ids(
4018 index->table->fts->cache, query.deleted->doc_ids);
4019 DEBUG_SYNC_C("fts_deleted_doc_ids_append");
4020
4021 /* Sort the vector so that we can do a binary search over the ids. */
4022 ib_vector_sort(query.deleted->doc_ids, fts_update_doc_id_cmp);
4023
4024 /* Convert the query string to lower case before parsing. We own
4025 the ut_malloc'ed result and so remember to free it before return. */
4026
4027 lc_query_str_len = query_len * charset->casedn_multiply + 1;
4028 lc_query_str = static_cast<byte*>(ut_malloc_nokey(lc_query_str_len));
4029
4030 /* For binary collations, a case sensitive search is
4031 performed. Hence don't convert to lower case. */
4032 if (my_binary_compare(charset)) {
4033 memcpy(lc_query_str, query_str, query_len);
4034 lc_query_str[query_len]= 0;
4035 result_len= query_len;
4036 } else {
4037 result_len = innobase_fts_casedn_str(
4038 charset, (char*)( query_str), query_len,
4039 (char*)(lc_query_str), lc_query_str_len);
4040 }
4041
4042 ut_ad(result_len < lc_query_str_len);
4043
4044 lc_query_str[result_len] = 0;
4045
4046 query.heap = mem_heap_create(128);
4047
4048 /* Create the rb tree for the doc id (current) set. */
4049 query.doc_ids = rbt_create(
4050 sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
4051 query.parser = index->parser;
4052
4053 query.total_size += SIZEOF_RBT_CREATE;
4054
4055 /* Parse the input query string. */
4056 if (fts_query_parse(&query, lc_query_str, result_len)) {
4057 fts_ast_node_t* ast = query.root;
4058
4059 /* Optimize query to check if it's a single term */
4060 fts_query_can_optimize(&query, flags);
4061
4062 DBUG_EXECUTE_IF("fts_instrument_result_cache_limit",
4063 fts_result_cache_limit = 2048;
4064 );
4065
4066 /* Traverse the Abstract Syntax Tree (AST) and execute
4067 the query. */
4068 query.error = fts_ast_visit(
4069 FTS_NONE, ast, fts_query_visitor,
4070 &query, &will_be_ignored);
4071
4072 /* If query expansion is requested, extend the search
4073 with first search pass result */
4074 if (query.error == DB_SUCCESS && (flags & FTS_EXPAND)) {
4075 query.error = fts_expand_query(index, &query);
4076 }
4077
4078 /* Calculate the inverse document frequency of the terms. */
4079 if (query.error == DB_SUCCESS
4080 && query.flags != FTS_OPT_RANKING) {
4081 fts_query_calculate_idf(&query);
4082 }
4083
4084 /* Copy the result from the query state, so that we can
4085 return it to the caller. */
4086 if (query.error == DB_SUCCESS) {
4087 *result = fts_query_get_result(&query, *result);
4088 }
4089
4090 error = query.error;
4091 } else {
4092 /* still return an empty result set */
4093 *result = static_cast<fts_result_t*>(
4094 ut_zalloc_nokey(sizeof(**result)));
4095 }
4096
4097 ut_free(lc_query_str);
4098
4099 if (fts_enable_diag_print && (*result)) {
4100 ulint diff_time = ut_time_ms() - start_time_ms;
4101
4102 ib::info() << "FTS Search Processing time: "
4103 << diff_time / 1000 << " secs: " << diff_time % 1000
4104 << " millisec: row(s) "
4105 << ((*result)->rankings_by_id
4106 ? lint(rbt_size((*result)->rankings_by_id))
4107 : -1);
4108
4109 /* Log memory consumption & result size */
4110 ib::info() << "Full Search Memory: " << query.total_size
4111 << " (bytes), Row: "
4112 << ((*result)->rankings_by_id
4113 ? rbt_size((*result)->rankings_by_id)
4114 : 0)
4115 << ".";
4116 }
4117
4118func_exit:
4119 fts_query_free(&query);
4120
4121 trx_free(query_trx);
4122
4123 return(error);
4124}
4125
4126/*****************************************************************//**
4127FTS Query free result, returned by fts_query(). */
4128void
4129fts_query_free_result(
4130/*==================*/
4131 fts_result_t* result) /*!< in: result instance to free.*/
4132{
4133 if (result) {
4134 if (result->rankings_by_id != NULL) {
4135 rbt_free(result->rankings_by_id);
4136 result->rankings_by_id = NULL;
4137 }
4138 if (result->rankings_by_rank != NULL) {
4139 rbt_free(result->rankings_by_rank);
4140 result->rankings_by_rank = NULL;
4141 }
4142
4143 ut_free(result);
4144 result = NULL;
4145 }
4146}
4147
4148/*****************************************************************//**
4149FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. */
4150void
4151fts_query_sort_result_on_rank(
4152/*==========================*/
4153 fts_result_t* result) /*!< out: result instance to sort.*/
4154{
4155 const ib_rbt_node_t* node;
4156 ib_rbt_t* ranked;
4157
4158 ut_a(result->rankings_by_id != NULL);
4159 if (result->rankings_by_rank) {
4160 rbt_free(result->rankings_by_rank);
4161 }
4162
4163 ranked = rbt_create(sizeof(fts_ranking_t), fts_query_compare_rank);
4164
4165 /* We need to free any instances of fts_doc_freq_t that we
4166 may have allocated. */
4167 for (node = rbt_first(result->rankings_by_id);
4168 node;
4169 node = rbt_next(result->rankings_by_id, node)) {
4170
4171 fts_ranking_t* ranking;
4172
4173 ranking = rbt_value(fts_ranking_t, node);
4174
4175 ut_a(ranking->words == NULL);
4176
4177 rbt_insert(ranked, ranking, ranking);
4178 }
4179
4180 /* Reset the current node too. */
4181 result->current = NULL;
4182 result->rankings_by_rank = ranked;
4183}
4184
4185/*******************************************************************//**
4186A debug function to print result doc_id set. */
4187static
4188void
4189fts_print_doc_id(
4190/*=============*/
4191 fts_query_t* query) /*!< in : tree that stores doc_ids.*/
4192{
4193 const ib_rbt_node_t* node;
4194
4195 /* Iterate each member of the doc_id set */
4196 for (node = rbt_first(query->doc_ids);
4197 node;
4198 node = rbt_next(query->doc_ids, node)) {
4199 fts_ranking_t* ranking;
4200 ranking = rbt_value(fts_ranking_t, node);
4201
4202 ib::info() << "doc_ids info, doc_id: " << ranking->doc_id;
4203
4204 ulint pos = 0;
4205 fts_string_t word;
4206
4207 while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
4208 ib::info() << "doc_ids info, value: " << word.f_str;
4209 }
4210 }
4211}
4212
4213/*************************************************************//**
4214This function implements a simple "blind" query expansion search:
4215words in documents found in the first search pass will be used as
4216search arguments to search the document again, thus "expand"
4217the search result set.
4218@return DB_SUCCESS if success, otherwise the error code */
4219static MY_ATTRIBUTE((nonnull, warn_unused_result))
4220dberr_t
4221fts_expand_query(
4222/*=============*/
4223 dict_index_t* index, /*!< in: FTS index to search */
4224 fts_query_t* query) /*!< in: FTS query instance */
4225{
4226 const ib_rbt_node_t* node;
4227 const ib_rbt_node_t* token_node;
4228 fts_doc_t result_doc;
4229 dberr_t error = DB_SUCCESS;
4230 const fts_index_cache_t*index_cache;
4231
4232 /* If no doc is found in first search pass, return */
4233 if (!rbt_size(query->doc_ids)) {
4234 return(error);
4235 }
4236
4237 /* Init "result_doc", to hold words from the first search pass */
4238 fts_doc_init(&result_doc);
4239
4240 rw_lock_x_lock(&index->table->fts->cache->lock);
4241 index_cache = fts_find_index_cache(index->table->fts->cache, index);
4242 rw_lock_x_unlock(&index->table->fts->cache->lock);
4243
4244 ut_a(index_cache);
4245
4246 result_doc.tokens = rbt_create_arg_cmp(
4247 sizeof(fts_token_t), innobase_fts_text_cmp,
4248 (void*) index_cache->charset);
4249
4250 result_doc.charset = index_cache->charset;
4251 result_doc.parser = index_cache->index->parser;
4252
4253 query->total_size += SIZEOF_RBT_CREATE;
4254
4255 if (fts_enable_diag_print) {
4256 fts_print_doc_id(query);
4257 }
4258
4259 for (node = rbt_first(query->doc_ids);
4260 node;
4261 node = rbt_next(query->doc_ids, node)) {
4262
4263 fts_ranking_t* ranking;
4264 ulint prev_token_size;
4265 ulint estimate_size;
4266
4267 prev_token_size = rbt_size(result_doc.tokens);
4268
4269 ranking = rbt_value(fts_ranking_t, node);
4270
4271 /* Fetch the documents with the doc_id from the
4272 result of first seach pass. Since we do not
4273 store document-to-word mapping, we need to
4274 fetch the original document and parse them.
4275 Future optimization could be done here if we
4276 support some forms of document-to-word mapping */
4277 fts_doc_fetch_by_doc_id(NULL, ranking->doc_id, index,
4278 FTS_FETCH_DOC_BY_ID_EQUAL,
4279 fts_query_expansion_fetch_doc,
4280 &result_doc);
4281
4282 /* Estimate memory used, see fts_process_token and fts_token_t.
4283 We ignore token size here. */
4284 estimate_size = (rbt_size(result_doc.tokens) - prev_token_size)
4285 * (SIZEOF_RBT_NODE_ADD + sizeof(fts_token_t)
4286 + sizeof(ib_vector_t) + sizeof(ulint) * 32);
4287 query->total_size += estimate_size;
4288
4289 if (query->total_size > fts_result_cache_limit) {
4290 error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
4291 goto func_exit;
4292 }
4293 }
4294
4295 /* Remove words that have already been searched in the first pass */
4296 for (ulint i = 0; i < query->word_vector->size(); i++) {
4297 fts_string_t word = query->word_vector->at(i);
4298 ib_rbt_bound_t parent;
4299
4300 if (query->wildcard_words
4301 && rbt_search(query->wildcard_words, &parent, &word) == 0) {
4302 /* If it's a wildcard word, remove words having
4303 it as prefix. */
4304 while (rbt_search_cmp(result_doc.tokens,
4305 &parent, &word, NULL,
4306 innobase_fts_text_cmp_prefix)
4307 == 0) {
4308 ut_free(rbt_remove_node(result_doc.tokens,
4309 parent.last));
4310 }
4311 } else {
4312 /* We don't check return value, because the word may
4313 have been deleted by a previous wildcard word as its
4314 prefix, e.g. ('g * good'). */
4315 rbt_delete(result_doc.tokens, &word);
4316 }
4317 }
4318
4319 /* Search the table the second time with expanded search list */
4320 for (token_node = rbt_first(result_doc.tokens);
4321 token_node;
4322 token_node = rbt_next(result_doc.tokens, token_node)) {
4323 fts_token_t* mytoken;
4324 mytoken = rbt_value(fts_token_t, token_node);
4325
4326 /* '%' in the end is treated as prefix search,
4327 it can cause assert failure, so we skip it. */
4328 if (mytoken->text.f_str[mytoken->text.f_len - 1] == '%') {
4329 continue;
4330 }
4331
4332 ut_ad(mytoken->text.f_str[mytoken->text.f_len] == 0);
4333 fts_query_add_word_freq(query, &mytoken->text);
4334 error = fts_query_union(query, &mytoken->text);
4335
4336 if (error != DB_SUCCESS) {
4337 break;
4338 }
4339 }
4340
4341func_exit:
4342 fts_doc_free(&result_doc);
4343
4344 return(error);
4345}
4346/*************************************************************//**
4347This function finds documents that contain all words in a
4348phrase or proximity search. And if proximity search, verify
4349the words are close enough to each other, as in specified distance.
4350This function is called for phrase and proximity search.
4351@return TRUE if documents are found, FALSE if otherwise */
4352static
4353ibool
4354fts_phrase_or_proximity_search(
4355/*===========================*/
4356 fts_query_t* query, /*!< in/out: query instance.
4357 query->doc_ids might be instantiated
4358 with qualified doc IDs */
4359 ib_vector_t* tokens) /*!< in: Tokens contain words */
4360{
4361 ulint n_matched;
4362 ulint i;
4363 ibool matched = FALSE;
4364 ulint num_token = ib_vector_size(tokens);
4365 fts_match_t* match[MAX_PROXIMITY_ITEM];
4366 ibool end_list = FALSE;
4367
4368 /* Number of matched documents for the first token */
4369 n_matched = ib_vector_size(query->match_array[0]);
4370
4371 /* We have a set of match list for each word, we shall
4372 walk through the list and find common documents that
4373 contain all the matching words. */
4374 for (i = 0; i < n_matched; i++) {
4375 ulint j;
4376 ulint k = 0;
4377 fts_proximity_t qualified_pos;
4378
4379 match[0] = static_cast<fts_match_t*>(
4380 ib_vector_get(query->match_array[0], i));
4381
4382 /* For remaining match list for the token(word), we
4383 try to see if there is a document with the same
4384 doc id */
4385 for (j = 1; j < num_token; j++) {
4386 match[j] = static_cast<fts_match_t*>(
4387 ib_vector_get(query->match_array[j], k));
4388
4389 while (match[j]->doc_id < match[0]->doc_id
4390 && k < ib_vector_size(query->match_array[j])) {
4391 match[j] = static_cast<fts_match_t*>(
4392 ib_vector_get(
4393 query->match_array[j], k));
4394 k++;
4395 }
4396
4397 if (match[j]->doc_id > match[0]->doc_id) {
4398 /* no match */
4399 if (query->flags & FTS_PHRASE) {
4400 match[0]->doc_id = 0;
4401 }
4402 break;
4403 }
4404
4405 if (k == ib_vector_size(query->match_array[j])) {
4406 end_list = TRUE;
4407
4408 if (match[j]->doc_id != match[0]->doc_id) {
4409 /* no match */
4410 if (query->flags & FTS_PHRASE) {
4411 ulint s;
4412
4413 match[0]->doc_id = 0;
4414
4415 for (s = i + 1; s < n_matched;
4416 s++) {
4417 match[0] = static_cast<
4418 fts_match_t*>(
4419 ib_vector_get(
4420 query->match_array[0],
4421 s));
4422 match[0]->doc_id = 0;
4423 }
4424 }
4425
4426 goto func_exit;
4427 }
4428 }
4429
4430 /* FIXME: A better solution will be a counter array
4431 remember each run's last position. So we don't
4432 reset it here very time */
4433 k = 0;
4434 }
4435
4436 if (j != num_token) {
4437 continue;
4438 }
4439
4440 /* For this matching doc, we need to further
4441 verify whether the words in the doc are close
4442 to each other, and within the distance specified
4443 in the proximity search */
4444 if (query->flags & FTS_PHRASE) {
4445 matched = TRUE;
4446 } else if (fts_proximity_get_positions(
4447 match, num_token, ULINT_MAX, &qualified_pos)) {
4448
4449 /* Fetch the original documents and count the
4450 words in between matching words to see that is in
4451 specified distance */
4452 if (fts_query_is_in_proximity_range(
4453 query, match, &qualified_pos)) {
4454 /* If so, mark we find a matching doc */
4455 query->error = fts_query_process_doc_id(
4456 query, match[0]->doc_id, 0);
4457 if (query->error != DB_SUCCESS) {
4458 matched = FALSE;
4459 goto func_exit;
4460 }
4461
4462 matched = TRUE;
4463 for (ulint z = 0; z < num_token; z++) {
4464 fts_string_t* token;
4465 token = static_cast<fts_string_t*>(
4466 ib_vector_get(tokens, z));
4467 fts_query_add_word_to_document(
4468 query, match[0]->doc_id, token);
4469 }
4470 }
4471 }
4472
4473 if (end_list) {
4474 break;
4475 }
4476 }
4477
4478func_exit:
4479 return(matched);
4480}
4481
4482/*************************************************************//**
4483This function checks whether words in result documents are close to
4484each other (within proximity range as specified by "distance").
4485If "distance" is MAX_ULINT, then it will find all combinations of
4486positions of matching words and store min and max positions
4487in the "qualified_pos" for later verification.
4488@return true if words are close to each other, false if otherwise */
4489static
4490bool
4491fts_proximity_get_positions(
4492/*========================*/
4493 fts_match_t** match, /*!< in: query instance */
4494 ulint num_match, /*!< in: number of matching
4495 items */
4496 ulint distance, /*!< in: distance value
4497 for proximity search */
4498 fts_proximity_t* qualified_pos) /*!< out: the position info
4499 records ranges containing
4500 all matching words. */
4501{
4502 ulint i;
4503 ulint idx[MAX_PROXIMITY_ITEM];
4504 ulint num_pos[MAX_PROXIMITY_ITEM];
4505 ulint min_idx;
4506
4507 qualified_pos->n_pos = 0;
4508
4509 ut_a(num_match <= MAX_PROXIMITY_ITEM);
4510
4511 /* Each word could appear multiple times in a doc. So
4512 we need to walk through each word's position list, and find
4513 closest distance between different words to see if
4514 they are in the proximity distance. */
4515
4516 /* Assume each word's position list is sorted, we
4517 will just do a walk through to all words' lists
4518 similar to a the merge phase of a merge sort */
4519 for (i = 0; i < num_match; i++) {
4520 /* idx is the current position we are checking
4521 for a particular word */
4522 idx[i] = 0;
4523
4524 /* Number of positions for this word */
4525 num_pos[i] = ib_vector_size(match[i]->positions);
4526 }
4527
4528 /* Start with the first word */
4529 min_idx = 0;
4530
4531 while (idx[min_idx] < num_pos[min_idx]) {
4532 ulint position[MAX_PROXIMITY_ITEM];
4533 ulint min_pos = ULINT_MAX;
4534 ulint max_pos = 0;
4535
4536 /* Check positions in each word position list, and
4537 record the max/min position */
4538 for (i = 0; i < num_match; i++) {
4539 position[i] = *(ulint*) ib_vector_get_const(
4540 match[i]->positions, idx[i]);
4541
4542 if (position[i] == ULINT_UNDEFINED) {
4543 break;
4544 }
4545
4546 if (position[i] < min_pos) {
4547 min_pos = position[i];
4548 min_idx = i;
4549 }
4550
4551 if (position[i] > max_pos) {
4552 max_pos = position[i];
4553 }
4554 }
4555
4556 /* If max and min position are within range, we
4557 find a good match */
4558 if (max_pos - min_pos <= distance
4559 && (i >= num_match || position[i] != ULINT_UNDEFINED)) {
4560 /* The charset has variable character
4561 length encoding, record the min_pos and
4562 max_pos, we will need to verify the actual
4563 number of characters */
4564 qualified_pos->min_pos.push_back(min_pos);
4565 qualified_pos->max_pos.push_back(max_pos);
4566 qualified_pos->n_pos++;
4567 }
4568
4569 /* Otherwise, move to the next position is the
4570 list for the word with the smallest position */
4571 idx[min_idx]++;
4572 }
4573
4574 return(qualified_pos->n_pos != 0);
4575}
4576