1 | /***************************************************************************** |
2 | |
3 | Copyright (c) 2007, 2017, Oracle and/or its affiliates. All Rights Reserved. |
4 | Copyright (c) 2017, 2018, MariaDB Corporation. |
5 | |
6 | This program is free software; you can redistribute it and/or modify it under |
7 | the terms of the GNU General Public License as published by the Free Software |
8 | Foundation; version 2 of the License. |
9 | |
10 | This program is distributed in the hope that it will be useful, but WITHOUT |
11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
13 | |
14 | You should have received a copy of the GNU General Public License along with |
15 | this program; if not, write to the Free Software Foundation, Inc., |
16 | 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA |
17 | |
18 | *****************************************************************************/ |
19 | |
20 | /**************************************************//** |
21 | @file fts/fts0que.cc |
22 | Full Text Search functionality. |
23 | |
24 | Created 2007/03/27 Sunny Bains |
25 | Completed 2011/7/10 Sunny and Jimmy Yang |
26 | *******************************************************/ |
27 | |
28 | #include "ha_prototypes.h" |
29 | |
30 | #include "dict0dict.h" |
31 | #include "ut0rbt.h" |
32 | #include "row0sel.h" |
33 | #include "fts0fts.h" |
34 | #include "fts0priv.h" |
35 | #include "fts0ast.h" |
36 | #include "fts0pars.h" |
37 | #include "fts0types.h" |
38 | #include "fts0plugin.h" |
39 | #include "ut0new.h" |
40 | |
41 | #include <iomanip> |
42 | #include <vector> |
43 | |
/* Access element (i, j) of the flat, row-major table `t' that has `n'
columns. Every argument is parenthesized so that expressions such as
`c + 1' can be passed safely. */
#define FTS_ELEM(t, n, i, j) ((t)[(i) * (n) + (j)])

/* Rank adjustments applied by fts_query_change_ranking(). */
#define RANK_DOWNGRADE		(-1.0F)
#define RANK_UPGRADE		(1.0F)

/* Maximum number of words supported in a phrase or proximity search. */
#define MAX_PROXIMITY_ITEM	128

/* Memory used by rbt itself for create and node add. The expansions are
parenthesized so the macros behave as a single operand in any expression. */
#define SIZEOF_RBT_CREATE	(sizeof(ib_rbt_t) + sizeof(ib_rbt_node_t) * 2)
#define SIZEOF_RBT_NODE_ADD	(sizeof(ib_rbt_node_t))

/* Initial byte length for 'words' in fts_ranking_t */
#define RANKING_WORDS_INIT_LEN	4
58 | |
59 | // FIXME: Need to have a generic iterator that traverses the ilist. |
60 | |
/** Vector of matched words; used by fts_query_t::word_vector to look a
word up by its index. */
typedef std::vector<fts_string_t, ut_allocator<fts_string_t> >	word_vector_t;

/* Forward declaration: fts_select_t refers to this type before it is
defined. */
struct fts_word_freq_t;
64 | |
/** State of an FTS query. */
struct fts_query_t {
	mem_heap_t*	heap;		/*!< Heap to use for allocations */

	trx_t*		trx;		/*!< The query transaction */

	dict_index_t*	index;		/*!< The FTS index to search */

	fts_table_t	fts_common_table;
					/*!< FTS auxiliary common table def */

	fts_table_t	fts_index_table;/*!< FTS auxiliary index table def */

	ulint		total_size;	/*!< total memory size used by query */

	fts_doc_ids_t*	deleted;	/*!< Deleted doc ids that need to be
					filtered from the output */

	fts_ast_node_t*	root;		/*!< Abstract syntax tree */

	fts_ast_node_t* cur_node;	/*!< Current tree node */

	ib_rbt_t*	word_map;	/*!< Matched word map for
					searching by word; each word's
					f_n_char holds its position in
					word_vector */

	word_vector_t*	word_vector;	/*!< Matched word vector for
					searching by index */

	ib_rbt_t*       doc_ids;	/*!< The current set of matching
					doc ids, elements are of
					type fts_ranking_t */

	ib_rbt_t*	intersection;	/*!< The doc ids that were found in
					doc_ids, this tree will become
					the new doc_ids, elements are of type
					fts_ranking_t */

	que_t*		read_nodes_graph;
					/*!< Prepared statement to read the
					nodes from the FTS INDEX */

	fts_ast_oper_t	oper;		/*!< Current boolean mode operator */

	ibool		collect_positions;
					/*!< TRUE if we want to collect the
					word positions within the document */

	ulint		flags;		/*!< Specify the full text search type,
					such as boolean search, phrase
					search, proximity search etc. */

	ulint		distance;	/*!< The proximity distance of a
					phrase search. */

	/* The following two doc ids are used as a boundary condition
	when searching the FTS index rows. */

	doc_id_t	lower_doc_id;	/*!< Lowest doc id in doc_ids */

	doc_id_t	upper_doc_id;	/*!< Highest doc id in doc_ids */

	bool		boolean_mode;	/*!< true if boolean mode query */

	ib_vector_t*	matched;	/*!< Array of matching documents
					(fts_match_t) to search for a phrase */

	ib_vector_t**	match_array;	/*!< Used for proximity search, contains
					position info for each matched word
					in the word list */

	ib_uint64_t	total_docs;	/*!< The total number of documents */

	ulint		total_words;	/*!< The total number of words */

	dberr_t		error;		/*!< Error code if any, that is
					encountered during query processing */

	ib_rbt_t*	word_freqs;	/*!< RB tree of word frequencies per
					document, its elements are of type
					fts_word_freq_t */

	ib_rbt_t*	wildcard_words;	/*!< words with wildcard */

	bool		multi_exist;	/*!< multiple FTS_EXIST oper */

	st_mysql_ftparser*	parser;	/*!< fts plugin parser */
};
153 | |
/** For phrase matching, first we collect the documents and the positions,
then we match. One instance describes one candidate document. */
struct fts_match_t {
	doc_id_t	doc_id;		/*!< Document id */

	ulint		start;		/*!< Start the phrase match from
					this offset within the positions
					vector. */

	ib_vector_t*	positions;	/*!< Offsets of a word in a
					document */
};
166 | |
/** For matching tokens in a phrase search. We use this data structure in
the callback that determines whether a document should be accepted or
rejected for a phrase search. */
struct fts_select_t {
	doc_id_t	doc_id;		/*!< The document id to match */

	ulint		min_pos;	/*!< For found to be TRUE at least
					one position must be greater than
					min_pos. */

	ibool		found;		/*!< TRUE if found */

	fts_word_freq_t*
			word_freq;	/*!< Word frequency instance of the
					current word being looked up in
					the FTS index */
};
184 | |
/** Vector of positions, used for the range bounds in fts_proximity_t. */
typedef std::vector<ulint, ut_allocator<ulint> >       pos_vector_t;

/** This structure defines a set of ranges for original documents, each of
which has a minimum position and a maximum position. Text in such a range
should contain all words in the proximity search. We will need to count the
words in such a range to make sure it is less than the specified distance
of the proximity search. */
struct fts_proximity_t {
	ulint		n_pos;		/*!< number of position set, defines
					a range (min to max) containing all
					matching words */
	pos_vector_t	min_pos;	/*!< the minimum position (in bytes)
					of the range */
	pos_vector_t	max_pos;	/*!< the maximum position (in bytes)
					of the range */
};
201 | |
/** The match positions and tokens to match */
struct fts_phrase_t {
	/** Construct an empty phrase: every pointer member is NULL, the
	match result is false and the distance is 0. Only the page size is
	taken from the table.
	@param[in]	table	table whose page size is recorded */
	fts_phrase_t(const dict_table_t* table)
		:
		found(false),
		match(NULL),
		tokens(NULL),
		distance(0),
		charset(NULL),
		heap(NULL),
		page_size(dict_table_page_size(table)),
		proximity_pos(NULL),
		parser(NULL)
	{
	}

	/** Match result */
	ibool			found;

	/** Positions within text */
	const fts_match_t*	match;

	/** Tokens to match */
	const ib_vector_t*	tokens;

	/** For matching on proximity distance. Can be 0 for exact match */
	ulint			distance;

	/** Phrase match charset */
	CHARSET_INFO*		charset;

	/** Heap for word processing */
	mem_heap_t*		heap;

	/** Row page size */
	const page_size_t	page_size;

	/** Position info for proximity search verification. Records the
	min and max position of words matched */
	fts_proximity_t*	proximity_pos;

	/** FTS plugin parser */
	st_mysql_ftparser*	parser;
};
246 | |
/** Parameter passed to fts phrase match by parser */
struct fts_phrase_param_t {
	fts_phrase_t*	phrase;		/*!< Match phrase instance */
	ulint		token_index;	/*!< Index of token to match next */
	mem_heap_t*	heap;		/*!< Heap for word processing */
};
253 | |
/** For storing the frequency of a word/term in a document. doc_id is the
first member; fts_query_add_doc_freq() searches the tree with a bare
doc_id_t key, which the comparator reads through an fts_doc_freq_t
pointer. */
struct fts_doc_freq_t {
	doc_id_t	doc_id;		/*!< Document id */
	ulint		freq;		/*!< Frequency of a word in a document */
};
259 | |
/** To determine the word frequency per document. */
struct fts_word_freq_t {
	fts_string_t	word;		/*!< Word for which we need the freq,
					it's allocated on the query heap */

	ib_rbt_t*	doc_freqs;	/*!< RB Tree for storing per document
					word frequencies. The elements are
					of type fts_doc_freq_t */
	ib_uint64_t	doc_count;	/*!< Total number of documents that
					contain this word */
	double		idf;		/*!< Inverse document frequency */
};
272 | |
/********************************************************************
Callback function to fetch the rows in an FTS INDEX record.
Forward declaration; the definition is not in this part of the file.
@return always TRUE */
static
ibool
fts_query_index_fetch_nodes(
/*========================*/
	void*		row,		/*!< in: sel_node_t* */
	void*		user_arg);	/*!< in: pointer to ib_vector_t */
282 | |
/********************************************************************
Read and filter nodes.
@return error code of type dberr_t (presumably DB_SUCCESS on success;
confirm against the definition) */
static
dberr_t
fts_query_filter_doc_ids(
/*=====================*/
	fts_query_t*		query,		/*!< in: query instance */
	const fts_string_t*	word,		/*!< in: the current word */
	fts_word_freq_t*	word_freq,	/*!< in/out: word frequency */
	const fts_node_t*	node,		/*!< in: current FTS node */
	void*			data,		/*!< in: doc id ilist */
	ulint			len,		/*!< in: doc id ilist size */
	ibool			calc_doc_count);/*!< in: whether to remember doc
						count */
298 | |
/** Process a (nested) sub-expression: create a new result set to store the
sub-expression result by processing the nodes under the current
sub-expression list, then merge the sub-expression result with that of the
parent expression list.
@param[in,out]	node	current root node
@param[in,out]	visitor	callback function
@param[in,out]	arg	argument for callback
@return DB_SUCCESS if all go well */
static
dberr_t
fts_ast_visit_sub_exp(
	fts_ast_node_t*		node,
	fts_ast_callback	visitor,
	void*			arg);
312 | |
#if 0
/* Disabled declaration, kept out of the build by the surrounding #if 0. */
/*****************************************************************//***
Find a doc_id in a word's ilist.
@return TRUE if found. */
static
ibool
fts_query_find_doc_id(
/*==================*/
	fts_select_t*	select,		/*!< in/out: search the doc id selected,
					update the frequency if found. */
	void*		data,		/*!< in: doc id ilist */
	ulint		len);		/*!< in: doc id ilist size */
#endif
326 | |
/*************************************************************//**
This function implements a simple "blind" query expansion search:
words in documents found in the first search pass will be used as
search arguments to search the documents again, thus "expanding"
the search result set. Both arguments must be non-NULL and the result
must not be ignored (see the attributes below).
@return DB_SUCCESS if success, otherwise the error code */
static
dberr_t
fts_expand_query(
/*=============*/
	dict_index_t*	index,		/*!< in: FTS index to search */
	fts_query_t*	query)		/*!< in: query result, to be freed
					by the client */
	MY_ATTRIBUTE((nonnull, warn_unused_result));
/*************************************************************//**
This function finds documents that contain all words in a
phrase or proximity search. For a proximity search it also verifies
that the words are close enough to each other, as given by the
specified distance.
This function is called for phrase and proximity search.
@return TRUE if documents are found, FALSE if otherwise */
static
ibool
fts_phrase_or_proximity_search(
/*===========================*/
	fts_query_t*	query,		/*!< in/out:  query instance
					query->doc_ids might be instantiated
					with qualified doc IDs */
	ib_vector_t*	tokens);	/*!< in: Tokens contain words */
/*************************************************************//**
This function checks whether words in result documents are close to
each other (within proximity range as specified by "distance").
If "distance" is MAX_ULINT, then it will find all combinations of
positions of matching words and store min and max positions
in the "qualified_pos" for later verification.
@return true if words are close to each other, false if otherwise */
static
bool
fts_proximity_get_positions(
/*========================*/
	fts_match_t**		match,		/*!< in: array of match
						pointers (presumably
						num_match entries; confirm
						against the definition) */
	ulint			num_match,	/*!< in: number of matching
						items */
	ulint			distance,	/*!< in: distance value
						for proximity search */
	fts_proximity_t*	qualified_pos);	/*!< out: the position info
						records ranges containing
						all matching words. */
#if 0
/* Disabled declaration, kept out of the build by the surrounding #if 0. */
/********************************************************************
Get the total number of words in a document.
NOTE(review): the declared return type is ulint although the original
comment described the result as DB_SUCCESS or an error code; confirm
before re-enabling.
@return presumably DB_SUCCESS if all go well, else error code */
static
ulint
fts_query_terms_in_document(
/*========================*/
	fts_query_t*	query,		/*!< in: FTS query state */
	doc_id_t	doc_id,		/*!< in: the document to check */
	ulint*		total);		/*!< out: total words in document */
#endif
387 | |
388 | /******************************************************************** |
389 | Compare two fts_doc_freq_t doc_ids. |
390 | @return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ |
391 | UNIV_INLINE |
392 | int |
393 | fts_freq_doc_id_cmp( |
394 | /*================*/ |
395 | const void* p1, /*!< in: id1 */ |
396 | const void* p2) /*!< in: id2 */ |
397 | { |
398 | const fts_doc_freq_t* fq1 = (const fts_doc_freq_t*) p1; |
399 | const fts_doc_freq_t* fq2 = (const fts_doc_freq_t*) p2; |
400 | |
401 | return((int) (fq1->doc_id - fq2->doc_id)); |
402 | } |
403 | |
404 | #if 0 |
405 | /*******************************************************************//** |
406 | Print the table used for calculating LCS. */ |
407 | static |
408 | void |
409 | fts_print_lcs_table( |
410 | /*================*/ |
411 | const ulint* table, /*!< in: array to print */ |
412 | ulint n_rows, /*!< in: total no. of rows */ |
413 | ulint n_cols) /*!< in: total no. of cols */ |
414 | { |
415 | ulint i; |
416 | |
417 | for (i = 0; i < n_rows; ++i) { |
418 | ulint j; |
419 | |
420 | printf("\n" ); |
421 | |
422 | for (j = 0; j < n_cols; ++j) { |
423 | |
424 | printf("%2lu " , FTS_ELEM(table, n_cols, i, j)); |
425 | } |
426 | } |
427 | } |
428 | |
429 | /******************************************************************** |
430 | Find the longest common subsequence between the query string and |
431 | the document. */ |
432 | static |
433 | ulint |
434 | fts_query_lcs( |
435 | /*==========*/ |
436 | /*!< out: LCS (length) between |
437 | two ilists */ |
438 | const ulint* p1, /*!< in: word positions of query */ |
439 | ulint len_p1, /*!< in: no. of elements in p1 */ |
440 | const ulint* p2, /*!< in: word positions within document */ |
441 | ulint len_p2) /*!< in: no. of elements in p2 */ |
442 | { |
443 | int i; |
444 | ulint len = 0; |
445 | ulint r = len_p1; |
446 | ulint c = len_p2; |
447 | ulint size = (r + 1) * (c + 1) * sizeof(ulint); |
448 | ulint* table = (ulint*) ut_malloc_nokey(size); |
449 | |
450 | /* Traverse the table backwards, from the last row to the first and |
451 | also from the last column to the first. We compute the smaller |
452 | common subsequeces first, then use the caluclated values to determine |
453 | the longest common subsequence. The result will be in TABLE[0][0]. */ |
454 | for (i = r; i >= 0; --i) { |
455 | int j; |
456 | |
457 | for (j = c; j >= 0; --j) { |
458 | |
459 | if (p1[i] == (ulint) -1 || p2[j] == (ulint) -1) { |
460 | |
461 | FTS_ELEM(table, c, i, j) = 0; |
462 | |
463 | } else if (p1[i] == p2[j]) { |
464 | |
465 | FTS_ELEM(table, c, i, j) = FTS_ELEM( |
466 | table, c, i + 1, j + 1) + 1; |
467 | |
468 | } else { |
469 | |
470 | ulint value; |
471 | |
472 | value = ut_max( |
473 | FTS_ELEM(table, c, i + 1, j), |
474 | FTS_ELEM(table, c, i, j + 1)); |
475 | |
476 | FTS_ELEM(table, c, i, j) = value; |
477 | } |
478 | } |
479 | } |
480 | |
481 | len = FTS_ELEM(table, c, 0, 0); |
482 | |
483 | fts_print_lcs_table(table, r, c); |
484 | printf("\nLen=" ULINTPF "\n" , len); |
485 | |
486 | ut_free(table); |
487 | |
488 | return(len); |
489 | } |
490 | #endif |
491 | |
492 | /*******************************************************************//** |
493 | Compare two fts_ranking_t instance on their rank value and doc ids in |
494 | descending order on the rank and ascending order on doc id. |
495 | @return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */ |
496 | static |
497 | int |
498 | fts_query_compare_rank( |
499 | /*===================*/ |
500 | const void* p1, /*!< in: pointer to elem */ |
501 | const void* p2) /*!< in: pointer to elem */ |
502 | { |
503 | const fts_ranking_t* r1 = (const fts_ranking_t*) p1; |
504 | const fts_ranking_t* r2 = (const fts_ranking_t*) p2; |
505 | |
506 | if (r2->rank < r1->rank) { |
507 | return(-1); |
508 | } else if (r2->rank == r1->rank) { |
509 | |
510 | if (r1->doc_id < r2->doc_id) { |
511 | return(1); |
512 | } else if (r1->doc_id > r2->doc_id) { |
513 | return(1); |
514 | } |
515 | |
516 | return(0); |
517 | } |
518 | |
519 | return(1); |
520 | } |
521 | |
522 | /*******************************************************************//** |
523 | Create words in ranking */ |
524 | static |
525 | void |
526 | fts_ranking_words_create( |
527 | /*=====================*/ |
528 | fts_query_t* query, /*!< in: query instance */ |
529 | fts_ranking_t* ranking) /*!< in: ranking instance */ |
530 | { |
531 | ranking->words = static_cast<byte*>( |
532 | mem_heap_zalloc(query->heap, RANKING_WORDS_INIT_LEN)); |
533 | ranking->words_len = RANKING_WORDS_INIT_LEN; |
534 | } |
535 | |
/*
The optimization here is to use a byte array (bitmap) in place of a words
rb tree in fts_ranking_t.

It can save a lot of memory, except in some cases of QUERY EXPANSION.

'word_map' is used as a word dictionary, in which the key is a word and the
value is a number. In 'fts_ranking_words_add', we first check whether the
word is in 'word_map'. If not, we add it to 'word_map' and give it a position
(actually a number). Then we set the bit at that position in the byte array
'words' to '1'.

'word_vector' is a useful backup of 'word_map': it lets us get a word by its
position more quickly than searching by value in 'word_map'. We use
'word_vector' in 'fts_query_calculate_ranking' and 'fts_expand_query'. In
these two functions we need to scan the bitmap 'words' and get a word
whenever a bit is '1'; then we get the word_freq by the word.
*/
553 | |
554 | /*******************************************************************//** |
555 | Add a word into ranking */ |
556 | static |
557 | void |
558 | fts_ranking_words_add( |
559 | /*==================*/ |
560 | fts_query_t* query, /*!< in: query instance */ |
561 | fts_ranking_t* ranking, /*!< in: ranking instance */ |
562 | const fts_string_t* word) /*!< in: term/word to add */ |
563 | { |
564 | ulint pos; |
565 | ulint byte_offset; |
566 | ulint bit_offset; |
567 | ib_rbt_bound_t parent; |
568 | |
569 | /* Note: we suppose the word map and vector are append-only. */ |
570 | ut_ad(query->word_vector->size() == rbt_size(query->word_map)); |
571 | |
572 | /* We use ib_rbt to simulate a map, f_n_char means position. */ |
573 | if (rbt_search(query->word_map, &parent, word) == 0) { |
574 | fts_string_t* result_word; |
575 | |
576 | result_word = rbt_value(fts_string_t, parent.last); |
577 | pos = result_word->f_n_char; |
578 | ut_ad(pos < rbt_size(query->word_map)); |
579 | } else { |
580 | /* Add the word to map. */ |
581 | fts_string_t new_word; |
582 | |
583 | pos = rbt_size(query->word_map); |
584 | |
585 | fts_string_dup(&new_word, word, query->heap); |
586 | new_word.f_n_char = pos; |
587 | |
588 | rbt_add_node(query->word_map, &parent, &new_word); |
589 | ut_ad(rbt_validate(query->word_map)); |
590 | query->word_vector->push_back(new_word); |
591 | } |
592 | |
593 | /* Check words len */ |
594 | byte_offset = pos / CHAR_BIT; |
595 | if (byte_offset >= ranking->words_len) { |
596 | byte* words = ranking->words; |
597 | ulint words_len = ranking->words_len; |
598 | |
599 | while (byte_offset >= words_len) { |
600 | words_len *= 2; |
601 | } |
602 | |
603 | ranking->words = static_cast<byte*>( |
604 | mem_heap_zalloc(query->heap, words_len)); |
605 | ut_memcpy(ranking->words, words, ranking->words_len); |
606 | ranking->words_len = words_len; |
607 | } |
608 | |
609 | /* Set ranking words */ |
610 | ut_ad(byte_offset < ranking->words_len); |
611 | bit_offset = pos % CHAR_BIT; |
612 | ranking->words[byte_offset] |= 1 << bit_offset; |
613 | } |
614 | |
615 | /*******************************************************************//** |
616 | Get a word from a ranking |
617 | @return true if it's successful */ |
618 | static |
619 | bool |
620 | fts_ranking_words_get_next( |
621 | /*=======================*/ |
622 | const fts_query_t* query, /*!< in: query instance */ |
623 | fts_ranking_t* ranking,/*!< in: ranking instance */ |
624 | ulint* pos, /*!< in/out: word start pos */ |
625 | fts_string_t* word) /*!< in/out: term/word to add */ |
626 | { |
627 | bool ret = false; |
628 | ulint max_pos = ranking->words_len * CHAR_BIT; |
629 | |
630 | /* Search for next word */ |
631 | while (*pos < max_pos) { |
632 | ulint byte_offset = *pos / CHAR_BIT; |
633 | ulint bit_offset = *pos % CHAR_BIT; |
634 | |
635 | if (ranking->words[byte_offset] & (1 << bit_offset)) { |
636 | ret = true; |
637 | break; |
638 | } |
639 | |
640 | *pos += 1; |
641 | }; |
642 | |
643 | /* Get next word from word vector */ |
644 | if (ret) { |
645 | ut_ad(*pos < query->word_vector->size()); |
646 | *word = query->word_vector->at((size_t)*pos); |
647 | *pos += 1; |
648 | } |
649 | |
650 | return ret; |
651 | } |
652 | |
653 | /*******************************************************************//** |
654 | Add a word if it doesn't exist, to the term freq RB tree. We store |
655 | a pointer to the word that is passed in as the argument. |
656 | @return pointer to word */ |
657 | static |
658 | fts_word_freq_t* |
659 | fts_query_add_word_freq( |
660 | /*====================*/ |
661 | fts_query_t* query, /*!< in: query instance */ |
662 | const fts_string_t* word) /*!< in: term/word to add */ |
663 | { |
664 | ib_rbt_bound_t parent; |
665 | |
666 | /* Lookup the word in our rb tree and add if it doesn't exist. */ |
667 | if (rbt_search(query->word_freqs, &parent, word) != 0) { |
668 | fts_word_freq_t word_freq; |
669 | |
670 | memset(&word_freq, 0, sizeof(word_freq)); |
671 | |
672 | fts_string_dup(&word_freq.word, word, query->heap); |
673 | |
674 | word_freq.doc_count = 0; |
675 | |
676 | word_freq.doc_freqs = rbt_create( |
677 | sizeof(fts_doc_freq_t), fts_freq_doc_id_cmp); |
678 | |
679 | parent.last = rbt_add_node( |
680 | query->word_freqs, &parent, &word_freq); |
681 | |
682 | query->total_size += word->f_len |
683 | + SIZEOF_RBT_CREATE |
684 | + SIZEOF_RBT_NODE_ADD |
685 | + sizeof(fts_word_freq_t); |
686 | } |
687 | |
688 | return(rbt_value(fts_word_freq_t, parent.last)); |
689 | } |
690 | |
691 | /*******************************************************************//** |
692 | Add a doc id if it doesn't exist, to the doc freq RB tree. |
693 | @return pointer to word */ |
694 | static |
695 | fts_doc_freq_t* |
696 | fts_query_add_doc_freq( |
697 | /*===================*/ |
698 | fts_query_t* query, /*!< in: query instance */ |
699 | ib_rbt_t* doc_freqs, /*!< in: rb tree of fts_doc_freq_t */ |
700 | doc_id_t doc_id) /*!< in: doc id to add */ |
701 | { |
702 | ib_rbt_bound_t parent; |
703 | |
704 | /* Lookup the doc id in our rb tree and add if it doesn't exist. */ |
705 | if (rbt_search(doc_freqs, &parent, &doc_id) != 0) { |
706 | fts_doc_freq_t doc_freq; |
707 | |
708 | memset(&doc_freq, 0, sizeof(doc_freq)); |
709 | |
710 | doc_freq.freq = 0; |
711 | doc_freq.doc_id = doc_id; |
712 | |
713 | parent.last = rbt_add_node(doc_freqs, &parent, &doc_freq); |
714 | |
715 | query->total_size += SIZEOF_RBT_NODE_ADD |
716 | + sizeof(fts_doc_freq_t); |
717 | } |
718 | |
719 | return(rbt_value(fts_doc_freq_t, parent.last)); |
720 | } |
721 | |
722 | /*******************************************************************//** |
723 | Add the doc id to the query set only if it's not in the |
724 | deleted array. */ |
725 | static |
726 | void |
727 | fts_query_union_doc_id( |
728 | /*===================*/ |
729 | fts_query_t* query, /*!< in: query instance */ |
730 | doc_id_t doc_id, /*!< in: the doc id to add */ |
731 | fts_rank_t rank) /*!< in: if non-zero, it is the |
732 | rank associated with the doc_id */ |
733 | { |
734 | ib_rbt_bound_t parent; |
735 | ulint size = ib_vector_size(query->deleted->doc_ids); |
736 | fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data; |
737 | |
738 | /* Check if the doc id is deleted and it's not already in our set. */ |
739 | if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0 |
740 | && rbt_search(query->doc_ids, &parent, &doc_id) != 0) { |
741 | |
742 | fts_ranking_t ranking; |
743 | |
744 | ranking.rank = rank; |
745 | ranking.doc_id = doc_id; |
746 | fts_ranking_words_create(query, &ranking); |
747 | |
748 | rbt_add_node(query->doc_ids, &parent, &ranking); |
749 | |
750 | query->total_size += SIZEOF_RBT_NODE_ADD |
751 | + sizeof(fts_ranking_t) + RANKING_WORDS_INIT_LEN; |
752 | } |
753 | } |
754 | |
755 | /*******************************************************************//** |
756 | Remove the doc id from the query set only if it's not in the |
757 | deleted set. */ |
758 | static |
759 | void |
760 | fts_query_remove_doc_id( |
761 | /*====================*/ |
762 | fts_query_t* query, /*!< in: query instance */ |
763 | doc_id_t doc_id) /*!< in: the doc id to add */ |
764 | { |
765 | ib_rbt_bound_t parent; |
766 | ulint size = ib_vector_size(query->deleted->doc_ids); |
767 | fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data; |
768 | |
769 | /* Check if the doc id is deleted and it's in our set. */ |
770 | if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0 |
771 | && rbt_search(query->doc_ids, &parent, &doc_id) == 0) { |
772 | ut_free(rbt_remove_node(query->doc_ids, parent.last)); |
773 | |
774 | ut_ad(query->total_size >= |
775 | SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t)); |
776 | query->total_size -= SIZEOF_RBT_NODE_ADD |
777 | + sizeof(fts_ranking_t); |
778 | } |
779 | } |
780 | |
781 | /*******************************************************************//** |
782 | Find the doc id in the query set but not in the deleted set, artificialy |
783 | downgrade or upgrade its ranking by a value and make/initialize its ranking |
784 | under or above its normal range 0 to 1. This is used for Boolean Search |
785 | operator such as Negation operator, which makes word's contribution to the |
786 | row's relevance to be negative */ |
787 | static |
788 | void |
789 | fts_query_change_ranking( |
790 | /*====================*/ |
791 | fts_query_t* query, /*!< in: query instance */ |
792 | doc_id_t doc_id, /*!< in: the doc id to add */ |
793 | ibool downgrade) /*!< in: Whether to downgrade ranking */ |
794 | { |
795 | ib_rbt_bound_t parent; |
796 | ulint size = ib_vector_size(query->deleted->doc_ids); |
797 | fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data; |
798 | |
799 | /* Check if the doc id is deleted and it's in our set. */ |
800 | if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0 |
801 | && rbt_search(query->doc_ids, &parent, &doc_id) == 0) { |
802 | |
803 | fts_ranking_t* ranking; |
804 | |
805 | ranking = rbt_value(fts_ranking_t, parent.last); |
806 | |
807 | ranking->rank += downgrade ? RANK_DOWNGRADE : RANK_UPGRADE; |
808 | |
809 | /* Allow at most 2 adjustment by RANK_DOWNGRADE (-0.5) |
810 | and RANK_UPGRADE (0.5) */ |
811 | if (ranking->rank >= 1.0F) { |
812 | ranking->rank = 1.0F; |
813 | } else if (ranking->rank <= -1.0F) { |
814 | ranking->rank = -1.0F; |
815 | } |
816 | } |
817 | } |
818 | |
819 | /*******************************************************************//** |
820 | Check the doc id in the query set only if it's not in the |
821 | deleted array. The doc ids that were found are stored in |
822 | another rb tree (fts_query_t::intersect). */ |
823 | static |
824 | void |
825 | fts_query_intersect_doc_id( |
826 | /*=======================*/ |
827 | fts_query_t* query, /*!< in: query instance */ |
828 | doc_id_t doc_id, /*!< in: the doc id to add */ |
829 | fts_rank_t rank) /*!< in: if non-zero, it is the |
830 | rank associated with the doc_id */ |
831 | { |
832 | ib_rbt_bound_t parent; |
833 | ulint size = ib_vector_size(query->deleted->doc_ids); |
834 | fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data; |
835 | fts_ranking_t* ranking= NULL; |
836 | |
837 | /* There are three types of intersect: |
838 | 1. '+a': doc_ids is empty, add doc into intersect if it matches 'a'. |
839 | 2. 'a +b': docs match 'a' is in doc_ids, add doc into intersect |
840 | if it matches 'b'. if the doc is also in doc_ids, then change the |
841 | doc's rank, and add 'a' in doc's words. |
842 | 3. '+a +b': docs matching '+a' is in doc_ids, add doc into intsersect |
843 | if it matches 'b' and it's in doc_ids.(multi_exist = true). */ |
844 | |
845 | /* Check if the doc id is deleted and it's in our set */ |
846 | if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0) { |
847 | fts_ranking_t new_ranking; |
848 | |
849 | if (rbt_search(query->doc_ids, &parent, &doc_id) != 0) { |
850 | if (query->multi_exist) { |
851 | return; |
852 | } else { |
853 | new_ranking.words = NULL; |
854 | } |
855 | } else { |
856 | ranking = rbt_value(fts_ranking_t, parent.last); |
857 | |
858 | /* We've just checked the doc id before */ |
859 | if (ranking->words == NULL) { |
860 | ut_ad(rbt_search(query->intersection, &parent, |
861 | ranking) == 0); |
862 | return; |
863 | } |
864 | |
865 | /* Merge rank */ |
866 | rank += ranking->rank; |
867 | if (rank >= 1.0F) { |
868 | rank = 1.0F; |
869 | } else if (rank <= -1.0F) { |
870 | rank = -1.0F; |
871 | } |
872 | |
873 | /* Take words */ |
874 | new_ranking.words = ranking->words; |
875 | new_ranking.words_len = ranking->words_len; |
876 | } |
877 | |
878 | new_ranking.rank = rank; |
879 | new_ranking.doc_id = doc_id; |
880 | |
881 | if (rbt_search(query->intersection, &parent, |
882 | &new_ranking) != 0) { |
883 | if (new_ranking.words == NULL) { |
884 | fts_ranking_words_create(query, &new_ranking); |
885 | |
886 | query->total_size += RANKING_WORDS_INIT_LEN; |
887 | } else { |
888 | /* Note that the intersection has taken |
889 | ownership of the ranking data. */ |
890 | ranking->words = NULL; |
891 | } |
892 | |
893 | rbt_add_node(query->intersection, |
894 | &parent, &new_ranking); |
895 | |
896 | query->total_size += SIZEOF_RBT_NODE_ADD |
897 | + sizeof(fts_ranking_t); |
898 | } |
899 | } |
900 | } |
901 | |
902 | /*******************************************************************//** |
903 | Free the document ranking rb tree. */ |
904 | static |
905 | void |
906 | fts_query_free_doc_ids( |
907 | /*===================*/ |
908 | fts_query_t* query, /*!< in: query instance */ |
909 | ib_rbt_t* doc_ids) /*!< in: rb tree to free */ |
910 | { |
911 | const ib_rbt_node_t* node; |
912 | |
913 | for (node = rbt_first(doc_ids); node; node = rbt_first(doc_ids)) { |
914 | |
915 | fts_ranking_t* ranking; |
916 | |
917 | ranking = rbt_value(fts_ranking_t, node); |
918 | |
919 | if (ranking->words) { |
920 | ranking->words = NULL; |
921 | } |
922 | |
923 | ut_free(rbt_remove_node(doc_ids, node)); |
924 | |
925 | ut_ad(query->total_size >= |
926 | SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t)); |
927 | query->total_size -= SIZEOF_RBT_NODE_ADD |
928 | + sizeof(fts_ranking_t); |
929 | } |
930 | |
931 | rbt_free(doc_ids); |
932 | |
933 | ut_ad(query->total_size >= SIZEOF_RBT_CREATE); |
934 | query->total_size -= SIZEOF_RBT_CREATE; |
935 | } |
936 | |
937 | /*******************************************************************//** |
938 | Add the word to the documents "list" of matching words from |
939 | the query. We make a copy of the word from the query heap. */ |
940 | static |
941 | void |
942 | fts_query_add_word_to_document( |
943 | /*===========================*/ |
944 | fts_query_t* query, /*!< in: query to update */ |
945 | doc_id_t doc_id, /*!< in: the document to update */ |
946 | const fts_string_t* word) /*!< in: the token to add */ |
947 | { |
948 | ib_rbt_bound_t parent; |
949 | fts_ranking_t* ranking = NULL; |
950 | |
951 | if (query->flags == FTS_OPT_RANKING) { |
952 | return; |
953 | } |
954 | |
955 | /* First we search the intersection RB tree as it could have |
956 | taken ownership of the words rb tree instance. */ |
957 | if (query->intersection |
958 | && rbt_search(query->intersection, &parent, &doc_id) == 0) { |
959 | |
960 | ranking = rbt_value(fts_ranking_t, parent.last); |
961 | } |
962 | |
963 | if (ranking == NULL |
964 | && rbt_search(query->doc_ids, &parent, &doc_id) == 0) { |
965 | |
966 | ranking = rbt_value(fts_ranking_t, parent.last); |
967 | } |
968 | |
969 | if (ranking != NULL) { |
970 | fts_ranking_words_add(query, ranking, word); |
971 | } |
972 | } |
973 | |
974 | /*******************************************************************//** |
975 | Check the node ilist. */ |
976 | static |
977 | void |
978 | fts_query_check_node( |
979 | /*=================*/ |
980 | fts_query_t* query, /*!< in: query to update */ |
981 | const fts_string_t* token, /*!< in: the token to search */ |
982 | const fts_node_t* node) /*!< in: node to check */ |
983 | { |
984 | /* Skip nodes whose doc ids are out range. */ |
985 | if (query->oper == FTS_EXIST |
986 | && ((query->upper_doc_id > 0 |
987 | && node->first_doc_id > query->upper_doc_id) |
988 | || (query->lower_doc_id > 0 |
989 | && node->last_doc_id < query->lower_doc_id))) { |
990 | |
991 | /* Ignore */ |
992 | |
993 | } else { |
994 | int ret; |
995 | ib_rbt_bound_t parent; |
996 | ulint ilist_size = node->ilist_size; |
997 | fts_word_freq_t*word_freqs; |
998 | |
999 | /* The word must exist. */ |
1000 | ret = rbt_search(query->word_freqs, &parent, token); |
1001 | ut_a(ret == 0); |
1002 | |
1003 | word_freqs = rbt_value(fts_word_freq_t, parent.last); |
1004 | |
1005 | query->error = fts_query_filter_doc_ids( |
1006 | query, token, word_freqs, node, |
1007 | node->ilist, ilist_size, TRUE); |
1008 | } |
1009 | } |
1010 | |
1011 | /*****************************************************************//** |
1012 | Search index cache for word with wildcard match. |
1013 | @return number of words matched */ |
1014 | static |
1015 | ulint |
1016 | fts_cache_find_wildcard( |
1017 | /*====================*/ |
1018 | fts_query_t* query, /*!< in: query instance */ |
1019 | const fts_index_cache_t*index_cache, /*!< in: cache to search */ |
1020 | const fts_string_t* token) /*!< in: token to search */ |
1021 | { |
1022 | ib_rbt_bound_t parent; |
1023 | const ib_vector_t* nodes = NULL; |
1024 | fts_string_t srch_text; |
1025 | byte term[FTS_MAX_WORD_LEN + 1]; |
1026 | ulint num_word = 0; |
1027 | |
1028 | srch_text.f_len = (token->f_str[token->f_len - 1] == '%') |
1029 | ? token->f_len - 1 |
1030 | : token->f_len; |
1031 | |
1032 | strncpy((char*) term, (char*) token->f_str, srch_text.f_len); |
1033 | term[srch_text.f_len] = '\0'; |
1034 | srch_text.f_str = term; |
1035 | |
1036 | /* Lookup the word in the rb tree */ |
1037 | if (rbt_search_cmp(index_cache->words, &parent, &srch_text, NULL, |
1038 | innobase_fts_text_cmp_prefix) == 0) { |
1039 | const fts_tokenizer_word_t* word; |
1040 | ulint i; |
1041 | const ib_rbt_node_t* cur_node; |
1042 | ibool forward = FALSE; |
1043 | |
1044 | word = rbt_value(fts_tokenizer_word_t, parent.last); |
1045 | cur_node = parent.last; |
1046 | |
1047 | while (innobase_fts_text_cmp_prefix( |
1048 | index_cache->charset, &srch_text, &word->text) == 0) { |
1049 | |
1050 | nodes = word->nodes; |
1051 | |
1052 | for (i = 0; nodes && i < ib_vector_size(nodes); ++i) { |
1053 | int ret; |
1054 | const fts_node_t* node; |
1055 | ib_rbt_bound_t freq_parent; |
1056 | fts_word_freq_t* word_freqs; |
1057 | |
1058 | node = static_cast<const fts_node_t*>( |
1059 | ib_vector_get_const(nodes, i)); |
1060 | |
1061 | ret = rbt_search(query->word_freqs, |
1062 | &freq_parent, |
1063 | &srch_text); |
1064 | |
1065 | ut_a(ret == 0); |
1066 | |
1067 | word_freqs = rbt_value( |
1068 | fts_word_freq_t, |
1069 | freq_parent.last); |
1070 | |
1071 | query->error = fts_query_filter_doc_ids( |
1072 | query, &srch_text, |
1073 | word_freqs, node, |
1074 | node->ilist, node->ilist_size, TRUE); |
1075 | |
1076 | if (query->error != DB_SUCCESS) { |
1077 | return(0); |
1078 | } |
1079 | } |
1080 | |
1081 | num_word++; |
1082 | |
1083 | if (!forward) { |
1084 | cur_node = rbt_prev( |
1085 | index_cache->words, cur_node); |
1086 | } else { |
1087 | cont_search: |
1088 | cur_node = rbt_next( |
1089 | index_cache->words, cur_node); |
1090 | } |
1091 | |
1092 | if (!cur_node) { |
1093 | break; |
1094 | } |
1095 | |
1096 | word = rbt_value(fts_tokenizer_word_t, cur_node); |
1097 | } |
1098 | |
1099 | if (!forward) { |
1100 | forward = TRUE; |
1101 | cur_node = parent.last; |
1102 | goto cont_search; |
1103 | } |
1104 | } |
1105 | |
1106 | return(num_word); |
1107 | } |
1108 | |
1109 | /*****************************************************************//** |
1110 | Set difference. |
1111 | @return DB_SUCCESS if all go well */ |
1112 | static MY_ATTRIBUTE((nonnull, warn_unused_result)) |
1113 | dberr_t |
1114 | fts_query_difference( |
1115 | /*=================*/ |
1116 | fts_query_t* query, /*!< in: query instance */ |
1117 | const fts_string_t* token) /*!< in: token to search */ |
1118 | { |
1119 | ulint n_doc_ids= 0; |
1120 | trx_t* trx = query->trx; |
1121 | dict_table_t* table = query->index->table; |
1122 | |
1123 | ut_a(query->oper == FTS_IGNORE); |
1124 | |
1125 | #ifdef FTS_INTERNAL_DIAG_PRINT |
1126 | { |
1127 | ib::info out; |
1128 | out << "DIFFERENCE: Searching: '" ; |
1129 | out.write(token->f_str, token->f_len); |
1130 | out << "'" ; |
1131 | } |
1132 | #endif |
1133 | |
1134 | if (query->doc_ids) { |
1135 | n_doc_ids = rbt_size(query->doc_ids); |
1136 | } |
1137 | |
1138 | /* There is nothing we can substract from an empty set. */ |
1139 | if (query->doc_ids && !rbt_empty(query->doc_ids)) { |
1140 | ulint i; |
1141 | fts_fetch_t fetch; |
1142 | const ib_vector_t* nodes; |
1143 | const fts_index_cache_t*index_cache; |
1144 | que_t* graph = NULL; |
1145 | fts_cache_t* cache = table->fts->cache; |
1146 | dberr_t error; |
1147 | |
1148 | rw_lock_x_lock(&cache->lock); |
1149 | |
1150 | index_cache = fts_find_index_cache(cache, query->index); |
1151 | |
1152 | /* Must find the index cache */ |
1153 | ut_a(index_cache != NULL); |
1154 | |
1155 | /* Search the cache for a matching word first. */ |
1156 | if (query->cur_node->term.wildcard |
1157 | && query->flags != FTS_PROXIMITY |
1158 | && query->flags != FTS_PHRASE) { |
1159 | fts_cache_find_wildcard(query, index_cache, token); |
1160 | } else { |
1161 | nodes = fts_cache_find_word(index_cache, token); |
1162 | |
1163 | for (i = 0; nodes && i < ib_vector_size(nodes) |
1164 | && query->error == DB_SUCCESS; ++i) { |
1165 | const fts_node_t* node; |
1166 | |
1167 | node = static_cast<const fts_node_t*>( |
1168 | ib_vector_get_const(nodes, i)); |
1169 | |
1170 | fts_query_check_node(query, token, node); |
1171 | } |
1172 | } |
1173 | |
1174 | rw_lock_x_unlock(&cache->lock); |
1175 | |
1176 | /* error is passed by 'query->error' */ |
1177 | if (query->error != DB_SUCCESS) { |
1178 | ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT); |
1179 | return(query->error); |
1180 | } |
1181 | |
1182 | /* Setup the callback args for filtering and |
1183 | consolidating the ilist. */ |
1184 | fetch.read_arg = query; |
1185 | fetch.read_record = fts_query_index_fetch_nodes; |
1186 | |
1187 | error = fts_index_fetch_nodes( |
1188 | trx, &graph, &query->fts_index_table, token, &fetch); |
1189 | |
1190 | /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */ |
1191 | ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS)); |
1192 | if (error != DB_SUCCESS) { |
1193 | query->error = error; |
1194 | } |
1195 | |
1196 | fts_que_graph_free(graph); |
1197 | } |
1198 | |
1199 | /* The size can't increase. */ |
1200 | ut_a(rbt_size(query->doc_ids) <= n_doc_ids); |
1201 | |
1202 | return(query->error); |
1203 | } |
1204 | |
1205 | /*****************************************************************//** |
1206 | Intersect the token doc ids with the current set. |
1207 | @return DB_SUCCESS if all go well */ |
1208 | static MY_ATTRIBUTE((nonnull, warn_unused_result)) |
1209 | dberr_t |
1210 | fts_query_intersect( |
1211 | /*================*/ |
1212 | fts_query_t* query, /*!< in: query instance */ |
1213 | const fts_string_t* token) /*!< in: the token to search */ |
1214 | { |
1215 | trx_t* trx = query->trx; |
1216 | dict_table_t* table = query->index->table; |
1217 | |
1218 | ut_a(query->oper == FTS_EXIST); |
1219 | |
1220 | #ifdef FTS_INTERNAL_DIAG_PRINT |
1221 | { |
1222 | ib::info out; |
1223 | out << "INTERSECT: Searching: '" ; |
1224 | out.write(token->f_str, token->f_len); |
1225 | out << "'" ; |
1226 | } |
1227 | #endif |
1228 | |
1229 | /* If the words set is not empty and multi exist is true, |
1230 | we know the intersection set is empty in advance. */ |
1231 | if (!(rbt_empty(query->doc_ids) && query->multi_exist)) { |
1232 | ulint n_doc_ids = 0; |
1233 | ulint i; |
1234 | fts_fetch_t fetch; |
1235 | const ib_vector_t* nodes; |
1236 | const fts_index_cache_t*index_cache; |
1237 | que_t* graph = NULL; |
1238 | fts_cache_t* cache = table->fts->cache; |
1239 | dberr_t error; |
1240 | |
1241 | ut_a(!query->intersection); |
1242 | |
1243 | n_doc_ids = rbt_size(query->doc_ids); |
1244 | |
1245 | /* Create the rb tree that will hold the doc ids of |
1246 | the intersection. */ |
1247 | query->intersection = rbt_create( |
1248 | sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); |
1249 | |
1250 | query->total_size += SIZEOF_RBT_CREATE; |
1251 | |
1252 | /* This is to avoid decompressing the ilist if the |
1253 | node's ilist doc ids are out of range. */ |
1254 | if (!rbt_empty(query->doc_ids) && query->multi_exist) { |
1255 | const ib_rbt_node_t* node; |
1256 | doc_id_t* doc_id; |
1257 | |
1258 | node = rbt_first(query->doc_ids); |
1259 | doc_id = rbt_value(doc_id_t, node); |
1260 | query->lower_doc_id = *doc_id; |
1261 | |
1262 | node = rbt_last(query->doc_ids); |
1263 | doc_id = rbt_value(doc_id_t, node); |
1264 | query->upper_doc_id = *doc_id; |
1265 | |
1266 | } else { |
1267 | query->lower_doc_id = 0; |
1268 | query->upper_doc_id = 0; |
1269 | } |
1270 | |
1271 | /* Search the cache for a matching word first. */ |
1272 | |
1273 | rw_lock_x_lock(&cache->lock); |
1274 | |
1275 | /* Search for the index specific cache. */ |
1276 | index_cache = fts_find_index_cache(cache, query->index); |
1277 | |
1278 | /* Must find the index cache. */ |
1279 | ut_a(index_cache != NULL); |
1280 | |
1281 | if (query->cur_node->term.wildcard) { |
1282 | /* Wildcard search the index cache */ |
1283 | fts_cache_find_wildcard(query, index_cache, token); |
1284 | } else { |
1285 | nodes = fts_cache_find_word(index_cache, token); |
1286 | |
1287 | for (i = 0; nodes && i < ib_vector_size(nodes) |
1288 | && query->error == DB_SUCCESS; ++i) { |
1289 | const fts_node_t* node; |
1290 | |
1291 | node = static_cast<const fts_node_t*>( |
1292 | ib_vector_get_const(nodes, i)); |
1293 | |
1294 | fts_query_check_node(query, token, node); |
1295 | } |
1296 | } |
1297 | |
1298 | rw_lock_x_unlock(&cache->lock); |
1299 | |
1300 | /* error is passed by 'query->error' */ |
1301 | if (query->error != DB_SUCCESS) { |
1302 | ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT); |
1303 | return(query->error); |
1304 | } |
1305 | |
1306 | /* Setup the callback args for filtering and |
1307 | consolidating the ilist. */ |
1308 | fetch.read_arg = query; |
1309 | fetch.read_record = fts_query_index_fetch_nodes; |
1310 | |
1311 | error = fts_index_fetch_nodes( |
1312 | trx, &graph, &query->fts_index_table, token, &fetch); |
1313 | |
1314 | /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */ |
1315 | ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS)); |
1316 | if (error != DB_SUCCESS) { |
1317 | query->error = error; |
1318 | } |
1319 | |
1320 | fts_que_graph_free(graph); |
1321 | |
1322 | if (query->error == DB_SUCCESS) { |
1323 | /* Make the intesection (rb tree) the current doc id |
1324 | set and free the old set. */ |
1325 | fts_query_free_doc_ids(query, query->doc_ids); |
1326 | query->doc_ids = query->intersection; |
1327 | query->intersection = NULL; |
1328 | |
1329 | ut_a(!query->multi_exist || (query->multi_exist |
1330 | && rbt_size(query->doc_ids) <= n_doc_ids)); |
1331 | } |
1332 | } |
1333 | |
1334 | return(query->error); |
1335 | } |
1336 | |
1337 | /*****************************************************************//** |
1338 | Query index cache. |
1339 | @return DB_SUCCESS if all go well */ |
1340 | static |
1341 | dberr_t |
1342 | fts_query_cache( |
1343 | /*============*/ |
1344 | fts_query_t* query, /*!< in/out: query instance */ |
1345 | const fts_string_t* token) /*!< in: token to search */ |
1346 | { |
1347 | const fts_index_cache_t*index_cache; |
1348 | dict_table_t* table = query->index->table; |
1349 | fts_cache_t* cache = table->fts->cache; |
1350 | |
1351 | /* Search the cache for a matching word first. */ |
1352 | rw_lock_x_lock(&cache->lock); |
1353 | |
1354 | /* Search for the index specific cache. */ |
1355 | index_cache = fts_find_index_cache(cache, query->index); |
1356 | |
1357 | /* Must find the index cache. */ |
1358 | ut_a(index_cache != NULL); |
1359 | |
1360 | if (query->cur_node->term.wildcard |
1361 | && query->flags != FTS_PROXIMITY |
1362 | && query->flags != FTS_PHRASE) { |
1363 | /* Wildcard search the index cache */ |
1364 | fts_cache_find_wildcard(query, index_cache, token); |
1365 | } else { |
1366 | const ib_vector_t* nodes; |
1367 | ulint i; |
1368 | |
1369 | nodes = fts_cache_find_word(index_cache, token); |
1370 | |
1371 | for (i = 0; nodes && i < ib_vector_size(nodes) |
1372 | && query->error == DB_SUCCESS; ++i) { |
1373 | const fts_node_t* node; |
1374 | |
1375 | node = static_cast<const fts_node_t*>( |
1376 | ib_vector_get_const(nodes, i)); |
1377 | |
1378 | fts_query_check_node(query, token, node); |
1379 | } |
1380 | } |
1381 | |
1382 | rw_lock_x_unlock(&cache->lock); |
1383 | |
1384 | return(query->error); |
1385 | } |
1386 | |
1387 | /*****************************************************************//** |
1388 | Set union. |
1389 | @return DB_SUCCESS if all go well */ |
1390 | static MY_ATTRIBUTE((nonnull, warn_unused_result)) |
1391 | dberr_t |
1392 | fts_query_union( |
1393 | /*============*/ |
1394 | fts_query_t* query, /*!< in: query instance */ |
1395 | fts_string_t* token) /*!< in: token to search */ |
1396 | { |
1397 | fts_fetch_t fetch; |
1398 | ulint n_doc_ids = 0; |
1399 | trx_t* trx = query->trx; |
1400 | que_t* graph = NULL; |
1401 | dberr_t error; |
1402 | |
1403 | ut_a(query->oper == FTS_NONE || query->oper == FTS_DECR_RATING || |
1404 | query->oper == FTS_NEGATE || query->oper == FTS_INCR_RATING); |
1405 | |
1406 | #ifdef FTS_INTERNAL_DIAG_PRINT |
1407 | { |
1408 | ib::info out; |
1409 | out << "UNION: Searching: '" ; |
1410 | out.write(token->f_str, token->f_len); |
1411 | out << "'" ; |
1412 | } |
1413 | #endif |
1414 | |
1415 | if (query->doc_ids) { |
1416 | n_doc_ids = rbt_size(query->doc_ids); |
1417 | } |
1418 | |
1419 | if (token->f_len == 0) { |
1420 | return(query->error); |
1421 | } |
1422 | |
1423 | fts_query_cache(query, token); |
1424 | |
1425 | /* Setup the callback args for filtering and |
1426 | consolidating the ilist. */ |
1427 | fetch.read_arg = query; |
1428 | fetch.read_record = fts_query_index_fetch_nodes; |
1429 | |
1430 | /* Read the nodes from disk. */ |
1431 | error = fts_index_fetch_nodes( |
1432 | trx, &graph, &query->fts_index_table, token, &fetch); |
1433 | |
1434 | /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */ |
1435 | ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS)); |
1436 | if (error != DB_SUCCESS) { |
1437 | query->error = error; |
1438 | } |
1439 | |
1440 | fts_que_graph_free(graph); |
1441 | |
1442 | if (query->error == DB_SUCCESS) { |
1443 | |
1444 | /* The size can't decrease. */ |
1445 | ut_a(rbt_size(query->doc_ids) >= n_doc_ids); |
1446 | |
1447 | /* Calulate the number of doc ids that were added to |
1448 | the current doc id set. */ |
1449 | if (query->doc_ids) { |
1450 | n_doc_ids = rbt_size(query->doc_ids) - n_doc_ids; |
1451 | } |
1452 | } |
1453 | |
1454 | return(query->error); |
1455 | } |
1456 | |
1457 | /*****************************************************************//** |
1458 | Depending upon the current query operator process the doc id. |
1459 | return DB_SUCCESS if all go well |
1460 | or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */ |
1461 | static |
1462 | dberr_t |
1463 | fts_query_process_doc_id( |
1464 | /*=====================*/ |
1465 | fts_query_t* query, /*!< in: query instance */ |
1466 | doc_id_t doc_id, /*!< in: doc id to process */ |
1467 | fts_rank_t rank) /*!< in: if non-zero, it is the |
1468 | rank associated with the doc_id */ |
1469 | { |
1470 | if (query->flags == FTS_OPT_RANKING) { |
1471 | return(DB_SUCCESS); |
1472 | } |
1473 | |
1474 | switch (query->oper) { |
1475 | case FTS_NONE: |
1476 | fts_query_union_doc_id(query, doc_id, rank); |
1477 | break; |
1478 | |
1479 | case FTS_EXIST: |
1480 | fts_query_intersect_doc_id(query, doc_id, rank); |
1481 | break; |
1482 | |
1483 | case FTS_IGNORE: |
1484 | fts_query_remove_doc_id(query, doc_id); |
1485 | break; |
1486 | |
1487 | case FTS_NEGATE: |
1488 | fts_query_change_ranking(query, doc_id, TRUE); |
1489 | break; |
1490 | |
1491 | case FTS_DECR_RATING: |
1492 | fts_query_union_doc_id(query, doc_id, rank); |
1493 | fts_query_change_ranking(query, doc_id, TRUE); |
1494 | break; |
1495 | |
1496 | case FTS_INCR_RATING: |
1497 | fts_query_union_doc_id(query, doc_id, rank); |
1498 | fts_query_change_ranking(query, doc_id, FALSE); |
1499 | break; |
1500 | |
1501 | default: |
1502 | ut_error; |
1503 | } |
1504 | |
1505 | if (query->total_size > fts_result_cache_limit) { |
1506 | return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT); |
1507 | } else { |
1508 | return(DB_SUCCESS); |
1509 | } |
1510 | } |
1511 | |
1512 | /*****************************************************************//** |
1513 | Merge two result sets. */ |
1514 | static |
1515 | dberr_t |
1516 | fts_merge_doc_ids( |
1517 | /*==============*/ |
1518 | fts_query_t* query, /*!< in,out: query instance */ |
1519 | const ib_rbt_t* doc_ids) /*!< in: result set to merge */ |
1520 | { |
1521 | const ib_rbt_node_t* node; |
1522 | |
1523 | DBUG_ENTER("fts_merge_doc_ids" ); |
1524 | |
1525 | ut_a(!query->intersection); |
1526 | |
1527 | /* To process FTS_EXIST operation (intersection), we need |
1528 | to create a new result set for fts_query_intersect(). */ |
1529 | if (query->oper == FTS_EXIST) { |
1530 | |
1531 | query->intersection = rbt_create( |
1532 | sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); |
1533 | |
1534 | query->total_size += SIZEOF_RBT_CREATE; |
1535 | } |
1536 | |
1537 | /* Merge the elements to the result set. */ |
1538 | for (node = rbt_first(doc_ids); node; node = rbt_next(doc_ids, node)) { |
1539 | fts_ranking_t* ranking; |
1540 | ulint pos = 0; |
1541 | fts_string_t word; |
1542 | |
1543 | ranking = rbt_value(fts_ranking_t, node); |
1544 | |
1545 | query->error = fts_query_process_doc_id( |
1546 | query, ranking->doc_id, ranking->rank); |
1547 | |
1548 | if (query->error != DB_SUCCESS) { |
1549 | DBUG_RETURN(query->error); |
1550 | } |
1551 | |
1552 | /* Merge words. Don't need to take operator into account. */ |
1553 | ut_a(ranking->words); |
1554 | while (fts_ranking_words_get_next(query, ranking, &pos, &word)) { |
1555 | fts_query_add_word_to_document(query, ranking->doc_id, |
1556 | &word); |
1557 | } |
1558 | } |
1559 | |
1560 | /* If it is an intersection operation, reset query->doc_ids |
1561 | to query->intersection and free the old result list. */ |
1562 | if (query->oper == FTS_EXIST && query->intersection != NULL) { |
1563 | fts_query_free_doc_ids(query, query->doc_ids); |
1564 | query->doc_ids = query->intersection; |
1565 | query->intersection = NULL; |
1566 | } |
1567 | |
1568 | DBUG_RETURN(DB_SUCCESS); |
1569 | } |
1570 | |
1571 | /*****************************************************************//** |
1572 | Skip non-whitespace in a string. Move ptr to the next word boundary. |
1573 | @return pointer to first whitespace character or end */ |
1574 | UNIV_INLINE |
1575 | byte* |
1576 | fts_query_skip_word( |
1577 | /*================*/ |
1578 | byte* ptr, /*!< in: start of scan */ |
1579 | const byte* end) /*!< in: pointer to end of string */ |
1580 | { |
1581 | /* TODO: Does this have to be UTF-8 too ? */ |
1582 | while (ptr < end && !(ispunct(*ptr) || isspace(*ptr))) { |
1583 | ++ptr; |
1584 | } |
1585 | |
1586 | return(ptr); |
1587 | } |
1588 | |
1589 | /*****************************************************************//** |
1590 | Check whether the remaining terms in the phrase match the text. |
1591 | @return TRUE if matched else FALSE */ |
1592 | static |
1593 | ibool |
1594 | fts_query_match_phrase_terms( |
1595 | /*=========================*/ |
1596 | fts_phrase_t* phrase, /*!< in: phrase to match */ |
1597 | byte** start, /*!< in/out: text to search, we can't |
1598 | make this const becase we need to |
1599 | first convert the string to |
1600 | lowercase */ |
1601 | const byte* end, /*!< in: pointer to the end of |
1602 | the string to search */ |
1603 | mem_heap_t* heap) /*!< in: heap */ |
1604 | { |
1605 | ulint i; |
1606 | byte* ptr = *start; |
1607 | const ib_vector_t* tokens = phrase->tokens; |
1608 | ulint distance = phrase->distance; |
1609 | |
1610 | /* We check only from the second term onwards, since the first |
1611 | must have matched otherwise we wouldn't be here. */ |
1612 | for (i = 1; ptr < end && i < ib_vector_size(tokens); /* No op */) { |
1613 | fts_string_t match; |
1614 | fts_string_t cmp_str; |
1615 | const fts_string_t* token; |
1616 | int result; |
1617 | ulint ret; |
1618 | |
1619 | ret = innobase_mysql_fts_get_token( |
1620 | phrase->charset, ptr, |
1621 | const_cast<byte*>(end), &match); |
1622 | |
1623 | if (match.f_len > 0) { |
1624 | /* Get next token to match. */ |
1625 | token = static_cast<const fts_string_t*>( |
1626 | ib_vector_get_const(tokens, i)); |
1627 | |
1628 | fts_string_dup(&cmp_str, &match, heap); |
1629 | |
1630 | result = innobase_fts_text_case_cmp( |
1631 | phrase->charset, token, &cmp_str); |
1632 | |
1633 | /* Skip the rest of the tokens if this one doesn't |
1634 | match and the proximity distance is exceeded. */ |
1635 | if (result |
1636 | && (distance == ULINT_UNDEFINED |
1637 | || distance == 0)) { |
1638 | |
1639 | break; |
1640 | } |
1641 | |
1642 | /* This token matched move to the next token. */ |
1643 | if (result == 0) { |
1644 | /* Advance the text to search by the length |
1645 | of the last token. */ |
1646 | ptr += ret; |
1647 | |
1648 | /* Advance to the next token. */ |
1649 | ++i; |
1650 | } else { |
1651 | |
1652 | ut_a(distance != ULINT_UNDEFINED); |
1653 | |
1654 | ptr = fts_query_skip_word(ptr, end); |
1655 | } |
1656 | |
1657 | /* Distance can be 0 for exact matches. */ |
1658 | if (distance != ULINT_UNDEFINED && distance > 0) { |
1659 | --distance; |
1660 | } |
1661 | } else { |
1662 | ptr += ret; |
1663 | } |
1664 | } |
1665 | |
1666 | *start = ptr; |
1667 | |
1668 | /* Can't be greater than the number of elements. */ |
1669 | ut_a(i <= ib_vector_size(tokens)); |
1670 | |
1671 | /* This is the case for multiple words. */ |
1672 | if (i == ib_vector_size(tokens)) { |
1673 | phrase->found = TRUE; |
1674 | } |
1675 | |
1676 | return(phrase->found); |
1677 | } |
1678 | |
1679 | /*****************************************************************//** |
1680 | Callback function to count the number of words in position ranges, |
1681 | and see whether the word count is in specified "phrase->distance" |
1682 | @return true if the number of characters is less than the "distance" */ |
1683 | static |
1684 | bool |
1685 | fts_proximity_is_word_in_range( |
1686 | /*===========================*/ |
1687 | const fts_phrase_t* |
1688 | phrase, /*!< in: phrase with the search info */ |
1689 | byte* start, /*!< in: text to search */ |
1690 | ulint total_len) /*!< in: length of text */ |
1691 | { |
1692 | fts_proximity_t* proximity_pos = phrase->proximity_pos; |
1693 | |
1694 | ut_ad(proximity_pos->n_pos == proximity_pos->min_pos.size()); |
1695 | ut_ad(proximity_pos->n_pos == proximity_pos->max_pos.size()); |
1696 | |
1697 | /* Search each matched position pair (with min and max positions) |
1698 | and count the number of words in the range */ |
1699 | for (ulint i = 0; i < proximity_pos->n_pos; i++) { |
1700 | ulint cur_pos = proximity_pos->min_pos[i]; |
1701 | ulint n_word = 0; |
1702 | |
1703 | ut_ad(proximity_pos->max_pos[i] <= total_len); |
1704 | |
1705 | /* Walk through words in the range and count them */ |
1706 | while (cur_pos <= proximity_pos->max_pos[i]) { |
1707 | ulint len; |
1708 | fts_string_t str; |
1709 | |
1710 | len = innobase_mysql_fts_get_token( |
1711 | phrase->charset, |
1712 | start + cur_pos, |
1713 | start + total_len, &str); |
1714 | |
1715 | if (len == 0) { |
1716 | break; |
1717 | } |
1718 | |
1719 | /* Advances position with "len" bytes */ |
1720 | cur_pos += len; |
1721 | |
1722 | /* Record the number of words */ |
1723 | if (str.f_n_char > 0) { |
1724 | n_word++; |
1725 | } |
1726 | |
1727 | if (n_word > phrase->distance) { |
1728 | break; |
1729 | } |
1730 | } |
1731 | |
1732 | /* Check if the number of words is less than specified |
1733 | "distance" */ |
1734 | if (n_word && n_word <= phrase->distance) { |
1735 | return(true); |
1736 | } |
1737 | } |
1738 | |
1739 | return(false); |
1740 | } |
1741 | |
1742 | /*****************************************************************//** |
1743 | FTS plugin parser 'myql_add_word' callback function for phrase match |
1744 | Refer to 'st_mysql_ftparser_param' for more detail. |
1745 | @return 0 if match, or return non-zero */ |
1746 | static |
1747 | int |
1748 | fts_query_match_phrase_add_word_for_parser( |
1749 | /*=======================================*/ |
1750 | MYSQL_FTPARSER_PARAM* param, /*!< in: parser param */ |
1751 | const char* word, /*!< in: token */ |
1752 | int word_len, /*!< in: token length */ |
1753 | MYSQL_FTPARSER_BOOLEAN_INFO*) |
1754 | { |
1755 | fts_phrase_param_t* phrase_param; |
1756 | fts_phrase_t* phrase; |
1757 | const ib_vector_t* tokens; |
1758 | fts_string_t match; |
1759 | fts_string_t cmp_str; |
1760 | const fts_string_t* token; |
1761 | int result; |
1762 | mem_heap_t* heap; |
1763 | |
1764 | phrase_param = static_cast<fts_phrase_param_t*>(param->mysql_ftparam); |
1765 | heap = phrase_param->heap; |
1766 | phrase = phrase_param->phrase; |
1767 | tokens = phrase->tokens; |
1768 | |
1769 | /* In case plugin parser doesn't check return value */ |
1770 | if (phrase_param->token_index == ib_vector_size(tokens)) { |
1771 | return(1); |
1772 | } |
1773 | |
1774 | match.f_str = (uchar *)(word); |
1775 | match.f_len = ulint(word_len); |
1776 | match.f_n_char= fts_get_token_size(phrase->charset, word, match.f_len); |
1777 | |
1778 | if (match.f_len > 0) { |
1779 | /* Get next token to match. */ |
1780 | ut_a(phrase_param->token_index < ib_vector_size(tokens)); |
1781 | token = static_cast<const fts_string_t*>( |
1782 | ib_vector_get_const(tokens, phrase_param->token_index)); |
1783 | |
1784 | fts_string_dup(&cmp_str, &match, heap); |
1785 | |
1786 | result = innobase_fts_text_case_cmp( |
1787 | phrase->charset, token, &cmp_str); |
1788 | |
1789 | if (result == 0) { |
1790 | phrase_param->token_index++; |
1791 | } else { |
1792 | return(1); |
1793 | } |
1794 | } |
1795 | |
1796 | /* Can't be greater than the number of elements. */ |
1797 | ut_a(phrase_param->token_index <= ib_vector_size(tokens)); |
1798 | |
1799 | /* This is the case for multiple words. */ |
1800 | if (phrase_param->token_index == ib_vector_size(tokens)) { |
1801 | phrase->found = TRUE; |
1802 | } |
1803 | |
1804 | return(static_cast<int>(phrase->found)); |
1805 | } |
1806 | |
1807 | /*****************************************************************//** |
1808 | Check whether the terms in the phrase match the text. |
1809 | @return TRUE if matched else FALSE */ |
1810 | static |
1811 | ibool |
1812 | fts_query_match_phrase_terms_by_parser( |
1813 | /*===================================*/ |
1814 | fts_phrase_param_t* phrase_param, /* in/out: phrase param */ |
1815 | st_mysql_ftparser* parser, /* in: plugin fts parser */ |
1816 | byte* text, /* in: text to check */ |
1817 | ulint len) /* in: text length */ |
1818 | { |
1819 | MYSQL_FTPARSER_PARAM param; |
1820 | |
1821 | ut_a(parser); |
1822 | |
1823 | /* Set paramters for param */ |
1824 | param.mysql_parse = fts_tokenize_document_internal; |
1825 | param.mysql_add_word = fts_query_match_phrase_add_word_for_parser; |
1826 | param.mysql_ftparam = phrase_param; |
1827 | param.cs = phrase_param->phrase->charset; |
1828 | param.doc = reinterpret_cast<char*>(text); |
1829 | param.length = static_cast<int>(len); |
1830 | param.mode= MYSQL_FTPARSER_WITH_STOPWORDS; |
1831 | |
1832 | PARSER_INIT(parser, ¶m); |
1833 | parser->parse(¶m); |
1834 | PARSER_DEINIT(parser, ¶m); |
1835 | |
1836 | return(phrase_param->phrase->found); |
1837 | } |
1838 | |
1839 | /*****************************************************************//** |
1840 | Callback function to fetch and search the document. |
1841 | @return TRUE if matched else FALSE */ |
1842 | static |
1843 | ibool |
1844 | fts_query_match_phrase( |
1845 | /*===================*/ |
1846 | fts_phrase_t* phrase, /*!< in: phrase to match */ |
1847 | byte* start, /*!< in: text to search, we can't make |
1848 | this const becase we need to first |
1849 | convert the string to lowercase */ |
1850 | ulint cur_len, /*!< in: length of text */ |
1851 | ulint prev_len, /*!< in: total length for searched |
1852 | doc fields*/ |
1853 | mem_heap_t* heap) /* heap */ |
1854 | { |
1855 | ulint i; |
1856 | const fts_string_t* first; |
1857 | const byte* end = start + cur_len; |
1858 | const ib_vector_t* tokens = phrase->tokens; |
1859 | const ib_vector_t* positions = phrase->match->positions; |
1860 | |
1861 | ut_a(!phrase->found); |
1862 | ut_a(phrase->match->doc_id > 0); |
1863 | ut_a(ib_vector_size(tokens) > 0); |
1864 | ut_a(ib_vector_size(positions) > 0); |
1865 | |
1866 | first = static_cast<const fts_string_t*>( |
1867 | ib_vector_get_const(tokens, 0)); |
1868 | |
1869 | ut_a(phrase->match->start < ib_vector_size(positions)); |
1870 | |
1871 | for (i = phrase->match->start; i < ib_vector_size(positions); ++i) { |
1872 | ulint pos; |
1873 | byte* ptr = start; |
1874 | |
1875 | pos = *(ulint*) ib_vector_get_const(positions, i); |
1876 | |
1877 | if (pos == ULINT_UNDEFINED) { |
1878 | break; |
1879 | } |
1880 | |
1881 | if (pos < prev_len) { |
1882 | continue; |
1883 | } |
1884 | |
1885 | /* Document positions are calculated from the beginning |
1886 | of the first field, need to save the length for each |
1887 | searched field to adjust the doc position when search |
1888 | phrases. */ |
1889 | pos -= prev_len; |
1890 | ptr = start + pos; |
1891 | |
1892 | /* Within limits ? */ |
1893 | if (ptr >= end) { |
1894 | break; |
1895 | } |
1896 | |
1897 | if (phrase->parser) { |
1898 | fts_phrase_param_t phrase_param; |
1899 | |
1900 | phrase_param.phrase = phrase; |
1901 | phrase_param.token_index = 0; |
1902 | phrase_param.heap = heap; |
1903 | |
1904 | if (fts_query_match_phrase_terms_by_parser( |
1905 | &phrase_param, |
1906 | phrase->parser, |
1907 | ptr, |
1908 | ulint(end - ptr))) { |
1909 | break; |
1910 | } |
1911 | } else { |
1912 | fts_string_t match; |
1913 | fts_string_t cmp_str; |
1914 | ulint ret; |
1915 | |
1916 | match.f_str = ptr; |
1917 | ret = innobase_mysql_fts_get_token( |
1918 | phrase->charset, start + pos, |
1919 | const_cast<byte*>(end), &match); |
1920 | |
1921 | if (match.f_len == 0) { |
1922 | break; |
1923 | } |
1924 | |
1925 | fts_string_dup(&cmp_str, &match, heap); |
1926 | |
1927 | if (innobase_fts_text_case_cmp( |
1928 | phrase->charset, first, &cmp_str) == 0) { |
1929 | |
1930 | /* This is the case for the single word |
1931 | in the phrase. */ |
1932 | if (ib_vector_size(phrase->tokens) == 1) { |
1933 | phrase->found = TRUE; |
1934 | break; |
1935 | } |
1936 | |
1937 | ptr += ret; |
1938 | |
1939 | /* Match the remaining terms in the phrase. */ |
1940 | if (fts_query_match_phrase_terms(phrase, &ptr, |
1941 | end, heap)) { |
1942 | break; |
1943 | } |
1944 | } |
1945 | } |
1946 | } |
1947 | |
1948 | return(phrase->found); |
1949 | } |
1950 | |
1951 | /*****************************************************************//** |
1952 | Callback function to fetch and search the document. |
1953 | @return whether the phrase is found */ |
1954 | static |
1955 | ibool |
1956 | fts_query_fetch_document( |
1957 | /*=====================*/ |
1958 | void* row, /*!< in: sel_node_t* */ |
1959 | void* user_arg) /*!< in: fts_doc_t* */ |
1960 | { |
1961 | |
1962 | que_node_t* exp; |
1963 | sel_node_t* node = static_cast<sel_node_t*>(row); |
1964 | fts_phrase_t* phrase = static_cast<fts_phrase_t*>(user_arg); |
1965 | ulint prev_len = 0; |
1966 | ulint total_len = 0; |
1967 | byte* document_text = NULL; |
1968 | |
1969 | exp = node->select_list; |
1970 | |
1971 | phrase->found = FALSE; |
1972 | |
1973 | /* For proximity search, we will need to get the whole document |
1974 | from all fields, so first count the total length of the document |
1975 | from all the fields */ |
1976 | if (phrase->proximity_pos) { |
1977 | while (exp) { |
1978 | ulint field_len; |
1979 | dfield_t* dfield = que_node_get_val(exp); |
1980 | byte* data = static_cast<byte*>( |
1981 | dfield_get_data(dfield)); |
1982 | |
1983 | if (dfield_is_ext(dfield)) { |
1984 | ulint local_len = dfield_get_len(dfield); |
1985 | |
1986 | local_len -= BTR_EXTERN_FIELD_REF_SIZE; |
1987 | |
1988 | field_len = mach_read_from_4( |
1989 | data + local_len + BTR_EXTERN_LEN + 4); |
1990 | } else { |
1991 | field_len = dfield_get_len(dfield); |
1992 | } |
1993 | |
1994 | if (field_len != UNIV_SQL_NULL) { |
1995 | total_len += field_len + 1; |
1996 | } |
1997 | |
1998 | exp = que_node_get_next(exp); |
1999 | } |
2000 | |
2001 | document_text = static_cast<byte*>(mem_heap_zalloc( |
2002 | phrase->heap, total_len)); |
2003 | |
2004 | if (!document_text) { |
2005 | return(FALSE); |
2006 | } |
2007 | } |
2008 | |
2009 | exp = node->select_list; |
2010 | |
2011 | while (exp) { |
2012 | dfield_t* dfield = que_node_get_val(exp); |
2013 | byte* data = static_cast<byte*>( |
2014 | dfield_get_data(dfield)); |
2015 | ulint cur_len; |
2016 | |
2017 | if (dfield_is_ext(dfield)) { |
2018 | data = btr_copy_externally_stored_field( |
2019 | &cur_len, data, phrase->page_size, |
2020 | dfield_get_len(dfield), phrase->heap); |
2021 | } else { |
2022 | cur_len = dfield_get_len(dfield); |
2023 | } |
2024 | |
2025 | if (cur_len != UNIV_SQL_NULL && cur_len != 0) { |
2026 | if (phrase->proximity_pos) { |
2027 | ut_ad(prev_len + cur_len <= total_len); |
2028 | memcpy(document_text + prev_len, data, cur_len); |
2029 | } else { |
2030 | /* For phrase search */ |
2031 | phrase->found = |
2032 | fts_query_match_phrase( |
2033 | phrase, |
2034 | static_cast<byte*>(data), |
2035 | cur_len, prev_len, |
2036 | phrase->heap); |
2037 | } |
2038 | |
2039 | /* Document positions are calculated from the beginning |
2040 | of the first field, need to save the length for each |
2041 | searched field to adjust the doc position when search |
2042 | phrases. */ |
2043 | prev_len += cur_len + 1; |
2044 | } |
2045 | |
2046 | if (phrase->found) { |
2047 | break; |
2048 | } |
2049 | |
2050 | exp = que_node_get_next(exp); |
2051 | } |
2052 | |
2053 | if (phrase->proximity_pos) { |
2054 | ut_ad(prev_len <= total_len); |
2055 | |
2056 | phrase->found = fts_proximity_is_word_in_range( |
2057 | phrase, document_text, total_len); |
2058 | } |
2059 | |
2060 | return(phrase->found); |
2061 | } |
2062 | |
2063 | #if 0 |
2064 | /******************************************************************** |
2065 | Callback function to check whether a record was found or not. */ |
2066 | static |
2067 | ibool |
2068 | fts_query_select( |
2069 | /*=============*/ |
2070 | void* row, /*!< in: sel_node_t* */ |
2071 | void* user_arg) /*!< in: fts_doc_t* */ |
2072 | { |
2073 | int i; |
2074 | que_node_t* exp; |
2075 | sel_node_t* node = row; |
2076 | fts_select_t* select = user_arg; |
2077 | |
2078 | ut_a(select->word_freq); |
2079 | ut_a(select->word_freq->doc_freqs); |
2080 | |
2081 | exp = node->select_list; |
2082 | |
2083 | for (i = 0; exp && !select->found; ++i) { |
2084 | dfield_t* dfield = que_node_get_val(exp); |
2085 | void* data = dfield_get_data(dfield); |
2086 | ulint len = dfield_get_len(dfield); |
2087 | |
2088 | switch (i) { |
2089 | case 0: /* DOC_COUNT */ |
2090 | if (len != UNIV_SQL_NULL && len != 0) { |
2091 | |
2092 | select->word_freq->doc_count += |
2093 | mach_read_from_4(data); |
2094 | } |
2095 | break; |
2096 | |
2097 | case 1: /* ILIST */ |
2098 | if (len != UNIV_SQL_NULL && len != 0) { |
2099 | |
2100 | fts_query_find_doc_id(select, data, len); |
2101 | } |
2102 | break; |
2103 | |
2104 | default: |
2105 | ut_error; |
2106 | } |
2107 | |
2108 | exp = que_node_get_next(exp); |
2109 | } |
2110 | |
2111 | return(FALSE); |
2112 | } |
2113 | |
/********************************************************************
Read the rows from the FTS index, that match word and where the
doc id is between first and last doc id.
The prepared statement in *graph is reused when it is already set,
otherwise it is parsed here and handed back through *graph. Evaluation is
retried for as long as it fails with DB_LOCK_WAIT_TIMEOUT.
NOTE(review): this function is inside an #if 0 region and is not compiled.
@return DB_SUCCESS if all go well else error code */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
dberr_t
fts_query_find_term(
/*================*/
	fts_query_t*		query,	/*!< in: FTS query state */
	que_t**			graph,	/*!< in/out: prepared statement;
					created here when *graph is NULL */
	const fts_string_t*	word,	/*!< in: the word to fetch */
	doc_id_t		doc_id,	/*!< in: doc id to match */
	ulint*			min_pos,/*!< in/out: pos found must be
					greater than this minimum value. */
	ibool*			found)	/*!< out: TRUE if found else FALSE */
{
	pars_info_t*		info;
	dberr_t			error;
	fts_select_t		select;
	doc_id_t		match_doc_id;
	trx_t*			trx = query->trx;
	char			table_name[MAX_FULL_NAME_LEN];

	trx->op_info = "fetching FTS index matching nodes";

	if (*graph) {
		/* Reuse the bind info of the already prepared statement. */
		info = (*graph)->info;
	} else {
		ulint	selected;

		info = pars_info_create();

		/* The auxiliary index table suffix is chosen from the
		first byte of the word. */
		selected = fts_select_index(*word->f_str);
		query->fts_index_table.suffix = fts_get_suffix(selected);

		fts_get_table_name(&query->fts_index_table, table_name);
		pars_info_bind_id(info, true, "index_table_name", table_name);
	}

	/* Callback state handed to fts_query_select() for every row. */
	select.found = FALSE;
	select.doc_id = doc_id;
	select.min_pos = *min_pos;
	/* NOTE(review): fts_query_phrase_split() passes an fts_string_t* to
	fts_query_add_word_freq(), while word->f_str is passed here; confirm
	the expected argument type before re-enabling this code. */
	select.word_freq = fts_query_add_word_freq(query, word->f_str);

	pars_info_bind_function(info, "my_func", fts_query_select, &select);
	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);

	/* Convert to "storage" byte order. */
	fts_write_doc_id((byte*) &match_doc_id, doc_id);

	/* Both bounds are bound to the same doc id, so the cursor selects
	rows with first_doc_id <= doc_id <= last_doc_id. */
	fts_bind_doc_id(info, "min_doc_id", &match_doc_id);

	fts_bind_doc_id(info, "max_doc_id", &match_doc_id);

	if (!*graph) {

		*graph = fts_parse_sql(
			&query->fts_index_table,
			info,
			"DECLARE FUNCTION my_func;\n"
			"DECLARE CURSOR c IS"
			" SELECT doc_count, ilist\n"
			" FROM $index_table_name\n"
			" WHERE word LIKE :word AND"
			" first_doc_id <= :min_doc_id AND"
			" last_doc_id >= :max_doc_id\n"
			" ORDER BY first_doc_id;\n"
			"BEGIN\n"
			"\n"
			"OPEN c;\n"
			"WHILE 1 = 1 LOOP\n"
			" FETCH c INTO my_func();\n"
			" IF c % NOTFOUND THEN\n"
			" EXIT;\n"
			" END IF;\n"
			"END LOOP;\n"
			"CLOSE c;");
	}

	/* Evaluate; only a lock wait timeout is retried. */
	for (;;) {
		error = fts_eval_sql(trx, *graph);

		if (error == DB_SUCCESS) {

			break;				/* Exit the loop. */
		} else {

			if (error == DB_LOCK_WAIT_TIMEOUT) {
				ib::warn() << "lock wait timeout reading FTS"
					" index. Retrying!";

				/* Clear the error before retrying. */
				trx->error_state = DB_SUCCESS;
			} else {
				ib::error() << error
					<< " while reading FTS index.";

				break;			/* Exit the loop. */
			}
		}
	}

	/* Value to return */
	*found = select.found;

	/* *min_pos is only updated when a match was found. */
	if (*found) {
		*min_pos = select.min_pos;
	}

	return(error);
}
2224 | |
2225 | /******************************************************************** |
2226 | Callback aggregator for int columns. */ |
2227 | static |
2228 | ibool |
2229 | fts_query_sum( |
2230 | /*==========*/ |
2231 | /*!< out: always returns TRUE */ |
2232 | void* row, /*!< in: sel_node_t* */ |
2233 | void* user_arg) /*!< in: ulint* */ |
2234 | { |
2235 | |
2236 | que_node_t* exp; |
2237 | sel_node_t* node = row; |
2238 | ulint* total = user_arg; |
2239 | |
2240 | exp = node->select_list; |
2241 | |
2242 | while (exp) { |
2243 | dfield_t* dfield = que_node_get_val(exp); |
2244 | void* data = dfield_get_data(dfield); |
2245 | ulint len = dfield_get_len(dfield); |
2246 | |
2247 | if (len != UNIV_SQL_NULL && len != 0) { |
2248 | *total += mach_read_from_4(data); |
2249 | } |
2250 | |
2251 | exp = que_node_get_next(exp); |
2252 | } |
2253 | |
2254 | return(TRUE); |
2255 | } |
2256 | |
2257 | /******************************************************************** |
2258 | Calculate the total documents that contain a particular word (term). |
2259 | @return DB_SUCCESS if all go well else error code */ |
2260 | static MY_ATTRIBUTE((nonnull, warn_unused_result)) |
2261 | dberr_t |
2262 | fts_query_total_docs_containing_term( |
2263 | /*=================================*/ |
2264 | fts_query_t* query, /*!< in: FTS query state */ |
2265 | const fts_string_t* word, /*!< in: the word to check */ |
2266 | ulint* total) /*!< out: documents containing word */ |
2267 | { |
2268 | pars_info_t* info; |
2269 | dberr_t error; |
2270 | que_t* graph; |
2271 | ulint selected; |
2272 | trx_t* trx = query->trx; |
2273 | char table_name[MAX_FULL_NAME_LEN] |
2274 | |
2275 | trx->op_info = "fetching FTS index document count" ; |
2276 | |
2277 | *total = 0; |
2278 | |
2279 | info = pars_info_create(); |
2280 | |
2281 | pars_info_bind_function(info, "my_func" , fts_query_sum, total); |
2282 | pars_info_bind_varchar_literal(info, "word" , word->f_str, word->f_len); |
2283 | |
2284 | selected = fts_select_index(*word->f_str); |
2285 | |
2286 | query->fts_index_table.suffix = fts_get_suffix(selected); |
2287 | |
2288 | fts_get_table_name(&query->fts_index_table, table_name); |
2289 | |
2290 | pars_info_bind_id(info, true, "index_table_name" , table_name); |
2291 | |
2292 | graph = fts_parse_sql( |
2293 | &query->fts_index_table, |
2294 | info, |
2295 | "DECLARE FUNCTION my_func;\n" |
2296 | "DECLARE CURSOR c IS" |
2297 | " SELECT doc_count\n" |
2298 | " FROM $index_table_name\n" |
2299 | " WHERE word = :word" |
2300 | " ORDER BY first_doc_id;\n" |
2301 | "BEGIN\n" |
2302 | "\n" |
2303 | "OPEN c;\n" |
2304 | "WHILE 1 = 1 LOOP\n" |
2305 | " FETCH c INTO my_func();\n" |
2306 | " IF c % NOTFOUND THEN\n" |
2307 | " EXIT;\n" |
2308 | " END IF;\n" |
2309 | "END LOOP;\n" |
2310 | "CLOSE c;" ); |
2311 | |
2312 | for (;;) { |
2313 | error = fts_eval_sql(trx, graph); |
2314 | |
2315 | if (error == DB_SUCCESS) { |
2316 | |
2317 | break; /* Exit the loop. */ |
2318 | } else { |
2319 | |
2320 | if (error == DB_LOCK_WAIT_TIMEOUT) { |
2321 | ib::warn() << "lock wait timeout reading FTS" |
2322 | " index. Retrying!" ; |
2323 | |
2324 | trx->error_state = DB_SUCCESS; |
2325 | } else { |
2326 | ib::error() << error |
2327 | << " while reading FTS index." ; |
2328 | |
2329 | break; /* Exit the loop. */ |
2330 | } |
2331 | } |
2332 | } |
2333 | |
2334 | fts_que_graph_free(graph); |
2335 | |
2336 | return(error); |
2337 | } |
2338 | |
2339 | /******************************************************************** |
2340 | Get the total number of words in a documents. |
2341 | @return DB_SUCCESS if all go well else error code */ |
2342 | static MY_ATTRIBUTE((nonnull, warn_unused_result)) |
2343 | dberr_t |
2344 | fts_query_terms_in_document( |
2345 | /*========================*/ |
2346 | fts_query_t* query, /*!< in: FTS query state */ |
2347 | doc_id_t doc_id, /*!< in: the word to check */ |
2348 | ulint* total) /*!< out: total words in document */ |
2349 | { |
2350 | pars_info_t* info; |
2351 | dberr_t error; |
2352 | que_t* graph; |
2353 | doc_id_t read_doc_id; |
2354 | trx_t* trx = query->trx; |
2355 | char table_name[MAX_FULL_NAME_LEN]; |
2356 | |
2357 | trx->op_info = "fetching FTS document term count" ; |
2358 | |
2359 | *total = 0; |
2360 | |
2361 | info = pars_info_create(); |
2362 | |
2363 | pars_info_bind_function(info, "my_func" , fts_query_sum, total); |
2364 | |
2365 | /* Convert to "storage" byte order. */ |
2366 | fts_write_doc_id((byte*) &read_doc_id, doc_id); |
2367 | fts_bind_doc_id(info, "doc_id" , &read_doc_id); |
2368 | |
2369 | query->fts_index_table.suffix = "DOC_ID" ; |
2370 | |
2371 | fts_get_table_name(&query->fts_index_table, table_name); |
2372 | |
2373 | pars_info_bind_id(info, true, "index_table_name" , table_name); |
2374 | |
2375 | graph = fts_parse_sql( |
2376 | &query->fts_index_table, |
2377 | info, |
2378 | "DECLARE FUNCTION my_func;\n" |
2379 | "DECLARE CURSOR c IS" |
2380 | " SELECT count\n" |
2381 | " FROM $index_table_name\n" |
2382 | " WHERE doc_id = :doc_id" |
2383 | " BEGIN\n" |
2384 | "\n" |
2385 | "OPEN c;\n" |
2386 | "WHILE 1 = 1 LOOP\n" |
2387 | " FETCH c INTO my_func();\n" |
2388 | " IF c % NOTFOUND THEN\n" |
2389 | " EXIT;\n" |
2390 | " END IF;\n" |
2391 | "END LOOP;\n" |
2392 | "CLOSE c;" ); |
2393 | |
2394 | for (;;) { |
2395 | error = fts_eval_sql(trx, graph); |
2396 | |
2397 | if (error == DB_SUCCESS) { |
2398 | |
2399 | break; /* Exit the loop. */ |
2400 | } else { |
2401 | |
2402 | if (error == DB_LOCK_WAIT_TIMEOUT) { |
2403 | ib::warn() << "lock wait timeout reading FTS" |
2404 | " doc id table. Retrying!" ; |
2405 | |
2406 | trx->error_state = DB_SUCCESS; |
2407 | } else { |
2408 | ib::error() << error << " while reading FTS" |
2409 | " doc id table." ; |
2410 | |
2411 | break; /* Exit the loop. */ |
2412 | } |
2413 | } |
2414 | } |
2415 | |
2416 | fts_que_graph_free(graph); |
2417 | |
2418 | return(error); |
2419 | } |
2420 | #endif |
2421 | |
2422 | /*****************************************************************//** |
2423 | Retrieve the document and match the phrase tokens. |
2424 | @return DB_SUCCESS or error code */ |
2425 | MY_ATTRIBUTE((nonnull(1,2,3,6), warn_unused_result)) |
2426 | static |
2427 | dberr_t |
2428 | fts_query_match_document( |
2429 | /*=====================*/ |
2430 | ib_vector_t* tokens, /*!< in: phrase tokens */ |
2431 | fts_get_doc_t* get_doc, /*!< in: table and prepared statements */ |
2432 | fts_match_t* match, /*!< in: doc id and positions */ |
2433 | ulint distance, /*!< in: proximity distance */ |
2434 | st_mysql_ftparser* parser, /*!< in: fts plugin parser */ |
2435 | ibool* found) /*!< out: TRUE if phrase found */ |
2436 | { |
2437 | dberr_t error; |
2438 | fts_phrase_t phrase(get_doc->index_cache->index->table); |
2439 | |
2440 | phrase.match = match; /* Positions to match */ |
2441 | phrase.tokens = tokens; /* Tokens to match */ |
2442 | phrase.distance = distance; |
2443 | phrase.charset = get_doc->index_cache->charset; |
2444 | phrase.heap = mem_heap_create(512); |
2445 | phrase.parser = parser; |
2446 | |
2447 | *found = phrase.found = FALSE; |
2448 | |
2449 | error = fts_doc_fetch_by_doc_id( |
2450 | get_doc, match->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL, |
2451 | fts_query_fetch_document, &phrase); |
2452 | |
2453 | if (error != DB_SUCCESS) { |
2454 | ib::error() << "(" << ut_strerr(error) |
2455 | << ") matching document." ; |
2456 | } else { |
2457 | *found = phrase.found; |
2458 | } |
2459 | |
2460 | mem_heap_free(phrase.heap); |
2461 | |
2462 | return(error); |
2463 | } |
2464 | |
2465 | /*****************************************************************//** |
2466 | This function fetches the original documents and count the |
2467 | words in between matching words to see that is in specified distance |
2468 | @return DB_SUCCESS if all OK */ |
2469 | static MY_ATTRIBUTE((nonnull, warn_unused_result)) |
2470 | bool |
2471 | fts_query_is_in_proximity_range( |
2472 | /*============================*/ |
2473 | const fts_query_t* query, /*!< in: query instance */ |
2474 | fts_match_t** match, /*!< in: query instance */ |
2475 | fts_proximity_t* qualified_pos) /*!< in: position info for |
2476 | qualified ranges */ |
2477 | { |
2478 | fts_get_doc_t get_doc; |
2479 | fts_cache_t* cache = query->index->table->fts->cache; |
2480 | dberr_t err; |
2481 | |
2482 | memset(&get_doc, 0x0, sizeof(get_doc)); |
2483 | |
2484 | rw_lock_x_lock(&cache->lock); |
2485 | get_doc.index_cache = fts_find_index_cache(cache, query->index); |
2486 | rw_lock_x_unlock(&cache->lock); |
2487 | ut_a(get_doc.index_cache != NULL); |
2488 | |
2489 | fts_phrase_t phrase(get_doc.index_cache->index->table); |
2490 | |
2491 | phrase.distance = query->distance; |
2492 | phrase.charset = get_doc.index_cache->charset; |
2493 | phrase.heap = mem_heap_create(512); |
2494 | phrase.proximity_pos = qualified_pos; |
2495 | phrase.found = FALSE; |
2496 | |
2497 | err = fts_doc_fetch_by_doc_id( |
2498 | &get_doc, match[0]->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL, |
2499 | fts_query_fetch_document, &phrase); |
2500 | |
2501 | if (err != DB_SUCCESS) { |
2502 | ib::error() << "(" << ut_strerr(err) << ") in verification" |
2503 | " phase of proximity search" ; |
2504 | } |
2505 | |
2506 | /* Free the prepared statement. */ |
2507 | if (get_doc.get_document_graph) { |
2508 | fts_que_graph_free(get_doc.get_document_graph); |
2509 | get_doc.get_document_graph = NULL; |
2510 | } |
2511 | |
2512 | mem_heap_free(phrase.heap); |
2513 | |
2514 | return(err == DB_SUCCESS && phrase.found); |
2515 | } |
2516 | |
2517 | /*****************************************************************//** |
2518 | Iterate over the matched document ids and search the for the |
2519 | actual phrase in the text. |
2520 | @return DB_SUCCESS if all OK */ |
2521 | static MY_ATTRIBUTE((nonnull, warn_unused_result)) |
2522 | dberr_t |
2523 | fts_query_search_phrase( |
2524 | /*====================*/ |
2525 | fts_query_t* query, /*!< in: query instance */ |
2526 | ib_vector_t* orig_tokens, /*!< in: tokens to search, |
2527 | with any stopwords in the |
2528 | original phrase */ |
2529 | ib_vector_t* tokens) /*!< in: tokens that does |
2530 | not include stopwords and |
2531 | can be used to calculate |
2532 | ranking */ |
2533 | { |
2534 | ulint i; |
2535 | fts_get_doc_t get_doc; |
2536 | ulint n_matched; |
2537 | fts_cache_t* cache = query->index->table->fts->cache; |
2538 | |
2539 | n_matched = ib_vector_size(query->matched); |
2540 | |
2541 | /* Setup the doc retrieval infrastructure. */ |
2542 | memset(&get_doc, 0x0, sizeof(get_doc)); |
2543 | |
2544 | rw_lock_x_lock(&cache->lock); |
2545 | |
2546 | get_doc.index_cache = fts_find_index_cache(cache, query->index); |
2547 | |
2548 | /* Must find the index cache */ |
2549 | ut_a(get_doc.index_cache != NULL); |
2550 | |
2551 | rw_lock_x_unlock(&cache->lock); |
2552 | |
2553 | #ifdef FTS_INTERNAL_DIAG_PRINT |
2554 | ib::info() << "Start phrase search" ; |
2555 | #endif |
2556 | |
2557 | /* Read the document from disk and do the actual |
2558 | match, matching documents will be added to the current |
2559 | doc id set. */ |
2560 | for (i = 0; i < n_matched && query->error == DB_SUCCESS; ++i) { |
2561 | fts_match_t* match; |
2562 | ibool found = FALSE; |
2563 | |
2564 | match = static_cast<fts_match_t*>( |
2565 | ib_vector_get(query->matched, i)); |
2566 | |
2567 | /* Skip the document ids that were filtered out by |
2568 | an earlier pass. */ |
2569 | if (match->doc_id != 0) { |
2570 | |
2571 | query->error = fts_query_match_document( |
2572 | orig_tokens, &get_doc, match, |
2573 | query->distance, query->parser, &found); |
2574 | |
2575 | if (query->error == DB_SUCCESS && found) { |
2576 | ulint z; |
2577 | |
2578 | query->error = fts_query_process_doc_id(query, |
2579 | match->doc_id, 0); |
2580 | if (query->error != DB_SUCCESS) { |
2581 | goto func_exit; |
2582 | } |
2583 | |
2584 | for (z = 0; z < ib_vector_size(tokens); z++) { |
2585 | fts_string_t* token; |
2586 | token = static_cast<fts_string_t*>( |
2587 | ib_vector_get(tokens, z)); |
2588 | fts_query_add_word_to_document( |
2589 | query, match->doc_id, token); |
2590 | } |
2591 | } |
2592 | } |
2593 | } |
2594 | |
2595 | func_exit: |
2596 | /* Free the prepared statement. */ |
2597 | if (get_doc.get_document_graph) { |
2598 | fts_que_graph_free(get_doc.get_document_graph); |
2599 | get_doc.get_document_graph = NULL; |
2600 | } |
2601 | |
2602 | return(query->error); |
2603 | } |
2604 | |
2605 | /** Split the phrase into tokens |
2606 | @param[in,out] query query instance |
2607 | @param[in] node query node to search |
2608 | @param[in,out] tokens token vector |
2609 | @param[in,out] orig_tokens original node tokens include stopword |
2610 | @param[in,out] heap mem heap */ |
2611 | static |
2612 | void |
2613 | fts_query_phrase_split( |
2614 | fts_query_t* query, |
2615 | const fts_ast_node_t* node, |
2616 | ib_vector_t* tokens, |
2617 | ib_vector_t* orig_tokens, |
2618 | mem_heap_t* heap) |
2619 | { |
2620 | fts_string_t phrase; |
2621 | ulint len = 0; |
2622 | ulint cur_pos = 0; |
2623 | fts_ast_node_t* term_node = NULL; |
2624 | |
2625 | if (node->type == FTS_AST_TEXT) { |
2626 | phrase.f_str = node->text.ptr->str; |
2627 | phrase.f_len = node->text.ptr->len; |
2628 | len = phrase.f_len; |
2629 | } else { |
2630 | ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST); |
2631 | phrase.f_str = NULL; |
2632 | phrase.f_len = 0; |
2633 | term_node = node->list.head; |
2634 | } |
2635 | |
2636 | while (true) { |
2637 | fts_cache_t* cache = query->index->table->fts->cache; |
2638 | ulint cur_len; |
2639 | fts_string_t result_str; |
2640 | |
2641 | if (node->type == FTS_AST_TEXT) { |
2642 | if (cur_pos >= len) { |
2643 | break; |
2644 | } |
2645 | |
2646 | cur_len = innobase_mysql_fts_get_token( |
2647 | query->fts_index_table.charset, |
2648 | reinterpret_cast<const byte*>(phrase.f_str) |
2649 | + cur_pos, |
2650 | reinterpret_cast<const byte*>(phrase.f_str) |
2651 | + len, |
2652 | &result_str); |
2653 | |
2654 | if (cur_len == 0) { |
2655 | break; |
2656 | } |
2657 | |
2658 | cur_pos += cur_len; |
2659 | } else { |
2660 | ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST); |
2661 | /* Term node in parser phrase list */ |
2662 | if (term_node == NULL) { |
2663 | break; |
2664 | } |
2665 | |
2666 | ut_a(term_node->type == FTS_AST_TERM); |
2667 | result_str.f_str = term_node->term.ptr->str; |
2668 | result_str.f_len = term_node->term.ptr->len; |
2669 | result_str.f_n_char = fts_get_token_size( |
2670 | query->fts_index_table.charset, |
2671 | reinterpret_cast<char*>(result_str.f_str), |
2672 | result_str.f_len); |
2673 | |
2674 | term_node = term_node->next; |
2675 | } |
2676 | |
2677 | if (result_str.f_n_char == 0) { |
2678 | continue; |
2679 | } |
2680 | |
2681 | fts_string_t* token = static_cast<fts_string_t*>( |
2682 | ib_vector_push(tokens, NULL)); |
2683 | fts_string_dup(token, &result_str, heap); |
2684 | |
2685 | if (fts_check_token( |
2686 | &result_str, |
2687 | cache->stopword_info.cached_stopword, |
2688 | query->fts_index_table.charset)) { |
2689 | /* Add the word to the RB tree so that we can |
2690 | calculate it's frequencey within a document. */ |
2691 | fts_query_add_word_freq(query, token); |
2692 | } else { |
2693 | ib_vector_pop(tokens); |
2694 | } |
2695 | |
2696 | /* we will start to store all words including stopwords |
2697 | in the "orig_tokens" vector, but skip any leading words |
2698 | that are stopwords */ |
2699 | if (!ib_vector_is_empty(tokens)) { |
2700 | fts_string_t* orig_token = static_cast<fts_string_t*>( |
2701 | ib_vector_push(orig_tokens, NULL)); |
2702 | |
2703 | orig_token->f_str = token->f_str; |
2704 | orig_token->f_len = token->f_len; |
2705 | } |
2706 | } |
2707 | } |
2708 | |
/*****************************************************************//**
Text/Phrase search.
Splits the node into tokens, fetches the index nodes for every token into
its own match vector, and then either processes the matches directly (the
original phrase is a single word), runs the proximity check, or filters
the candidate documents and verifies the phrase against the document
text. All memory allocated here comes from a local heap that is freed
before returning.
@return DB_SUCCESS or error code */
static MY_ATTRIBUTE((warn_unused_result))
dberr_t
fts_query_phrase_search(
/*====================*/
	fts_query_t*		query,	/*!< in: query instance */
	const fts_ast_node_t*	node)	/*!< in: node to search */
{
	ib_vector_t*		tokens;
	ib_vector_t*		orig_tokens;
	mem_heap_t*		heap = mem_heap_create(sizeof(fts_string_t));
	ib_alloc_t*		heap_alloc;
	ulint			num_token;

	heap_alloc = ib_heap_allocator_create(heap);

	/* tokens: words used for ranking; orig_tokens: the phrase words
	as split by fts_query_phrase_split(). */
	tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
	orig_tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);

	/* A defined, non-zero distance selects proximity search; anything
	else is a plain phrase search. */
	if (query->distance != ULINT_UNDEFINED && query->distance > 0) {
		query->flags = FTS_PROXIMITY;
	} else {
		query->flags = FTS_PHRASE;
	}

	/* Split the phrase into tokens. */
	fts_query_phrase_split(query, node, tokens, orig_tokens, heap);

	num_token = ib_vector_size(tokens);
	if (num_token > MAX_PROXIMITY_ITEM) {
		query->error = DB_FTS_TOO_MANY_WORDS_IN_PHRASE;
		goto func_exit;
	}

	ut_ad(ib_vector_size(orig_tokens) >= num_token);

	/* Ignore empty strings. */
	if (num_token > 0) {
		fts_string_t*	token;
		fts_fetch_t	fetch;
		trx_t*		trx = query->trx;
		/* Saved so that the operator can be restored below. */
		fts_ast_oper_t	oper = query->oper;
		que_t*		graph = NULL;
		ulint		i;
		dberr_t		error;

		/* Create the vector for storing matching document ids
		and the positions of the first token of the phrase. */
		if (!query->matched) {
			/* NOTE: shadows the function-level heap_alloc. */
			ib_alloc_t*	heap_alloc;

			heap_alloc = ib_heap_allocator_create(heap);

			/* NOTE(review): query->flags was assigned either
			FTS_PROXIMITY or FTS_PHRASE above, so presumably
			only the else branch is taken - confirm. */
			if (!(query->flags & FTS_PROXIMITY)
			    && !(query->flags & FTS_PHRASE)) {
				query->matched = ib_vector_create(
					heap_alloc, sizeof(fts_match_t),
					64);
			} else {
				ut_a(num_token <= MAX_PROXIMITY_ITEM);
				/* One match vector per token;
				sizeof(query->matched) is the size of one
				ib_vector_t pointer. */
				query->match_array =
					(ib_vector_t**) mem_heap_alloc(
						heap,
						num_token *
						sizeof(query->matched));

				for (i = 0; i < num_token; i++) {
					query->match_array[i] =
					ib_vector_create(
						heap_alloc, sizeof(fts_match_t),
						64);
				}

				query->matched = query->match_array[0];
			}
		}

		/* Setup the callback args for filtering and consolidating
		the ilist. */
		fetch.read_arg = query;
		fetch.read_record = fts_query_index_fetch_nodes;

		for (i = 0; i < num_token; i++) {
			/* Fetch the index nodes of the i'th token of the
			phrase. */
			token = static_cast<fts_string_t*>(
				ib_vector_get(tokens, i));

			/* Direct the fetch callback at this token's own
			match vector. */
			if (query->flags & FTS_PROXIMITY
			    || query->flags & FTS_PHRASE) {
				query->matched = query->match_array[i];
			}

			error = fts_index_fetch_nodes(
				trx, &graph, &query->fts_index_table,
				token, &fetch);

			/* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
			ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
			if (error != DB_SUCCESS) {
				query->error = error;
			}

			/* The prepared statement is not reused across
			tokens. */
			fts_que_graph_free(graph);
			graph = NULL;

			fts_query_cache(query, token);

			if (!(query->flags & FTS_PHRASE)
			    && !(query->flags & FTS_PROXIMITY)) {
				break;
			}

			/* If any of the token can't be found,
			no need to continue match */
			if (ib_vector_is_empty(query->match_array[i])
			    || query->error != DB_SUCCESS) {
				goto func_exit;
			}
		}

		/* Just a single word, no need to fetch the original
		documents to do phrase matching */
		if (ib_vector_size(orig_tokens) == 1
		    && !ib_vector_is_empty(query->match_array[0])) {
			fts_match_t*	match;
			ulint		n_matched;

			n_matched = ib_vector_size(query->match_array[0]);

			for (i = 0; i < n_matched; i++) {
				match = static_cast<fts_match_t*>(
					ib_vector_get(
						query->match_array[0], i));

				query->error = fts_query_process_doc_id(
						query, match->doc_id, 0);
				if (query->error != DB_SUCCESS) {
					goto func_exit;
				}

				/* token is still the last token fetched
				by the loop above. */
				fts_query_add_word_to_document(
					query, match->doc_id, token);
			}
			query->oper = oper;
			goto func_exit;
		}

		/* If we are doing proximity search, verify the distance
		between all words, and check they are in specified distance. */
		if (query->flags & FTS_PROXIMITY) {
			/* The return value is not used on this path. */
			fts_phrase_or_proximity_search(query, tokens);
		} else {
			ibool	matched;

			/* Phrase Search case:
			We filter out the doc ids that don't contain
			all the tokens in the phrase. It's cheaper to
			search the ilist than bringing the documents in
			and then doing a search through the text. Isolated
			testing shows this also helps in mitigating disruption
			of the buffer cache. */
			matched = fts_phrase_or_proximity_search(query, tokens);
			query->matched = query->match_array[0];

			/* Read the actual text in and search for the phrase. */
			if (matched) {
				ut_ad(query->error == DB_SUCCESS);
				query->error = fts_query_search_phrase(
					query, orig_tokens, tokens);
			}
		}

		/* Restore original operation. */
		query->oper = oper;

		if (query->error != DB_SUCCESS) {
			goto func_exit;
		}
	}

func_exit:
	/* Frees the token vectors and the match vectors allocated above.
	NOTE(review): query->match_array is not reset here although it was
	allocated from this heap - confirm nothing reads it afterwards. */
	mem_heap_free(heap);

	/* Don't need it anymore. */
	query->matched = NULL;

	return(query->error);
}
2899 | |
2900 | /*****************************************************************//** |
2901 | Find the word and evaluate. |
2902 | @return DB_SUCCESS if all go well */ |
2903 | static MY_ATTRIBUTE((nonnull, warn_unused_result)) |
2904 | dberr_t |
2905 | fts_query_execute( |
2906 | /*==============*/ |
2907 | fts_query_t* query, /*!< in: query instance */ |
2908 | fts_string_t* token) /*!< in: token to search */ |
2909 | { |
2910 | switch (query->oper) { |
2911 | case FTS_NONE: |
2912 | case FTS_NEGATE: |
2913 | case FTS_INCR_RATING: |
2914 | case FTS_DECR_RATING: |
2915 | query->error = fts_query_union(query, token); |
2916 | break; |
2917 | |
2918 | case FTS_EXIST: |
2919 | query->error = fts_query_intersect(query, token); |
2920 | break; |
2921 | |
2922 | case FTS_IGNORE: |
2923 | query->error = fts_query_difference(query, token); |
2924 | break; |
2925 | |
2926 | default: |
2927 | ut_error; |
2928 | } |
2929 | |
2930 | return(query->error); |
2931 | } |
2932 | |
2933 | /*****************************************************************//** |
2934 | Create a wildcard string. It's the responsibility of the caller to |
2935 | free the byte* pointer. It's allocated using ut_malloc_nokey(). |
2936 | @return ptr to allocated memory */ |
2937 | static |
2938 | byte* |
2939 | fts_query_get_token( |
2940 | /*================*/ |
2941 | fts_ast_node_t* node, /*!< in: the current sub tree */ |
2942 | fts_string_t* token) /*!< in: token to create */ |
2943 | { |
2944 | ulint str_len; |
2945 | byte* new_ptr = NULL; |
2946 | |
2947 | str_len = node->term.ptr->len; |
2948 | |
2949 | ut_a(node->type == FTS_AST_TERM); |
2950 | |
2951 | token->f_len = str_len; |
2952 | token->f_str = node->term.ptr->str; |
2953 | |
2954 | if (node->term.wildcard) { |
2955 | |
2956 | token->f_str = static_cast<byte*>(ut_malloc_nokey(str_len + 2)); |
2957 | token->f_len = str_len + 1; |
2958 | |
2959 | memcpy(token->f_str, node->term.ptr->str, str_len); |
2960 | |
2961 | token->f_str[str_len] = '%'; |
2962 | token->f_str[token->f_len] = 0; |
2963 | |
2964 | new_ptr = token->f_str; |
2965 | } |
2966 | |
2967 | return(new_ptr); |
2968 | } |
2969 | |
2970 | /*****************************************************************//** |
2971 | Visit every node of the AST. */ |
2972 | static |
2973 | dberr_t |
2974 | fts_query_visitor( |
2975 | /*==============*/ |
2976 | fts_ast_oper_t oper, /*!< in: current operator */ |
2977 | fts_ast_node_t* node, /*!< in: The root of the current subtree*/ |
2978 | void* arg) /*!< in: callback arg*/ |
2979 | { |
2980 | byte* ptr; |
2981 | fts_string_t token; |
2982 | fts_query_t* query = static_cast<fts_query_t*>(arg); |
2983 | |
2984 | ut_a(node); |
2985 | DBUG_ENTER("fts_query_visitor" ); |
2986 | DBUG_PRINT("fts" , ("nodetype: %s" , fts_ast_node_type_get(node->type))); |
2987 | |
2988 | token.f_n_char = 0; |
2989 | query->oper = oper; |
2990 | query->cur_node = node; |
2991 | |
2992 | switch (node->type) { |
2993 | case FTS_AST_TEXT: |
2994 | case FTS_AST_PARSER_PHRASE_LIST: |
2995 | |
2996 | if (query->oper == FTS_EXIST) { |
2997 | ut_ad(query->intersection == NULL); |
2998 | query->intersection = rbt_create( |
2999 | sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); |
3000 | |
3001 | query->total_size += SIZEOF_RBT_CREATE; |
3002 | } |
3003 | |
3004 | /* Set the current proximity distance. */ |
3005 | query->distance = node->text.distance; |
3006 | |
3007 | /* Force collection of doc ids and the positions. */ |
3008 | query->collect_positions = TRUE; |
3009 | |
3010 | query->error = fts_query_phrase_search(query, node); |
3011 | |
3012 | query->collect_positions = FALSE; |
3013 | |
3014 | if (query->oper == FTS_EXIST) { |
3015 | fts_query_free_doc_ids(query, query->doc_ids); |
3016 | query->doc_ids = query->intersection; |
3017 | query->intersection = NULL; |
3018 | } |
3019 | |
3020 | break; |
3021 | |
3022 | case FTS_AST_TERM: |
3023 | token.f_str = node->term.ptr->str; |
3024 | token.f_len = node->term.ptr->len; |
3025 | |
3026 | /* Collect wildcard words for QUERY EXPANSION. */ |
3027 | if (node->term.wildcard && query->wildcard_words != NULL) { |
3028 | ib_rbt_bound_t parent; |
3029 | |
3030 | if (rbt_search(query->wildcard_words, &parent, &token) |
3031 | != 0) { |
3032 | fts_string_t word; |
3033 | |
3034 | fts_string_dup(&word, &token, query->heap); |
3035 | rbt_add_node(query->wildcard_words, &parent, |
3036 | &word); |
3037 | } |
3038 | } |
3039 | |
3040 | /* Add the word to our RB tree that will be used to |
3041 | calculate this terms per document frequency. */ |
3042 | fts_query_add_word_freq(query, &token); |
3043 | |
3044 | ptr = fts_query_get_token(node, &token); |
3045 | query->error = fts_query_execute(query, &token); |
3046 | |
3047 | if (ptr) { |
3048 | ut_free(ptr); |
3049 | } |
3050 | |
3051 | break; |
3052 | |
3053 | case FTS_AST_SUBEXP_LIST: |
3054 | query->error = fts_ast_visit_sub_exp(node, fts_query_visitor, arg); |
3055 | break; |
3056 | |
3057 | default: |
3058 | ut_error; |
3059 | } |
3060 | |
3061 | if (query->oper == FTS_EXIST) { |
3062 | query->multi_exist = true; |
3063 | } |
3064 | |
3065 | DBUG_RETURN(query->error); |
3066 | } |
3067 | |
3068 | /** Process (nested) sub-expression, create a new result set to store the |
3069 | sub-expression result by processing nodes under current sub-expression |
3070 | list. Merge the sub-expression result with that of parent expression list. |
3071 | @param[in,out] node current root node |
3072 | @param[in,out] visitor callback function |
3073 | @param[in,out] arg argument for callback |
3074 | @return DB_SUCCESS if all go well */ |
3075 | static |
3076 | dberr_t |
3077 | fts_ast_visit_sub_exp( |
3078 | fts_ast_node_t* node, |
3079 | fts_ast_callback visitor, |
3080 | void* arg) |
3081 | { |
3082 | fts_ast_oper_t cur_oper; |
3083 | fts_query_t* query = static_cast<fts_query_t*>(arg); |
3084 | ib_rbt_t* parent_doc_ids; |
3085 | ib_rbt_t* subexpr_doc_ids; |
3086 | dberr_t error = DB_SUCCESS; |
3087 | bool will_be_ignored = false; |
3088 | bool multi_exist; |
3089 | |
3090 | DBUG_ENTER("fts_ast_visit_sub_exp" ); |
3091 | |
3092 | ut_a(node->type == FTS_AST_SUBEXP_LIST); |
3093 | |
3094 | cur_oper = query->oper; |
3095 | |
3096 | /* Save current result set */ |
3097 | parent_doc_ids = query->doc_ids; |
3098 | |
3099 | /* Create new result set to store the sub-expression result. We |
3100 | will merge this result set with the parent after processing. */ |
3101 | query->doc_ids = rbt_create(sizeof(fts_ranking_t), |
3102 | fts_ranking_doc_id_cmp); |
3103 | |
3104 | query->total_size += SIZEOF_RBT_CREATE; |
3105 | |
3106 | multi_exist = query->multi_exist; |
3107 | query->multi_exist = false; |
3108 | /* Process nodes in current sub-expression and store its |
3109 | result set in query->doc_ids we created above. */ |
3110 | error = fts_ast_visit(FTS_NONE, node, visitor, |
3111 | arg, &will_be_ignored); |
3112 | |
3113 | /* Reinstate parent node state */ |
3114 | query->multi_exist = multi_exist; |
3115 | query->oper = cur_oper; |
3116 | |
3117 | /* Merge the sub-expression result with the parent result set. */ |
3118 | subexpr_doc_ids = query->doc_ids; |
3119 | query->doc_ids = parent_doc_ids; |
3120 | if (error == DB_SUCCESS) { |
3121 | error = fts_merge_doc_ids(query, subexpr_doc_ids); |
3122 | } |
3123 | |
3124 | /* Free current result set. Result already merged into parent. */ |
3125 | fts_query_free_doc_ids(query, subexpr_doc_ids); |
3126 | |
3127 | DBUG_RETURN(error); |
3128 | } |
3129 | |
#if 0
/* NOTE: everything up to the matching #endif is compiled out.
NOTE(review): the code looks stale - it calls fts_query_add_doc_freq()
with two arguments while fts_query_filter_doc_ids() passes three
(query, doc_freqs, doc_id), and it assigns a void* to a byte* without a
cast. It would have to be brought up to date before being re-enabled. */
/*****************************************************************//***
Check if the doc id exists in the ilist. The ilist is decoded entry by
entry: a variable length coded doc id delta followed by variable length
coded position deltas that are terminated by a zero byte. Decoding stops
at the end of the ilist or as soon as select->doc_id has been found at a
position greater than select->min_pos.
@return TRUE if doc id found */
static
ulint
fts_query_find_doc_id(
/*==================*/
	fts_select_t*	select,	/*!< in/out: contains the doc id to
				find, we update the word freq if
				document found */
	void*		data,	/*!< in: doc id ilist */
	ulint		len)	/*!< in: doc id ilist size */
{
	byte*		ptr = data;
	doc_id_t	doc_id = 0;
	ulint		decoded = 0;

	/* Decode the ilist and search for selected doc_id. We also
	calculate the frequency of the word in the document if found. */
	while (decoded < len && !select->found) {
		ulint		freq = 0;
		ulint		min_pos = 0;
		ulint		last_pos = 0;
		ulint		pos = fts_decode_vlc(&ptr);

		/* Add the delta. */
		doc_id += pos;

		/* Walk the position list of this document; it ends
		with a zero byte. */
		while (*ptr) {
			++freq;
			last_pos += fts_decode_vlc(&ptr);

			/* Only if min_pos is not set and the current
			term exists in a position greater than the
			min_pos of the previous term. */
			if (min_pos == 0 && last_pos > select->min_pos) {
				min_pos = last_pos;
			}
		}

		/* Skip the end of word position marker. */
		++ptr;

		/* Bytes decoded so far. */
		decoded = ptr - (byte*) data;

		/* A word may exist in the document but we only consider a
		match if it exists in a position that is greater than the
		position of the previous term. */
		if (doc_id == select->doc_id && min_pos > 0) {
			fts_doc_freq_t*	doc_freq;

			/* Add the doc id to the doc freq rb tree, if
			the doc id doesn't exist it will be created. */
			doc_freq = fts_query_add_doc_freq(
				select->word_freq->doc_freqs, doc_id);

			/* Avoid duplicating the frequency tally */
			if (doc_freq->freq == 0) {
				doc_freq->freq = freq;
			}

			/* Record the hit; this also ends the outer loop. */
			select->found = TRUE;
			select->min_pos = min_pos;
		}
	}

	return(select->found);
}
#endif
3201 | |
3202 | /*****************************************************************//** |
3203 | Read and filter nodes. |
3204 | @return DB_SUCCESS if all go well, |
3205 | or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */ |
3206 | static |
3207 | dberr_t |
3208 | fts_query_filter_doc_ids( |
3209 | /*=====================*/ |
3210 | fts_query_t* query, /*!< in: query instance */ |
3211 | const fts_string_t* word, /*!< in: the current word */ |
3212 | fts_word_freq_t* word_freq, /*!< in/out: word frequency */ |
3213 | const fts_node_t* node, /*!< in: current FTS node */ |
3214 | void* data, /*!< in: doc id ilist */ |
3215 | ulint len, /*!< in: doc id ilist size */ |
3216 | ibool calc_doc_count) /*!< in: whether to remember doc count */ |
3217 | { |
3218 | byte* ptr = static_cast<byte*>(data); |
3219 | doc_id_t doc_id = 0; |
3220 | ulint decoded = 0; |
3221 | ib_rbt_t* doc_freqs = word_freq->doc_freqs; |
3222 | |
3223 | /* Decode the ilist and add the doc ids to the query doc_id set. */ |
3224 | while (decoded < len) { |
3225 | ulint freq = 0; |
3226 | fts_doc_freq_t* doc_freq; |
3227 | fts_match_t* match = NULL; |
3228 | ulint last_pos = 0; |
3229 | ulint pos = fts_decode_vlc(&ptr); |
3230 | |
3231 | /* Some sanity checks. */ |
3232 | if (doc_id == 0) { |
3233 | ut_a(pos == node->first_doc_id); |
3234 | } |
3235 | |
3236 | /* Add the delta. */ |
3237 | doc_id += pos; |
3238 | |
3239 | if (calc_doc_count) { |
3240 | word_freq->doc_count++; |
3241 | } |
3242 | |
3243 | /* We simply collect the matching instances here. */ |
3244 | if (query->collect_positions) { |
3245 | ib_alloc_t* heap_alloc; |
3246 | |
3247 | /* Create a new fts_match_t instance. */ |
3248 | match = static_cast<fts_match_t*>( |
3249 | ib_vector_push(query->matched, NULL)); |
3250 | |
3251 | match->start = 0; |
3252 | match->doc_id = doc_id; |
3253 | heap_alloc = ib_vector_allocator(query->matched); |
3254 | |
3255 | /* Allocate from the same heap as the |
3256 | parent container. */ |
3257 | match->positions = ib_vector_create( |
3258 | heap_alloc, sizeof(ulint), 64); |
3259 | |
3260 | query->total_size += sizeof(fts_match_t) |
3261 | + sizeof(ib_vector_t) |
3262 | + sizeof(ulint) * 64; |
3263 | } |
3264 | |
3265 | /* Unpack the positions within the document. */ |
3266 | while (*ptr) { |
3267 | last_pos += fts_decode_vlc(&ptr); |
3268 | |
3269 | /* Collect the matching word positions, for phrase |
3270 | matching later. */ |
3271 | if (query->collect_positions) { |
3272 | ib_vector_push(match->positions, &last_pos); |
3273 | } |
3274 | |
3275 | ++freq; |
3276 | } |
3277 | |
3278 | /* End of list marker. */ |
3279 | last_pos = (ulint) -1; |
3280 | |
3281 | if (query->collect_positions) { |
3282 | ut_a(match != NULL); |
3283 | ib_vector_push(match->positions, &last_pos); |
3284 | } |
3285 | |
3286 | /* Add the doc id to the doc freq rb tree, if the doc id |
3287 | doesn't exist it will be created. */ |
3288 | doc_freq = fts_query_add_doc_freq(query, doc_freqs, doc_id); |
3289 | |
3290 | /* Avoid duplicating frequency tally. */ |
3291 | if (doc_freq->freq == 0) { |
3292 | doc_freq->freq = freq; |
3293 | } |
3294 | |
3295 | /* Skip the end of word position marker. */ |
3296 | ++ptr; |
3297 | |
3298 | /* Bytes decoded so far */ |
3299 | decoded = ulint(ptr - (byte*) data); |
3300 | |
3301 | /* We simply collect the matching documents and the |
3302 | positions here and match later. */ |
3303 | if (!query->collect_positions) { |
3304 | /* We ignore error here and will check it later */ |
3305 | fts_query_process_doc_id(query, doc_id, 0); |
3306 | |
3307 | /* Add the word to the document's matched RB tree. */ |
3308 | fts_query_add_word_to_document(query, doc_id, word); |
3309 | } |
3310 | } |
3311 | |
3312 | /* Some sanity checks. */ |
3313 | ut_a(doc_id == node->last_doc_id); |
3314 | |
3315 | if (query->total_size > fts_result_cache_limit) { |
3316 | return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT); |
3317 | } else { |
3318 | return(DB_SUCCESS); |
3319 | } |
3320 | } |
3321 | |
3322 | /*****************************************************************//** |
3323 | Read the FTS INDEX row. |
3324 | @return DB_SUCCESS if all go well. */ |
3325 | static |
3326 | dberr_t |
3327 | fts_query_read_node( |
3328 | /*================*/ |
3329 | fts_query_t* query, /*!< in: query instance */ |
3330 | const fts_string_t* word, /*!< in: current word */ |
3331 | que_node_t* exp) /*!< in: query graph node */ |
3332 | { |
3333 | int i; |
3334 | int ret; |
3335 | fts_node_t node; |
3336 | ib_rbt_bound_t parent; |
3337 | fts_word_freq_t* word_freq; |
3338 | ibool skip = FALSE; |
3339 | fts_string_t term; |
3340 | byte buf[FTS_MAX_WORD_LEN + 1]; |
3341 | dberr_t error = DB_SUCCESS; |
3342 | |
3343 | ut_a(query->cur_node->type == FTS_AST_TERM |
3344 | || query->cur_node->type == FTS_AST_TEXT |
3345 | || query->cur_node->type == FTS_AST_PARSER_PHRASE_LIST); |
3346 | |
3347 | memset(&node, 0, sizeof(node)); |
3348 | term.f_str = buf; |
3349 | |
3350 | /* Need to consider the wildcard search case, the word frequency |
3351 | is created on the search string not the actual word. So we need |
3352 | to assign the frequency on search string behalf. */ |
3353 | if (query->cur_node->type == FTS_AST_TERM |
3354 | && query->cur_node->term.wildcard) { |
3355 | |
3356 | term.f_len = query->cur_node->term.ptr->len; |
3357 | ut_ad(FTS_MAX_WORD_LEN >= term.f_len); |
3358 | memcpy(term.f_str, query->cur_node->term.ptr->str, term.f_len); |
3359 | } else { |
3360 | term.f_len = word->f_len; |
3361 | ut_ad(FTS_MAX_WORD_LEN >= word->f_len); |
3362 | memcpy(term.f_str, word->f_str, word->f_len); |
3363 | } |
3364 | |
3365 | /* Lookup the word in our rb tree, it must exist. */ |
3366 | ret = rbt_search(query->word_freqs, &parent, &term); |
3367 | |
3368 | ut_a(ret == 0); |
3369 | |
3370 | word_freq = rbt_value(fts_word_freq_t, parent.last); |
3371 | |
3372 | /* Start from 1 since the first column has been read by the caller. |
3373 | Also, we rely on the order of the columns projected, to filter |
3374 | out ilists that are out of range and we always want to read |
3375 | the doc_count irrespective of the suitablility of the row. */ |
3376 | |
3377 | for (i = 1; exp && !skip; exp = que_node_get_next(exp), ++i) { |
3378 | |
3379 | dfield_t* dfield = que_node_get_val(exp); |
3380 | byte* data = static_cast<byte*>( |
3381 | dfield_get_data(dfield)); |
3382 | ulint len = dfield_get_len(dfield); |
3383 | |
3384 | ut_a(len != UNIV_SQL_NULL); |
3385 | |
3386 | /* Note: The column numbers below must match the SELECT. */ |
3387 | |
3388 | switch (i) { |
3389 | case 1: /* DOC_COUNT */ |
3390 | word_freq->doc_count += mach_read_from_4(data); |
3391 | break; |
3392 | |
3393 | case 2: /* FIRST_DOC_ID */ |
3394 | node.first_doc_id = fts_read_doc_id(data); |
3395 | |
3396 | /* Skip nodes whose doc ids are out range. */ |
3397 | if (query->oper == FTS_EXIST |
3398 | && query->upper_doc_id > 0 |
3399 | && node.first_doc_id > query->upper_doc_id) { |
3400 | skip = TRUE; |
3401 | } |
3402 | break; |
3403 | |
3404 | case 3: /* LAST_DOC_ID */ |
3405 | node.last_doc_id = fts_read_doc_id(data); |
3406 | |
3407 | /* Skip nodes whose doc ids are out range. */ |
3408 | if (query->oper == FTS_EXIST |
3409 | && query->lower_doc_id > 0 |
3410 | && node.last_doc_id < query->lower_doc_id) { |
3411 | skip = TRUE; |
3412 | } |
3413 | break; |
3414 | |
3415 | case 4: /* ILIST */ |
3416 | |
3417 | error = fts_query_filter_doc_ids( |
3418 | query, &word_freq->word, word_freq, |
3419 | &node, data, len, FALSE); |
3420 | |
3421 | break; |
3422 | |
3423 | default: |
3424 | ut_error; |
3425 | } |
3426 | } |
3427 | |
3428 | if (!skip) { |
3429 | /* Make sure all columns were read. */ |
3430 | |
3431 | ut_a(i == 5); |
3432 | } |
3433 | |
3434 | return error; |
3435 | } |
3436 | |
3437 | /*****************************************************************//** |
3438 | Callback function to fetch the rows in an FTS INDEX record. |
3439 | @return always returns TRUE */ |
3440 | static |
3441 | ibool |
3442 | fts_query_index_fetch_nodes( |
3443 | /*========================*/ |
3444 | void* row, /*!< in: sel_node_t* */ |
3445 | void* user_arg) /*!< in: pointer to fts_fetch_t */ |
3446 | { |
3447 | fts_string_t key; |
3448 | sel_node_t* sel_node = static_cast<sel_node_t*>(row); |
3449 | fts_fetch_t* fetch = static_cast<fts_fetch_t*>(user_arg); |
3450 | fts_query_t* query = static_cast<fts_query_t*>(fetch->read_arg); |
3451 | que_node_t* exp = sel_node->select_list; |
3452 | dfield_t* dfield = que_node_get_val(exp); |
3453 | void* data = dfield_get_data(dfield); |
3454 | ulint dfield_len = dfield_get_len(dfield); |
3455 | |
3456 | key.f_str = static_cast<byte*>(data); |
3457 | key.f_len = dfield_len; |
3458 | |
3459 | ut_a(dfield_len <= FTS_MAX_WORD_LEN); |
3460 | |
3461 | /* Note: we pass error out by 'query->error' */ |
3462 | query->error = fts_query_read_node(query, &key, que_node_get_next(exp)); |
3463 | |
3464 | if (query->error != DB_SUCCESS) { |
3465 | ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT); |
3466 | return(FALSE); |
3467 | } else { |
3468 | return(TRUE); |
3469 | } |
3470 | } |
3471 | |
3472 | /*****************************************************************//** |
3473 | Calculate the inverse document frequency (IDF) for all the terms. */ |
3474 | static |
3475 | void |
3476 | fts_query_calculate_idf( |
3477 | /*====================*/ |
3478 | fts_query_t* query) /*!< in: Query state */ |
3479 | { |
3480 | const ib_rbt_node_t* node; |
3481 | ib_uint64_t total_docs = query->total_docs; |
3482 | |
3483 | /* We need to free any instances of fts_doc_freq_t that we |
3484 | may have allocated. */ |
3485 | for (node = rbt_first(query->word_freqs); |
3486 | node; |
3487 | node = rbt_next(query->word_freqs, node)) { |
3488 | |
3489 | fts_word_freq_t* word_freq; |
3490 | |
3491 | word_freq = rbt_value(fts_word_freq_t, node); |
3492 | |
3493 | if (word_freq->doc_count > 0) { |
3494 | if (total_docs == word_freq->doc_count) { |
3495 | /* QP assume ranking > 0 if we find |
3496 | a match. Since Log10(1) = 0, we cannot |
3497 | make IDF a zero value if do find a |
3498 | word in all documents. So let's make |
3499 | it an arbitrary very small number */ |
3500 | word_freq->idf = log10(1.0001); |
3501 | } else { |
3502 | word_freq->idf = log10( |
3503 | total_docs |
3504 | / (double) word_freq->doc_count); |
3505 | } |
3506 | } |
3507 | |
3508 | if (fts_enable_diag_print) { |
3509 | ib::info() << "'" << word_freq->word.f_str << "' -> " |
3510 | << query->total_docs << "/" |
3511 | << word_freq->doc_count << " " |
3512 | << std::setw(6) << std::setprecision(5) |
3513 | << word_freq->idf; |
3514 | } |
3515 | } |
3516 | } |
3517 | |
3518 | /*****************************************************************//** |
3519 | Calculate the ranking of the document. */ |
3520 | static |
3521 | void |
3522 | fts_query_calculate_ranking( |
3523 | /*========================*/ |
3524 | const fts_query_t* query, /*!< in: query state */ |
3525 | fts_ranking_t* ranking) /*!< in: Document to rank */ |
3526 | { |
3527 | ulint pos = 0; |
3528 | fts_string_t word; |
3529 | |
3530 | /* At this stage, ranking->rank should not exceed the 1.0 |
3531 | bound */ |
3532 | ut_ad(ranking->rank <= 1.0 && ranking->rank >= -1.0); |
3533 | ut_ad(rbt_size(query->word_map) == query->word_vector->size()); |
3534 | |
3535 | while (fts_ranking_words_get_next(query, ranking, &pos, &word)) { |
3536 | int ret; |
3537 | ib_rbt_bound_t parent; |
3538 | double weight; |
3539 | fts_doc_freq_t* doc_freq; |
3540 | fts_word_freq_t* word_freq; |
3541 | |
3542 | ret = rbt_search(query->word_freqs, &parent, &word); |
3543 | |
3544 | /* It must exist. */ |
3545 | ut_a(ret == 0); |
3546 | |
3547 | word_freq = rbt_value(fts_word_freq_t, parent.last); |
3548 | |
3549 | ret = rbt_search( |
3550 | word_freq->doc_freqs, &parent, &ranking->doc_id); |
3551 | |
3552 | /* It must exist. */ |
3553 | ut_a(ret == 0); |
3554 | |
3555 | doc_freq = rbt_value(fts_doc_freq_t, parent.last); |
3556 | |
3557 | weight = (double) doc_freq->freq * word_freq->idf; |
3558 | |
3559 | ranking->rank += (fts_rank_t) (weight * word_freq->idf); |
3560 | } |
3561 | } |
3562 | |
3563 | /*****************************************************************//** |
3564 | Add ranking to the result set. */ |
3565 | static |
3566 | void |
3567 | fts_query_add_ranking( |
3568 | /*==================*/ |
3569 | fts_query_t* query, /*!< in: query state */ |
3570 | ib_rbt_t* ranking_tree, /*!< in: ranking tree */ |
3571 | const fts_ranking_t* new_ranking) /*!< in: ranking of a document */ |
3572 | { |
3573 | ib_rbt_bound_t parent; |
3574 | |
3575 | /* Lookup the ranking in our rb tree and add if it doesn't exist. */ |
3576 | if (rbt_search(ranking_tree, &parent, new_ranking) == 0) { |
3577 | fts_ranking_t* ranking; |
3578 | |
3579 | ranking = rbt_value(fts_ranking_t, parent.last); |
3580 | |
3581 | ranking->rank += new_ranking->rank; |
3582 | |
3583 | ut_a(ranking->words == NULL); |
3584 | } else { |
3585 | rbt_add_node(ranking_tree, &parent, new_ranking); |
3586 | |
3587 | query->total_size += SIZEOF_RBT_NODE_ADD |
3588 | + sizeof(fts_ranking_t); |
3589 | } |
3590 | } |
3591 | |
3592 | /*****************************************************************//** |
3593 | Retrieve the FTS Relevance Ranking result for doc with doc_id |
3594 | @return the relevance ranking value, 0 if no ranking value |
3595 | present. */ |
3596 | float |
3597 | fts_retrieve_ranking( |
3598 | /*=================*/ |
3599 | fts_result_t* result, /*!< in: FTS result structure */ |
3600 | doc_id_t doc_id) /*!< in: doc_id of the item to retrieve */ |
3601 | { |
3602 | ib_rbt_bound_t parent; |
3603 | fts_ranking_t new_ranking; |
3604 | |
3605 | DBUG_ENTER("fts_retrieve_ranking" ); |
3606 | |
3607 | if (!result || !result->rankings_by_id) { |
3608 | DBUG_RETURN(0); |
3609 | } |
3610 | |
3611 | new_ranking.doc_id = doc_id; |
3612 | |
3613 | /* Lookup the ranking in our rb tree */ |
3614 | if (rbt_search(result->rankings_by_id, &parent, &new_ranking) == 0) { |
3615 | fts_ranking_t* ranking; |
3616 | |
3617 | ranking = rbt_value(fts_ranking_t, parent.last); |
3618 | |
3619 | DBUG_RETURN(ranking->rank); |
3620 | } |
3621 | |
3622 | DBUG_RETURN(0); |
3623 | } |
3624 | |
3625 | /*****************************************************************//** |
3626 | Create the result and copy the data to it. */ |
3627 | static |
3628 | fts_result_t* |
3629 | fts_query_prepare_result( |
3630 | /*=====================*/ |
3631 | fts_query_t* query, /*!< in: Query state */ |
3632 | fts_result_t* result) /*!< in: result this can contain |
3633 | data from a previous search on |
3634 | another FTS index */ |
3635 | { |
3636 | const ib_rbt_node_t* node; |
3637 | bool result_is_null = false; |
3638 | |
3639 | DBUG_ENTER("fts_query_prepare_result" ); |
3640 | |
3641 | if (result == NULL) { |
3642 | result = static_cast<fts_result_t*>( |
3643 | ut_zalloc_nokey(sizeof(*result))); |
3644 | |
3645 | result->rankings_by_id = rbt_create( |
3646 | sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); |
3647 | |
3648 | query->total_size += sizeof(fts_result_t) + SIZEOF_RBT_CREATE; |
3649 | result_is_null = true; |
3650 | } |
3651 | |
3652 | if (query->flags == FTS_OPT_RANKING) { |
3653 | fts_word_freq_t* word_freq; |
3654 | ulint size = ib_vector_size(query->deleted->doc_ids); |
3655 | fts_update_t* array = |
3656 | (fts_update_t*) query->deleted->doc_ids->data; |
3657 | |
3658 | node = rbt_first(query->word_freqs); |
3659 | ut_ad(node); |
3660 | word_freq = rbt_value(fts_word_freq_t, node); |
3661 | |
3662 | for (node = rbt_first(word_freq->doc_freqs); |
3663 | node; |
3664 | node = rbt_next(word_freq->doc_freqs, node)) { |
3665 | fts_doc_freq_t* doc_freq; |
3666 | fts_ranking_t ranking; |
3667 | |
3668 | doc_freq = rbt_value(fts_doc_freq_t, node); |
3669 | |
3670 | /* Don't put deleted docs into result */ |
3671 | if (fts_bsearch(array, 0, static_cast<int>(size), |
3672 | doc_freq->doc_id) >= 0) { |
3673 | /* one less matching doc count */ |
3674 | --word_freq->doc_count; |
3675 | continue; |
3676 | } |
3677 | |
3678 | ranking.doc_id = doc_freq->doc_id; |
3679 | ranking.rank = static_cast<fts_rank_t>(doc_freq->freq); |
3680 | ranking.words = NULL; |
3681 | |
3682 | fts_query_add_ranking(query, result->rankings_by_id, |
3683 | &ranking); |
3684 | |
3685 | if (query->total_size > fts_result_cache_limit) { |
3686 | query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT; |
3687 | fts_query_free_result(result); |
3688 | DBUG_RETURN(NULL); |
3689 | } |
3690 | } |
3691 | |
3692 | /* Calculate IDF only after we exclude the deleted items */ |
3693 | fts_query_calculate_idf(query); |
3694 | |
3695 | node = rbt_first(query->word_freqs); |
3696 | word_freq = rbt_value(fts_word_freq_t, node); |
3697 | |
3698 | /* Calculate the ranking for each doc */ |
3699 | for (node = rbt_first(result->rankings_by_id); |
3700 | node != NULL; |
3701 | node = rbt_next(result->rankings_by_id, node)) { |
3702 | |
3703 | fts_ranking_t* ranking; |
3704 | |
3705 | ranking = rbt_value(fts_ranking_t, node); |
3706 | |
3707 | ranking->rank = static_cast<fts_rank_t>( |
3708 | ranking->rank * word_freq->idf * word_freq->idf); |
3709 | } |
3710 | |
3711 | DBUG_RETURN(result); |
3712 | } |
3713 | |
3714 | ut_a(rbt_size(query->doc_ids) > 0); |
3715 | |
3716 | for (node = rbt_first(query->doc_ids); |
3717 | node; |
3718 | node = rbt_next(query->doc_ids, node)) { |
3719 | |
3720 | fts_ranking_t* ranking; |
3721 | |
3722 | ranking = rbt_value(fts_ranking_t, node); |
3723 | fts_query_calculate_ranking(query, ranking); |
3724 | |
3725 | // FIXME: I think we may requre this information to improve the |
3726 | // ranking of doc ids which have more word matches from |
3727 | // different FTS indexes. |
3728 | |
3729 | /* We don't need these anymore free the resources. */ |
3730 | ranking->words = NULL; |
3731 | |
3732 | if (!result_is_null) { |
3733 | fts_query_add_ranking(query, result->rankings_by_id, ranking); |
3734 | |
3735 | if (query->total_size > fts_result_cache_limit) { |
3736 | query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT; |
3737 | fts_query_free_result(result); |
3738 | DBUG_RETURN(NULL); |
3739 | } |
3740 | } |
3741 | } |
3742 | |
3743 | if (result_is_null) { |
3744 | /* Use doc_ids directly */ |
3745 | rbt_free(result->rankings_by_id); |
3746 | result->rankings_by_id = query->doc_ids; |
3747 | query->doc_ids = NULL; |
3748 | } |
3749 | |
3750 | DBUG_RETURN(result); |
3751 | } |
3752 | |
3753 | /*****************************************************************//** |
3754 | Get the result of the query. Calculate the similarity coefficient. */ |
3755 | static |
3756 | fts_result_t* |
3757 | fts_query_get_result( |
3758 | /*=================*/ |
3759 | fts_query_t* query, /*!< in: query instance */ |
3760 | fts_result_t* result) /*!< in: result */ |
3761 | { |
3762 | DBUG_ENTER("fts_query_get_result" ); |
3763 | |
3764 | if (rbt_size(query->doc_ids) > 0 || query->flags == FTS_OPT_RANKING) { |
3765 | /* Copy the doc ids to the result. */ |
3766 | result = fts_query_prepare_result(query, result); |
3767 | } else { |
3768 | /* Create an empty result instance. */ |
3769 | result = static_cast<fts_result_t*>( |
3770 | ut_zalloc_nokey(sizeof(*result))); |
3771 | } |
3772 | |
3773 | DBUG_RETURN(result); |
3774 | } |
3775 | |
3776 | /*****************************************************************//** |
3777 | FTS Query free resources and reset. */ |
3778 | static |
3779 | void |
3780 | fts_query_free( |
3781 | /*===========*/ |
3782 | fts_query_t* query) /*!< in: query instance to free*/ |
3783 | { |
3784 | |
3785 | if (query->read_nodes_graph) { |
3786 | fts_que_graph_free(query->read_nodes_graph); |
3787 | } |
3788 | |
3789 | if (query->root) { |
3790 | fts_ast_free_node(query->root); |
3791 | } |
3792 | |
3793 | if (query->deleted) { |
3794 | fts_doc_ids_free(query->deleted); |
3795 | } |
3796 | |
3797 | if (query->intersection) { |
3798 | fts_query_free_doc_ids(query, query->intersection); |
3799 | } |
3800 | |
3801 | if (query->doc_ids) { |
3802 | fts_query_free_doc_ids(query, query->doc_ids); |
3803 | } |
3804 | |
3805 | if (query->word_freqs) { |
3806 | const ib_rbt_node_t* node; |
3807 | |
3808 | /* We need to free any instances of fts_doc_freq_t that we |
3809 | may have allocated. */ |
3810 | for (node = rbt_first(query->word_freqs); |
3811 | node; |
3812 | node = rbt_next(query->word_freqs, node)) { |
3813 | |
3814 | fts_word_freq_t* word_freq; |
3815 | |
3816 | word_freq = rbt_value(fts_word_freq_t, node); |
3817 | |
3818 | /* We need to cast away the const. */ |
3819 | rbt_free(word_freq->doc_freqs); |
3820 | } |
3821 | |
3822 | rbt_free(query->word_freqs); |
3823 | } |
3824 | |
3825 | if (query->wildcard_words != NULL) { |
3826 | rbt_free(query->wildcard_words); |
3827 | } |
3828 | |
3829 | ut_a(!query->intersection); |
3830 | |
3831 | if (query->word_map) { |
3832 | rbt_free(query->word_map); |
3833 | } |
3834 | |
3835 | if (query->word_vector != NULL) { |
3836 | UT_DELETE(query->word_vector); |
3837 | } |
3838 | |
3839 | if (query->heap) { |
3840 | mem_heap_free(query->heap); |
3841 | } |
3842 | |
3843 | memset(query, 0, sizeof(*query)); |
3844 | } |
3845 | |
3846 | /*****************************************************************//** |
3847 | Parse the query using flex/bison or plugin parser. |
3848 | @return parse tree node. */ |
3849 | static |
3850 | fts_ast_node_t* |
3851 | fts_query_parse( |
3852 | /*============*/ |
3853 | fts_query_t* query, /*!< in: query instance */ |
3854 | byte* query_str, /*!< in: query string */ |
3855 | ulint query_len) /*!< in: query string length */ |
3856 | { |
3857 | int error; |
3858 | fts_ast_state_t state; |
3859 | bool mode = query->boolean_mode; |
3860 | DBUG_ENTER("fts_query_parse" ); |
3861 | |
3862 | memset(&state, 0x0, sizeof(state)); |
3863 | |
3864 | state.charset = query->fts_index_table.charset; |
3865 | |
3866 | DBUG_EXECUTE_IF("fts_instrument_query_disable_parser" , |
3867 | query->parser = NULL;); |
3868 | |
3869 | if (query->parser) { |
3870 | state.root = state.cur_node = |
3871 | fts_ast_create_node_list(&state, NULL); |
3872 | error = fts_parse_by_parser(mode, query_str, query_len, |
3873 | query->parser, &state); |
3874 | } else { |
3875 | /* Setup the scanner to use, this depends on the mode flag. */ |
3876 | state.lexer = fts_lexer_create(mode, query_str, query_len); |
3877 | state.charset = query->fts_index_table.charset; |
3878 | error = fts_parse(&state); |
3879 | fts_lexer_free(state.lexer); |
3880 | state.lexer = NULL; |
3881 | } |
3882 | |
3883 | /* Error during parsing ? */ |
3884 | if (error) { |
3885 | /* Free the nodes that were allocated during parsing. */ |
3886 | fts_ast_state_free(&state); |
3887 | } else { |
3888 | query->root = state.root; |
3889 | |
3890 | if (fts_enable_diag_print && query->root != NULL) { |
3891 | fts_ast_node_print(query->root); |
3892 | } |
3893 | } |
3894 | |
3895 | DBUG_RETURN(state.root); |
3896 | } |
3897 | |
3898 | /*******************************************************************//** |
3899 | FTS Query optimization |
3900 | Set FTS_OPT_RANKING if it is a simple term query */ |
3901 | static |
3902 | void |
3903 | fts_query_can_optimize( |
3904 | /*===================*/ |
3905 | fts_query_t* query, /*!< in/out: query instance */ |
3906 | uint flags) /*!< In: FTS search mode */ |
3907 | { |
3908 | fts_ast_node_t* node = query->root; |
3909 | |
3910 | if (flags & FTS_EXPAND) { |
3911 | return; |
3912 | } |
3913 | |
3914 | /* Check if it has only a term without oper */ |
3915 | ut_ad(node->type == FTS_AST_LIST); |
3916 | node = node->list.head; |
3917 | if (node != NULL && node->type == FTS_AST_TERM && node->next == NULL) { |
3918 | query->flags = FTS_OPT_RANKING; |
3919 | } |
3920 | } |
3921 | |
3922 | /** FTS Query entry point. |
3923 | @param[in] index fts index to search |
3924 | @param[in] flags FTS search mode |
3925 | @param[in] query_str FTS query |
3926 | @param[in] query_len FTS query string len in bytes |
3927 | @param[in,out] result result doc ids |
3928 | @return DB_SUCCESS if successful otherwise error code */ |
3929 | dberr_t |
3930 | fts_query( |
3931 | dict_index_t* index, |
3932 | uint flags, |
3933 | const byte* query_str, |
3934 | ulint query_len, |
3935 | fts_result_t** result) |
3936 | { |
3937 | fts_query_t query; |
3938 | dberr_t error = DB_SUCCESS; |
3939 | byte* lc_query_str; |
3940 | ulint lc_query_str_len; |
3941 | ulint result_len; |
3942 | bool boolean_mode; |
3943 | trx_t* query_trx; |
3944 | CHARSET_INFO* charset; |
3945 | ulint start_time_ms; |
3946 | bool will_be_ignored = false; |
3947 | |
3948 | boolean_mode = flags & FTS_BOOL; |
3949 | |
3950 | *result = NULL; |
3951 | memset(&query, 0x0, sizeof(query)); |
3952 | query_trx = trx_create(); |
3953 | query_trx->op_info = "FTS query" ; |
3954 | |
3955 | start_time_ms = ut_time_ms(); |
3956 | |
3957 | query.trx = query_trx; |
3958 | query.index = index; |
3959 | query.boolean_mode = boolean_mode; |
3960 | query.deleted = fts_doc_ids_create(); |
3961 | query.cur_node = NULL; |
3962 | |
3963 | query.fts_common_table.type = FTS_COMMON_TABLE; |
3964 | query.fts_common_table.table_id = index->table->id; |
3965 | query.fts_common_table.parent = index->table->name.m_name; |
3966 | query.fts_common_table.table = index->table; |
3967 | |
3968 | charset = fts_index_get_charset(index); |
3969 | |
3970 | query.fts_index_table.type = FTS_INDEX_TABLE; |
3971 | query.fts_index_table.index_id = index->id; |
3972 | query.fts_index_table.table_id = index->table->id; |
3973 | query.fts_index_table.parent = index->table->name.m_name; |
3974 | query.fts_index_table.charset = charset; |
3975 | query.fts_index_table.table = index->table; |
3976 | |
3977 | query.word_map = rbt_create_arg_cmp( |
3978 | sizeof(fts_string_t), innobase_fts_text_cmp, (void*)charset); |
3979 | query.word_vector = UT_NEW_NOKEY(word_vector_t()); |
3980 | query.error = DB_SUCCESS; |
3981 | |
3982 | /* Setup the RB tree that will be used to collect per term |
3983 | statistics. */ |
3984 | query.word_freqs = rbt_create_arg_cmp( |
3985 | sizeof(fts_word_freq_t), innobase_fts_text_cmp, |
3986 | (void*) charset); |
3987 | |
3988 | if (flags & FTS_EXPAND) { |
3989 | query.wildcard_words = rbt_create_arg_cmp( |
3990 | sizeof(fts_string_t), innobase_fts_text_cmp, (void *)charset); |
3991 | } |
3992 | |
3993 | query.total_size += SIZEOF_RBT_CREATE; |
3994 | |
3995 | query.total_docs = dict_table_get_n_rows(index->table); |
3996 | |
3997 | query.fts_common_table.suffix = "DELETED" ; |
3998 | |
3999 | /* Read the deleted doc_ids, we need these for filtering. */ |
4000 | error = fts_table_fetch_doc_ids( |
4001 | NULL, &query.fts_common_table, query.deleted); |
4002 | |
4003 | if (error != DB_SUCCESS) { |
4004 | goto func_exit; |
4005 | } |
4006 | |
4007 | query.fts_common_table.suffix = "DELETED_CACHE" ; |
4008 | |
4009 | error = fts_table_fetch_doc_ids( |
4010 | NULL, &query.fts_common_table, query.deleted); |
4011 | |
4012 | if (error != DB_SUCCESS) { |
4013 | goto func_exit; |
4014 | } |
4015 | |
4016 | /* Get the deleted doc ids that are in the cache. */ |
4017 | fts_cache_append_deleted_doc_ids( |
4018 | index->table->fts->cache, query.deleted->doc_ids); |
4019 | DEBUG_SYNC_C("fts_deleted_doc_ids_append" ); |
4020 | |
4021 | /* Sort the vector so that we can do a binary search over the ids. */ |
4022 | ib_vector_sort(query.deleted->doc_ids, fts_update_doc_id_cmp); |
4023 | |
4024 | /* Convert the query string to lower case before parsing. We own |
4025 | the ut_malloc'ed result and so remember to free it before return. */ |
4026 | |
4027 | lc_query_str_len = query_len * charset->casedn_multiply + 1; |
4028 | lc_query_str = static_cast<byte*>(ut_malloc_nokey(lc_query_str_len)); |
4029 | |
4030 | /* For binary collations, a case sensitive search is |
4031 | performed. Hence don't convert to lower case. */ |
4032 | if (my_binary_compare(charset)) { |
4033 | memcpy(lc_query_str, query_str, query_len); |
4034 | lc_query_str[query_len]= 0; |
4035 | result_len= query_len; |
4036 | } else { |
4037 | result_len = innobase_fts_casedn_str( |
4038 | charset, (char*)( query_str), query_len, |
4039 | (char*)(lc_query_str), lc_query_str_len); |
4040 | } |
4041 | |
4042 | ut_ad(result_len < lc_query_str_len); |
4043 | |
4044 | lc_query_str[result_len] = 0; |
4045 | |
4046 | query.heap = mem_heap_create(128); |
4047 | |
4048 | /* Create the rb tree for the doc id (current) set. */ |
4049 | query.doc_ids = rbt_create( |
4050 | sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); |
4051 | query.parser = index->parser; |
4052 | |
4053 | query.total_size += SIZEOF_RBT_CREATE; |
4054 | |
4055 | /* Parse the input query string. */ |
4056 | if (fts_query_parse(&query, lc_query_str, result_len)) { |
4057 | fts_ast_node_t* ast = query.root; |
4058 | |
4059 | /* Optimize query to check if it's a single term */ |
4060 | fts_query_can_optimize(&query, flags); |
4061 | |
4062 | DBUG_EXECUTE_IF("fts_instrument_result_cache_limit" , |
4063 | fts_result_cache_limit = 2048; |
4064 | ); |
4065 | |
4066 | /* Traverse the Abstract Syntax Tree (AST) and execute |
4067 | the query. */ |
4068 | query.error = fts_ast_visit( |
4069 | FTS_NONE, ast, fts_query_visitor, |
4070 | &query, &will_be_ignored); |
4071 | |
4072 | /* If query expansion is requested, extend the search |
4073 | with first search pass result */ |
4074 | if (query.error == DB_SUCCESS && (flags & FTS_EXPAND)) { |
4075 | query.error = fts_expand_query(index, &query); |
4076 | } |
4077 | |
4078 | /* Calculate the inverse document frequency of the terms. */ |
4079 | if (query.error == DB_SUCCESS |
4080 | && query.flags != FTS_OPT_RANKING) { |
4081 | fts_query_calculate_idf(&query); |
4082 | } |
4083 | |
4084 | /* Copy the result from the query state, so that we can |
4085 | return it to the caller. */ |
4086 | if (query.error == DB_SUCCESS) { |
4087 | *result = fts_query_get_result(&query, *result); |
4088 | } |
4089 | |
4090 | error = query.error; |
4091 | } else { |
4092 | /* still return an empty result set */ |
4093 | *result = static_cast<fts_result_t*>( |
4094 | ut_zalloc_nokey(sizeof(**result))); |
4095 | } |
4096 | |
4097 | ut_free(lc_query_str); |
4098 | |
4099 | if (fts_enable_diag_print && (*result)) { |
4100 | ulint diff_time = ut_time_ms() - start_time_ms; |
4101 | |
4102 | ib::info() << "FTS Search Processing time: " |
4103 | << diff_time / 1000 << " secs: " << diff_time % 1000 |
4104 | << " millisec: row(s) " |
4105 | << ((*result)->rankings_by_id |
4106 | ? lint(rbt_size((*result)->rankings_by_id)) |
4107 | : -1); |
4108 | |
4109 | /* Log memory consumption & result size */ |
4110 | ib::info() << "Full Search Memory: " << query.total_size |
4111 | << " (bytes), Row: " |
4112 | << ((*result)->rankings_by_id |
4113 | ? rbt_size((*result)->rankings_by_id) |
4114 | : 0) |
4115 | << "." ; |
4116 | } |
4117 | |
4118 | func_exit: |
4119 | fts_query_free(&query); |
4120 | |
4121 | trx_free(query_trx); |
4122 | |
4123 | return(error); |
4124 | } |
4125 | |
4126 | /*****************************************************************//** |
4127 | FTS Query free result, returned by fts_query(). */ |
4128 | void |
4129 | fts_query_free_result( |
4130 | /*==================*/ |
4131 | fts_result_t* result) /*!< in: result instance to free.*/ |
4132 | { |
4133 | if (result) { |
4134 | if (result->rankings_by_id != NULL) { |
4135 | rbt_free(result->rankings_by_id); |
4136 | result->rankings_by_id = NULL; |
4137 | } |
4138 | if (result->rankings_by_rank != NULL) { |
4139 | rbt_free(result->rankings_by_rank); |
4140 | result->rankings_by_rank = NULL; |
4141 | } |
4142 | |
4143 | ut_free(result); |
4144 | result = NULL; |
4145 | } |
4146 | } |
4147 | |
4148 | /*****************************************************************//** |
4149 | FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. */ |
4150 | void |
4151 | fts_query_sort_result_on_rank( |
4152 | /*==========================*/ |
4153 | fts_result_t* result) /*!< out: result instance to sort.*/ |
4154 | { |
4155 | const ib_rbt_node_t* node; |
4156 | ib_rbt_t* ranked; |
4157 | |
4158 | ut_a(result->rankings_by_id != NULL); |
4159 | if (result->rankings_by_rank) { |
4160 | rbt_free(result->rankings_by_rank); |
4161 | } |
4162 | |
4163 | ranked = rbt_create(sizeof(fts_ranking_t), fts_query_compare_rank); |
4164 | |
4165 | /* We need to free any instances of fts_doc_freq_t that we |
4166 | may have allocated. */ |
4167 | for (node = rbt_first(result->rankings_by_id); |
4168 | node; |
4169 | node = rbt_next(result->rankings_by_id, node)) { |
4170 | |
4171 | fts_ranking_t* ranking; |
4172 | |
4173 | ranking = rbt_value(fts_ranking_t, node); |
4174 | |
4175 | ut_a(ranking->words == NULL); |
4176 | |
4177 | rbt_insert(ranked, ranking, ranking); |
4178 | } |
4179 | |
4180 | /* Reset the current node too. */ |
4181 | result->current = NULL; |
4182 | result->rankings_by_rank = ranked; |
4183 | } |
4184 | |
4185 | /*******************************************************************//** |
4186 | A debug function to print result doc_id set. */ |
4187 | static |
4188 | void |
4189 | fts_print_doc_id( |
4190 | /*=============*/ |
4191 | fts_query_t* query) /*!< in : tree that stores doc_ids.*/ |
4192 | { |
4193 | const ib_rbt_node_t* node; |
4194 | |
4195 | /* Iterate each member of the doc_id set */ |
4196 | for (node = rbt_first(query->doc_ids); |
4197 | node; |
4198 | node = rbt_next(query->doc_ids, node)) { |
4199 | fts_ranking_t* ranking; |
4200 | ranking = rbt_value(fts_ranking_t, node); |
4201 | |
4202 | ib::info() << "doc_ids info, doc_id: " << ranking->doc_id; |
4203 | |
4204 | ulint pos = 0; |
4205 | fts_string_t word; |
4206 | |
4207 | while (fts_ranking_words_get_next(query, ranking, &pos, &word)) { |
4208 | ib::info() << "doc_ids info, value: " << word.f_str; |
4209 | } |
4210 | } |
4211 | } |
4212 | |
4213 | /*************************************************************//** |
4214 | This function implements a simple "blind" query expansion search: |
4215 | words in documents found in the first search pass will be used as |
4216 | search arguments to search the document again, thus "expand" |
4217 | the search result set. |
4218 | @return DB_SUCCESS if success, otherwise the error code */ |
4219 | static MY_ATTRIBUTE((nonnull, warn_unused_result)) |
4220 | dberr_t |
4221 | fts_expand_query( |
4222 | /*=============*/ |
4223 | dict_index_t* index, /*!< in: FTS index to search */ |
4224 | fts_query_t* query) /*!< in: FTS query instance */ |
4225 | { |
4226 | const ib_rbt_node_t* node; |
4227 | const ib_rbt_node_t* token_node; |
4228 | fts_doc_t result_doc; |
4229 | dberr_t error = DB_SUCCESS; |
4230 | const fts_index_cache_t*index_cache; |
4231 | |
4232 | /* If no doc is found in first search pass, return */ |
4233 | if (!rbt_size(query->doc_ids)) { |
4234 | return(error); |
4235 | } |
4236 | |
4237 | /* Init "result_doc", to hold words from the first search pass */ |
4238 | fts_doc_init(&result_doc); |
4239 | |
4240 | rw_lock_x_lock(&index->table->fts->cache->lock); |
4241 | index_cache = fts_find_index_cache(index->table->fts->cache, index); |
4242 | rw_lock_x_unlock(&index->table->fts->cache->lock); |
4243 | |
4244 | ut_a(index_cache); |
4245 | |
4246 | result_doc.tokens = rbt_create_arg_cmp( |
4247 | sizeof(fts_token_t), innobase_fts_text_cmp, |
4248 | (void*) index_cache->charset); |
4249 | |
4250 | result_doc.charset = index_cache->charset; |
4251 | result_doc.parser = index_cache->index->parser; |
4252 | |
4253 | query->total_size += SIZEOF_RBT_CREATE; |
4254 | |
4255 | if (fts_enable_diag_print) { |
4256 | fts_print_doc_id(query); |
4257 | } |
4258 | |
4259 | for (node = rbt_first(query->doc_ids); |
4260 | node; |
4261 | node = rbt_next(query->doc_ids, node)) { |
4262 | |
4263 | fts_ranking_t* ranking; |
4264 | ulint prev_token_size; |
4265 | ulint estimate_size; |
4266 | |
4267 | prev_token_size = rbt_size(result_doc.tokens); |
4268 | |
4269 | ranking = rbt_value(fts_ranking_t, node); |
4270 | |
4271 | /* Fetch the documents with the doc_id from the |
4272 | result of first seach pass. Since we do not |
4273 | store document-to-word mapping, we need to |
4274 | fetch the original document and parse them. |
4275 | Future optimization could be done here if we |
4276 | support some forms of document-to-word mapping */ |
4277 | fts_doc_fetch_by_doc_id(NULL, ranking->doc_id, index, |
4278 | FTS_FETCH_DOC_BY_ID_EQUAL, |
4279 | fts_query_expansion_fetch_doc, |
4280 | &result_doc); |
4281 | |
4282 | /* Estimate memory used, see fts_process_token and fts_token_t. |
4283 | We ignore token size here. */ |
4284 | estimate_size = (rbt_size(result_doc.tokens) - prev_token_size) |
4285 | * (SIZEOF_RBT_NODE_ADD + sizeof(fts_token_t) |
4286 | + sizeof(ib_vector_t) + sizeof(ulint) * 32); |
4287 | query->total_size += estimate_size; |
4288 | |
4289 | if (query->total_size > fts_result_cache_limit) { |
4290 | error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT; |
4291 | goto func_exit; |
4292 | } |
4293 | } |
4294 | |
4295 | /* Remove words that have already been searched in the first pass */ |
4296 | for (ulint i = 0; i < query->word_vector->size(); i++) { |
4297 | fts_string_t word = query->word_vector->at(i); |
4298 | ib_rbt_bound_t parent; |
4299 | |
4300 | if (query->wildcard_words |
4301 | && rbt_search(query->wildcard_words, &parent, &word) == 0) { |
4302 | /* If it's a wildcard word, remove words having |
4303 | it as prefix. */ |
4304 | while (rbt_search_cmp(result_doc.tokens, |
4305 | &parent, &word, NULL, |
4306 | innobase_fts_text_cmp_prefix) |
4307 | == 0) { |
4308 | ut_free(rbt_remove_node(result_doc.tokens, |
4309 | parent.last)); |
4310 | } |
4311 | } else { |
4312 | /* We don't check return value, because the word may |
4313 | have been deleted by a previous wildcard word as its |
4314 | prefix, e.g. ('g * good'). */ |
4315 | rbt_delete(result_doc.tokens, &word); |
4316 | } |
4317 | } |
4318 | |
4319 | /* Search the table the second time with expanded search list */ |
4320 | for (token_node = rbt_first(result_doc.tokens); |
4321 | token_node; |
4322 | token_node = rbt_next(result_doc.tokens, token_node)) { |
4323 | fts_token_t* mytoken; |
4324 | mytoken = rbt_value(fts_token_t, token_node); |
4325 | |
4326 | /* '%' in the end is treated as prefix search, |
4327 | it can cause assert failure, so we skip it. */ |
4328 | if (mytoken->text.f_str[mytoken->text.f_len - 1] == '%') { |
4329 | continue; |
4330 | } |
4331 | |
4332 | ut_ad(mytoken->text.f_str[mytoken->text.f_len] == 0); |
4333 | fts_query_add_word_freq(query, &mytoken->text); |
4334 | error = fts_query_union(query, &mytoken->text); |
4335 | |
4336 | if (error != DB_SUCCESS) { |
4337 | break; |
4338 | } |
4339 | } |
4340 | |
4341 | func_exit: |
4342 | fts_doc_free(&result_doc); |
4343 | |
4344 | return(error); |
4345 | } |
/*************************************************************//**
This function finds documents that contain all words in a
phrase or proximity search. And if proximity search, verify
the words are close enough to each other, as in specified distance.
This function is called for phrase and proximity search.

The walk is driven by the match list of the first token
(query->match_array[0]); for each of its entries the match lists of the
remaining tokens are scanned for the same doc id. When FTS_PHRASE is set
in query->flags, entries of the first list that are found not to match
get their doc_id overwritten with 0. Otherwise (proximity search) each
qualifying doc id is passed to fts_query_process_doc_id(), whose status
is stored in query->error.
@return TRUE if documents are found, FALSE if otherwise */
static
ibool
fts_phrase_or_proximity_search(
/*===========================*/
	fts_query_t*	query,	/*!< in/out:  query instance.
				query->doc_ids might be instantiated
				with qualified doc IDs */
	ib_vector_t*	tokens)	/*!< in: Tokens contain words */
{
	ulint		n_matched;
	ulint		i;
	ibool		matched = FALSE;
	ulint		num_token = ib_vector_size(tokens);
	/* NOTE(review): num_token is not checked against
	MAX_PROXIMITY_ITEM in this function before match[j] is written;
	only fts_proximity_get_positions() asserts the bound - confirm
	that callers limit the number of tokens. */
	fts_match_t*	match[MAX_PROXIMITY_ITEM];
	/* Set once the match list of some token has been read to its
	end; the outer loop stops after the current document then. */
	ibool		end_list = FALSE;

	/* Number of matched documents for the first token */
	n_matched = ib_vector_size(query->match_array[0]);

	/* We have a set of match list for each word, we shall
	walk through the list and find common documents that
	contain all the matching words. */
	for (i = 0; i < n_matched; i++) {
		ulint		j;
		/* Cursor into the match list of token j; it is reset
		to 0 after every token (see the FIXME below). */
		ulint		k = 0;
		fts_proximity_t	qualified_pos;

		match[0] = static_cast<fts_match_t*>(
			ib_vector_get(query->match_array[0], i));

		/* For remaining match list for the token(word), we
		try to see if there is a document with the same
		doc id */
		for (j = 1; j < num_token; j++) {
			/* NOTE(review): element 0 is fetched without a
			size check - presumably every match list is
			non-empty here; verify against the caller. */
			match[j] = static_cast<fts_match_t*>(
				ib_vector_get(query->match_array[j], k));

			/* Advance while the doc id is still smaller than
			the one looked for. Element k is loaded before k
			is incremented, so after the loop k is one past
			the index of the element held in match[j]. */
			while (match[j]->doc_id < match[0]->doc_id
			       && k < ib_vector_size(query->match_array[j])) {
				match[j] = static_cast<fts_match_t*>(
					ib_vector_get(
						query->match_array[j], k));
				k++;
			}

			if (match[j]->doc_id > match[0]->doc_id) {
				/* no match */
				if (query->flags & FTS_PHRASE) {
					match[0]->doc_id = 0;
				}
				break;
			}

			if (k == ib_vector_size(query->match_array[j])) {
				end_list = TRUE;

				if (match[j]->doc_id != match[0]->doc_id) {
					/* no match: the list of token j is
					used up. For a phrase search the
					current and all remaining entries of
					the first list are zeroed before
					the search stops. */
					if (query->flags & FTS_PHRASE) {
						ulint	s;

						match[0]->doc_id = 0;

						for (s = i + 1; s < n_matched;
						     s++) {
							match[0] = static_cast<
							fts_match_t*>(
							ib_vector_get(
							query->match_array[0],
							s));
							match[0]->doc_id = 0;
						}
					}

					goto func_exit;
				}
			}

			/* FIXME: A better solution will be a counter array
			remember each run's last position. So we don't
			reset it here very time */
			k = 0;
		}

		/* The inner loop ended early: this document does not
		contain all the words. */
		if (j != num_token) {
			continue;
		}

		/* For this matching doc, we need to further
		verify whether the words in the doc are close
		to each other, and within the distance specified
		in the proximity search */
		if (query->flags & FTS_PHRASE) {
			matched = TRUE;
		} else if (fts_proximity_get_positions(
			match, num_token, ULINT_MAX, &qualified_pos)) {

			/* Fetch the original documents and count the
			words in between matching words to see that is in
			specified distance */
			if (fts_query_is_in_proximity_range(
				query, match, &qualified_pos)) {
				/* If so, mark we find a matching doc */
				query->error = fts_query_process_doc_id(
					query, match[0]->doc_id, 0);
				if (query->error != DB_SUCCESS) {
					matched = FALSE;
					goto func_exit;
				}

				matched = TRUE;
				/* Record every searched token against the
				matching document. */
				for (ulint z = 0; z < num_token; z++) {
					fts_string_t*	token;
					token = static_cast<fts_string_t*>(
						ib_vector_get(tokens, z));
					fts_query_add_word_to_document(
						query, match[0]->doc_id, token);
				}
			}
		}

		if (end_list) {
			break;
		}
	}

func_exit:
	return(matched);
}
4481 | |
4482 | /*************************************************************//** |
4483 | This function checks whether words in result documents are close to |
4484 | each other (within proximity range as specified by "distance"). |
4485 | If "distance" is MAX_ULINT, then it will find all combinations of |
4486 | positions of matching words and store min and max positions |
4487 | in the "qualified_pos" for later verification. |
4488 | @return true if words are close to each other, false if otherwise */ |
4489 | static |
4490 | bool |
4491 | fts_proximity_get_positions( |
4492 | /*========================*/ |
4493 | fts_match_t** match, /*!< in: query instance */ |
4494 | ulint num_match, /*!< in: number of matching |
4495 | items */ |
4496 | ulint distance, /*!< in: distance value |
4497 | for proximity search */ |
4498 | fts_proximity_t* qualified_pos) /*!< out: the position info |
4499 | records ranges containing |
4500 | all matching words. */ |
4501 | { |
4502 | ulint i; |
4503 | ulint idx[MAX_PROXIMITY_ITEM]; |
4504 | ulint num_pos[MAX_PROXIMITY_ITEM]; |
4505 | ulint min_idx; |
4506 | |
4507 | qualified_pos->n_pos = 0; |
4508 | |
4509 | ut_a(num_match <= MAX_PROXIMITY_ITEM); |
4510 | |
4511 | /* Each word could appear multiple times in a doc. So |
4512 | we need to walk through each word's position list, and find |
4513 | closest distance between different words to see if |
4514 | they are in the proximity distance. */ |
4515 | |
4516 | /* Assume each word's position list is sorted, we |
4517 | will just do a walk through to all words' lists |
4518 | similar to a the merge phase of a merge sort */ |
4519 | for (i = 0; i < num_match; i++) { |
4520 | /* idx is the current position we are checking |
4521 | for a particular word */ |
4522 | idx[i] = 0; |
4523 | |
4524 | /* Number of positions for this word */ |
4525 | num_pos[i] = ib_vector_size(match[i]->positions); |
4526 | } |
4527 | |
4528 | /* Start with the first word */ |
4529 | min_idx = 0; |
4530 | |
4531 | while (idx[min_idx] < num_pos[min_idx]) { |
4532 | ulint position[MAX_PROXIMITY_ITEM]; |
4533 | ulint min_pos = ULINT_MAX; |
4534 | ulint max_pos = 0; |
4535 | |
4536 | /* Check positions in each word position list, and |
4537 | record the max/min position */ |
4538 | for (i = 0; i < num_match; i++) { |
4539 | position[i] = *(ulint*) ib_vector_get_const( |
4540 | match[i]->positions, idx[i]); |
4541 | |
4542 | if (position[i] == ULINT_UNDEFINED) { |
4543 | break; |
4544 | } |
4545 | |
4546 | if (position[i] < min_pos) { |
4547 | min_pos = position[i]; |
4548 | min_idx = i; |
4549 | } |
4550 | |
4551 | if (position[i] > max_pos) { |
4552 | max_pos = position[i]; |
4553 | } |
4554 | } |
4555 | |
4556 | /* If max and min position are within range, we |
4557 | find a good match */ |
4558 | if (max_pos - min_pos <= distance |
4559 | && (i >= num_match || position[i] != ULINT_UNDEFINED)) { |
4560 | /* The charset has variable character |
4561 | length encoding, record the min_pos and |
4562 | max_pos, we will need to verify the actual |
4563 | number of characters */ |
4564 | qualified_pos->min_pos.push_back(min_pos); |
4565 | qualified_pos->max_pos.push_back(max_pos); |
4566 | qualified_pos->n_pos++; |
4567 | } |
4568 | |
4569 | /* Otherwise, move to the next position is the |
4570 | list for the word with the smallest position */ |
4571 | idx[min_idx]++; |
4572 | } |
4573 | |
4574 | return(qualified_pos->n_pos != 0); |
4575 | } |
4576 | |