fts0que.cc source code [MariaDB/storage/innobase/fts/fts0que.cc]

1	/*****************************************************************************
2
3	Copyright (c) 2007, 2017, Oracle and/or its affiliates. All Rights Reserved.
4	Copyright (c) 2017, 2018, MariaDB Corporation.
5
6	This program is free software; you can redistribute it and/or modify it under
7	the terms of the GNU General Public License as published by the Free Software
8	Foundation; version 2 of the License.
9
10	This program is distributed in the hope that it will be useful, but WITHOUT
11	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12	FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
14	You should have received a copy of the GNU General Public License along with
15	this program; if not, write to the Free Software Foundation, Inc.,
16	51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
17
18	*****************************************************************************/
19
20	/************************************************//**
21	@file fts/fts0que.cc
22	Full Text Search functionality.
23
24	Created 2007/03/27 Sunny Bains
25	Completed 2011/7/10 Sunny and Jimmy Yang
26	*******************************************************/
27
28	#include "ha_prototypes.h"
29
30	#include "dict0dict.h"
31	#include "ut0rbt.h"
32	#include "row0sel.h"
33	#include "fts0fts.h"
34	#include "fts0priv.h"
35	#include "fts0ast.h"
36	#include "fts0pars.h"
37	#include "fts0types.h"
38	#include "fts0plugin.h"
39	#include "ut0new.h"
40
41	#include <iomanip>
42	#include <vector>
43
44	#define FTS_ELEM(t, n, i, j) (t[(i) * n + (j)])
45
46	#define RANK_DOWNGRADE (-1.0F)
47	#define RANK_UPGRADE (1.0F)
48
49	/ Maximum number of words supported in a phrase or proximity search. /
50	#define MAX_PROXIMITY_ITEM 128
51
52	/ Memory used by rbt itself for create and node add /
53	#define SIZEOF_RBT_CREATE sizeof(ib_rbt_t) + sizeof(ib_rbt_node_t) * 2
54	#define SIZEOF_RBT_NODE_ADD sizeof(ib_rbt_node_t)
55
56	/Initial byte length for 'words' in fts_ranking_t /
57	#define RANKING_WORDS_INIT_LEN 4
58
59	// FIXME: Need to have a generic iterator that traverses the ilist.
60
61	typedef std::vector<fts_string_t, ut_allocator<fts_string_t> > word_vector_t;
62
63	struct fts_word_freq_t;
64
65	/* State of an FTS query. /
66	struct fts_query_t {
67	mem_heap_t* heap; /!< Heap to use for allocations /
68
69	trx_t* trx; /!< The query transaction /
70
71	dict_index_t* index; /!< The FTS index to search /
72	/!< FTS auxiliary common table def /
73
74	fts_table_t fts_common_table;
75
76	fts_table_t fts_index_table;/!< FTS auxiliary index table def /
77
78	ulint total_size; /!< total memory size used by query /
79
80	fts_doc_ids_t* deleted; /!< Deleted doc ids that need to be*
81	filtered from the output /*
82
83	fts_ast_node_t* root; /!< Abstract syntax tree /
84
85	fts_ast_node_t* cur_node; /!< Current tree node /
86
87	ib_rbt_t* word_map; /!< Matched word map for*
88	searching by word/*
89
90	word_vector_t* word_vector; /!< Matched word vector for*
91	searching by index /*
92
93	ib_rbt_t* doc_ids; /!< The current set of matching*
94	doc ids, elements are of
95	type fts_ranking_t /*
96
97	ib_rbt_t* intersection; /!< The doc ids that were found in*
98	doc_ids, this tree will become
99	the new doc_ids, elements are of type
100	fts_ranking_t /*
101
102	/!< Prepared statement to read the*
103	nodes from the FTS INDEX /*
104	que_t* read_nodes_graph;
105
106	fts_ast_oper_t oper; /!< Current boolean mode operator /
107
108	/!< TRUE if we want to collect the*
109	word positions within the document /*
110	ibool collect_positions;
111
112	ulint flags; /!< Specify the full text search type,*
113	such as boolean search, phrase
114	search, proximity search etc. /*
115
116	ulint distance; /!< The proximity distance of a*
117	phrase search. /*
118
119	/!< These doc ids are used as a*
120	boundary condition when searching the
121	FTS index rows /*
122
123	doc_id_t lower_doc_id; /!< Lowest doc id in doc_ids /
124
125	doc_id_t upper_doc_id; /!< Highest doc id in doc_ids /
126
127	bool boolean_mode; /!< TRUE if boolean mode query /
128
129	ib_vector_t* matched; /!< Array of matching documents*
130	(fts_match_t) to search for a phrase /*
131
132	ib_vector_t** match_array; /!< Used for proximity search, contains*
133	position info for each matched word
134	in the word list /*
135
136	ib_uint64_t total_docs; /!< The total number of documents /
137
138	ulint total_words; /!< The total number of words /
139
140	dberr_t error; /!< Error code if any, that is*
141	encountered during query processing /*
142
143	ib_rbt_t* word_freqs; /!< RB tree of word frequencies per*
144	document, its elements are of type
145	fts_word_freq_t /*
146
147	ib_rbt_t* wildcard_words; /!< words with wildcard /
148
149	bool multi_exist; /!< multiple FTS_EXIST oper /
150
151	st_mysql_ftparser* parser; /!< fts plugin parser /
152	};
153
154	/* For phrase matching, first we collect the documents and the positions*
155	then we match. /*
156	struct fts_match_t {
157	doc_id_t doc_id; /!< Document id /
158
159	ulint start; /!< Start the phrase match from*
160	this offset within the positions
161	vector. /*
162
163	ib_vector_t* positions; /!< Offsets of a word in a*
164	document /*
165	};
166
167	/* For matching tokens in a phrase search. We use this data structure in*
168	the callback that determines whether a document should be accepted or
169	rejected for a phrase search. /*
170	struct fts_select_t {
171	doc_id_t doc_id; /!< The document id to match /
172
173	ulint min_pos; /!< For found to be TRUE at least*
174	one position must be greater than
175	min_pos. /*
176
177	ibool found; /!< TRUE if found /
178
179	fts_word_freq_t*
180	word_freq; /!< Word frequency instance of the*
181	current word being looked up in
182	the FTS index /*
183	};
184
185	typedef std::vector<ulint, ut_allocator<ulint> > pos_vector_t;
186
187	/* structure defines a set of ranges for original documents, each of which*
188	has a minimum position and maximum position. Text in such range should
189	contain all words in the proximity search. We will need to count the
190	words in such range to make sure it is less than the specified distance
191	of the proximity search /*
192	struct fts_proximity_t {
193	ulint n_pos; /!< number of position set, defines*
194	a range (min to max) containing all
195	matching words /*
196	pos_vector_t min_pos; /!< the minimum position (in bytes)*
197	of the range /*
198	pos_vector_t max_pos; /!< the maximum position (in bytes)*
199	of the range /*
200	};
201
202	/* The match positions and tokesn to match /
203	struct fts_phrase_t {
204	fts_phrase_t(const dict_table_t* table)
205	:
206	found(false),
207	match(NULL),
208	tokens(NULL),
209	distance(`0`),
210	charset(NULL),
211	heap(NULL),
212	page_size (dict_table_page_size(table)),
213	proximity_pos(NULL),
214	parser(NULL)
215	{
216	}
217
218	/* Match result /
219	ibool found;
220
221	/* Positions within text /
222	const fts_match_t* match;
223
224	/* Tokens to match /
225	const ib_vector_t* tokens;
226
227	/* For matching on proximity distance. Can be 0 for exact match /
228	ulint distance;
229
230	/* Phrase match charset /
231	CHARSET_INFO* charset;
232
233	/* Heap for word processing /
234	mem_heap_t* heap;
235
236	/* Row page size /
237	const page_size_t page_size;
238
239	/* Position info for proximity search verification. Records the*
240	min and max position of words matched /*
241	fts_proximity_t* proximity_pos;
242
243	/* FTS plugin parser /
244	st_mysql_ftparser* parser;
245	};
246
247	/* Paramter passed to fts phrase match by parser /
248	struct fts_phrase_param_t {
249	fts_phrase_t* phrase; /!< Match phrase instance /
250	ulint token_index; /!< Index of token to match next /
251	mem_heap_t* heap; /!< Heap for word processing /
252	};
253
254	/* For storing the frequncy of a word/term in a document /
255	struct fts_doc_freq_t {
256	doc_id_t doc_id; /!< Document id /
257	ulint freq; /!< Frequency of a word in a document /
258	};
259
260	/* To determine the word frequency per document. /
261	struct fts_word_freq_t {
262	fts_string_t word; /!< Word for which we need the freq,*
263	it's allocated on the query heap /*
264
265	ib_rbt_t* doc_freqs; /!< RB Tree for storing per document*
266	word frequencies. The elements are
267	of type fts_doc_freq_t /*
268	ib_uint64_t doc_count; /!< Total number of documents that*
269	contain this word /*
270	double idf; /!< Inverse document frequency /
271	};
272
273	/********************************************************************
274	Callback function to fetch the rows in an FTS INDEX record.
275	@return always TRUE /*
276	static
277	ibool
278	fts_query_index_fetch_nodes(
279	/========================/
280	void* row, /!< in: sel_node_t* /
281	void* user_arg); /!< in: pointer to ib_vector_t /
282
283	/********************************************************************
284	Read and filter nodes.
285	@return fts_node_t instance /*
286	static
287	dberr_t
288	fts_query_filter_doc_ids(
289	/=====================/
290	fts_query_t* query, /!< in: query instance /
291	const fts_string_t* word, /!< in: the current word /
292	fts_word_freq_t* word_freq, /!< in/out: word frequency /
293	const fts_node_t* node, /!< in: current FTS node /
294	void* data, /!< in: doc id ilist /
295	ulint len, /!< in: doc id ilist size /
296	ibool calc_doc_count);/!< in: whether to remember doc*
297	count /*
298
299	/* Process (nested) sub-expression, create a new result set to store the*
300	sub-expression result by processing nodes under current sub-expression
301	list. Merge the sub-expression result with that of parent expression list.
302	@param[in,out] node current root node
303	@param[in,out] visitor callback function
304	@param[in,out] arg argument for callback
305	@return DB_SUCCESS if all go well /*
306	static
307	dberr_t
308	fts_ast_visit_sub_exp(
309	fts_ast_node_t* node,
310	fts_ast_callback visitor,
311	void* arg);
312
313	#if 0
314	/***************************************************************//***
315	Find a doc_id in a word's ilist.
316	@return TRUE if found. /*
317	static
318	ibool
319	fts_query_find_doc_id(
320	/==================/
321	fts_select_t* select, /!< in/out: search the doc id selected,*
322	update the frequency if found. /*
323	void* data, /!< in: doc id ilist /
324	ulint len); /!< in: doc id ilist size /
325	#endif
326
327	/***********************************************************//**
328	This function implements a simple "blind" query expansion search:
329	words in documents found in the first search pass will be used as
330	search arguments to search the document again, thus "expand"
331	the search result set.
332	@return DB_SUCCESS if success, otherwise the error code /*
333	static
334	dberr_t
335	fts_expand_query(
336	/=============/
337	dict_index_t* index, /!< in: FTS index to search /
338	fts_query_t* query) /!< in: query result, to be freed*
339	by the client /*
340	MY_ATTRIBUTE((nonnull, warn_unused_result));
341	/***********************************************************//**
342	This function finds documents that contain all words in a
343	phrase or proximity search. And if proximity search, verify
344	the words are close enough to each other, as in specified distance.
345	This function is called for phrase and proximity search.
346	@return TRUE if documents are found, FALSE if otherwise /*
347	static
348	ibool
349	fts_phrase_or_proximity_search(
350	/===========================/
351	fts_query_t* query, /!< in/out: query instance*
352	query->doc_ids might be instantiated
353	with qualified doc IDs /*
354	ib_vector_t* tokens); /!< in: Tokens contain words /
355	/***********************************************************//**
356	This function checks whether words in result documents are close to
357	each other (within proximity range as specified by "distance").
358	If "distance" is MAX_ULINT, then it will find all combinations of
359	positions of matching words and store min and max positions
360	in the "qualified_pos" for later verification.
361	@return true if words are close to each other, false if otherwise /*
362	static
363	bool
364	fts_proximity_get_positions(
365	/========================/
366	fts_match_t** match, /!< in: query instance /
367	ulint num_match, /!< in: number of matching*
368	items /*
369	ulint distance, /!< in: distance value*
370	for proximity search /*
371	fts_proximity_t* qualified_pos); /!< out: the position info*
372	records ranges containing
373	all matching words. /*
374	#if 0
375	/********************************************************************
376	Get the total number of words in a documents. /*
377	static
378	ulint
379	fts_query_terms_in_document(
380	/========================/
381	/!< out: DB_SUCCESS if all go well*
382	else error code /*
383	fts_query_t* query, /!< in: FTS query state /
384	doc_id_t doc_id, /!< in: the word to check /
385	ulint* total); /!< out: total words in document /
386	#endif
387
388	/********************************************************************
389	Compare two fts_doc_freq_t doc_ids.
390	@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 /*
391	UNIV_INLINE
392	int
393	fts_freq_doc_id_cmp(
394	/================/
395	const void* p1, /!< in: id1 /
396	const void* p2) /!< in: id2 /
397	{
398	const fts_doc_freq_t* fq1 = (const fts_doc_freq_t*) p1;
399	const fts_doc_freq_t* fq2 = (const fts_doc_freq_t*) p2;
400
401	return((int) (fq1->doc_id - fq2->doc_id));
402	}
403
404	#if 0
405	/*****************************************************************//**
406	Print the table used for calculating LCS. /*
407	static
408	void
409	fts_print_lcs_table(
410	/================/
411	const ulint* table, /!< in: array to print /
412	ulint n_rows, /!< in: total no. of rows /
413	ulint n_cols) /!< in: total no. of cols /
414	{
415	ulint i;
416
417	for (i = `0`; i < n_rows; ++i) {
418	ulint j;
419
420	printf("\n");
421
422	for (j = `0`; j < n_cols; ++j) {
423
424	printf("%2lu ", FTS_ELEM(table, n_cols, i, j));
425	}
426	}
427	}
428
429	/********************************************************************
430	Find the longest common subsequence between the query string and
431	the document. /*
432	static
433	ulint
434	fts_query_lcs(
435	/==========/
436	/!< out: LCS (length) between*
437	two ilists /*
438	const ulint* p1, /!< in: word positions of query /
439	ulint len_p1, /!< in: no. of elements in p1 /
440	const ulint* p2, /!< in: word positions within document /
441	ulint len_p2) /!< in: no. of elements in p2 /
442	{
443	int i;
444	ulint len = `0`;
445	ulint r = len_p1;
446	ulint c = len_p2;
447	ulint size = (r + `1`) * (c + `1`) * sizeof(ulint);
448	ulint* table = (ulint*) ut_malloc_nokey(size);
449
450	/ Traverse the table backwards, from the last row to the first and*
451	also from the last column to the first. We compute the smaller
452	common subsequeces first, then use the caluclated values to determine
453	the longest common subsequence. The result will be in TABLE[0][0]. /*
454	for (i = r; i >= `0`; --i) {
455	int j;
456
457	for (j = c; j >= `0`; --j) {
458
459	if (p1[i] == (ulint) -`1` \|\| p2[j] == (ulint) -`1`) {
460
461	FTS_ELEM(table, c, i, j) = `0`;
462
463	} else if (p1[i] == p2[j]) {
464
465	FTS_ELEM(table, c, i, j) = FTS_ELEM(
466	table, c, i + `1`, j + `1`) + `1`;
467
468	} else {
469
470	ulint value;
471
472	value = ut_max(
473	FTS_ELEM(table, c, i + `1`, j),
474	FTS_ELEM(table, c, i, j + `1`));
475
476	FTS_ELEM(table, c, i, j) = value;
477	}
478	}
479	}
480
481	len = FTS_ELEM(table, c, `0`, `0`);
482
483	fts_print_lcs_table(table, r, c);
484	printf("\nLen=" ULINTPF "\n", len);
485
486	ut_free(table);
487
488	return(len);
489	}
490	#endif
491
492	/*****************************************************************//**
493	Compare two fts_ranking_t instance on their rank value and doc ids in
494	descending order on the rank and ascending order on doc id.
495	@return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 /*
496	static
497	int
498	fts_query_compare_rank(
499	/===================/
500	const void* p1, /!< in: pointer to elem /
501	const void* p2) /!< in: pointer to elem /
502	{
503	const fts_ranking_t* r1 = (const fts_ranking_t*) p1;
504	const fts_ranking_t* r2 = (const fts_ranking_t*) p2;
505
506	if (r2->rank < r1->rank) {
507	return(-`1`);
508	} else if (r2->rank == r1->rank) {
509
510	if (r1->doc_id < r2->doc_id) {
511	return(`1`);
512	} else if (r1->doc_id > r2->doc_id) {
513	return(`1`);
514	}
515
516	return(`0`);
517	}
518
519	return(`1`);
520	}
521
522	/*****************************************************************//**
523	Create words in ranking /*
524	static
525	void
526	fts_ranking_words_create(
527	/=====================/
528	fts_query_t* query, /!< in: query instance /
529	fts_ranking_t* ranking) /!< in: ranking instance /
530	{
531	ranking->words = static_cast<byte*>(
532	mem_heap_zalloc(query->heap, RANKING_WORDS_INIT_LEN));
533	ranking->words_len = RANKING_WORDS_INIT_LEN;
534	}
535
536	/*
537	The optimization here is using a char array(bitmap) to replace words rb tree
538	in fts_ranking_t.
539
540	It can save lots of memory except in some cases of QUERY EXPANSION.
541
542	'word_map' is used as a word dictionary, in which the key is a word, the value
543	is a number. In 'fts_ranking_words_add', we first check if the word is in 'word_map'.
544	if not, we add it into 'word_map', and give it a position(actually a number).
545	then we set the corresponding bit to '1' at the position in the char array 'words'.
546
547	'word_vector' is a useful backup of 'word_map', and we can get a word by its position,
548	more quickly than searching by value in 'word_map'. we use 'word_vector'
549	in 'fts_query_calculate_ranking' and 'fts_expand_query'. In the two functions, we need
550	to scan the bitmap 'words', and get a word when a bit is '1', then we get word_freq
551	by the word.
552	*/
553
554	/*****************************************************************//**
555	Add a word into ranking /*
556	static
557	void
558	fts_ranking_words_add(
559	/==================/
560	fts_query_t* query, /!< in: query instance /
561	fts_ranking_t* ranking, /!< in: ranking instance /
562	const fts_string_t* word) /!< in: term/word to add /
563	{
564	ulint pos;
565	ulint byte_offset;
566	ulint bit_offset;
567	ib_rbt_bound_t parent;
568
569	/ Note: we suppose the word map and vector are append-only. /
570	ut_ad(query->word_vector->size() == rbt_size(query->word_map));
571
572	/ We use ib_rbt to simulate a map, f_n_char means position. /
573	if (rbt_search(query->word_map, &parent, word) == `0`) {
574	fts_string_t* result_word;
575
576	result_word = rbt_value(fts_string_t, parent.last);
577	pos = result_word->f_n_char;
578	ut_ad(pos < rbt_size(query->word_map));
579	} else {
580	/ Add the word to map. /
581	fts_string_t new_word;
582
583	pos = rbt_size(query->word_map);
584
585	fts_string_dup(&new_word, word, query->heap);
586	new_word.f_n_char = pos;
587
588	rbt_add_node(query->word_map, &parent, &new_word);
589	ut_ad(rbt_validate(query->word_map));
590	query->word_vector->push_back(new_word);
591	}
592
593	/ Check words len /
594	byte_offset = pos / CHAR_BIT;
595	if (byte_offset >= ranking->words_len) {
596	byte* words = ranking->words;
597	ulint words_len = ranking->words_len;
598
599	while (byte_offset >= words_len) {
600	words_len *= `2`;
601	}
602
603	ranking->words = static_cast<byte*>(
604	mem_heap_zalloc(query->heap, words_len));
605	ut_memcpy(ranking->words, words, ranking->words_len);
606	ranking->words_len = words_len;
607	}
608
609	/ Set ranking words /
610	ut_ad(byte_offset < ranking->words_len);
611	bit_offset = pos % CHAR_BIT;
612	ranking->words[byte_offset] \|= `1` << bit_offset;
613	}
614
615	/*****************************************************************//**
616	Get a word from a ranking
617	@return true if it's successful /*
618	static
619	bool
620	fts_ranking_words_get_next(
621	/=======================/
622	const fts_query_t* query, /!< in: query instance /
623	fts_ranking_t* ranking,/!< in: ranking instance /
624	ulint* pos, /!< in/out: word start pos /
625	fts_string_t* word) /!< in/out: term/word to add /
626	{
627	bool ret = false;
628	ulint max_pos = ranking->words_len * CHAR_BIT;
629
630	/ Search for next word /
631	while (*pos < max_pos) {
632	ulint byte_offset = *pos / CHAR_BIT;
633	ulint bit_offset = *pos % CHAR_BIT;
634
635	if (ranking->words[byte_offset] & (`1` << bit_offset)) {
636	ret = true;
637	break;
638	}
639
640	*pos += `1`;
641	};
642
643	/ Get next word from word vector /
644	if (ret) {
645	ut_ad(*pos < query->word_vector->size());
646	word = query->word_vector->at((size_t)pos);
647	*pos += `1`;
648	}
649
650	return ret;
651	}
652
653	/*****************************************************************//**
654	Add a word if it doesn't exist, to the term freq RB tree. We store
655	a pointer to the word that is passed in as the argument.
656	@return pointer to word /*
657	static
658	fts_word_freq_t*
659	fts_query_add_word_freq(
660	/====================/
661	fts_query_t* query, /!< in: query instance /
662	const fts_string_t* word) /!< in: term/word to add /
663	{
664	ib_rbt_bound_t parent;
665
666	/ Lookup the word in our rb tree and add if it doesn't exist. /
667	if (rbt_search(query->word_freqs, &parent, word) != `0`) {
668	fts_word_freq_t word_freq;
669
670	memset(&word_freq, `0`, sizeof(word_freq));
671
672	fts_string_dup(&word_freq.word, word, query->heap);
673
674	word_freq.doc_count = `0`;
675
676	word_freq.doc_freqs = rbt_create(
677	sizeof(fts_doc_freq_t), fts_freq_doc_id_cmp);
678
679	parent.last = rbt_add_node(
680	query->word_freqs, &parent, &word_freq);
681
682	query->total_size += word->f_len
683	+ SIZEOF_RBT_CREATE
684	+ SIZEOF_RBT_NODE_ADD
685	+ sizeof(fts_word_freq_t);
686	}
687
688	return(rbt_value(fts_word_freq_t, parent.last));
689	}
690
691	/*****************************************************************//**
692	Add a doc id if it doesn't exist, to the doc freq RB tree.
693	@return pointer to word /*
694	static
695	fts_doc_freq_t*
696	fts_query_add_doc_freq(
697	/===================/
698	fts_query_t* query, /!< in: query instance /
699	ib_rbt_t* doc_freqs, /!< in: rb tree of fts_doc_freq_t /
700	doc_id_t doc_id) /!< in: doc id to add /
701	{
702	ib_rbt_bound_t parent;
703
704	/ Lookup the doc id in our rb tree and add if it doesn't exist. /
705	if (rbt_search(doc_freqs, &parent, &doc_id) != `0`) {
706	fts_doc_freq_t doc_freq;
707
708	memset(&doc_freq, `0`, sizeof(doc_freq));
709
710	doc_freq.freq = `0`;
711	doc_freq.doc_id = doc_id;
712
713	parent.last = rbt_add_node(doc_freqs, &parent, &doc_freq);
714
715	query->total_size += SIZEOF_RBT_NODE_ADD
716	+ sizeof(fts_doc_freq_t);
717	}
718
719	return(rbt_value(fts_doc_freq_t, parent.last));
720	}
721
722	/*****************************************************************//**
723	Add the doc id to the query set only if it's not in the
724	deleted array. /*
725	static
726	void
727	fts_query_union_doc_id(
728	/===================/
729	fts_query_t* query, /!< in: query instance /
730	doc_id_t doc_id, /!< in: the doc id to add /
731	fts_rank_t rank) /!< in: if non-zero, it is the*
732	rank associated with the doc_id /*
733	{
734	ib_rbt_bound_t parent;
735	ulint size = ib_vector_size(query->deleted->doc_ids);
736	fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data;
737
738	/ Check if the doc id is deleted and it's not already in our set. /
739	if (fts_bsearch(array, `0`, static_cast<int>(size), doc_id) < `0`
740	&& rbt_search(query->doc_ids, &parent, &doc_id) != `0`) {
741
742	fts_ranking_t ranking;
743
744	ranking.rank = rank;
745	ranking.doc_id = doc_id;
746	fts_ranking_words_create(query, &ranking);
747
748	rbt_add_node(query->doc_ids, &parent, &ranking);
749
750	query->total_size += SIZEOF_RBT_NODE_ADD
751	+ sizeof(fts_ranking_t) + RANKING_WORDS_INIT_LEN;
752	}
753	}
754
755	/*****************************************************************//**
756	Remove the doc id from the query set only if it's not in the
757	deleted set. /*
758	static
759	void
760	fts_query_remove_doc_id(
761	/====================/
762	fts_query_t* query, /!< in: query instance /
763	doc_id_t doc_id) /!< in: the doc id to add /
764	{
765	ib_rbt_bound_t parent;
766	ulint size = ib_vector_size(query->deleted->doc_ids);
767	fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data;
768
769	/ Check if the doc id is deleted and it's in our set. /
770	if (fts_bsearch(array, `0`, static_cast<int>(size), doc_id) < `0`
771	&& rbt_search(query->doc_ids, &parent, &doc_id) == `0`) {
772	ut_free(rbt_remove_node(query->doc_ids, parent.last));
773
774	ut_ad(query->total_size >=
775	SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
776	query->total_size -= SIZEOF_RBT_NODE_ADD
777	+ sizeof(fts_ranking_t);
778	}
779	}
780
781	/*****************************************************************//**
782	Find the doc id in the query set but not in the deleted set, artificialy
783	downgrade or upgrade its ranking by a value and make/initialize its ranking
784	under or above its normal range 0 to 1. This is used for Boolean Search
785	operator such as Negation operator, which makes word's contribution to the
786	row's relevance to be negative /*
787	static
788	void
789	fts_query_change_ranking(
790	/====================/
791	fts_query_t* query, /!< in: query instance /
792	doc_id_t doc_id, /!< in: the doc id to add /
793	ibool downgrade) /!< in: Whether to downgrade ranking /
794	{
795	ib_rbt_bound_t parent;
796	ulint size = ib_vector_size(query->deleted->doc_ids);
797	fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data;
798
799	/ Check if the doc id is deleted and it's in our set. /
800	if (fts_bsearch(array, `0`, static_cast<int>(size), doc_id) < `0`
801	&& rbt_search(query->doc_ids, &parent, &doc_id) == `0`) {
802
803	fts_ranking_t* ranking;
804
805	ranking = rbt_value(fts_ranking_t, parent.last);
806
807	ranking->rank += downgrade ? RANK_DOWNGRADE : RANK_UPGRADE;
808
809	/ Allow at most 2 adjustment by RANK_DOWNGRADE (-0.5)*
810	and RANK_UPGRADE (0.5) /*
811	if (ranking->rank >= `1.0F`) {
812	ranking->rank = `1.0F`;
813	} else if (ranking->rank <= -`1.0F`) {
814	ranking->rank = -`1.0F`;
815	}
816	}
817	}
818
819	/*****************************************************************//**
820	Check the doc id in the query set only if it's not in the
821	deleted array. The doc ids that were found are stored in
822	another rb tree (fts_query_t::intersect). /*
823	static
824	void
825	fts_query_intersect_doc_id(
826	/=======================/
827	fts_query_t* query, /!< in: query instance /
828	doc_id_t doc_id, /!< in: the doc id to add /
829	fts_rank_t rank) /!< in: if non-zero, it is the*
830	rank associated with the doc_id /*
831	{
832	ib_rbt_bound_t parent;
833	ulint size = ib_vector_size(query->deleted->doc_ids);
834	fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data;
835	fts_ranking_t* ranking= NULL;
836
837	/ There are three types of intersect:*
838	1. '+a': doc_ids is empty, add doc into intersect if it matches 'a'.
839	2. 'a +b': docs match 'a' is in doc_ids, add doc into intersect
840	if it matches 'b'. if the doc is also in doc_ids, then change the
841	doc's rank, and add 'a' in doc's words.
842	3. '+a +b': docs matching '+a' is in doc_ids, add doc into intsersect
843	if it matches 'b' and it's in doc_ids.(multi_exist = true). /*
844
845	/ Check if the doc id is deleted and it's in our set /
846	if (fts_bsearch(array, `0`, static_cast<int>(size), doc_id) < `0`) {
847	fts_ranking_t new_ranking;
848
849	if (rbt_search(query->doc_ids, &parent, &doc_id) != `0`) {
850	if (query->multi_exist) {
851	return;
852	} else {
853	new_ranking.words = NULL;
854	}
855	} else {
856	ranking = rbt_value(fts_ranking_t, parent.last);
857
858	/ We've just checked the doc id before /
859	if (ranking->words == NULL) {
860	ut_ad(rbt_search(query->intersection, &parent,
861	ranking) == `0`);
862	return;
863	}
864
865	/ Merge rank /
866	rank += ranking->rank;
867	if (rank >= `1.0F`) {
868	rank = `1.0F`;
869	} else if (rank <= -`1.0F`) {
870	rank = -`1.0F`;
871	}
872
873	/ Take words /
874	new_ranking.words = ranking->words;
875	new_ranking.words_len = ranking->words_len;
876	}
877
878	new_ranking.rank = rank;
879	new_ranking.doc_id = doc_id;
880
881	if (rbt_search(query->intersection, &parent,
882	&new_ranking) != `0`) {
883	if (new_ranking.words == NULL) {
884	fts_ranking_words_create(query, &new_ranking);
885
886	query->total_size += RANKING_WORDS_INIT_LEN;
887	} else {
888	/ Note that the intersection has taken*
889	ownership of the ranking data. /*
890	ranking->words = NULL;
891	}
892
893	rbt_add_node(query->intersection,
894	&parent, &new_ranking);
895
896	query->total_size += SIZEOF_RBT_NODE_ADD
897	+ sizeof(fts_ranking_t);
898	}
899	}
900	}
901
902	/*****************************************************************//**
903	Free the document ranking rb tree. /*
904	static
905	void
906	fts_query_free_doc_ids(
907	/===================/
908	fts_query_t* query, /!< in: query instance /
909	ib_rbt_t* doc_ids) /!< in: rb tree to free /
910	{
911	const ib_rbt_node_t* node;
912
913	for (node = rbt_first(doc_ids); node; node = rbt_first(doc_ids)) {
914
915	fts_ranking_t* ranking;
916
917	ranking = rbt_value(fts_ranking_t, node);
918
919	if (ranking->words) {
920	ranking->words = NULL;
921	}
922
923	ut_free(rbt_remove_node(doc_ids, node));
924
925	ut_ad(query->total_size >=
926	SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
927	query->total_size -= SIZEOF_RBT_NODE_ADD
928	+ sizeof(fts_ranking_t);
929	}
930
931	rbt_free(doc_ids);
932
933	ut_ad(query->total_size >= SIZEOF_RBT_CREATE);
934	query->total_size -= SIZEOF_RBT_CREATE;
935	}
936
937	/*****************************************************************//**
938	Add the word to the documents "list" of matching words from
939	the query. We make a copy of the word from the query heap. /*
940	static
941	void
942	fts_query_add_word_to_document(
943	/===========================/
944	fts_query_t* query, /!< in: query to update /
945	doc_id_t doc_id, /!< in: the document to update /
946	const fts_string_t* word) /!< in: the token to add /
947	{
948	ib_rbt_bound_t parent;
949	fts_ranking_t* ranking = NULL;
950
951	if (query->flags == FTS_OPT_RANKING) {
952	return;
953	}
954
955	/ First we search the intersection RB tree as it could have*
956	taken ownership of the words rb tree instance. /*
957	if (query->intersection
958	&& rbt_search(query->intersection, &parent, &doc_id) == `0`) {
959
960	ranking = rbt_value(fts_ranking_t, parent.last);
961	}
962
963	if (ranking == NULL
964	&& rbt_search(query->doc_ids, &parent, &doc_id) == `0`) {
965
966	ranking = rbt_value(fts_ranking_t, parent.last);
967	}
968
969	if (ranking != NULL) {
970	fts_ranking_words_add(query, ranking, word);
971	}
972	}
973
974	/*****************************************************************//**
975	Check the node ilist. /*
976	static
977	void
978	fts_query_check_node(
979	/=================/
980	fts_query_t* query, /!< in: query to update /
981	const fts_string_t* token, /!< in: the token to search /
982	const fts_node_t* node) /!< in: node to check /
983	{
984	/ Skip nodes whose doc ids are out range. /
985	if (query->oper == FTS_EXIST
986	&& ((query->upper_doc_id > `0`
987	&& node->first_doc_id > query->upper_doc_id)
988	\|\| (query->lower_doc_id > `0`
989	&& node->last_doc_id < query->lower_doc_id))) {
990
991	/ Ignore /
992
993	} else {
994	int ret;
995	ib_rbt_bound_t parent;
996	ulint ilist_size = node->ilist_size;
997	fts_word_freq_t*word_freqs;
998
999	/ The word must exist. /
1000	ret = rbt_search(query->word_freqs, &parent, token);
1001	ut_a(ret == `0`);
1002
1003	word_freqs = rbt_value(fts_word_freq_t, parent.last);
1004
1005	query->error = fts_query_filter_doc_ids(
1006	query, token, word_freqs, node,
1007	node->ilist, ilist_size, TRUE);
1008	}
1009	}
1010
1011	/***************************************************************//**
1012	Search index cache for word with wildcard match.
1013	@return number of words matched /*
1014	static
1015	ulint
1016	fts_cache_find_wildcard(
1017	/====================/
1018	fts_query_t* query, /!< in: query instance /
1019	const fts_index_cache_tindex_cache, /!< in: cache to search /*
1020	const fts_string_t* token) /!< in: token to search /
1021	{
1022	ib_rbt_bound_t parent;
1023	const ib_vector_t* nodes = NULL;
1024	fts_string_t srch_text;
1025	byte term[FTS_MAX_WORD_LEN + `1`];
1026	ulint num_word = `0`;
1027
1028	srch_text.f_len = (token->f_str[token->f_len - `1`] == `'%'`)
1029	? token->f_len - `1`
1030	: token->f_len;
1031
1032	strncpy((char) term, (char**) token->f_str, srch_text.f_len);
1033	term[srch_text.f_len] = `'\0'`;
1034	srch_text.f_str = term;
1035
1036	/ Lookup the word in the rb tree /
1037	if (rbt_search_cmp(index_cache->words, &parent, &srch_text, NULL,
1038	innobase_fts_text_cmp_prefix) == `0`) {
1039	const fts_tokenizer_word_t* word;
1040	ulint i;
1041	const ib_rbt_node_t* cur_node;
1042	ibool forward = FALSE;
1043
1044	word = rbt_value(fts_tokenizer_word_t, parent.last);
1045	cur_node = parent.last;
1046
1047	while (innobase_fts_text_cmp_prefix(
1048	index_cache->charset, &srch_text, &word->text) == `0`) {
1049
1050	nodes = word->nodes;
1051
1052	for (i = `0`; nodes && i < ib_vector_size(nodes); ++i) {
1053	int ret;
1054	const fts_node_t* node;
1055	ib_rbt_bound_t freq_parent;
1056	fts_word_freq_t* word_freqs;
1057
1058	node = static_cast<const fts_node_t*>(
1059	ib_vector_get_const(nodes, i));
1060
1061	ret = rbt_search(query->word_freqs,
1062	&freq_parent,
1063	&srch_text);
1064
1065	ut_a(ret == `0`);
1066
1067	word_freqs = rbt_value(
1068	fts_word_freq_t,
1069	freq_parent.last);
1070
1071	query->error = fts_query_filter_doc_ids(
1072	query, &srch_text,
1073	word_freqs, node,
1074	node->ilist, node->ilist_size, TRUE);
1075
1076	if (query->error != DB_SUCCESS) {
1077	return(`0`);
1078	}
1079	}
1080
1081	num_word++;
1082
1083	if (!forward) {
1084	cur_node = rbt_prev(
1085	index_cache->words, cur_node);
1086	} else {
1087	cont_search:
1088	cur_node = rbt_next(
1089	index_cache->words, cur_node);
1090	}
1091
1092	if (!cur_node) {
1093	break;
1094	}
1095
1096	word = rbt_value(fts_tokenizer_word_t, cur_node);
1097	}
1098
1099	if (!forward) {
1100	forward = TRUE;
1101	cur_node = parent.last;
1102	goto cont_search;
1103	}
1104	}
1105
1106	return(num_word);
1107	}
1108
1109	/***************************************************************//**
1110	Set difference.
1111	@return DB_SUCCESS if all go well /*
1112	static MY_ATTRIBUTE((nonnull, warn_unused_result))
1113	dberr_t
1114	fts_query_difference(
1115	/=================/
1116	fts_query_t* query, /!< in: query instance /
1117	const fts_string_t* token) /!< in: token to search /
1118	{
1119	ulint n_doc_ids= `0`;
1120	trx_t* trx = query->trx;
1121	dict_table_t* table = query->index->table;
1122
1123	ut_a(query->oper == FTS_IGNORE);
1124
1125	#ifdef FTS_INTERNAL_DIAG_PRINT
1126	{
1127	ib::info out;
1128	out << "DIFFERENCE: Searching: '";
1129	out.write(token->f_str, token->f_len);
1130	out << "'";
1131	}
1132	#endif
1133
1134	if (query->doc_ids) {
1135	n_doc_ids = rbt_size(query->doc_ids);
1136	}
1137
1138	/ There is nothing we can substract from an empty set. /
1139	if (query->doc_ids && !rbt_empty(query->doc_ids)) {
1140	ulint i;
1141	fts_fetch_t fetch;
1142	const ib_vector_t* nodes;
1143	const fts_index_cache_t*index_cache;
1144	que_t* graph = NULL;
1145	fts_cache_t* cache = table->fts->cache;
1146	dberr_t error;
1147
1148	rw_lock_x_lock(&cache->lock);
1149
1150	index_cache = fts_find_index_cache(cache, query->index);
1151
1152	/ Must find the index cache /
1153	ut_a(index_cache != NULL);
1154
1155	/ Search the cache for a matching word first. /
1156	if (query->cur_node->term.wildcard
1157	&& query->flags != FTS_PROXIMITY
1158	&& query->flags != FTS_PHRASE) {
1159	fts_cache_find_wildcard(query, index_cache, token);
1160	} else {
1161	nodes = fts_cache_find_word(index_cache, token);
1162
1163	for (i = `0`; nodes && i < ib_vector_size(nodes)
1164	&& query->error == DB_SUCCESS; ++i) {
1165	const fts_node_t* node;
1166
1167	node = static_cast<const fts_node_t*>(
1168	ib_vector_get_const(nodes, i));
1169
1170	fts_query_check_node(query, token, node);
1171	}
1172	}
1173
1174	rw_lock_x_unlock(&cache->lock);
1175
1176	/ error is passed by 'query->error' /
1177	if (query->error != DB_SUCCESS) {
1178	ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1179	return(query->error);
1180	}
1181
1182	/ Setup the callback args for filtering and*
1183	consolidating the ilist. /*
1184	fetch.read_arg = query;
1185	fetch.read_record = fts_query_index_fetch_nodes;
1186
1187	error = fts_index_fetch_nodes(
1188	trx, &graph, &query->fts_index_table, token, &fetch);
1189
1190	/ DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' /
1191	ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1192	if (error != DB_SUCCESS) {
1193	query->error = error;
1194	}
1195
1196	fts_que_graph_free(graph);
1197	}
1198
1199	/ The size can't increase. /
1200	ut_a(rbt_size(query->doc_ids) <= n_doc_ids);
1201
1202	return(query->error);
1203	}
1204
1205	/***************************************************************//**
1206	Intersect the token doc ids with the current set.
1207	@return DB_SUCCESS if all go well /*
1208	static MY_ATTRIBUTE((nonnull, warn_unused_result))
1209	dberr_t
1210	fts_query_intersect(
1211	/================/
1212	fts_query_t* query, /!< in: query instance /
1213	const fts_string_t* token) /!< in: the token to search /
1214	{
1215	trx_t* trx = query->trx;
1216	dict_table_t* table = query->index->table;
1217
1218	ut_a(query->oper == FTS_EXIST);
1219
1220	#ifdef FTS_INTERNAL_DIAG_PRINT
1221	{
1222	ib::info out;
1223	out << "INTERSECT: Searching: '";
1224	out.write(token->f_str, token->f_len);
1225	out << "'";
1226	}
1227	#endif
1228
1229	/ If the words set is not empty and multi exist is true,*
1230	we know the intersection set is empty in advance. /*
1231	if (!(rbt_empty(query->doc_ids) && query->multi_exist)) {
1232	ulint n_doc_ids = `0`;
1233	ulint i;
1234	fts_fetch_t fetch;
1235	const ib_vector_t* nodes;
1236	const fts_index_cache_t*index_cache;
1237	que_t* graph = NULL;
1238	fts_cache_t* cache = table->fts->cache;
1239	dberr_t error;
1240
1241	ut_a(!query->intersection);
1242
1243	n_doc_ids = rbt_size(query->doc_ids);
1244
1245	/ Create the rb tree that will hold the doc ids of*
1246	the intersection. /*
1247	query->intersection = rbt_create(
1248	sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
1249
1250	query->total_size += SIZEOF_RBT_CREATE;
1251
1252	/ This is to avoid decompressing the ilist if the*
1253	node's ilist doc ids are out of range. /*
1254	if (!rbt_empty(query->doc_ids) && query->multi_exist) {
1255	const ib_rbt_node_t* node;
1256	doc_id_t* doc_id;
1257
1258	node = rbt_first(query->doc_ids);
1259	doc_id = rbt_value(doc_id_t, node);
1260	query->lower_doc_id = *doc_id;
1261
1262	node = rbt_last(query->doc_ids);
1263	doc_id = rbt_value(doc_id_t, node);
1264	query->upper_doc_id = *doc_id;
1265
1266	} else {
1267	query->lower_doc_id = `0`;
1268	query->upper_doc_id = `0`;
1269	}
1270
1271	/ Search the cache for a matching word first. /
1272
1273	rw_lock_x_lock(&cache->lock);
1274
1275	/ Search for the index specific cache. /
1276	index_cache = fts_find_index_cache(cache, query->index);
1277
1278	/ Must find the index cache. /
1279	ut_a(index_cache != NULL);
1280
1281	if (query->cur_node->term.wildcard) {
1282	/ Wildcard search the index cache /
1283	fts_cache_find_wildcard(query, index_cache, token);
1284	} else {
1285	nodes = fts_cache_find_word(index_cache, token);
1286
1287	for (i = `0`; nodes && i < ib_vector_size(nodes)
1288	&& query->error == DB_SUCCESS; ++i) {
1289	const fts_node_t* node;
1290
1291	node = static_cast<const fts_node_t*>(
1292	ib_vector_get_const(nodes, i));
1293
1294	fts_query_check_node(query, token, node);
1295	}
1296	}
1297
1298	rw_lock_x_unlock(&cache->lock);
1299
1300	/ error is passed by 'query->error' /
1301	if (query->error != DB_SUCCESS) {
1302	ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1303	return(query->error);
1304	}
1305
1306	/ Setup the callback args for filtering and*
1307	consolidating the ilist. /*
1308	fetch.read_arg = query;
1309	fetch.read_record = fts_query_index_fetch_nodes;
1310
1311	error = fts_index_fetch_nodes(
1312	trx, &graph, &query->fts_index_table, token, &fetch);
1313
1314	/ DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' /
1315	ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1316	if (error != DB_SUCCESS) {
1317	query->error = error;
1318	}
1319
1320	fts_que_graph_free(graph);
1321
1322	if (query->error == DB_SUCCESS) {
1323	/ Make the intesection (rb tree) the current doc id*
1324	set and free the old set. /*
1325	fts_query_free_doc_ids(query, query->doc_ids);
1326	query->doc_ids = query->intersection;
1327	query->intersection = NULL;
1328
1329	ut_a(!query->multi_exist \|\| (query->multi_exist
1330	&& rbt_size(query->doc_ids) <= n_doc_ids));
1331	}
1332	}
1333
1334	return(query->error);
1335	}
1336
1337	/***************************************************************//**
1338	Query index cache.
1339	@return DB_SUCCESS if all go well /*
1340	static
1341	dberr_t
1342	fts_query_cache(
1343	/============/
1344	fts_query_t* query, /!< in/out: query instance /
1345	const fts_string_t* token) /!< in: token to search /
1346	{
1347	const fts_index_cache_t*index_cache;
1348	dict_table_t* table = query->index->table;
1349	fts_cache_t* cache = table->fts->cache;
1350
1351	/ Search the cache for a matching word first. /
1352	rw_lock_x_lock(&cache->lock);
1353
1354	/ Search for the index specific cache. /
1355	index_cache = fts_find_index_cache(cache, query->index);
1356
1357	/ Must find the index cache. /
1358	ut_a(index_cache != NULL);
1359
1360	if (query->cur_node->term.wildcard
1361	&& query->flags != FTS_PROXIMITY
1362	&& query->flags != FTS_PHRASE) {
1363	/ Wildcard search the index cache /
1364	fts_cache_find_wildcard(query, index_cache, token);
1365	} else {
1366	const ib_vector_t* nodes;
1367	ulint i;
1368
1369	nodes = fts_cache_find_word(index_cache, token);
1370
1371	for (i = `0`; nodes && i < ib_vector_size(nodes)
1372	&& query->error == DB_SUCCESS; ++i) {
1373	const fts_node_t* node;
1374
1375	node = static_cast<const fts_node_t*>(
1376	ib_vector_get_const(nodes, i));
1377
1378	fts_query_check_node(query, token, node);
1379	}
1380	}
1381
1382	rw_lock_x_unlock(&cache->lock);
1383
1384	return(query->error);
1385	}
1386
1387	/***************************************************************//**
1388	Set union.
1389	@return DB_SUCCESS if all go well /*
1390	static MY_ATTRIBUTE((nonnull, warn_unused_result))
1391	dberr_t
1392	fts_query_union(
1393	/============/
1394	fts_query_t* query, /!< in: query instance /
1395	fts_string_t* token) /!< in: token to search /
1396	{
1397	fts_fetch_t fetch;
1398	ulint n_doc_ids = `0`;
1399	trx_t* trx = query->trx;
1400	que_t* graph = NULL;
1401	dberr_t error;
1402
1403	ut_a(query->oper == FTS_NONE \|\| query->oper == FTS_DECR_RATING \|\|
1404	query->oper == FTS_NEGATE \|\| query->oper == FTS_INCR_RATING);
1405
1406	#ifdef FTS_INTERNAL_DIAG_PRINT
1407	{
1408	ib::info out;
1409	out << "UNION: Searching: '";
1410	out.write(token->f_str, token->f_len);
1411	out << "'";
1412	}
1413	#endif
1414
1415	if (query->doc_ids) {
1416	n_doc_ids = rbt_size(query->doc_ids);
1417	}
1418
1419	if (token->f_len == `0`) {
1420	return(query->error);
1421	}
1422
1423	fts_query_cache(query, token);
1424
1425	/ Setup the callback args for filtering and*
1426	consolidating the ilist. /*
1427	fetch.read_arg = query;
1428	fetch.read_record = fts_query_index_fetch_nodes;
1429
1430	/ Read the nodes from disk. /
1431	error = fts_index_fetch_nodes(
1432	trx, &graph, &query->fts_index_table, token, &fetch);
1433
1434	/ DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' /
1435	ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
1436	if (error != DB_SUCCESS) {
1437	query->error = error;
1438	}
1439
1440	fts_que_graph_free(graph);
1441
1442	if (query->error == DB_SUCCESS) {
1443
1444	/ The size can't decrease. /
1445	ut_a(rbt_size(query->doc_ids) >= n_doc_ids);
1446
1447	/ Calulate the number of doc ids that were added to*
1448	the current doc id set. /*
1449	if (query->doc_ids) {
1450	n_doc_ids = rbt_size(query->doc_ids) - n_doc_ids;
1451	}
1452	}
1453
1454	return(query->error);
1455	}
1456
1457	/***************************************************************//**
1458	Depending upon the current query operator process the doc id.
1459	return DB_SUCCESS if all go well
1460	or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT /*
1461	static
1462	dberr_t
1463	fts_query_process_doc_id(
1464	/=====================/
1465	fts_query_t* query, /!< in: query instance /
1466	doc_id_t doc_id, /!< in: doc id to process /
1467	fts_rank_t rank) /!< in: if non-zero, it is the*
1468	rank associated with the doc_id /*
1469	{
1470	if (query->flags == FTS_OPT_RANKING) {
1471	return(DB_SUCCESS);
1472	}
1473
1474	switch (query->oper) {
1475	case FTS_NONE:
1476	fts_query_union_doc_id(query, doc_id, rank);
1477	break;
1478
1479	case FTS_EXIST:
1480	fts_query_intersect_doc_id(query, doc_id, rank);
1481	break;
1482
1483	case FTS_IGNORE:
1484	fts_query_remove_doc_id(query, doc_id);
1485	break;
1486
1487	case FTS_NEGATE:
1488	fts_query_change_ranking(query, doc_id, TRUE);
1489	break;
1490
1491	case FTS_DECR_RATING:
1492	fts_query_union_doc_id(query, doc_id, rank);
1493	fts_query_change_ranking(query, doc_id, TRUE);
1494	break;
1495
1496	case FTS_INCR_RATING:
1497	fts_query_union_doc_id(query, doc_id, rank);
1498	fts_query_change_ranking(query, doc_id, FALSE);
1499	break;
1500
1501	default:
1502	ut_error;
1503	}
1504
1505	if (query->total_size > fts_result_cache_limit) {
1506	return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
1507	} else {
1508	return(DB_SUCCESS);
1509	}
1510	}
1511
1512	/***************************************************************//**
1513	Merge two result sets. /*
1514	static
1515	dberr_t
1516	fts_merge_doc_ids(
1517	/==============/
1518	fts_query_t* query, /!< in,out: query instance /
1519	const ib_rbt_t* doc_ids) /!< in: result set to merge /
1520	{
1521	const ib_rbt_node_t* node;
1522
1523	DBUG_ENTER("fts_merge_doc_ids");
1524
1525	ut_a(!query->intersection);
1526
1527	/ To process FTS_EXIST operation (intersection), we need*
1528	to create a new result set for fts_query_intersect(). /*
1529	if (query->oper == FTS_EXIST) {
1530
1531	query->intersection = rbt_create(
1532	sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
1533
1534	query->total_size += SIZEOF_RBT_CREATE;
1535	}
1536
1537	/ Merge the elements to the result set. /
1538	for (node = rbt_first(doc_ids); node; node = rbt_next(doc_ids, node)) {
1539	fts_ranking_t* ranking;
1540	ulint pos = `0`;
1541	fts_string_t word;
1542
1543	ranking = rbt_value(fts_ranking_t, node);
1544
1545	query->error = fts_query_process_doc_id(
1546	query, ranking->doc_id, ranking->rank);
1547
1548	if (query->error != DB_SUCCESS) {
1549	DBUG_RETURN(query->error);
1550	}
1551
1552	/ Merge words. Don't need to take operator into account. /
1553	ut_a(ranking->words);
1554	while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
1555	fts_query_add_word_to_document(query, ranking->doc_id,
1556	&word);
1557	}
1558	}
1559
1560	/ If it is an intersection operation, reset query->doc_ids*
1561	to query->intersection and free the old result list. /*
1562	if (query->oper == FTS_EXIST && query->intersection != NULL) {
1563	fts_query_free_doc_ids(query, query->doc_ids);
1564	query->doc_ids = query->intersection;
1565	query->intersection = NULL;
1566	}
1567
1568	DBUG_RETURN(DB_SUCCESS);
1569	}
1570
1571	/***************************************************************//**
1572	Skip non-whitespace in a string. Move ptr to the next word boundary.
1573	@return pointer to first whitespace character or end /*
1574	UNIV_INLINE
1575	byte*
1576	fts_query_skip_word(
1577	/================/
1578	byte* ptr, /!< in: start of scan /
1579	const byte* end) /!< in: pointer to end of string /
1580	{
1581	/ TODO: Does this have to be UTF-8 too ? /
1582	while (ptr < end && !(ispunct(ptr) \|\| isspace(ptr))) {
1583	++ptr;
1584	}
1585
1586	return(ptr);
1587	}
1588
1589	/***************************************************************//**
1590	Check whether the remaining terms in the phrase match the text.
1591	@return TRUE if matched else FALSE /*
1592	static
1593	ibool
1594	fts_query_match_phrase_terms(
1595	/=========================/
1596	fts_phrase_t* phrase, /!< in: phrase to match /
1597	byte** start, /!< in/out: text to search, we can't*
1598	make this const becase we need to
1599	first convert the string to
1600	lowercase /*
1601	const byte* end, /!< in: pointer to the end of*
1602	the string to search /*
1603	mem_heap_t* heap) /!< in: heap /
1604	{
1605	ulint i;
1606	byte* ptr = *start;
1607	const ib_vector_t* tokens = phrase->tokens;
1608	ulint distance = phrase->distance;
1609
1610	/ We check only from the second term onwards, since the first*
1611	must have matched otherwise we wouldn't be here. /*
1612	for (i = `1`; ptr < end && i < ib_vector_size(tokens); / No op /) {
1613	fts_string_t match;
1614	fts_string_t cmp_str;
1615	const fts_string_t* token;
1616	int result;
1617	ulint ret;
1618
1619	ret = innobase_mysql_fts_get_token(
1620	phrase->charset, ptr,
1621	const_cast<byte*>(end), &match);
1622
1623	if (match.f_len > `0`) {
1624	/ Get next token to match. /
1625	token = static_cast<const fts_string_t*>(
1626	ib_vector_get_const(tokens, i));
1627
1628	fts_string_dup(&cmp_str, &match, heap);
1629
1630	result = innobase_fts_text_case_cmp(
1631	phrase->charset, token, &cmp_str);
1632
1633	/ Skip the rest of the tokens if this one doesn't*
1634	match and the proximity distance is exceeded. /*
1635	if (result
1636	&& (distance == ULINT_UNDEFINED
1637	\|\| distance == `0`)) {
1638
1639	break;
1640	}
1641
1642	/ This token matched move to the next token. /
1643	if (result == `0`) {
1644	/ Advance the text to search by the length*
1645	of the last token. /*
1646	ptr += ret;
1647
1648	/ Advance to the next token. /
1649	++i;
1650	} else {
1651
1652	ut_a(distance != ULINT_UNDEFINED);
1653
1654	ptr = fts_query_skip_word(ptr, end);
1655	}
1656
1657	/ Distance can be 0 for exact matches. /
1658	if (distance != ULINT_UNDEFINED && distance > `0`) {
1659	--distance;
1660	}
1661	} else {
1662	ptr += ret;
1663	}
1664	}
1665
1666	*start = ptr;
1667
1668	/ Can't be greater than the number of elements. /
1669	ut_a(i <= ib_vector_size(tokens));
1670
1671	/ This is the case for multiple words. /
1672	if (i == ib_vector_size(tokens)) {
1673	phrase->found = TRUE;
1674	}
1675
1676	return(phrase->found);
1677	}
1678
1679	/***************************************************************//**
1680	Callback function to count the number of words in position ranges,
1681	and see whether the word count is in specified "phrase->distance"
1682	@return true if the number of characters is less than the "distance" /*
1683	static
1684	bool
1685	fts_proximity_is_word_in_range(
1686	/===========================/
1687	const fts_phrase_t*
1688	phrase, /!< in: phrase with the search info /
1689	byte* start, /!< in: text to search /
1690	ulint total_len) /!< in: length of text /
1691	{
1692	fts_proximity_t* proximity_pos = phrase->proximity_pos;
1693
1694	ut_ad(proximity_pos->n_pos == proximity_pos->min_pos.size());
1695	ut_ad(proximity_pos->n_pos == proximity_pos->max_pos.size());
1696
1697	/ Search each matched position pair (with min and max positions)*
1698	and count the number of words in the range /*
1699	for (ulint i = `0`; i < proximity_pos->n_pos; i++) {
1700	ulint cur_pos = proximity_pos->min_pos [i];
1701	ulint n_word = `0`;
1702
1703	ut_ad(proximity_pos->max_pos[i] <= total_len);
1704
1705	/ Walk through words in the range and count them /
1706	while (cur_pos <= proximity_pos->max_pos [i]) {
1707	ulint len;
1708	fts_string_t str;
1709
1710	len = innobase_mysql_fts_get_token(
1711	phrase->charset,
1712	start + cur_pos,
1713	start + total_len, &str);
1714
1715	if (len == `0`) {
1716	break;
1717	}
1718
1719	/ Advances position with "len" bytes /
1720	cur_pos += len;
1721
1722	/ Record the number of words /
1723	if (str.f_n_char > `0`) {
1724	n_word++;
1725	}
1726
1727	if (n_word > phrase->distance) {
1728	break;
1729	}
1730	}
1731
1732	/ Check if the number of words is less than specified*
1733	"distance" /*
1734	if (n_word && n_word <= phrase->distance) {
1735	return(true);
1736	}
1737	}
1738
1739	return(false);
1740	}
1741
1742	/***************************************************************//**
1743	FTS plugin parser 'myql_add_word' callback function for phrase match
1744	Refer to 'st_mysql_ftparser_param' for more detail.
1745	@return 0 if match, or return non-zero /*
1746	static
1747	int
1748	fts_query_match_phrase_add_word_for_parser(
1749	/=======================================/
1750	MYSQL_FTPARSER_PARAM* param, /!< in: parser param /
1751	const char* word, /!< in: token /
1752	int word_len, /!< in: token length /
1753	MYSQL_FTPARSER_BOOLEAN_INFO*)
1754	{
1755	fts_phrase_param_t* phrase_param;
1756	fts_phrase_t* phrase;
1757	const ib_vector_t* tokens;
1758	fts_string_t match;
1759	fts_string_t cmp_str;
1760	const fts_string_t* token;
1761	int result;
1762	mem_heap_t* heap;
1763
1764	phrase_param = static_cast<fts_phrase_param_t*>(param->mysql_ftparam);
1765	heap = phrase_param->heap;
1766	phrase = phrase_param->phrase;
1767	tokens = phrase->tokens;
1768
1769	/ In case plugin parser doesn't check return value /
1770	if (phrase_param->token_index == ib_vector_size(tokens)) {
1771	return(`1`);
1772	}
1773
1774	match.f_str = (uchar *)(word);
1775	match.f_len = ulint(word_len);
1776	match.f_n_char= fts_get_token_size(phrase->charset, word, match.f_len);
1777
1778	if (match.f_len > `0`) {
1779	/ Get next token to match. /
1780	ut_a(phrase_param->token_index < ib_vector_size(tokens));
1781	token = static_cast<const fts_string_t*>(
1782	ib_vector_get_const(tokens, phrase_param->token_index));
1783
1784	fts_string_dup(&cmp_str, &match, heap);
1785
1786	result = innobase_fts_text_case_cmp(
1787	phrase->charset, token, &cmp_str);
1788
1789	if (result == `0`) {
1790	phrase_param->token_index++;
1791	} else {
1792	return(`1`);
1793	}
1794	}
1795
1796	/ Can't be greater than the number of elements. /
1797	ut_a(phrase_param->token_index <= ib_vector_size(tokens));
1798
1799	/ This is the case for multiple words. /
1800	if (phrase_param->token_index == ib_vector_size(tokens)) {
1801	phrase->found = TRUE;
1802	}
1803
1804	return(static_cast<int>(phrase->found));
1805	}
1806
1807	/***************************************************************//**
1808	Check whether the terms in the phrase match the text.
1809	@return TRUE if matched else FALSE /*
1810	static
1811	ibool
1812	fts_query_match_phrase_terms_by_parser(
1813	/===================================/
1814	fts_phrase_param_t* phrase_param, / in/out: phrase param /
1815	st_mysql_ftparser* parser, / in: plugin fts parser /
1816	byte* text, / in: text to check /
1817	ulint len) / in: text length /
1818	{
1819	MYSQL_FTPARSER_PARAM param;
1820
1821	ut_a(parser);
1822
1823	/ Set paramters for param /
1824	param.mysql_parse = fts_tokenize_document_internal;
1825	param.mysql_add_word = fts_query_match_phrase_add_word_for_parser;
1826	param.mysql_ftparam = phrase_param;
1827	param.cs = phrase_param->phrase->charset;
1828	param.doc = reinterpret_cast<char*>(text);
1829	param.length = static_cast<int>(len);
1830	param.mode= MYSQL_FTPARSER_WITH_STOPWORDS;
1831
1832	PARSER_INIT(parser, &param);
1833	parser->parse(&param);
1834	PARSER_DEINIT(parser, &param);
1835
1836	return(phrase_param->phrase->found);
1837	}
1838
1839	/***************************************************************//**
1840	Callback function to fetch and search the document.
1841	@return TRUE if matched else FALSE /*
1842	static
1843	ibool
1844	fts_query_match_phrase(
1845	/===================/
1846	fts_phrase_t* phrase, /!< in: phrase to match /
1847	byte* start, /!< in: text to search, we can't make*
1848	this const becase we need to first
1849	convert the string to lowercase /*
1850	ulint cur_len, /!< in: length of text /
1851	ulint prev_len, /!< in: total length for searched*
1852	doc fields/*
1853	mem_heap_t* heap) / heap /
1854	{
1855	ulint i;
1856	const fts_string_t* first;
1857	const byte* end = start + cur_len;
1858	const ib_vector_t* tokens = phrase->tokens;
1859	const ib_vector_t* positions = phrase->match->positions;
1860
1861	ut_a(!phrase->found);
1862	ut_a(phrase->match->doc_id > `0`);
1863	ut_a(ib_vector_size(tokens) > `0`);
1864	ut_a(ib_vector_size(positions) > `0`);
1865
1866	first = static_cast<const fts_string_t*>(
1867	ib_vector_get_const(tokens, `0`));
1868
1869	ut_a(phrase->match->start < ib_vector_size(positions));
1870
1871	for (i = phrase->match->start; i < ib_vector_size(positions); ++i) {
1872	ulint pos;
1873	byte* ptr = start;
1874
1875	pos = (ulint) ib_vector_get_const(positions, i);
1876
1877	if (pos == ULINT_UNDEFINED) {
1878	break;
1879	}
1880
1881	if (pos < prev_len) {
1882	continue;
1883	}
1884
1885	/ Document positions are calculated from the beginning*
1886	of the first field, need to save the length for each
1887	searched field to adjust the doc position when search
1888	phrases. /*
1889	pos -= prev_len;
1890	ptr = start + pos;
1891
1892	/ Within limits ? /
1893	if (ptr >= end) {
1894	break;
1895	}
1896
1897	if (phrase->parser) {
1898	fts_phrase_param_t phrase_param;
1899
1900	phrase_param.phrase = phrase;
1901	phrase_param.token_index = `0`;
1902	phrase_param.heap = heap;
1903
1904	if (fts_query_match_phrase_terms_by_parser(
1905	&phrase_param,
1906	phrase->parser,
1907	ptr,
1908	ulint(end - ptr))) {
1909	break;
1910	}
1911	} else {
1912	fts_string_t match;
1913	fts_string_t cmp_str;
1914	ulint ret;
1915
1916	match.f_str = ptr;
1917	ret = innobase_mysql_fts_get_token(
1918	phrase->charset, start + pos,
1919	const_cast<byte*>(end), &match);
1920
1921	if (match.f_len == `0`) {
1922	break;
1923	}
1924
1925	fts_string_dup(&cmp_str, &match, heap);
1926
1927	if (innobase_fts_text_case_cmp(
1928	phrase->charset, first, &cmp_str) == `0`) {
1929
1930	/ This is the case for the single word*
1931	in the phrase. /*
1932	if (ib_vector_size(phrase->tokens) == `1`) {
1933	phrase->found = TRUE;
1934	break;
1935	}
1936
1937	ptr += ret;
1938
1939	/ Match the remaining terms in the phrase. /
1940	if (fts_query_match_phrase_terms(phrase, &ptr,
1941	end, heap)) {
1942	break;
1943	}
1944	}
1945	}
1946	}
1947
1948	return(phrase->found);
1949	}
1950
1951	/***************************************************************//**
1952	Callback function to fetch and search the document.
1953	@return whether the phrase is found /*
1954	static
1955	ibool
1956	fts_query_fetch_document(
1957	/=====================/
1958	void* row, /!< in: sel_node_t* /
1959	void* user_arg) /!< in: fts_doc_t* /
1960	{
1961
1962	que_node_t* exp;
1963	sel_node_t* node = static_cast<sel_node_t*>(row);
1964	fts_phrase_t* phrase = static_cast<fts_phrase_t*>(user_arg);
1965	ulint prev_len = `0`;
1966	ulint total_len = `0`;
1967	byte* document_text = NULL;
1968
1969	exp = node->select_list;
1970
1971	phrase->found = FALSE;
1972
1973	/ For proximity search, we will need to get the whole document*
1974	from all fields, so first count the total length of the document
1975	from all the fields /*
1976	if (phrase->proximity_pos) {
1977	while (exp) {
1978	ulint field_len;
1979	dfield_t* dfield = que_node_get_val(exp);
1980	byte* data = static_cast<byte*>(
1981	dfield_get_data(dfield));
1982
1983	if (dfield_is_ext(dfield)) {
1984	ulint local_len = dfield_get_len(dfield);
1985
1986	local_len -= BTR_EXTERN_FIELD_REF_SIZE;
1987
1988	field_len = mach_read_from_4(
1989	data + local_len + BTR_EXTERN_LEN + `4`);
1990	} else {
1991	field_len = dfield_get_len(dfield);
1992	}
1993
1994	if (field_len != UNIV_SQL_NULL) {
1995	total_len += field_len + `1`;
1996	}
1997
1998	exp = que_node_get_next(exp);
1999	}
2000
2001	document_text = static_cast<byte*>(mem_heap_zalloc(
2002	phrase->heap, total_len));
2003
2004	if (!document_text) {
2005	return(FALSE);
2006	}
2007	}
2008
2009	exp = node->select_list;
2010
2011	while (exp) {
2012	dfield_t* dfield = que_node_get_val(exp);
2013	byte* data = static_cast<byte*>(
2014	dfield_get_data(dfield));
2015	ulint cur_len;
2016
2017	if (dfield_is_ext(dfield)) {
2018	data = btr_copy_externally_stored_field(
2019	&cur_len, data, phrase->page_size,
2020	dfield_get_len(dfield), phrase->heap);
2021	} else {
2022	cur_len = dfield_get_len(dfield);
2023	}
2024
2025	if (cur_len != UNIV_SQL_NULL && cur_len != `0`) {
2026	if (phrase->proximity_pos) {
2027	ut_ad(prev_len + cur_len <= total_len);
2028	memcpy(document_text + prev_len, data, cur_len);
2029	} else {
2030	/ For phrase search /
2031	phrase->found =
2032	fts_query_match_phrase(
2033	phrase,
2034	static_cast<byte*>(data),
2035	cur_len, prev_len,
2036	phrase->heap);
2037	}
2038
2039	/ Document positions are calculated from the beginning*
2040	of the first field, need to save the length for each
2041	searched field to adjust the doc position when search
2042	phrases. /*
2043	prev_len += cur_len + `1`;
2044	}
2045
2046	if (phrase->found) {
2047	break;
2048	}
2049
2050	exp = que_node_get_next(exp);
2051	}
2052
2053	if (phrase->proximity_pos) {
2054	ut_ad(prev_len <= total_len);
2055
2056	phrase->found = fts_proximity_is_word_in_range(
2057	phrase, document_text, total_len);
2058	}
2059
2060	return(phrase->found);
2061	}
2062
2063	#if 0
2064	/********************************************************************
2065	Callback function to check whether a record was found or not. /*
2066	static
2067	ibool
2068	fts_query_select(
2069	/=============/
2070	void* row, /!< in: sel_node_t* /
2071	void* user_arg) /!< in: fts_doc_t* /
2072	{
2073	int i;
2074	que_node_t* exp;
2075	sel_node_t* node = row;
2076	fts_select_t* select = user_arg;
2077
2078	ut_a(select->word_freq);
2079	ut_a(select->word_freq->doc_freqs);
2080
2081	exp = node->select_list;
2082
2083	for (i = `0`; exp && !select->found; ++i) {
2084	dfield_t* dfield = que_node_get_val(exp);
2085	void* data = dfield_get_data(dfield);
2086	ulint len = dfield_get_len(dfield);
2087
2088	switch (i) {
2089	case `0`: / DOC_COUNT /
2090	if (len != UNIV_SQL_NULL && len != `0`) {
2091
2092	select->word_freq->doc_count +=
2093	mach_read_from_4(data);
2094	}
2095	break;
2096
2097	case `1`: / ILIST /
2098	if (len != UNIV_SQL_NULL && len != `0`) {
2099
2100	fts_query_find_doc_id(select, data, len);
2101	}
2102	break;
2103
2104	default:
2105	ut_error;
2106	}
2107
2108	exp = que_node_get_next(exp);
2109	}
2110
2111	return(FALSE);
2112	}
2113
2114	/********************************************************************
2115	Read the rows from the FTS index, that match word and where the
2116	doc id is between first and last doc id.
2117	@return DB_SUCCESS if all go well else error code /*
2118	static MY_ATTRIBUTE((nonnull, warn_unused_result))
2119	dberr_t
2120	fts_query_find_term(
2121	/================/
2122	fts_query_t* query, /!< in: FTS query state /
2123	que_t** graph, /!< in: prepared statement /
2124	const fts_string_t* word, /!< in: the word to fetch /
2125	doc_id_t doc_id, /!< in: doc id to match /
2126	ulint* min_pos,/!< in/out: pos found must be*
2127	greater than this minimum value. /*
2128	ibool* found) /!< out: TRUE if found else FALSE /
2129	{
2130	pars_info_t* info;
2131	dberr_t error;
2132	fts_select_t select;
2133	doc_id_t match_doc_id;
2134	trx_t* trx = query->trx;
2135	char table_name[MAX_FULL_NAME_LEN];
2136
2137	trx->op_info = "fetching FTS index matching nodes";
2138
2139	if (*graph) {
2140	info = (*graph)->info;
2141	} else {
2142	ulint selected;
2143
2144	info = pars_info_create();
2145
2146	selected = fts_select_index(*word->f_str);
2147	query->fts_index_table.suffix = fts_get_suffix(selected);
2148
2149	fts_get_table_name(&query->fts_index_table, table_name);
2150	pars_info_bind_id(info, true, "index_table_name", table_name);
2151	}
2152
2153	select.found = FALSE;
2154	select.doc_id = doc_id;
2155	select.min_pos = *min_pos;
2156	select.word_freq = fts_query_add_word_freq(query, word->f_str);
2157
2158	pars_info_bind_function(info, "my_func", fts_query_select, &select);
2159	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
2160
2161	/ Convert to "storage" byte order. /
2162	fts_write_doc_id((byte*) &match_doc_id, doc_id);
2163
2164	fts_bind_doc_id(info, "min_doc_id", &match_doc_id);
2165
2166	fts_bind_doc_id(info, "max_doc_id", &match_doc_id);
2167
2168	if (!*graph) {
2169
2170	*graph = fts_parse_sql(
2171	&query->fts_index_table,
2172	info,
2173	"DECLARE FUNCTION my_func;\n"
2174	"DECLARE CURSOR c IS"
2175	" SELECT doc_count, ilist\n"
2176	" FROM $index_table_name\n"
2177	" WHERE word LIKE :word AND"
2178	" first_doc_id <= :min_doc_id AND"
2179	" last_doc_id >= :max_doc_id\n"
2180	" ORDER BY first_doc_id;\n"
2181	"BEGIN\n"
2182	"\n"
2183	"OPEN c;\n"
2184	"WHILE 1 = 1 LOOP\n"
2185	" FETCH c INTO my_func();\n"
2186	" IF c % NOTFOUND THEN\n"
2187	" EXIT;\n"
2188	" END IF;\n"
2189	"END LOOP;\n"
2190	"CLOSE c;");
2191	}
2192
2193	for (;;) {
2194	error = fts_eval_sql(trx, *graph);
2195
2196	if (error == DB_SUCCESS) {
2197
2198	break; / Exit the loop. /
2199	} else {
2200
2201	if (error == DB_LOCK_WAIT_TIMEOUT) {
2202	ib::warn() << "lock wait timeout reading FTS"
2203	" index. Retrying!";
2204
2205	trx->error_state = DB_SUCCESS;
2206	} else {
2207	ib::error() << error
2208	<< " while reading FTS index.";
2209
2210	break; / Exit the loop. /
2211	}
2212	}
2213	}
2214
2215	/ Value to return /
2216	*found = select.found;
2217
2218	if (*found) {
2219	*min_pos = select.min_pos;
2220	}
2221
2222	return(error);
2223	}
2224
2225	/********************************************************************
2226	Callback aggregator for int columns. /*
2227	static
2228	ibool
2229	fts_query_sum(
2230	/==========/
2231	/!< out: always returns TRUE /
2232	void* row, /!< in: sel_node_t* /
2233	void* user_arg) /!< in: ulint* /
2234	{
2235
2236	que_node_t* exp;
2237	sel_node_t* node = row;
2238	ulint* total = user_arg;
2239
2240	exp = node->select_list;
2241
2242	while (exp) {
2243	dfield_t* dfield = que_node_get_val(exp);
2244	void* data = dfield_get_data(dfield);
2245	ulint len = dfield_get_len(dfield);
2246
2247	if (len != UNIV_SQL_NULL && len != `0`) {
2248	*total += mach_read_from_4(data);
2249	}
2250
2251	exp = que_node_get_next(exp);
2252	}
2253
2254	return(TRUE);
2255	}
2256
2257	/********************************************************************
2258	Calculate the total documents that contain a particular word (term).
2259	@return DB_SUCCESS if all go well else error code /*
2260	static MY_ATTRIBUTE((nonnull, warn_unused_result))
2261	dberr_t
2262	fts_query_total_docs_containing_term(
2263	/=================================/
2264	fts_query_t* query, /!< in: FTS query state /
2265	const fts_string_t* word, /!< in: the word to check /
2266	ulint* total) /!< out: documents containing word /
2267	{
2268	pars_info_t* info;
2269	dberr_t error;
2270	que_t* graph;
2271	ulint selected;
2272	trx_t* trx = query->trx;
2273	char table_name[MAX_FULL_NAME_LEN]
2274
2275	trx->op_info = "fetching FTS index document count";
2276
2277	*total = `0`;
2278
2279	info = pars_info_create();
2280
2281	pars_info_bind_function(info, "my_func", fts_query_sum, total);
2282	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
2283
2284	selected = fts_select_index(*word->f_str);
2285
2286	query->fts_index_table.suffix = fts_get_suffix(selected);
2287
2288	fts_get_table_name(&query->fts_index_table, table_name);
2289
2290	pars_info_bind_id(info, true, "index_table_name", table_name);
2291
2292	graph = fts_parse_sql(
2293	&query->fts_index_table,
2294	info,
2295	"DECLARE FUNCTION my_func;\n"
2296	"DECLARE CURSOR c IS"
2297	" SELECT doc_count\n"
2298	" FROM $index_table_name\n"
2299	" WHERE word = :word"
2300	" ORDER BY first_doc_id;\n"
2301	"BEGIN\n"
2302	"\n"
2303	"OPEN c;\n"
2304	"WHILE 1 = 1 LOOP\n"
2305	" FETCH c INTO my_func();\n"
2306	" IF c % NOTFOUND THEN\n"
2307	" EXIT;\n"
2308	" END IF;\n"
2309	"END LOOP;\n"
2310	"CLOSE c;");
2311
2312	for (;;) {
2313	error = fts_eval_sql(trx, graph);
2314
2315	if (error == DB_SUCCESS) {
2316
2317	break; / Exit the loop. /
2318	} else {
2319
2320	if (error == DB_LOCK_WAIT_TIMEOUT) {
2321	ib::warn() << "lock wait timeout reading FTS"
2322	" index. Retrying!";
2323
2324	trx->error_state = DB_SUCCESS;
2325	} else {
2326	ib::error() << error
2327	<< " while reading FTS index.";
2328
2329	break; / Exit the loop. /
2330	}
2331	}
2332	}
2333
2334	fts_que_graph_free(graph);
2335
2336	return(error);
2337	}
2338
2339	/********************************************************************
2340	Get the total number of words in a documents.
2341	@return DB_SUCCESS if all go well else error code /*
2342	static MY_ATTRIBUTE((nonnull, warn_unused_result))
2343	dberr_t
2344	fts_query_terms_in_document(
2345	/========================/
2346	fts_query_t* query, /!< in: FTS query state /
2347	doc_id_t doc_id, /!< in: the word to check /
2348	ulint* total) /!< out: total words in document /
2349	{
2350	pars_info_t* info;
2351	dberr_t error;
2352	que_t* graph;
2353	doc_id_t read_doc_id;
2354	trx_t* trx = query->trx;
2355	char table_name[MAX_FULL_NAME_LEN];
2356
2357	trx->op_info = "fetching FTS document term count";
2358
2359	*total = `0`;
2360
2361	info = pars_info_create();
2362
2363	pars_info_bind_function(info, "my_func", fts_query_sum, total);
2364
2365	/ Convert to "storage" byte order. /
2366	fts_write_doc_id((byte*) &read_doc_id, doc_id);
2367	fts_bind_doc_id(info, "doc_id", &read_doc_id);
2368
2369	query->fts_index_table.suffix = "DOC_ID";
2370
2371	fts_get_table_name(&query->fts_index_table, table_name);
2372
2373	pars_info_bind_id(info, true, "index_table_name", table_name);
2374
2375	graph = fts_parse_sql(
2376	&query->fts_index_table,
2377	info,
2378	"DECLARE FUNCTION my_func;\n"
2379	"DECLARE CURSOR c IS"
2380	" SELECT count\n"
2381	" FROM $index_table_name\n"
2382	" WHERE doc_id = :doc_id"
2383	" BEGIN\n"
2384	"\n"
2385	"OPEN c;\n"
2386	"WHILE 1 = 1 LOOP\n"
2387	" FETCH c INTO my_func();\n"
2388	" IF c % NOTFOUND THEN\n"
2389	" EXIT;\n"
2390	" END IF;\n"
2391	"END LOOP;\n"
2392	"CLOSE c;");
2393
2394	for (;;) {
2395	error = fts_eval_sql(trx, graph);
2396
2397	if (error == DB_SUCCESS) {
2398
2399	break; / Exit the loop. /
2400	} else {
2401
2402	if (error == DB_LOCK_WAIT_TIMEOUT) {
2403	ib::warn() << "lock wait timeout reading FTS"
2404	" doc id table. Retrying!";
2405
2406	trx->error_state = DB_SUCCESS;
2407	} else {
2408	ib::error() << error << " while reading FTS"
2409	" doc id table.";
2410
2411	break; / Exit the loop. /
2412	}
2413	}
2414	}
2415
2416	fts_que_graph_free(graph);
2417
2418	return(error);
2419	}
2420	#endif
2421
2422	/***************************************************************//**
2423	Retrieve the document and match the phrase tokens.
2424	@return DB_SUCCESS or error code /*
2425	MY_ATTRIBUTE((nonnull(`1`,`2`,`3`,`6`), warn_unused_result))
2426	static
2427	dberr_t
2428	fts_query_match_document(
2429	/=====================/
2430	ib_vector_t* tokens, /!< in: phrase tokens /
2431	fts_get_doc_t* get_doc, /!< in: table and prepared statements /
2432	fts_match_t* match, /!< in: doc id and positions /
2433	ulint distance, /!< in: proximity distance /
2434	st_mysql_ftparser* parser, /!< in: fts plugin parser /
2435	ibool* found) /!< out: TRUE if phrase found /
2436	{
2437	dberr_t error;
2438	fts_phrase_t phrase(get_doc->index_cache->index->table);
2439
2440	phrase.match = match; / Positions to match /
2441	phrase.tokens = tokens; / Tokens to match /
2442	phrase.distance = distance;
2443	phrase.charset = get_doc->index_cache->charset;
2444	phrase.heap = mem_heap_create(`512`);
2445	phrase.parser = parser;
2446
2447	*found = phrase.found = FALSE;
2448
2449	error = fts_doc_fetch_by_doc_id(
2450	get_doc, match->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
2451	fts_query_fetch_document, &phrase);
2452
2453	if (error != DB_SUCCESS) {
2454	ib::error () << "(" << ut_strerr(error)
2455	<< ") matching document.";
2456	} else {
2457	*found = phrase.found;
2458	}
2459
2460	mem_heap_free(phrase.heap);
2461
2462	return(error);
2463	}
2464
2465	/***************************************************************//**
2466	This function fetches the original documents and count the
2467	words in between matching words to see that is in specified distance
2468	@return DB_SUCCESS if all OK /*
2469	static MY_ATTRIBUTE((nonnull, warn_unused_result))
2470	bool
2471	fts_query_is_in_proximity_range(
2472	/============================/
2473	const fts_query_t* query, /!< in: query instance /
2474	fts_match_t** match, /!< in: query instance /
2475	fts_proximity_t* qualified_pos) /!< in: position info for*
2476	qualified ranges /*
2477	{
2478	fts_get_doc_t get_doc;
2479	fts_cache_t* cache = query->index->table->fts->cache;
2480	dberr_t err;
2481
2482	memset(&get_doc, `0x0`, sizeof(get_doc));
2483
2484	rw_lock_x_lock(&cache->lock);
2485	get_doc.index_cache = fts_find_index_cache(cache, query->index);
2486	rw_lock_x_unlock(&cache->lock);
2487	ut_a(get_doc.index_cache != NULL);
2488
2489	fts_phrase_t phrase(get_doc.index_cache->index->table);
2490
2491	phrase.distance = query->distance;
2492	phrase.charset = get_doc.index_cache->charset;
2493	phrase.heap = mem_heap_create(`512`);
2494	phrase.proximity_pos = qualified_pos;
2495	phrase.found = FALSE;
2496
2497	err = fts_doc_fetch_by_doc_id(
2498	&get_doc, match[`0`]->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
2499	fts_query_fetch_document, &phrase);
2500
2501	if (err != DB_SUCCESS) {
2502	ib::error () << "(" << ut_strerr(err) << ") in verification"
2503	" phase of proximity search";
2504	}
2505
2506	/ Free the prepared statement. /
2507	if (get_doc.get_document_graph) {
2508	fts_que_graph_free(get_doc.get_document_graph);
2509	get_doc.get_document_graph = NULL;
2510	}
2511
2512	mem_heap_free(phrase.heap);
2513
2514	return(err == DB_SUCCESS && phrase.found);
2515	}
2516
2517	/***************************************************************//**
2518	Iterate over the matched document ids and search the for the
2519	actual phrase in the text.
2520	@return DB_SUCCESS if all OK /*
2521	static MY_ATTRIBUTE((nonnull, warn_unused_result))
2522	dberr_t
2523	fts_query_search_phrase(
2524	/====================/
2525	fts_query_t* query, /!< in: query instance /
2526	ib_vector_t* orig_tokens, /!< in: tokens to search,*
2527	with any stopwords in the
2528	original phrase /*
2529	ib_vector_t* tokens) /!< in: tokens that does*
2530	not include stopwords and
2531	can be used to calculate
2532	ranking /*
2533	{
2534	ulint i;
2535	fts_get_doc_t get_doc;
2536	ulint n_matched;
2537	fts_cache_t* cache = query->index->table->fts->cache;
2538
2539	n_matched = ib_vector_size(query->matched);
2540
2541	/ Setup the doc retrieval infrastructure. /
2542	memset(&get_doc, `0x0`, sizeof(get_doc));
2543
2544	rw_lock_x_lock(&cache->lock);
2545
2546	get_doc.index_cache = fts_find_index_cache(cache, query->index);
2547
2548	/ Must find the index cache /
2549	ut_a(get_doc.index_cache != NULL);
2550
2551	rw_lock_x_unlock(&cache->lock);
2552
2553	#ifdef FTS_INTERNAL_DIAG_PRINT
2554	ib::info() << "Start phrase search";
2555	#endif
2556
2557	/ Read the document from disk and do the actual*
2558	match, matching documents will be added to the current
2559	doc id set. /*
2560	for (i = `0`; i < n_matched && query->error == DB_SUCCESS; ++i) {
2561	fts_match_t* match;
2562	ibool found = FALSE;
2563
2564	match = static_cast<fts_match_t*>(
2565	ib_vector_get(query->matched, i));
2566
2567	/ Skip the document ids that were filtered out by*
2568	an earlier pass. /*
2569	if (match->doc_id != `0`) {
2570
2571	query->error = fts_query_match_document(
2572	orig_tokens, &get_doc, match,
2573	query->distance, query->parser, &found);
2574
2575	if (query->error == DB_SUCCESS && found) {
2576	ulint z;
2577
2578	query->error = fts_query_process_doc_id(query,
2579	match->doc_id, `0`);
2580	if (query->error != DB_SUCCESS) {
2581	goto func_exit;
2582	}
2583
2584	for (z = `0`; z < ib_vector_size(tokens); z++) {
2585	fts_string_t* token;
2586	token = static_cast<fts_string_t*>(
2587	ib_vector_get(tokens, z));
2588	fts_query_add_word_to_document(
2589	query, match->doc_id, token);
2590	}
2591	}
2592	}
2593	}
2594
2595	func_exit:
2596	/ Free the prepared statement. /
2597	if (get_doc.get_document_graph) {
2598	fts_que_graph_free(get_doc.get_document_graph);
2599	get_doc.get_document_graph = NULL;
2600	}
2601
2602	return(query->error);
2603	}
2604
2605	/* Split the phrase into tokens*
2606	@param[in,out] query query instance
2607	@param[in] node query node to search
2608	@param[in,out] tokens token vector
2609	@param[in,out] orig_tokens original node tokens include stopword
2610	@param[in,out] heap mem heap /*
2611	static
2612	void
2613	fts_query_phrase_split(
2614	fts_query_t* query,
2615	const fts_ast_node_t* node,
2616	ib_vector_t* tokens,
2617	ib_vector_t* orig_tokens,
2618	mem_heap_t* heap)
2619	{
2620	fts_string_t phrase;
2621	ulint len = `0`;
2622	ulint cur_pos = `0`;
2623	fts_ast_node_t* term_node = NULL;
2624
2625	if (node->type == FTS_AST_TEXT) {
2626	phrase.f_str = node->text.ptr->str;
2627	phrase.f_len = node->text.ptr->len;
2628	len = phrase.f_len;
2629	} else {
2630	ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST);
2631	phrase.f_str = NULL;
2632	phrase.f_len = `0`;
2633	term_node = node->list.head;
2634	}
2635
2636	while (true) {
2637	fts_cache_t* cache = query->index->table->fts->cache;
2638	ulint cur_len;
2639	fts_string_t result_str;
2640
2641	if (node->type == FTS_AST_TEXT) {
2642	if (cur_pos >= len) {
2643	break;
2644	}
2645
2646	cur_len = innobase_mysql_fts_get_token(
2647	query->fts_index_table.charset,
2648	reinterpret_cast<const byte*>(phrase.f_str)
2649	+ cur_pos,
2650	reinterpret_cast<const byte*>(phrase.f_str)
2651	+ len,
2652	&result_str);
2653
2654	if (cur_len == `0`) {
2655	break;
2656	}
2657
2658	cur_pos += cur_len;
2659	} else {
2660	ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST);
2661	/ Term node in parser phrase list /
2662	if (term_node == NULL) {
2663	break;
2664	}
2665
2666	ut_a(term_node->type == FTS_AST_TERM);
2667	result_str.f_str = term_node->term.ptr->str;
2668	result_str.f_len = term_node->term.ptr->len;
2669	result_str.f_n_char = fts_get_token_size(
2670	query->fts_index_table.charset,
2671	reinterpret_cast<char*>(result_str.f_str),
2672	result_str.f_len);
2673
2674	term_node = term_node->next;
2675	}
2676
2677	if (result_str.f_n_char == `0`) {
2678	continue;
2679	}
2680
2681	fts_string_t* token = static_cast<fts_string_t*>(
2682	ib_vector_push(tokens, NULL));
2683	fts_string_dup(token, &result_str, heap);
2684
2685	if (fts_check_token(
2686	&result_str,
2687	cache->stopword_info.cached_stopword,
2688	query->fts_index_table.charset)) {
2689	/ Add the word to the RB tree so that we can*
2690	calculate it's frequencey within a document. /*
2691	fts_query_add_word_freq(query, token);
2692	} else {
2693	ib_vector_pop(tokens);
2694	}
2695
2696	/ we will start to store all words including stopwords*
2697	in the "orig_tokens" vector, but skip any leading words
2698	that are stopwords /*
2699	if (!ib_vector_is_empty(tokens)) {
2700	fts_string_t* orig_token = static_cast<fts_string_t*>(
2701	ib_vector_push(orig_tokens, NULL));
2702
2703	orig_token->f_str = token->f_str;
2704	orig_token->f_len = token->f_len;
2705	}
2706	}
2707	}
2708
2709	/***************************************************************//**
2710	Text/Phrase search.
2711	@return DB_SUCCESS or error code /*
2712	static MY_ATTRIBUTE((warn_unused_result))
2713	dberr_t
2714	fts_query_phrase_search(
2715	/====================/
2716	fts_query_t* query, /!< in: query instance /
2717	const fts_ast_node_t* node) /!< in: node to search /
2718	{
2719	ib_vector_t* tokens;
2720	ib_vector_t* orig_tokens;
2721	mem_heap_t* heap = mem_heap_create(sizeof(fts_string_t));
2722	ib_alloc_t* heap_alloc;
2723	ulint num_token;
2724
2725	heap_alloc = ib_heap_allocator_create(heap);
2726
2727	tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), `4`);
2728	orig_tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), `4`);
2729
2730	if (query->distance != ULINT_UNDEFINED && query->distance > `0`) {
2731	query->flags = FTS_PROXIMITY;
2732	} else {
2733	query->flags = FTS_PHRASE;
2734	}
2735
2736	/ Split the phrase into tokens. /
2737	fts_query_phrase_split(query, node, tokens, orig_tokens, heap);
2738
2739	num_token = ib_vector_size(tokens);
2740	if (num_token > MAX_PROXIMITY_ITEM) {
2741	query->error = DB_FTS_TOO_MANY_WORDS_IN_PHRASE;
2742	goto func_exit;
2743	}
2744
2745	ut_ad(ib_vector_size(orig_tokens) >= num_token);
2746
2747	/ Ignore empty strings. /
2748	if (num_token > `0`) {
2749	fts_string_t* token;
2750	fts_fetch_t fetch;
2751	trx_t* trx = query->trx;
2752	fts_ast_oper_t oper = query->oper;
2753	que_t* graph = NULL;
2754	ulint i;
2755	dberr_t error;
2756
2757	/ Create the vector for storing matching document ids*
2758	and the positions of the first token of the phrase. /*
2759	if (!query->matched) {
2760	ib_alloc_t* heap_alloc;
2761
2762	heap_alloc = ib_heap_allocator_create(heap);
2763
2764	if (!(query->flags & FTS_PROXIMITY)
2765	&& !(query->flags & FTS_PHRASE)) {
2766	query->matched = ib_vector_create(
2767	heap_alloc, sizeof(fts_match_t),
2768	`64`);
2769	} else {
2770	ut_a(num_token <= MAX_PROXIMITY_ITEM);
2771	query->match_array =
2772	(ib_vector_t**) mem_heap_alloc(
2773	heap,
2774	num_token *
2775	sizeof(query->matched));
2776
2777	for (i = `0`; i < num_token; i++) {
2778	query->match_array[i] =
2779	ib_vector_create(
2780	heap_alloc, sizeof(fts_match_t),
2781	`64`);
2782	}
2783
2784	query->matched = query->match_array[`0`];
2785	}
2786	}
2787
2788	/ Setup the callback args for filtering and consolidating*
2789	the ilist. /*
2790	fetch.read_arg = query;
2791	fetch.read_record = fts_query_index_fetch_nodes;
2792
2793	for (i = `0`; i < num_token; i++) {
2794	/ Search for the first word from the phrase. /
2795	token = static_cast<fts_string_t*>(
2796	ib_vector_get(tokens, i));
2797
2798	if (query->flags & FTS_PROXIMITY
2799	\|\| query->flags & FTS_PHRASE) {
2800	query->matched = query->match_array[i];
2801	}
2802
2803	error = fts_index_fetch_nodes(
2804	trx, &graph, &query->fts_index_table,
2805	token, &fetch);
2806
2807	/ DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' /
2808	ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
2809	if (error != DB_SUCCESS) {
2810	query->error = error;
2811	}
2812
2813	fts_que_graph_free(graph);
2814	graph = NULL;
2815
2816	fts_query_cache(query, token);
2817
2818	if (!(query->flags & FTS_PHRASE)
2819	&& !(query->flags & FTS_PROXIMITY)) {
2820	break;
2821	}
2822
2823	/ If any of the token can't be found,*
2824	no need to continue match /*
2825	if (ib_vector_is_empty(query->match_array[i])
2826	\|\| query->error != DB_SUCCESS) {
2827	goto func_exit;
2828	}
2829	}
2830
2831	/ Just a single word, no need to fetch the original*
2832	documents to do phrase matching /*
2833	if (ib_vector_size(orig_tokens) == `1`
2834	&& !ib_vector_is_empty(query->match_array[`0`])) {
2835	fts_match_t* match;
2836	ulint n_matched;
2837
2838	n_matched = ib_vector_size(query->match_array[`0`]);
2839
2840	for (i = `0`; i < n_matched; i++) {
2841	match = static_cast<fts_match_t*>(
2842	ib_vector_get(
2843	query->match_array[`0`], i));
2844
2845	query->error = fts_query_process_doc_id(
2846	query, match->doc_id, `0`);
2847	if (query->error != DB_SUCCESS) {
2848	goto func_exit;
2849	}
2850
2851	fts_query_add_word_to_document(
2852	query, match->doc_id, token);
2853	}
2854	query->oper = oper;
2855	goto func_exit;
2856	}
2857
2858	/ If we are doing proximity search, verify the distance*
2859	between all words, and check they are in specified distance. /*
2860	if (query->flags & FTS_PROXIMITY) {
2861	fts_phrase_or_proximity_search(query, tokens);
2862	} else {
2863	ibool matched;
2864
2865	/ Phrase Search case:*
2866	We filter out the doc ids that don't contain
2867	all the tokens in the phrase. It's cheaper to
2868	search the ilist than bringing the documents in
2869	and then doing a search through the text. Isolated
2870	testing shows this also helps in mitigating disruption
2871	of the buffer cache. /*
2872	matched = fts_phrase_or_proximity_search(query, tokens);
2873	query->matched = query->match_array[`0`];
2874
2875	/ Read the actual text in and search for the phrase. /
2876	if (matched) {
2877	ut_ad(query->error == DB_SUCCESS);
2878	query->error = fts_query_search_phrase(
2879	query, orig_tokens, tokens);
2880	}
2881	}
2882
2883	/ Restore original operation. /
2884	query->oper = oper;
2885
2886	if (query->error != DB_SUCCESS) {
2887	goto func_exit;
2888	}
2889	}
2890
2891	func_exit:
2892	mem_heap_free(heap);
2893
2894	/ Don't need it anymore. /
2895	query->matched = NULL;
2896
2897	return(query->error);
2898	}
2899
2900	/***************************************************************//**
2901	Find the word and evaluate.
2902	@return DB_SUCCESS if all go well /*
2903	static MY_ATTRIBUTE((nonnull, warn_unused_result))
2904	dberr_t
2905	fts_query_execute(
2906	/==============/
2907	fts_query_t* query, /!< in: query instance /
2908	fts_string_t* token) /!< in: token to search /
2909	{
2910	switch (query->oper) {
2911	case FTS_NONE:
2912	case FTS_NEGATE:
2913	case FTS_INCR_RATING:
2914	case FTS_DECR_RATING:
2915	query->error = fts_query_union(query, token);
2916	break;
2917
2918	case FTS_EXIST:
2919	query->error = fts_query_intersect(query, token);
2920	break;
2921
2922	case FTS_IGNORE:
2923	query->error = fts_query_difference(query, token);
2924	break;
2925
2926	default:
2927	ut_error;
2928	}
2929
2930	return(query->error);
2931	}
2932
2933	/***************************************************************//**
2934	Create a wildcard string. It's the responsibility of the caller to
2935	free the byte pointer. It's allocated using ut_malloc_nokey().*
2936	@return ptr to allocated memory /*
2937	static
2938	byte*
2939	fts_query_get_token(
2940	/================/
2941	fts_ast_node_t* node, /!< in: the current sub tree /
2942	fts_string_t* token) /!< in: token to create /
2943	{
2944	ulint str_len;
2945	byte* new_ptr = NULL;
2946
2947	str_len = node->term.ptr->len;
2948
2949	ut_a(node->type == FTS_AST_TERM);
2950
2951	token->f_len = str_len;
2952	token->f_str = node->term.ptr->str;
2953
2954	if (node->term.wildcard) {
2955
2956	token->f_str = static_cast<byte*>(ut_malloc_nokey(str_len + `2`));
2957	token->f_len = str_len + `1`;
2958
2959	memcpy(token->f_str, node->term.ptr->str, str_len);
2960
2961	token->f_str[str_len] = `'%'`;
2962	token->f_str[token->f_len] = `0`;
2963
2964	new_ptr = token->f_str;
2965	}
2966
2967	return(new_ptr);
2968	}
2969
2970	/***************************************************************//**
2971	Visit every node of the AST. /*
2972	static
2973	dberr_t
2974	fts_query_visitor(
2975	/==============/
2976	fts_ast_oper_t oper, /!< in: current operator /
2977	fts_ast_node_t* node, /!< in: The root of the current subtree/
2978	void* arg) /!< in: callback arg/
2979	{
2980	byte* ptr;
2981	fts_string_t token;
2982	fts_query_t* query = static_cast<fts_query_t*>(arg);
2983
2984	ut_a(node);
2985	DBUG_ENTER("fts_query_visitor");
2986	DBUG_PRINT("fts", ("nodetype: %s", fts_ast_node_type_get(node->type)));
2987
2988	token.f_n_char = `0`;
2989	query->oper = oper;
2990	query->cur_node = node;
2991
2992	switch (node->type) {
2993	case FTS_AST_TEXT:
2994	case FTS_AST_PARSER_PHRASE_LIST:
2995
2996	if (query->oper == FTS_EXIST) {
2997	ut_ad(query->intersection == NULL);
2998	query->intersection = rbt_create(
2999	sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
3000
3001	query->total_size += SIZEOF_RBT_CREATE;
3002	}
3003
3004	/ Set the current proximity distance. /
3005	query->distance = node->text.distance;
3006
3007	/ Force collection of doc ids and the positions. /
3008	query->collect_positions = TRUE;
3009
3010	query->error = fts_query_phrase_search(query, node);
3011
3012	query->collect_positions = FALSE;
3013
3014	if (query->oper == FTS_EXIST) {
3015	fts_query_free_doc_ids(query, query->doc_ids);
3016	query->doc_ids = query->intersection;
3017	query->intersection = NULL;
3018	}
3019
3020	break;
3021
3022	case FTS_AST_TERM:
3023	token.f_str = node->term.ptr->str;
3024	token.f_len = node->term.ptr->len;
3025
3026	/ Collect wildcard words for QUERY EXPANSION. /
3027	if (node->term.wildcard && query->wildcard_words != NULL) {
3028	ib_rbt_bound_t parent;
3029
3030	if (rbt_search(query->wildcard_words, &parent, &token)
3031	!= `0`) {
3032	fts_string_t word;
3033
3034	fts_string_dup(&word, &token, query->heap);
3035	rbt_add_node(query->wildcard_words, &parent,
3036	&word);
3037	}
3038	}
3039
3040	/ Add the word to our RB tree that will be used to*
3041	calculate this terms per document frequency. /*
3042	fts_query_add_word_freq(query, &token);
3043
3044	ptr = fts_query_get_token(node, &token);
3045	query->error = fts_query_execute(query, &token);
3046
3047	if (ptr) {
3048	ut_free(ptr);
3049	}
3050
3051	break;
3052
3053	case FTS_AST_SUBEXP_LIST:
3054	query->error = fts_ast_visit_sub_exp(node, fts_query_visitor, arg);
3055	break;
3056
3057	default:
3058	ut_error;
3059	}
3060
3061	if (query->oper == FTS_EXIST) {
3062	query->multi_exist = true;
3063	}
3064
3065	DBUG_RETURN(query->error);
3066	}
3067
3068	/* Process (nested) sub-expression, create a new result set to store the*
3069	sub-expression result by processing nodes under current sub-expression
3070	list. Merge the sub-expression result with that of parent expression list.
3071	@param[in,out] node current root node
3072	@param[in,out] visitor callback function
3073	@param[in,out] arg argument for callback
3074	@return DB_SUCCESS if all go well /*
3075	static
3076	dberr_t
3077	fts_ast_visit_sub_exp(
3078	fts_ast_node_t* node,
3079	fts_ast_callback visitor,
3080	void* arg)
3081	{
3082	fts_ast_oper_t cur_oper;
3083	fts_query_t* query = static_cast<fts_query_t*>(arg);
3084	ib_rbt_t* parent_doc_ids;
3085	ib_rbt_t* subexpr_doc_ids;
3086	dberr_t error = DB_SUCCESS;
3087	bool will_be_ignored = false;
3088	bool multi_exist;
3089
3090	DBUG_ENTER("fts_ast_visit_sub_exp");
3091
3092	ut_a(node->type == FTS_AST_SUBEXP_LIST);
3093
3094	cur_oper = query->oper;
3095
3096	/ Save current result set /
3097	parent_doc_ids = query->doc_ids;
3098
3099	/ Create new result set to store the sub-expression result. We*
3100	will merge this result set with the parent after processing. /*
3101	query->doc_ids = rbt_create(sizeof(fts_ranking_t),
3102	fts_ranking_doc_id_cmp);
3103
3104	query->total_size += SIZEOF_RBT_CREATE;
3105
3106	multi_exist = query->multi_exist;
3107	query->multi_exist = false;
3108	/ Process nodes in current sub-expression and store its*
3109	result set in query->doc_ids we created above. /*
3110	error = fts_ast_visit(FTS_NONE, node, visitor,
3111	arg, &will_be_ignored);
3112
3113	/ Reinstate parent node state /
3114	query->multi_exist = multi_exist;
3115	query->oper = cur_oper;
3116
3117	/ Merge the sub-expression result with the parent result set. /
3118	subexpr_doc_ids = query->doc_ids;
3119	query->doc_ids = parent_doc_ids;
3120	if (error == DB_SUCCESS) {
3121	error = fts_merge_doc_ids(query, subexpr_doc_ids);
3122	}
3123
3124	/ Free current result set. Result already merged into parent. /
3125	fts_query_free_doc_ids(query, subexpr_doc_ids);
3126
3127	DBUG_RETURN(error);
3128	}
3129
3130	#if 0
3131	/***************************************************************//***
3132	Check if the doc id exists in the ilist.
3133	@return TRUE if doc id found /*
3134	static
3135	ulint
3136	fts_query_find_doc_id(
3137	/==================/
3138	fts_select_t* select, /!< in/out: contains the doc id to*
3139	find, we update the word freq if
3140	document found /*
3141	void* data, /!< in: doc id ilist /
3142	ulint len) /!< in: doc id ilist size /
3143	{
3144	byte* ptr = data;
3145	doc_id_t doc_id = `0`;
3146	ulint decoded = `0`;
3147
3148	/ Decode the ilist and search for selected doc_id. We also*
3149	calculate the frequency of the word in the document if found. /*
3150	while (decoded < len && !select->found) {
3151	ulint freq = `0`;
3152	ulint min_pos = `0`;
3153	ulint last_pos = `0`;
3154	ulint pos = fts_decode_vlc(&ptr);
3155
3156	/ Add the delta. /
3157	doc_id += pos;
3158
3159	while (*ptr) {
3160	++freq;
3161	last_pos += fts_decode_vlc(&ptr);
3162
3163	/ Only if min_pos is not set and the current*
3164	term exists in a position greater than the
3165	min_pos of the previous term. /*
3166	if (min_pos == `0` && last_pos > select->min_pos) {
3167	min_pos = last_pos;
3168	}
3169	}
3170
3171	/ Skip the end of word position marker. /
3172	++ptr;
3173
3174	/ Bytes decoded so far. /
3175	decoded = ptr - (byte*) data;
3176
3177	/ A word may exist in the document but we only consider a*
3178	match if it exists in a position that is greater than the
3179	position of the previous term. /*
3180	if (doc_id == select->doc_id && min_pos > `0`) {
3181	fts_doc_freq_t* doc_freq;
3182
3183	/ Add the doc id to the doc freq rb tree, if*
3184	the doc id doesn't exist it will be created. /*
3185	doc_freq = fts_query_add_doc_freq(
3186	select->word_freq->doc_freqs, doc_id);
3187
3188	/ Avoid duplicating the frequency tally /
3189	if (doc_freq->freq == `0`) {
3190	doc_freq->freq = freq;
3191	}
3192
3193	select->found = TRUE;
3194	select->min_pos = min_pos;
3195	}
3196	}
3197
3198	return(select->found);
3199	}
3200	#endif
3201
3202	/***************************************************************//**
3203	Read and filter nodes.
3204	@return DB_SUCCESS if all go well,
3205	or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT /*
3206	static
3207	dberr_t
3208	fts_query_filter_doc_ids(
3209	/=====================/
3210	fts_query_t* query, /!< in: query instance /
3211	const fts_string_t* word, /!< in: the current word /
3212	fts_word_freq_t* word_freq, /!< in/out: word frequency /
3213	const fts_node_t* node, /!< in: current FTS node /
3214	void* data, /!< in: doc id ilist /
3215	ulint len, /!< in: doc id ilist size /
3216	ibool calc_doc_count) /!< in: whether to remember doc count /
3217	{
3218	byte* ptr = static_cast<byte*>(data);
3219	doc_id_t doc_id = `0`;
3220	ulint decoded = `0`;
3221	ib_rbt_t* doc_freqs = word_freq->doc_freqs;
3222
3223	/ Decode the ilist and add the doc ids to the query doc_id set. /
3224	while (decoded < len) {
3225	ulint freq = `0`;
3226	fts_doc_freq_t* doc_freq;
3227	fts_match_t* match = NULL;
3228	ulint last_pos = `0`;
3229	ulint pos = fts_decode_vlc(&ptr);
3230
3231	/ Some sanity checks. /
3232	if (doc_id == `0`) {
3233	ut_a(pos == node->first_doc_id);
3234	}
3235
3236	/ Add the delta. /
3237	doc_id += pos;
3238
3239	if (calc_doc_count) {
3240	word_freq->doc_count++;
3241	}
3242
3243	/ We simply collect the matching instances here. /
3244	if (query->collect_positions) {
3245	ib_alloc_t* heap_alloc;
3246
3247	/ Create a new fts_match_t instance. /
3248	match = static_cast<fts_match_t*>(
3249	ib_vector_push(query->matched, NULL));
3250
3251	match->start = `0`;
3252	match->doc_id = doc_id;
3253	heap_alloc = ib_vector_allocator(query->matched);
3254
3255	/ Allocate from the same heap as the*
3256	parent container. /*
3257	match->positions = ib_vector_create(
3258	heap_alloc, sizeof(ulint), `64`);
3259
3260	query->total_size += sizeof(fts_match_t)
3261	+ sizeof(ib_vector_t)
3262	+ sizeof(ulint) * `64`;
3263	}
3264
3265	/ Unpack the positions within the document. /
3266	while (*ptr) {
3267	last_pos += fts_decode_vlc(&ptr);
3268
3269	/ Collect the matching word positions, for phrase*
3270	matching later. /*
3271	if (query->collect_positions) {
3272	ib_vector_push(match->positions, &last_pos);
3273	}
3274
3275	++freq;
3276	}
3277
3278	/ End of list marker. /
3279	last_pos = (ulint) -`1`;
3280
3281	if (query->collect_positions) {
3282	ut_a(match != NULL);
3283	ib_vector_push(match->positions, &last_pos);
3284	}
3285
3286	/ Add the doc id to the doc freq rb tree, if the doc id*
3287	doesn't exist it will be created. /*
3288	doc_freq = fts_query_add_doc_freq(query, doc_freqs, doc_id);
3289
3290	/ Avoid duplicating frequency tally. /
3291	if (doc_freq->freq == `0`) {
3292	doc_freq->freq = freq;
3293	}
3294
3295	/ Skip the end of word position marker. /
3296	++ptr;
3297
3298	/ Bytes decoded so far /
3299	decoded = ulint(ptr - (byte*) data);
3300
3301	/ We simply collect the matching documents and the*
3302	positions here and match later. /*
3303	if (!query->collect_positions) {
3304	/ We ignore error here and will check it later /
3305	fts_query_process_doc_id(query, doc_id, `0`);
3306
3307	/ Add the word to the document's matched RB tree. /
3308	fts_query_add_word_to_document(query, doc_id, word);
3309	}
3310	}
3311
3312	/ Some sanity checks. /
3313	ut_a(doc_id == node->last_doc_id);
3314
3315	if (query->total_size > fts_result_cache_limit) {
3316	return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
3317	} else {
3318	return(DB_SUCCESS);
3319	}
3320	}
3321
3322	/***************************************************************//**
3323	Read the FTS INDEX row.
3324	@return DB_SUCCESS if all go well. /*
3325	static
3326	dberr_t
3327	fts_query_read_node(
3328	/================/
3329	fts_query_t* query, /!< in: query instance /
3330	const fts_string_t* word, /!< in: current word /
3331	que_node_t* exp) /!< in: query graph node /
3332	{
3333	int i;
3334	int ret;
3335	fts_node_t node;
3336	ib_rbt_bound_t parent;
3337	fts_word_freq_t* word_freq;
3338	ibool skip = FALSE;
3339	fts_string_t term;
3340	byte buf[FTS_MAX_WORD_LEN + `1`];
3341	dberr_t error = DB_SUCCESS;
3342
3343	ut_a(query->cur_node->type == FTS_AST_TERM
3344	\|\| query->cur_node->type == FTS_AST_TEXT
3345	\|\| query->cur_node->type == FTS_AST_PARSER_PHRASE_LIST);
3346
3347	memset(&node, `0`, sizeof(node));
3348	term.f_str = buf;
3349
3350	/ Need to consider the wildcard search case, the word frequency*
3351	is created on the search string not the actual word. So we need
3352	to assign the frequency on search string behalf. /*
3353	if (query->cur_node->type == FTS_AST_TERM
3354	&& query->cur_node->term.wildcard) {
3355
3356	term.f_len = query->cur_node->term.ptr->len;
3357	ut_ad(FTS_MAX_WORD_LEN >= term.f_len);
3358	memcpy(term.f_str, query->cur_node->term.ptr->str, term.f_len);
3359	} else {
3360	term.f_len = word->f_len;
3361	ut_ad(FTS_MAX_WORD_LEN >= word->f_len);
3362	memcpy(term.f_str, word->f_str, word->f_len);
3363	}
3364
3365	/ Lookup the word in our rb tree, it must exist. /
3366	ret = rbt_search(query->word_freqs, &parent, &term);
3367
3368	ut_a(ret == `0`);
3369
3370	word_freq = rbt_value(fts_word_freq_t, parent.last);
3371
3372	/ Start from 1 since the first column has been read by the caller.*
3373	Also, we rely on the order of the columns projected, to filter
3374	out ilists that are out of range and we always want to read
3375	the doc_count irrespective of the suitablility of the row. /*
3376
3377	for (i = `1`; exp && !skip; exp = que_node_get_next(exp), ++i) {
3378
3379	dfield_t* dfield = que_node_get_val(exp);
3380	byte* data = static_cast<byte*>(
3381	dfield_get_data(dfield));
3382	ulint len = dfield_get_len(dfield);
3383
3384	ut_a(len != UNIV_SQL_NULL);
3385
3386	/ Note: The column numbers below must match the SELECT. /
3387
3388	switch (i) {
3389	case `1`: / DOC_COUNT /
3390	word_freq->doc_count += mach_read_from_4(data);
3391	break;
3392
3393	case `2`: / FIRST_DOC_ID /
3394	node.first_doc_id = fts_read_doc_id(data);
3395
3396	/ Skip nodes whose doc ids are out range. /
3397	if (query->oper == FTS_EXIST
3398	&& query->upper_doc_id > `0`
3399	&& node.first_doc_id > query->upper_doc_id) {
3400	skip = TRUE;
3401	}
3402	break;
3403
3404	case `3`: / LAST_DOC_ID /
3405	node.last_doc_id = fts_read_doc_id(data);
3406
3407	/ Skip nodes whose doc ids are out range. /
3408	if (query->oper == FTS_EXIST
3409	&& query->lower_doc_id > `0`
3410	&& node.last_doc_id < query->lower_doc_id) {
3411	skip = TRUE;
3412	}
3413	break;
3414
3415	case `4`: / ILIST /
3416
3417	error = fts_query_filter_doc_ids(
3418	query, &word_freq->word, word_freq,
3419	&node, data, len, FALSE);
3420
3421	break;
3422
3423	default:
3424	ut_error;
3425	}
3426	}
3427
3428	if (!skip) {
3429	/ Make sure all columns were read. /
3430
3431	ut_a(i == `5`);
3432	}
3433
3434	return error;
3435	}
3436
3437	/***************************************************************//**
3438	Callback function to fetch the rows in an FTS INDEX record.
3439	@return always returns TRUE /*
3440	static
3441	ibool
3442	fts_query_index_fetch_nodes(
3443	/========================/
3444	void* row, /!< in: sel_node_t* /
3445	void* user_arg) /!< in: pointer to fts_fetch_t /
3446	{
3447	fts_string_t key;
3448	sel_node_t* sel_node = static_cast<sel_node_t*>(row);
3449	fts_fetch_t* fetch = static_cast<fts_fetch_t*>(user_arg);
3450	fts_query_t* query = static_cast<fts_query_t*>(fetch->read_arg);
3451	que_node_t* exp = sel_node->select_list;
3452	dfield_t* dfield = que_node_get_val(exp);
3453	void* data = dfield_get_data(dfield);
3454	ulint dfield_len = dfield_get_len(dfield);
3455
3456	key.f_str = static_cast<byte*>(data);
3457	key.f_len = dfield_len;
3458
3459	ut_a(dfield_len <= FTS_MAX_WORD_LEN);
3460
3461	/ Note: we pass error out by 'query->error' /
3462	query->error = fts_query_read_node(query, &key, que_node_get_next(exp));
3463
3464	if (query->error != DB_SUCCESS) {
3465	ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
3466	return(FALSE);
3467	} else {
3468	return(TRUE);
3469	}
3470	}
3471
3472	/***************************************************************//**
3473	Calculate the inverse document frequency (IDF) for all the terms. /*
3474	static
3475	void
3476	fts_query_calculate_idf(
3477	/====================/
3478	fts_query_t* query) /!< in: Query state /
3479	{
3480	const ib_rbt_node_t* node;
3481	ib_uint64_t total_docs = query->total_docs;
3482
3483	/ We need to free any instances of fts_doc_freq_t that we*
3484	may have allocated. /*
3485	for (node = rbt_first(query->word_freqs);
3486	node;
3487	node = rbt_next(query->word_freqs, node)) {
3488
3489	fts_word_freq_t* word_freq;
3490
3491	word_freq = rbt_value(fts_word_freq_t, node);
3492
3493	if (word_freq->doc_count > `0`) {
3494	if (total_docs == word_freq->doc_count) {
3495	/ QP assume ranking > 0 if we find*
3496	a match. Since Log10(1) = 0, we cannot
3497	make IDF a zero value if do find a
3498	word in all documents. So let's make
3499	it an arbitrary very small number /*
3500	word_freq->idf = log10(`1.0001`);
3501	} else {
3502	word_freq->idf = log10(
3503	total_docs
3504	/ (double) word_freq->doc_count);
3505	}
3506	}
3507
3508	if (fts_enable_diag_print) {
3509	ib::info () << "'" << word_freq->word.f_str << "' -> "
3510	<< query->total_docs << "/"
3511	<< word_freq->doc_count << " "
3512	<< std::setw(`6`) << std::setprecision(`5`)
3513	<< word_freq->idf;
3514	}
3515	}
3516	}
3517
3518	/***************************************************************//**
3519	Calculate the ranking of the document. /*
3520	static
3521	void
3522	fts_query_calculate_ranking(
3523	/========================/
3524	const fts_query_t* query, /!< in: query state /
3525	fts_ranking_t* ranking) /!< in: Document to rank /
3526	{
3527	ulint pos = `0`;
3528	fts_string_t word;
3529
3530	/ At this stage, ranking->rank should not exceed the 1.0*
3531	bound /*
3532	ut_ad(ranking->rank <= `1.0` && ranking->rank >= -`1.0`);
3533	ut_ad(rbt_size(query->word_map) == query->word_vector->size());
3534
3535	while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
3536	int ret;
3537	ib_rbt_bound_t parent;
3538	double weight;
3539	fts_doc_freq_t* doc_freq;
3540	fts_word_freq_t* word_freq;
3541
3542	ret = rbt_search(query->word_freqs, &parent, &word);
3543
3544	/ It must exist. /
3545	ut_a(ret == `0`);
3546
3547	word_freq = rbt_value(fts_word_freq_t, parent.last);
3548
3549	ret = rbt_search(
3550	word_freq->doc_freqs, &parent, &ranking->doc_id);
3551
3552	/ It must exist. /
3553	ut_a(ret == `0`);
3554
3555	doc_freq = rbt_value(fts_doc_freq_t, parent.last);
3556
3557	weight = (double) doc_freq->freq * word_freq->idf;
3558
3559	ranking->rank += (fts_rank_t) (weight * word_freq->idf);
3560	}
3561	}
3562
3563	/***************************************************************//**
3564	Add ranking to the result set. /*
3565	static
3566	void
3567	fts_query_add_ranking(
3568	/==================/
3569	fts_query_t* query, /!< in: query state /
3570	ib_rbt_t* ranking_tree, /!< in: ranking tree /
3571	const fts_ranking_t* new_ranking) /!< in: ranking of a document /
3572	{
3573	ib_rbt_bound_t parent;
3574
3575	/ Lookup the ranking in our rb tree and add if it doesn't exist. /
3576	if (rbt_search(ranking_tree, &parent, new_ranking) == `0`) {
3577	fts_ranking_t* ranking;
3578
3579	ranking = rbt_value(fts_ranking_t, parent.last);
3580
3581	ranking->rank += new_ranking->rank;
3582
3583	ut_a(ranking->words == NULL);
3584	} else {
3585	rbt_add_node(ranking_tree, &parent, new_ranking);
3586
3587	query->total_size += SIZEOF_RBT_NODE_ADD
3588	+ sizeof(fts_ranking_t);
3589	}
3590	}
3591
3592	/***************************************************************//**
3593	Retrieve the FTS Relevance Ranking result for doc with doc_id
3594	@return the relevance ranking value, 0 if no ranking value
3595	present. /*
3596	float
3597	fts_retrieve_ranking(
3598	/=================/
3599	fts_result_t* result, /!< in: FTS result structure /
3600	doc_id_t doc_id) /!< in: doc_id of the item to retrieve /
3601	{
3602	ib_rbt_bound_t parent;
3603	fts_ranking_t new_ranking;
3604
3605	DBUG_ENTER("fts_retrieve_ranking");
3606
3607	if (!result \|\| !result->rankings_by_id) {
3608	DBUG_RETURN(`0`);
3609	}
3610
3611	new_ranking.doc_id = doc_id;
3612
3613	/ Lookup the ranking in our rb tree /
3614	if (rbt_search(result->rankings_by_id, &parent, &new_ranking) == `0`) {
3615	fts_ranking_t* ranking;
3616
3617	ranking = rbt_value(fts_ranking_t, parent.last);
3618
3619	DBUG_RETURN(ranking->rank);
3620	}
3621
3622	DBUG_RETURN(`0`);
3623	}
3624
3625	/***************************************************************//**
3626	Create the result and copy the data to it. /*
3627	static
3628	fts_result_t*
3629	fts_query_prepare_result(
3630	/=====================/
3631	fts_query_t* query, /!< in: Query state /
3632	fts_result_t* result) /!< in: result this can contain*
3633	data from a previous search on
3634	another FTS index /*
3635	{
3636	const ib_rbt_node_t* node;
3637	bool result_is_null = false;
3638
3639	DBUG_ENTER("fts_query_prepare_result");
3640
3641	if (result == NULL) {
3642	result = static_cast<fts_result_t*>(
3643	ut_zalloc_nokey(sizeof(*result)));
3644
3645	result->rankings_by_id = rbt_create(
3646	sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
3647
3648	query->total_size += sizeof(fts_result_t) + SIZEOF_RBT_CREATE;
3649	result_is_null = true;
3650	}
3651
3652	if (query->flags == FTS_OPT_RANKING) {
3653	fts_word_freq_t* word_freq;
3654	ulint size = ib_vector_size(query->deleted->doc_ids);
3655	fts_update_t* array =
3656	(fts_update_t*) query->deleted->doc_ids->data;
3657
3658	node = rbt_first(query->word_freqs);
3659	ut_ad(node);
3660	word_freq = rbt_value(fts_word_freq_t, node);
3661
3662	for (node = rbt_first(word_freq->doc_freqs);
3663	node;
3664	node = rbt_next(word_freq->doc_freqs, node)) {
3665	fts_doc_freq_t* doc_freq;
3666	fts_ranking_t ranking;
3667
3668	doc_freq = rbt_value(fts_doc_freq_t, node);
3669
3670	/ Don't put deleted docs into result /
3671	if (fts_bsearch(array, `0`, static_cast<int>(size),
3672	doc_freq->doc_id) >= `0`) {
3673	/ one less matching doc count /
3674	--word_freq->doc_count;
3675	continue;
3676	}
3677
3678	ranking.doc_id = doc_freq->doc_id;
3679	ranking.rank = static_cast<fts_rank_t>(doc_freq->freq);
3680	ranking.words = NULL;
3681
3682	fts_query_add_ranking(query, result->rankings_by_id,
3683	&ranking);
3684
3685	if (query->total_size > fts_result_cache_limit) {
3686	query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
3687	fts_query_free_result(result);
3688	DBUG_RETURN(NULL);
3689	}
3690	}
3691
3692	/ Calculate IDF only after we exclude the deleted items /
3693	fts_query_calculate_idf(query);
3694
3695	node = rbt_first(query->word_freqs);
3696	word_freq = rbt_value(fts_word_freq_t, node);
3697
3698	/ Calculate the ranking for each doc /
3699	for (node = rbt_first(result->rankings_by_id);
3700	node != NULL;
3701	node = rbt_next(result->rankings_by_id, node)) {
3702
3703	fts_ranking_t* ranking;
3704
3705	ranking = rbt_value(fts_ranking_t, node);
3706
3707	ranking->rank = static_cast<fts_rank_t>(
3708	ranking->rank * word_freq->idf * word_freq->idf);
3709	}
3710
3711	DBUG_RETURN(result);
3712	}
3713
3714	ut_a(rbt_size(query->doc_ids) > `0`);
3715
3716	for (node = rbt_first(query->doc_ids);
3717	node;
3718	node = rbt_next(query->doc_ids, node)) {
3719
3720	fts_ranking_t* ranking;
3721
3722	ranking = rbt_value(fts_ranking_t, node);
3723	fts_query_calculate_ranking(query, ranking);
3724
3725	// FIXME: I think we may requre this information to improve the
3726	// ranking of doc ids which have more word matches from
3727	// different FTS indexes.
3728
3729	/ We don't need these anymore free the resources. /
3730	ranking->words = NULL;
3731
3732	if (!result_is_null) {
3733	fts_query_add_ranking(query, result->rankings_by_id, ranking);
3734
3735	if (query->total_size > fts_result_cache_limit) {
3736	query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
3737	fts_query_free_result(result);
3738	DBUG_RETURN(NULL);
3739	}
3740	}
3741	}
3742
3743	if (result_is_null) {
3744	/ Use doc_ids directly /
3745	rbt_free(result->rankings_by_id);
3746	result->rankings_by_id = query->doc_ids;
3747	query->doc_ids = NULL;
3748	}
3749
3750	DBUG_RETURN(result);
3751	}
3752
3753	/***************************************************************//**
3754	Get the result of the query. Calculate the similarity coefficient. /*
3755	static
3756	fts_result_t*
3757	fts_query_get_result(
3758	/=================/
3759	fts_query_t* query, /!< in: query instance /
3760	fts_result_t* result) /!< in: result /
3761	{
3762	DBUG_ENTER("fts_query_get_result");
3763
3764	if (rbt_size(query->doc_ids) > `0` \|\| query->flags == FTS_OPT_RANKING) {
3765	/ Copy the doc ids to the result. /
3766	result = fts_query_prepare_result(query, result);
3767	} else {
3768	/ Create an empty result instance. /
3769	result = static_cast<fts_result_t*>(
3770	ut_zalloc_nokey(sizeof(*result)));
3771	}
3772
3773	DBUG_RETURN(result);
3774	}
3775
3776	/***************************************************************//**
3777	FTS Query free resources and reset. /*
3778	static
3779	void
3780	fts_query_free(
3781	/===========/
3782	fts_query_t* query) /!< in: query instance to free/
3783	{
3784
3785	if (query->read_nodes_graph) {
3786	fts_que_graph_free(query->read_nodes_graph);
3787	}
3788
3789	if (query->root) {
3790	fts_ast_free_node(query->root);
3791	}
3792
3793	if (query->deleted) {
3794	fts_doc_ids_free(query->deleted);
3795	}
3796
3797	if (query->intersection) {
3798	fts_query_free_doc_ids(query, query->intersection);
3799	}
3800
3801	if (query->doc_ids) {
3802	fts_query_free_doc_ids(query, query->doc_ids);
3803	}
3804
3805	if (query->word_freqs) {
3806	const ib_rbt_node_t* node;
3807
3808	/ We need to free any instances of fts_doc_freq_t that we*
3809	may have allocated. /*
3810	for (node = rbt_first(query->word_freqs);
3811	node;
3812	node = rbt_next(query->word_freqs, node)) {
3813
3814	fts_word_freq_t* word_freq;
3815
3816	word_freq = rbt_value(fts_word_freq_t, node);
3817
3818	/ We need to cast away the const. /
3819	rbt_free(word_freq->doc_freqs);
3820	}
3821
3822	rbt_free(query->word_freqs);
3823	}
3824
3825	if (query->wildcard_words != NULL) {
3826	rbt_free(query->wildcard_words);
3827	}
3828
3829	ut_a(!query->intersection);
3830
3831	if (query->word_map) {
3832	rbt_free(query->word_map);
3833	}
3834
3835	if (query->word_vector != NULL) {
3836	UT_DELETE(query->word_vector);
3837	}
3838
3839	if (query->heap) {
3840	mem_heap_free(query->heap);
3841	}
3842
3843	memset(query, `0`, sizeof(*query));
3844	}
3845
3846	/***************************************************************//**
3847	Parse the query using flex/bison or plugin parser.
3848	@return parse tree node. /*
3849	static
3850	fts_ast_node_t*
3851	fts_query_parse(
3852	/============/
3853	fts_query_t* query, /!< in: query instance /
3854	byte* query_str, /!< in: query string /
3855	ulint query_len) /!< in: query string length /
3856	{
3857	int error;
3858	fts_ast_state_t state;
3859	bool mode = query->boolean_mode;
3860	DBUG_ENTER("fts_query_parse");
3861
3862	memset(&state, `0x0`, sizeof(state));
3863
3864	state.charset = query->fts_index_table.charset;
3865
3866	DBUG_EXECUTE_IF("fts_instrument_query_disable_parser",
3867	query->parser = NULL;);
3868
3869	if (query->parser) {
3870	state.root = state.cur_node =
3871	fts_ast_create_node_list(&state, NULL);
3872	error = fts_parse_by_parser(mode, query_str, query_len,
3873	query->parser, &state);
3874	} else {
3875	/ Setup the scanner to use, this depends on the mode flag. /
3876	state.lexer = fts_lexer_create(mode, query_str, query_len);
3877	state.charset = query->fts_index_table.charset;
3878	error = fts_parse(&state);
3879	fts_lexer_free(state.lexer);
3880	state.lexer = NULL;
3881	}
3882
3883	/ Error during parsing ? /
3884	if (error) {
3885	/ Free the nodes that were allocated during parsing. /
3886	fts_ast_state_free(&state);
3887	} else {
3888	query->root = state.root;
3889
3890	if (fts_enable_diag_print && query->root != NULL) {
3891	fts_ast_node_print(query->root);
3892	}
3893	}
3894
3895	DBUG_RETURN(state.root);
3896	}
3897
3898	/*****************************************************************//**
3899	FTS Query optimization
3900	Set FTS_OPT_RANKING if it is a simple term query /*
3901	static
3902	void
3903	fts_query_can_optimize(
3904	/===================/
3905	fts_query_t* query, /!< in/out: query instance /
3906	uint flags) /!< In: FTS search mode /
3907	{
3908	fts_ast_node_t* node = query->root;
3909
3910	if (flags & FTS_EXPAND) {
3911	return;
3912	}
3913
3914	/ Check if it has only a term without oper /
3915	ut_ad(node->type == FTS_AST_LIST);
3916	node = node->list.head;
3917	if (node != NULL && node->type == FTS_AST_TERM && node->next == NULL) {
3918	query->flags = FTS_OPT_RANKING;
3919	}
3920	}
3921
3922	/* FTS Query entry point.*
3923	@param[in] index fts index to search
3924	@param[in] flags FTS search mode
3925	@param[in] query_str FTS query
3926	@param[in] query_len FTS query string len in bytes
3927	@param[in,out] result result doc ids
3928	@return DB_SUCCESS if successful otherwise error code /*
3929	dberr_t
3930	fts_query(
3931	dict_index_t* index,
3932	uint flags,
3933	const byte* query_str,
3934	ulint query_len,
3935	fts_result_t** result)
3936	{
3937	fts_query_t query;
3938	dberr_t error = DB_SUCCESS;
3939	byte* lc_query_str;
3940	ulint lc_query_str_len;
3941	ulint result_len;
3942	bool boolean_mode;
3943	trx_t* query_trx;
3944	CHARSET_INFO* charset;
3945	ulint start_time_ms;
3946	bool will_be_ignored = false;
3947
3948	boolean_mode = flags & FTS_BOOL;
3949
3950	*result = NULL;
3951	memset(&query, `0x0`, sizeof(query));
3952	query_trx = trx_create();
3953	query_trx->op_info = "FTS query";
3954
3955	start_time_ms = ut_time_ms();
3956
3957	query.trx = query_trx;
3958	query.index = index;
3959	query.boolean_mode = boolean_mode;
3960	query.deleted = fts_doc_ids_create();
3961	query.cur_node = NULL;
3962
3963	query.fts_common_table.type = FTS_COMMON_TABLE;
3964	query.fts_common_table.table_id = index->table->id;
3965	query.fts_common_table.parent = index->table->name.m_name;
3966	query.fts_common_table.table = index->table;
3967
3968	charset = fts_index_get_charset(index);
3969
3970	query.fts_index_table.type = FTS_INDEX_TABLE;
3971	query.fts_index_table.index_id = index->id;
3972	query.fts_index_table.table_id = index->table->id;
3973	query.fts_index_table.parent = index->table->name.m_name;
3974	query.fts_index_table.charset = charset;
3975	query.fts_index_table.table = index->table;
3976
3977	query.word_map = rbt_create_arg_cmp(
3978	sizeof(fts_string_t), innobase_fts_text_cmp, (void*)charset);
3979	query.word_vector = UT_NEW_NOKEY(word_vector_t ());
3980	query.error = DB_SUCCESS;
3981
3982	/ Setup the RB tree that will be used to collect per term*
3983	statistics. /*
3984	query.word_freqs = rbt_create_arg_cmp(
3985	sizeof(fts_word_freq_t), innobase_fts_text_cmp,
3986	(void*) charset);
3987
3988	if (flags & FTS_EXPAND) {
3989	query.wildcard_words = rbt_create_arg_cmp(
3990	sizeof(fts_string_t), innobase_fts_text_cmp, (void *)charset);
3991	}
3992
3993	query.total_size += SIZEOF_RBT_CREATE;
3994
3995	query.total_docs = dict_table_get_n_rows(index->table);
3996
3997	query.fts_common_table.suffix = "DELETED";
3998
3999	/ Read the deleted doc_ids, we need these for filtering. /
4000	error = fts_table_fetch_doc_ids(
4001	NULL, &query.fts_common_table, query.deleted);
4002
4003	if (error != DB_SUCCESS) {
4004	goto func_exit;
4005	}
4006
4007	query.fts_common_table.suffix = "DELETED_CACHE";
4008
4009	error = fts_table_fetch_doc_ids(
4010	NULL, &query.fts_common_table, query.deleted);
4011
4012	if (error != DB_SUCCESS) {
4013	goto func_exit;
4014	}
4015
4016	/ Get the deleted doc ids that are in the cache. /
4017	fts_cache_append_deleted_doc_ids(
4018	index->table->fts->cache, query.deleted->doc_ids);
4019	DEBUG_SYNC_C("fts_deleted_doc_ids_append");
4020
4021	/ Sort the vector so that we can do a binary search over the ids. /
4022	ib_vector_sort(query.deleted->doc_ids, fts_update_doc_id_cmp);
4023
4024	/ Convert the query string to lower case before parsing. We own*
4025	the ut_malloc'ed result and so remember to free it before return. /*
4026
4027	lc_query_str_len = query_len * charset->casedn_multiply + `1`;
4028	lc_query_str = static_cast<byte*>(ut_malloc_nokey(lc_query_str_len));
4029
4030	/ For binary collations, a case sensitive search is*
4031	performed. Hence don't convert to lower case. /*
4032	if (my_binary_compare(charset)) {
4033	memcpy(lc_query_str, query_str, query_len);
4034	lc_query_str[query_len]= `0`;
4035	result_len= query_len;
4036	} else {
4037	result_len = innobase_fts_casedn_str(
4038	charset, (char*)( query_str), query_len,
4039	(char*)(lc_query_str), lc_query_str_len);
4040	}
4041
4042	ut_ad(result_len < lc_query_str_len);
4043
4044	lc_query_str[result_len] = `0`;
4045
4046	query.heap = mem_heap_create(`128`);
4047
4048	/ Create the rb tree for the doc id (current) set. /
4049	query.doc_ids = rbt_create(
4050	sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
4051	query.parser = index->parser;
4052
4053	query.total_size += SIZEOF_RBT_CREATE;
4054
4055	/ Parse the input query string. /
4056	if (fts_query_parse(&query, lc_query_str, result_len)) {
4057	fts_ast_node_t* ast = query.root;
4058
4059	/ Optimize query to check if it's a single term /
4060	fts_query_can_optimize(&query, flags);
4061
4062	DBUG_EXECUTE_IF("fts_instrument_result_cache_limit",
4063	fts_result_cache_limit = `2048`;
4064	);
4065
4066	/ Traverse the Abstract Syntax Tree (AST) and execute*
4067	the query. /*
4068	query.error = fts_ast_visit(
4069	FTS_NONE, ast, fts_query_visitor,
4070	&query, &will_be_ignored);
4071
4072	/ If query expansion is requested, extend the search*
4073	with first search pass result /*
4074	if (query.error == DB_SUCCESS && (flags & FTS_EXPAND)) {
4075	query.error = fts_expand_query(index, &query);
4076	}
4077
4078	/ Calculate the inverse document frequency of the terms. /
4079	if (query.error == DB_SUCCESS
4080	&& query.flags != FTS_OPT_RANKING) {
4081	fts_query_calculate_idf(&query);
4082	}
4083
4084	/ Copy the result from the query state, so that we can*
4085	return it to the caller. /*
4086	if (query.error == DB_SUCCESS) {
4087	result = fts_query_get_result(&query, result);
4088	}
4089
4090	error = query.error;
4091	} else {
4092	/ still return an empty result set /
4093	result = static_cast<fts_result_t>(
4094	ut_zalloc_nokey(sizeof(**result)));
4095	}
4096
4097	ut_free(lc_query_str);
4098
4099	if (fts_enable_diag_print && (*result)) {
4100	ulint diff_time = ut_time_ms() - start_time_ms;
4101
4102	ib::info () << "FTS Search Processing time: "
4103	<< diff_time / `1000` << " secs: " << diff_time % `1000`
4104	<< " millisec: row(s) "
4105	<< ((*result)->rankings_by_id
4106	? lint(rbt_size((*result)->rankings_by_id))
4107	: -`1`);
4108
4109	/ Log memory consumption & result size /
4110	ib::info () << "Full Search Memory: " << query.total_size
4111	<< " (bytes), Row: "
4112	<< ((*result)->rankings_by_id
4113	? rbt_size((*result)->rankings_by_id)
4114	: `0`)
4115	<< ".";
4116	}
4117
4118	func_exit:
4119	fts_query_free(&query);
4120
4121	trx_free(query_trx);
4122
4123	return(error);
4124	}
4125
4126	/***************************************************************//**
4127	FTS Query free result, returned by fts_query(). /*
4128	void
4129	fts_query_free_result(
4130	/==================/
4131	fts_result_t* result) /!< in: result instance to free./
4132	{
4133	if (result) {
4134	if (result->rankings_by_id != NULL) {
4135	rbt_free(result->rankings_by_id);
4136	result->rankings_by_id = NULL;
4137	}
4138	if (result->rankings_by_rank != NULL) {
4139	rbt_free(result->rankings_by_rank);
4140	result->rankings_by_rank = NULL;
4141	}
4142
4143	ut_free(result);
4144	result = NULL;
4145	}
4146	}
4147
4148	/***************************************************************//**
4149	FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. /*
4150	void
4151	fts_query_sort_result_on_rank(
4152	/==========================/
4153	fts_result_t* result) /!< out: result instance to sort./
4154	{
4155	const ib_rbt_node_t* node;
4156	ib_rbt_t* ranked;
4157
4158	ut_a(result->rankings_by_id != NULL);
4159	if (result->rankings_by_rank) {
4160	rbt_free(result->rankings_by_rank);
4161	}
4162
4163	ranked = rbt_create(sizeof(fts_ranking_t), fts_query_compare_rank);
4164
4165	/ We need to free any instances of fts_doc_freq_t that we*
4166	may have allocated. /*
4167	for (node = rbt_first(result->rankings_by_id);
4168	node;
4169	node = rbt_next(result->rankings_by_id, node)) {
4170
4171	fts_ranking_t* ranking;
4172
4173	ranking = rbt_value(fts_ranking_t, node);
4174
4175	ut_a(ranking->words == NULL);
4176
4177	rbt_insert(ranked, ranking, ranking);
4178	}
4179
4180	/ Reset the current node too. /
4181	result->current = NULL;
4182	result->rankings_by_rank = ranked;
4183	}
4184
4185	/*****************************************************************//**
4186	A debug function to print result doc_id set. /*
4187	static
4188	void
4189	fts_print_doc_id(
4190	/=============/
4191	fts_query_t* query) /!< in : tree that stores doc_ids./
4192	{
4193	const ib_rbt_node_t* node;
4194
4195	/ Iterate each member of the doc_id set /
4196	for (node = rbt_first(query->doc_ids);
4197	node;
4198	node = rbt_next(query->doc_ids, node)) {
4199	fts_ranking_t* ranking;
4200	ranking = rbt_value(fts_ranking_t, node);
4201
4202	ib::info () << "doc_ids info, doc_id: " << ranking->doc_id;
4203
4204	ulint pos = `0`;
4205	fts_string_t word;
4206
4207	while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
4208	ib::info () << "doc_ids info, value: " << word.f_str;
4209	}
4210	}
4211	}
4212
4213	/***********************************************************//**
4214	This function implements a simple "blind" query expansion search:
4215	words in documents found in the first search pass will be used as
4216	search arguments to search the document again, thus "expand"
4217	the search result set.
4218	@return DB_SUCCESS if success, otherwise the error code /*
4219	static MY_ATTRIBUTE((nonnull, warn_unused_result))
4220	dberr_t
4221	fts_expand_query(
4222	/=============/
4223	dict_index_t* index, /!< in: FTS index to search /
4224	fts_query_t* query) /!< in: FTS query instance /
4225	{
4226	const ib_rbt_node_t* node;
4227	const ib_rbt_node_t* token_node;
4228	fts_doc_t result_doc;
4229	dberr_t error = DB_SUCCESS;
4230	const fts_index_cache_t*index_cache;
4231
4232	/ If no doc is found in first search pass, return /
4233	if (!rbt_size(query->doc_ids)) {
4234	return(error);
4235	}
4236
4237	/ Init "result_doc", to hold words from the first search pass /
4238	fts_doc_init(&result_doc);
4239
4240	rw_lock_x_lock(&index->table->fts->cache->lock);
4241	index_cache = fts_find_index_cache(index->table->fts->cache, index);
4242	rw_lock_x_unlock(&index->table->fts->cache->lock);
4243
4244	ut_a(index_cache);
4245
4246	result_doc.tokens = rbt_create_arg_cmp(
4247	sizeof(fts_token_t), innobase_fts_text_cmp,
4248	(void*) index_cache->charset);
4249
4250	result_doc.charset = index_cache->charset;
4251	result_doc.parser = index_cache->index->parser;
4252
4253	query->total_size += SIZEOF_RBT_CREATE;
4254
4255	if (fts_enable_diag_print) {
4256	fts_print_doc_id(query);
4257	}
4258
4259	for (node = rbt_first(query->doc_ids);
4260	node;
4261	node = rbt_next(query->doc_ids, node)) {
4262
4263	fts_ranking_t* ranking;
4264	ulint prev_token_size;
4265	ulint estimate_size;
4266
4267	prev_token_size = rbt_size(result_doc.tokens);
4268
4269	ranking = rbt_value(fts_ranking_t, node);
4270
4271	/ Fetch the documents with the doc_id from the*
4272	result of first seach pass. Since we do not
4273	store document-to-word mapping, we need to
4274	fetch the original document and parse them.
4275	Future optimization could be done here if we
4276	support some forms of document-to-word mapping /*
4277	fts_doc_fetch_by_doc_id(NULL, ranking->doc_id, index,
4278	FTS_FETCH_DOC_BY_ID_EQUAL,
4279	fts_query_expansion_fetch_doc,
4280	&result_doc);
4281
4282	/ Estimate memory used, see fts_process_token and fts_token_t.*
4283	We ignore token size here. /*
4284	estimate_size = (rbt_size(result_doc.tokens) - prev_token_size)
4285	* (SIZEOF_RBT_NODE_ADD + sizeof(fts_token_t)
4286	+ sizeof(ib_vector_t) + sizeof(ulint) * `32`);
4287	query->total_size += estimate_size;
4288
4289	if (query->total_size > fts_result_cache_limit) {
4290	error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
4291	goto func_exit;
4292	}
4293	}
4294
4295	/ Remove words that have already been searched in the first pass /
4296	for (ulint i = `0`; i < query->word_vector->size(); i++) {
4297	fts_string_t word = query->word_vector->at(i);
4298	ib_rbt_bound_t parent;
4299
4300	if (query->wildcard_words
4301	&& rbt_search(query->wildcard_words, &parent, &word) == `0`) {
4302	/ If it's a wildcard word, remove words having*
4303	it as prefix. /*
4304	while (rbt_search_cmp(result_doc.tokens,
4305	&parent, &word, NULL,
4306	innobase_fts_text_cmp_prefix)
4307	== `0`) {
4308	ut_free(rbt_remove_node(result_doc.tokens,
4309	parent.last));
4310	}
4311	} else {
4312	/ We don't check return value, because the word may*
4313	have been deleted by a previous wildcard word as its
4314	prefix, e.g. ('g good'). /
4315	rbt_delete(result_doc.tokens, &word);
4316	}
4317	}
4318
4319	/ Search the table the second time with expanded search list /
4320	for (token_node = rbt_first(result_doc.tokens);
4321	token_node;
4322	token_node = rbt_next(result_doc.tokens, token_node)) {
4323	fts_token_t* mytoken;
4324	mytoken = rbt_value(fts_token_t, token_node);
4325
4326	/ '%' in the end is treated as prefix search,*
4327	it can cause assert failure, so we skip it. /*
4328	if (mytoken->text.f_str[mytoken->text.f_len - `1`] == `'%'`) {
4329	continue;
4330	}
4331
4332	ut_ad(mytoken->text.f_str[mytoken->text.f_len] == `0`);
4333	fts_query_add_word_freq(query, &mytoken->text);
4334	error = fts_query_union(query, &mytoken->text);
4335
4336	if (error != DB_SUCCESS) {
4337	break;
4338	}
4339	}
4340
4341	func_exit:
4342	fts_doc_free(&result_doc);
4343
4344	return(error);
4345	}
4346	/***********************************************************//**
4347	This function finds documents that contain all words in a
4348	phrase or proximity search. And if proximity search, verify
4349	the words are close enough to each other, as in specified distance.
4350	This function is called for phrase and proximity search.
4351	@return TRUE if documents are found, FALSE if otherwise /*
4352	static
4353	ibool
4354	fts_phrase_or_proximity_search(
4355	/===========================/
4356	fts_query_t* query, /!< in/out: query instance.*
4357	query->doc_ids might be instantiated
4358	with qualified doc IDs /*
4359	ib_vector_t* tokens) /!< in: Tokens contain words /
4360	{
4361	ulint n_matched;
4362	ulint i;
4363	ibool matched = FALSE;
4364	ulint num_token = ib_vector_size(tokens);
4365	fts_match_t* match[MAX_PROXIMITY_ITEM];
4366	ibool end_list = FALSE;
4367
4368	/ Number of matched documents for the first token /
4369	n_matched = ib_vector_size(query->match_array[`0`]);
4370
4371	/ We have a set of match list for each word, we shall*
4372	walk through the list and find common documents that
4373	contain all the matching words. /*
4374	for (i = `0`; i < n_matched; i++) {
4375	ulint j;
4376	ulint k = `0`;
4377	fts_proximity_t qualified_pos;
4378
4379	match[`0`] = static_cast<fts_match_t*>(
4380	ib_vector_get(query->match_array[`0`], i));
4381
4382	/ For remaining match list for the token(word), we*
4383	try to see if there is a document with the same
4384	doc id /*
4385	for (j = `1`; j < num_token; j++) {
4386	match[j] = static_cast<fts_match_t*>(
4387	ib_vector_get(query->match_array[j], k));
4388
4389	while (match[j]->doc_id < match[`0`]->doc_id
4390	&& k < ib_vector_size(query->match_array[j])) {
4391	match[j] = static_cast<fts_match_t*>(
4392	ib_vector_get(
4393	query->match_array[j], k));
4394	k++;
4395	}
4396
4397	if (match[j]->doc_id > match[`0`]->doc_id) {
4398	/ no match /
4399	if (query->flags & FTS_PHRASE) {
4400	match[`0`]->doc_id = `0`;
4401	}
4402	break;
4403	}
4404
4405	if (k == ib_vector_size(query->match_array[j])) {
4406	end_list = TRUE;
4407
4408	if (match[j]->doc_id != match[`0`]->doc_id) {
4409	/ no match /
4410	if (query->flags & FTS_PHRASE) {
4411	ulint s;
4412
4413	match[`0`]->doc_id = `0`;
4414
4415	for (s = i + `1`; s < n_matched;
4416	s++) {
4417	match[`0`] = static_cast<
4418	fts_match_t*>(
4419	ib_vector_get(
4420	query->match_array[`0`],
4421	s));
4422	match[`0`]->doc_id = `0`;
4423	}
4424	}
4425
4426	goto func_exit;
4427	}
4428	}
4429
4430	/ FIXME: A better solution will be a counter array*
4431	remember each run's last position. So we don't
4432	reset it here very time /*
4433	k = `0`;
4434	}
4435
4436	if (j != num_token) {
4437	continue;
4438	}
4439
4440	/ For this matching doc, we need to further*
4441	verify whether the words in the doc are close
4442	to each other, and within the distance specified
4443	in the proximity search /*
4444	if (query->flags & FTS_PHRASE) {
4445	matched = TRUE;
4446	} else if (fts_proximity_get_positions(
4447	match, num_token, ULINT_MAX, &qualified_pos)) {
4448
4449	/ Fetch the original documents and count the*
4450	words in between matching words to see that is in
4451	specified distance /*
4452	if (fts_query_is_in_proximity_range(
4453	query, match, &qualified_pos)) {
4454	/ If so, mark we find a matching doc /
4455	query->error = fts_query_process_doc_id(
4456	query, match[`0`]->doc_id, `0`);
4457	if (query->error != DB_SUCCESS) {
4458	matched = FALSE;
4459	goto func_exit;
4460	}
4461
4462	matched = TRUE;
4463	for (ulint z = `0`; z < num_token; z++) {
4464	fts_string_t* token;
4465	token = static_cast<fts_string_t*>(
4466	ib_vector_get(tokens, z));
4467	fts_query_add_word_to_document(
4468	query, match[`0`]->doc_id, token);
4469	}
4470	}
4471	}
4472
4473	if (end_list) {
4474	break;
4475	}
4476	}
4477
4478	func_exit:
4479	return(matched);
4480	}
4481
4482	/***********************************************************//**
4483	This function checks whether words in result documents are close to
4484	each other (within proximity range as specified by "distance").
4485	If "distance" is MAX_ULINT, then it will find all combinations of
4486	positions of matching words and store min and max positions
4487	in the "qualified_pos" for later verification.
4488	@return true if words are close to each other, false if otherwise /*
4489	static
4490	bool
4491	fts_proximity_get_positions(
4492	/========================/
4493	fts_match_t** match, /!< in: query instance /
4494	ulint num_match, /!< in: number of matching*
4495	items /*
4496	ulint distance, /!< in: distance value*
4497	for proximity search /*
4498	fts_proximity_t* qualified_pos) /!< out: the position info*
4499	records ranges containing
4500	all matching words. /*
4501	{
4502	ulint i;
4503	ulint idx[MAX_PROXIMITY_ITEM];
4504	ulint num_pos[MAX_PROXIMITY_ITEM];
4505	ulint min_idx;
4506
4507	qualified_pos->n_pos = `0`;
4508
4509	ut_a(num_match <= MAX_PROXIMITY_ITEM);
4510
4511	/ Each word could appear multiple times in a doc. So*
4512	we need to walk through each word's position list, and find
4513	closest distance between different words to see if
4514	they are in the proximity distance. /*
4515
4516	/ Assume each word's position list is sorted, we*
4517	will just do a walk through to all words' lists
4518	similar to a the merge phase of a merge sort /*
4519	for (i = `0`; i < num_match; i++) {
4520	/ idx is the current position we are checking*
4521	for a particular word /*
4522	idx[i] = `0`;
4523
4524	/ Number of positions for this word /
4525	num_pos[i] = ib_vector_size(match[i]->positions);
4526	}
4527
4528	/ Start with the first word /
4529	min_idx = `0`;
4530
4531	while (idx[min_idx] < num_pos[min_idx]) {
4532	ulint position[MAX_PROXIMITY_ITEM];
4533	ulint min_pos = ULINT_MAX;
4534	ulint max_pos = `0`;
4535
4536	/ Check positions in each word position list, and*
4537	record the max/min position /*
4538	for (i = `0`; i < num_match; i++) {
4539	position[i] = (ulint) ib_vector_get_const(
4540	match[i]->positions, idx[i]);
4541
4542	if (position[i] == ULINT_UNDEFINED) {
4543	break;
4544	}
4545
4546	if (position[i] < min_pos) {
4547	min_pos = position[i];
4548	min_idx = i;
4549	}
4550
4551	if (position[i] > max_pos) {
4552	max_pos = position[i];
4553	}
4554	}
4555
4556	/ If max and min position are within range, we*
4557	find a good match /*
4558	if (max_pos - min_pos <= distance
4559	&& (i >= num_match \|\| position[i] != ULINT_UNDEFINED)) {
4560	/ The charset has variable character*
4561	length encoding, record the min_pos and
4562	max_pos, we will need to verify the actual
4563	number of characters /*
4564	qualified_pos->min_pos.push_back(min_pos);
4565	qualified_pos->max_pos.push_back(max_pos);
4566	qualified_pos->n_pos++;
4567	}
4568
4569	/ Otherwise, move to the next position is the*
4570	list for the word with the smallest position /*
4571	idx[min_idx]++;
4572	}
4573
4574	return(qualified_pos->n_pos != `0`);
4575	}
4576

Browse the source code of MariaDB/storage/innobase/fts/fts0que.cc