fts0types.h source code [MariaDB/storage/innobase/include/fts0types.h]

1	/*****************************************************************************
2
3	Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
4	Copyright (c) 2017, MariaDB Corporation.
5
6	This program is free software; you can redistribute it and/or modify it under
7	the terms of the GNU General Public License as published by the Free Software
8	Foundation; version 2 of the License.
9
10	This program is distributed in the hope that it will be useful, but WITHOUT
11	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12	FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
14	You should have received a copy of the GNU General Public License along with
15	this program; if not, write to the Free Software Foundation, Inc.,
16	51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
17
18	*****************************************************************************/
19
20	/****************************************************************//**
21	@file include/fts0types.h
22	Full text search types file
23
24	Created 2007-03-27 Sunny Bains
25	*******************************************************/
26
27	#ifndef INNOBASE_FTS0TYPES_H
28	#define INNOBASE_FTS0TYPES_H
29
30	#include "univ.i"
31	#include "fts0fts.h"
32	#include "fut0fut.h"
33	#include "pars0pars.h"
34	#include "que0types.h"
35	#include "ut0byte.h"
36	#include "ut0rbt.h"
37
38	/* Types used within FTS. /
39	struct fts_que_t;
40	struct fts_node_t;
41
42	/* Callbacks used within FTS. /
43	typedef pars_user_func_cb_t fts_sql_callback;
44	typedef void (fts_filter)(void*, fts_node_t, void*, ulint len);
45
46	/* Statistics relevant to a particular document, used during retrieval. /
47	struct fts_doc_stats_t {
48	doc_id_t doc_id; /!< Document id /
49	ulint word_count; /!< Total words in the document /
50	};
51
52	/* It's main purpose is to store the SQL prepared statements that*
53	are required to retrieve a document from the database. /*
54	struct fts_get_doc_t {
55	fts_index_cache_t*
56	index_cache; /!< The index cache instance /
57
58	/!< Parsed sql statement /
59	que_t* get_document_graph;
60	fts_cache_t* cache; /!< The parent cache /
61	};
62
63	/* Since we can have multiple FTS indexes on a table, we keep a*
64	per index cache of words etc. /*
65	struct fts_index_cache_t {
66	dict_index_t* index; /!< The FTS index instance /
67
68	ib_rbt_t* words; /!< Nodes; indexed by fts_string_t,
69	cells are fts_tokenizer_word_t./
70
71	ib_vector_t* doc_stats; /!< Array of the fts_doc_stats_t*
72	contained in the memory buffer.
73	Must be in sorted order (ascending).
74	The ideal choice is an rb tree but
75	the rb tree imposes a space overhead
76	that we can do without /*
77
78	que_t** ins_graph; /!< Insert query graphs /
79
80	que_t** sel_graph; /!< Select query graphs /
81	CHARSET_INFO* charset; /!< charset /
82	};
83
84	/* For supporting the tracking of updates on multiple FTS indexes we need*
85	to track which FTS indexes need to be updated. For INSERT and DELETE we
86	update all fts indexes. /*
87	struct fts_update_t {
88	doc_id_t doc_id; /!< The doc id affected /
89
90	ib_vector_t* fts_indexes; /!< The FTS indexes that need to be*
91	updated. A NULL value means all
92	indexes need to be updated. This
93	vector is not allocated on the heap
94	and so must be freed explicitly,
95	when we are done with it /*
96	};
97
98	/* Stop word control infotmation. /
99	struct fts_stopword_t {
100	ulint status; /!< Status of the stopword tree /
101	ib_alloc_t* heap; /!< The memory allocator to use /
102	ib_rbt_t* cached_stopword;/!< This stores all active stopwords /
103	CHARSET_INFO* charset; /!< charset for stopword /
104	};
105
106	/* The SYNC state of the cache. There is one instance of this struct*
107	associated with each ADD thread. /*
108	struct fts_sync_t {
109	trx_t* trx; /!< The transaction used for SYNCing*
110	the cache to disk /*
111	dict_table_t* table; /!< Table with FTS index(es) /
112	ulint max_cache_size; /!< Max size in bytes of the cache /
113	ibool cache_full; /!< flag, when true it indicates that*
114	we need to sync the cache to disk /*
115	ulint lower_index; /!< the start index of the doc id*
116	vector from where to start adding
117	documents to the FTS cache /*
118	ulint upper_index; /!< max index of the doc id vector to*
119	add to the FTS cache /*
120	ibool interrupted; /!< TRUE if SYNC was interrupted /
121	doc_id_t min_doc_id; /!< The smallest doc id added to the*
122	cache. It should equal to
123	doc_ids[lower_index] /*
124	doc_id_t max_doc_id; /!< The doc id at which the cache was*
125	noted as being full, we use this to
126	set the upper_limit field /*
127	ib_time_t start_time; /!< SYNC start time /
128	bool in_progress; /!< flag whether sync is in progress./
129	bool unlock_cache; /!< flag whether unlock cache when*
130	write fts node /*
131	os_event_t event; /!< sync finish event;*
132	only os_event_set() and os_event_wait()
133	are used /*
134	};
135
136	/* The cache for the FTS system. It is a memory-based inverted index*
137	that new entries are added to, until it grows over the configured maximum
138	size, at which time its contents are written to the INDEX table. /*
139	struct fts_cache_t {
140	rw_lock_t lock; /!< lock protecting all access to the*
141	memory buffer. FIXME: this needs to
142	be our new upgrade-capable rw-lock /*
143
144	rw_lock_t init_lock; /!< lock used for the cache*
145	intialization, it has different
146	SYNC level as above cache lock /*
147
148	ib_mutex_t optimize_lock; /!< Lock for OPTIMIZE /
149
150	ib_mutex_t deleted_lock; /!< Lock covering deleted_doc_ids /
151
152	ib_mutex_t doc_id_lock; /!< Lock covering Doc ID /
153
154	ib_vector_t* deleted_doc_ids;/!< Array of deleted doc ids, each*
155	element is of type fts_update_t /*
156
157	ib_vector_t* indexes; /!< We store the stats and inverted*
158	index for the individual FTS indexes
159	in this vector. Each element is
160	an instance of fts_index_cache_t /*
161
162	ib_vector_t* get_docs; /!< information required to read*
163	the document from the table. Each
164	element is of type fts_doc_t /*
165
166	ulint total_size; /!< total size consumed by the ilist*
167	field of all nodes. SYNC is run
168	whenever this gets too big /*
169	fts_sync_t* sync; /!< sync structure to sync data to*
170	disk /*
171	ib_alloc_t* sync_heap; /!< The heap allocator, for indexes*
172	and deleted_doc_ids, ie. transient
173	objects, they are recreated after
174	a SYNC is completed /*
175
176	ib_alloc_t* self_heap; /!< This heap is the heap out of*
177	which an instance of the cache itself
178	was created. Objects created using
179	this heap will last for the lifetime
180	of the cache /*
181
182	doc_id_t next_doc_id; /!< Next doc id /
183
184	doc_id_t synced_doc_id; /!< Doc ID sync-ed to CONFIG table /
185
186	doc_id_t first_doc_id; /!< first doc id since this table*
187	was opened /*
188
189	ulint deleted; /!< Number of doc ids deleted since*
190	last optimized. This variable is
191	covered by deleted_lock /*
192
193	ulint added; /!< Number of doc ids added since last*
194	optimized. This variable is covered by
195	the deleted lock /*
196
197	fts_stopword_t stopword_info; /!< Cached stopwords for the FTS /
198	mem_heap_t* cache_heap; /!< Cache Heap /
199	};
200
201	/* Columns of the FTS auxiliary INDEX table /
202	struct fts_node_t {
203	doc_id_t first_doc_id; /!< First document id in ilist. /
204
205	doc_id_t last_doc_id; /!< Last document id in ilist. /
206
207	byte* ilist; /!< Binary list of documents & word*
208	positions the token appears in.
209	TODO: For now, these are simply
210	ut_malloc'd, but if testing shows
211	that they waste memory unacceptably, a
212	special memory allocator will have
213	to be written /*
214
215	ulint doc_count; /!< Number of doc ids in ilist /
216
217	ulint ilist_size; /!< Used size of ilist in bytes. /
218
219	ulint ilist_size_alloc;
220	/!< Allocated size of ilist in*
221	bytes /*
222	bool synced; /!< flag whether the node is synced /
223	};
224
225	/* A tokenizer word. Contains information about one word. /
226	struct fts_tokenizer_word_t {
227	fts_string_t text; /!< Token text. /
228
229	ib_vector_t* nodes; /!< Word node ilists, each element is*
230	of type fts_node_t /*
231	};
232
233	/* Word text plus it's array of nodes as on disk in FTS index /
234	struct fts_word_t {
235	fts_string_t text; /!< Word value in UTF-8 /
236	ib_vector_t* nodes; /!< Nodes read from disk /
237
238	ib_alloc_t* heap_alloc; /!< For handling all allocations /
239	};
240
241	/* Callback for reading and filtering nodes that are read from FTS index /
242	struct fts_fetch_t {
243	void* read_arg; /!< Arg for the sql_callback /
244
245	fts_sql_callback
246	read_record; /!< Callback for reading index*
247	record /*
248	ulint total_memory; /!< Total memory used /
249	};
250
251	/* For horizontally splitting an FTS auxiliary index /
252	struct fts_index_selector_t {
253	ulint value; /!< Character value at which*
254	to split /*
255
256	const char* suffix; /!< FTS aux index suffix /
257	};
258
259	/* This type represents a single document. /
260	struct fts_doc_t {
261	fts_string_t text; /!< document text /
262
263	ibool found; /!< TRUE if the document was found*
264	successfully in the database /*
265
266	ib_rbt_t* tokens; /!< This is filled when the document*
267	is tokenized. Tokens; indexed by
268	fts_string_t, cells are of type*
269	fts_token_t /
270
271	ib_alloc_t* self_heap; /!< An instance of this type is*
272	allocated from this heap along
273	with any objects that have the
274	same lifespan, most notably
275	the vector of token positions /*
276	CHARSET_INFO* charset; /!< Document's charset info /
277
278	st_mysql_ftparser* parser; /!< fts plugin parser /
279
280	ib_rbt_t* stopwords; /!< Stopwords /
281	};
282
283	/* A token and its positions within a document. /
284	struct fts_token_t {
285	fts_string_t text; /!< token text /
286
287	ib_vector_t* positions; /!< an array of the positions the*
288	token is found in; each item is
289	actually an ulint. /*
290	};
291
292	/* It's defined in fts/fts0fts.c /
293	extern const fts_index_selector_t fts_index_selector[];
294
295	/****************************************************************//**
296	Compare two fts_trx_row_t instances doc_ids. /*
297	UNIV_INLINE
298	int
299	fts_trx_row_doc_id_cmp(
300	/===================/
301	/!< out:*
302	< 0 if n1 < n2,
303	0 if n1 == n2,
304	> 0 if n1 > n2 /*
305	const void* p1, /!< in: id1 /
306	const void* p2); /!< in: id2 /
307
308	/****************************************************************//**
309	Compare two fts_ranking_t instances doc_ids. /*
310	UNIV_INLINE
311	int
312	fts_ranking_doc_id_cmp(
313	/===================/
314	/!< out:*
315	< 0 if n1 < n2,
316	0 if n1 == n2,
317	> 0 if n1 > n2 /*
318	const void* p1, /!< in: id1 /
319	const void* p2); /!< in: id2 /
320
321	/****************************************************************//**
322	Compare two fts_update_t instances doc_ids. /*
323	UNIV_INLINE
324	int
325	fts_update_doc_id_cmp(
326	/==================/
327	/!< out:*
328	< 0 if n1 < n2,
329	0 if n1 == n2,
330	> 0 if n1 > n2 /*
331	const void* p1, /!< in: id1 /
332	const void* p2); /!< in: id2 /
333
334	/****************************************************************//**
335	Decode and return the integer that was encoded using our VLC scheme./*
336	UNIV_INLINE
337	ulint
338	fts_decode_vlc(
339	/===========/
340	/!< out: value decoded /
341	byte** ptr); /!< in: ptr to decode from, this ptr is*
342	incremented by the number of bytes decoded /*
343
344	/****************************************************************//**
345	Duplicate a string. /*
346	UNIV_INLINE
347	void
348	fts_string_dup(
349	/===========/
350	/!< out:*
351	< 0 if n1 < n2,
352	0 if n1 == n2,
353	> 0 if n1 > n2 /*
354	fts_string_t* dst, /!< in: dup to here /
355	const fts_string_t* src, /!< in: src string /
356	mem_heap_t* heap); /!< in: heap to use /
357
358	/****************************************************************//**
359	Return length of val if it were encoded using our VLC scheme. /*
360	UNIV_INLINE
361	ulint
362	fts_get_encoded_len(
363	/================/
364	/!< out: length of value*
365	encoded, in bytes /*
366	ulint val); /!< in: value to encode /
367
368	/****************************************************************//**
369	Encode an integer using our VLC scheme and return the length in bytes. /*
370	UNIV_INLINE
371	ulint
372	fts_encode_int(
373	/===========/
374	/!< out: length of value*
375	encoded, in bytes /*
376	ulint val, /!< in: value to encode /
377	byte* buf); /!< in: buffer, must have*
378	enough space /*
379
380	/****************************************************************//**
381	Get the selected FTS aux INDEX suffix. /*
382	UNIV_INLINE
383	const char*
384	fts_get_suffix(
385	/===========/
386	ulint selected); /!< in: selected index /
387
388	/* Select the FTS auxiliary index for the given character.*
389	@param[in] cs charset
390	@param[in] str string
391	@param[in] len string length in bytes
392	@return the index to use for the string /*
393	UNIV_INLINE
394	ulint
395	fts_select_index(
396	const CHARSET_INFO* cs,
397	const byte* str,
398	ulint len);
399
400	#include "fts0types.ic"
401	#include "fts0vlc.ic"
402
403	#endif /* INNOBASE_FTS0TYPES_H */
404

Browse the source code of MariaDB/storage/innobase/include/fts0types.h