1/*****************************************************************************
2
3Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
4Copyright (c) 2017, MariaDB Corporation.
5
6This program is free software; you can redistribute it and/or modify it under
7the terms of the GNU General Public License as published by the Free Software
8Foundation; version 2 of the License.
9
10This program is distributed in the hope that it will be useful, but WITHOUT
11ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
14You should have received a copy of the GNU General Public License along with
15this program; if not, write to the Free Software Foundation, Inc.,
1651 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
17
18*****************************************************************************/
19
20/******************************************************************//**
21@file include/fts0types.h
22Full text search types file
23
24Created 2007-03-27 Sunny Bains
25*******************************************************/
26
27#ifndef INNOBASE_FTS0TYPES_H
28#define INNOBASE_FTS0TYPES_H
29
30#include "univ.i"
31#include "fts0fts.h"
32#include "fut0fut.h"
33#include "pars0pars.h"
34#include "que0types.h"
35#include "ut0byte.h"
36#include "ut0rbt.h"
37
38/** Types used within FTS. */
39struct fts_que_t;
40struct fts_node_t;
41
42/** Callbacks used within FTS. */
43typedef pars_user_func_cb_t fts_sql_callback;
44typedef void (*fts_filter)(void*, fts_node_t*, void*, ulint len);
45
46/** Statistics relevant to a particular document, used during retrieval. */
47struct fts_doc_stats_t {
48 doc_id_t doc_id; /*!< Document id */
49 ulint word_count; /*!< Total words in the document */
50};
51
52/** It's main purpose is to store the SQL prepared statements that
53are required to retrieve a document from the database. */
54struct fts_get_doc_t {
55 fts_index_cache_t*
56 index_cache; /*!< The index cache instance */
57
58 /*!< Parsed sql statement */
59 que_t* get_document_graph;
60 fts_cache_t* cache; /*!< The parent cache */
61};
62
63/** Since we can have multiple FTS indexes on a table, we keep a
64per index cache of words etc. */
65struct fts_index_cache_t {
66 dict_index_t* index; /*!< The FTS index instance */
67
68 ib_rbt_t* words; /*!< Nodes; indexed by fts_string_t*,
69 cells are fts_tokenizer_word_t*.*/
70
71 ib_vector_t* doc_stats; /*!< Array of the fts_doc_stats_t
72 contained in the memory buffer.
73 Must be in sorted order (ascending).
74 The ideal choice is an rb tree but
75 the rb tree imposes a space overhead
76 that we can do without */
77
78 que_t** ins_graph; /*!< Insert query graphs */
79
80 que_t** sel_graph; /*!< Select query graphs */
81 CHARSET_INFO* charset; /*!< charset */
82};
83
84/** For supporting the tracking of updates on multiple FTS indexes we need
85to track which FTS indexes need to be updated. For INSERT and DELETE we
86update all fts indexes. */
87struct fts_update_t {
88 doc_id_t doc_id; /*!< The doc id affected */
89
90 ib_vector_t* fts_indexes; /*!< The FTS indexes that need to be
91 updated. A NULL value means all
92 indexes need to be updated. This
93 vector is not allocated on the heap
94 and so must be freed explicitly,
95 when we are done with it */
96};
97
98/** Stop word control infotmation. */
99struct fts_stopword_t {
100 ulint status; /*!< Status of the stopword tree */
101 ib_alloc_t* heap; /*!< The memory allocator to use */
102 ib_rbt_t* cached_stopword;/*!< This stores all active stopwords */
103 CHARSET_INFO* charset; /*!< charset for stopword */
104};
105
106/** The SYNC state of the cache. There is one instance of this struct
107associated with each ADD thread. */
108struct fts_sync_t {
109 trx_t* trx; /*!< The transaction used for SYNCing
110 the cache to disk */
111 dict_table_t* table; /*!< Table with FTS index(es) */
112 ulint max_cache_size; /*!< Max size in bytes of the cache */
113 ibool cache_full; /*!< flag, when true it indicates that
114 we need to sync the cache to disk */
115 ulint lower_index; /*!< the start index of the doc id
116 vector from where to start adding
117 documents to the FTS cache */
118 ulint upper_index; /*!< max index of the doc id vector to
119 add to the FTS cache */
120 ibool interrupted; /*!< TRUE if SYNC was interrupted */
121 doc_id_t min_doc_id; /*!< The smallest doc id added to the
122 cache. It should equal to
123 doc_ids[lower_index] */
124 doc_id_t max_doc_id; /*!< The doc id at which the cache was
125 noted as being full, we use this to
126 set the upper_limit field */
127 ib_time_t start_time; /*!< SYNC start time */
128 bool in_progress; /*!< flag whether sync is in progress.*/
129 bool unlock_cache; /*!< flag whether unlock cache when
130 write fts node */
131 os_event_t event; /*!< sync finish event;
132 only os_event_set() and os_event_wait()
133 are used */
134};
135
136/** The cache for the FTS system. It is a memory-based inverted index
137that new entries are added to, until it grows over the configured maximum
138size, at which time its contents are written to the INDEX table. */
139struct fts_cache_t {
140 rw_lock_t lock; /*!< lock protecting all access to the
141 memory buffer. FIXME: this needs to
142 be our new upgrade-capable rw-lock */
143
144 rw_lock_t init_lock; /*!< lock used for the cache
145 intialization, it has different
146 SYNC level as above cache lock */
147
148 ib_mutex_t optimize_lock; /*!< Lock for OPTIMIZE */
149
150 ib_mutex_t deleted_lock; /*!< Lock covering deleted_doc_ids */
151
152 ib_mutex_t doc_id_lock; /*!< Lock covering Doc ID */
153
154 ib_vector_t* deleted_doc_ids;/*!< Array of deleted doc ids, each
155 element is of type fts_update_t */
156
157 ib_vector_t* indexes; /*!< We store the stats and inverted
158 index for the individual FTS indexes
159 in this vector. Each element is
160 an instance of fts_index_cache_t */
161
162 ib_vector_t* get_docs; /*!< information required to read
163 the document from the table. Each
164 element is of type fts_doc_t */
165
166 ulint total_size; /*!< total size consumed by the ilist
167 field of all nodes. SYNC is run
168 whenever this gets too big */
169 fts_sync_t* sync; /*!< sync structure to sync data to
170 disk */
171 ib_alloc_t* sync_heap; /*!< The heap allocator, for indexes
172 and deleted_doc_ids, ie. transient
173 objects, they are recreated after
174 a SYNC is completed */
175
176 ib_alloc_t* self_heap; /*!< This heap is the heap out of
177 which an instance of the cache itself
178 was created. Objects created using
179 this heap will last for the lifetime
180 of the cache */
181
182 doc_id_t next_doc_id; /*!< Next doc id */
183
184 doc_id_t synced_doc_id; /*!< Doc ID sync-ed to CONFIG table */
185
186 doc_id_t first_doc_id; /*!< first doc id since this table
187 was opened */
188
189 ulint deleted; /*!< Number of doc ids deleted since
190 last optimized. This variable is
191 covered by deleted_lock */
192
193 ulint added; /*!< Number of doc ids added since last
194 optimized. This variable is covered by
195 the deleted lock */
196
197 fts_stopword_t stopword_info; /*!< Cached stopwords for the FTS */
198 mem_heap_t* cache_heap; /*!< Cache Heap */
199};
200
201/** Columns of the FTS auxiliary INDEX table */
202struct fts_node_t {
203 doc_id_t first_doc_id; /*!< First document id in ilist. */
204
205 doc_id_t last_doc_id; /*!< Last document id in ilist. */
206
207 byte* ilist; /*!< Binary list of documents & word
208 positions the token appears in.
209 TODO: For now, these are simply
210 ut_malloc'd, but if testing shows
211 that they waste memory unacceptably, a
212 special memory allocator will have
213 to be written */
214
215 ulint doc_count; /*!< Number of doc ids in ilist */
216
217 ulint ilist_size; /*!< Used size of ilist in bytes. */
218
219 ulint ilist_size_alloc;
220 /*!< Allocated size of ilist in
221 bytes */
222 bool synced; /*!< flag whether the node is synced */
223};
224
225/** A tokenizer word. Contains information about one word. */
226struct fts_tokenizer_word_t {
227 fts_string_t text; /*!< Token text. */
228
229 ib_vector_t* nodes; /*!< Word node ilists, each element is
230 of type fts_node_t */
231};
232
233/** Word text plus it's array of nodes as on disk in FTS index */
234struct fts_word_t {
235 fts_string_t text; /*!< Word value in UTF-8 */
236 ib_vector_t* nodes; /*!< Nodes read from disk */
237
238 ib_alloc_t* heap_alloc; /*!< For handling all allocations */
239};
240
241/** Callback for reading and filtering nodes that are read from FTS index */
242struct fts_fetch_t {
243 void* read_arg; /*!< Arg for the sql_callback */
244
245 fts_sql_callback
246 read_record; /*!< Callback for reading index
247 record */
248 ulint total_memory; /*!< Total memory used */
249};
250
251/** For horizontally splitting an FTS auxiliary index */
252struct fts_index_selector_t {
253 ulint value; /*!< Character value at which
254 to split */
255
256 const char* suffix; /*!< FTS aux index suffix */
257};
258
259/** This type represents a single document. */
260struct fts_doc_t {
261 fts_string_t text; /*!< document text */
262
263 ibool found; /*!< TRUE if the document was found
264 successfully in the database */
265
266 ib_rbt_t* tokens; /*!< This is filled when the document
267 is tokenized. Tokens; indexed by
268 fts_string_t*, cells are of type
269 fts_token_t* */
270
271 ib_alloc_t* self_heap; /*!< An instance of this type is
272 allocated from this heap along
273 with any objects that have the
274 same lifespan, most notably
275 the vector of token positions */
276 CHARSET_INFO* charset; /*!< Document's charset info */
277
278 st_mysql_ftparser* parser; /*!< fts plugin parser */
279
280 ib_rbt_t* stopwords; /*!< Stopwords */
281};
282
283/** A token and its positions within a document. */
284struct fts_token_t {
285 fts_string_t text; /*!< token text */
286
287 ib_vector_t* positions; /*!< an array of the positions the
288 token is found in; each item is
289 actually an ulint. */
290};
291
292/** It's defined in fts/fts0fts.c */
293extern const fts_index_selector_t fts_index_selector[];
294
295/******************************************************************//**
296Compare two fts_trx_row_t instances doc_ids. */
297UNIV_INLINE
298int
299fts_trx_row_doc_id_cmp(
300/*===================*/
301 /*!< out:
302 < 0 if n1 < n2,
303 0 if n1 == n2,
304 > 0 if n1 > n2 */
305 const void* p1, /*!< in: id1 */
306 const void* p2); /*!< in: id2 */
307
308/******************************************************************//**
309Compare two fts_ranking_t instances doc_ids. */
310UNIV_INLINE
311int
312fts_ranking_doc_id_cmp(
313/*===================*/
314 /*!< out:
315 < 0 if n1 < n2,
316 0 if n1 == n2,
317 > 0 if n1 > n2 */
318 const void* p1, /*!< in: id1 */
319 const void* p2); /*!< in: id2 */
320
321/******************************************************************//**
322Compare two fts_update_t instances doc_ids. */
323UNIV_INLINE
324int
325fts_update_doc_id_cmp(
326/*==================*/
327 /*!< out:
328 < 0 if n1 < n2,
329 0 if n1 == n2,
330 > 0 if n1 > n2 */
331 const void* p1, /*!< in: id1 */
332 const void* p2); /*!< in: id2 */
333
334/******************************************************************//**
335Decode and return the integer that was encoded using our VLC scheme.*/
336UNIV_INLINE
337ulint
338fts_decode_vlc(
339/*===========*/
340 /*!< out: value decoded */
341 byte** ptr); /*!< in: ptr to decode from, this ptr is
342 incremented by the number of bytes decoded */
343
344/******************************************************************//**
345Duplicate a string. */
346UNIV_INLINE
347void
348fts_string_dup(
349/*===========*/
350 /*!< out:
351 < 0 if n1 < n2,
352 0 if n1 == n2,
353 > 0 if n1 > n2 */
354 fts_string_t* dst, /*!< in: dup to here */
355 const fts_string_t* src, /*!< in: src string */
356 mem_heap_t* heap); /*!< in: heap to use */
357
358/******************************************************************//**
359Return length of val if it were encoded using our VLC scheme. */
360UNIV_INLINE
361ulint
362fts_get_encoded_len(
363/*================*/
364 /*!< out: length of value
365 encoded, in bytes */
366 ulint val); /*!< in: value to encode */
367
368/******************************************************************//**
369Encode an integer using our VLC scheme and return the length in bytes. */
370UNIV_INLINE
371ulint
372fts_encode_int(
373/*===========*/
374 /*!< out: length of value
375 encoded, in bytes */
376 ulint val, /*!< in: value to encode */
377 byte* buf); /*!< in: buffer, must have
378 enough space */
379
380/******************************************************************//**
381Get the selected FTS aux INDEX suffix. */
382UNIV_INLINE
383const char*
384fts_get_suffix(
385/*===========*/
386 ulint selected); /*!< in: selected index */
387
388/** Select the FTS auxiliary index for the given character.
389@param[in] cs charset
390@param[in] str string
391@param[in] len string length in bytes
392@return the index to use for the string */
393UNIV_INLINE
394ulint
395fts_select_index(
396 const CHARSET_INFO* cs,
397 const byte* str,
398 ulint len);
399
400#include "fts0types.ic"
401#include "fts0vlc.ic"
402
403#endif /* INNOBASE_FTS0TYPES_H */
404