1 | /***************************************************************************** |
2 | |
3 | Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved. |
4 | Copyright (c) 2017, MariaDB Corporation. |
5 | |
6 | This program is free software; you can redistribute it and/or modify it under |
7 | the terms of the GNU General Public License as published by the Free Software |
8 | Foundation; version 2 of the License. |
9 | |
10 | This program is distributed in the hope that it will be useful, but WITHOUT |
11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
13 | |
14 | You should have received a copy of the GNU General Public License along with |
15 | this program; if not, write to the Free Software Foundation, Inc., |
16 | 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA |
17 | |
18 | *****************************************************************************/ |
19 | |
20 | /******************************************************************//** |
21 | @file include/fts0types.h |
22 | Full text search types file |
23 | |
24 | Created 2007-03-27 Sunny Bains |
25 | *******************************************************/ |
26 | |
27 | #ifndef INNOBASE_FTS0TYPES_H |
28 | #define INNOBASE_FTS0TYPES_H |
29 | |
30 | #include "univ.i" |
31 | #include "fts0fts.h" |
32 | #include "fut0fut.h" |
33 | #include "pars0pars.h" |
34 | #include "que0types.h" |
35 | #include "ut0byte.h" |
36 | #include "ut0rbt.h" |
37 | |
38 | /** Types used within FTS. */ |
39 | struct fts_que_t; |
40 | struct fts_node_t; |
41 | |
42 | /** Callbacks used within FTS. */ |
43 | typedef pars_user_func_cb_t fts_sql_callback; |
44 | typedef void (*fts_filter)(void*, fts_node_t*, void*, ulint len); |
45 | |
46 | /** Statistics relevant to a particular document, used during retrieval. */ |
47 | struct fts_doc_stats_t { |
48 | doc_id_t doc_id; /*!< Document id */ |
49 | ulint word_count; /*!< Total words in the document */ |
50 | }; |
51 | |
52 | /** It's main purpose is to store the SQL prepared statements that |
53 | are required to retrieve a document from the database. */ |
54 | struct fts_get_doc_t { |
55 | fts_index_cache_t* |
56 | index_cache; /*!< The index cache instance */ |
57 | |
58 | /*!< Parsed sql statement */ |
59 | que_t* get_document_graph; |
60 | fts_cache_t* cache; /*!< The parent cache */ |
61 | }; |
62 | |
63 | /** Since we can have multiple FTS indexes on a table, we keep a |
64 | per index cache of words etc. */ |
65 | struct fts_index_cache_t { |
66 | dict_index_t* index; /*!< The FTS index instance */ |
67 | |
68 | ib_rbt_t* words; /*!< Nodes; indexed by fts_string_t*, |
69 | cells are fts_tokenizer_word_t*.*/ |
70 | |
71 | ib_vector_t* doc_stats; /*!< Array of the fts_doc_stats_t |
72 | contained in the memory buffer. |
73 | Must be in sorted order (ascending). |
74 | The ideal choice is an rb tree but |
75 | the rb tree imposes a space overhead |
76 | that we can do without */ |
77 | |
78 | que_t** ins_graph; /*!< Insert query graphs */ |
79 | |
80 | que_t** sel_graph; /*!< Select query graphs */ |
81 | CHARSET_INFO* charset; /*!< charset */ |
82 | }; |
83 | |
84 | /** For supporting the tracking of updates on multiple FTS indexes we need |
85 | to track which FTS indexes need to be updated. For INSERT and DELETE we |
86 | update all fts indexes. */ |
87 | struct fts_update_t { |
88 | doc_id_t doc_id; /*!< The doc id affected */ |
89 | |
90 | ib_vector_t* fts_indexes; /*!< The FTS indexes that need to be |
91 | updated. A NULL value means all |
92 | indexes need to be updated. This |
93 | vector is not allocated on the heap |
94 | and so must be freed explicitly, |
95 | when we are done with it */ |
96 | }; |
97 | |
98 | /** Stop word control infotmation. */ |
99 | struct fts_stopword_t { |
100 | ulint status; /*!< Status of the stopword tree */ |
101 | ib_alloc_t* heap; /*!< The memory allocator to use */ |
102 | ib_rbt_t* cached_stopword;/*!< This stores all active stopwords */ |
103 | CHARSET_INFO* charset; /*!< charset for stopword */ |
104 | }; |
105 | |
106 | /** The SYNC state of the cache. There is one instance of this struct |
107 | associated with each ADD thread. */ |
108 | struct fts_sync_t { |
109 | trx_t* trx; /*!< The transaction used for SYNCing |
110 | the cache to disk */ |
111 | dict_table_t* table; /*!< Table with FTS index(es) */ |
112 | ulint max_cache_size; /*!< Max size in bytes of the cache */ |
113 | ibool cache_full; /*!< flag, when true it indicates that |
114 | we need to sync the cache to disk */ |
115 | ulint lower_index; /*!< the start index of the doc id |
116 | vector from where to start adding |
117 | documents to the FTS cache */ |
118 | ulint upper_index; /*!< max index of the doc id vector to |
119 | add to the FTS cache */ |
120 | ibool interrupted; /*!< TRUE if SYNC was interrupted */ |
121 | doc_id_t min_doc_id; /*!< The smallest doc id added to the |
122 | cache. It should equal to |
123 | doc_ids[lower_index] */ |
124 | doc_id_t max_doc_id; /*!< The doc id at which the cache was |
125 | noted as being full, we use this to |
126 | set the upper_limit field */ |
127 | ib_time_t start_time; /*!< SYNC start time */ |
128 | bool in_progress; /*!< flag whether sync is in progress.*/ |
129 | bool unlock_cache; /*!< flag whether unlock cache when |
130 | write fts node */ |
131 | os_event_t event; /*!< sync finish event; |
132 | only os_event_set() and os_event_wait() |
133 | are used */ |
134 | }; |
135 | |
136 | /** The cache for the FTS system. It is a memory-based inverted index |
137 | that new entries are added to, until it grows over the configured maximum |
138 | size, at which time its contents are written to the INDEX table. */ |
139 | struct fts_cache_t { |
140 | rw_lock_t lock; /*!< lock protecting all access to the |
141 | memory buffer. FIXME: this needs to |
142 | be our new upgrade-capable rw-lock */ |
143 | |
144 | rw_lock_t init_lock; /*!< lock used for the cache |
145 | intialization, it has different |
146 | SYNC level as above cache lock */ |
147 | |
148 | ib_mutex_t optimize_lock; /*!< Lock for OPTIMIZE */ |
149 | |
150 | ib_mutex_t deleted_lock; /*!< Lock covering deleted_doc_ids */ |
151 | |
152 | ib_mutex_t doc_id_lock; /*!< Lock covering Doc ID */ |
153 | |
154 | ib_vector_t* deleted_doc_ids;/*!< Array of deleted doc ids, each |
155 | element is of type fts_update_t */ |
156 | |
157 | ib_vector_t* indexes; /*!< We store the stats and inverted |
158 | index for the individual FTS indexes |
159 | in this vector. Each element is |
160 | an instance of fts_index_cache_t */ |
161 | |
162 | ib_vector_t* get_docs; /*!< information required to read |
163 | the document from the table. Each |
164 | element is of type fts_doc_t */ |
165 | |
166 | ulint total_size; /*!< total size consumed by the ilist |
167 | field of all nodes. SYNC is run |
168 | whenever this gets too big */ |
169 | fts_sync_t* sync; /*!< sync structure to sync data to |
170 | disk */ |
171 | ib_alloc_t* sync_heap; /*!< The heap allocator, for indexes |
172 | and deleted_doc_ids, ie. transient |
173 | objects, they are recreated after |
174 | a SYNC is completed */ |
175 | |
176 | ib_alloc_t* self_heap; /*!< This heap is the heap out of |
177 | which an instance of the cache itself |
178 | was created. Objects created using |
179 | this heap will last for the lifetime |
180 | of the cache */ |
181 | |
182 | doc_id_t next_doc_id; /*!< Next doc id */ |
183 | |
184 | doc_id_t synced_doc_id; /*!< Doc ID sync-ed to CONFIG table */ |
185 | |
186 | doc_id_t first_doc_id; /*!< first doc id since this table |
187 | was opened */ |
188 | |
189 | ulint deleted; /*!< Number of doc ids deleted since |
190 | last optimized. This variable is |
191 | covered by deleted_lock */ |
192 | |
193 | ulint added; /*!< Number of doc ids added since last |
194 | optimized. This variable is covered by |
195 | the deleted lock */ |
196 | |
197 | fts_stopword_t stopword_info; /*!< Cached stopwords for the FTS */ |
198 | mem_heap_t* cache_heap; /*!< Cache Heap */ |
199 | }; |
200 | |
201 | /** Columns of the FTS auxiliary INDEX table */ |
202 | struct fts_node_t { |
203 | doc_id_t first_doc_id; /*!< First document id in ilist. */ |
204 | |
205 | doc_id_t last_doc_id; /*!< Last document id in ilist. */ |
206 | |
207 | byte* ilist; /*!< Binary list of documents & word |
208 | positions the token appears in. |
209 | TODO: For now, these are simply |
210 | ut_malloc'd, but if testing shows |
211 | that they waste memory unacceptably, a |
212 | special memory allocator will have |
213 | to be written */ |
214 | |
215 | ulint doc_count; /*!< Number of doc ids in ilist */ |
216 | |
217 | ulint ilist_size; /*!< Used size of ilist in bytes. */ |
218 | |
219 | ulint ilist_size_alloc; |
220 | /*!< Allocated size of ilist in |
221 | bytes */ |
222 | bool synced; /*!< flag whether the node is synced */ |
223 | }; |
224 | |
225 | /** A tokenizer word. Contains information about one word. */ |
226 | struct fts_tokenizer_word_t { |
227 | fts_string_t text; /*!< Token text. */ |
228 | |
229 | ib_vector_t* nodes; /*!< Word node ilists, each element is |
230 | of type fts_node_t */ |
231 | }; |
232 | |
233 | /** Word text plus it's array of nodes as on disk in FTS index */ |
234 | struct fts_word_t { |
235 | fts_string_t text; /*!< Word value in UTF-8 */ |
236 | ib_vector_t* nodes; /*!< Nodes read from disk */ |
237 | |
238 | ib_alloc_t* heap_alloc; /*!< For handling all allocations */ |
239 | }; |
240 | |
241 | /** Callback for reading and filtering nodes that are read from FTS index */ |
242 | struct fts_fetch_t { |
243 | void* read_arg; /*!< Arg for the sql_callback */ |
244 | |
245 | fts_sql_callback |
246 | read_record; /*!< Callback for reading index |
247 | record */ |
248 | ulint total_memory; /*!< Total memory used */ |
249 | }; |
250 | |
251 | /** For horizontally splitting an FTS auxiliary index */ |
252 | struct fts_index_selector_t { |
253 | ulint value; /*!< Character value at which |
254 | to split */ |
255 | |
256 | const char* suffix; /*!< FTS aux index suffix */ |
257 | }; |
258 | |
259 | /** This type represents a single document. */ |
260 | struct fts_doc_t { |
261 | fts_string_t text; /*!< document text */ |
262 | |
263 | ibool found; /*!< TRUE if the document was found |
264 | successfully in the database */ |
265 | |
266 | ib_rbt_t* tokens; /*!< This is filled when the document |
267 | is tokenized. Tokens; indexed by |
268 | fts_string_t*, cells are of type |
269 | fts_token_t* */ |
270 | |
271 | ib_alloc_t* self_heap; /*!< An instance of this type is |
272 | allocated from this heap along |
273 | with any objects that have the |
274 | same lifespan, most notably |
275 | the vector of token positions */ |
276 | CHARSET_INFO* charset; /*!< Document's charset info */ |
277 | |
278 | st_mysql_ftparser* parser; /*!< fts plugin parser */ |
279 | |
280 | ib_rbt_t* stopwords; /*!< Stopwords */ |
281 | }; |
282 | |
283 | /** A token and its positions within a document. */ |
284 | struct fts_token_t { |
285 | fts_string_t text; /*!< token text */ |
286 | |
287 | ib_vector_t* positions; /*!< an array of the positions the |
288 | token is found in; each item is |
289 | actually an ulint. */ |
290 | }; |
291 | |
292 | /** It's defined in fts/fts0fts.c */ |
293 | extern const fts_index_selector_t fts_index_selector[]; |
294 | |
295 | /******************************************************************//** |
296 | Compare two fts_trx_row_t instances doc_ids. */ |
297 | UNIV_INLINE |
298 | int |
299 | fts_trx_row_doc_id_cmp( |
300 | /*===================*/ |
301 | /*!< out: |
302 | < 0 if n1 < n2, |
303 | 0 if n1 == n2, |
304 | > 0 if n1 > n2 */ |
305 | const void* p1, /*!< in: id1 */ |
306 | const void* p2); /*!< in: id2 */ |
307 | |
308 | /******************************************************************//** |
309 | Compare two fts_ranking_t instances doc_ids. */ |
310 | UNIV_INLINE |
311 | int |
312 | fts_ranking_doc_id_cmp( |
313 | /*===================*/ |
314 | /*!< out: |
315 | < 0 if n1 < n2, |
316 | 0 if n1 == n2, |
317 | > 0 if n1 > n2 */ |
318 | const void* p1, /*!< in: id1 */ |
319 | const void* p2); /*!< in: id2 */ |
320 | |
321 | /******************************************************************//** |
322 | Compare two fts_update_t instances doc_ids. */ |
323 | UNIV_INLINE |
324 | int |
325 | fts_update_doc_id_cmp( |
326 | /*==================*/ |
327 | /*!< out: |
328 | < 0 if n1 < n2, |
329 | 0 if n1 == n2, |
330 | > 0 if n1 > n2 */ |
331 | const void* p1, /*!< in: id1 */ |
332 | const void* p2); /*!< in: id2 */ |
333 | |
334 | /******************************************************************//** |
335 | Decode and return the integer that was encoded using our VLC scheme.*/ |
336 | UNIV_INLINE |
337 | ulint |
338 | fts_decode_vlc( |
339 | /*===========*/ |
340 | /*!< out: value decoded */ |
341 | byte** ptr); /*!< in: ptr to decode from, this ptr is |
342 | incremented by the number of bytes decoded */ |
343 | |
344 | /******************************************************************//** |
345 | Duplicate a string. */ |
346 | UNIV_INLINE |
347 | void |
348 | fts_string_dup( |
349 | /*===========*/ |
350 | /*!< out: |
351 | < 0 if n1 < n2, |
352 | 0 if n1 == n2, |
353 | > 0 if n1 > n2 */ |
354 | fts_string_t* dst, /*!< in: dup to here */ |
355 | const fts_string_t* src, /*!< in: src string */ |
356 | mem_heap_t* heap); /*!< in: heap to use */ |
357 | |
358 | /******************************************************************//** |
359 | Return length of val if it were encoded using our VLC scheme. */ |
360 | UNIV_INLINE |
361 | ulint |
362 | fts_get_encoded_len( |
363 | /*================*/ |
364 | /*!< out: length of value |
365 | encoded, in bytes */ |
366 | ulint val); /*!< in: value to encode */ |
367 | |
368 | /******************************************************************//** |
369 | Encode an integer using our VLC scheme and return the length in bytes. */ |
370 | UNIV_INLINE |
371 | ulint |
372 | fts_encode_int( |
373 | /*===========*/ |
374 | /*!< out: length of value |
375 | encoded, in bytes */ |
376 | ulint val, /*!< in: value to encode */ |
377 | byte* buf); /*!< in: buffer, must have |
378 | enough space */ |
379 | |
380 | /******************************************************************//** |
381 | Get the selected FTS aux INDEX suffix. */ |
382 | UNIV_INLINE |
383 | const char* |
384 | fts_get_suffix( |
385 | /*===========*/ |
386 | ulint selected); /*!< in: selected index */ |
387 | |
388 | /** Select the FTS auxiliary index for the given character. |
389 | @param[in] cs charset |
390 | @param[in] str string |
391 | @param[in] len string length in bytes |
392 | @return the index to use for the string */ |
393 | UNIV_INLINE |
394 | ulint |
395 | fts_select_index( |
396 | const CHARSET_INFO* cs, |
397 | const byte* str, |
398 | ulint len); |
399 | |
400 | #include "fts0types.ic" |
401 | #include "fts0vlc.ic" |
402 | |
403 | #endif /* INNOBASE_FTS0TYPES_H */ |
404 | |