| 1 | /***************************************************************************** |
| 2 | |
| 3 | Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved. |
| 4 | Copyright (c) 2017, MariaDB Corporation. |
| 5 | |
| 6 | This program is free software; you can redistribute it and/or modify it under |
| 7 | the terms of the GNU General Public License as published by the Free Software |
| 8 | Foundation; version 2 of the License. |
| 9 | |
| 10 | This program is distributed in the hope that it will be useful, but WITHOUT |
| 11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| 12 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
| 13 | |
| 14 | You should have received a copy of the GNU General Public License along with |
| 15 | this program; if not, write to the Free Software Foundation, Inc., |
| 16 | 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA |
| 17 | |
| 18 | *****************************************************************************/ |
| 19 | |
| 20 | /******************************************************************//** |
| 21 | @file include/fts0types.h |
| 22 | Full text search types file |
| 23 | |
| 24 | Created 2007-03-27 Sunny Bains |
| 25 | *******************************************************/ |
| 26 | |
| 27 | #ifndef INNOBASE_FTS0TYPES_H |
| 28 | #define INNOBASE_FTS0TYPES_H |
| 29 | |
| 30 | #include "univ.i" |
| 31 | #include "fts0fts.h" |
| 32 | #include "fut0fut.h" |
| 33 | #include "pars0pars.h" |
| 34 | #include "que0types.h" |
| 35 | #include "ut0byte.h" |
| 36 | #include "ut0rbt.h" |
| 37 | |
| 38 | /** Types used within FTS. */ |
| 39 | struct fts_que_t; |
| 40 | struct fts_node_t; |
| 41 | |
| 42 | /** Callbacks used within FTS. */ |
| 43 | typedef pars_user_func_cb_t fts_sql_callback; |
| 44 | typedef void (*fts_filter)(void*, fts_node_t*, void*, ulint len); |
| 45 | |
| 46 | /** Statistics relevant to a particular document, used during retrieval. */ |
| 47 | struct fts_doc_stats_t { |
| 48 | doc_id_t doc_id; /*!< Document id */ |
| 49 | ulint word_count; /*!< Total words in the document */ |
| 50 | }; |
| 51 | |
| 52 | /** It's main purpose is to store the SQL prepared statements that |
| 53 | are required to retrieve a document from the database. */ |
| 54 | struct fts_get_doc_t { |
| 55 | fts_index_cache_t* |
| 56 | index_cache; /*!< The index cache instance */ |
| 57 | |
| 58 | /*!< Parsed sql statement */ |
| 59 | que_t* get_document_graph; |
| 60 | fts_cache_t* cache; /*!< The parent cache */ |
| 61 | }; |
| 62 | |
| 63 | /** Since we can have multiple FTS indexes on a table, we keep a |
| 64 | per index cache of words etc. */ |
| 65 | struct fts_index_cache_t { |
| 66 | dict_index_t* index; /*!< The FTS index instance */ |
| 67 | |
| 68 | ib_rbt_t* words; /*!< Nodes; indexed by fts_string_t*, |
| 69 | cells are fts_tokenizer_word_t*.*/ |
| 70 | |
| 71 | ib_vector_t* doc_stats; /*!< Array of the fts_doc_stats_t |
| 72 | contained in the memory buffer. |
| 73 | Must be in sorted order (ascending). |
| 74 | The ideal choice is an rb tree but |
| 75 | the rb tree imposes a space overhead |
| 76 | that we can do without */ |
| 77 | |
| 78 | que_t** ins_graph; /*!< Insert query graphs */ |
| 79 | |
| 80 | que_t** sel_graph; /*!< Select query graphs */ |
| 81 | CHARSET_INFO* charset; /*!< charset */ |
| 82 | }; |
| 83 | |
| 84 | /** For supporting the tracking of updates on multiple FTS indexes we need |
| 85 | to track which FTS indexes need to be updated. For INSERT and DELETE we |
| 86 | update all fts indexes. */ |
| 87 | struct fts_update_t { |
| 88 | doc_id_t doc_id; /*!< The doc id affected */ |
| 89 | |
| 90 | ib_vector_t* fts_indexes; /*!< The FTS indexes that need to be |
| 91 | updated. A NULL value means all |
| 92 | indexes need to be updated. This |
| 93 | vector is not allocated on the heap |
| 94 | and so must be freed explicitly, |
| 95 | when we are done with it */ |
| 96 | }; |
| 97 | |
| 98 | /** Stop word control infotmation. */ |
| 99 | struct fts_stopword_t { |
| 100 | ulint status; /*!< Status of the stopword tree */ |
| 101 | ib_alloc_t* heap; /*!< The memory allocator to use */ |
| 102 | ib_rbt_t* cached_stopword;/*!< This stores all active stopwords */ |
| 103 | CHARSET_INFO* charset; /*!< charset for stopword */ |
| 104 | }; |
| 105 | |
| 106 | /** The SYNC state of the cache. There is one instance of this struct |
| 107 | associated with each ADD thread. */ |
| 108 | struct fts_sync_t { |
| 109 | trx_t* trx; /*!< The transaction used for SYNCing |
| 110 | the cache to disk */ |
| 111 | dict_table_t* table; /*!< Table with FTS index(es) */ |
| 112 | ulint max_cache_size; /*!< Max size in bytes of the cache */ |
| 113 | ibool cache_full; /*!< flag, when true it indicates that |
| 114 | we need to sync the cache to disk */ |
| 115 | ulint lower_index; /*!< the start index of the doc id |
| 116 | vector from where to start adding |
| 117 | documents to the FTS cache */ |
| 118 | ulint upper_index; /*!< max index of the doc id vector to |
| 119 | add to the FTS cache */ |
| 120 | ibool interrupted; /*!< TRUE if SYNC was interrupted */ |
| 121 | doc_id_t min_doc_id; /*!< The smallest doc id added to the |
| 122 | cache. It should equal to |
| 123 | doc_ids[lower_index] */ |
| 124 | doc_id_t max_doc_id; /*!< The doc id at which the cache was |
| 125 | noted as being full, we use this to |
| 126 | set the upper_limit field */ |
| 127 | ib_time_t start_time; /*!< SYNC start time */ |
| 128 | bool in_progress; /*!< flag whether sync is in progress.*/ |
| 129 | bool unlock_cache; /*!< flag whether unlock cache when |
| 130 | write fts node */ |
| 131 | os_event_t event; /*!< sync finish event; |
| 132 | only os_event_set() and os_event_wait() |
| 133 | are used */ |
| 134 | }; |
| 135 | |
| 136 | /** The cache for the FTS system. It is a memory-based inverted index |
| 137 | that new entries are added to, until it grows over the configured maximum |
| 138 | size, at which time its contents are written to the INDEX table. */ |
| 139 | struct fts_cache_t { |
| 140 | rw_lock_t lock; /*!< lock protecting all access to the |
| 141 | memory buffer. FIXME: this needs to |
| 142 | be our new upgrade-capable rw-lock */ |
| 143 | |
| 144 | rw_lock_t init_lock; /*!< lock used for the cache |
| 145 | intialization, it has different |
| 146 | SYNC level as above cache lock */ |
| 147 | |
| 148 | ib_mutex_t optimize_lock; /*!< Lock for OPTIMIZE */ |
| 149 | |
| 150 | ib_mutex_t deleted_lock; /*!< Lock covering deleted_doc_ids */ |
| 151 | |
| 152 | ib_mutex_t doc_id_lock; /*!< Lock covering Doc ID */ |
| 153 | |
| 154 | ib_vector_t* deleted_doc_ids;/*!< Array of deleted doc ids, each |
| 155 | element is of type fts_update_t */ |
| 156 | |
| 157 | ib_vector_t* indexes; /*!< We store the stats and inverted |
| 158 | index for the individual FTS indexes |
| 159 | in this vector. Each element is |
| 160 | an instance of fts_index_cache_t */ |
| 161 | |
| 162 | ib_vector_t* get_docs; /*!< information required to read |
| 163 | the document from the table. Each |
| 164 | element is of type fts_doc_t */ |
| 165 | |
| 166 | ulint total_size; /*!< total size consumed by the ilist |
| 167 | field of all nodes. SYNC is run |
| 168 | whenever this gets too big */ |
| 169 | fts_sync_t* sync; /*!< sync structure to sync data to |
| 170 | disk */ |
| 171 | ib_alloc_t* sync_heap; /*!< The heap allocator, for indexes |
| 172 | and deleted_doc_ids, ie. transient |
| 173 | objects, they are recreated after |
| 174 | a SYNC is completed */ |
| 175 | |
| 176 | ib_alloc_t* self_heap; /*!< This heap is the heap out of |
| 177 | which an instance of the cache itself |
| 178 | was created. Objects created using |
| 179 | this heap will last for the lifetime |
| 180 | of the cache */ |
| 181 | |
| 182 | doc_id_t next_doc_id; /*!< Next doc id */ |
| 183 | |
| 184 | doc_id_t synced_doc_id; /*!< Doc ID sync-ed to CONFIG table */ |
| 185 | |
| 186 | doc_id_t first_doc_id; /*!< first doc id since this table |
| 187 | was opened */ |
| 188 | |
| 189 | ulint deleted; /*!< Number of doc ids deleted since |
| 190 | last optimized. This variable is |
| 191 | covered by deleted_lock */ |
| 192 | |
| 193 | ulint added; /*!< Number of doc ids added since last |
| 194 | optimized. This variable is covered by |
| 195 | the deleted lock */ |
| 196 | |
| 197 | fts_stopword_t stopword_info; /*!< Cached stopwords for the FTS */ |
| 198 | mem_heap_t* cache_heap; /*!< Cache Heap */ |
| 199 | }; |
| 200 | |
| 201 | /** Columns of the FTS auxiliary INDEX table */ |
| 202 | struct fts_node_t { |
| 203 | doc_id_t first_doc_id; /*!< First document id in ilist. */ |
| 204 | |
| 205 | doc_id_t last_doc_id; /*!< Last document id in ilist. */ |
| 206 | |
| 207 | byte* ilist; /*!< Binary list of documents & word |
| 208 | positions the token appears in. |
| 209 | TODO: For now, these are simply |
| 210 | ut_malloc'd, but if testing shows |
| 211 | that they waste memory unacceptably, a |
| 212 | special memory allocator will have |
| 213 | to be written */ |
| 214 | |
| 215 | ulint doc_count; /*!< Number of doc ids in ilist */ |
| 216 | |
| 217 | ulint ilist_size; /*!< Used size of ilist in bytes. */ |
| 218 | |
| 219 | ulint ilist_size_alloc; |
| 220 | /*!< Allocated size of ilist in |
| 221 | bytes */ |
| 222 | bool synced; /*!< flag whether the node is synced */ |
| 223 | }; |
| 224 | |
| 225 | /** A tokenizer word. Contains information about one word. */ |
| 226 | struct fts_tokenizer_word_t { |
| 227 | fts_string_t text; /*!< Token text. */ |
| 228 | |
| 229 | ib_vector_t* nodes; /*!< Word node ilists, each element is |
| 230 | of type fts_node_t */ |
| 231 | }; |
| 232 | |
| 233 | /** Word text plus it's array of nodes as on disk in FTS index */ |
| 234 | struct fts_word_t { |
| 235 | fts_string_t text; /*!< Word value in UTF-8 */ |
| 236 | ib_vector_t* nodes; /*!< Nodes read from disk */ |
| 237 | |
| 238 | ib_alloc_t* heap_alloc; /*!< For handling all allocations */ |
| 239 | }; |
| 240 | |
| 241 | /** Callback for reading and filtering nodes that are read from FTS index */ |
| 242 | struct fts_fetch_t { |
| 243 | void* read_arg; /*!< Arg for the sql_callback */ |
| 244 | |
| 245 | fts_sql_callback |
| 246 | read_record; /*!< Callback for reading index |
| 247 | record */ |
| 248 | ulint total_memory; /*!< Total memory used */ |
| 249 | }; |
| 250 | |
| 251 | /** For horizontally splitting an FTS auxiliary index */ |
| 252 | struct fts_index_selector_t { |
| 253 | ulint value; /*!< Character value at which |
| 254 | to split */ |
| 255 | |
| 256 | const char* suffix; /*!< FTS aux index suffix */ |
| 257 | }; |
| 258 | |
| 259 | /** This type represents a single document. */ |
| 260 | struct fts_doc_t { |
| 261 | fts_string_t text; /*!< document text */ |
| 262 | |
| 263 | ibool found; /*!< TRUE if the document was found |
| 264 | successfully in the database */ |
| 265 | |
| 266 | ib_rbt_t* tokens; /*!< This is filled when the document |
| 267 | is tokenized. Tokens; indexed by |
| 268 | fts_string_t*, cells are of type |
| 269 | fts_token_t* */ |
| 270 | |
| 271 | ib_alloc_t* self_heap; /*!< An instance of this type is |
| 272 | allocated from this heap along |
| 273 | with any objects that have the |
| 274 | same lifespan, most notably |
| 275 | the vector of token positions */ |
| 276 | CHARSET_INFO* charset; /*!< Document's charset info */ |
| 277 | |
| 278 | st_mysql_ftparser* parser; /*!< fts plugin parser */ |
| 279 | |
| 280 | ib_rbt_t* stopwords; /*!< Stopwords */ |
| 281 | }; |
| 282 | |
| 283 | /** A token and its positions within a document. */ |
| 284 | struct fts_token_t { |
| 285 | fts_string_t text; /*!< token text */ |
| 286 | |
| 287 | ib_vector_t* positions; /*!< an array of the positions the |
| 288 | token is found in; each item is |
| 289 | actually an ulint. */ |
| 290 | }; |
| 291 | |
| 292 | /** It's defined in fts/fts0fts.c */ |
| 293 | extern const fts_index_selector_t fts_index_selector[]; |
| 294 | |
| 295 | /******************************************************************//** |
| 296 | Compare two fts_trx_row_t instances doc_ids. */ |
| 297 | UNIV_INLINE |
| 298 | int |
| 299 | fts_trx_row_doc_id_cmp( |
| 300 | /*===================*/ |
| 301 | /*!< out: |
| 302 | < 0 if n1 < n2, |
| 303 | 0 if n1 == n2, |
| 304 | > 0 if n1 > n2 */ |
| 305 | const void* p1, /*!< in: id1 */ |
| 306 | const void* p2); /*!< in: id2 */ |
| 307 | |
| 308 | /******************************************************************//** |
| 309 | Compare two fts_ranking_t instances doc_ids. */ |
| 310 | UNIV_INLINE |
| 311 | int |
| 312 | fts_ranking_doc_id_cmp( |
| 313 | /*===================*/ |
| 314 | /*!< out: |
| 315 | < 0 if n1 < n2, |
| 316 | 0 if n1 == n2, |
| 317 | > 0 if n1 > n2 */ |
| 318 | const void* p1, /*!< in: id1 */ |
| 319 | const void* p2); /*!< in: id2 */ |
| 320 | |
| 321 | /******************************************************************//** |
| 322 | Compare two fts_update_t instances doc_ids. */ |
| 323 | UNIV_INLINE |
| 324 | int |
| 325 | fts_update_doc_id_cmp( |
| 326 | /*==================*/ |
| 327 | /*!< out: |
| 328 | < 0 if n1 < n2, |
| 329 | 0 if n1 == n2, |
| 330 | > 0 if n1 > n2 */ |
| 331 | const void* p1, /*!< in: id1 */ |
| 332 | const void* p2); /*!< in: id2 */ |
| 333 | |
| 334 | /******************************************************************//** |
| 335 | Decode and return the integer that was encoded using our VLC scheme.*/ |
| 336 | UNIV_INLINE |
| 337 | ulint |
| 338 | fts_decode_vlc( |
| 339 | /*===========*/ |
| 340 | /*!< out: value decoded */ |
| 341 | byte** ptr); /*!< in: ptr to decode from, this ptr is |
| 342 | incremented by the number of bytes decoded */ |
| 343 | |
| 344 | /******************************************************************//** |
| 345 | Duplicate a string. */ |
| 346 | UNIV_INLINE |
| 347 | void |
| 348 | fts_string_dup( |
| 349 | /*===========*/ |
| 350 | /*!< out: |
| 351 | < 0 if n1 < n2, |
| 352 | 0 if n1 == n2, |
| 353 | > 0 if n1 > n2 */ |
| 354 | fts_string_t* dst, /*!< in: dup to here */ |
| 355 | const fts_string_t* src, /*!< in: src string */ |
| 356 | mem_heap_t* heap); /*!< in: heap to use */ |
| 357 | |
| 358 | /******************************************************************//** |
| 359 | Return length of val if it were encoded using our VLC scheme. */ |
| 360 | UNIV_INLINE |
| 361 | ulint |
| 362 | fts_get_encoded_len( |
| 363 | /*================*/ |
| 364 | /*!< out: length of value |
| 365 | encoded, in bytes */ |
| 366 | ulint val); /*!< in: value to encode */ |
| 367 | |
| 368 | /******************************************************************//** |
| 369 | Encode an integer using our VLC scheme and return the length in bytes. */ |
| 370 | UNIV_INLINE |
| 371 | ulint |
| 372 | fts_encode_int( |
| 373 | /*===========*/ |
| 374 | /*!< out: length of value |
| 375 | encoded, in bytes */ |
| 376 | ulint val, /*!< in: value to encode */ |
| 377 | byte* buf); /*!< in: buffer, must have |
| 378 | enough space */ |
| 379 | |
| 380 | /******************************************************************//** |
| 381 | Get the selected FTS aux INDEX suffix. */ |
| 382 | UNIV_INLINE |
| 383 | const char* |
| 384 | fts_get_suffix( |
| 385 | /*===========*/ |
| 386 | ulint selected); /*!< in: selected index */ |
| 387 | |
| 388 | /** Select the FTS auxiliary index for the given character. |
| 389 | @param[in] cs charset |
| 390 | @param[in] str string |
| 391 | @param[in] len string length in bytes |
| 392 | @return the index to use for the string */ |
| 393 | UNIV_INLINE |
| 394 | ulint |
| 395 | fts_select_index( |
| 396 | const CHARSET_INFO* cs, |
| 397 | const byte* str, |
| 398 | ulint len); |
| 399 | |
| 400 | #include "fts0types.ic" |
| 401 | #include "fts0vlc.ic" |
| 402 | |
| 403 | #endif /* INNOBASE_FTS0TYPES_H */ |
| 404 | |