1/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
3#ident "$Id$"
4/*======
5This file is part of TokuDB
6
7
8Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
9
    TokuDB is free software: you can redistribute it and/or modify
11 it under the terms of the GNU General Public License, version 2,
12 as published by the Free Software Foundation.
13
14 TokuDB is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with TokuDB. If not, see <http://www.gnu.org/licenses/>.
21
22======= */
23
24#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
25
26#ifndef _HA_TOKUDB_H
27#define _HA_TOKUDB_H
28
29#include "hatoku_hton.h"
30#include "hatoku_cmp.h"
31#include "tokudb_background.h"
32
33#define HA_TOKU_ORIG_VERSION 4
34#define HA_TOKU_VERSION 4
35//
36// no capabilities yet
37//
38#define HA_TOKU_CAP 0
39
40class ha_tokudb;
41
42typedef struct loader_context {
43 THD* thd;
44 char write_status_msg[200];
45 ha_tokudb* ha;
46} *LOADER_CONTEXT;
47
48//
49// This class stores table information that is to be shared
50// among all ha_tokudb objects.
51// There is one instance per table, shared among handlers.
52// Some of the variables here are the DB* pointers to indexes,
53// and auto increment information.
54//
// When the last user releases its reference on the share,
56// it closes all of its database handles and releases all info
57// The share instance stays around though so some data can be transiently
58// kept across open-close-open-close cycles. These data will be explicitly
59// noted below.
60//
61class TOKUDB_SHARE {
62public:
63 enum share_state_t {
64 CLOSED = 0,
65 OPENED = 1,
66 ERROR = 2
67 };
68
69 // one time, start up init
70 static void static_init();
71
72 // one time, shutdown destroy
73 static void static_destroy();
74
75 // retuns a locked, properly reference counted share
76 // callers must check to ensure share is in correct state for callers use
77 // and unlock the share.
78 // if create_new is set, a new "CLOSED" share will be created if one
79 // doesn't exist, otherwise will return NULL if an existing is not found.
80 static TOKUDB_SHARE* get_share(
81 const char* table_name,
82 TABLE_SHARE* table_share,
83 THR_LOCK_DATA* data,
84 bool create_new);
85
86 // removes a share entirely from the pool, call to rename/deleta a table
87 // caller must hold ddl_mutex on this share and the share MUST have
88 // exactly 0 _use_count
89 static void drop_share(TOKUDB_SHARE* share);
90
91 // returns state string for logging/reporting
92 static const char* get_state_string(share_state_t state);
93
94 void* operator new(size_t sz);
95 void operator delete(void* p);
96
97 TOKUDB_SHARE();
98
99 // increases the ref count and waits for any currently executing state
100 // transition to complete
101 // returns current state and leaves share locked
102 // callers must check to ensure share is in correct state for callers use
103 // and unlock the share.
104 share_state_t addref();
105
106 // decreases the ref count and potentially closes the share
107 // caller must not have ownership of mutex, will lock and release
108 int release();
109
110 // returns the current use count
111 // no locking requirements
112 inline int use_count() const;
113
114 // locks the share
115 inline void lock() const;
116
117 // unlocks the share
118 inline void unlock() const;
119
120 // returns the current state of the share
121 // no locking requirements
122 inline share_state_t state() const;
123
124 // sets the state of the share
125 // caller must hold mutex on this share
126 inline void set_state(share_state_t state);
127
128 // returns the full MySQL table name of the table ex:
129 // ./database/table
130 // no locking requirements
131 inline const char* full_table_name() const;
132
133 // returns the strlen of the full table name
134 // no locking requirements
135 inline uint full_table_name_length() const;
136
137 // returns the parsed database name this table resides in
138 // no locking requirements
139 inline const char* database_name() const;
140
141 // returns the strlen of the database name
142 // no locking requirements
143 inline uint database_name_length() const;
144
145 // returns the parsed table name of this table
146 // no locking requirements
147 inline const char* table_name() const;
148
149 // returns the strlen of the the table name
150 // no locking requirements
151 inline uint table_name_length() const;
152
153 // sets the estimated number of rows in the table
154 // should be called only during share initialization and info call
155 // caller must hold mutex on this share unless specified by 'locked'
156 inline void set_row_count(uint64_t rows, bool locked);
157
158 // updates tracked row count and ongoing table change delta tracking
159 // called from any ha_tokudb operation that inserts/modifies/deletes rows
160 // may spawn background analysis if enabled, allowed and threshold hit
161 // caller must not have ownership of mutex, will lock and release
162 void update_row_count(
163 THD* thd,
164 uint64_t added,
165 uint64_t deleted,
166 uint64_t updated);
167
168 // returns the current row count estimate
169 // no locking requirements
170 inline ha_rows row_count() const;
171
172 // initializes cardinality statistics, takes ownership of incoming buffer
173 // caller must hold mutex on this share
174 inline void init_cardinality_counts(
175 uint32_t rec_per_keys,
176 uint64_t* rec_per_key);
177
178 // update the cardinality statistics. number of records must match
179 // caller must hold mutex on this share
180 inline void update_cardinality_counts(
181 uint32_t rec_per_keys,
182 const uint64_t* rec_per_key);
183
184 // disallow any auto analysis from taking place
185 // caller must hold mutex on this share
186 inline void disallow_auto_analysis();
187
188 // allow any auto analysis to take place
189 // pass in true for 'reset_deltas' to reset delta counting to 0
190 // caller must hold mutex on this share
191 inline void allow_auto_analysis(bool reset_deltas);
192
193 // cancels all background jobs for this share
194 // no locking requirements
195 inline void cancel_background_jobs() const;
196
197 // copies cardinality statistics into TABLE counter set
198 // caller must not have ownership of mutex, will lock and release
199 void set_cardinality_counts_in_table(TABLE* table);
200
201 // performs table analysis on underlying indices and produces estimated
202 // cardinality statistics.
203 // on success updates cardinality counts in status database and this share
204 // MUST pass a valid THD to access session variables.
205 // MAY pass txn. If txn is passed, assumes an explicit user scheduled
206 // ANALYZE and not an auto ANALYZE resulting from delta threshold
207 // uses session variables:
208 // tokudb_analyze_in_background, tokudb_analyze_throttle,
209 // tokudb_analyze_time, and tokudb_analyze_delete_fraction
210 // caller must hold mutex on this share
211 int analyze_standard(THD* thd, DB_TXN* txn);
212
213 // performs table scan and updates the internal FT logical row count value
214 // on success also updates share row count estimate.
215 // MUST pass a valid THD to access session variables.
216 // MAY pass txn. If txn is passed, assumes an explicit user scheduled
217 // uses session variables:
218 // tokudb_analyze_in_background, and tokudb_analyze_throttle
219 // caller must not have ownership of mutex, will lock and release
220 int analyze_recount_rows(THD* thd, DB_TXN* txn);
221
222public:
223 //*********************************
224 // Destroyed and recreated on open-close-open
225 ulonglong auto_ident;
226 ulonglong last_auto_increment, auto_inc_create_value;
227
228 // estimate on number of rows added in the process of a locked tables
229 // this is so we can better estimate row count during a lock table
230 ha_rows rows_from_locked_table;
231 DB* status_block;
232
233 // DB that is indexed on the primary key
234 DB* file;
235
236 // array of all DB's that make up table, includes DB that
237 // is indexed on the primary key, add 1 in case primary
238 // key is hidden
239 DB* key_file[MAX_KEY + 1];
240 uint status, version, capabilities;
241 uint ref_length;
242
243 // whether table has an auto increment column
244 bool has_auto_inc;
245
246 // index of auto increment column in table->field, if auto_inc exists
247 uint ai_field_index;
248
249 // whether the primary key has a string
250 bool pk_has_string;
251
252 KEY_AND_COL_INFO kc_info;
253
254 // key info copied from TABLE_SHARE, used by background jobs that have no
255 // access to a handler instance
256 uint _keys;
257 uint _max_key_parts;
258 struct key_descriptor_t {
259 uint _parts;
260 bool _is_unique;
261 char* _name;
262 };
263 key_descriptor_t* _key_descriptors;
264
265 // we want the following optimization for bulk loads, if the table is empty,
266 // attempt to grab a table lock. emptiness check can be expensive,
267 // so we try it once for a table. After that, we keep this variable around
268 // to tell us to not try it again.
269 bool try_table_lock;
270
271 bool has_unique_keys;
272 bool replace_into_fast;
273 tokudb::thread::rwlock_t _num_DBs_lock;
274 uint32_t num_DBs;
275
276private:
277 static HASH _open_tables;
278 static tokudb::thread::mutex_t _open_tables_mutex;
279
280 static uchar* hash_get_key(
281 TOKUDB_SHARE* share,
282 size_t* length,
283 TOKUDB_UNUSED(my_bool not_used));
284
285 static void hash_free_element(TOKUDB_SHARE* share);
286
287 //*********************************
288 // Spans open-close-open
289 mutable tokudb::thread::mutex_t _mutex;
290 mutable tokudb::thread::mutex_t _ddl_mutex;
291 uint _use_count;
292
293 share_state_t _state;
294
295 ulonglong _row_delta_activity;
296 bool _allow_auto_analysis;
297
298 String _full_table_name;
299 String _database_name;
300 String _table_name;
301
302 //*********************************
303 // Destroyed and recreated on open-close-open
304 THR_LOCK _thr_lock;
305
306 // estimate on number of rows in table
307 ha_rows _rows;
308
309 // cardinality counts
310 uint32_t _rec_per_keys;
311 uint64_t* _rec_per_key;
312
313 void init(const char* table_name);
314 void destroy();
315};
316inline int TOKUDB_SHARE::use_count() const {
317 return _use_count;
318}
319inline void TOKUDB_SHARE::lock() const {
320 TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]",
321 _full_table_name.ptr(),
322 get_state_string(_state),
323 _use_count);
324 mutex_t_lock(_mutex);
325 TOKUDB_SHARE_DBUG_VOID_RETURN();
326}
327inline void TOKUDB_SHARE::unlock() const {
328 TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]",
329 _full_table_name.ptr(),
330 get_state_string(_state),
331 _use_count);
332 mutex_t_unlock(_mutex);
333 TOKUDB_SHARE_DBUG_VOID_RETURN();
334}
335inline TOKUDB_SHARE::share_state_t TOKUDB_SHARE::state() const {
336 return _state;
337}
338inline void TOKUDB_SHARE::set_state(TOKUDB_SHARE::share_state_t state) {
339 TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]:new_state[%s]",
340 _full_table_name.ptr(),
341 get_state_string(_state),
342 _use_count,
343 get_state_string(state));
344
345 assert_debug(_mutex.is_owned_by_me());
346 _state = state;
347 TOKUDB_SHARE_DBUG_VOID_RETURN();
348}
349inline const char* TOKUDB_SHARE::full_table_name() const {
350 return _full_table_name.ptr();
351}
352inline uint TOKUDB_SHARE::full_table_name_length() const {
353 return _full_table_name.length();
354}
355inline const char* TOKUDB_SHARE::database_name() const {
356 return _database_name.ptr();
357}
358inline uint TOKUDB_SHARE::database_name_length() const {
359 return _database_name.length();
360}
361inline const char* TOKUDB_SHARE::table_name() const {
362 return _table_name.ptr();
363}
364inline uint TOKUDB_SHARE::table_name_length() const {
365 return _table_name.length();
366}
367inline void TOKUDB_SHARE::set_row_count(uint64_t rows, bool locked) {
368 TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]:rows[%" PRIu64 "]:locked[%d]",
369 _full_table_name.ptr(),
370 get_state_string(_state),
371 _use_count,
372 rows,
373 locked);
374
375 if (!locked) {
376 lock();
377 } else {
378 assert_debug(_mutex.is_owned_by_me());
379 }
380 if (_rows && rows == 0)
381 _row_delta_activity = 0;
382
383 _rows = rows;
384 if (!locked) {
385 unlock();
386 }
387 TOKUDB_SHARE_DBUG_VOID_RETURN();
388}
389inline ha_rows TOKUDB_SHARE::row_count() const {
390 return _rows;
391}
392inline void TOKUDB_SHARE::init_cardinality_counts(
393 uint32_t rec_per_keys,
394 uint64_t* rec_per_key) {
395
396 assert_debug(_mutex.is_owned_by_me());
397 // can not change number of keys live
398 assert_always(_rec_per_key == NULL && _rec_per_keys == 0);
399 _rec_per_keys = rec_per_keys;
400 _rec_per_key = rec_per_key;
401}
402inline void TOKUDB_SHARE::update_cardinality_counts(
403 uint32_t rec_per_keys,
404 const uint64_t* rec_per_key) {
405
406 assert_debug(_mutex.is_owned_by_me());
407 // can not change number of keys live
408 assert_always(rec_per_keys == _rec_per_keys);
409 assert_always(rec_per_key != NULL);
410 memcpy(_rec_per_key, rec_per_key, _rec_per_keys * sizeof(uint64_t));
411}
412inline void TOKUDB_SHARE::disallow_auto_analysis() {
413 assert_debug(_mutex.is_owned_by_me());
414 _allow_auto_analysis = false;
415}
416inline void TOKUDB_SHARE::allow_auto_analysis(bool reset_deltas) {
417 assert_debug(_mutex.is_owned_by_me());
418 _allow_auto_analysis = true;
419 if (reset_deltas)
420 _row_delta_activity = 0;
421}
422inline void TOKUDB_SHARE::cancel_background_jobs() const {
423 tokudb::background::_job_manager->cancel_job(full_table_name());
424}
425
426
427
428typedef struct st_filter_key_part_info {
429 uint offset;
430 uint part_index;
431} FILTER_KEY_PART_INFO;
432
// Kind of table lock to request; taken by ha_tokudb::acquire_table_lock().
typedef enum {
    lock_read = 0,
    lock_write = 1
} TABLE_LOCK_TYPE;
437
438// the number of rows bulk fetched in one callback grows exponentially
439// with the bulk fetch iteration, so the max iteration is the max number
440// of shifts we can perform on a 64 bit integer.
441#define HA_TOKU_BULK_FETCH_ITERATION_MAX 63
442
443class ha_tokudb : public handler {
444private:
445 THR_LOCK_DATA lock; ///< MySQL lock
446 TOKUDB_SHARE *share; ///< Shared lock info
447
448#ifdef MARIADB_BASE_VERSION
449 // MariaDB version of MRR
450 DsMrr_impl ds_mrr;
451#elif 50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699
452 // MySQL version of MRR
453 DsMrr_impl ds_mrr;
454#endif
455
456 // For ICP. Cache our own copies
457 Item* toku_pushed_idx_cond;
458 uint toku_pushed_idx_cond_keyno; /* The index which the above condition is for */
459 bool icp_went_out_of_range;
460
461 //
462 // last key returned by ha_tokudb's cursor
463 //
464 DBT last_key;
465 //
466 // pointer used for multi_alloc of key_buff, key_buff2, primary_key_buff
467 //
468 void *alloc_ptr;
469 //
470 // buffer used to temporarily store a "packed row"
471 // data pointer of a DBT will end up pointing to this
472 // see pack_row for usage
473 //
474 uchar *rec_buff;
475 //
476 // number of bytes allocated in rec_buff
477 //
478 ulong alloced_rec_buff_length;
479 //
480 // same as above two, but for updates
481 //
482 uchar *rec_update_buff;
483 ulong alloced_update_rec_buff_length;
484 uint32_t max_key_length;
485
486 uchar* range_query_buff; // range query buffer
487 uint32_t size_range_query_buff; // size of the allocated range query buffer
488 uint32_t bytes_used_in_range_query_buff; // number of bytes used in the range query buffer
489 uint32_t curr_range_query_buff_offset; // current offset into the range query buffer for queries to read
490 uint64_t bulk_fetch_iteration;
491 uint64_t rows_fetched_using_bulk_fetch;
492 bool doing_bulk_fetch;
493 bool maybe_index_scan;
494
495 //
496 // buffer used to temporarily store a "packed key"
497 // data pointer of a DBT will end up pointing to this
498 //
499 uchar *key_buff;
500 //
501 // buffer used to temporarily store a "packed key"
502 // data pointer of a DBT will end up pointing to this
503 // This is used in functions that require the packing
504 // of more than one key
505 //
506 uchar *key_buff2;
507 uchar *key_buff3;
508 uchar *key_buff4;
509 //
510 // buffer used to temporarily store a "packed key"
511 // data pointer of a DBT will end up pointing to this
512 // currently this is only used for a primary key in
513 // the function update_row, hence the name. It
514 // does not carry any state throughout the class.
515 //
516 uchar *primary_key_buff;
517
518 //
519 // ranges of prelocked area, used to know how much to bulk fetch
520 //
521 uchar *prelocked_left_range;
522 uint32_t prelocked_left_range_size;
523 uchar *prelocked_right_range;
524 uint32_t prelocked_right_range_size;
525
526
527 //
528 // individual DBTs for each index
529 //
530 DBT_ARRAY mult_key_dbt_array[2*(MAX_KEY + 1)];
531 DBT_ARRAY mult_rec_dbt_array[MAX_KEY + 1];
532 uint32_t mult_put_flags[MAX_KEY + 1];
533 uint32_t mult_del_flags[MAX_KEY + 1];
534 uint32_t mult_dbt_flags[MAX_KEY + 1];
535
536
537 //
538 // when unpacking blobs, we need to store it in a temporary
539 // buffer that will persist because MySQL just gets a pointer to the
540 // blob data, a pointer we need to ensure is valid until the next
541 // query
542 //
543 uchar* blob_buff;
544 uint32_t num_blob_bytes;
545
546 bool unpack_entire_row;
547
548 //
549 // buffers (and their sizes) that will hold the indexes
550 // of fields that need to be read for a query
551 //
552 uint32_t* fixed_cols_for_query;
553 uint32_t num_fixed_cols_for_query;
554 uint32_t* var_cols_for_query;
555 uint32_t num_var_cols_for_query;
556 bool read_blobs;
557 bool read_key;
558
559 //
560 // transaction used by ha_tokudb's cursor
561 //
562 DB_TXN *transaction;
563
564 // external_lock will set this true for read operations that will be closely followed by write operations.
565 bool use_write_locks; // use write locks for reads
566
567 //
568 // instance of cursor being used for init_xxx and rnd_xxx functions
569 //
570 DBC *cursor;
571 uint32_t cursor_flags; // flags for cursor
572 //
573 // flags that are returned in table_flags()
574 //
575 ulonglong int_table_flags;
576 //
577 // count on the number of rows that gets changed, such as when write_row occurs
578 // this is meant to help keep estimate on number of elements in DB
579 //
580 ulonglong added_rows;
581 ulonglong deleted_rows;
582 ulonglong updated_rows;
583
584
585 uint last_dup_key;
586 //
587 // if set to 0, then the primary key is not hidden
588 // if non-zero (not necessarily 1), primary key is hidden
589 //
590 uint hidden_primary_key;
591 bool key_read, using_ignore;
592 bool using_ignore_no_key;
593
594 //
595 // After a cursor encounters an error, the cursor will be unusable
596 // In case MySQL attempts to do a cursor operation (such as rnd_next
597 // or index_prev), we will gracefully return this error instead of crashing
598 //
599 int last_cursor_error;
600
601 //
602 // For instances where we successfully prelock a range or a table,
603 // we set this to true so that successive cursor calls can know
604 // know to limit the locking overhead in a call to the fractal tree
605 //
606 bool range_lock_grabbed;
607 bool range_lock_grabbed_null;
608
609 //
610 // For bulk inserts, we want option of not updating auto inc
611 // until all inserts are done. By default, is false
612 //
613 bool delay_updating_ai_metadata; // if true, don't update auto-increment metadata until bulk load completes
614 bool ai_metadata_update_required; // if true, autoincrement metadata must be updated
615
616 //
617 // buffer for updating the status of long insert, delete, and update
618 // statements. Right now, the the messages are
619 // "[inserted|updated|deleted] about %llu rows",
620 // so a buffer of 200 is good enough.
621 //
622 char write_status_msg[200]; //buffer of 200 should be a good upper bound.
623 struct loader_context lc;
624
625 DB_LOADER* loader;
626 bool abort_loader;
627 int loader_error;
628
629 bool num_DBs_locked_in_bulk;
630 uint32_t lock_count;
631
632 bool fix_rec_buff_for_blob(ulong length);
633 bool fix_rec_update_buff_for_blob(ulong length);
634 uchar current_ident[TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH];
635
636 ulong max_row_length(const uchar * buf);
637 int pack_row_in_buff(
638 DBT * row,
639 const uchar* record,
640 uint index,
641 uchar* row_buff
642 );
643 int pack_row(
644 DBT * row,
645 const uchar* record,
646 uint index
647 );
648 int pack_old_row_for_update(
649 DBT * row,
650 const uchar* record,
651 uint index
652 );
653 uint32_t place_key_into_mysql_buff(KEY* key_info, uchar * record, uchar* data);
654 void unpack_key(uchar * record, DBT const *key, uint index);
655 uint32_t place_key_into_dbt_buff(KEY* key_info, uchar * buff, const uchar * record, bool* has_null, int key_length);
656 DBT* create_dbt_key_from_key(DBT * key, KEY* key_info, uchar * buff, const uchar * record, bool* has_null, bool dont_pack_pk, int key_length, uint8_t inf_byte);
657 DBT *create_dbt_key_from_table(DBT * key, uint keynr, uchar * buff, const uchar * record, bool* has_null, int key_length = MAX_KEY_LENGTH);
658 DBT* create_dbt_key_for_lookup(DBT * key, KEY* key_info, uchar * buff, const uchar * record, bool* has_null, int key_length = MAX_KEY_LENGTH);
659 DBT *pack_key(DBT * key, uint keynr, uchar * buff, const uchar * key_ptr, uint key_length, int8_t inf_byte);
660#if TOKU_INCLUDE_EXTENDED_KEYS
661 DBT *pack_ext_key(DBT * key, uint keynr, uchar * buff, const uchar * key_ptr, uint key_length, int8_t inf_byte);
662#endif
663 bool key_changed(uint keynr, const uchar * old_row, const uchar * new_row);
664 int handle_cursor_error(int error, int err_to_return, uint keynr);
665 DBT *get_pos(DBT * to, uchar * pos);
666
667 int open_main_dictionary(const char* name, bool is_read_only, DB_TXN* txn);
668 int open_secondary_dictionary(DB** ptr, KEY* key_info, const char* name, bool is_read_only, DB_TXN* txn);
669 int acquire_table_lock (DB_TXN* trans, TABLE_LOCK_TYPE lt);
670 int estimate_num_rows(DB* db, uint64_t* num_rows, DB_TXN* txn);
671 bool has_auto_increment_flag(uint* index);
672
673 int write_frm_data(DB* db, DB_TXN* txn, const char* frm_name);
674 int verify_frm_data(const char* frm_name, DB_TXN* trans);
675 int remove_frm_data(DB *db, DB_TXN *txn);
676
677 int write_to_status(DB* db, HA_METADATA_KEY curr_key_data, void* data, uint size, DB_TXN* txn);
678 int remove_from_status(DB* db, HA_METADATA_KEY curr_key_data, DB_TXN* txn);
679
680 int write_metadata(DB* db, void* key, uint key_size, void* data, uint data_size, DB_TXN* txn);
681 int remove_metadata(DB* db, void* key_data, uint key_size, DB_TXN* transaction);
682
683 int update_max_auto_inc(DB* db, ulonglong val);
684 int remove_key_name_from_status(DB* status_block, const char* key_name, DB_TXN* txn);
685 int write_key_name_to_status(DB* status_block, const char* key_name, DB_TXN* txn);
686 int write_auto_inc_create(DB* db, ulonglong val, DB_TXN* txn);
687 void init_auto_increment();
688 bool can_replace_into_be_fast(TABLE_SHARE* table_share, KEY_AND_COL_INFO* kc_info, uint pk);
689 int initialize_share(const char* name, int mode);
690
691 void set_query_columns(uint keynr);
692 int prelock_range (const key_range *start_key, const key_range *end_key);
693 int create_txn(THD* thd, tokudb_trx_data* trx);
694 bool may_table_be_empty(DB_TXN *txn);
695 int delete_or_rename_table (const char* from_name, const char* to_name, bool is_delete);
696 int delete_or_rename_dictionary( const char* from_name, const char* to_name, const char* index_name, bool is_key, DB_TXN* txn, bool is_delete);
697 int truncate_dictionary( uint keynr, DB_TXN* txn );
698 int create_secondary_dictionary(
699 const char* name,
700 TABLE* form,
701 KEY* key_info,
702 DB_TXN* txn,
703 KEY_AND_COL_INFO* kc_info,
704 uint32_t keynr,
705 bool is_hot_index,
706 toku_compression_method compression_method
707 );
708 int create_main_dictionary(const char* name, TABLE* form, DB_TXN* txn, KEY_AND_COL_INFO* kc_info, toku_compression_method compression_method);
709 void trace_create_table_info(const char *name, TABLE * form);
710 int is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_info, int lock_flags);
711 int is_val_unique(bool* is_unique, const uchar* record, KEY* key_info, uint dict_index, DB_TXN* txn);
712 int do_uniqueness_checks(uchar* record, DB_TXN* txn, THD* thd);
713 void set_main_dict_put_flags(THD* thd, bool opt_eligible, uint32_t* put_flags);
714 int insert_row_to_main_dictionary(uchar* record, DBT* pk_key, DBT* pk_val, DB_TXN* txn);
715 int insert_rows_to_dictionaries_mult(DBT* pk_key, DBT* pk_val, DB_TXN* txn, THD* thd);
716 void test_row_packing(uchar* record, DBT* pk_key, DBT* pk_val);
717 uint32_t fill_row_mutator(
718 uchar* buf,
719 uint32_t* dropped_columns,
720 uint32_t num_dropped_columns,
721 TABLE* altered_table,
722 KEY_AND_COL_INFO* altered_kc_info,
723 uint32_t keynr,
724 bool is_add
725 );
726
727 // 0 <= active_index < table_share->keys || active_index == MAX_KEY
728 // tokudb_active_index = active_index if active_index < table_share->keys, else tokudb_active_index = primary_key = table_share->keys
729 uint tokudb_active_index;
730
731public:
732 ha_tokudb(handlerton * hton, TABLE_SHARE * table_arg);
733 ~ha_tokudb();
734
735 const char *table_type() const;
736 const char *index_type(uint inx);
737 const char **bas_ext() const;
738
739 //
740 // Returns a bit mask of capabilities of storage engine. Capabilities
741 // defined in sql/handler.h
742 //
743 ulonglong table_flags() const;
744
745 ulong index_flags(uint inx, uint part, bool all_parts) const;
746
747 //
748 // Returns limit on the number of keys imposed by tokudb.
749 //
750 uint max_supported_keys() const {
751 return MAX_KEY;
752 }
753
754 uint extra_rec_buf_length() const {
755 return TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
756 }
757 ha_rows estimate_rows_upper_bound();
758
759 //
760 // Returns the limit on the key length imposed by tokudb.
761 //
762 uint max_supported_key_length() const {
763 return UINT_MAX32;
764 }
765
766 //
767 // Returns limit on key part length imposed by tokudb.
768 //
769 uint max_supported_key_part_length() const {
770 return UINT_MAX32;
771 }
772 const key_map *keys_to_use_for_scanning() {
773 return &key_map_full;
774 }
775
776 double scan_time();
777
778 double read_time(uint index, uint ranges, ha_rows rows);
779
780 // Defined in mariadb
781 double keyread_time(uint index, uint ranges, ha_rows rows);
782
783 // Defined in mysql 5.6
784 double index_only_read_time(uint keynr, double records);
785
786 int open(const char *name, int mode, uint test_if_locked);
787 int close();
788 void update_create_info(HA_CREATE_INFO* create_info);
789 int create(const char *name, TABLE * form, HA_CREATE_INFO * create_info);
790 int delete_table(const char *name);
791 int rename_table(const char *from, const char *to);
792 int optimize(THD * thd, HA_CHECK_OPT * check_opt);
793 int analyze(THD * thd, HA_CHECK_OPT * check_opt);
794 int write_row(uchar * buf);
795 int update_row(const uchar * old_data, const uchar * new_data);
796 int delete_row(const uchar * buf);
797#if MYSQL_VERSION_ID >= 100000
798 void start_bulk_insert(ha_rows rows, uint flags);
799#else
800 void start_bulk_insert(ha_rows rows);
801#endif
802 static int bulk_insert_poll(void* extra, float progress);
803 static void loader_add_index_err(DB* db,
804 int i,
805 int err,
806 DBT* key,
807 DBT* val,
808 void* error_extra);
809 static void loader_dup(DB* db,
810 int i,
811 int err,
812 DBT* key,
813 DBT* val,
814 void* error_extra);
815 int end_bulk_insert();
816 int end_bulk_insert(bool abort);
817
818 int prepare_index_scan();
819 int prepare_index_key_scan( const uchar * key, uint key_len );
820 int prepare_range_scan( const key_range *start_key, const key_range *end_key);
821 void column_bitmaps_signal();
822 int index_init(uint index, bool sorted);
823 int index_end();
824 int index_next_same(uchar * buf, const uchar * key, uint keylen);
825 int index_read(uchar * buf, const uchar * key, uint key_len, enum ha_rkey_function find_flag);
826 int index_read_last(uchar * buf, const uchar * key, uint key_len);
827 int index_next(uchar * buf);
828 int index_prev(uchar * buf);
829 int index_first(uchar * buf);
830 int index_last(uchar * buf);
831
832 bool has_gap_locks() const { return true; }
833
834 int rnd_init(bool scan);
835 int rnd_end();
836 int rnd_next(uchar * buf);
837 int rnd_pos(uchar * buf, uchar * pos);
838
839 int read_range_first(const key_range *start_key,
840 const key_range *end_key,
841 bool eq_range, bool sorted);
842 int read_range_next();
843
844
845 void position(const uchar * record);
846 int info(uint);
847 int extra(enum ha_extra_function operation);
848 int reset();
849 int external_lock(THD * thd, int lock_type);
850 int start_stmt(THD * thd, thr_lock_type lock_type);
851
852 ha_rows records_in_range(uint inx, key_range * min_key, key_range * max_key);
853
854 uint32_t get_cursor_isolation_flags(enum thr_lock_type lock_type, THD* thd);
855 THR_LOCK_DATA **store_lock(THD * thd, THR_LOCK_DATA ** to, enum thr_lock_type lock_type);
856
857 int get_status(DB_TXN* trans);
858 void init_hidden_prim_key_info(DB_TXN *txn);
859 inline void get_auto_primary_key(uchar * to) {
860 share->lock();
861 share->auto_ident++;
862 hpk_num_to_char(to, share->auto_ident);
863 share->unlock();
864 }
865 virtual void get_auto_increment(
866 ulonglong offset,
867 ulonglong increment,
868 ulonglong nb_desired_values,
869 ulonglong* first_value,
870 ulonglong* nb_reserved_values);
871 bool is_optimize_blocking();
872 bool is_auto_inc_singleton();
873 void print_error(int error, myf errflag);
874 uint8 table_cache_type() {
875 return HA_CACHE_TBL_TRANSACT;
876 }
877 bool primary_key_is_clustered() {
878 return true;
879 }
880 int cmp_ref(const uchar * ref1, const uchar * ref2);
881 bool check_if_incompatible_data(HA_CREATE_INFO * info, uint table_changes);
882
883#ifdef MARIADB_BASE_VERSION
884
885// MariaDB MRR introduced in 5.5, API changed in MariaDB 10.0
886#if MYSQL_VERSION_ID >= 100000
887#define COST_VECT Cost_estimate
888#endif
889
890 int multi_range_read_init(RANGE_SEQ_IF* seq,
891 void* seq_init_param,
892 uint n_ranges, uint mode,
893 HANDLER_BUFFER *buf);
894 int multi_range_read_next(range_id_t *range_info);
895 ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
896 void *seq_init_param,
897 uint n_ranges, uint *bufsz,
898 uint *flags, COST_VECT *cost);
899 ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
900 uint key_parts, uint *bufsz,
901 uint *flags, COST_VECT *cost);
902 int multi_range_read_explain_info(uint mrr_mode, char *str, size_t size);
903
904#else
905
906// MySQL MRR introduced in 5.6
907#if 50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699
908 int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
909 uint n_ranges, uint mode, HANDLER_BUFFER *buf);
910 int multi_range_read_next(char **range_info);
911 ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
912 void *seq_init_param,
913 uint n_ranges, uint *bufsz,
914 uint *flags, Cost_estimate *cost);
915 ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
916 uint *bufsz, uint *flags, Cost_estimate *cost);
917#endif
918
919#endif
920
921 Item* idx_cond_push(uint keyno, class Item* idx_cond);
922 void cancel_pushed_idx_cond();
923
#if TOKU_INCLUDE_ALTER_56
public:
    // In-place ALTER TABLE interface; compiled only when
    // TOKU_INCLUDE_ALTER_56 is non-zero. Every entry point receives the
    // altered table definition and the Alter_inplace_info for the statement.
    enum_alter_inplace_result check_if_supported_inplace_alter(
        TABLE* altered_table,
        Alter_inplace_info* ha_alter_info);
    bool prepare_inplace_alter_table(
        TABLE* altered_table,
        Alter_inplace_info* ha_alter_info);
    bool inplace_alter_table(
        TABLE* altered_table,
        Alter_inplace_info* ha_alter_info);
    bool commit_inplace_alter_table(
        TABLE* altered_table,
        Alter_inplace_info* ha_alter_info,
        bool commit);

private:
    // Private alter helpers, one per kind of change.
    // NOTE(review): presumably dispatched from the public entry points above;
    // the int results are presumably error codes -- confirm in the .cc file.
    int alter_table_add_index(
        TABLE* altered_table,
        Alter_inplace_info* ha_alter_info);
    int alter_table_drop_index(
        TABLE* altered_table,
        Alter_inplace_info* ha_alter_info);
    int alter_table_add_or_drop_column(
        TABLE* altered_table,
        Alter_inplace_info* ha_alter_info);
    int alter_table_expand_varchar_offsets(
        TABLE* altered_table,
        Alter_inplace_info* ha_alter_info);
    int alter_table_expand_columns(
        TABLE* altered_table,
        Alter_inplace_info* ha_alter_info);
    int alter_table_expand_one_column(
        TABLE* altered_table,
        Alter_inplace_info* ha_alter_info,
        int expand_field_num);
    int alter_table_expand_blobs(
        TABLE* altered_table,
        Alter_inplace_info* ha_alter_info);
    void print_alter_info(
        TABLE* altered_table,
        Alter_inplace_info* ha_alter_info);
    int setup_kc_info(TABLE* altered_table, KEY_AND_COL_INFO* kc_info);
    int new_row_descriptor(
        TABLE* table,
        TABLE* altered_table,
        Alter_inplace_info* ha_alter_info,
        uint32_t idx,
        DBT* row_descriptor);

    // Restore public access for the declarations that follow this section.
public:
#endif
// The guard is tested by value, like the other TOKU_INCLUDE_* guards in this
// class. A bare defined() check would also compile this section when the
// macro is defined as 0.
#if TOKU_INCLUDE_ALTER_55
public:
    // Returns true if the 5.6 inplace alter table interface is used.
    bool try_hot_alter_table();

    // Used by the partition storage engine to provide new frm data for the
    // table: frm_data points at the data and frm_len is its length.
    int new_alter_table_frm_data(const uchar* frm_data, size_t frm_len);
#endif
952
953 private:
954 int tokudb_add_index(TABLE* table_arg,
955 KEY* key_info,
956 uint num_of_keys,
957 DB_TXN* txn,
958 bool* inc_num_DBs,
959 bool* modified_DB);
960 static int tokudb_add_index_poll(void *extra, float progress);
961 void restore_add_index(TABLE* table_arg,
962 uint num_of_keys,
963 bool incremented_numDBs,
964 bool modified_DBs);
965 int drop_indexes(TABLE* table_arg,
966 uint* key_num,
967 uint num_of_keys,
968 KEY* key_info,
969 DB_TXN* txn);
970 void restore_drop_indexes(TABLE* table_arg, uint* key_num, uint num_of_keys);
971
972 public:
973 // delete all rows from the table
974 // effect: all dictionaries, including the main and indexes, should be empty
975 int discard_or_import_tablespace(my_bool discard);
976 int truncate();
977 int delete_all_rows();
978 void extract_hidden_primary_key(uint keynr, DBT const *found_key);
979 void read_key_only(uchar * buf, uint keynr, DBT const *found_key);
980 int read_row_callback (uchar * buf, uint keynr, DBT const *row, DBT const *found_key);
981 int read_primary_key(uchar * buf, uint keynr, DBT const *row, DBT const *found_key);
982 int unpack_blobs(
983 uchar* record,
984 const uchar* from_tokudb_blob,
985 uint32_t num_blob_bytes,
986 bool check_bitmap
987 );
988 int unpack_row(
989 uchar* record,
990 DBT const *row,
991 DBT const *key,
992 uint index
993 );
994
995 int prefix_cmp_dbts( uint keynr, const DBT* first_key, const DBT* second_key) {
996 return tokudb_prefix_cmp_dbt_key(share->key_file[keynr], first_key, second_key);
997 }
998
999 void track_progress(THD* thd);
1000 void set_loader_error(int err);
1001 void set_dup_value_for_pk(DBT* key);
1002
1003
1004 //
1005 // index into key_file that holds DB* that is indexed on
1006 // the primary_key. this->key_file[primary_index] == this->file
1007 //
1008 uint primary_key;
1009
1010 int check(THD *thd, HA_CHECK_OPT *check_opt);
1011
1012 int fill_range_query_buf(
1013 bool need_val,
1014 DBT const* key,
1015 DBT const* row,
1016 int direction,
1017 THD* thd,
1018 uchar* buf,
1019 DBT* key_to_compare);
1020
#if TOKU_INCLUDE_ROW_TYPE_COMPRESSION
    // Const accessor returning a row_type value; declared only when
    // TOKU_INCLUDE_ROW_TYPE_COMPRESSION is non-zero.
    enum row_type get_row_type() const;
#endif
1024private:
1025 int read_full_row(uchar * buf);
1026 int __close();
1027 int get_next(uchar* buf, int direction, DBT* key_to_compare, bool do_key_read);
1028 int read_data_from_range_query_buff(uchar* buf, bool need_val, bool do_key_read);
1029 // for ICP, only in MariaDB and MySQL 5.6
1030 enum icp_result toku_handler_index_cond_check(Item* pushed_idx_cond);
1031 void invalidate_bulk_fetch();
1032 void invalidate_icp();
1033 int delete_all_rows_internal();
1034 void close_dsmrr();
1035 void reset_dsmrr();
1036
#if TOKU_INCLUDE_WRITE_FRM_DATA
    // Takes a pointer to frm data and its length in bytes; declared only
    // when TOKU_INCLUDE_WRITE_FRM_DATA is non-zero.
    int write_frm_data(const uchar* frm_data, size_t frm_len);
#endif
// The guard is tested by value, like the other TOKU_INCLUDE_* guards in this
// class. A bare defined() check would also compile this section when the
// macro is defined as 0.
#if TOKU_INCLUDE_UPSERT
private:
    // Fast update path. Each function receives the update field and value
    // lists; the update variants also take the statement condition (conds).
    // NOTE(review): check_fast_update presumably decides whether fast_update
    // may be used, and send_update_message presumably issues the update under
    // txn -- confirm against the implementation.
    int fast_update(
        THD* thd,
        List<Item>& update_fields,
        List<Item>& update_values,
        Item* conds);
    bool check_fast_update(
        THD* thd,
        List<Item>& update_fields,
        List<Item>& update_values,
        Item* conds);
    int send_update_message(
        List<Item>& update_fields,
        List<Item>& update_values,
        Item* conds,
        DB_TXN* txn);

    // Upsert path; same shape as the update path but without a condition.
    int upsert(
        THD* thd,
        List<Item>& update_fields,
        List<Item>& update_values);
    bool check_upsert(
        THD* thd,
        List<Item>& update_fields,
        List<Item>& update_values);
    int send_upsert_message(
        THD* thd,
        List<Item>& update_fields,
        List<Item>& update_values,
        DB_TXN* txn);
#endif
1049public:
1050 // mysql sometimes retires a txn before a cursor that references the txn is closed.
1051 // for example, commit is sometimes called before index_end. the following methods
1052 // put the handler on a list of handlers that get cleaned up when the txn is retired.
1053 void cleanup_txn(DB_TXN *txn);
1054private:
1055 LIST trx_handler_list;
1056 void add_to_trx_handler_list();
1057 void remove_from_trx_handler_list();
1058
1059private:
1060 int do_optimize(THD *thd);
1061 int map_to_handler_error(int error);
1062
1063public:
1064 void rpl_before_write_rows();
1065 void rpl_after_write_rows();
1066 void rpl_before_delete_rows();
1067 void rpl_after_delete_rows();
1068 void rpl_before_update_rows();
1069 void rpl_after_update_rows();
1070 bool rpl_lookup_rows();
1071private:
1072 bool in_rpl_write_rows;
1073 bool in_rpl_delete_rows;
1074 bool in_rpl_update_rows;
1075};
1076
1077#endif // _HA_TOKUDB_H
1078
1079