/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of TokuDB


Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.

    TokuDB is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License, version 2,
    as published by the Free Software Foundation.

    TokuDB is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with TokuDB. If not, see <http://www.gnu.org/licenses/>.

======= */

#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."

#include "hatoku_hton.h"
#include "hatoku_cmp.h"
#include "tokudb_buffer.h"
#include "tokudb_status.h"
#include "tokudb_card.h"
#include "ha_tokudb.h"
#include "sql_db.h"

pfs_key_t ha_tokudb_mutex_key;
pfs_key_t num_DBs_lock_key;

#if TOKU_INCLUDE_EXTENDED_KEYS
static inline uint get_ext_key_parts(const KEY *key) {
#if (50609 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699) || \
    (50700 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50799)
    return key->actual_key_parts;
#elif defined(MARIADB_BASE_VERSION)
    return key->ext_key_parts;
#else
#error
#endif
}
#endif

HASH TOKUDB_SHARE::_open_tables;
tokudb::thread::mutex_t TOKUDB_SHARE::_open_tables_mutex;

static const char* ha_tokudb_exts[] = {
    ha_tokudb_ext,
    NullS
};

//
// This offset is calculated starting from AFTER the NULL bytes
//
static inline uint32_t get_fixed_field_size(
    KEY_AND_COL_INFO* kc_info,
    TABLE_SHARE* table_share,
    uint keynr) {

    uint offset = 0;
    for (uint i = 0; i < table_share->fields; i++) {
        if (is_fixed_field(kc_info, i) &&
            !bitmap_is_set(&kc_info->key_filters[keynr], i)) {
            offset += kc_info->field_lengths[i];
        }
    }
    return offset;
}


static inline uint32_t get_len_of_offsets(
    KEY_AND_COL_INFO* kc_info,
    TABLE_SHARE* table_share,
    uint keynr) {

    uint len = 0;
    for (uint i = 0; i < table_share->fields; i++) {
        if (is_variable_field(kc_info, i) &&
            !bitmap_is_set(&kc_info->key_filters[keynr], i)) {
            len += kc_info->num_offset_bytes;
        }
    }
    return len;
}
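
//
// Illustration (hypothetical schema, assuming a single-byte charset): for a
// table with an INT (4 bytes, fixed), a CHAR(10) (10 bytes, fixed) and one
// VARCHAR(100) (variable), none of them filtered out of key 0:
//   get_fixed_field_size(kc_info, table_share, 0) == 14
//   get_len_of_offsets(kc_info, table_share, 0)   == kc_info->num_offset_bytes
// i.e. fixed fields contribute their packed lengths, and each unfiltered
// variable field contributes one offset slot of num_offset_bytes bytes.
//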


static int allocate_key_and_col_info(
    TABLE_SHARE* table_share,
    KEY_AND_COL_INFO* kc_info) {

    int error;
    //
    // initialize all of the bitmaps
    //
    for (uint i = 0; i < MAX_KEY + 1; i++) {
        error =
            bitmap_init(
                &kc_info->key_filters[i],
                NULL,
                table_share->fields,
                false);
        if (error) {
            goto exit;
        }
    }

    //
    // create the field lengths
    //
    kc_info->multi_ptr = tokudb::memory::multi_malloc(
        MYF(MY_WME+MY_ZEROFILL),
        &kc_info->field_types, (uint)(table_share->fields * sizeof (uint8_t)),
        &kc_info->field_lengths, (uint)(table_share->fields * sizeof (uint16_t)),
        &kc_info->length_bytes, (uint)(table_share->fields * sizeof (uint8_t)),
        &kc_info->blob_fields, (uint)(table_share->fields * sizeof (uint32_t)),
        NullS);
    if (kc_info->multi_ptr == NULL) {
        error = ENOMEM;
        goto exit;
    }
exit:
    if (error) {
        for (uint i = 0; i < MAX_KEY + 1; i++) {
            bitmap_free(&kc_info->key_filters[i]);
        }
        tokudb::memory::free(kc_info->multi_ptr);
    }
    return error;
}

static void free_key_and_col_info (KEY_AND_COL_INFO* kc_info) {
    for (uint i = 0; i < MAX_KEY+1; i++) {
        bitmap_free(&kc_info->key_filters[i]);
    }

    for (uint i = 0; i < MAX_KEY+1; i++) {
        tokudb::memory::free(kc_info->cp_info[i]);
        kc_info->cp_info[i] = NULL; // 3144
    }

    tokudb::memory::free(kc_info->multi_ptr);
    kc_info->field_types = NULL;
    kc_info->field_lengths = NULL;
    kc_info->length_bytes = NULL;
    kc_info->blob_fields = NULL;
}


uchar* TOKUDB_SHARE::hash_get_key(
    TOKUDB_SHARE* share,
    size_t* length,
    TOKUDB_UNUSED(my_bool not_used)) {

    *length = share->_full_table_name.length();
    return (uchar *) share->_full_table_name.c_ptr();
}
void TOKUDB_SHARE::hash_free_element(TOKUDB_SHARE* share) {
    share->destroy();
    delete share;
}
void TOKUDB_SHARE::static_init() {
    my_hash_init(
        &_open_tables,
        table_alias_charset,
        32,
        0,
        0,
        (my_hash_get_key)hash_get_key,
        (my_hash_free_key)hash_free_element, 0);
}
void TOKUDB_SHARE::static_destroy() {
    my_hash_free(&_open_tables);
}
const char* TOKUDB_SHARE::get_state_string(share_state_t state) {
    static const char* state_string[] = {
        "CLOSED",
        "OPENED",
        "ERROR"
    };
    assert_always(state == CLOSED || state == OPENED || state == ERROR);
    return state_string[state];
}
void* TOKUDB_SHARE::operator new(size_t sz) {
    return tokudb::memory::malloc(sz, MYF(MY_WME|MY_ZEROFILL|MY_FAE));
}
void TOKUDB_SHARE::operator delete(void* p) { tokudb::memory::free(p); }
TOKUDB_SHARE::TOKUDB_SHARE()
    : _num_DBs_lock(num_DBs_lock_key), _mutex(ha_tokudb_mutex_key) {}
void TOKUDB_SHARE::init(const char* table_name) {
    _use_count = 0;
    thr_lock_init(&_thr_lock);
    _state = CLOSED;
    _row_delta_activity = 0;
    _allow_auto_analysis = true;

    _full_table_name.append(table_name);

    String tmp_dictionary_name;
    tokudb_split_dname(
        table_name,
        _database_name,
        _table_name,
        tmp_dictionary_name);

    TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]",
                            _full_table_name.ptr(),
                            get_state_string(_state),
                            _use_count);
    TOKUDB_SHARE_DBUG_VOID_RETURN();
}
void TOKUDB_SHARE::destroy() {
    TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]",
                            _full_table_name.ptr(),
                            get_state_string(_state),
                            _use_count);

    assert_always(_use_count == 0);
    assert_always(
        _state == TOKUDB_SHARE::CLOSED || _state == TOKUDB_SHARE::ERROR);
    thr_lock_delete(&_thr_lock);
    TOKUDB_SHARE_DBUG_VOID_RETURN();
}
TOKUDB_SHARE* TOKUDB_SHARE::get_share(const char* table_name,
                                      TABLE_SHARE* table_share,
                                      THR_LOCK_DATA* data,
                                      bool create_new) {
    mutex_t_lock(_open_tables_mutex);
    int error = 0;
    uint length = (uint)strlen(table_name);
    TOKUDB_SHARE* share = (TOKUDB_SHARE*)my_hash_search(
        &_open_tables, (uchar*)table_name, length);

    TOKUDB_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_SHARE,
        "existing share[%s] %s:share[%p]",
        table_name,
        share == NULL ? "not found" : "found",
        share);

    if (!share) {
        if (create_new == false)
            goto exit;
        // create share and fill it with all zeroes
        // hence, all pointers are initialized to NULL
        share = new TOKUDB_SHARE;
        assert_always(share);

        share->init(table_name);

        error = my_hash_insert(&_open_tables, (uchar*)share);
        if (error) {
            free_key_and_col_info(&share->kc_info);
            share->destroy();
            tokudb::memory::free((uchar*)share);
            share = NULL;
            goto exit;
        }
    }

    share->addref();

    if (data)
        thr_lock_data_init(&(share->_thr_lock), data, NULL);

exit:
    mutex_t_unlock(_open_tables_mutex);
    return share;
}
void TOKUDB_SHARE::drop_share(TOKUDB_SHARE* share) {
    TOKUDB_TRACE_FOR_FLAGS(TOKUDB_DEBUG_SHARE,
                           "share[%p]:file[%s]:state[%s]:use_count[%d]",
                           share,
                           share->_full_table_name.ptr(),
                           get_state_string(share->_state),
                           share->_use_count);

    mutex_t_lock(_open_tables_mutex);
    my_hash_delete(&_open_tables, (uchar*)share);
    mutex_t_unlock(_open_tables_mutex);
}
TOKUDB_SHARE::share_state_t TOKUDB_SHARE::addref() {
    TOKUDB_SHARE_TRACE_FOR_FLAGS((TOKUDB_DEBUG_ENTER & TOKUDB_DEBUG_SHARE),
                                 "file[%s]:state[%s]:use_count[%d]",
                                 _full_table_name.ptr(),
                                 get_state_string(_state),
                                 _use_count);

    lock();
    _use_count++;

    return _state;
}
int TOKUDB_SHARE::release() {
    TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]",
                            _full_table_name.ptr(),
                            get_state_string(_state),
                            _use_count);

    int error, result = 0;

    mutex_t_lock(_mutex);
    assert_always(_use_count != 0);
    _use_count--;
    if (_use_count == 0 && _state == TOKUDB_SHARE::OPENED) {
        // number of open DB's may not be equal to number of keys we have
        // because add_index may have added some. So, we loop through entire
        // array and close any non-NULL value. It is imperative that we reset
        // a DB to NULL once we are done with it.
        for (uint i = 0; i < sizeof(key_file)/sizeof(key_file[0]); i++) {
            if (key_file[i]) {
                TOKUDB_TRACE_FOR_FLAGS(
                    TOKUDB_DEBUG_OPEN,
                    "dbclose:%p",
                    key_file[i]);
                error = key_file[i]->close(key_file[i], 0);
                assert_always(error == 0);
                if (error) {
                    result = error;
                }
                if (key_file[i] == file)
                    file = NULL;
                key_file[i] = NULL;
            }
        }

        error = tokudb::metadata::close(&status_block);
        assert_always(error == 0);

        free_key_and_col_info(&kc_info);

        if (_rec_per_key) {
            tokudb::memory::free(_rec_per_key);
            _rec_per_key = NULL;
            _rec_per_keys = 0;
        }

        for (uint i = 0; i < _keys; i++) {
            tokudb::memory::free(_key_descriptors[i]._name);
        }
        tokudb::memory::free(_key_descriptors);
        _keys = _max_key_parts = 0; _key_descriptors = NULL;

        _state = TOKUDB_SHARE::CLOSED;
    }
    mutex_t_unlock(_mutex);

    TOKUDB_SHARE_DBUG_RETURN(result);
}
void TOKUDB_SHARE::update_row_count(
    THD* thd,
    uint64_t added,
    uint64_t deleted,
    uint64_t updated) {

    uint64_t delta = added + deleted + updated;
    lock();
    if (deleted > added && _rows < (deleted - added)) {
        _rows = 0;
    } else {
        _rows += added - deleted;
    }
    _row_delta_activity += delta;
    if (_row_delta_activity == (uint64_t)~0)
        _row_delta_activity = 1;

    ulonglong auto_threshold = tokudb::sysvars::auto_analyze(thd);
    if (delta && auto_threshold > 0 && _allow_auto_analysis) {
        ulonglong pct_of_rows_changed_to_trigger;
        pct_of_rows_changed_to_trigger = ((_rows * auto_threshold) / 100);
        if (_row_delta_activity >= pct_of_rows_changed_to_trigger) {
            char msg[200];
            snprintf(msg,
                     sizeof(msg),
                     "TokuDB: Auto %s analysis for %s, delta_activity %llu is "
                     "greater than %llu percent of %llu rows.",
                     tokudb::sysvars::analyze_in_background(thd) > 0
                         ? "scheduling background"
                         : "running foreground",
                     full_table_name(),
                     _row_delta_activity,
                     auto_threshold,
                     (ulonglong)(_rows));

            // analyze_standard will unlock _mutex regardless of success/failure
            int ret = analyze_standard(thd, NULL);
            if (ret == 0) {
                sql_print_information("%s - succeeded.", msg);
            } else {
                sql_print_information(
                    "%s - failed, likely a job already running.",
                    msg);
            }
        }
    }
    unlock();
}
void TOKUDB_SHARE::set_cardinality_counts_in_table(TABLE* table) {
    lock();
    uint32_t next_key_part = 0;
    for (uint32_t i = 0; i < table->s->keys; i++) {
        KEY* key = &table->key_info[i];
        bool is_unique_key =
            (i == table->s->primary_key) || (key->flags & HA_NOSAME);

        for (uint32_t j = 0; j < get_ext_key_parts(key); j++) {
            if (j >= key->user_defined_key_parts) {
                // MySQL 'hidden' keys, really needs deeper investigation
                // into MySQL hidden keys vs TokuDB hidden keys
                key->rec_per_key[j] = 1;
                continue;
            }

            assert_always(next_key_part < _rec_per_keys);
            ulong val = _rec_per_key[next_key_part++];
            val = (val * tokudb::sysvars::cardinality_scale_percent) / 100;
            if (val == 0 || _rows == 0 ||
                (is_unique_key && j == get_ext_key_parts(key) - 1)) {
                val = 1;
            }
            key->rec_per_key[j] = val;
        }
    }
    unlock();
}

#define HANDLE_INVALID_CURSOR() \
    if (cursor == NULL) { \
        error = last_cursor_error; \
        goto cleanup; \
    }
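
//
// Usage sketch (illustrative; mirrors real call sites in this handler): any
// cursor-based member function declares `int error` and a `cleanup:` label,
// then guards itself before touching the cursor:
//
//     int ha_tokudb::index_next(uchar* buf) {   // hypothetical call site
//         int error = 0;
//         HANDLE_INVALID_CURSOR();              // bail out if cursor == NULL
//         // ... use cursor ...
//     cleanup:
//         return error;
//     }
//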

const char *ha_tokudb::table_type() const {
    return tokudb_hton_name;
}

const char *ha_tokudb::index_type(uint inx) {
    return "BTREE";
}

/*
 * returns NULL terminated file extension string
 */
const char **ha_tokudb::bas_ext() const {
    TOKUDB_HANDLER_DBUG_ENTER("");
    DBUG_RETURN(ha_tokudb_exts);
}

static inline bool is_insert_ignore (THD* thd) {
    //
    // from http://lists.mysql.com/internals/37735
    //
    return thd->lex->ignore && thd->lex->duplicates == DUP_ERROR;
}

static inline bool is_replace_into(THD* thd) {
    return thd->lex->duplicates == DUP_REPLACE;
}

static inline bool do_ignore_flag_optimization(
    THD* thd,
    TABLE* table,
    bool opt_eligible) {

    bool do_opt = false;
    if (opt_eligible &&
        (is_replace_into(thd) || is_insert_ignore(thd)) &&
        tokudb::sysvars::pk_insert_mode(thd) == 1 &&
        !table->triggers &&
        !(mysql_bin_log.is_open() &&
          thd->variables.binlog_format != BINLOG_FORMAT_STMT)) {
        do_opt = true;
    }
    return do_opt;
}
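
//
// Summary of the predicate above (no new behavior): the ignore-flag fast
// path applies only to REPLACE INTO or INSERT IGNORE statements when
// tokudb_pk_insert_mode == 1, the table has no triggers, and the binlog is
// either closed or in statement format; any other case must read the old
// row first so triggers and row-based replication can observe it.
//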

ulonglong ha_tokudb::table_flags() const {
    return int_table_flags | HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE;
}

//
// Returns a bit mask of capabilities of the key or its part specified by
// the arguments. The capabilities are defined in sql/handler.h.
//
ulong ha_tokudb::index_flags(uint idx, uint part, bool all_parts) const {
    TOKUDB_HANDLER_DBUG_ENTER("");
    assert_always(table_share);
    ulong flags = (HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER |
                   HA_KEYREAD_ONLY | HA_READ_RANGE | HA_DO_INDEX_COND_PUSHDOWN);
    if (key_is_clustering(&table_share->key_info[idx])) {
        flags |= HA_CLUSTERED_INDEX;
    }
    DBUG_RETURN(flags);
}


//
// struct that will be used as a context for smart DBT callbacks
// contains parameters needed to complete the smart DBT cursor call
//
typedef struct smart_dbt_info {
    ha_tokudb* ha; // instance to ha_tokudb needed for reading the row
    uchar* buf;    // output buffer where row will be written
    uint keynr;    // index into share->key_file that represents DB we are currently operating on
} *SMART_DBT_INFO;

typedef struct smart_dbt_bf_info {
    ha_tokudb* ha;
    bool need_val;
    int direction;
    THD* thd;
    uchar* buf;
    DBT* key_to_compare;
} *SMART_DBT_BF_INFO;

typedef struct index_read_info {
    struct smart_dbt_info smart_dbt_info;
    int cmp;
    DBT* orig_key;
} *INDEX_READ_INFO;

//
// smart DBT callback function for optimize
// in optimize, we want to flatten DB by doing
// a full table scan. Therefore, we don't
// want to actually do anything with the data, hence
// callback does nothing
//
static int smart_dbt_do_nothing (DBT const *key, DBT const *row, void *context) {
    return 0;
}

static int
smart_dbt_callback_rowread_ptquery (DBT const *key, DBT const *row, void *context) {
    SMART_DBT_INFO info = (SMART_DBT_INFO)context;
    info->ha->extract_hidden_primary_key(info->keynr, key);
    return info->ha->read_row_callback(info->buf, info->keynr, row, key);
}

//
// Smart DBT callback function in case where we have a covering index
//
static int
smart_dbt_callback_keyread(DBT const *key, DBT const *row, void *context) {
    SMART_DBT_INFO info = (SMART_DBT_INFO)context;
    info->ha->extract_hidden_primary_key(info->keynr, key);
    info->ha->read_key_only(info->buf, info->keynr, key);
    return 0;
}

//
// Smart DBT callback function in case where we do NOT have a covering index
//
static int
smart_dbt_callback_rowread(DBT const *key, DBT const *row, void *context) {
    int error = 0;
    SMART_DBT_INFO info = (SMART_DBT_INFO)context;
    info->ha->extract_hidden_primary_key(info->keynr, key);
    error = info->ha->read_primary_key(info->buf, info->keynr, row, key);
    return error;
}

//
// Smart DBT callback function in case where we have a covering index
//
static int
smart_dbt_callback_ir_keyread(DBT const *key, DBT const *row, void *context) {
    INDEX_READ_INFO ir_info = (INDEX_READ_INFO)context;
    ir_info->cmp = ir_info->smart_dbt_info.ha->prefix_cmp_dbts(ir_info->smart_dbt_info.keynr, ir_info->orig_key, key);
    if (ir_info->cmp) {
        return 0;
    }
    return smart_dbt_callback_keyread(key, row, &ir_info->smart_dbt_info);
}

static int
smart_dbt_callback_lookup(DBT const *key, DBT const *row, void *context) {
    INDEX_READ_INFO ir_info = (INDEX_READ_INFO)context;
    ir_info->cmp = ir_info->smart_dbt_info.ha->prefix_cmp_dbts(ir_info->smart_dbt_info.keynr, ir_info->orig_key, key);
    return 0;
}


//
// Smart DBT callback function in case where we do NOT have a covering index
//
static int
smart_dbt_callback_ir_rowread(DBT const *key, DBT const *row, void *context) {
    INDEX_READ_INFO ir_info = (INDEX_READ_INFO)context;
    ir_info->cmp = ir_info->smart_dbt_info.ha->prefix_cmp_dbts(ir_info->smart_dbt_info.keynr, ir_info->orig_key, key);
    if (ir_info->cmp) {
        return 0;
    }
    return smart_dbt_callback_rowread(key, row, &ir_info->smart_dbt_info);
}

//
// macro for Smart DBT callback function,
// so we do not need to put this long line of code in multiple places
//
#define SMART_DBT_CALLBACK(do_key_read) ((do_key_read) ? smart_dbt_callback_keyread : smart_dbt_callback_rowread )
#define SMART_DBT_IR_CALLBACK(do_key_read) ((do_key_read) ? smart_dbt_callback_ir_keyread : smart_dbt_callback_ir_rowread )

//
// macro that modifies read flag for cursor operations depending on whether
// we have preacquired lock or not
//
#define SET_PRELOCK_FLAG(flg) ((flg) | (range_lock_grabbed ? (use_write_locks ? DB_PRELOCKED_WRITE : DB_PRELOCKED) : 0))
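
//
// For example (direct macro expansion): with a range lock pre-acquired,
// SET_PRELOCK_FLAG(DB_NEXT) evaluates to (DB_NEXT | DB_PRELOCKED), letting
// the cursor skip per-row lock acquisition; with use_write_locks set it
// evaluates to (DB_NEXT | DB_PRELOCKED_WRITE); with no range lock grabbed
// the flag passes through unchanged.
//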

//
// This method retrieves the value of the auto increment column of a record in MySQL format
// This was basically taken from MyISAM
// Parameters:
//      type - the type of the auto increment column (e.g. int, float, double...)
//      offset - offset into the record where the auto increment column is stored
//      [in] record - MySQL row whose auto increment value we want to extract
// Returns:
//      The value of the auto increment column in record
//
static ulonglong retrieve_auto_increment(uint16 type, uint32 offset, const uchar *record)
{
    const uchar *key;                /* Key */
    ulonglong unsigned_autoinc = 0;  /* Unsigned auto-increment */
    longlong signed_autoinc = 0;     /* Signed auto-increment */
    enum { unsigned_type, signed_type } autoinc_type;
    float float_tmp;                 /* Temporary variable */
    double double_tmp;               /* Temporary variable */

    key = ((uchar *) record) + offset;

    /* Set default autoincrement type */
    autoinc_type = unsigned_type;

    switch (type) {
    case HA_KEYTYPE_INT8:
        signed_autoinc = (longlong) *(char*)key;
        autoinc_type = signed_type;
        break;

    case HA_KEYTYPE_BINARY:
        unsigned_autoinc = (ulonglong) *(uchar*) key;
        break;

    case HA_KEYTYPE_SHORT_INT:
        signed_autoinc = (longlong) sint2korr(key);
        autoinc_type = signed_type;
        break;

    case HA_KEYTYPE_USHORT_INT:
        unsigned_autoinc = (ulonglong) uint2korr(key);
        break;

    case HA_KEYTYPE_LONG_INT:
        signed_autoinc = (longlong) sint4korr(key);
        autoinc_type = signed_type;
        break;

    case HA_KEYTYPE_ULONG_INT:
        unsigned_autoinc = (ulonglong) uint4korr(key);
        break;

    case HA_KEYTYPE_INT24:
        signed_autoinc = (longlong) sint3korr(key);
        autoinc_type = signed_type;
        break;

    case HA_KEYTYPE_UINT24:
        unsigned_autoinc = (ulonglong) tokudb_uint3korr(key);
        break;

    case HA_KEYTYPE_LONGLONG:
        signed_autoinc = sint8korr(key);
        autoinc_type = signed_type;
        break;

    case HA_KEYTYPE_ULONGLONG:
        unsigned_autoinc = uint8korr(key);
        break;

    /* The remaining two cases should not be used but are included for
       compatibility */
    case HA_KEYTYPE_FLOAT:
        float4get(float_tmp, key); /* Note: float4get is a macro */
        signed_autoinc = (longlong) float_tmp;
        autoinc_type = signed_type;
        break;

    case HA_KEYTYPE_DOUBLE:
        float8get(double_tmp, key); /* Note: float8get is a macro */
        signed_autoinc = (longlong) double_tmp;
        autoinc_type = signed_type;
        break;

    default:
        assert_unreachable();
    }

    if (signed_autoinc < 0) {
        signed_autoinc = 0;
    }

    return autoinc_type == unsigned_type ?
           unsigned_autoinc : (ulonglong) signed_autoinc;
}
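
//
// Example (hypothetical values): for a record whose auto-increment column
// is an unsigned INT stored at byte offset 7 and holding 42,
// retrieve_auto_increment(HA_KEYTYPE_ULONG_INT, 7, record) returns 42.
// Negative values in signed columns are clamped to 0 before conversion.
//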

static inline ulong field_offset(Field* field, TABLE* table) {
    return((ulong) (field->ptr - table->record[0]));
}

static inline HA_TOKU_ISO_LEVEL tx_to_toku_iso(ulong tx_isolation) {
    if (tx_isolation == ISO_READ_UNCOMMITTED) {
        return hatoku_iso_read_uncommitted;
    }
    else if (tx_isolation == ISO_READ_COMMITTED) {
        return hatoku_iso_read_committed;
    }
    else if (tx_isolation == ISO_REPEATABLE_READ) {
        return hatoku_iso_repeatable_read;
    }
    else {
        return hatoku_iso_serializable;
    }
}

static inline uint32_t toku_iso_to_txn_flag (HA_TOKU_ISO_LEVEL lvl) {
    if (lvl == hatoku_iso_read_uncommitted) {
        return DB_READ_UNCOMMITTED;
    }
    else if (lvl == hatoku_iso_read_committed) {
        return DB_READ_COMMITTED;
    }
    else if (lvl == hatoku_iso_repeatable_read) {
        return DB_TXN_SNAPSHOT;
    }
    else {
        return 0;
    }
}
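
//
// Net effect of the two mappings above (summary of the code, not new
// behavior):
//   READ UNCOMMITTED -> DB_READ_UNCOMMITTED
//   READ COMMITTED   -> DB_READ_COMMITTED
//   REPEATABLE READ  -> DB_TXN_SNAPSHOT (MVCC snapshot read)
//   SERIALIZABLE     -> 0 (no flag, plain pessimistic locking)
//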

static int filter_key_part_compare (const void* left, const void* right) {
    FILTER_KEY_PART_INFO* left_part = (FILTER_KEY_PART_INFO *)left;
    FILTER_KEY_PART_INFO* right_part = (FILTER_KEY_PART_INFO *)right;
    return left_part->offset - right_part->offset;
}

//
// Be very careful with parameters passed to this function. Who knows
// if key, table have proper info set. I had to verify by checking
// in the debugger.
//
void set_key_filter(
    MY_BITMAP* key_filter,
    KEY* key,
    TABLE* table,
    bool get_offset_from_keypart) {

    FILTER_KEY_PART_INFO parts[MAX_REF_PARTS];
    uint curr_skip_index = 0;

    for (uint i = 0; i < key->user_defined_key_parts; i++) {
        //
        // horrendous hack due to bugs in mysql, basically
        // we cannot always reliably get the offset from the same source
        //
        parts[i].offset =
            get_offset_from_keypart ?
                key->key_part[i].offset :
                field_offset(key->key_part[i].field, table);
        parts[i].part_index = i;
    }
    qsort(
        parts,                       // start of array
        key->user_defined_key_parts, // num elements
        sizeof(*parts),              // size of each element
        filter_key_part_compare);

    for (uint i = 0; i < table->s->fields; i++) {
        Field* field = table->field[i];
        uint curr_field_offset = field_offset(field, table);
        if (curr_skip_index < key->user_defined_key_parts) {
            uint curr_skip_offset = 0;
            curr_skip_offset = parts[curr_skip_index].offset;
            if (curr_skip_offset == curr_field_offset) {
                //
                // we have hit a field that is a portion of the primary key
                //
                uint curr_key_index = parts[curr_skip_index].part_index;
                curr_skip_index++;
                //
                // only choose to continue over the key if the key's length matches the field's length
                // otherwise, we may have a situation where the column is a varchar(10), the
                // key is only the first 3 characters, and we end up losing the last 7 bytes of the
                // column
                //
                TOKU_TYPE toku_type = mysql_to_toku_type(field);
                switch (toku_type) {
                case toku_type_blob:
                    break;
                case toku_type_varbinary:
                case toku_type_varstring:
                case toku_type_fixbinary:
                case toku_type_fixstring:
                    if (key->key_part[curr_key_index].length == field->field_length) {
                        bitmap_set_bit(key_filter, i);
                    }
                    break;
                default:
                    bitmap_set_bit(key_filter, i);
                    break;
                }
            }
        }
    }
}

static inline uchar* pack_fixed_field(
    uchar* to_tokudb,
    const uchar* from_mysql,
    uint32_t num_bytes
    )
{
    switch (num_bytes) {
    case (1):
        memcpy(to_tokudb, from_mysql, 1);
        break;
    case (2):
        memcpy(to_tokudb, from_mysql, 2);
        break;
    case (3):
        memcpy(to_tokudb, from_mysql, 3);
        break;
    case (4):
        memcpy(to_tokudb, from_mysql, 4);
        break;
    case (8):
        memcpy(to_tokudb, from_mysql, 8);
        break;
    default:
        memcpy(to_tokudb, from_mysql, num_bytes);
        break;
    }
    return to_tokudb+num_bytes;
}

static inline const uchar* unpack_fixed_field(
    uchar* to_mysql,
    const uchar* from_tokudb,
    uint32_t num_bytes
    )
{
    switch (num_bytes) {
    case (1):
        memcpy(to_mysql, from_tokudb, 1);
        break;
    case (2):
        memcpy(to_mysql, from_tokudb, 2);
        break;
    case (3):
        memcpy(to_mysql, from_tokudb, 3);
        break;
    case (4):
        memcpy(to_mysql, from_tokudb, 4);
        break;
    case (8):
        memcpy(to_mysql, from_tokudb, 8);
        break;
    default:
        memcpy(to_mysql, from_tokudb, num_bytes);
        break;
    }
    return from_tokudb+num_bytes;
}

static inline uchar* write_var_field(
    uchar* to_tokudb_offset_ptr,   // location where offset data is going to be written
    uchar* to_tokudb_data,         // location where data is going to be written
    uchar* to_tokudb_offset_start, // base address that packed offsets are measured from
    const uchar* data,             // the data to write
    uint32_t data_length,          // length of data to write
    uint32_t offset_bytes          // number of offset bytes
    )
{
    memcpy(to_tokudb_data, data, data_length);
    //
    // for offset, we pack the offset where the data ENDS!
    //
    uint32_t offset = to_tokudb_data + data_length - to_tokudb_offset_start;
    switch (offset_bytes) {
    case (1):
        to_tokudb_offset_ptr[0] = (uchar)offset;
        break;
    case (2):
        int2store(to_tokudb_offset_ptr, offset);
        break;
    default:
        assert_unreachable();
        break;
    }
    return to_tokudb_data + data_length;
}
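
//
// Worked example (hypothetical row, 1-byte offsets): packing "abc" (3
// bytes) and then "ef" (2 bytes) into an empty variable-data region stores
// end offsets 3 and 5. A reader recovers field i's length as
// offset[i] - offset[i-1] (with offset[-1] taken as 0), which is why the
// END of each field's data is packed rather than its start.
//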

static inline uint32_t get_var_data_length(
    const uchar* from_mysql,
    uint32_t mysql_length_bytes
    )
{
    uint32_t data_length;
    switch (mysql_length_bytes) {
    case (1):
        data_length = from_mysql[0];
        break;
    case (2):
        data_length = uint2korr(from_mysql);
        break;
    default:
        assert_unreachable();
    }
    return data_length;
}

static inline uchar* pack_var_field(
    uchar* to_tokudb_offset_ptr,   // location where offset data is going to be written
    uchar* to_tokudb_data,         // pointer to where tokudb data should be written
    uchar* to_tokudb_offset_start, // base address that packed offsets are measured from
    const uchar* from_mysql,       // mysql data
    uint32_t mysql_length_bytes,   // number of bytes used to store length in from_mysql
    uint32_t offset_bytes          // number of offset bytes used in tokudb row
    )
{
    uint data_length = get_var_data_length(from_mysql, mysql_length_bytes);
    return write_var_field(
        to_tokudb_offset_ptr,
        to_tokudb_data,
        to_tokudb_offset_start,
        from_mysql + mysql_length_bytes,
        data_length,
        offset_bytes
        );
}

static inline void unpack_var_field(
    uchar* to_mysql,
    const uchar* from_tokudb_data,
    uint32_t from_tokudb_data_len,
    uint32_t mysql_length_bytes
    )
{
    //
    // store the length
    //
    switch (mysql_length_bytes) {
    case (1):
        to_mysql[0] = (uchar)from_tokudb_data_len;
        break;
    case (2):
        int2store(to_mysql, from_tokudb_data_len);
        break;
    default:
        assert_unreachable();
    }
    //
    // store the data
    //
    memcpy(to_mysql+mysql_length_bytes, from_tokudb_data, from_tokudb_data_len);
}

static uchar* pack_toku_field_blob(
    uchar* to_tokudb,
    const uchar* from_mysql,
    Field* field
    )
{
    uint32_t len_bytes = field->row_pack_length();
    uint32_t length = 0;
    uchar* data_ptr = NULL;
    memcpy(to_tokudb, from_mysql, len_bytes);

    switch (len_bytes) {
    case (1):
        length = (uint32_t)(*from_mysql);
        break;
    case (2):
        length = uint2korr(from_mysql);
        break;
    case (3):
        length = tokudb_uint3korr(from_mysql);
        break;
    case (4):
        length = uint4korr(from_mysql);
        break;
    default:
        assert_unreachable();
    }

    if (length > 0) {
        memcpy((uchar *)(&data_ptr), from_mysql + len_bytes, sizeof(uchar*));
        memcpy(to_tokudb + len_bytes, data_ptr, length);
    }
    return (to_tokudb + len_bytes + length);
}
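
//
// Layout note (summary of the function above): in the MySQL record a blob
// is stored as [length bytes][pointer to heap data], while the packed
// TokuDB row inlines it as [length bytes][data]. The memcpy through
// data_ptr dereferences the stored pointer, so e.g. a TINYBLOB holding 5
// bytes occupies 1 + 5 bytes in the packed row.
//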

static int create_tokudb_trx_data_instance(tokudb_trx_data** out_trx) {
    int error;
    tokudb_trx_data* trx = (tokudb_trx_data *) tokudb::memory::malloc(
        sizeof(*trx),
        MYF(MY_ZEROFILL));
    if (!trx) {
        error = ENOMEM;
        goto cleanup;
    }

    *out_trx = trx;
    error = 0;
cleanup:
    return error;
}


static inline int tokudb_generate_row(
    DB *dest_db,
    DB *src_db,
    DBT *dest_key,
    DBT *dest_val,
    const DBT *src_key,
    const DBT *src_val
    )
{
    int error;

    DB* curr_db = dest_db;
    uchar* row_desc = NULL;
    uint32_t desc_size;
    uchar* buff = NULL;
    uint32_t max_key_len = 0;

    row_desc = (uchar *)curr_db->descriptor->dbt.data;
    row_desc += (*(uint32_t *)row_desc);
    desc_size = (*(uint32_t *)row_desc) - 4;
    row_desc += 4;

    if (is_key_pk(row_desc, desc_size)) {
        if (dest_key->flags == DB_DBT_REALLOC && dest_key->data != NULL) {
            free(dest_key->data);
        }
        if (dest_val != NULL) {
            if (dest_val->flags == DB_DBT_REALLOC && dest_val->data != NULL) {
                free(dest_val->data);
            }
        }
        dest_key->data = src_key->data;
        dest_key->size = src_key->size;
        dest_key->flags = 0;
        if (dest_val != NULL) {
            dest_val->data = src_val->data;
            dest_val->size = src_val->size;
            dest_val->flags = 0;
        }
        error = 0;
        goto cleanup;
    }
    // at this point, we need to create the key/val and set it
    // in the DBTs
    if (dest_key->flags == 0) {
        dest_key->ulen = 0;
        dest_key->size = 0;
        dest_key->data = NULL;
        dest_key->flags = DB_DBT_REALLOC;
    }
    if (dest_key->flags == DB_DBT_REALLOC) {
        max_key_len = max_key_size_from_desc(row_desc, desc_size);
        max_key_len += src_key->size;

        if (max_key_len > dest_key->ulen) {
            void* old_ptr = dest_key->data;
            void* new_ptr = NULL;
            new_ptr = realloc(old_ptr, max_key_len);
            assert_always(new_ptr);
            dest_key->data = new_ptr;
            dest_key->ulen = max_key_len;
        }

        buff = (uchar *)dest_key->data;
        assert_always(buff != NULL && max_key_len > 0);
    } else {
        assert_unreachable();
    }

    dest_key->size = pack_key_from_desc(buff, row_desc, desc_size, src_key,
                                        src_val);
    assert_always(dest_key->ulen >= dest_key->size);
    if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_CHECK_KEY)) &&
        !max_key_len) {
        max_key_len = max_key_size_from_desc(row_desc, desc_size);
        max_key_len += src_key->size;
    }
    if (max_key_len) {
        assert_always(max_key_len >= dest_key->size);
    }

    row_desc += desc_size;
    desc_size = (*(uint32_t *)row_desc) - 4;
    row_desc += 4;
    if (dest_val != NULL) {
        if (!is_key_clustering(row_desc, desc_size) || src_val->size == 0) {
            dest_val->size = 0;
        } else {
            uchar* buff = NULL;
            if (dest_val->flags == 0) {
                dest_val->ulen = 0;
                dest_val->size = 0;
                dest_val->data = NULL;
                dest_val->flags = DB_DBT_REALLOC;
            }
            if (dest_val->flags == DB_DBT_REALLOC) {
                if (dest_val->ulen < src_val->size) {
                    void* old_ptr = dest_val->data;
                    void* new_ptr = NULL;
                    new_ptr = realloc(old_ptr, src_val->size);
                    assert_always(new_ptr);
                    dest_val->data = new_ptr;
                    dest_val->ulen = src_val->size;
                }
                buff = (uchar *)dest_val->data;
                assert_always(buff != NULL);
            } else {
                assert_unreachable();
            }
            dest_val->size = pack_clustering_val_from_desc(
                buff,
                row_desc,
                desc_size,
                src_val);
            assert_always(dest_val->ulen >= dest_val->size);
        }
    }
    error = 0;
cleanup:
    return error;
}
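
//
// Descriptor walk used above (as this function reads it, not a normative
// spec): dest_db->descriptor->dbt.data starts with a uint32 byte offset to
// the key description; the key description itself starts with a uint32
// total size that includes its own 4 bytes (hence the "- 4"); the
// clustering-value description follows immediately after the key
// description with the same [size][payload] shape.
//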

static int generate_row_for_del(
    DB *dest_db,
    DB *src_db,
    DBT_ARRAY *dest_key_arrays,
    const DBT *src_key,
    const DBT *src_val
    )
{
    DBT* dest_key = &dest_key_arrays->dbts[0];
    return tokudb_generate_row(
        dest_db,
        src_db,
        dest_key,
        NULL,
        src_key,
        src_val
        );
}


static int generate_row_for_put(
    DB *dest_db,
    DB *src_db,
    DBT_ARRAY *dest_key_arrays,
    DBT_ARRAY *dest_val_arrays,
    const DBT *src_key,
    const DBT *src_val
    )
{
    DBT* dest_key = &dest_key_arrays->dbts[0];
    DBT *dest_val = (dest_val_arrays == NULL) ? NULL : &dest_val_arrays->dbts[0];
    return tokudb_generate_row(
        dest_db,
        src_db,
        dest_key,
        dest_val,
        src_key,
        src_val
        );
}

ha_tokudb::ha_tokudb(handlerton * hton, TABLE_SHARE * table_arg):handler(hton, table_arg) {
    TOKUDB_HANDLER_DBUG_ENTER("");
    share = NULL;
    int_table_flags = HA_REC_NOT_IN_SEQ | HA_NULL_IN_KEY | HA_CAN_INDEX_BLOBS
        | HA_PRIMARY_KEY_IN_READ_INDEX | HA_PRIMARY_KEY_REQUIRED_FOR_POSITION
        | HA_FILE_BASED | HA_AUTO_PART_KEY | HA_TABLE_SCAN_ON_INDEX
        | HA_CAN_WRITE_DURING_OPTIMIZE | HA_ONLINE_ANALYZE;
    alloc_ptr = NULL;
    rec_buff = NULL;
    rec_update_buff = NULL;
    transaction = NULL;
    cursor = NULL;
    fixed_cols_for_query = NULL;
    var_cols_for_query = NULL;
    num_fixed_cols_for_query = 0;
    num_var_cols_for_query = 0;
    unpack_entire_row = true;
    read_blobs = false;
    read_key = false;
    added_rows = 0;
    deleted_rows = 0;
    updated_rows = 0;
    last_dup_key = UINT_MAX;
    using_ignore = false;
    using_ignore_no_key = false;
    last_cursor_error = 0;
    range_lock_grabbed = false;
    blob_buff = NULL;
    num_blob_bytes = 0;
    delay_updating_ai_metadata = false;
    ai_metadata_update_required = false;
    memset(mult_key_dbt_array, 0, sizeof(mult_key_dbt_array));
    memset(mult_rec_dbt_array, 0, sizeof(mult_rec_dbt_array));
    for (uint32_t i = 0; i < sizeof(mult_key_dbt_array)/sizeof(mult_key_dbt_array[0]); i++) {
        toku_dbt_array_init(&mult_key_dbt_array[i], 1);
    }
    for (uint32_t i = 0; i < sizeof(mult_rec_dbt_array)/sizeof(mult_rec_dbt_array[0]); i++) {
        toku_dbt_array_init(&mult_rec_dbt_array[i], 1);
    }
    loader = NULL;
    abort_loader = false;
    memset(&lc, 0, sizeof(lc));
    lock.type = TL_IGNORE;
    for (uint32_t i = 0; i < MAX_KEY+1; i++) {
        mult_put_flags[i] = 0;
        mult_del_flags[i] = DB_DELETE_ANY;
        mult_dbt_flags[i] = DB_DBT_REALLOC;
    }
    num_DBs_locked_in_bulk = false;
    lock_count = 0;
    use_write_locks = false;
    range_query_buff = NULL;
    size_range_query_buff = 0;
    bytes_used_in_range_query_buff = 0;
    curr_range_query_buff_offset = 0;
    doing_bulk_fetch = false;
    prelocked_left_range_size = 0;
    prelocked_right_range_size = 0;
    tokudb_active_index = MAX_KEY;
    invalidate_icp();
    trx_handler_list.data = this;
    in_rpl_write_rows = in_rpl_delete_rows = in_rpl_update_rows = false;
    TOKUDB_HANDLER_DBUG_VOID_RETURN;
}

ha_tokudb::~ha_tokudb() {
    TOKUDB_HANDLER_DBUG_ENTER("");
    for (uint32_t i = 0; i < sizeof(mult_key_dbt_array)/sizeof(mult_key_dbt_array[0]); i++) {
        toku_dbt_array_destroy(&mult_key_dbt_array[i]);
    }
    for (uint32_t i = 0; i < sizeof(mult_rec_dbt_array)/sizeof(mult_rec_dbt_array[0]); i++) {
        toku_dbt_array_destroy(&mult_rec_dbt_array[i]);
    }
    TOKUDB_HANDLER_DBUG_VOID_RETURN;
}

//
// states if table has an auto increment column, if so, sets index where auto inc column is to index
// Parameters:
//      [out] index - if auto inc exists, then this param is set to where it exists in table, if not, then unchanged
// Returns:
//      true if auto inc column exists, false otherwise
//
bool ha_tokudb::has_auto_increment_flag(uint* index) {
    //
    // check to see if we have auto increment field
    //
    bool ai_found = false;
    uint ai_index = 0;
    for (uint i = 0; i < table_share->fields; i++, ai_index++) {
        Field* field = table->field[i];
        if (field->flags & AUTO_INCREMENT_FLAG) {
            ai_found = true;
            *index = ai_index;
            break;
        }
    }
    return ai_found;
}

static int open_status_dictionary(DB** ptr, const char* name, DB_TXN* txn) {
    int error;
    char* newname = NULL;
    size_t newname_len = get_max_dict_name_path_length(name);
    newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
    if (newname == NULL) {
        error = ENOMEM;
        goto cleanup;
    }
    make_name(newname, newname_len, name, "status");
    TOKUDB_TRACE_FOR_FLAGS(TOKUDB_DEBUG_OPEN, "open:%s", newname);

    error = tokudb::metadata::open(db_env, ptr, newname, txn);
cleanup:
    tokudb::memory::free(newname);
    return error;
}

int ha_tokudb::open_main_dictionary(
    const char* name,
    bool is_read_only,
    DB_TXN* txn) {

    int error;
    char* newname = NULL;
    size_t newname_len = 0;
    uint open_flags = (is_read_only ? DB_RDONLY : 0) | DB_THREAD;

    assert_always(share->file == NULL);
    assert_always(share->key_file[primary_key] == NULL);
    newname_len = get_max_dict_name_path_length(name);
    newname = (char*)tokudb::memory::malloc(
        newname_len,
        MYF(MY_WME|MY_ZEROFILL));
    if (newname == NULL) {
        error = ENOMEM;
        goto exit;
    }
    make_name(newname, newname_len, name, "main");

    error = db_create(&share->file, db_env, 0);
    if (error) {
        goto exit;
    }
    share->key_file[primary_key] = share->file;

    error =
        share->file->open(
            share->file,
            txn,
            newname,
            NULL,
            DB_BTREE,
            open_flags,
            0);
    if (error) {
        goto exit;
    }

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_OPEN,
        "open:%s:file=%p",
        newname,
        share->file);

    error = 0;
exit:
    if (error) {
        if (share->file) {
            int r = share->file->close(
                share->file,
                0
                );
            assert_always(r==0);
            share->file = NULL;
            share->key_file[primary_key] = NULL;
        }
    }
    tokudb::memory::free(newname);
    return error;
}

//
// Open a secondary table, the key will be a secondary index, the data will
// be a primary key
//
int ha_tokudb::open_secondary_dictionary(
    DB** ptr,
    KEY* key_info,
    const char* name,
    bool is_read_only,
    DB_TXN* txn) {

    int error = ENOSYS;
    char dict_name[MAX_DICT_NAME_LEN];
    uint open_flags = (is_read_only ? DB_RDONLY : 0) | DB_THREAD;
    char* newname = NULL;
    size_t newname_len = 0;

    sprintf(dict_name, "key-%s", key_info->name.str);

    newname_len = get_max_dict_name_path_length(name);
    newname =
        (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME|MY_ZEROFILL));
    if (newname == NULL) {
        error = ENOMEM;
        goto cleanup;
    }
    make_name(newname, newname_len, name, dict_name);


    if ((error = db_create(ptr, db_env, 0))) {
        my_errno = error;
        goto cleanup;
    }


    error = (*ptr)->open(*ptr, txn, newname, NULL, DB_BTREE, open_flags, 0);
    if (error) {
        my_errno = error;
        goto cleanup;
    }
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_OPEN,
        "open:%s:file=%p",
        newname,
        *ptr);
cleanup:
    if (error) {
        if (*ptr) {
            int r = (*ptr)->close(*ptr, 0);
            assert_always(r==0);
            *ptr = NULL;
        }
    }
    tokudb::memory::free(newname);
    return error;
}

static int initialize_col_pack_info(KEY_AND_COL_INFO* kc_info, TABLE_SHARE* table_share, uint keynr) {
    int error = ENOSYS;
    //
    // set up the cp_info
    //
    assert_always(kc_info->cp_info[keynr] == NULL);
    kc_info->cp_info[keynr] = (COL_PACK_INFO*)tokudb::memory::malloc(
        table_share->fields * sizeof(COL_PACK_INFO),
        MYF(MY_WME | MY_ZEROFILL));
    if (kc_info->cp_info[keynr] == NULL) {
        error = ENOMEM;
        goto exit;
    }
    {
        uint32_t curr_fixed_offset = 0;
        uint32_t curr_var_index = 0;
        for (uint j = 0; j < table_share->fields; j++) {
            COL_PACK_INFO* curr = &kc_info->cp_info[keynr][j];
            //
            // need to set the offsets / indexes
            // offsets are calculated AFTER the NULL bytes
            //
            if (!bitmap_is_set(&kc_info->key_filters[keynr], j)) {
                if (is_fixed_field(kc_info, j)) {
                    curr->col_pack_val = curr_fixed_offset;
                    curr_fixed_offset += kc_info->field_lengths[j];
                }
                else if (is_variable_field(kc_info, j)) {
                    curr->col_pack_val = curr_var_index;
                    curr_var_index++;
                }
            }
        }

        //
        // set up the mcp_info
        //
        kc_info->mcp_info[keynr].fixed_field_size = get_fixed_field_size(
            kc_info,
            table_share,
            keynr
            );
        kc_info->mcp_info[keynr].len_of_offsets = get_len_of_offsets(
            kc_info,
            table_share,
            keynr
            );

        error = 0;
    }
exit:
    return error;
}

// reset the kc_info state at keynr
static void reset_key_and_col_info(KEY_AND_COL_INFO *kc_info, uint keynr) {
    bitmap_clear_all(&kc_info->key_filters[keynr]);
    tokudb::memory::free(kc_info->cp_info[keynr]);
    kc_info->cp_info[keynr] = NULL;
    kc_info->mcp_info[keynr] = (MULTI_COL_PACK_INFO) { 0, 0 };
}

static int initialize_key_and_col_info(
    TABLE_SHARE* table_share,
    TABLE* table,
    KEY_AND_COL_INFO* kc_info,
    uint hidden_primary_key,
    uint primary_key) {

    int error = 0;
    uint32_t curr_blob_field_index = 0;
    uint32_t max_var_bytes = 0;
    //
    // fill in the field lengths. 0 means it is a variable sized field length
    // fill in length_bytes, 0 means it is fixed or blob
    //
    for (uint i = 0; i < table_share->fields; i++) {
        Field* field = table_share->field[i];
        TOKU_TYPE toku_type = mysql_to_toku_type(field);
        uint32 pack_length = 0;
        switch (toku_type) {
        case toku_type_int:
        case toku_type_double:
        case toku_type_float:
        case toku_type_fixbinary:
        case toku_type_fixstring:
            pack_length = field->pack_length();
            assert_always(pack_length < 1<<16);
            kc_info->field_types[i] = KEY_AND_COL_INFO::TOKUDB_FIXED_FIELD;
            kc_info->field_lengths[i] = (uint16_t)pack_length;
            kc_info->length_bytes[i] = 0;
            break;
        case toku_type_blob:
            kc_info->field_types[i] = KEY_AND_COL_INFO::TOKUDB_BLOB_FIELD;
            kc_info->field_lengths[i] = 0;
            kc_info->length_bytes[i] = 0;
            kc_info->blob_fields[curr_blob_field_index] = i;
            curr_blob_field_index++;
            break;
        case toku_type_varstring:
        case toku_type_varbinary:
            kc_info->field_types[i] = KEY_AND_COL_INFO::TOKUDB_VARIABLE_FIELD;
            kc_info->field_lengths[i] = 0;
            kc_info->length_bytes[i] =
                (uchar)((Field_varstring*)field)->length_bytes;
            max_var_bytes += field->field_length;
            break;
        default:
            assert_unreachable();
        }
    }
    kc_info->num_blobs = curr_blob_field_index;

    //
    // initialize share->num_offset_bytes
    // because MAX_REF_LENGTH is 65536, we
    // can safely set num_offset_bytes to 1 or 2
    //
    if (max_var_bytes < 256) {
        kc_info->num_offset_bytes = 1;
    } else {
        kc_info->num_offset_bytes = 2;
    }
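
    //
    // For example (illustrative, assuming a single-byte character set): if
    // the only variable columns are VARCHAR(100) and VARCHAR(50), then
    // max_var_bytes == 150 < 256 and every end offset fits in one byte;
    // adding a VARCHAR(300) pushes max_var_bytes to 450 and forces two-byte
    // offsets.
    //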

    for (uint i = 0;
         i < table_share->keys + tokudb_test(hidden_primary_key);
         i++) {
        //
        // do the cluster/primary key filtering calculations
        //
        if (!(i==primary_key && hidden_primary_key)) {
            if (i == primary_key) {
                set_key_filter(
                    &kc_info->key_filters[primary_key],
                    &table_share->key_info[primary_key],
                    table,
                    true);
            } else {
                set_key_filter(
                    &kc_info->key_filters[i],
                    &table_share->key_info[i],
                    table,
                    true);
                if (!hidden_primary_key) {
                    set_key_filter(
                        &kc_info->key_filters[i],
                        &table_share->key_info[primary_key],
                        table,
                        true);
                }
            }
        }
        if (i == primary_key || key_is_clustering(&table_share->key_info[i])) {
            error = initialize_col_pack_info(kc_info, table_share, i);
            if (error) {
                goto exit;
            }
        }
    }
exit:
    return error;
}

bool ha_tokudb::can_replace_into_be_fast(
    TABLE_SHARE* table_share,
    KEY_AND_COL_INFO* kc_info,
    uint pk) {

    uint curr_num_DBs = table_share->keys + tokudb_test(hidden_primary_key);
    bool ret_val;
    if (curr_num_DBs == 1) {
        ret_val = true;
        goto exit;
    }
    ret_val = true;
    for (uint curr_index = 0; curr_index < table_share->keys; curr_index++) {
        if (curr_index == pk) continue;
        KEY* curr_key_info = &table_share->key_info[curr_index];
        for (uint i = 0; i < curr_key_info->user_defined_key_parts; i++) {
            uint16 curr_field_index = curr_key_info->key_part[i].field->field_index;
            if (!bitmap_is_set(&kc_info->key_filters[curr_index], curr_field_index)) {
                ret_val = false;
                goto exit;
            }
            if (bitmap_is_set(&kc_info->key_filters[curr_index], curr_field_index) &&
                !bitmap_is_set(&kc_info->key_filters[pk], curr_field_index)) {
                ret_val = false;
                goto exit;
            }

        }
    }
exit:
    return ret_val;
}

int ha_tokudb::initialize_share(const char* name, int mode) {
    int error = 0;
    uint64_t num_rows = 0;
    DB_TXN* txn = NULL;
    bool do_commit = false;
    THD* thd = ha_thd();
    tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(ha_thd(), tokudb_hton);
    if (thd_sql_command(thd) == SQLCOM_CREATE_TABLE && trx && trx->sub_sp_level) {
        txn = trx->sub_sp_level;
    }
    else {
        do_commit = true;
        error = txn_begin(db_env, 0, &txn, 0, thd);
        if (error) { goto exit; }
    }


    error = get_status(txn);
    if (error) {
        goto exit;
    }
    if (share->version != HA_TOKU_VERSION) {
        error = ENOSYS;
        goto exit;
    }

#if WITH_PARTITION_STORAGE_ENGINE
    // verify frm data for non-partitioned tables
    if (TOKU_PARTITION_WRITE_FRM_DATA || table->part_info == NULL) {
        error = verify_frm_data(table->s->path.str, txn);
        if (error)
            goto exit;
    } else {
        // remove the frm data for partitions since we are not maintaining it
        error = remove_frm_data(share->status_block, txn);
        if (error)
            goto exit;
    }
#else
    error = verify_frm_data(table->s->path.str, txn);
    if (error)
        goto exit;
#endif

    error =
        initialize_key_and_col_info(
            table_share,
            table,
            &share->kc_info,
            hidden_primary_key,
            primary_key);
    if (error) { goto exit; }

    error = open_main_dictionary(name, mode == O_RDONLY, txn);
    if (error) {
        goto exit;
    }

    share->has_unique_keys = false;
    share->_keys = table_share->keys;
    share->_max_key_parts = table_share->key_parts;
    share->_key_descriptors =
        (TOKUDB_SHARE::key_descriptor_t*)tokudb::memory::malloc(
            sizeof(TOKUDB_SHARE::key_descriptor_t) * share->_keys,
            MYF(MY_ZEROFILL));

    /* Open other keys; These are part of the share structure */
    for (uint i = 0; i < table_share->keys; i++) {
        share->_key_descriptors[i]._parts =
            table_share->key_info[i].user_defined_key_parts;
        if (i == primary_key) {
            share->_key_descriptors[i]._is_unique = true;
            share->_key_descriptors[i]._name = tokudb::memory::strdup("primary", 0);
        } else {
            share->_key_descriptors[i]._is_unique = false;
            share->_key_descriptors[i]._name =
                tokudb::memory::strdup(table_share->key_info[i].name.str, 0);
        }

        if (table_share->key_info[i].flags & HA_NOSAME) {
            share->_key_descriptors[i]._is_unique = true;
            share->has_unique_keys = true;
        }
        if (i != primary_key) {
            error =
                open_secondary_dictionary(
                    &share->key_file[i],
                    &table_share->key_info[i],
                    name,
                    mode == O_RDONLY,
                    txn);
            if (error) {
                goto exit;
            }
        }
    }
    share->replace_into_fast =
        can_replace_into_be_fast(
            table_share,
            &share->kc_info,
            primary_key);

    share->pk_has_string = false;
    if (!hidden_primary_key) {
        //
        // We need to set the ref_length to start at 5, to account for
        // the "infinity byte" in keys, and for placing the DBT size in the first four bytes
        //
        ref_length = sizeof(uint32_t) + sizeof(uchar);
        KEY_PART_INFO* key_part = table->key_info[primary_key].key_part;
        KEY_PART_INFO* end =
            key_part + table->key_info[primary_key].user_defined_key_parts;
        for (; key_part != end; key_part++) {
            ref_length += key_part->field->max_packed_col_length(key_part->length);
            TOKU_TYPE toku_type = mysql_to_toku_type(key_part->field);
            if (toku_type == toku_type_fixstring ||
                toku_type == toku_type_varstring ||
                toku_type == toku_type_blob
                )
            {
                share->pk_has_string = true;
            }
        }
        share->status |= STATUS_PRIMARY_KEY_INIT;
    }
    share->ref_length = ref_length;

    error = estimate_num_rows(share->file, &num_rows, txn);
    //
    // estimate_num_rows should not fail under normal conditions
    //
    if (error == 0) {
        share->set_row_count(num_rows, true);
    } else {
        goto exit;
    }
    //
    // initialize auto increment data
    //
    share->has_auto_inc = has_auto_increment_flag(&share->ai_field_index);
    if (share->has_auto_inc) {
        init_auto_increment();
    }

    if (may_table_be_empty(txn)) {
        share->try_table_lock = true;
    } else {
        share->try_table_lock = false;
    }

    share->num_DBs = table_share->keys + tokudb_test(hidden_primary_key);

    init_hidden_prim_key_info(txn);

    // initialize cardinality info from the status dictionary
    {
        uint32_t rec_per_keys = tokudb::compute_total_key_parts(table_share);
        uint64_t* rec_per_key =
            (uint64_t*)tokudb::memory::malloc(
                rec_per_keys * sizeof(uint64_t),
                MYF(MY_FAE));
        error =
            tokudb::get_card_from_status(
                share->status_block,
                txn,
                rec_per_keys,
                rec_per_key);
        if (error) {
            memset(rec_per_key, 0, sizeof(ulonglong) * rec_per_keys);
        }
        share->init_cardinality_counts(rec_per_keys, rec_per_key);
    }

    error = 0;
exit:
    if (do_commit && txn) {
        commit_txn(txn, 0);
    }
    return error;
}

//
// Creates and opens a handle to a table which already exists in a tokudb
// database.
// Parameters:
//      [in] name - table name
//      mode - seems to specify if table is read only
//      test_if_locked - unused
// Returns:
//      0 on success
//      1 on error
//
int ha_tokudb::open(const char *name, int mode, uint test_if_locked) {
    TOKUDB_HANDLER_DBUG_ENTER("%s %o %u", name, mode, test_if_locked);
    THD* thd = ha_thd();

    int error = 0;
    int ret_val = 0;

    transaction = NULL;
    cursor = NULL;


    /* Open primary key */
    hidden_primary_key = 0;
    if ((primary_key = table_share->primary_key) >= MAX_KEY) {
        // No primary key
        primary_key = table_share->keys;
        key_used_on_scan = MAX_KEY;
        hidden_primary_key = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
        ref_length = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH + sizeof(uint32_t);
    }
    else {
        key_used_on_scan = primary_key;
    }

    /* Need some extra memory in case of packed keys */
    // the "+ 1" is for the first byte that states +/- infinity
    // multiply everything by 2 to account for clustered keys having a key and primary key together
    max_key_length = 2*(table_share->max_key_length + MAX_REF_PARTS * 3 + sizeof(uchar));
    alloc_ptr = tokudb::memory::multi_malloc(
        MYF(MY_WME),
        &key_buff, max_key_length,
        &key_buff2, max_key_length,
        &key_buff3, max_key_length,
        &key_buff4, max_key_length,
        &prelocked_left_range, max_key_length,
        &prelocked_right_range, max_key_length,
        &primary_key_buff, (hidden_primary_key ? 0 : max_key_length),
        &fixed_cols_for_query, table_share->fields*sizeof(uint32_t),
        &var_cols_for_query, table_share->fields*sizeof(uint32_t),
        NullS);
    if (alloc_ptr == NULL) {
        ret_val = 1;
        goto exit;
    }

    size_range_query_buff = tokudb::sysvars::read_buf_size(thd);
    range_query_buff =
        (uchar*)tokudb::memory::malloc(size_range_query_buff, MYF(MY_WME));
    if (range_query_buff == NULL) {
        ret_val = 1;
        goto exit;
    }

    alloced_rec_buff_length = table_share->rec_buff_length +
        table_share->fields;
    rec_buff = (uchar *) tokudb::memory::malloc(
        alloced_rec_buff_length,
        MYF(MY_WME));
    if (rec_buff == NULL) {
        ret_val = 1;
        goto exit;
    }

    alloced_update_rec_buff_length = alloced_rec_buff_length;
    rec_update_buff = (uchar*)tokudb::memory::malloc(
        alloced_update_rec_buff_length,
        MYF(MY_WME));
    if (rec_update_buff == NULL) {
        ret_val = 1;
        goto exit;
    }

    // lookup or create share
    share = TOKUDB_SHARE::get_share(name, table_share, &lock, true);
    assert_always(share);

    if (share->state() != TOKUDB_SHARE::OPENED) {
        // means we're responsible for the transition to OPENED, ERROR or CLOSED

        ret_val = allocate_key_and_col_info(table_share, &share->kc_info);
        if (ret_val == 0) {
            ret_val = initialize_share(name, mode);
        }

        if (ret_val == 0) {
            share->set_state(TOKUDB_SHARE::OPENED);
        } else {
            free_key_and_col_info(&share->kc_info);
            share->set_state(TOKUDB_SHARE::ERROR);
        }
        share->unlock();
    } else {
        // got an already OPENED instance
        share->unlock();
    }

    if (share->state() == TOKUDB_SHARE::ERROR) {
        share->release();
        goto exit;
    }

    assert_always(share->state() == TOKUDB_SHARE::OPENED);

    ref_length = share->ref_length; // If second open

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_OPEN,
        "tokudbopen:%p:share=%p:file=%p:table=%p:table->s=%p:%d",
        this,
        share,
        share->file,
        table,
        table->s,
        share->use_count());

    key_read = false;
    stats.block_size = 1<<20; // QQQ Tokudb DB block size

    info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);

exit:
    if (ret_val) {
        tokudb::memory::free(range_query_buff);
        range_query_buff = NULL;
        tokudb::memory::free(alloc_ptr);
        alloc_ptr = NULL;
        tokudb::memory::free(rec_buff);
        rec_buff = NULL;
        tokudb::memory::free(rec_update_buff);
        rec_update_buff = NULL;

        if (error) {
            my_errno = error;
        }
    }
    TOKUDB_HANDLER_DBUG_RETURN(ret_val);
}

//
// estimate the number of rows in a DB
// Parameters:
//      [in] db - DB whose number of rows will be estimated
//      [out] num_rows - number of estimated rows in db
// Returns:
//      0 on success
//      error otherwise
//
int ha_tokudb::estimate_num_rows(DB* db, uint64_t* num_rows, DB_TXN* txn) {
    int error = ENOSYS;
    bool do_commit = false;
    DB_BTREE_STAT64 dict_stats;
    DB_TXN* txn_to_use = NULL;

    if (txn == NULL) {
        error = txn_begin(db_env, 0, &txn_to_use, DB_READ_UNCOMMITTED, ha_thd());
        if (error) goto cleanup;
        do_commit = true;
    }
    else {
        txn_to_use = txn;
    }

    error = db->stat64(db, txn_to_use, &dict_stats);
    if (error) { goto cleanup; }

    *num_rows = dict_stats.bt_ndata;
    error = 0;
cleanup:
    if (do_commit) {
        commit_txn(txn_to_use, 0);
        txn_to_use = NULL;
    }
    return error;
}
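
//
// Note (assumption about the ft layer, not verified here): stat64 reports
// statistics the dictionary maintains incrementally rather than scanning,
// so bt_ndata is fast to read but only an estimate; the DB_READ_UNCOMMITTED
// transaction above likewise avoids blocking on concurrent writers at the
// cost of possibly counting uncommitted rows.
//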
1988
1989
1990int ha_tokudb::write_to_status(DB* db, HA_METADATA_KEY curr_key_data, void* data, uint size, DB_TXN* txn ){
1991 return write_metadata(db, &curr_key_data, sizeof curr_key_data, data, size, txn);
1992}
1993
1994int ha_tokudb::remove_from_status(DB *db, HA_METADATA_KEY curr_key_data, DB_TXN *txn) {
1995 return remove_metadata(db, &curr_key_data, sizeof curr_key_data, txn);
1996}
1997
1998int ha_tokudb::remove_metadata(DB* db, void* key_data, uint key_size, DB_TXN* transaction){
1999 int error;
2000 DBT key;
2001 DB_TXN* txn = NULL;
2002 bool do_commit = false;
    //
    // transaction to be used for removing metadata from status.tokudb
    //
2006 if (transaction == NULL) {
2007 error = txn_begin(db_env, 0, &txn, 0, ha_thd());
2008 if (error) {
2009 goto cleanup;
2010 }
2011 do_commit = true;
2012 }
2013 else {
2014 txn = transaction;
2015 }
2016
2017 memset(&key, 0, sizeof(key));
2018 key.data = key_data;
2019 key.size = key_size;
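    // DB_DELETE_ANY issues the delete without first checking that the key
    // exists, so deleting an absent key is not treated as an error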
2020 error = db->del(db, txn, &key, DB_DELETE_ANY);
2021 if (error) {
2022 goto cleanup;
2023 }
2024
2025 error = 0;
2026cleanup:
2027 if (do_commit && txn) {
2028 if (!error) {
2029 commit_txn(txn, DB_TXN_NOSYNC);
2030 }
2031 else {
2032 abort_txn(txn);
2033 }
2034 }
2035 return error;
2036}
2037
//
// helper function to write a piece of metadata into status.tokudb
//
2041int ha_tokudb::write_metadata(DB* db, void* key_data, uint key_size, void* val_data, uint val_size, DB_TXN* transaction ){
2042 int error;
2043 DBT key;
2044 DBT value;
2045 DB_TXN* txn = NULL;
2046 bool do_commit = false;
2047 //
2048 // transaction to be used for putting metadata into status.tokudb
2049 //
2050 if (transaction == NULL) {
2051 error = txn_begin(db_env, 0, &txn, 0, ha_thd());
2052 if (error) {
2053 goto cleanup;
2054 }
2055 do_commit = true;
2056 }
2057 else {
2058 txn = transaction;
2059 }
2060
2061 memset(&key, 0, sizeof(key));
2062 memset(&value, 0, sizeof(value));
2063 key.data = key_data;
2064 key.size = key_size;
2065 value.data = val_data;
2066 value.size = val_size;
2067 error = db->put(db, txn, &key, &value, 0);
2068 if (error) {
2069 goto cleanup;
2070 }
2071
2072 error = 0;
2073cleanup:
2074 if (do_commit && txn) {
2075 if (!error) {
2076 commit_txn(txn, DB_TXN_NOSYNC);
2077 }
2078 else {
2079 abort_txn(txn);
2080 }
2081 }
2082 return error;
2083}
2084
2085int ha_tokudb::write_frm_data(DB* db, DB_TXN* txn, const char* frm_name) {
2086 TOKUDB_HANDLER_DBUG_ENTER("%p %p %s", db, txn, frm_name);
2087
2088 uchar* frm_data = NULL;
2089 size_t frm_len = 0;
2090 int error = 0;
2091
2092#if 100000 <= MYSQL_VERSION_ID
2093 error = table_share->read_frm_image((const uchar**)&frm_data,&frm_len);
2094 if (error) { goto cleanup; }
2095#else
2096 error = readfrm(frm_name,&frm_data,&frm_len);
2097 if (error) { goto cleanup; }
2098#endif
2099
2100 error = write_to_status(db,hatoku_frm_data,frm_data,(uint)frm_len, txn);
2101 if (error) { goto cleanup; }
2102
2103 error = 0;
2104cleanup:
2105 tokudb::memory::free(frm_data);
2106 TOKUDB_HANDLER_DBUG_RETURN(error);
2107}
2108
2109int ha_tokudb::remove_frm_data(DB *db, DB_TXN *txn) {
2110 return remove_from_status(db, hatoku_frm_data, txn);
2111}
2112
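// getf_set callback: copy the stored frm image out of the row DBT into
// caller-owned memory so it can still be inspected after the cursor call
// returns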
2113static int smart_dbt_callback_verify_frm (DBT const *key, DBT const *row, void *context) {
2114 DBT* stored_frm = (DBT *)context;
2115 stored_frm->size = row->size;
2116 stored_frm->data = (uchar *)tokudb::memory::malloc(row->size, MYF(MY_WME));
2117 assert_always(stored_frm->data);
2118 memcpy(stored_frm->data, row->data, row->size);
2119 return 0;
2120}
2121
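//
// Compare the frm image stored in status.tokudb with the one MySQL currently
// has; store it if missing, and return HA_ERR_TABLE_DEF_CHANGED on a mismatch
//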
2122int ha_tokudb::verify_frm_data(const char* frm_name, DB_TXN* txn) {
2123 TOKUDB_HANDLER_DBUG_ENTER("%s", frm_name);
2124 uchar* mysql_frm_data = NULL;
2125 size_t mysql_frm_len = 0;
2126 DBT key = {};
2127 DBT stored_frm = {};
2128 int error = 0;
2129 HA_METADATA_KEY curr_key = hatoku_frm_data;
2130
2131 // get the frm data from MySQL
2132#if 100000 <= MYSQL_VERSION_ID
2133 error = table_share->read_frm_image((const uchar**)&mysql_frm_data,&mysql_frm_len);
2134 if (error) {
2135 goto cleanup;
2136 }
2137#else
2138 error = readfrm(frm_name,&mysql_frm_data,&mysql_frm_len);
2139 if (error) {
2140 goto cleanup;
2141 }
2142#endif
2143
2144 key.data = &curr_key;
2145 key.size = sizeof(curr_key);
2146 error = share->status_block->getf_set(
2147 share->status_block,
2148 txn,
2149 0,
2150 &key,
2151 smart_dbt_callback_verify_frm,
2152 &stored_frm
2153 );
2154 if (error == DB_NOTFOUND) {
2155 // if not found, write it
2156 error = write_frm_data(share->status_block, txn, frm_name);
2157 goto cleanup;
2158 } else if (error) {
2159 goto cleanup;
2160 }
2161
2162 if (stored_frm.size != mysql_frm_len || memcmp(stored_frm.data, mysql_frm_data, stored_frm.size)) {
2163 error = HA_ERR_TABLE_DEF_CHANGED;
2164 goto cleanup;
2165 }
2166
2167 error = 0;
2168cleanup:
2169 tokudb::memory::free(mysql_frm_data);
2170 tokudb::memory::free(stored_frm.data);
2171 TOKUDB_HANDLER_DBUG_RETURN(error);
2172}
2173
2174//
2175// Updates status.tokudb with a new max value used for the auto increment column
2176// Parameters:
2177// [in] db - this will always be status.tokudb
2178// val - value to store
2179// Returns:
2180// 0 on success, error otherwise
//
2183int ha_tokudb::update_max_auto_inc(DB* db, ulonglong val){
2184 return write_to_status(db,hatoku_max_ai,&val,sizeof(val), NULL);
2185}
2186
//
// Writes the initial auto increment value, as specified by CREATE TABLE;
// so if a user does "create table t1 (a int auto_increment, primary key (a)) auto_increment=100",
// then the value 100 will be stored here in val
// Parameters:
//  [in] db - this will always be status.tokudb
//  val - value to store
//  Returns:
//  0 on success, error otherwise
//
2198int ha_tokudb::write_auto_inc_create(DB* db, ulonglong val, DB_TXN* txn){
2199 return write_to_status(db,hatoku_ai_create_value,&val,sizeof(val), txn);
2200}
2201
2202
2203//
2204// Closes a handle to a table.
2205//
2206int ha_tokudb::close() {
2207 TOKUDB_HANDLER_DBUG_ENTER("");
2208 int r = __close();
2209 TOKUDB_HANDLER_DBUG_RETURN(r);
2210}
2211
2212int ha_tokudb::__close() {
2213 TOKUDB_HANDLER_DBUG_ENTER("");
2214 TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_OPEN, "close:%p", this);
2215 tokudb::memory::free(rec_buff);
2216 tokudb::memory::free(rec_update_buff);
2217 tokudb::memory::free(blob_buff);
2218 tokudb::memory::free(alloc_ptr);
2219 tokudb::memory::free(range_query_buff);
2220 for (uint32_t i = 0; i < sizeof(mult_key_dbt_array)/sizeof(mult_key_dbt_array[0]); i++) {
2221 toku_dbt_array_destroy(&mult_key_dbt_array[i]);
2222 }
2223 for (uint32_t i = 0; i < sizeof(mult_rec_dbt_array)/sizeof(mult_rec_dbt_array[0]); i++) {
2224 toku_dbt_array_destroy(&mult_rec_dbt_array[i]);
2225 }
2226 rec_buff = NULL;
2227 rec_update_buff = NULL;
2228 alloc_ptr = NULL;
2229 ha_tokudb::reset();
2230 int retval = share->release();
2231 TOKUDB_HANDLER_DBUG_RETURN(retval);
2232}
2233
//
// Reallocate record buffer (rec_buff) if needed
// If not needed, does nothing
// Parameters:
//      length - size of buffer required for rec_buff
// Returns:
//      false on success, true if the reallocation failed
//
2240bool ha_tokudb::fix_rec_buff_for_blob(ulong length) {
2241 if (!rec_buff || (length > alloced_rec_buff_length)) {
2242 uchar* newptr = (uchar*)tokudb::memory::realloc(
2243 (void*)rec_buff,
2244 length,
2245 MYF(MY_ALLOW_ZERO_PTR));
2246 if (!newptr)
2247 return 1;
2248 rec_buff = newptr;
2249 alloced_rec_buff_length = length;
2250 }
2251 return 0;
2252}
2253
//
// Reallocate the update record buffer (rec_update_buff) if needed
// If not needed, does nothing
// Parameters:
//      length - size of buffer required for rec_update_buff
// Returns:
//      false on success, true if the reallocation failed
//
2260bool ha_tokudb::fix_rec_update_buff_for_blob(ulong length) {
2261 if (!rec_update_buff || (length > alloced_update_rec_buff_length)) {
2262 uchar* newptr = (uchar*)tokudb::memory::realloc(
2263 (void*)rec_update_buff,
2264 length,
2265 MYF(MY_ALLOW_ZERO_PTR));
2266 if (!newptr)
2267 return 1;
2268 rec_update_buff= newptr;
2269 alloced_update_rec_buff_length = length;
2270 }
2271 return 0;
2272}
2273
2274/* Calculate max length needed for row */
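/* The "+ 2" terms below appear to reserve two length bytes per field and
   per blob, so the result is an upper bound rather than an exact size. */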
2275ulong ha_tokudb::max_row_length(const uchar * buf) {
2276 ulong length = table_share->reclength + table_share->fields * 2;
2277 uint *ptr, *end;
2278 for (ptr = table_share->blob_field, end = ptr + table_share->blob_fields; ptr != end; ptr++) {
2279 Field_blob *blob = ((Field_blob *) table->field[*ptr]);
2280 length += blob->get_length((uchar *) (buf + field_offset(blob, table))) + 2;
2281 }
2282 return length;
2283}
2284
//
// Pack a row for storage: take the row in MySQL format in record and
// convert it into the storage format held in a DBT.
// If the row is of fixed length, just store the row 'as is'.
// If not, we will generate a packed row suitable for storage.
// This will only fail if we don't have enough memory to pack the row,
// which may only happen in rows with blobs, as the default row length is
// pre-allocated.
// Parameters:
//  [out]  row - DBT that will point at the packed row
//  [in]   record - row in MySQL format
//         index - index into key_file that the row is being packed for
//  [out]  row_buff - buffer that the packed row is written into
//
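// The packed row layout produced below is:
//   [null bytes][fixed fields][var field end-offsets][var field data][blobs]
// where the offset array stores, for each variable-length column, the end
// offset of its data relative to the start of the var field data section.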
2300
2301int ha_tokudb::pack_row_in_buff(
2302 DBT * row,
2303 const uchar* record,
2304 uint index,
2305 uchar* row_buff
2306 )
2307{
2308 uchar* fixed_field_ptr = NULL;
2309 uchar* var_field_offset_ptr = NULL;
2310 uchar* start_field_data_ptr = NULL;
2311 uchar* var_field_data_ptr = NULL;
2312 int r = ENOSYS;
2313 memset((void *) row, 0, sizeof(*row));
2314
2315 my_bitmap_map *old_map = dbug_tmp_use_all_columns(table, table->write_set);
2316
2317 // Copy null bytes
2318 memcpy(row_buff, record, table_share->null_bytes);
2319 fixed_field_ptr = row_buff + table_share->null_bytes;
2320 var_field_offset_ptr = fixed_field_ptr + share->kc_info.mcp_info[index].fixed_field_size;
2321 start_field_data_ptr = var_field_offset_ptr + share->kc_info.mcp_info[index].len_of_offsets;
2322 var_field_data_ptr = var_field_offset_ptr + share->kc_info.mcp_info[index].len_of_offsets;
2323
    // pack every field that is not already covered by this index's key filter
2325 for (uint i = 0; i < table_share->fields; i++) {
2326 Field* field = table->field[i];
2327 uint curr_field_offset = field_offset(field, table);
2328 if (bitmap_is_set(&share->kc_info.key_filters[index],i)) {
2329 continue;
2330 }
2331 if (is_fixed_field(&share->kc_info, i)) {
2332 fixed_field_ptr = pack_fixed_field(
2333 fixed_field_ptr,
2334 record + curr_field_offset,
2335 share->kc_info.field_lengths[i]
2336 );
2337 }
2338 else if (is_variable_field(&share->kc_info, i)) {
2339 var_field_data_ptr = pack_var_field(
2340 var_field_offset_ptr,
2341 var_field_data_ptr,
2342 start_field_data_ptr,
2343 record + curr_field_offset,
2344 share->kc_info.length_bytes[i],
2345 share->kc_info.num_offset_bytes
2346 );
2347 var_field_offset_ptr += share->kc_info.num_offset_bytes;
2348 }
2349 }
2350
2351 for (uint i = 0; i < share->kc_info.num_blobs; i++) {
2352 Field* field = table->field[share->kc_info.blob_fields[i]];
2353 var_field_data_ptr = pack_toku_field_blob(
2354 var_field_data_ptr,
2355 record + field_offset(field, table),
2356 field
2357 );
2358 }
2359
2360 row->data = row_buff;
2361 row->size = (size_t) (var_field_data_ptr - row_buff);
2362 r = 0;
2363
2364 dbug_tmp_restore_column_map(table->write_set, old_map);
2365 return r;
2366}
2367
2368
2369int ha_tokudb::pack_row(
2370 DBT * row,
2371 const uchar* record,
2372 uint index
2373 )
2374{
2375 return pack_row_in_buff(row,record,index,rec_buff);
2376}
2377
2378int ha_tokudb::pack_old_row_for_update(
2379 DBT * row,
2380 const uchar* record,
2381 uint index
2382 )
2383{
2384 return pack_row_in_buff(row,record,index,rec_update_buff);
2385}
2386
2387
2388int ha_tokudb::unpack_blobs(
2389 uchar* record,
2390 const uchar* from_tokudb_blob,
2391 uint32_t num_bytes,
2392 bool check_bitmap
2393 )
2394{
2395 uint error = 0;
2396 uchar* ptr = NULL;
2397 const uchar* buff = NULL;
    //
    // if the table has no blob columns then num_bytes must be 0
    //
2401 assert_always( !((share->kc_info.num_blobs == 0) && (num_bytes > 0)) );
2402 if (num_bytes > num_blob_bytes) {
2403 ptr = (uchar*)tokudb::memory::realloc(
2404 (void*)blob_buff, num_bytes,
2405 MYF(MY_ALLOW_ZERO_PTR));
2406 if (ptr == NULL) {
2407 error = ENOMEM;
2408 goto exit;
2409 }
2410 blob_buff = ptr;
2411 num_blob_bytes = num_bytes;
2412 }
2413
2414 memcpy(blob_buff, from_tokudb_blob, num_bytes);
2415 buff= blob_buff;
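    // each blob is stored as its length bytes followed by the blob data;
    // walk the copied buffer blob by blob, pointing each blob field in
    // record into blob_buff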
2416 for (uint i = 0; i < share->kc_info.num_blobs; i++) {
2417 uint32_t curr_field_index = share->kc_info.blob_fields[i];
2418 bool skip = check_bitmap ?
2419 !(bitmap_is_set(table->read_set,curr_field_index) ||
2420 bitmap_is_set(table->write_set,curr_field_index)) :
2421 false;
2422 Field* field = table->field[curr_field_index];
2423 uint32_t len_bytes = field->row_pack_length();
2424 const uchar* end_buff = unpack_toku_field_blob(
2425 record + field_offset(field, table),
2426 buff,
2427 len_bytes,
2428 skip
2429 );
2430 // verify that the pointers to the blobs are all contained within the blob_buff
2431 if (!(blob_buff <= buff && end_buff <= blob_buff + num_bytes)) {
2432 error = -3000000;
2433 goto exit;
2434 }
2435 buff = end_buff;
2436 }
2437 // verify that the entire blob buffer was parsed
2438 if (share->kc_info.num_blobs > 0 && !(num_bytes > 0 && buff == blob_buff + num_bytes)) {
2439 error = -4000000;
2440 goto exit;
2441 }
2442
2443 error = 0;
2444exit:
2445 return error;
2446}
2447
//
// take the row passed in as a DBT*, and convert it into a row in MySQL format in record
// Parameters:
//  [out]  record - row in MySQL format
//  [in]   row - row stored in DBT to be converted
//  [in]   key - key the row was read with, unpacked into record as well
//         index - index into key_file that the row came from
//
2454int ha_tokudb::unpack_row(
2455 uchar* record,
2456 DBT const *row,
2457 DBT const *key,
2458 uint index
2459 )
2460{
    //
    // two cases below: either unpack the entire row, or unpack only the
    // columns requested in fixed_cols_for_query and var_cols_for_query
    //
2465 /* Copy null bits */
2466 int error = 0;
2467 const uchar* fixed_field_ptr = (const uchar *) row->data;
2468 const uchar* var_field_offset_ptr = NULL;
2469 const uchar* var_field_data_ptr = NULL;
2470 uint32_t data_end_offset = 0;
2471 memcpy(record, fixed_field_ptr, table_share->null_bytes);
2472 fixed_field_ptr += table_share->null_bytes;
2473
2474 var_field_offset_ptr = fixed_field_ptr + share->kc_info.mcp_info[index].fixed_field_size;
2475 var_field_data_ptr = var_field_offset_ptr + share->kc_info.mcp_info[index].len_of_offsets;
2476
2477 //
2478 // unpack the key, if necessary
2479 //
2480 if (!(hidden_primary_key && index == primary_key)) {
2481 unpack_key(record,key,index);
2482 }
2483
2484 uint32_t last_offset = 0;
2485 //
2486 // we have two methods of unpacking, one if we need to unpack the entire row
2487 // the second if we unpack a subset of the entire row
2488 // first method here is if we unpack the entire row
2489 //
2490 if (unpack_entire_row) {
2491 //
2492 // fill in parts of record that are not part of the key
2493 //
2494 for (uint i = 0; i < table_share->fields; i++) {
2495 Field* field = table->field[i];
2496 if (bitmap_is_set(&share->kc_info.key_filters[index],i)) {
2497 continue;
2498 }
2499
2500 if (is_fixed_field(&share->kc_info, i)) {
2501 fixed_field_ptr = unpack_fixed_field(
2502 record + field_offset(field, table),
2503 fixed_field_ptr,
2504 share->kc_info.field_lengths[i]
2505 );
2506 }
2507 //
2508 // here, we DO modify var_field_data_ptr or var_field_offset_ptr
2509 // as we unpack variable sized fields
2510 //
2511 else if (is_variable_field(&share->kc_info, i)) {
2512 switch (share->kc_info.num_offset_bytes) {
2513 case (1):
2514 data_end_offset = var_field_offset_ptr[0];
2515 break;
2516 case (2):
2517 data_end_offset = uint2korr(var_field_offset_ptr);
2518 break;
2519 default:
2520 assert_unreachable();
2521 }
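                // offsets stored in the row are cumulative end offsets, so
                // this field's length is the difference from the previous
                // field's end offset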
2522 unpack_var_field(
2523 record + field_offset(field, table),
2524 var_field_data_ptr,
2525 data_end_offset - last_offset,
2526 share->kc_info.length_bytes[i]
2527 );
2528 var_field_offset_ptr += share->kc_info.num_offset_bytes;
2529 var_field_data_ptr += data_end_offset - last_offset;
2530 last_offset = data_end_offset;
2531 }
2532 }
2533 error = unpack_blobs(
2534 record,
2535 var_field_data_ptr,
2536 row->size - (uint32_t)(var_field_data_ptr - (const uchar *)row->data),
2537 false
2538 );
2539 if (error) {
2540 goto exit;
2541 }
2542 }
2543 //
2544 // in this case, we unpack only what is specified
2545 // in fixed_cols_for_query and var_cols_for_query
2546 //
2547 else {
2548 //
2549 // first the fixed fields
2550 //
2551 for (uint32_t i = 0; i < num_fixed_cols_for_query; i++) {
2552 uint field_index = fixed_cols_for_query[i];
2553 Field* field = table->field[field_index];
2554 unpack_fixed_field(
2555 record + field_offset(field, table),
2556 fixed_field_ptr + share->kc_info.cp_info[index][field_index].col_pack_val,
2557 share->kc_info.field_lengths[field_index]
2558 );
2559 }
2560
2561 //
2562 // now the var fields
2563 // here, we do NOT modify var_field_data_ptr or var_field_offset_ptr
2564 //
2565 for (uint32_t i = 0; i < num_var_cols_for_query; i++) {
2566 uint field_index = var_cols_for_query[i];
2567 Field* field = table->field[field_index];
2568 uint32_t var_field_index = share->kc_info.cp_info[index][field_index].col_pack_val;
2569 uint32_t data_start_offset;
2570 uint32_t field_len;
2571
2572 get_var_field_info(
2573 &field_len,
2574 &data_start_offset,
2575 var_field_index,
2576 var_field_offset_ptr,
2577 share->kc_info.num_offset_bytes
2578 );
2579
2580 unpack_var_field(
2581 record + field_offset(field, table),
2582 var_field_data_ptr + data_start_offset,
2583 field_len,
2584 share->kc_info.length_bytes[field_index]
2585 );
2586 }
2587
2588 if (read_blobs) {
2589 //
2590 // now the blobs
2591 //
2592 get_blob_field_info(
2593 &data_end_offset,
2594 share->kc_info.mcp_info[index].len_of_offsets,
2595 var_field_data_ptr,
2596 share->kc_info.num_offset_bytes
2597 );
2598
2599 var_field_data_ptr += data_end_offset;
2600 error = unpack_blobs(
2601 record,
2602 var_field_data_ptr,
2603 row->size - (uint32_t)(var_field_data_ptr - (const uchar *)row->data),
2604 true
2605 );
2606 if (error) {
2607 goto exit;
2608 }
2609 }
2610 }
2611 error = 0;
2612exit:
2613 return error;
2614}
2615
2616uint32_t ha_tokudb::place_key_into_mysql_buff(
2617 KEY* key_info,
2618 uchar* record,
2619 uchar* data) {
2620
2621 KEY_PART_INFO* key_part = key_info->key_part;
2622 KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
2623 uchar* pos = data;
2624
2625 for (; key_part != end; key_part++) {
2626 if (key_part->field->null_bit) {
2627 uint null_offset = get_null_offset(table, key_part->field);
2628 if (*pos++ == NULL_COL_VAL) { // Null value
2629 //
2630 // We don't need to reset the record data as we will not access it
2631 // if the null data is set
2632 //
2633 record[null_offset] |= key_part->field->null_bit;
2634 continue;
2635 }
2636 record[null_offset] &= ~key_part->field->null_bit;
2637 }
2638#if !defined(MARIADB_BASE_VERSION)
2639 //
2640 // HOPEFULLY TEMPORARY
2641 //
2642 assert_always(table->s->db_low_byte_first);
2643#endif
2644 pos = unpack_toku_key_field(
2645 record + field_offset(key_part->field, table),
2646 pos,
2647 key_part->field,
2648 key_part->length
2649 );
2650 }
2651 return pos-data;
2652}
2653
//
// Store the key and the primary key into the row
// Parameters:
//  [out]  record - key stored in MySQL format
//  [in]   key - key stored in DBT to be converted
//         index - index into key_file that represents the DB whose key
//                 is being unpacked
//
2662void ha_tokudb::unpack_key(uchar * record, DBT const *key, uint index) {
2663 uint32_t bytes_read;
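    // skip the infinity byte that is stored at the front of every packed key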
2664 uchar *pos = (uchar *) key->data + 1;
2665 bytes_read = place_key_into_mysql_buff(
2666 &table->key_info[index],
2667 record,
2668 pos
2669 );
2670 if( (index != primary_key) && !hidden_primary_key) {
2671 //
2672 // also unpack primary key
2673 //
2674 place_key_into_mysql_buff(
2675 &table->key_info[primary_key],
2676 record,
2677 pos+bytes_read
2678 );
2679 }
2680}
2681
2682uint32_t ha_tokudb::place_key_into_dbt_buff(
2683 KEY* key_info,
2684 uchar* buff,
2685 const uchar* record,
2686 bool* has_null,
2687 int key_length) {
2688
2689 KEY_PART_INFO* key_part = key_info->key_part;
2690 KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
2691 uchar* curr_buff = buff;
2692 *has_null = false;
2693 for (; key_part != end && key_length > 0; key_part++) {
2694 //
        // accessing key_part->field->null_bit instead of key_part->null_bit
2696 // because key_part->null_bit is not set in add_index
2697 // filed ticket 862 to look into this
2698 //
2699 if (key_part->field->null_bit) {
2700 /* Store 0 if the key part is a NULL part */
2701 uint null_offset = get_null_offset(table, key_part->field);
2702 if (record[null_offset] & key_part->field->null_bit) {
2703 *curr_buff++ = NULL_COL_VAL;
2704 *has_null = true;
2705 continue;
2706 }
2707 *curr_buff++ = NONNULL_COL_VAL; // Store NOT NULL marker
2708 }
2709#if !defined(MARIADB_BASE_VERSION)
2710 //
2711 // HOPEFULLY TEMPORARY
2712 //
2713 assert_always(table->s->db_low_byte_first);
2714#endif
2715 //
        // accessing field_offset(key_part->field) instead of key_part->offset
2717 // because key_part->offset is SET INCORRECTLY in add_index
2718 // filed ticket 862 to look into this
2719 //
2720 curr_buff = pack_toku_key_field(
2721 curr_buff,
2722 (uchar *) (record + field_offset(key_part->field, table)),
2723 key_part->field,
2724 key_part->length
2725 );
2726 key_length -= key_part->length;
2727 }
2728 return curr_buff - buff;
2729}
2730
2731
2732
2733//
2734// Create a packed key from a row. This key will be written as such
2735// to the index tree. This will never fail as the key buffer is pre-allocated.
2736// Parameters:
2737// [out] key - DBT that holds the key
//  [in]   key_info - holds data about the key, such as its length and offset into record
2739// [out] buff - buffer that will hold the data for key (unless
2740// we have a hidden primary key)
2741// [in] record - row from which to create the key
2742// key_length - currently set to MAX_KEY_LENGTH, is it size of buff?
2743// Returns:
2744// the parameter key
2745//
2746
2747DBT* ha_tokudb::create_dbt_key_from_key(
2748 DBT * key,
2749 KEY* key_info,
2750 uchar * buff,
2751 const uchar * record,
2752 bool* has_null,
2753 bool dont_pack_pk,
2754 int key_length,
2755 uint8_t inf_byte
2756 )
2757{
2758 uint32_t size = 0;
2759 uchar* tmp_buff = buff;
2760 my_bitmap_map *old_map = dbug_tmp_use_all_columns(table, table->write_set);
2761
2762 key->data = buff;
2763
2764 //
2765 // first put the "infinity" byte at beginning. States if missing columns are implicitly
2766 // positive infinity or negative infinity or zero. For this, because we are creating key
2767 // from a row, there is no way that columns can be missing, so in practice,
2768 // this will be meaningless. Might as well put in a value
2769 //
2770 *tmp_buff++ = inf_byte;
2771 size++;
2772 size += place_key_into_dbt_buff(
2773 key_info,
2774 tmp_buff,
2775 record,
2776 has_null,
2777 key_length
2778 );
2779 if (!dont_pack_pk) {
2780 tmp_buff = buff + size;
2781 if (hidden_primary_key) {
2782 memcpy(tmp_buff, current_ident, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
2783 size += TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
2784 }
2785 else {
2786 bool tmp_bool = false;
2787 size += place_key_into_dbt_buff(
2788 &table->key_info[primary_key],
2789 tmp_buff,
2790 record,
2791 &tmp_bool,
2792 MAX_KEY_LENGTH //this parameter does not matter
2793 );
2794 }
2795 }
2796
2797 key->size = size;
2798 DBUG_DUMP("key", (uchar *) key->data, key->size);
2799 dbug_tmp_restore_column_map(table->write_set, old_map);
2800 return key;
2801}
2802
2803
2804//
2805// Create a packed key from a row. This key will be written as such
2806// to the index tree. This will never fail as the key buffer is pre-allocated.
2807// Parameters:
2808// [out] key - DBT that holds the key
2809// keynr - index for which to create the key
2810// [out] buff - buffer that will hold the data for key (unless
2811// we have a hidden primary key)
2812// [in] record - row from which to create the key
2813// [out] has_null - says if the key has a NULL value for one of its columns
2814// key_length - currently set to MAX_KEY_LENGTH, is it size of buff?
2815// Returns:
2816// the parameter key
2817//
2818DBT *ha_tokudb::create_dbt_key_from_table(
2819 DBT * key,
2820 uint keynr,
2821 uchar * buff,
2822 const uchar * record,
2823 bool* has_null,
2824 int key_length
2825 )
2826{
2827 TOKUDB_HANDLER_DBUG_ENTER("");
2828 memset((void *) key, 0, sizeof(*key));
2829 if (hidden_primary_key && keynr == primary_key) {
2830 key->data = buff;
2831 memcpy(buff, &current_ident, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
2832 key->size = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
2833 *has_null = false;
2834 DBUG_RETURN(key);
2835 }
2836 DBUG_RETURN(create_dbt_key_from_key(key, &table->key_info[keynr],buff,record, has_null, (keynr == primary_key), key_length, COL_ZERO));
2837}
2838
2839DBT* ha_tokudb::create_dbt_key_for_lookup(
2840 DBT * key,
2841 KEY* key_info,
2842 uchar * buff,
2843 const uchar * record,
2844 bool* has_null,
2845 int key_length
2846 )
2847{
2848 TOKUDB_HANDLER_DBUG_ENTER("");
2849 // override the infinity byte, needed in case the pk is a string
2850 // to make sure that the cursor that uses this key properly positions
2851 // it at the right location. If the table stores "D", but we look up for "d",
2852 // and the infinity byte is 0, then we will skip the "D", because
2853 // in bytes, "d" > "D".
2854 DBT* ret = create_dbt_key_from_key(key, key_info, buff, record, has_null, true, key_length, COL_NEG_INF);
2855 DBUG_RETURN(ret);
2856}
2857
2858//
// Create a packed key from a MySQL unpacked key (like the one that is
// sent from index_read()). This key is to be used to read a row.
2861// Parameters:
2862// [out] key - DBT that holds the key
2863// keynr - index for which to pack the key
2864// [out] buff - buffer that will hold the data for key
2865// [in] key_ptr - MySQL unpacked key
2866// key_length - length of key_ptr
2867// Returns:
2868// the parameter key
2869//
2870DBT* ha_tokudb::pack_key(
2871 DBT* key,
2872 uint keynr,
2873 uchar* buff,
2874 const uchar* key_ptr,
2875 uint key_length,
2876 int8_t inf_byte) {
2877
2878 TOKUDB_HANDLER_DBUG_ENTER(
2879 "key %p %u:%2.2x inf=%d",
2880 key_ptr,
2881 key_length,
2882 key_length > 0 ? key_ptr[0] : 0,
2883 inf_byte);
2884#if TOKU_INCLUDE_EXTENDED_KEYS
2885 if (keynr != primary_key && !tokudb_test(hidden_primary_key)) {
2886 DBUG_RETURN(pack_ext_key(key, keynr, buff, key_ptr, key_length, inf_byte));
2887 }
2888#endif
2889 KEY* key_info = &table->key_info[keynr];
2890 KEY_PART_INFO* key_part = key_info->key_part;
2891 KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
2892 my_bitmap_map* old_map = dbug_tmp_use_all_columns(table, table->write_set);
2893
2894 memset((void *) key, 0, sizeof(*key));
2895 key->data = buff;
2896
2897 // first put the "infinity" byte at beginning. States if missing columns are implicitly
2898 // positive infinity or negative infinity
2899 *buff++ = (uchar)inf_byte;
2900
2901 for (; key_part != end && (int) key_length > 0; key_part++) {
2902 uint offset = 0;
2903 if (key_part->null_bit) {
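            // in the MySQL key buffer each nullable key part is prefixed
            // with one byte: non-zero means the value is NULL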
2904 if (!(*key_ptr == 0)) {
2905 *buff++ = NULL_COL_VAL;
2906 key_length -= key_part->store_length;
2907 key_ptr += key_part->store_length;
2908 continue;
2909 }
2910 *buff++ = NONNULL_COL_VAL;
2911 offset = 1; // Data is at key_ptr+1
2912 }
2913#if !defined(MARIADB_BASE_VERSION)
2914 assert_always(table->s->db_low_byte_first);
2915#endif
2916 buff = pack_key_toku_key_field(
2917 buff,
2918 (uchar *) key_ptr + offset,
2919 key_part->field,
2920 key_part->length
2921 );
2922
2923 key_ptr += key_part->store_length;
2924 key_length -= key_part->store_length;
2925 }
2926
2927 key->size = (buff - (uchar *) key->data);
2928 DBUG_DUMP("key", (uchar *) key->data, key->size);
2929 dbug_tmp_restore_column_map(table->write_set, old_map);
2930 DBUG_RETURN(key);
2931}
2932
2933#if TOKU_INCLUDE_EXTENDED_KEYS
2934DBT* ha_tokudb::pack_ext_key(
2935 DBT* key,
2936 uint keynr,
2937 uchar* buff,
2938 const uchar* key_ptr,
2939 uint key_length,
2940 int8_t inf_byte) {
2941
2942 TOKUDB_HANDLER_DBUG_ENTER("");
2943
2944 // build a list of PK parts that are in the SK. we will use this list to build the
2945 // extended key if necessary.
2946 KEY* pk_key_info = &table->key_info[primary_key];
2947 uint pk_parts = pk_key_info->user_defined_key_parts;
2948 uint pk_next = 0;
2949 struct {
2950 const uchar *key_ptr;
2951 KEY_PART_INFO *key_part;
2952 } pk_info[pk_parts];
2953
2954 KEY* key_info = &table->key_info[keynr];
2955 KEY_PART_INFO* key_part = key_info->key_part;
2956 KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
2957 my_bitmap_map* old_map = dbug_tmp_use_all_columns(table, table->write_set);
2958
2959 memset((void *) key, 0, sizeof(*key));
2960 key->data = buff;
2961
2962 // first put the "infinity" byte at beginning. States if missing columns are implicitly
2963 // positive infinity or negative infinity
2964 *buff++ = (uchar)inf_byte;
2965
2966 for (; key_part != end && (int) key_length > 0; key_part++) {
2967 // if the SK part is part of the PK, then append it to the list.
2968 if (key_part->field->part_of_key.is_set(primary_key)) {
2969 assert_always(pk_next < pk_parts);
2970 pk_info[pk_next].key_ptr = key_ptr;
2971 pk_info[pk_next].key_part = key_part;
2972 pk_next++;
2973 }
2974 uint offset = 0;
2975 if (key_part->null_bit) {
2976 if (!(*key_ptr == 0)) {
2977 *buff++ = NULL_COL_VAL;
2978 key_length -= key_part->store_length;
2979 key_ptr += key_part->store_length;
2980 continue;
2981 }
2982 *buff++ = NONNULL_COL_VAL;
2983 offset = 1; // Data is at key_ptr+1
2984 }
2985#if !defined(MARIADB_BASE_VERSION)
2986 assert_always(table->s->db_low_byte_first);
2987#endif
2988 buff = pack_key_toku_key_field(
2989 buff,
2990 (uchar *) key_ptr + offset,
2991 key_part->field,
2992 key_part->length
2993 );
2994
2995 key_ptr += key_part->store_length;
2996 key_length -= key_part->store_length;
2997 }
2998
2999 if (key_length > 0) {
3000 assert_always(key_part == end);
3001 end = key_info->key_part + get_ext_key_parts(key_info);
3002
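        // the remaining bytes of the MySQL key are the PK suffix; pack the
        // PK parts in PK column order, reusing values already seen in the
        // SK portion (recorded in pk_info above) and consuming key_ptr for
        // the rest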
3003 // pack PK in order of PK key parts
3004 for (uint pk_index = 0;
3005 key_part != end && (int) key_length > 0 && pk_index < pk_parts;
3006 pk_index++) {
3007 uint i;
3008 for (i = 0; i < pk_next; i++) {
3009 if (pk_info[i].key_part->fieldnr ==
3010 pk_key_info->key_part[pk_index].fieldnr)
3011 break;
3012 }
3013 if (i < pk_next) {
3014 const uchar *this_key_ptr = pk_info[i].key_ptr;
3015 KEY_PART_INFO *this_key_part = pk_info[i].key_part;
3016 buff = pack_key_toku_key_field(
3017 buff,
3018 (uchar*)this_key_ptr,
3019 this_key_part->field,
3020 this_key_part->length);
3021 } else {
3022 buff = pack_key_toku_key_field(
3023 buff,
3024 (uchar*)key_ptr,
3025 key_part->field,
3026 key_part->length);
3027 key_ptr += key_part->store_length;
3028 key_length -= key_part->store_length;
3029 key_part++;
3030 }
3031 }
3032 }
3033
3034 key->size = (buff - (uchar *) key->data);
3035 DBUG_DUMP("key", (uchar *) key->data, key->size);
3036 dbug_tmp_restore_column_map(table->write_set, old_map);
3037 DBUG_RETURN(key);
3038}
3039#endif
3040
3041//
3042// get max used hidden primary key value
3043//
3044void ha_tokudb::init_hidden_prim_key_info(DB_TXN *txn) {
3045 TOKUDB_HANDLER_DBUG_ENTER("");
3046 if (!(share->status & STATUS_PRIMARY_KEY_INIT)) {
3047 int error = 0;
3048 DBC* c = NULL;
3049 error = share->key_file[primary_key]->cursor(
3050 share->key_file[primary_key],
3051 txn,
3052 &c,
3053 0);
3054 assert_always(error == 0);
3055 DBT key,val;
3056 memset(&key, 0, sizeof(key));
3057 memset(&val, 0, sizeof(val));
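        // the hidden primary key increases monotonically, so the last row
        // in the primary dictionary holds the max value used so far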
3058 error = c->c_get(c, &key, &val, DB_LAST);
3059 if (error == 0) {
3060 assert_always(key.size == TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
3061 share->auto_ident = hpk_char_to_num((uchar *)key.data);
3062 }
3063 error = c->c_close(c);
3064 assert_always(error == 0);
3065 share->status |= STATUS_PRIMARY_KEY_INIT;
3066 }
3067 TOKUDB_HANDLER_DBUG_VOID_RETURN;
3068}
3069
3070
3071
3072/** @brief
3073 Get metadata info stored in status.tokudb
3074 */
3075int ha_tokudb::get_status(DB_TXN* txn) {
3076 TOKUDB_HANDLER_DBUG_ENTER("");
3077 DBT key, value;
3078 HA_METADATA_KEY curr_key;
3079 int error;
3080
3081 //
3082 // open status.tokudb
3083 //
3084 if (!share->status_block) {
3085 error =
3086 open_status_dictionary(
3087 &share->status_block,
3088 share->full_table_name(),
3089 txn);
3090 if (error) {
3091 goto cleanup;
3092 }
3093 }
3094
    //
    // set up the key/value DBTs used to read metadata from status.tokudb
    //
3098 memset(&key, 0, sizeof(key));
3099 memset(&value, 0, sizeof(value));
3100 key.data = &curr_key;
3101 key.size = sizeof(curr_key);
3102 value.flags = DB_DBT_USERMEM;
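    // DB_DBT_USERMEM: value.data points at caller-owned storage of size
    // value.ulen, so get() fills it in place instead of allocating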
3103
3104 assert_always(share->status_block);
3105 //
3106 // get version
3107 //
3108 value.ulen = sizeof(share->version);
3109 value.data = &share->version;
3110 curr_key = hatoku_new_version;
3111 error = share->status_block->get(
3112 share->status_block,
3113 txn,
3114 &key,
3115 &value,
3116 0
3117 );
3118 if (error == DB_NOTFOUND) {
3119 //
    // hack to handle the issues of going back and forth
    // between 5.0.3 and 5.0.4
    // the problem with going back and forth
    // is with storing the frm file: 5.0.4 stores it, 5.0.3 does not,
    // so, if a user goes back and forth and alters the schema,
    // the stored frm can get out of sync with the schema of the table.
    // This can cause issues.
    // To take care of this, we are doing this versioning work here.
    // We change the key that stores the version.
    // In 5.0.3 it is hatoku_old_version, in 5.0.4 it is hatoku_new_version.
    // When we encounter a table that does not have hatoku_new_version
    // set, we give it the right one, and overwrite the old one with zero.
    // This ensures that 5.0.3 cannot open the table once it has been
    // opened by 5.0.4.
3133 //
3134 uint dummy_version = 0;
3135 share->version = HA_TOKU_ORIG_VERSION;
3136 error = write_to_status(
3137 share->status_block,
3138 hatoku_new_version,
3139 &share->version,
3140 sizeof(share->version),
3141 txn
3142 );
3143 if (error) { goto cleanup; }
3144 error = write_to_status(
3145 share->status_block,
3146 hatoku_old_version,
3147 &dummy_version,
3148 sizeof(dummy_version),
3149 txn
3150 );
3151 if (error) { goto cleanup; }
3152 }
3153 else if (error || value.size != sizeof(share->version)) {
3154 if (error == 0) {
3155 error = HA_ERR_INTERNAL_ERROR;
3156 }
3157 goto cleanup;
3158 }
3159 //
3160 // get capabilities
3161 //
3162 curr_key = hatoku_capabilities;
3163 value.ulen = sizeof(share->capabilities);
3164 value.data = &share->capabilities;
3165 error = share->status_block->get(
3166 share->status_block,
3167 txn,
3168 &key,
3169 &value,
3170 0
3171 );
3172 if (error == DB_NOTFOUND) {
3173 share->capabilities= 0;
3174 }
    else if (error || value.size != sizeof(share->capabilities)) {
3176 if (error == 0) {
3177 error = HA_ERR_INTERNAL_ERROR;
3178 }
3179 goto cleanup;
3180 }
3181
3182 error = 0;
3183cleanup:
3184 TOKUDB_HANDLER_DBUG_RETURN(error);
3185}
3186
3187/** @brief
    Return an estimate of the number of rows in the table.
3189 Used when sorting to allocate buffers and by the optimizer.
3190 This is used in filesort.cc.
3191*/
3192ha_rows ha_tokudb::estimate_rows_upper_bound() {
3193 TOKUDB_HANDLER_DBUG_ENTER("");
3194 DBUG_RETURN(share->row_count() + HA_TOKUDB_EXTRA_ROWS);
3195}
3196
3197//
3198// Function that compares two primary keys that were saved as part of rnd_pos
3199// and ::position
3200//
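// Each ref is a uint32_t key length followed by the packed key bytes; the
// comparison uses the key descriptor stored in the main dictionary, skipping
// what appears to be its own leading 4-byte size.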
3201int ha_tokudb::cmp_ref(const uchar * ref1, const uchar * ref2) {
3202 int ret_val = 0;
3203 bool read_string = false;
3204 ret_val = tokudb_compare_two_keys(
3205 ref1 + sizeof(uint32_t),
3206 *(uint32_t *)ref1,
3207 ref2 + sizeof(uint32_t),
3208 *(uint32_t *)ref2,
3209 (uchar *)share->file->descriptor->dbt.data + 4,
3210 *(uint32_t *)share->file->descriptor->dbt.data - 4,
3211 false,
3212 &read_string
3213 );
3214 return ret_val;
3215}
3216
3217bool ha_tokudb::check_if_incompatible_data(HA_CREATE_INFO * info, uint table_changes) {
3218 //
    // This is a horrendous hack for now, as copied from InnoDB.
3220 // This states that if the auto increment create field has changed,
3221 // via a "alter table foo auto_increment=new_val", that this
3222 // change is incompatible, and to rebuild the entire table
3223 // This will need to be fixed
3224 //
3225 if ((info->used_fields & HA_CREATE_USED_AUTO) &&
3226 info->auto_increment_value != 0) {
3227
3228 return COMPATIBLE_DATA_NO;
3229 }
3230 if (table_changes != IS_EQUAL_YES)
3231 return COMPATIBLE_DATA_NO;
3232 return COMPATIBLE_DATA_YES;
3233}
3234
//
3248// This function returns true if the table MAY be empty.
3249// It is NOT meant to be a 100% check for emptiness.
3250// This is used for a bulk load optimization.
3251//
3252bool ha_tokudb::may_table_be_empty(DB_TXN *txn) {
3253 int error;
3254 bool ret_val = false;
3255 DBC* tmp_cursor = NULL;
3256 DB_TXN* tmp_txn = NULL;
3257
3258 const int empty_scan = tokudb::sysvars::empty_scan(ha_thd());
3259 if (empty_scan == tokudb::sysvars::TOKUDB_EMPTY_SCAN_DISABLED)
3260 goto cleanup;
3261
3262 if (txn == NULL) {
3263 error = txn_begin(db_env, 0, &tmp_txn, 0, ha_thd());
3264 if (error) {
3265 goto cleanup;
3266 }
3267 txn = tmp_txn;
3268 }
3269
3270 error = share->file->cursor(share->file, txn, &tmp_cursor, 0);
3271 if (error)
3272 goto cleanup;
3273 tmp_cursor->c_set_check_interrupt_callback(tmp_cursor, tokudb_killed_thd_callback, ha_thd());
3274 if (empty_scan == tokudb::sysvars::TOKUDB_EMPTY_SCAN_LR)
3275 error = tmp_cursor->c_getf_next(tmp_cursor, 0, smart_dbt_do_nothing, NULL);
3276 else
3277 error = tmp_cursor->c_getf_prev(tmp_cursor, 0, smart_dbt_do_nothing, NULL);
3278 error = map_to_handler_error(error);
3279 if (error == DB_NOTFOUND)
3280 ret_val = true;
3281 else
3282 ret_val = false;
3283 error = 0;
3284
3285cleanup:
3286 if (tmp_cursor) {
3287 int r = tmp_cursor->c_close(tmp_cursor);
3288 assert_always(r == 0);
3289 tmp_cursor = NULL;
3290 }
3291 if (tmp_txn) {
3292 commit_txn(tmp_txn, 0);
3293 tmp_txn = NULL;
3294 }
3295 return ret_val;
3296}

//
// Method that is called before the beginning of many calls
// to insert rows (ha_tokudb::write_row). There is no guarantee
// that start_bulk_insert is called, however there is a guarantee
// that if start_bulk_insert is called, then end_bulk_insert may be
// called as well.
// Parameters:
//  [in] rows - an estimate of the number of rows that will be inserted;
//              if the number of rows is unknown (such as when doing
//              "insert into foo select * from bar"), rows will be 0
//
3298#if MYSQL_VERSION_ID >= 100000
3299void ha_tokudb::start_bulk_insert(ha_rows rows, uint flags) {
3300 TOKUDB_HANDLER_DBUG_ENTER("%llu %u txn %p", (unsigned long long) rows, flags, transaction);
3301#else
3302void ha_tokudb::start_bulk_insert(ha_rows rows) {
3303 TOKUDB_HANDLER_DBUG_ENTER("%llu txn %p", (unsigned long long) rows, transaction);
3304#endif
3305 THD* thd = ha_thd();
3306 tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
3307 delay_updating_ai_metadata = true;
3308 ai_metadata_update_required = false;
3309 abort_loader = false;
3310
3311 rwlock_t_lock_read(share->_num_DBs_lock);
3312 uint curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
3313 num_DBs_locked_in_bulk = true;
3314 lock_count = 0;
3315
3316 if ((rows == 0 || rows > 1) && share->try_table_lock) {
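        // try_table_lock is cleared below whether or not a loader was
        // created, so this optimization is attempted only once per share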
3317 if (tokudb::sysvars::prelock_empty(thd) &&
3318 may_table_be_empty(transaction) &&
3319 transaction != NULL) {
3320 if (using_ignore || is_insert_ignore(thd) || thd->lex->duplicates != DUP_ERROR
3321 || table->s->next_number_key_offset) {
3322 acquire_table_lock(transaction, lock_write);
3323 } else {
3324 mult_dbt_flags[primary_key] = 0;
3325 if (!thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS) && !hidden_primary_key) {
3326 mult_put_flags[primary_key] = DB_NOOVERWRITE;
3327 }
3328 uint32_t loader_flags = (tokudb::sysvars::load_save_space(thd)) ?
3329 LOADER_COMPRESS_INTERMEDIATES : 0;
3330
3331 int error = db_env->create_loader(
3332 db_env,
3333 transaction,
3334 &loader,
3335 NULL, // no src_db needed
3336 curr_num_DBs,
3337 share->key_file,
3338 mult_put_flags,
3339 mult_dbt_flags,
3340 loader_flags
3341 );
3342 if (error) {
3343 assert_always(loader == NULL);
3344 goto exit_try_table_lock;
3345 }
3346
3347 lc.thd = thd;
3348 lc.ha = this;
3349
3350 error = loader->set_poll_function(
3351 loader, ha_tokudb::bulk_insert_poll, &lc);
3352 assert_always(!error);
3353
3354 error = loader->set_error_callback(
3355 loader, ha_tokudb::loader_dup, &lc);
3356 assert_always(!error);
3357
3358 trx->stmt_progress.using_loader = true;
3359 }
3360 }
3361 exit_try_table_lock:
3362 share->lock();
3363 share->try_table_lock = false;
3364 share->unlock();
3365 }
3366 TOKUDB_HANDLER_DBUG_VOID_RETURN;
3367}
3368int ha_tokudb::bulk_insert_poll(void* extra, float progress) {
3369 LOADER_CONTEXT context = (LOADER_CONTEXT)extra;
3370 if (thd_killed(context->thd)) {
3371 sprintf(context->write_status_msg,
3372 "The process has been killed, aborting bulk load.");
3373 return ER_ABORTING_CONNECTION;
3374 }
3375 float percentage = progress * 100;
3376 sprintf(context->write_status_msg,
            "Loading of data to %s about %.1f%% done",
3378 context->ha->share->full_table_name(),
3379 percentage);
3380 thd_proc_info(context->thd, context->write_status_msg);
3381#ifdef HA_TOKUDB_HAS_THD_PROGRESS
3382 thd_progress_report(context->thd, (unsigned long long)percentage, 100);
3383#endif
3384 return 0;
3385}
3386void ha_tokudb::loader_add_index_err(DB* db,
3387 int i,
3388 int err,
3389 DBT* key,
3390 DBT* val,
3391 void* error_extra) {
3392 LOADER_CONTEXT context = (LOADER_CONTEXT)error_extra;
3393 assert_always(context->ha);
3394 context->ha->set_loader_error(err);
3395}
3396void ha_tokudb::loader_dup(DB* db,
3397 int i,
3398 int err,
3399 DBT* key,
3400 DBT* val,
3401 void* error_extra) {
3402 LOADER_CONTEXT context = (LOADER_CONTEXT)error_extra;
3403 assert_always(context->ha);
3404 context->ha->set_loader_error(err);
3405 if (err == DB_KEYEXIST) {
3406 context->ha->set_dup_value_for_pk(key);
3407 }
3408}
3409
3410//
3411// Method that is called at the end of many calls to insert rows
3412// (ha_tokudb::write_row). If start_bulk_insert is called, then
3413// this is guaranteed to be called.
3414//
3415int ha_tokudb::end_bulk_insert(bool abort) {
3416 TOKUDB_HANDLER_DBUG_ENTER("");
3417 int error = 0;
3418 THD* thd = ha_thd();
3419 tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
3420 bool using_loader = (loader != NULL);
3421 if (ai_metadata_update_required) {
3422 share->lock();
3423 error = update_max_auto_inc(share->status_block, share->last_auto_increment);
3424 share->unlock();
3425 if (error) { goto cleanup; }
3426 }
3427 delay_updating_ai_metadata = false;
3428 ai_metadata_update_required = false;
3429 loader_error = 0;
3430 if (loader) {
3431 if (!abort_loader && !thd_kill_level(thd)) {
3432 DBUG_EXECUTE_IF("tokudb_end_bulk_insert_sleep", {
3433 const char *orig_proc_info = tokudb_thd_get_proc_info(thd);
3434 thd_proc_info(thd, "DBUG sleep");
3435 my_sleep(20000000);
3436 thd_proc_info(thd, orig_proc_info);
3437 });
3438 error = loader->close(loader);
3439 loader = NULL;
3440 if (error) {
3441 if (thd_kill_level(thd)) {
3442 my_error(ER_QUERY_INTERRUPTED, MYF(0));
3443 }
3444 goto cleanup;
3445 }
3446
3447 for (uint i = 0; i < table_share->keys; i++) {
3448 if (table_share->key_info[i].flags & HA_NOSAME) {
3449 bool is_unique;
3450 if (i == primary_key && !share->pk_has_string) {
3451 continue;
3452 }
3453 error = is_index_unique(&is_unique, transaction, share->key_file[i], &table->key_info[i],
3454 DB_PRELOCKED_WRITE);
3455 if (error) goto cleanup;
3456 if (!is_unique) {
3457 error = HA_ERR_FOUND_DUPP_KEY;
3458 last_dup_key = i;
3459 goto cleanup;
3460 }
3461 }
3462 }
3463 }
3464 else {
3465 error = sprintf(write_status_msg, "aborting bulk load");
3466 thd_proc_info(thd, write_status_msg);
3467 loader->abort(loader);
3468 loader = NULL;
3469 share->try_table_lock = true;
3470 }
3471 }
3472
3473cleanup:
3474 if (num_DBs_locked_in_bulk) {
3475 share->_num_DBs_lock.unlock();
3476 }
3477 num_DBs_locked_in_bulk = false;
3478 lock_count = 0;
3479 if (loader) {
3480 error = sprintf(write_status_msg, "aborting bulk load");
3481 thd_proc_info(thd, write_status_msg);
3482 loader->abort(loader);
3483 loader = NULL;
3484 }
3485 abort_loader = false;
3486 memset(&lc, 0, sizeof(lc));
3487 if (error || loader_error) {
3488 my_errno = error ? error : loader_error;
3489 if (using_loader) {
3490 share->try_table_lock = true;
3491 }
3492 }
3493 trx->stmt_progress.using_loader = false;
3494 thd_proc_info(thd, 0);
3495 TOKUDB_HANDLER_DBUG_RETURN(error ? error : loader_error);
3496}
3497
3498int ha_tokudb::end_bulk_insert() {
3499 return end_bulk_insert( false );
3500}
3501
3502int ha_tokudb::is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_info, int lock_flags) {
3503 int error;
3504 DBC* tmp_cursor1 = NULL;
3505 DBC* tmp_cursor2 = NULL;
3506 DBT key1, key2, val, packed_key1, packed_key2;
3507 uint64_t cnt = 0;
3508 char status_msg[MAX_ALIAS_NAME + 200]; //buffer of 200 should be a good upper bound.
3509 THD* thd = ha_thd();
3510 const char *orig_proc_info = tokudb_thd_get_proc_info(thd);
3511 memset(&key1, 0, sizeof(key1));
3512 memset(&key2, 0, sizeof(key2));
3513 memset(&val, 0, sizeof(val));
3514 memset(&packed_key1, 0, sizeof(packed_key1));
3515 memset(&packed_key2, 0, sizeof(packed_key2));
3516 *is_unique = true;
3517
3518 error = db->cursor(db, txn, &tmp_cursor1, DB_SERIALIZABLE);
3519 if (error) { goto cleanup; }
3520
3521 error = db->cursor(db, txn, &tmp_cursor2, DB_SERIALIZABLE);
3522 if (error) { goto cleanup; }
3523
3524 error = tmp_cursor1->c_get(tmp_cursor1, &key1, &val, DB_NEXT + lock_flags);
3525 if (error == DB_NOTFOUND) {
3526 *is_unique = true;
3527 error = 0;
3528 goto cleanup;
3529 }
3530 else if (error) { goto cleanup; }
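    // walk two cursors through the index with tmp_cursor2 kept one row
    // ahead of tmp_cursor1, comparing each adjacent pair of keys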
3531 error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags);
3532 if (error) { goto cleanup; }
3533
3534 error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags);
3535 if (error == DB_NOTFOUND) {
3536 *is_unique = true;
3537 error = 0;
3538 goto cleanup;
3539 }
3540 else if (error) { goto cleanup; }
3541
3542 while (error != DB_NOTFOUND) {
3543 bool has_null1;
3544 bool has_null2;
3545 int cmp;
3546 place_key_into_mysql_buff(key_info, table->record[0], (uchar *) key1.data + 1);
3547 place_key_into_mysql_buff(key_info, table->record[1], (uchar *) key2.data + 1);
3548
3549 create_dbt_key_for_lookup(&packed_key1, key_info, key_buff, table->record[0], &has_null1);
3550 create_dbt_key_for_lookup(&packed_key2, key_info, key_buff2, table->record[1], &has_null2);
3551
3552 if (!has_null1 && !has_null2) {
3553 cmp = tokudb_prefix_cmp_dbt_key(db, &packed_key1, &packed_key2);
3554 if (cmp == 0) {
3555 memcpy(key_buff, key1.data, key1.size);
3556 place_key_into_mysql_buff(key_info, table->record[0], (uchar *) key_buff + 1);
3557 *is_unique = false;
3558 break;
3559 }
3560 }
3561
3562 error = tmp_cursor1->c_get(tmp_cursor1, &key1, &val, DB_NEXT + lock_flags);
3563 if (error) { goto cleanup; }
3564 error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags);
3565 if (error && (error != DB_NOTFOUND)) { goto cleanup; }
3566
3567 cnt++;
3568 if ((cnt % 10000) == 0) {
3569 sprintf(
3570 status_msg,
3571 "Verifying index uniqueness: Checked %llu of %llu rows in key-%s.",
3572 (long long unsigned) cnt,
3573 share->row_count(),
3574 key_info->name.str);
3575 thd_proc_info(thd, status_msg);
3576 if (thd_kill_level(thd)) {
3577 my_error(ER_QUERY_INTERRUPTED, MYF(0));
3578 error = ER_QUERY_INTERRUPTED;
3579 goto cleanup;
3580 }
3581 }
3582 }
3583
3584 error = 0;
3585
3586cleanup:
3587 thd_proc_info(thd, orig_proc_info);
3588 if (tmp_cursor1) {
3589 tmp_cursor1->c_close(tmp_cursor1);
3590 tmp_cursor1 = NULL;
3591 }
3592 if (tmp_cursor2) {
3593 tmp_cursor2->c_close(tmp_cursor2);
3594 tmp_cursor2 = NULL;
3595 }
3596 return error;
3597}
3598
3599int ha_tokudb::is_val_unique(bool* is_unique, const uchar* record, KEY* key_info, uint dict_index, DB_TXN* txn) {
3600 int error = 0;
3601 bool has_null;
3602 DBC* tmp_cursor = NULL;
3603
3604 DBT key; memset((void *)&key, 0, sizeof(key));
3605 create_dbt_key_from_key(&key, key_info, key_buff2, record, &has_null, true, MAX_KEY_LENGTH, COL_NEG_INF);
3606 if (has_null) {
3607 error = 0;
3608 *is_unique = true;
3609 goto cleanup;
3610 }
3611
3612 error = share->key_file[dict_index]->cursor(share->key_file[dict_index], txn, &tmp_cursor, DB_SERIALIZABLE | DB_RMW);
3613 if (error) {
3614 goto cleanup;
3615 } else {
3616 // prelock (key,-inf),(key,+inf) so that the subsequent key lookup does not overlock
3617 uint flags = 0;
3618 DBT key_right; memset(&key_right, 0, sizeof key_right);
3619 create_dbt_key_from_key(&key_right, key_info, key_buff3, record, &has_null, true, MAX_KEY_LENGTH, COL_POS_INF);
3620 error = tmp_cursor->c_set_bounds(tmp_cursor, &key, &key_right, true, DB_NOTFOUND);
3621 if (error == 0) {
3622 flags = DB_PRELOCKED | DB_PRELOCKED_WRITE;
3623 }
3624
3625 // lookup key and check unique prefix
3626 struct smart_dbt_info info;
3627 info.ha = this;
3628 info.buf = NULL;
3629 info.keynr = dict_index;
3630
3631 struct index_read_info ir_info;
3632 ir_info.orig_key = &key;
3633 ir_info.smart_dbt_info = info;
3634
3635 error = tmp_cursor->c_getf_set_range(tmp_cursor, flags, &key, smart_dbt_callback_lookup, &ir_info);
3636 if (error == DB_NOTFOUND) {
3637 *is_unique = true;
3638 error = 0;
3639 goto cleanup;
3640 }
3641 else if (error) {
3642 error = map_to_handler_error(error);
3643 goto cleanup;
3644 }
3645 if (ir_info.cmp) {
3646 *is_unique = true;
3647 }
3648 else {
3649 *is_unique = false;
3650 }
3651 }
3652 error = 0;
3653
3654cleanup:
3655 if (tmp_cursor) {
3656 int r = tmp_cursor->c_close(tmp_cursor);
3657 assert_always(r==0);
3658 tmp_cursor = NULL;
3659 }
3660 return error;
3661}
3662
3663static void maybe_do_unique_checks_delay(THD *thd) {
3664 if (thd->slave_thread) {
3665 uint64_t delay_ms = tokudb::sysvars::rpl_unique_checks_delay(thd);
3666 if (delay_ms)
3667 usleep(delay_ms * 1000);
3668 }
3669}
3670
3671static bool need_read_only(THD *thd) {
3672 return opt_readonly || !tokudb::sysvars::rpl_check_readonly(thd);
3673}
3674
3675static bool do_unique_checks(THD *thd, bool do_rpl_event) {
3676 if (do_rpl_event &&
3677 thd->slave_thread &&
3678 need_read_only(thd) &&
3679 !tokudb::sysvars::rpl_unique_checks(thd)) {
3680 return false;
3681 } else {
3682 return !thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS);
3683 }
3684}
3685
3686int ha_tokudb::do_uniqueness_checks(uchar* record, DB_TXN* txn, THD* thd) {
3687 int error = 0;
3688 //
3689 // first do uniqueness checks
3690 //
3691 if (share->has_unique_keys && do_unique_checks(thd, in_rpl_write_rows)) {
3692 DBUG_EXECUTE_IF("tokudb_crash_if_rpl_does_uniqueness_check",
3693 DBUG_ASSERT(0););
3694 for (uint keynr = 0; keynr < table_share->keys; keynr++) {
3695 bool is_unique_key = (table->key_info[keynr].flags & HA_NOSAME) || (keynr == primary_key);
3696 bool is_unique = false;
            //
            // no uniqueness check is needed for a primary key that has no
            // string columns
            //
3700 if (keynr == primary_key && !share->pk_has_string) {
3701 continue;
3702 }
3703 if (!is_unique_key) {
3704 continue;
3705 }
3706
3707 maybe_do_unique_checks_delay(thd);
3708
3709 //
3710 // if unique key, check uniqueness constraint
3711 // but, we do not need to check it if the key has a null
3712 // and we do not need to check it if unique_checks is off
3713 //
3714 error = is_val_unique(&is_unique, record, &table->key_info[keynr], keynr, txn);
3715 if (error) {
3716 goto cleanup;
3717 }
3718 if (!is_unique) {
3719 error = DB_KEYEXIST;
3720 last_dup_key = keynr;
3721 goto cleanup;
3722 }
3723 }
3724 }
3725cleanup:
3726 return error;
3727}
3728
3729void ha_tokudb::test_row_packing(uchar* record, DBT* pk_key, DBT* pk_val) {
3730 int error;
3731 DBT row, key;
3732 //
3733 // variables for testing key packing, only used in some debug modes
3734 //
3735 uchar* tmp_pk_key_data = NULL;
3736 uchar* tmp_pk_val_data = NULL;
3737 DBT tmp_pk_key;
3738 DBT tmp_pk_val;
3739 bool has_null;
3740 int cmp;
3741
3742 memset(&tmp_pk_key, 0, sizeof(DBT));
3743 memset(&tmp_pk_val, 0, sizeof(DBT));
3744
3745 //
    // used for testing the packing of keys
3747 //
3748 tmp_pk_key_data = (uchar*)tokudb::memory::malloc(pk_key->size, MYF(MY_WME));
3749 assert_always(tmp_pk_key_data);
3750 tmp_pk_val_data = (uchar*)tokudb::memory::malloc(pk_val->size, MYF(MY_WME));
3751 assert_always(tmp_pk_val_data);
3752 memcpy(tmp_pk_key_data, pk_key->data, pk_key->size);
3753 memcpy(tmp_pk_val_data, pk_val->data, pk_val->size);
3754 tmp_pk_key.data = tmp_pk_key_data;
3755 tmp_pk_key.size = pk_key->size;
3756 tmp_pk_val.data = tmp_pk_val_data;
3757 tmp_pk_val.size = pk_val->size;
3758
3759 for (uint keynr = 0; keynr < table_share->keys; keynr++) {
3760 uint32_t tmp_num_bytes = 0;
3761 uchar* row_desc = NULL;
3762 uint32_t desc_size = 0;
3763
3764 if (keynr == primary_key) {
3765 continue;
3766 }
3767
3768 create_dbt_key_from_table(&key, keynr, key_buff2, record, &has_null);
3769
3770 //
3771 // TEST
3772 //
3773 row_desc = (uchar *)share->key_file[keynr]->descriptor->dbt.data;
3774 row_desc += (*(uint32_t *)row_desc);
3775 desc_size = (*(uint32_t *)row_desc) - 4;
3776 row_desc += 4;
3777 tmp_num_bytes = pack_key_from_desc(
3778 key_buff3,
3779 row_desc,
3780 desc_size,
3781 &tmp_pk_key,
3782 &tmp_pk_val
3783 );
3784 assert_always(tmp_num_bytes == key.size);
3785 cmp = memcmp(key_buff3,key_buff2,tmp_num_bytes);
3786 assert_always(cmp == 0);
3787
3788 //
3789 // test key packing of clustering keys
3790 //
3791 if (key_is_clustering(&table->key_info[keynr])) {
3792 error = pack_row(&row, (const uchar *) record, keynr);
3793 assert_always(error == 0);
3794 uchar* tmp_buff = NULL;
3795 tmp_buff = (uchar*)tokudb::memory::malloc(
3796 alloced_rec_buff_length,
3797 MYF(MY_WME));
3798 assert_always(tmp_buff);
3799 row_desc = (uchar *)share->key_file[keynr]->descriptor->dbt.data;
3800 row_desc += (*(uint32_t *)row_desc);
3801 row_desc += (*(uint32_t *)row_desc);
3802 desc_size = (*(uint32_t *)row_desc) - 4;
3803 row_desc += 4;
3804 tmp_num_bytes = pack_clustering_val_from_desc(
3805 tmp_buff,
3806 row_desc,
3807 desc_size,
3808 &tmp_pk_val
3809 );
3810 assert_always(tmp_num_bytes == row.size);
3811 cmp = memcmp(tmp_buff,rec_buff,tmp_num_bytes);
3812 assert_always(cmp == 0);
3813 tokudb::memory::free(tmp_buff);
3814 }
3815 }
3816
3817 //
3818 // copy stuff back out
3819 //
3820 error = pack_row(pk_val, (const uchar *) record, primary_key);
3821 assert_always(pk_val->size == tmp_pk_val.size);
3822 cmp = memcmp(pk_val->data, tmp_pk_val_data, pk_val->size);
3823 assert_always( cmp == 0);
3824
3825 tokudb::memory::free(tmp_pk_key_data);
3826 tokudb::memory::free(tmp_pk_val_data);
3827}
3828
3829// set the put flags for the main dictionary
3830void ha_tokudb::set_main_dict_put_flags(THD* thd, bool opt_eligible, uint32_t* put_flags) {
3831 uint32_t old_prelock_flags = 0;
3832 uint curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
3833 bool in_hot_index = share->num_DBs > curr_num_DBs;
3834 bool using_ignore_flag_opt = do_ignore_flag_optimization(thd, table, share->replace_into_fast && !using_ignore_no_key);
3835 //
    // optimization for the "REPLACE INTO..." (and "INSERT IGNORE") commands:
    // if the command is "REPLACE INTO" and the only dictionary
    // is the main one (or all indexes are a subset of the pk),
    // then we can simply insert the element
    // with DB_YESOVERWRITE. If the element does not exist,
    // it will act as a normal insert, and if it does exist, it
    // will act as a replace, which is exactly what REPLACE INTO is supposed
    // to do. We cannot do this otherwise, because then we would lose
    // consistency between the indexes
3845 //
3846 if (hidden_primary_key)
3847 {
3848 *put_flags = old_prelock_flags;
3849 }
3850 else if (!do_unique_checks(thd, in_rpl_write_rows | in_rpl_update_rows) && !is_replace_into(thd) && !is_insert_ignore(thd))
3851 {
3852 *put_flags = old_prelock_flags;
3853 }
3854 else if (using_ignore_flag_opt && is_replace_into(thd)
3855 && !in_hot_index)
3856 {
3857 *put_flags = old_prelock_flags;
3858 }
3859 else if (opt_eligible && using_ignore_flag_opt && is_insert_ignore(thd)
3860 && !in_hot_index)
3861 {
3862 *put_flags = DB_NOOVERWRITE_NO_ERROR | old_prelock_flags;
3863 }
3864 else
3865 {
3866 *put_flags = DB_NOOVERWRITE | old_prelock_flags;
3867 }
3868}
3869
3870int ha_tokudb::insert_row_to_main_dictionary(uchar* record, DBT* pk_key, DBT* pk_val, DB_TXN* txn) {
3871 int error = 0;
3872 uint curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
3873 assert_always(curr_num_DBs == 1);
3874
3875 uint32_t put_flags = mult_put_flags[primary_key];
3876 THD *thd = ha_thd();
3877 set_main_dict_put_flags(thd, true, &put_flags);
3878
3879 // for test, make unique checks have a very long duration
3880 if ((put_flags & DB_OPFLAGS_MASK) == DB_NOOVERWRITE)
3881 maybe_do_unique_checks_delay(thd);
3882
3883 error = share->file->put(share->file, txn, pk_key, pk_val, put_flags);
3884 if (error) {
3885 last_dup_key = primary_key;
3886 goto cleanup;
3887 }
3888
3889cleanup:
3890 return error;
3891}
3892
3893int ha_tokudb::insert_rows_to_dictionaries_mult(DBT* pk_key, DBT* pk_val, DB_TXN* txn, THD* thd) {
3894 int error = 0;
3895 uint curr_num_DBs = share->num_DBs;
3896 set_main_dict_put_flags(thd, true, &mult_put_flags[primary_key]);
3897 uint32_t flags = mult_put_flags[primary_key];
3898
3899 // for test, make unique checks have a very long duration
3900 if ((flags & DB_OPFLAGS_MASK) == DB_NOOVERWRITE)
3901 maybe_do_unique_checks_delay(thd);
3902
3903 // the insert ignore optimization uses DB_NOOVERWRITE_NO_ERROR,
3904 // which is not allowed with env->put_multiple.
3905 // we have to insert the rows one by one in this case.
3906 if (flags & DB_NOOVERWRITE_NO_ERROR) {
3907 DB * src_db = share->key_file[primary_key];
3908 for (uint32_t i = 0; i < curr_num_DBs; i++) {
3909 DB * db = share->key_file[i];
3910 if (i == primary_key) {
3911 // if it's the primary key, insert the rows
3912 // as they are.
3913 error = db->put(db, txn, pk_key, pk_val, flags);
3914 } else {
3915 // generate a row for secondary keys.
3916 // use our multi put key/rec buffers
3917 // just as the ydb layer would have in
3918 // env->put_multiple(), except that
3919 // we will just do a put() right away.
3920 error = tokudb_generate_row(db, src_db,
3921 &mult_key_dbt_array[i].dbts[0], &mult_rec_dbt_array[i].dbts[0],
3922 pk_key, pk_val);
3923 if (error != 0) {
3924 goto out;
3925 }
3926 error = db->put(db, txn, &mult_key_dbt_array[i].dbts[0],
3927 &mult_rec_dbt_array[i].dbts[0], flags);
3928 }
3929 if (error != 0) {
3930 goto out;
3931 }
3932 }
3933 } else {
3934 // not insert ignore, so we can use put multiple
3935 error = db_env->put_multiple(
3936 db_env,
3937 share->key_file[primary_key],
3938 txn,
3939 pk_key,
3940 pk_val,
3941 curr_num_DBs,
3942 share->key_file,
3943 mult_key_dbt_array,
3944 mult_rec_dbt_array,
3945 mult_put_flags
3946 );
3947 }
3948
3949out:
3950 //
    // If we hit an error, record the primary key as the duplicate-key
    // candidate; the caller maps DB_KEYEXIST to HA_ERR_FOUND_DUPP_KEY
    // and decides whether duplicate-key errors are ignored.
3953 //
3954 if (error) {
3955 last_dup_key = primary_key;
3956 }
3957 return error;
3958}
3959
3960//
3961// Stores a row in the table, called when handling an INSERT query
3962// Parameters:
3963// [in] record - a row in MySQL format
3964// Returns:
3965// 0 on success
3966// error otherwise
3967//
3968int ha_tokudb::write_row(uchar * record) {
3969 TOKUDB_HANDLER_DBUG_ENTER("%p", record);
3970
3971 DBT row, prim_key;
3972 int error;
3973 THD *thd = ha_thd();
3974 bool has_null;
3975 DB_TXN* sub_trans = NULL;
3976 DB_TXN* txn = NULL;
3977 tokudb_trx_data *trx = NULL;
3978 uint curr_num_DBs;
3979 bool create_sub_trans = false;
3980 bool num_DBs_locked = false;
3981
3982 //
    // work that MySQL does not properly abstract away from the storage
    // engine, namely filling in the auto increment and setting the auto
    // timestamp
3985 //
3986#if MYSQL_VERSION_ID < 50600
3987 if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_INSERT) {
3988 table->timestamp_field->set_time();
3989 }
3990#endif
3991 if (table->next_number_field && record == table->record[0]) {
3992 error = update_auto_increment();
3993 if (error)
3994 goto cleanup;
3995 }
3996
3997 //
    // check to see if the value being used for the auto increment column is
    // bigger than anything seen so far. If so, update the metadata to
    // reflect it. The goal is to never hit a duplicate key error caused by
    // a stale increment of the auto inc field.
4002 //
4003 if (share->has_auto_inc && record == table->record[0]) {
4004 share->lock();
4005 ulonglong curr_auto_inc = retrieve_auto_increment(
4006 table->field[share->ai_field_index]->key_type(),
4007 field_offset(table->field[share->ai_field_index], table),
4008 record);
4009 if (curr_auto_inc > share->last_auto_increment) {
4010 share->last_auto_increment = curr_auto_inc;
4011 if (delay_updating_ai_metadata) {
4012 ai_metadata_update_required = true;
4013 } else {
4014 update_max_auto_inc(
4015 share->status_block,
4016 share->last_auto_increment);
4017 }
4018 }
4019 share->unlock();
4020 }
4021
4022 //
    // grab reader lock on _num_DBs_lock
4024 //
4025 if (!num_DBs_locked_in_bulk) {
4026 rwlock_t_lock_read(share->_num_DBs_lock);
4027 num_DBs_locked = true;
4028 } else {
4029 lock_count++;
4030 if (lock_count >= 2000) {
4031 share->_num_DBs_lock.unlock();
4032 rwlock_t_lock_read(share->_num_DBs_lock);
4033 lock_count = 0;
4034 }
4035 }
4036 curr_num_DBs = share->num_DBs;
4037
4038 if (hidden_primary_key) {
4039 get_auto_primary_key(current_ident);
4040 }
4041
4042 if (table_share->blob_fields) {
4043 if (fix_rec_buff_for_blob(max_row_length(record))) {
4044 error = HA_ERR_OUT_OF_MEM;
4045 goto cleanup;
4046 }
4047 }
4048
4049 create_dbt_key_from_table(&prim_key, primary_key, primary_key_buff, record, &has_null);
4050 if ((error = pack_row(&row, (const uchar *) record, primary_key))){
4051 goto cleanup;
4052 }
4053
4054 create_sub_trans = (using_ignore && !(do_ignore_flag_optimization(thd,table,share->replace_into_fast && !using_ignore_no_key)));
4055 if (create_sub_trans) {
4056 error = txn_begin(db_env, transaction, &sub_trans, DB_INHERIT_ISOLATION, thd);
4057 if (error) {
4058 goto cleanup;
4059 }
4060 }
4061 txn = create_sub_trans ? sub_trans : transaction;
4062 TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_TXN, "txn %p", txn);
4063 if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_CHECK_KEY))) {
4064 test_row_packing(record,&prim_key,&row);
4065 }
4066 if (loader) {
4067 error = loader->put(loader, &prim_key, &row);
4068 if (error) {
4069 abort_loader = true;
4070 goto cleanup;
4071 }
4072 } else {
4073 error = do_uniqueness_checks(record, txn, thd);
4074 if (error) {
4075 // for #4633
4076 // if we have a duplicate key error, let's check the primary key to see
4077 // if there is a duplicate there. If so, set last_dup_key to the pk
4078 if (error == DB_KEYEXIST && !tokudb_test(hidden_primary_key) && last_dup_key != primary_key) {
4079 int r = share->file->getf_set(share->file, txn, DB_SERIALIZABLE, &prim_key, smart_dbt_do_nothing, NULL);
4080 if (r == 0) {
4081 // if we get no error, that means the row
4082 // was found and this is a duplicate key,
4083 // so we set last_dup_key
4084 last_dup_key = primary_key;
4085 } else if (r != DB_NOTFOUND) {
4086 // if some other error is returned, return that to the user.
4087 error = r;
4088 }
4089 }
4090 goto cleanup;
4091 }
4092 if (curr_num_DBs == 1) {
4093 error = insert_row_to_main_dictionary(record, &prim_key, &row, txn);
4094 if (error) { goto cleanup; }
4095 } else {
4096 error = insert_rows_to_dictionaries_mult(&prim_key, &row, txn, thd);
4097 if (error) { goto cleanup; }
4098 }
4099 if (error == 0) {
4100 uint64_t full_row_size = prim_key.size + row.size;
4101 toku_hton_update_primary_key_bytes_inserted(full_row_size);
4102 }
4103 }
4104
4105 trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
4106 if (!error) {
4107 added_rows++;
4108 trx->stmt_progress.inserted++;
4109 track_progress(thd);
4110 }
4111cleanup:
4112 if (num_DBs_locked) {
4113 share->_num_DBs_lock.unlock();
4114 }
4115 if (error == DB_KEYEXIST) {
4116 error = HA_ERR_FOUND_DUPP_KEY;
4117 }
4118 if (sub_trans) {
4119 // no point in recording error value of abort.
4120 // nothing we can do about it anyway and it is not what
4121 // we want to return.
4122 if (error) {
4123 abort_txn(sub_trans);
4124 }
4125 else {
4126 commit_txn(sub_trans, DB_TXN_NOSYNC);
4127 }
4128 }
4129 TOKUDB_HANDLER_DBUG_RETURN(error);
4130}
4131
4132/* Compare if a key in a row has changed */
4133bool ha_tokudb::key_changed(uint keynr, const uchar * old_row, const uchar * new_row) {
4134 DBT old_key;
4135 DBT new_key;
4136 memset((void *) &old_key, 0, sizeof(old_key));
4137 memset((void *) &new_key, 0, sizeof(new_key));
4138
4139 bool has_null;
4140 create_dbt_key_from_table(&new_key, keynr, key_buff2, new_row, &has_null);
4141 create_dbt_key_for_lookup(&old_key,&table->key_info[keynr], key_buff3, old_row, &has_null);
4142 return tokudb_prefix_cmp_dbt_key(share->key_file[keynr], &old_key, &new_key);
4143}
4144
4145//
4146// Updates a row in the table, called when handling an UPDATE query
4147// Parameters:
4148// [in] old_row - row to be updated, in MySQL format
4149// [in] new_row - new row, in MySQL format
4150// Returns:
4151// 0 on success
4152// error otherwise
4153//
4154int ha_tokudb::update_row(const uchar * old_row, const uchar * new_row) {
4155 TOKUDB_HANDLER_DBUG_ENTER("");
4156 DBT prim_key, old_prim_key, prim_row, old_prim_row;
4157 int UNINIT_VAR(error);
4158 bool has_null;
4159 THD* thd = ha_thd();
4160 DB_TXN* sub_trans = NULL;
4161 DB_TXN* txn = NULL;
4162 tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
4163 uint curr_num_DBs;
4164
4165 memset((void *) &prim_key, 0, sizeof(prim_key));
4166 memset((void *) &old_prim_key, 0, sizeof(old_prim_key));
4167 memset((void *) &prim_row, 0, sizeof(prim_row));
4168 memset((void *) &old_prim_row, 0, sizeof(old_prim_row));
4169
4170#if MYSQL_VERSION_ID < 50600
4171 if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE) {
4172 table->timestamp_field->set_time();
4173 }
4174#endif
4175 //
    // check to see if the value being used for the auto increment column is
    // bigger than anything seen so far. If so, update the metadata to
    // reflect it. The goal is to never hit a duplicate key error caused by
    // a stale increment of the auto inc field.
4180 //
4181 if (share->has_auto_inc && new_row == table->record[0]) {
4182 share->lock();
4183 ulonglong curr_auto_inc = retrieve_auto_increment(
4184 table->field[share->ai_field_index]->key_type(),
4185 field_offset(table->field[share->ai_field_index], table),
4186 new_row
4187 );
4188 if (curr_auto_inc > share->last_auto_increment) {
4189 error = update_max_auto_inc(share->status_block, curr_auto_inc);
4190 if (!error) {
4191 share->last_auto_increment = curr_auto_inc;
4192 }
4193 }
4194 share->unlock();
4195 }
4196
4197 //
    // grab reader lock on _num_DBs_lock
4199 //
4200 bool num_DBs_locked = false;
4201 if (!num_DBs_locked_in_bulk) {
4202 rwlock_t_lock_read(share->_num_DBs_lock);
4203 num_DBs_locked = true;
4204 }
4205 curr_num_DBs = share->num_DBs;
4206
4207 if (using_ignore) {
4208 error = txn_begin(db_env, transaction, &sub_trans, DB_INHERIT_ISOLATION, thd);
4209 if (error) {
4210 goto cleanup;
4211 }
4212 }
4213 txn = using_ignore ? sub_trans : transaction;
4214
4215 if (hidden_primary_key) {
4216 memset((void *) &prim_key, 0, sizeof(prim_key));
4217 prim_key.data = (void *) current_ident;
4218 prim_key.size = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
4219 old_prim_key = prim_key;
4220 }
4221 else {
4222 create_dbt_key_from_table(&prim_key, primary_key, key_buff, new_row, &has_null);
4223 create_dbt_key_from_table(&old_prim_key, primary_key, primary_key_buff, old_row, &has_null);
4224 }
4225
4226 // do uniqueness checks
4227 if (share->has_unique_keys && do_unique_checks(thd, in_rpl_update_rows)) {
4228 for (uint keynr = 0; keynr < table_share->keys; keynr++) {
4229 bool is_unique_key = (table->key_info[keynr].flags & HA_NOSAME) || (keynr == primary_key);
4230 if (keynr == primary_key && !share->pk_has_string) {
4231 continue;
4232 }
4233 if (is_unique_key) {
4234 bool key_ch = key_changed(keynr, old_row, new_row);
4235 if (key_ch) {
4236 bool is_unique;
4237 error = is_val_unique(&is_unique, new_row, &table->key_info[keynr], keynr, txn);
4238 if (error) goto cleanup;
4239 if (!is_unique) {
4240 error = DB_KEYEXIST;
4241 last_dup_key = keynr;
4242 goto cleanup;
4243 }
4244 }
4245 }
4246 }
4247 }
4248
4249 if (table_share->blob_fields) {
4250 if (fix_rec_buff_for_blob(max_row_length(new_row))) {
4251 error = HA_ERR_OUT_OF_MEM;
4252 goto cleanup;
4253 }
4254 if (fix_rec_update_buff_for_blob(max_row_length(old_row))) {
4255 error = HA_ERR_OUT_OF_MEM;
4256 goto cleanup;
4257 }
4258 }
4259
4260 error = pack_row(&prim_row, new_row, primary_key);
4261 if (error) { goto cleanup; }
4262
4263 error = pack_old_row_for_update(&old_prim_row, old_row, primary_key);
4264 if (error) { goto cleanup; }
4265
4266 set_main_dict_put_flags(thd, false, &mult_put_flags[primary_key]);
4267
4268 // for test, make unique checks have a very long duration
4269 if ((mult_put_flags[primary_key] & DB_OPFLAGS_MASK) == DB_NOOVERWRITE)
4270 maybe_do_unique_checks_delay(thd);
4271
4272 error = db_env->update_multiple(
4273 db_env,
4274 share->key_file[primary_key],
4275 txn,
4276 &old_prim_key,
4277 &old_prim_row,
4278 &prim_key,
4279 &prim_row,
4280 curr_num_DBs,
4281 share->key_file,
4282 mult_put_flags,
4283 2*curr_num_DBs,
4284 mult_key_dbt_array,
4285 curr_num_DBs,
4286 mult_rec_dbt_array
4287 );
4288
4289 if (error == DB_KEYEXIST) {
4290 last_dup_key = primary_key;
4291 }
4292 else if (!error) {
4293 updated_rows++;
4294 trx->stmt_progress.updated++;
4295 track_progress(thd);
4296 }
4297
4298
4299cleanup:
4300 if (num_DBs_locked) {
4301 share->_num_DBs_lock.unlock();
4302 }
4303 if (error == DB_KEYEXIST) {
4304 error = HA_ERR_FOUND_DUPP_KEY;
4305 }
4306 if (sub_trans) {
4307 // no point in recording error value of abort.
4308 // nothing we can do about it anyway and it is not what
4309 // we want to return.
4310 if (error) {
4311 abort_txn(sub_trans);
4312 }
4313 else {
4314 commit_txn(sub_trans, DB_TXN_NOSYNC);
4315 }
4316 }
4317 TOKUDB_HANDLER_DBUG_RETURN(error);
4318}
4319
4320//
4321// Deletes a row in the table, called when handling a DELETE query
4322// Parameters:
4323// [in] record - row to be deleted, in MySQL format
4324// Returns:
4325// 0 on success
4326// error otherwise
4327//
4328int ha_tokudb::delete_row(const uchar * record) {
4329 TOKUDB_HANDLER_DBUG_ENTER("");
4330 int error = ENOSYS;
4331 DBT row, prim_key;
4332 bool has_null;
4333 THD* thd = ha_thd();
4334 uint curr_num_DBs;
4335 tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
4336
4337 //
    // grab reader lock on _num_DBs_lock
4339 //
4340 bool num_DBs_locked = false;
4341 if (!num_DBs_locked_in_bulk) {
4342 rwlock_t_lock_read(share->_num_DBs_lock);
4343 num_DBs_locked = true;
4344 }
4345 curr_num_DBs = share->num_DBs;
4346
4347 create_dbt_key_from_table(&prim_key, primary_key, key_buff, record, &has_null);
4348 if (table_share->blob_fields) {
4349 if (fix_rec_buff_for_blob(max_row_length(record))) {
4350 error = HA_ERR_OUT_OF_MEM;
4351 goto cleanup;
4352 }
4353 }
4354 if ((error = pack_row(&row, (const uchar *) record, primary_key))){
4355 goto cleanup;
4356 }
4357
4358 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
4359 TOKUDB_DEBUG_TXN,
4360 "all %p stmt %p sub_sp_level %p transaction %p",
4361 trx->all,
4362 trx->stmt,
4363 trx->sub_sp_level,
4364 transaction);
4365
4366 error =
4367 db_env->del_multiple(
4368 db_env,
4369 share->key_file[primary_key],
4370 transaction,
4371 &prim_key,
4372 &row,
4373 curr_num_DBs,
4374 share->key_file,
4375 mult_key_dbt_array,
4376 mult_del_flags);
4377
4378 if (error) {
4379 DBUG_PRINT("error", ("Got error %d", error));
4380 } else {
4381 deleted_rows++;
4382 trx->stmt_progress.deleted++;
4383 track_progress(thd);
4384 }
4385cleanup:
4386 if (num_DBs_locked) {
4387 share->_num_DBs_lock.unlock();
4388 }
4389 TOKUDB_HANDLER_DBUG_RETURN(error);
4390}
4391
4392//
4393// takes as input table->read_set and table->write_set
4394// and puts list of field indexes that need to be read in
4395// unpack_row in the member variables fixed_cols_for_query
4396// and var_cols_for_query
4397//
4398void ha_tokudb::set_query_columns(uint keynr) {
4399 uint32_t curr_fixed_col_index = 0;
4400 uint32_t curr_var_col_index = 0;
4401 read_key = false;
4402 read_blobs = false;
4403 //
    // key_index chooses which key filter bitmap to consult: for the primary
    // key or a full-table read (MAX_KEY) we use the primary key's filter; a
    // clustering key stores the whole row, so we use that key's own filter;
    // any other secondary key falls back to the primary key's filter.
4405 //
4406 uint key_index = 0;
4407
4408 if (keynr == primary_key || keynr == MAX_KEY) {
4409 key_index = primary_key;
4410 }
4411 else {
4412 key_index = (key_is_clustering(&table->key_info[keynr]) ? keynr : primary_key);
4413 }
4414 for (uint i = 0; i < table_share->fields; i++) {
4415 if (bitmap_is_set(table->read_set,i) ||
4416 bitmap_is_set(table->write_set,i)
4417 )
4418 {
4419 if (bitmap_is_set(&share->kc_info.key_filters[key_index],i)) {
4420 read_key = true;
4421 }
4422 else {
4423 //
4424 // if fixed field length
4425 //
4426 if (is_fixed_field(&share->kc_info, i)) {
4427 //
4428 // save the offset into the list
4429 //
4430 fixed_cols_for_query[curr_fixed_col_index] = i;
4431 curr_fixed_col_index++;
4432 }
4433 //
4434 // varchar or varbinary
4435 //
4436 else if (is_variable_field(&share->kc_info, i)) {
4437 var_cols_for_query[curr_var_col_index] = i;
4438 curr_var_col_index++;
4439 }
4440 //
4441 // it is a blob
4442 //
4443 else {
4444 read_blobs = true;
4445 }
4446 }
4447 }
4448 }
4449 num_fixed_cols_for_query = curr_fixed_col_index;
4450 num_var_cols_for_query = curr_var_col_index;
4451}
4452
4453void ha_tokudb::column_bitmaps_signal() {
4454 //
4455 // if we have max number of indexes, then MAX_KEY == primary_key
4456 //
4457 if (tokudb_active_index != MAX_KEY || tokudb_active_index == primary_key) {
4458 set_query_columns(tokudb_active_index);
4459 }
4460}
4461
4462//
// Notification that a scan of the entire active index is about
// to take place. Pre-acquires a read lock on the whole range.
4465// Returns:
4466// 0 on success
4467// error otherwise
4468//
4469int ha_tokudb::prepare_index_scan() {
4470 TOKUDB_HANDLER_DBUG_ENTER("");
4471 int error = 0;
4472 HANDLE_INVALID_CURSOR();
4473 error = prelock_range(NULL, NULL);
4474 if (error) { last_cursor_error = error; goto cleanup; }
4475
4476 range_lock_grabbed = true;
4477 error = 0;
4478cleanup:
4479 TOKUDB_HANDLER_DBUG_RETURN(error);
4480}
4481
4482static bool index_key_is_null(
4483 TABLE* table,
4484 uint keynr,
4485 const uchar* key,
4486 uint key_len) {
4487
4488 bool key_can_be_null = false;
4489 KEY* key_info = &table->key_info[keynr];
4490 KEY_PART_INFO* key_part = key_info->key_part;
4491 KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
4492 for (; key_part != end; key_part++) {
4493 if (key_part->null_bit) {
4494 key_can_be_null = true;
4495 break;
4496 }
4497 }
4498 return key_can_be_null && key_len > 0 && key[0] != 0;
4499}
4500
4501// Return true if bulk fetch can be used
4502static bool tokudb_do_bulk_fetch(THD *thd) {
4503 switch (thd_sql_command(thd)) {
4504 case SQLCOM_SELECT:
4505 case SQLCOM_CREATE_TABLE:
4506 case SQLCOM_INSERT_SELECT:
4507 case SQLCOM_REPLACE_SELECT:
4508 case SQLCOM_DELETE:
4509 return tokudb::sysvars::bulk_fetch(thd) != 0;
4510 default:
4511 return false;
4512 }
4513}
4514
4515//
// Notification that a range query retrieving all elements that equal a key
// is about to take place. Pre-acquires a read lock on that range.
4518// Returns:
4519// 0 on success
4520// error otherwise
4521//
4522int ha_tokudb::prepare_index_key_scan(const uchar * key, uint key_len) {
4523 TOKUDB_HANDLER_DBUG_ENTER("%p %u", key, key_len);
4524 int error = 0;
4525 DBT start_key, end_key;
4526 THD* thd = ha_thd();
4527 HANDLE_INVALID_CURSOR();
4528 pack_key(&start_key, tokudb_active_index, prelocked_left_range, key, key_len, COL_NEG_INF);
4529 prelocked_left_range_size = start_key.size;
4530 pack_key(&end_key, tokudb_active_index, prelocked_right_range, key, key_len, COL_POS_INF);
4531 prelocked_right_range_size = end_key.size;
4532
4533 error = cursor->c_set_bounds(
4534 cursor,
4535 &start_key,
4536 &end_key,
4537 true,
4538 (cursor_flags & DB_SERIALIZABLE) != 0 ? DB_NOTFOUND : 0
4539 );
4540
4541 if (error){
4542 goto cleanup;
4543 }
4544
4545 range_lock_grabbed = true;
4546 range_lock_grabbed_null = index_key_is_null(table, tokudb_active_index, key, key_len);
4547 doing_bulk_fetch = tokudb_do_bulk_fetch(thd);
4548 bulk_fetch_iteration = 0;
4549 rows_fetched_using_bulk_fetch = 0;
4550 error = 0;
4551cleanup:
4552 if (error) {
4553 error = map_to_handler_error(error);
4554 last_cursor_error = error;
4555 //
4556 // cursor should be initialized here, but in case it is not,
4557 // we still check
4558 //
4559 if (cursor) {
4560 int r = cursor->c_close(cursor);
4561 assert_always(r==0);
4562 cursor = NULL;
4563 remove_from_trx_handler_list();
4564 }
4565 }
4566 TOKUDB_HANDLER_DBUG_RETURN(error);
4567}
4568
4569void ha_tokudb::invalidate_bulk_fetch() {
4570 bytes_used_in_range_query_buff= 0;
4571 curr_range_query_buff_offset = 0;
4572 icp_went_out_of_range = false;
4573}
4574
4575void ha_tokudb::invalidate_icp() {
4576 toku_pushed_idx_cond = NULL;
4577 toku_pushed_idx_cond_keyno = MAX_KEY;
4578 icp_went_out_of_range = false;
4579}
4580
4581//
4582// Initializes local cursor on DB with index keynr
4583// Parameters:
4584// keynr - key (index) number
4585// sorted - 1 if result MUST be sorted according to index
4586// Returns:
4587// 0 on success
4588// error otherwise
4589//
4590int ha_tokudb::index_init(uint keynr, bool sorted) {
4591 TOKUDB_HANDLER_DBUG_ENTER("%d %u txn %p", keynr, sorted, transaction);
4592
4593 int error;
4594 THD* thd = ha_thd();
4595 DBUG_PRINT("enter", ("table: '%s' key: %d", table_share->table_name.str, keynr));
4596
4597 /*
4598 Under some very rare conditions (like full joins) we may already have
4599 an active cursor at this point
4600 */
4601 if (cursor) {
4602 DBUG_PRINT("note", ("Closing active cursor"));
4603 int r = cursor->c_close(cursor);
4604 assert_always(r==0);
4605 remove_from_trx_handler_list();
4606 }
4607 active_index = keynr;
4608
4609 if (active_index < MAX_KEY) {
4610 DBUG_ASSERT(keynr <= table->s->keys);
4611 } else {
4612 DBUG_ASSERT(active_index == MAX_KEY);
4613 keynr = primary_key;
4614 }
4615 tokudb_active_index = keynr;
4616
4617#if TOKU_CLUSTERING_IS_COVERING
4618 if (keynr < table->s->keys && table->key_info[keynr].option_struct->clustering)
4619 key_read = false;
4620#endif
4621
4622 last_cursor_error = 0;
4623 range_lock_grabbed = false;
4624 range_lock_grabbed_null = false;
4625 DBUG_ASSERT(share->key_file[keynr]);
4626 cursor_flags = get_cursor_isolation_flags(lock.type, thd);
4627 if (use_write_locks) {
4628 cursor_flags |= DB_RMW;
4629 }
4630 if (tokudb::sysvars::disable_prefetching(thd)) {
4631 cursor_flags |= DBC_DISABLE_PREFETCHING;
4632 }
4633 if (lock.type == TL_READ_WITH_SHARED_LOCKS) {
4634 cursor_flags |= DB_LOCKING_READ;
4635 }
4636 if ((error = share->key_file[keynr]->cursor(share->key_file[keynr],
4637 transaction, &cursor,
4638 cursor_flags))) {
4639 if (error == TOKUDB_MVCC_DICTIONARY_TOO_NEW) {
4640 error = HA_ERR_TABLE_DEF_CHANGED;
4641 my_error(ER_TABLE_DEF_CHANGED, MYF(0));
4642 }
4643 if (error == DB_LOCK_NOTGRANTED) {
4644 error = HA_ERR_LOCK_WAIT_TIMEOUT;
4645 my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0));
4646 }
4647 table->status = STATUS_NOT_FOUND;
4648 error = map_to_handler_error(error);
4649 last_cursor_error = error;
4650 cursor = NULL; // Safety
4651 goto exit;
4652 }
4653 cursor->c_set_check_interrupt_callback(cursor, tokudb_killed_thd_callback, thd);
4654 memset((void *) &last_key, 0, sizeof(last_key));
4655
4656 add_to_trx_handler_list();
4657
4658 if (thd_sql_command(thd) == SQLCOM_SELECT) {
4659 set_query_columns(keynr);
4660 unpack_entire_row = false;
4661 }
4662 else {
4663 unpack_entire_row = true;
4664 }
4665 invalidate_bulk_fetch();
4666 doing_bulk_fetch = false;
4667 maybe_index_scan = false;
4668 error = 0;
4669exit:
4670 TOKUDB_HANDLER_DBUG_RETURN(error);
4671}
4672
4673//
4674// closes the local cursor
4675//
4676int ha_tokudb::index_end() {
4677 TOKUDB_HANDLER_DBUG_ENTER("");
4678 range_lock_grabbed = false;
4679 range_lock_grabbed_null = false;
4680 if (cursor) {
4681 DBUG_PRINT("enter", ("table: '%s'", table_share->table_name.str));
4682 int r = cursor->c_close(cursor);
4683 assert_always(r==0);
4684 cursor = NULL;
4685 remove_from_trx_handler_list();
4686 last_cursor_error = 0;
4687 }
4688 active_index = tokudb_active_index = MAX_KEY;
4689
4690 //
4691 // reset query variables
4692 //
4693 unpack_entire_row = true;
4694 read_blobs = true;
4695 read_key = true;
4696 num_fixed_cols_for_query = 0;
4697 num_var_cols_for_query = 0;
4698
4699 invalidate_bulk_fetch();
4700 invalidate_icp();
4701 doing_bulk_fetch = false;
4702 close_dsmrr();
4703
4704 TOKUDB_HANDLER_DBUG_RETURN(0);
4705}
4706
4707
4708int ha_tokudb::handle_cursor_error(int error, int err_to_return, uint keynr) {
4709 TOKUDB_HANDLER_DBUG_ENTER("");
4710 if (error) {
4711 error = map_to_handler_error(error);
4712 last_cursor_error = error;
4713 table->status = STATUS_NOT_FOUND;
4714 if (error == DB_NOTFOUND) {
4715 error = err_to_return;
4716 }
4717 }
4718 TOKUDB_HANDLER_DBUG_RETURN(error);
4719}
4720
4721
4722//
// Helper function for read_row and the smart_dbt_callback_xxx functions.
// When using a hidden primary key, upon reading a row we set the
// current_ident field to the primary key that was retrieved.
4727//
4728void ha_tokudb::extract_hidden_primary_key(uint keynr, DBT const *found_key) {
4729 //
4730 // extract hidden primary key to current_ident
4731 //
4732 if (hidden_primary_key) {
4733 if (keynr == primary_key) {
4734 memcpy(current_ident, (char *) found_key->data, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
4735 }
4736 //
4737 // if secondary key, hidden primary key is at end of found_key
4738 //
4739 else {
4740 memcpy(
4741 current_ident,
4742 (char *) found_key->data + found_key->size - TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH,
4743 TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH
4744 );
4745 }
4746 }
4747}
4748
4749
4750int ha_tokudb::read_row_callback (uchar * buf, uint keynr, DBT const *row, DBT const *found_key) {
4751 assert_always(keynr == primary_key);
4752 return unpack_row(buf, row,found_key, keynr);
4753}
4754
4755//
4756// Reads the contents of row and found_key, DBT's retrieved from the DB associated to keynr, into buf
4757// This function assumes that we are using a covering index, as a result, if keynr is the primary key,
4758// we do not read row into buf
4759// Parameters:
4760// [out] buf - buffer for the row, in MySQL format
4761// keynr - index into key_file that represents DB we are currently operating on.
4762// [in] row - the row that has been read from the preceding DB call
4763// [in] found_key - key used to retrieve the row
4764//
4765void ha_tokudb::read_key_only(uchar * buf, uint keynr, DBT const *found_key) {
4766 TOKUDB_HANDLER_DBUG_ENTER("");
4767 table->status = 0;
4768 //
4769 // only case when we do not unpack the key is if we are dealing with the main dictionary
4770 // of a table with a hidden primary key
4771 //
4772 if (!(hidden_primary_key && keynr == primary_key)) {
4773 unpack_key(buf, found_key, keynr);
4774 }
4775 TOKUDB_HANDLER_DBUG_VOID_RETURN;
4776}
4777
4778//
// Helper function used to try to retrieve the entire row.
// If keynr is associated with the main table, reads the contents of found_key and row into buf;
// otherwise, makes a copy of the primary key and saves it to last_key, which can later be used
// to retrieve the entire row.
4782// Parameters:
4783// [out] buf - buffer for the row, in MySQL format
4784// keynr - index into key_file that represents DB we are currently operating on.
4785// [in] row - the row that has been read from the preceding DB call
4786// [in] found_key - key used to retrieve the row
4787//
4788int ha_tokudb::read_primary_key(uchar * buf, uint keynr, DBT const *row, DBT const *found_key) {
4789 TOKUDB_HANDLER_DBUG_ENTER("");
4790 int error = 0;
4791 table->status = 0;
4792 //
4793 // case where we read from secondary table that is not clustered
4794 //
4795 if (keynr != primary_key && !key_is_clustering(&table->key_info[keynr])) {
4796 bool has_null;
4797 //
4798 // create a DBT that has the same data as row, this is inefficient
4799 // extract_hidden_primary_key MUST have been called before this
4800 //
4801 memset((void *) &last_key, 0, sizeof(last_key));
4802 if (!hidden_primary_key) {
4803 unpack_key(buf, found_key, keynr);
4804 }
4805 create_dbt_key_from_table(
4806 &last_key,
4807 primary_key,
4808 key_buff,
4809 buf,
4810 &has_null
4811 );
4812 }
4813 //
4814 // else read from clustered/primary key
4815 //
4816 else {
4817 error = unpack_row(buf, row, found_key, keynr);
4818 if (error) { goto exit; }
4819 }
4820 if (found_key) { DBUG_DUMP("read row key", (uchar *) found_key->data, found_key->size); }
4821 error = 0;
4822exit:
4823 TOKUDB_HANDLER_DBUG_RETURN(error);
4824}
4825
4826//
4827// This function reads an entire row into buf. This function also assumes that
4828// the key needed to retrieve the row is stored in the member variable last_key
4829// Parameters:
4830// [out] buf - buffer for the row, in MySQL format
4831// Returns:
4832// 0 on success, error otherwise
4833//
4834int ha_tokudb::read_full_row(uchar * buf) {
4835 TOKUDB_HANDLER_DBUG_ENTER("");
4836 int error = 0;
4837 struct smart_dbt_info info;
4838 info.ha = this;
4839 info.buf = buf;
4840 info.keynr = primary_key;
4841 //
4842 // assumes key is stored in this->last_key
4843 //
4844
4845 error = share->file->getf_set(
4846 share->file,
4847 transaction,
4848 cursor_flags,
4849 &last_key,
4850 smart_dbt_callback_rowread_ptquery,
4851 &info
4852 );
4853
4854 if (error) {
4855 if (error == DB_LOCK_NOTGRANTED) {
4856 error = HA_ERR_LOCK_WAIT_TIMEOUT;
4857 }
4858 table->status = STATUS_NOT_FOUND;
4859 TOKUDB_HANDLER_DBUG_RETURN(error == DB_NOTFOUND ? HA_ERR_CRASHED : error);
4860 }
4861
4862 TOKUDB_HANDLER_DBUG_RETURN(error);
4863}
4864
4865
4866//
4867// Reads the next row matching to the key, on success, advances cursor
4868// Parameters:
4869// [out] buf - buffer for the next row, in MySQL format
4870// [in] key - key value
4871// keylen - length of key
4872// Returns:
4873// 0 on success
4874// HA_ERR_END_OF_FILE if not found
4875// error otherwise
4876//
4877int ha_tokudb::index_next_same(uchar* buf, const uchar* key, uint keylen) {
4878 TOKUDB_HANDLER_DBUG_ENTER("");
4879
4880 DBT curr_key;
4881 DBT found_key;
4882 bool has_null;
4883 int cmp;
4884 // create the key that will be used to compare with what is found
4885 // in order to figure out if we should return an error
4886 pack_key(&curr_key, tokudb_active_index, key_buff2, key, keylen, COL_ZERO);
4887 int error = get_next(buf, 1, &curr_key, key_read);
4888 if (error) {
4889 goto cleanup;
4890 }
4891 //
4892 // now do the comparison
4893 //
4894 create_dbt_key_from_table(
4895 &found_key,
4896 tokudb_active_index,
4897 key_buff3,buf,
4898 &has_null);
4899 cmp =
4900 tokudb_prefix_cmp_dbt_key(
4901 share->key_file[tokudb_active_index],
4902 &curr_key,
4903 &found_key);
4904 if (cmp) {
4905 error = HA_ERR_END_OF_FILE;
4906 }
4907
4908cleanup:
4909 error = handle_cursor_error(error, HA_ERR_END_OF_FILE, tokudb_active_index);
4910 TOKUDB_HANDLER_DBUG_RETURN(error);
4911}
4912
4913
4914//
4915// According to InnoDB handlerton: Positions an index cursor to the index
4916// specified in keynr. Fetches the row if any
4917// Parameters:
4918// [out] buf - buffer for the returned row
4919// [in] key - key value, according to InnoDB, if NULL,
4920// position cursor at start or end of index,
4921// not sure if this is done now
4922// key_len - length of key
4923// find_flag - according to InnoDB, search flags from my_base.h
4924// Returns:
4925// 0 on success
4926// HA_ERR_KEY_NOT_FOUND if not found (per InnoDB),
4927// we seem to return HA_ERR_END_OF_FILE if find_flag != HA_READ_KEY_EXACT
4928// TODO: investigate this for correctness
4929// error otherwise
4930//
4931int ha_tokudb::index_read(
4932 uchar* buf,
4933 const uchar* key,
4934 uint key_len,
4935 enum ha_rkey_function find_flag) {
4936
4937 TOKUDB_HANDLER_DBUG_ENTER(
4938 "key %p %u:%2.2x find=%u",
4939 key,
4940 key_len,
4941 key ? key[0] : 0,
4942 find_flag);
4943 invalidate_bulk_fetch();
4944 if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_INDEX_KEY))) {
4945 TOKUDB_DBUG_DUMP("mysql key=", key, key_len);
4946 }
4947 DBT row;
4948 DBT lookup_key;
4949 int error = 0;
4950 uint32_t flags = 0;
4951 THD* thd = ha_thd();
4952 tokudb_trx_data* trx = (tokudb_trx_data*)thd_get_ha_data(thd, tokudb_hton);
4953 struct smart_dbt_info info;
4954 struct index_read_info ir_info;
4955
4956 HANDLE_INVALID_CURSOR();
4957
4958 // if we locked a non-null key range and we now have a null key, then
4959 // remove the bounds from the cursor
4960 if (range_lock_grabbed &&
4961 !range_lock_grabbed_null &&
4962 index_key_is_null(table, tokudb_active_index, key, key_len)) {
4963 range_lock_grabbed = range_lock_grabbed_null = false;
4964 cursor->c_remove_restriction(cursor);
4965 }
4966
4967 memset((void *) &row, 0, sizeof(row));
4968
4969 info.ha = this;
4970 info.buf = buf;
4971 info.keynr = tokudb_active_index;
4972
4973 ir_info.smart_dbt_info = info;
4974 ir_info.cmp = 0;
4975
4976 flags = SET_PRELOCK_FLAG(0);
4977 switch (find_flag) {
4978 case HA_READ_KEY_EXACT: /* Find first record else error */ {
4979 pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
4980 DBT lookup_bound;
4981 pack_key(&lookup_bound, tokudb_active_index, key_buff4, key, key_len, COL_POS_INF);
4982 if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_INDEX_KEY))) {
4983 TOKUDB_DBUG_DUMP("tokudb key=", lookup_key.data, lookup_key.size);
4984 }
4985 ir_info.orig_key = &lookup_key;
4986 error = cursor->c_getf_set_range_with_bound(cursor, flags, &lookup_key, &lookup_bound, SMART_DBT_IR_CALLBACK(key_read), &ir_info);
4987 if (ir_info.cmp) {
4988 error = DB_NOTFOUND;
4989 }
4990 break;
4991 }
4992 case HA_READ_AFTER_KEY: /* Find next rec. after key-record */
4993 pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_POS_INF);
4994 error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
4995 break;
4996 case HA_READ_BEFORE_KEY: /* Find next rec. before key-record */
4997 pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
4998 error = cursor->c_getf_set_range_reverse(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
4999 break;
5000 case HA_READ_KEY_OR_NEXT: /* Record or next record */
5001 pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
5002 error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
5003 break;
5004 //
5005 // This case does not seem to ever be used, it is ok for it to be slow
5006 //
5007 case HA_READ_KEY_OR_PREV: /* Record or previous */
5008 pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
5009 ir_info.orig_key = &lookup_key;
5010 error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_IR_CALLBACK(key_read), &ir_info);
5011 if (error == DB_NOTFOUND) {
5012 error = cursor->c_getf_last(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
5013 }
5014 else if (ir_info.cmp) {
5015 error = cursor->c_getf_prev(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
5016 }
5017 break;
5018 case HA_READ_PREFIX_LAST_OR_PREV: /* Last or prev key with the same prefix */
5019 pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_POS_INF);
5020 error = cursor->c_getf_set_range_reverse(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
5021 break;
5022 case HA_READ_PREFIX_LAST:
5023 pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_POS_INF);
5024 ir_info.orig_key = &lookup_key;
5025 error = cursor->c_getf_set_range_reverse(cursor, flags, &lookup_key, SMART_DBT_IR_CALLBACK(key_read), &ir_info);
5026 if (ir_info.cmp) {
5027 error = DB_NOTFOUND;
5028 }
5029 break;
5030 default:
5031 TOKUDB_HANDLER_TRACE("unsupported:%d", find_flag);
5032 error = HA_ERR_UNSUPPORTED;
5033 break;
5034 }
5035 error = handle_cursor_error(error,HA_ERR_KEY_NOT_FOUND,tokudb_active_index);
5036 if (!error && !key_read && tokudb_active_index != primary_key && !key_is_clustering(&table->key_info[tokudb_active_index])) {
5037 error = read_full_row(buf);
5038 }
5039
5040 if (TOKUDB_UNLIKELY(error && TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_ERROR))) {
5041 TOKUDB_HANDLER_TRACE("error:%d:%d", error, find_flag);
5042 }
5043 trx->stmt_progress.queried++;
5044 track_progress(thd);
5045
5046cleanup:
5047 TOKUDB_HANDLER_DBUG_RETURN(error);
5048}
5049
5050
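// Layout of one entry in range_query_buff, as consumed below and produced by
// fill_range_query_buf() (a sketch derived from the code, not a separate
// specification):
//   [ uint32_t key_size ][ key bytes ]
// followed, when the val is needed, by either
//   [ uint32_t val_size ][ packed row bytes ]          (unpack_entire_row)
// or only the columns the query touches:
//   [ null bytes ][ fixed fields ][ uint32_t len, data ... per var field ]
//   [ uint32_t blob_size ][ blob data ]                (when blobs are read)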
5051int ha_tokudb::read_data_from_range_query_buff(uchar* buf, bool need_val, bool do_key_read) {
5052 // buffer has the next row, get it from there
5053 int error;
5054 uchar* curr_pos = range_query_buff+curr_range_query_buff_offset;
5055 DBT curr_key;
5056 memset((void *) &curr_key, 0, sizeof(curr_key));
5057
5058 // get key info
5059 uint32_t key_size = *(uint32_t *)curr_pos;
5060 curr_pos += sizeof(key_size);
5061 uchar* curr_key_buff = curr_pos;
5062 curr_pos += key_size;
5063
5064 curr_key.data = curr_key_buff;
5065 curr_key.size = key_size;
5066
5067 // if this is a covering index, this is all we need
5068 if (do_key_read) {
5069 assert_always(!need_val);
5070 extract_hidden_primary_key(tokudb_active_index, &curr_key);
5071 read_key_only(buf, tokudb_active_index, &curr_key);
5072 error = 0;
5073 }
5074 // we need to get more data
5075 else {
5076 DBT curr_val;
5077 memset((void *) &curr_val, 0, sizeof(curr_val));
5078 uchar* curr_val_buff = NULL;
5079 uint32_t val_size = 0;
5080 // in this case, we don't have a val, we are simply extracting the pk
5081 if (!need_val) {
5082 curr_val.data = curr_val_buff;
5083 curr_val.size = val_size;
5084 extract_hidden_primary_key(tokudb_active_index, &curr_key);
5085 error = read_primary_key( buf, tokudb_active_index, &curr_val, &curr_key);
5086 }
5087 else {
5088 extract_hidden_primary_key(tokudb_active_index, &curr_key);
5089 // need to extract a val and place it into buf
5090 if (unpack_entire_row) {
5091 // get val info
5092 val_size = *(uint32_t *)curr_pos;
5093 curr_pos += sizeof(val_size);
5094 curr_val_buff = curr_pos;
5095 curr_pos += val_size;
5096 curr_val.data = curr_val_buff;
5097 curr_val.size = val_size;
5098 error = unpack_row(buf,&curr_val, &curr_key, tokudb_active_index);
5099 }
5100 else {
5101 if (!(hidden_primary_key && tokudb_active_index == primary_key)) {
5102 unpack_key(buf,&curr_key,tokudb_active_index);
5103 }
                // read the columns we care about
5105
                // first the null bytes
5107 memcpy(buf, curr_pos, table_share->null_bytes);
5108 curr_pos += table_share->null_bytes;
5109
                // now the fixed sized fields
5111 for (uint32_t i = 0; i < num_fixed_cols_for_query; i++) {
5112 uint field_index = fixed_cols_for_query[i];
5113 Field* field = table->field[field_index];
5114 unpack_fixed_field(
5115 buf + field_offset(field, table),
5116 curr_pos,
5117 share->kc_info.field_lengths[field_index]
5118 );
5119 curr_pos += share->kc_info.field_lengths[field_index];
5120 }
                // now the variable sized fields
5122 for (uint32_t i = 0; i < num_var_cols_for_query; i++) {
5123 uint field_index = var_cols_for_query[i];
5124 Field* field = table->field[field_index];
5125 uint32_t field_len = *(uint32_t *)curr_pos;
5126 curr_pos += sizeof(field_len);
5127 unpack_var_field(
5128 buf + field_offset(field, table),
5129 curr_pos,
5130 field_len,
5131 share->kc_info.length_bytes[field_index]
5132 );
5133 curr_pos += field_len;
5134 }
5135 // now the blobs
5136 if (read_blobs) {
5137 uint32_t blob_size = *(uint32_t *)curr_pos;
5138 curr_pos += sizeof(blob_size);
5139 error = unpack_blobs(
5140 buf,
5141 curr_pos,
5142 blob_size,
5143 true
5144 );
5145 curr_pos += blob_size;
5146 if (error) {
5147 invalidate_bulk_fetch();
5148 goto exit;
5149 }
5150 }
5151 error = 0;
5152 }
5153 }
5154 }
5155
5156 curr_range_query_buff_offset = curr_pos - range_query_buff;
5157exit:
5158 return error;
5159}
5160
5161static int smart_dbt_bf_callback(
5162 DBT const* key,
5163 DBT const* row,
5164 void* context) {
5165 SMART_DBT_BF_INFO info = (SMART_DBT_BF_INFO)context;
5166 return
5167 info->ha->fill_range_query_buf(
5168 info->need_val,
5169 key,
5170 row,
5171 info->direction,
5172 info->thd,
5173 info->buf,
5174 info->key_to_compare);
5175}
5176
5177enum icp_result ha_tokudb::toku_handler_index_cond_check(
5178 Item* pushed_idx_cond) {
5179
5180 enum icp_result res;
5181 if (end_range) {
5182 int cmp;
5183#ifdef MARIADB_BASE_VERSION
5184 cmp = compare_key2(end_range);
5185#else
5186 cmp = compare_key_icp(end_range);
5187#endif
5188 if (cmp > 0) {
5189 return ICP_OUT_OF_RANGE;
5190 }
5191 }
5192 res = pushed_idx_cond->val_int() ? ICP_MATCH : ICP_NO_MATCH;
5193 return res;
5194}
5195
5196// fill in the range query buf for bulk fetch
5197int ha_tokudb::fill_range_query_buf(
5198 bool need_val,
5199 DBT const* key,
5200 DBT const* row,
5201 int direction,
5202 THD* thd,
5203 uchar* buf,
5204 DBT* key_to_compare) {
5205
5206 int error;
5207 //
5208 // first put the value into range_query_buf
5209 //
5210 uint32_t size_remaining =
5211 size_range_query_buff - bytes_used_in_range_query_buff;
5212 uint32_t size_needed;
5213 uint32_t user_defined_size = tokudb::sysvars::read_buf_size(thd);
5214 uchar* curr_pos = NULL;
5215
5216 if (key_to_compare) {
5217 int cmp = tokudb_prefix_cmp_dbt_key(
5218 share->key_file[tokudb_active_index],
5219 key_to_compare,
5220 key);
5221 if (cmp) {
5222 icp_went_out_of_range = true;
5223 error = 0;
5224 goto cleanup;
5225 }
5226 }
5227
5228 // if we have an index condition pushed down, we check it
5229 if (toku_pushed_idx_cond &&
5230 (tokudb_active_index == toku_pushed_idx_cond_keyno)) {
5231 unpack_key(buf, key, tokudb_active_index);
5232 enum icp_result result =
5233 toku_handler_index_cond_check(toku_pushed_idx_cond);
5234
5235 // If we have reason to stop, we set icp_went_out_of_range and get out
5236 // otherwise, if we simply see that the current key is no match,
5237 // we tell the cursor to continue and don't store
5238 // the key locally
5239 if (result == ICP_OUT_OF_RANGE || thd_kill_level(thd)) {
5240 icp_went_out_of_range = true;
5241 error = 0;
5242 DEBUG_SYNC(ha_thd(), "tokudb_icp_asc_scan_out_of_range");
5243 goto cleanup;
5244 } else if (result == ICP_NO_MATCH) {
            // An optimizer change made for MyRocks also benefits us here in
            // TokuDB: opt_range.cc QUICK_SELECT::get_next now sets end_range
            // during descending scans. We should never hit this condition,
            // but the code is left in to stop a descending scan from running
            // to the beginning of the index, and the debug assertion catches
            // any such case in debug builds.
5251 assert_debug(!(!end_range && direction < 0));
5252 if (!end_range &&
5253 direction < 0) {
5254 cancel_pushed_idx_cond();
5255 }
5256 error = TOKUDB_CURSOR_CONTINUE;
5257 goto cleanup;
5258 }
5259 }
5260
5261 // at this point, if ICP is on, we have verified that the key is one
5262 // we are interested in, so we proceed with placing the data
5263 // into the range query buffer
5264
5265 if (need_val) {
5266 if (unpack_entire_row) {
5267 size_needed = 2*sizeof(uint32_t) + key->size + row->size;
5268 } else {
5269 // this is an upper bound
5270 size_needed =
5271 // size of key length
5272 sizeof(uint32_t) +
5273 // key and row
5274 key->size + row->size +
5275 // lengths of varchars stored
5276 num_var_cols_for_query * (sizeof(uint32_t)) +
5277 // length of blobs
5278 sizeof(uint32_t);
5279 }
5280 } else {
5281 size_needed = sizeof(uint32_t) + key->size;
5282 }
5283 if (size_remaining < size_needed) {
5284 range_query_buff =
5285 static_cast<uchar*>(tokudb::memory::realloc(
5286 static_cast<void*>(range_query_buff),
5287 bytes_used_in_range_query_buff + size_needed,
5288 MYF(MY_WME)));
5289 if (range_query_buff == NULL) {
5290 error = ENOMEM;
5291 invalidate_bulk_fetch();
5292 goto cleanup;
5293 }
5294 size_range_query_buff = bytes_used_in_range_query_buff + size_needed;
5295 }
5296 //
5297 // now we know we have the size, let's fill the buffer, starting with the key
5298 //
5299 curr_pos = range_query_buff + bytes_used_in_range_query_buff;
5300
5301 *reinterpret_cast<uint32_t*>(curr_pos) = key->size;
5302 curr_pos += sizeof(uint32_t);
5303 memcpy(curr_pos, key->data, key->size);
5304 curr_pos += key->size;
5305 if (need_val) {
5306 if (unpack_entire_row) {
5307 *reinterpret_cast<uint32_t*>(curr_pos) = row->size;
5308 curr_pos += sizeof(uint32_t);
5309 memcpy(curr_pos, row->data, row->size);
5310 curr_pos += row->size;
5311 } else {
5312 // need to unpack just the data we care about
5313 const uchar* fixed_field_ptr = static_cast<const uchar*>(row->data);
5314 fixed_field_ptr += table_share->null_bytes;
5315
5316 const uchar* var_field_offset_ptr = NULL;
5317 const uchar* var_field_data_ptr = NULL;
5318
5319 var_field_offset_ptr =
5320 fixed_field_ptr +
5321 share->kc_info.mcp_info[tokudb_active_index].fixed_field_size;
5322 var_field_data_ptr =
5323 var_field_offset_ptr +
5324 share->kc_info.mcp_info[tokudb_active_index].len_of_offsets;
5325
5326 // first the null bytes
5327 memcpy(curr_pos, row->data, table_share->null_bytes);
5328 curr_pos += table_share->null_bytes;
            //
            // now the fixed fields
            //
5333 for (uint32_t i = 0; i < num_fixed_cols_for_query; i++) {
5334 uint field_index = fixed_cols_for_query[i];
5335 memcpy(
5336 curr_pos,
5337 fixed_field_ptr + share->kc_info.cp_info[tokudb_active_index][field_index].col_pack_val,
5338 share->kc_info.field_lengths[field_index]);
5339 curr_pos += share->kc_info.field_lengths[field_index];
5340 }
5341
5342 //
5343 // now the var fields
5344 //
5345 for (uint32_t i = 0; i < num_var_cols_for_query; i++) {
5346 uint field_index = var_cols_for_query[i];
5347 uint32_t var_field_index =
5348 share->kc_info.cp_info[tokudb_active_index][field_index].col_pack_val;
5349 uint32_t data_start_offset;
5350 uint32_t field_len;
5351
5352 get_var_field_info(
5353 &field_len,
5354 &data_start_offset,
5355 var_field_index,
5356 var_field_offset_ptr,
5357 share->kc_info.num_offset_bytes);
5358 memcpy(curr_pos, &field_len, sizeof(field_len));
5359 curr_pos += sizeof(field_len);
5360 memcpy(
5361 curr_pos,
5362 var_field_data_ptr + data_start_offset,
5363 field_len);
5364 curr_pos += field_len;
5365 }
5366
5367 if (read_blobs) {
5368 uint32_t blob_offset = 0;
5369 uint32_t data_size = 0;
5370 //
5371 // now the blobs
5372 //
5373 get_blob_field_info(
5374 &blob_offset,
5375 share->kc_info.mcp_info[tokudb_active_index].len_of_offsets,
5376 var_field_data_ptr,
5377 share->kc_info.num_offset_bytes);
5378 data_size =
5379 row->size -
5380 blob_offset -
5381 static_cast<uint32_t>((var_field_data_ptr -
5382 static_cast<const uchar*>(row->data)));
5383 memcpy(curr_pos, &data_size, sizeof(data_size));
5384 curr_pos += sizeof(data_size);
5385 memcpy(curr_pos, var_field_data_ptr + blob_offset, data_size);
5386 curr_pos += data_size;
5387 }
5388 }
5389 }
5390
5391 bytes_used_in_range_query_buff = curr_pos - range_query_buff;
5392 assert_always(bytes_used_in_range_query_buff <= size_range_query_buff);
5393
5394 //
5395 // now determine if we should continue with the bulk fetch
5396 // we want to stop under these conditions:
5397 // - we overran the prelocked range
5398 // - we are close to the end of the buffer
5399 // - we have fetched an exponential amount of rows with
5400 // respect to the bulk fetch iteration, which is initialized
5401 // to 0 in index_init() and prelock_range().
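    //
    // For example (illustrative): iteration 0 allows up to 1 row per cursor
    // call, iteration 1 up to 2, iteration 2 up to 4, doubling each time;
    // once bulk_fetch_iteration reaches HA_TOKU_BULK_FETCH_ITERATION_MAX,
    // only the buffer size limits a fetch.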
5402
5403 rows_fetched_using_bulk_fetch++;
    // if the iteration count is less than the number of possible shifts on
    // a 64 bit integer, check that we haven't exceeded this iteration's
    // row fetch upper bound.
5407 if (bulk_fetch_iteration < HA_TOKU_BULK_FETCH_ITERATION_MAX) {
5408 uint64_t row_fetch_upper_bound = 1LLU << bulk_fetch_iteration;
5409 assert_always(row_fetch_upper_bound > 0);
5410 if (rows_fetched_using_bulk_fetch >= row_fetch_upper_bound) {
5411 error = 0;
5412 goto cleanup;
5413 }
5414 }
5415
5416 if (bytes_used_in_range_query_buff +
5417 table_share->rec_buff_length >
5418 user_defined_size) {
5419 error = 0;
5420 goto cleanup;
5421 }
5422 if (direction > 0) {
5423 // compare what we got to the right endpoint of prelocked range
5424 // because we are searching keys in ascending order
5425 if (prelocked_right_range_size == 0) {
5426 error = TOKUDB_CURSOR_CONTINUE;
5427 goto cleanup;
5428 }
5429 DBT right_range;
5430 memset(&right_range, 0, sizeof(right_range));
5431 right_range.size = prelocked_right_range_size;
5432 right_range.data = prelocked_right_range;
5433 int cmp = tokudb_cmp_dbt_key(
5434 share->key_file[tokudb_active_index],
5435 key,
5436 &right_range);
5437 error = (cmp > 0) ? 0 : TOKUDB_CURSOR_CONTINUE;
5438 } else {
5439 // compare what we got to the left endpoint of prelocked range
5440 // because we are searching keys in descending order
5441 if (prelocked_left_range_size == 0) {
5442 error = TOKUDB_CURSOR_CONTINUE;
5443 goto cleanup;
5444 }
5445 DBT left_range;
5446 memset(&left_range, 0, sizeof(left_range));
5447 left_range.size = prelocked_left_range_size;
5448 left_range.data = prelocked_left_range;
5449 int cmp = tokudb_cmp_dbt_key(
5450 share->key_file[tokudb_active_index],
5451 key,
5452 &left_range);
5453 error = (cmp < 0) ? 0 : TOKUDB_CURSOR_CONTINUE;
5454 }
5455cleanup:
5456 return error;
5457}
5458
5459int ha_tokudb::get_next(
5460 uchar* buf,
5461 int direction,
5462 DBT* key_to_compare,
5463 bool do_key_read) {
5464
5465 int error = 0;
5466 HANDLE_INVALID_CURSOR();
5467
5468 if (maybe_index_scan) {
5469 maybe_index_scan = false;
5470 if (!range_lock_grabbed) {
5471 error = prepare_index_scan();
5472 }
5473 }
5474
5475 if (!error) {
5476 uint32_t flags = SET_PRELOCK_FLAG(0);
5477
        // we need to read the val of what we retrieve if
        // we do NOT have a covering index AND we are on the primary key
        // or a clustering secondary key
5481 bool need_val =
5482 (do_key_read == 0) &&
5483 (tokudb_active_index == primary_key ||
5484 key_is_clustering(&table->key_info[tokudb_active_index]));
5485
5486 if ((bytes_used_in_range_query_buff -
5487 curr_range_query_buff_offset) > 0) {
5488 error = read_data_from_range_query_buff(buf, need_val, do_key_read);
5489 } else if (icp_went_out_of_range) {
5490 icp_went_out_of_range = false;
5491 error = HA_ERR_END_OF_FILE;
5492 } else {
5493 invalidate_bulk_fetch();
5494 if (doing_bulk_fetch) {
5495 struct smart_dbt_bf_info bf_info;
5496 bf_info.ha = this;
            // need_val (computed above) says whether the val must be read
5498 bf_info.direction = direction;
5499 bf_info.thd = ha_thd();
5500 bf_info.need_val = need_val;
5501 bf_info.buf = buf;
5502 bf_info.key_to_compare = key_to_compare;
5503 //
5504 // call c_getf_next with purpose of filling in range_query_buff
5505 //
5506 rows_fetched_using_bulk_fetch = 0;
5507 // it is expected that we can do ICP in the smart_dbt_bf_callback
5508 // as a result, it's possible we don't return any data because
5509 // none of the rows matched the index condition. Therefore, we need
            // this while loop. icp_went_out_of_range will be set if we hit a
            // row that the index condition states is out of our range. When
            // that happens, the data already in the buffer is the last we
            // will retrieve.
5513 while (bytes_used_in_range_query_buff == 0 &&
5514 !icp_went_out_of_range && error == 0) {
5515 if (direction > 0) {
5516 error =
5517 cursor->c_getf_next(
5518 cursor,
5519 flags,
5520 smart_dbt_bf_callback,
5521 &bf_info);
5522 } else {
5523 error =
5524 cursor->c_getf_prev(
5525 cursor,
5526 flags,
5527 smart_dbt_bf_callback,
5528 &bf_info);
5529 }
5530 }
5531 // if there is no data set and we went out of range,
5532 // then there is nothing to return
5533 if (bytes_used_in_range_query_buff == 0 &&
5534 icp_went_out_of_range) {
5535 icp_went_out_of_range = false;
5536 error = HA_ERR_END_OF_FILE;
5537 }
5538 if (bulk_fetch_iteration < HA_TOKU_BULK_FETCH_ITERATION_MAX) {
5539 bulk_fetch_iteration++;
5540 }
5541
5542 error =
5543 handle_cursor_error(
5544 error,
5545 HA_ERR_END_OF_FILE,
5546 tokudb_active_index);
5547 if (error) {
5548 goto cleanup;
5549 }
5550
5551 //
5552 // now that range_query_buff is filled, read an element
5553 //
5554 error =
5555 read_data_from_range_query_buff(buf, need_val, do_key_read);
5556 } else {
5557 struct smart_dbt_info info;
5558 info.ha = this;
5559 info.buf = buf;
5560 info.keynr = tokudb_active_index;
5561
5562 if (direction > 0) {
5563 error =
5564 cursor->c_getf_next(
5565 cursor,
5566 flags,
5567 SMART_DBT_CALLBACK(do_key_read),
5568 &info);
5569 } else {
5570 error =
5571 cursor->c_getf_prev(
5572 cursor,
5573 flags,
5574 SMART_DBT_CALLBACK(do_key_read),
5575 &info);
5576 }
5577 error =
5578 handle_cursor_error(
5579 error,
5580 HA_ERR_END_OF_FILE,
5581 tokudb_active_index);
5582 }
5583 }
5584 }
5585
5586 //
5587 // at this point, one of two things has happened
5588 // either we have unpacked the data into buf, and we
5589 // are done, or we have unpacked the primary key
5590 // into last_key, and we use the code below to
5591 // read the full row by doing a point query into the
5592 // main table.
5593 //
5594 if (!error &&
5595 !do_key_read &&
5596 (tokudb_active_index != primary_key) &&
5597 !key_is_clustering(&table->key_info[tokudb_active_index])) {
5598 error = read_full_row(buf);
5599 }
5600
5601 if (!error) {
5602 THD *thd = ha_thd();
5603 tokudb_trx_data* trx =
5604 static_cast<tokudb_trx_data*>(thd_get_ha_data(thd, tokudb_hton));
5605 trx->stmt_progress.queried++;
5606 track_progress(thd);
5607 if (thd_kill_level(thd))
5608 error = ER_ABORTING_CONNECTION;
5609 }
5610cleanup:
5611 return error;
5612}
5613
5614
5615//
5616// Reads the next row from the active index (cursor) into buf, and advances cursor
5617// Parameters:
5618// [out] buf - buffer for the next row, in MySQL format
5619// Returns:
5620// 0 on success
5621// HA_ERR_END_OF_FILE if not found
5622// error otherwise
5623//
5624int ha_tokudb::index_next(uchar * buf) {
5625 TOKUDB_HANDLER_DBUG_ENTER("");
5626 int error = get_next(buf, 1, NULL, key_read);
5627 TOKUDB_HANDLER_DBUG_RETURN(error);
5628}
5629
5630
5631int ha_tokudb::index_read_last(uchar * buf, const uchar * key, uint key_len) {
5632 return(index_read(buf, key, key_len, HA_READ_PREFIX_LAST));
5633}
5634
5635
5636//
5637// Reads the previous row from the active index (cursor) into buf, and advances cursor
5638// Parameters:
5639// [out] buf - buffer for the next row, in MySQL format
5640// Returns:
5641// 0 on success
5642// HA_ERR_END_OF_FILE if not found
5643// error otherwise
5644//
5645int ha_tokudb::index_prev(uchar * buf) {
5646 TOKUDB_HANDLER_DBUG_ENTER("");
5647 int error = get_next(buf, -1, NULL, key_read);
5648 TOKUDB_HANDLER_DBUG_RETURN(error);
5649}
5650
5651//
5652// Reads the first row from the active index (cursor) into buf, and advances cursor
5653// Parameters:
5654// [out] buf - buffer for the next row, in MySQL format
5655// Returns:
5656// 0 on success
5657// HA_ERR_END_OF_FILE if not found
5658// error otherwise
5659//
5660int ha_tokudb::index_first(uchar * buf) {
5661 TOKUDB_HANDLER_DBUG_ENTER("");
5662 invalidate_bulk_fetch();
5663 int error = 0;
5664 struct smart_dbt_info info;
5665 uint32_t flags = SET_PRELOCK_FLAG(0);
5666 THD* thd = ha_thd();
    tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
5668 HANDLE_INVALID_CURSOR();
5669
5670 info.ha = this;
5671 info.buf = buf;
5672 info.keynr = tokudb_active_index;
5673
5674 error = cursor->c_getf_first(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
5675 error = handle_cursor_error(error,HA_ERR_END_OF_FILE,tokudb_active_index);
5676
5677 //
5678 // still need to get entire contents of the row if operation done on
5679 // secondary DB and it was NOT a covering index
5680 //
5681 if (!error && !key_read && (tokudb_active_index != primary_key) && !key_is_clustering(&table->key_info[tokudb_active_index])) {
5682 error = read_full_row(buf);
5683 }
5684 if (trx) {
5685 trx->stmt_progress.queried++;
5686 }
5687 track_progress(thd);
5688 maybe_index_scan = true;
5689cleanup:
5690 TOKUDB_HANDLER_DBUG_RETURN(error);
5691}
5692
5693//
5694// Reads the last row from the active index (cursor) into buf, and advances cursor
5695// Parameters:
5696// [out] buf - buffer for the next row, in MySQL format
5697// Returns:
5698// 0 on success
5699// HA_ERR_END_OF_FILE if not found
5700// error otherwise
5701//
5702int ha_tokudb::index_last(uchar * buf) {
5703 TOKUDB_HANDLER_DBUG_ENTER("");
5704 invalidate_bulk_fetch();
5705 int error = 0;
5706 struct smart_dbt_info info;
5707 uint32_t flags = SET_PRELOCK_FLAG(0);
5708 THD* thd = ha_thd();
    tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
5710 HANDLE_INVALID_CURSOR();
5711
5712 info.ha = this;
5713 info.buf = buf;
5714 info.keynr = tokudb_active_index;
5715
5716 error = cursor->c_getf_last(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
5717 error = handle_cursor_error(error,HA_ERR_END_OF_FILE,tokudb_active_index);
5718 //
5719 // still need to get entire contents of the row if operation done on
5720 // secondary DB and it was NOT a covering index
5721 //
5722 if (!error && !key_read && (tokudb_active_index != primary_key) && !key_is_clustering(&table->key_info[tokudb_active_index])) {
5723 error = read_full_row(buf);
5724 }
5725
5726 if (trx) {
5727 trx->stmt_progress.queried++;
5728 }
5729 track_progress(thd);
5730 maybe_index_scan = true;
5731cleanup:
5732 TOKUDB_HANDLER_DBUG_RETURN(error);
5733}
5734
5735//
// Initialize a scan of the table (index_init is called with MAX_KEY, which selects the primary dictionary)
// Parameters:
//          scan - true for a full table scan; triggers range prelocking below
5739// Returns:
5740// 0 on success
5741// error otherwise
5742//
5743int ha_tokudb::rnd_init(bool scan) {
5744 TOKUDB_HANDLER_DBUG_ENTER("");
5745 int error = 0;
5746 range_lock_grabbed = false;
5747 error = index_init(MAX_KEY, 0);
5748 if (error) { goto cleanup;}
5749
5750 if (scan) {
5751 error = prelock_range(NULL, NULL);
5752 if (error) { goto cleanup; }
5753
5754 // only want to set range_lock_grabbed to true after index_init
5755 // successfully executed for two reasons:
5756 // 1) index_init will reset it to false anyway
        // 2) if it fails, we don't want prelocking on
5758 range_lock_grabbed = true;
5759 }
5760
5761 error = 0;
5762cleanup:
5763 if (error) {
5764 index_end();
5765 last_cursor_error = error;
5766 }
5767 TOKUDB_HANDLER_DBUG_RETURN(error);
5768}
5769
5770//
5771// End a scan of the table
5772//
5773int ha_tokudb::rnd_end() {
5774 TOKUDB_HANDLER_DBUG_ENTER("");
5775 range_lock_grabbed = false;
5776 TOKUDB_HANDLER_DBUG_RETURN(index_end());
5777}
5778
5779
5780//
5781// Read the next row in a table scan
5782// Parameters:
5783// [out] buf - buffer for the next row, in MySQL format
5784// Returns:
5785// 0 on success
5786// HA_ERR_END_OF_FILE if not found
5787// error otherwise
5788//
5789int ha_tokudb::rnd_next(uchar * buf) {
5790 TOKUDB_HANDLER_DBUG_ENTER("");
5791 int error = get_next(buf, 1, NULL, false);
5792 TOKUDB_HANDLER_DBUG_RETURN(error);
5793}
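
// Typical call sequence from the SQL layer for a full table scan
// (illustrative of the standard handler API, not TokuDB-specific):
//   rnd_init(true); rnd_next(buf) repeated until HA_ERR_END_OF_FILE; rnd_end();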
5794
5795
5796void ha_tokudb::track_progress(THD* thd) {
5797 tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
5798 if (trx) {
5799 ulonglong num_written = trx->stmt_progress.inserted +
5800 trx->stmt_progress.updated +
5801 trx->stmt_progress.deleted;
5802 bool update_status =
5803 (trx->stmt_progress.queried &&
5804 tokudb::sysvars::read_status_frequency &&
5805 (trx->stmt_progress.queried %
5806 tokudb::sysvars::read_status_frequency) == 0) ||
5807 (num_written && tokudb::sysvars::write_status_frequency &&
5808 (num_written % tokudb::sysvars::write_status_frequency) == 0);
5809 if (update_status) {
5810 char *next_status = write_status_msg;
5811 bool first = true;
5812 int r;
5813 if (trx->stmt_progress.queried) {
5814 r = sprintf(
5815 next_status,
5816 "Queried about %llu row%s",
5817 trx->stmt_progress.queried,
5818 trx->stmt_progress.queried == 1 ? "" : "s");
5819 assert_always(r >= 0);
5820 next_status += r;
5821 first = false;
5822 }
5823 if (trx->stmt_progress.inserted) {
5824 if (trx->stmt_progress.using_loader) {
5825 r = sprintf(
5826 next_status,
5827 "%sFetched about %llu row%s, loading data still remains",
5828 first ? "" : ", ",
5829 trx->stmt_progress.inserted,
5830 trx->stmt_progress.inserted == 1 ? "" : "s");
5831 } else {
5832 r = sprintf(
5833 next_status,
5834 "%sInserted about %llu row%s",
5835 first ? "" : ", ",
5836 trx->stmt_progress.inserted,
5837 trx->stmt_progress.inserted == 1 ? "" : "s");
5838 }
5839 assert_always(r >= 0);
5840 next_status += r;
5841 first = false;
5842 }
5843 if (trx->stmt_progress.updated) {
5844 r = sprintf(
5845 next_status,
5846 "%sUpdated about %llu row%s",
5847 first ? "" : ", ",
5848 trx->stmt_progress.updated,
5849 trx->stmt_progress.updated == 1 ? "" : "s");
5850 assert_always(r >= 0);
5851 next_status += r;
5852 first = false;
5853 }
5854 if (trx->stmt_progress.deleted) {
5855 r = sprintf(
5856 next_status,
5857 "%sDeleted about %llu row%s",
5858 first ? "" : ", ",
5859 trx->stmt_progress.deleted,
5860 trx->stmt_progress.deleted == 1 ? "" : "s");
5861 assert_always(r >= 0);
5862 next_status += r;
5863 first = false;
5864 }
5865 if (!first)
5866 thd_proc_info(thd, write_status_msg);
5867 }
5868 }
5869}
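
// Illustrative output of track_progress() above (an example composed from the
// sprintf fragments, not a fixed format string): a statement that has read
// 1000 rows and inserted one would set the proc info to
//   "Queried about 1000 rows, Inserted about 1 row"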
5870
5871
5872DBT *ha_tokudb::get_pos(DBT * to, uchar * pos) {
5873 TOKUDB_HANDLER_DBUG_ENTER("");
5874 /* We don't need to set app_data here */
5875 memset((void *) to, 0, sizeof(*to));
5876 to->data = pos + sizeof(uint32_t);
5877 to->size = *(uint32_t *)pos;
5878 DBUG_DUMP("key", (const uchar *) to->data, to->size);
5879 DBUG_RETURN(to);
5880}
5881
// Retrieves a row based on the primary key saved in pos
5883// Returns:
5884// 0 on success
5885// HA_ERR_KEY_NOT_FOUND if not found
5886// error otherwise
5887int ha_tokudb::rnd_pos(uchar * buf, uchar * pos) {
5888 TOKUDB_HANDLER_DBUG_ENTER("");
5889 DBT db_pos;
5890 int error = 0;
5891 struct smart_dbt_info info;
5892 bool old_unpack_entire_row = unpack_entire_row;
5893 DBT* key = get_pos(&db_pos, pos);
5894
5895 unpack_entire_row = true;
5896 tokudb_active_index = MAX_KEY;
5897
5898 // test rpl slave by inducing a delay before the point query
5899 THD *thd = ha_thd();
5900 if (thd->slave_thread && (in_rpl_delete_rows || in_rpl_update_rows)) {
5901 DBUG_EXECUTE_IF("tokudb_crash_if_rpl_looks_up_row", DBUG_ASSERT(0););
5902 uint64_t delay_ms = tokudb::sysvars::rpl_lookup_rows_delay(thd);
5903 if (delay_ms)
5904 usleep(delay_ms * 1000);
5905 }
5906
5907 info.ha = this;
5908 info.buf = buf;
5909 info.keynr = primary_key;
5910
5911 error = share->file->getf_set(share->file, transaction,
5912 get_cursor_isolation_flags(lock.type, thd),
5913 key, smart_dbt_callback_rowread_ptquery, &info);
5914
5915 if (error == DB_NOTFOUND) {
5916 error = HA_ERR_KEY_NOT_FOUND;
5917 goto cleanup;
5918 }
5919cleanup:
5920 unpack_entire_row = old_unpack_entire_row;
5921 TOKUDB_HANDLER_DBUG_RETURN(error);
5922}
5923
5924int ha_tokudb::prelock_range(const key_range *start_key, const key_range *end_key) {
5925 TOKUDB_HANDLER_DBUG_ENTER("%p %p", start_key, end_key);
5926 THD* thd = ha_thd();
5927
5928 int error = 0;
5929 DBT start_dbt_key;
5930 DBT end_dbt_key;
5931 uchar* start_key_buff = prelocked_left_range;
5932 uchar* end_key_buff = prelocked_right_range;
5933
5934 memset((void *) &start_dbt_key, 0, sizeof(start_dbt_key));
5935 memset((void *) &end_dbt_key, 0, sizeof(end_dbt_key));
5936
5937 HANDLE_INVALID_CURSOR();
5938 if (start_key) {
5939 switch (start_key->flag) {
5940 case HA_READ_AFTER_KEY:
5941 pack_key(&start_dbt_key, tokudb_active_index, start_key_buff, start_key->key, start_key->length, COL_POS_INF);
5942 break;
5943 default:
5944 pack_key(&start_dbt_key, tokudb_active_index, start_key_buff, start_key->key, start_key->length, COL_NEG_INF);
5945 break;
5946 }
5947 prelocked_left_range_size = start_dbt_key.size;
5948 }
5949 else {
5950 prelocked_left_range_size = 0;
5951 }
5952
5953 if (end_key) {
5954 switch (end_key->flag) {
5955 case HA_READ_BEFORE_KEY:
5956 pack_key(&end_dbt_key, tokudb_active_index, end_key_buff, end_key->key, end_key->length, COL_NEG_INF);
5957 break;
5958 default:
5959 pack_key(&end_dbt_key, tokudb_active_index, end_key_buff, end_key->key, end_key->length, COL_POS_INF);
5960 break;
5961 }
5962 prelocked_right_range_size = end_dbt_key.size;
5963 }
5964 else {
5965 prelocked_right_range_size = 0;
5966 }
5967
5968 error = cursor->c_set_bounds(
5969 cursor,
5970 start_key ? &start_dbt_key : share->key_file[tokudb_active_index]->dbt_neg_infty(),
5971 end_key ? &end_dbt_key : share->key_file[tokudb_active_index]->dbt_pos_infty(),
5972 true,
5973 (cursor_flags & DB_SERIALIZABLE) != 0 ? DB_NOTFOUND : 0
5974 );
5975 if (error) {
5976 error = map_to_handler_error(error);
5977 last_cursor_error = error;
5978 //
5979 // cursor should be initialized here, but in case it is not, we still check
5980 //
5981 if (cursor) {
5982 int r = cursor->c_close(cursor);
5983 assert_always(r==0);
5984 cursor = NULL;
5985 remove_from_trx_handler_list();
5986 }
5987 goto cleanup;
5988 }
5989
5990 // at this point, determine if we will be doing bulk fetch
5991 doing_bulk_fetch = tokudb_do_bulk_fetch(thd);
5992 bulk_fetch_iteration = 0;
5993 rows_fetched_using_bulk_fetch = 0;
5994
5995cleanup:
5996 TOKUDB_HANDLER_DBUG_RETURN(error);
5997}
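
// A note on the bound packing above (illustrative): an exclusive start bound
// (HA_READ_AFTER_KEY) is packed with COL_POS_INF so the acquired range begins
// just past the supplied key bytes; every other start flag packs with
// COL_NEG_INF. End bounds mirror this: HA_READ_BEFORE_KEY packs with
// COL_NEG_INF, everything else with COL_POS_INF.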
5998
5999//
6000// Prelock range if possible, start_key is leftmost, end_key is rightmost
6001// whether scanning forward or backward. This function is called by MySQL
6002// for backward range queries (in QUICK_SELECT_DESC::get_next).
6003// Forward scans use read_range_first()/read_range_next().
6004//
6005int ha_tokudb::prepare_range_scan( const key_range *start_key, const key_range *end_key) {
6006 TOKUDB_HANDLER_DBUG_ENTER("%p %p", start_key, end_key);
6007 int error = prelock_range(start_key, end_key);
6008 if (!error) {
6009 range_lock_grabbed = true;
6010 }
6011 TOKUDB_HANDLER_DBUG_RETURN(error);
6012}
6013
6014int ha_tokudb::read_range_first(
6015 const key_range *start_key,
6016 const key_range *end_key,
6017 bool eq_range,
6018 bool sorted)
6019{
6020 TOKUDB_HANDLER_DBUG_ENTER("%p %p %u %u", start_key, end_key, eq_range, sorted);
6021 int error = prelock_range(start_key, end_key);
6022 if (error) { goto cleanup; }
6023 range_lock_grabbed = true;
6024
6025 error = handler::read_range_first(start_key, end_key, eq_range, sorted);
6026cleanup:
6027 TOKUDB_HANDLER_DBUG_RETURN(error);
6028}
6029
6030int ha_tokudb::read_range_next()
6031{
6032 TOKUDB_HANDLER_DBUG_ENTER("");
6033 int error;
6034 error = handler::read_range_next();
6035 if (error) {
6036 range_lock_grabbed = false;
6037 }
6038 TOKUDB_HANDLER_DBUG_RETURN(error);
6039}
6040
6041
6042
6043/*
6044 Set a reference to the current record in (ref,ref_length).
6045
6046 SYNOPSIS
6047 ha_tokudb::position()
6048 record The current record buffer
6049
6050 DESCRIPTION
6051 The BDB handler stores the primary key in (ref,ref_length).
6052 There is either an explicit primary key, or an implicit (hidden)
6053 primary key.
6054 During open(), 'ref_length' is calculated as the maximum primary
6055 key length. When an actual key is shorter than that, the rest of
    the buffer must be cleared out. The row cannot be identified if
    garbage follows the end of the key. There is no length
6058 field for the current key, so that the whole ref_length is used
6059 for comparison.
6060
6061 RETURN
6062 nothing
6063*/
6064void ha_tokudb::position(const uchar * record) {
6065 TOKUDB_HANDLER_DBUG_ENTER("");
6066 DBT key;
6067 if (hidden_primary_key) {
6068 DBUG_ASSERT(ref_length == (TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH + sizeof(uint32_t)));
6069 memcpy(ref + sizeof(uint32_t), current_ident, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
6070 *(uint32_t *)ref = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
6071 }
6072 else {
6073 bool has_null;
6074 //
6075 // save the data
6076 //
6077 create_dbt_key_from_table(&key, primary_key, ref + sizeof(uint32_t), record, &has_null);
6078 //
6079 // save the size of data in the first four bytes of ref
6080 //
6081 memcpy(ref, &key.size, sizeof(uint32_t));
6082 }
6083 TOKUDB_HANDLER_DBUG_VOID_RETURN;
6084}
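
// Sketch of the ref buffer written above and decoded by get_pos()
// (layout inferred from the two functions, sizes in bytes):
//   ref[0..3]  native uint32_t key length
//   ref[4..]   packed primary key (or the hidden-key ident); the remainder
//              up to ref_length is expected to be cleared so whole-buffer
//              comparisons stay valid (see DESCRIPTION above)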
6085
6086//
6087// Per InnoDB: Returns statistics information of the table to the MySQL interpreter,
6088// in various fields of the handle object.
6089// Return:
6090// 0, always success
6091//
6092int ha_tokudb::info(uint flag) {
6093 TOKUDB_HANDLER_DBUG_ENTER("%d", flag);
6094 int error = 0;
6095#if TOKU_CLUSTERING_IS_COVERING
6096 for (uint i=0; i < table->s->keys; i++)
6097 if (key_is_clustering(&table->key_info[i]))
6098 table->covering_keys.set_bit(i);
6099#endif
6100 DB_TXN* txn = NULL;
6101 if (flag & HA_STATUS_VARIABLE) {
6102 stats.records = share->row_count() + share->rows_from_locked_table;
6103 stats.deleted = 0;
6104 if (!(flag & HA_STATUS_NO_LOCK)) {
6105
6106 error = txn_begin(db_env, NULL, &txn, DB_READ_UNCOMMITTED, ha_thd());
6107 if (error) {
6108 goto cleanup;
6109 }
6110
6111 // we should always have a primary key
6112 assert_always(share->file != NULL);
6113
6114 DB_BTREE_STAT64 dict_stats;
6115 error = share->file->stat64(share->file, txn, &dict_stats);
6116 if (error) {
6117 goto cleanup;
6118 }
6119 share->set_row_count(dict_stats.bt_ndata, false);
6120 stats.records = dict_stats.bt_ndata;
6121 stats.create_time = dict_stats.bt_create_time_sec;
6122 stats.update_time = dict_stats.bt_modify_time_sec;
6123 stats.check_time = dict_stats.bt_verify_time_sec;
6124 stats.data_file_length = dict_stats.bt_dsize;
6125 stats.delete_length = dict_stats.bt_fsize - dict_stats.bt_dsize;
6126 if (hidden_primary_key) {
6127 //
            // in this case, we have a hidden primary key, and we do not
            // want to report space taken up by it to the user
6130 //
6131 uint64_t hpk_space =
6132 TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH * dict_stats.bt_ndata;
6133 stats.data_file_length =
6134 (hpk_space > stats.data_file_length) ?
6135 0 : stats.data_file_length - hpk_space;
6136 } else {
6137 //
6138 // one infinity byte per key needs to be subtracted
6139 //
6140 uint64_t inf_byte_space = dict_stats.bt_ndata;
6141 stats.data_file_length =
6142 (inf_byte_space > stats.data_file_length) ?
6143 0 : stats.data_file_length - inf_byte_space;
6144 }
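            // Worked example of the adjustment above (illustrative, assuming
            // the 8-byte hidden key length): 1M rows with bt_dsize = 64MB and
            // a hidden primary key subtracts hpk_space = 8MB, reporting ~56MB;
            // with an explicit key only ~1MB of per-row infinity bytes is
            // subtracted.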
6145
6146 stats.mean_rec_length =
6147 stats.records ?
6148 (ulong)(stats.data_file_length/stats.records) : 0;
6149 stats.index_file_length = 0;
6150 // curr_num_DBs is the number of keys we have, according
6151 // to the mysql layer. if drop index is running concurrently
6152 // with info() (it can, because info does not take table locks),
6153 // then it could be the case that one of the dbs was dropped
6154 // and set to NULL before mysql was able to set table->s->keys
6155 // accordingly.
6156 //
6157 // we should just ignore any DB * that is NULL.
6158 //
6159 // this solution is much simpler than trying to maintain an
6160 // accurate number of valid keys at the handlerton layer.
6161 uint curr_num_DBs =
6162 table->s->keys + tokudb_test(hidden_primary_key);
6163 for (uint i = 0; i < curr_num_DBs; i++) {
6164 // skip the primary key, skip dropped indexes
6165 if (i == primary_key || share->key_file[i] == NULL) {
6166 continue;
6167 }
6168 error = share->key_file[i]->stat64(
6169 share->key_file[i], txn, &dict_stats);
6170 if (error) {
6171 goto cleanup;
6172 }
6173 stats.index_file_length += dict_stats.bt_dsize;
6174 stats.delete_length +=
6175 dict_stats.bt_fsize - dict_stats.bt_dsize;
6176 }
6177 }
6178
6179 /*
        The following comment and logic have been taken from InnoDB;
        an old hack that always forced stats.records > 0 was removed.
6182 ---
6183 The MySQL optimizer seems to assume in a left join that n_rows
6184 is an accurate estimate if it is zero. Of course, it is not,
6185 since we do not have any locks on the rows yet at this phase.
6186 Since SHOW TABLE STATUS seems to call this function with the
6187 HA_STATUS_TIME flag set, while the left join optimizer does not
6188 set that flag, we add one to a zero value if the flag is not
6189 set. That way SHOW TABLE STATUS will show the best estimate,
6190 while the optimizer never sees the table empty. */
6191 if (stats.records == 0 && !(flag & HA_STATUS_TIME)) {
6192 stats.records++;
6193 }
6194 }
6195 if ((flag & HA_STATUS_CONST)) {
6196 stats.max_data_file_length = 9223372036854775807ULL;
6197 }
6198 if (flag & (HA_STATUS_VARIABLE | HA_STATUS_CONST)) {
6199 share->set_cardinality_counts_in_table(table);
6200 }
6201
6202 /* Don't return key if we got an error for the internal primary key */
6203 if (flag & HA_STATUS_ERRKEY && last_dup_key < table_share->keys) {
6204 errkey = last_dup_key;
6205 }
6206
6207 if (flag & HA_STATUS_AUTO && table->found_next_number_field) {
6208 THD* thd = table->in_use;
6209 struct system_variables* variables = &thd->variables;
6210 stats.auto_increment_value =
6211 share->last_auto_increment + variables->auto_increment_increment;
6212 }
6213 error = 0;
6214cleanup:
6215 if (txn != NULL) {
6216 commit_txn(txn, DB_TXN_NOSYNC);
6217 txn = NULL;
6218 }
6219 TOKUDB_HANDLER_DBUG_RETURN(error);
6220}
6221
6222//
6223// Per InnoDB: Tells something additional to the handler about how to do things.
6224//
6225int ha_tokudb::extra(enum ha_extra_function operation) {
6226 TOKUDB_HANDLER_DBUG_ENTER("%d", operation);
6227 switch (operation) {
6228 case HA_EXTRA_RESET_STATE:
6229 reset();
6230 break;
6231 case HA_EXTRA_KEYREAD:
6232 key_read = true; // Query satisfied with key
6233 break;
6234 case HA_EXTRA_NO_KEYREAD:
6235 key_read = false;
6236 break;
6237 case HA_EXTRA_IGNORE_DUP_KEY:
6238 using_ignore = true;
6239 break;
6240 case HA_EXTRA_NO_IGNORE_DUP_KEY:
6241 using_ignore = false;
6242 break;
6243 case HA_EXTRA_IGNORE_NO_KEY:
6244 using_ignore_no_key = true;
6245 break;
6246 case HA_EXTRA_NO_IGNORE_NO_KEY:
6247 using_ignore_no_key = false;
6248 break;
6249 case HA_EXTRA_NOT_USED:
6250 case HA_EXTRA_PREPARE_FOR_RENAME:
6251 break; // must do nothing and return 0
6252 default:
6253 break;
6254 }
6255 TOKUDB_HANDLER_DBUG_RETURN(0);
6256}
6257
6258int ha_tokudb::reset() {
6259 TOKUDB_HANDLER_DBUG_ENTER("");
6260 key_read = false;
6261 using_ignore = false;
6262 using_ignore_no_key = false;
6263 reset_dsmrr();
6264 invalidate_icp();
6265 TOKUDB_HANDLER_DBUG_RETURN(0);
6266}
6267
6268//
// helper function that iterates through all DBs
// and grabs a lock (either read or write, but not both)
// Parameters:
//      [in]    trans - transaction to be used to pre-acquire the lock
//              lt - type of lock to get, either lock_read or lock_write
//                   (lock_read is currently a no-op; only lock_write pre-acquires)
6274// Returns:
6275// 0 on success
6276// error otherwise
6277//
6278int ha_tokudb::acquire_table_lock (DB_TXN* trans, TABLE_LOCK_TYPE lt) {
6279 TOKUDB_HANDLER_DBUG_ENTER("%p %s", trans, lt == lock_read ? "r" : "w");
6280 int error = ENOSYS;
6281 if (!num_DBs_locked_in_bulk) {
6282 rwlock_t_lock_read(share->_num_DBs_lock);
6283 }
6284 uint curr_num_DBs = share->num_DBs;
6285 if (lt == lock_read) {
6286 error = 0;
6287 goto cleanup;
6288 } else if (lt == lock_write) {
6289 for (uint i = 0; i < curr_num_DBs; i++) {
6290 DB* db = share->key_file[i];
6291 error = db->pre_acquire_table_lock(db, trans);
6292 if (error == EINVAL)
6293 TOKUDB_HANDLER_TRACE("%d db=%p trans=%p", i, db, trans);
6294 if (error) break;
6295 }
6296 TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "error=%d", error);
6297 if (error) goto cleanup;
6298 } else {
6299 error = ENOSYS;
6300 goto cleanup;
6301 }
6302
6303 error = 0;
6304cleanup:
6305 if (!num_DBs_locked_in_bulk) {
6306 share->_num_DBs_lock.unlock();
6307 }
6308 TOKUDB_HANDLER_DBUG_RETURN(error);
6309}
6310
6311int ha_tokudb::create_txn(THD* thd, tokudb_trx_data* trx) {
6312 int error;
6313 ulong tx_isolation = thd_tx_isolation(thd);
6314 HA_TOKU_ISO_LEVEL toku_iso_level = tx_to_toku_iso(tx_isolation);
6315 bool is_autocommit = !thd_test_options(
6316 thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN);
6317
6318 /* First table lock, start transaction */
6319 if (thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN) &&
6320 !trx->all &&
6321 (thd_sql_command(thd) != SQLCOM_CREATE_TABLE) &&
6322 (thd_sql_command(thd) != SQLCOM_DROP_TABLE) &&
6323 (thd_sql_command(thd) != SQLCOM_DROP_INDEX) &&
6324 (thd_sql_command(thd) != SQLCOM_CREATE_INDEX) &&
6325 (thd_sql_command(thd) != SQLCOM_ALTER_TABLE)) {
6326 /* QQQ We have to start a master transaction */
6327 // DBUG_PRINT("trans", ("starting transaction all "));
6328 uint32_t txn_begin_flags = toku_iso_to_txn_flag(toku_iso_level);
6329#if 50614 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699
6330 if (thd_tx_is_read_only(thd)) {
6331 txn_begin_flags |= DB_TXN_READ_ONLY;
6332 }
6333#endif
6334 if ((error = txn_begin(db_env, NULL, &trx->all, txn_begin_flags, thd))) {
6335 goto cleanup;
6336 }
6337 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6338 TOKUDB_DEBUG_TXN,
6339 "created master %p",
6340 trx->all);
6341 trx->sp_level = trx->all;
6342 trans_register_ha(thd, true, tokudb_hton);
6343 }
6344 DBUG_PRINT("trans", ("starting transaction stmt"));
6345 if (trx->stmt) {
6346 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6347 TOKUDB_DEBUG_TXN,
6348 "warning:stmt=%p",
6349 trx->stmt);
6350 }
6351 uint32_t txn_begin_flags;
6352 if (trx->all == NULL) {
6353 txn_begin_flags = toku_iso_to_txn_flag(toku_iso_level);
6354 //
6355 // if the isolation level that the user has set is serializable,
6356 // but autocommit is on and this is just a select,
6357 // then we can go ahead and set the isolation level to
6358 // be a snapshot read, because we can serialize
6359 // the transaction to be the point in time at which the snapshot began.
6360 //
6361 if (txn_begin_flags == 0 && is_autocommit && thd_sql_command(thd) == SQLCOM_SELECT) {
6362 txn_begin_flags = DB_TXN_SNAPSHOT;
6363 }
6364 if (is_autocommit && thd_sql_command(thd) == SQLCOM_SELECT &&
6365 !thd->in_sub_stmt && lock.type <= TL_READ_NO_INSERT &&
6366 !thd->lex->uses_stored_routines()) {
6367 txn_begin_flags |= DB_TXN_READ_ONLY;
6368 }
6369 } else {
6370 txn_begin_flags = DB_INHERIT_ISOLATION;
6371 }
6372 error = txn_begin(db_env, trx->sp_level, &trx->stmt, txn_begin_flags, thd);
6373 if (error) {
6374 /* We leave the possible master transaction open */
6375 goto cleanup;
6376 }
6377 trx->sub_sp_level = trx->stmt;
6378 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6379 TOKUDB_DEBUG_TXN,
6380 "created stmt %p sp_level %p",
6381 trx->sp_level,
6382 trx->stmt);
6383 reset_stmt_progress(&trx->stmt_progress);
6384 trans_register_ha(thd, false, tokudb_hton);
6385cleanup:
6386 return error;
6387}
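
// Illustrative outcome of the flag logic in create_txn(): an autocommit
// SELECT at SERIALIZABLE (txn_begin_flags == 0) is promoted to
// DB_TXN_SNAPSHOT, and DB_TXN_READ_ONLY may additionally be OR'ed in when
// the statement uses no stored routines; statements inside an explicit
// transaction simply inherit isolation via DB_INHERIT_ISOLATION.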
6388
6389static const char *lock_type_str(int lock_type) {
6390 if (lock_type == F_RDLCK) return "F_RDLCK";
6391 if (lock_type == F_WRLCK) return "F_WRLCK";
6392 if (lock_type == F_UNLCK) return "F_UNLCK";
6393 return "?";
6394}
6395
6396/*
6397 As MySQL will execute an external lock for every new table it uses
6398 we can use this to start the transactions.
6399 If we are in auto_commit mode we just need to start a transaction
6400 for the statement to be able to rollback the statement.
6401 If not, we have to start a master transaction if there doesn't exist
6402 one from before.
6403*/
6404//
6405// Parameters:
6406// [in] thd - handle to the user thread
6407// lock_type - the type of lock
6408// Returns:
6409// 0 on success
6410// error otherwise
6411//
6412int ha_tokudb::external_lock(THD * thd, int lock_type) {
6413 TOKUDB_HANDLER_DBUG_ENTER(
6414 "cmd %d lock %d %s %s",
6415 thd_sql_command(thd),
6416 lock_type,
6417 lock_type_str(lock_type),
6418 share->full_table_name());
6419 if (TOKUDB_UNLIKELY(!TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_ENTER) &&
6420 TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_LOCK))) {
6421 TOKUDB_HANDLER_TRACE(
6422 "cmd %d lock %d %s %s",
6423 thd_sql_command(thd),
6424 lock_type,
6425 lock_type_str(lock_type),
6426 share->full_table_name());
6427 }
6428 TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "q %s", thd->query());
6429
6430 int error = 0;
6431 tokudb_trx_data* trx = (tokudb_trx_data*)thd_get_ha_data(thd, tokudb_hton);
6432 if (!trx) {
6433 error = create_tokudb_trx_data_instance(&trx);
6434 if (error) { goto cleanup; }
6435 thd_set_ha_data(thd, tokudb_hton, trx);
6436 }
6437
6438 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6439 TOKUDB_DEBUG_TXN,
6440 "trx %p %p %p %p %u %u",
6441 trx->all,
6442 trx->stmt,
6443 trx->sp_level,
6444 trx->sub_sp_level,
6445 trx->tokudb_lock_count,
6446 trx->create_lock_count);
6447
6448 if (trx->all == NULL) {
6449 trx->sp_level = NULL;
6450 }
6451 if (lock_type != F_UNLCK) {
6452 use_write_locks = false;
6453 if (lock_type == F_WRLCK) {
6454 use_write_locks = true;
6455 }
6456 if (!trx->stmt) {
6457 transaction = NULL; // Safety
6458 error = create_txn(thd, trx);
6459 if (error) {
6460 goto cleanup;
6461 }
6462 trx->create_lock_count = trx->tokudb_lock_count;
6463 }
6464 transaction = trx->sub_sp_level;
6465 trx->tokudb_lock_count++;
6466 } else {
6467 share->update_row_count(thd, added_rows, deleted_rows, updated_rows);
6468 added_rows = 0;
6469 deleted_rows = 0;
6470 updated_rows = 0;
6471 share->rows_from_locked_table = 0;
6472 if (trx->tokudb_lock_count > 0) {
6473 if (--trx->tokudb_lock_count <= trx->create_lock_count) {
6474 trx->create_lock_count = 0;
6475 if (trx->stmt) {
6476 /*
6477 F_UNLCK is done without a transaction commit / rollback.
6478 This happens if the thread didn't update any rows
6479 We must in this case commit the work to keep the row locks
6480 */
                        DBUG_PRINT("trans", ("committing non-updating transaction"));
6482 reset_stmt_progress(&trx->stmt_progress);
6483 commit_txn(trx->stmt, 0);
6484 trx->stmt = NULL;
6485 trx->sub_sp_level = NULL;
6486 }
6487 }
6488 transaction = NULL;
6489 }
6490 }
6491cleanup:
6492 TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "error=%d", error);
6493 TOKUDB_HANDLER_DBUG_RETURN(error);
6494}
6495
6496/*
  When using LOCK TABLES, external_lock is only called when the actual
  TABLE LOCK is done.
  Under LOCK TABLES, each used table will force a call to start_stmt.
6500*/
6501int ha_tokudb::start_stmt(THD* thd, thr_lock_type lock_type) {
6502 TOKUDB_HANDLER_DBUG_ENTER(
6503 "cmd %d lock %d %s",
6504 thd_sql_command(thd),
6505 lock_type,
6506 share->full_table_name());
6507
6508 TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "q %s", thd->query());
6509
6510 int error = 0;
6511 tokudb_trx_data* trx = (tokudb_trx_data*)thd_get_ha_data(thd, tokudb_hton);
6512 if (!trx) {
6513 error = create_tokudb_trx_data_instance(&trx);
6514 if (error) { goto cleanup; }
6515 thd_set_ha_data(thd, tokudb_hton, trx);
6516 }
6517
6518 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6519 TOKUDB_DEBUG_TXN,
6520 "trx %p %p %p %p %u %u",
6521 trx->all,
6522 trx->stmt,
6523 trx->sp_level,
6524 trx->sub_sp_level,
6525 trx->tokudb_lock_count,
6526 trx->create_lock_count);
6527
6528 /*
6529 note that trx->stmt may have been already initialized as start_stmt()
6530 is called for *each table* not for each storage engine,
6531 and there could be many bdb tables referenced in the query
6532 */
6533 if (!trx->stmt) {
6534 error = create_txn(thd, trx);
6535 if (error) {
6536 goto cleanup;
6537 }
6538 trx->create_lock_count = trx->tokudb_lock_count;
6539 } else {
6540 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6541 TOKUDB_DEBUG_TXN,
6542 "trx->stmt %p already existed",
6543 trx->stmt);
6544 }
6545 if (added_rows > deleted_rows) {
6546 share->rows_from_locked_table = added_rows - deleted_rows;
6547 }
6548 transaction = trx->sub_sp_level;
6549 trans_register_ha(thd, false, tokudb_hton);
6550cleanup:
6551 TOKUDB_HANDLER_DBUG_RETURN(error);
6552}
6553
6554
6555uint32_t ha_tokudb::get_cursor_isolation_flags(enum thr_lock_type lock_type, THD* thd) {
6556 uint sql_command = thd_sql_command(thd);
6557 bool in_lock_tables = thd_in_lock_tables(thd);
6558
6559 //
    // following InnoDB's lead: have the CHECKSUM command use a snapshot read
6561 //
6562 if (sql_command == SQLCOM_CHECKSUM) {
6563 return 0;
6564 }
6565 else if ((lock_type == TL_READ && in_lock_tables) ||
6566 (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables) ||
6567 sql_command != SQLCOM_SELECT ||
6568 (sql_command == SQLCOM_SELECT && lock_type >= TL_WRITE_ALLOW_WRITE)) { // select for update
6569 ulong tx_isolation = thd_tx_isolation(thd);
6570 // pattern matched from InnoDB
6571 if ( (tx_isolation == ISO_READ_COMMITTED || tx_isolation == ISO_READ_UNCOMMITTED) &&
6572 (lock_type == TL_READ || lock_type == TL_READ_NO_INSERT) &&
6573 (sql_command == SQLCOM_INSERT_SELECT
6574 || sql_command == SQLCOM_REPLACE_SELECT
6575 || sql_command == SQLCOM_UPDATE
6576 || sql_command == SQLCOM_CREATE_TABLE) )
6577 {
6578 return 0;
6579 }
6580 else {
6581 return DB_SERIALIZABLE;
6582 }
6583 }
6584 else {
6585 return 0;
6586 }
6587}
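
// Decision summary for get_cursor_isolation_flags() (restates the branches
// above; illustrative):
//   SQLCOM_CHECKSUM, plain autocommit SELECT          -> 0 (MVCC read)
//   LOCK TABLES reads, non-SELECT, SELECT FOR UPDATE  -> DB_SERIALIZABLE,
//     except READ COMMITTED/UNCOMMITTED reads feeding INSERT..SELECT,
//     REPLACE..SELECT, UPDATE or CREATE TABLE         -> 0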
6588
6589/*
6590 The idea with handler::store_lock() is the following:
6591
6592 The statement decided which locks we should need for the table
6593 for updates/deletes/inserts we get WRITE locks, for SELECT... we get
6594 read locks.
6595
6596 Before adding the lock into the table lock handler (see thr_lock.c)
6597 mysqld calls store lock with the requested locks. Store lock can now
6598 modify a write lock to a read lock (or some other lock), ignore the
6599 lock (if we don't want to use MySQL table locks at all) or add locks
6600 for many tables (like we do when we are using a MERGE handler).
6601
  TokuDB changes all WRITE locks to TL_WRITE_ALLOW_WRITE (which
  signals that we are doing WRITES, but we are still allowing other
  readers and writers).
6605
  When releasing locks, store_lock() is also called. In this case one
6607 usually doesn't have to do anything.
6608
6609 In some exceptional cases MySQL may send a request for a TL_IGNORE;
6610 This means that we are requesting the same lock as last time and this
6611 should also be ignored. (This may happen when someone does a flush
6612 table when we have opened a part of the tables, in which case mysqld
  closes and reopens the tables and tries to get the same locks as last
  time). In the future we will probably try to remove this.
6615*/
6616
6617THR_LOCK_DATA* *ha_tokudb::store_lock(
6618 THD* thd,
6619 THR_LOCK_DATA** to,
6620 enum thr_lock_type lock_type) {
6621
6622 TOKUDB_HANDLER_DBUG_ENTER(
6623 "lock_type=%d cmd=%d",
6624 lock_type,
6625 thd_sql_command(thd));
6626 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6627 TOKUDB_DEBUG_LOCK,
6628 "lock_type=%d cmd=%d",
6629 lock_type,
6630 thd_sql_command(thd));
6631
6632 if (lock_type != TL_IGNORE && lock.type == TL_UNLOCK) {
6633 enum_sql_command sql_command = (enum_sql_command) thd_sql_command(thd);
6634 if (!thd->in_lock_tables) {
6635 if (sql_command == SQLCOM_CREATE_INDEX &&
6636 tokudb::sysvars::create_index_online(thd)) {
6637 // hot indexing
6638 rwlock_t_lock_read(share->_num_DBs_lock);
6639 if (share->num_DBs ==
6640 (table->s->keys + tokudb_test(hidden_primary_key))) {
6641 lock_type = TL_WRITE_ALLOW_WRITE;
6642 }
6643 share->_num_DBs_lock.unlock();
6644 } else if ((lock_type >= TL_WRITE_CONCURRENT_INSERT &&
6645 lock_type <= TL_WRITE) &&
6646 sql_command != SQLCOM_TRUNCATE &&
6647 !thd_tablespace_op(thd)) {
6648 // allow concurrent writes
6649 lock_type = TL_WRITE_ALLOW_WRITE;
6650 } else if (sql_command == SQLCOM_OPTIMIZE &&
6651 lock_type == TL_READ_NO_INSERT) {
6652 // hot optimize table
6653 lock_type = TL_READ;
6654 }
6655 }
6656 lock.type = lock_type;
6657 }
6658 *to++ = &lock;
6659 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6660 TOKUDB_DEBUG_LOCK,
6661 "lock_type=%d",
6662 lock_type);
6663 TOKUDB_HANDLER_DBUG_RETURN_PTR(to);
6664}
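
// Example of the downgrade above (illustrative): an INSERT arrives with
// lock_type == TL_WRITE; outside of LOCK TABLES, TRUNCATE and tablespace
// operations it is stored as TL_WRITE_ALLOW_WRITE, so other readers and
// writers proceed and row-level conflicts are left to the engine's own
// locking.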
6665
6666static toku_compression_method get_compression_method(DB* file) {
6667 enum toku_compression_method method;
6668 int r = file->get_compression_method(file, &method);
6669 assert_always(r == 0);
6670 return method;
6671}
6672
6673#if TOKU_INCLUDE_ROW_TYPE_COMPRESSION
6674enum row_type ha_tokudb::get_row_type() const {
6675 toku_compression_method compression_method = get_compression_method(share->file);
6676 return toku_compression_method_to_row_type(compression_method);
6677}
6678#endif
6679
6680static int create_sub_table(
6681 const char* table_name,
6682 DBT* row_descriptor,
6683 DB_TXN* txn,
6684 uint32_t block_size,
6685 uint32_t read_block_size,
6686 toku_compression_method compression_method,
6687 bool is_hot_index,
6688 uint32_t fanout) {
6689
6690 TOKUDB_DBUG_ENTER("");
6691 int error;
6692 DB *file = NULL;
6693 uint32_t create_flags;
6694
6695
6696 error = db_create(&file, db_env, 0);
6697 if (error) {
6698 DBUG_PRINT("error", ("Got error: %d when creating table", error));
6699 my_errno = error;
6700 goto exit;
6701 }
6702
6703
6704 if (block_size != 0) {
6705 error = file->set_pagesize(file, block_size);
6706 if (error != 0) {
6707 DBUG_PRINT(
6708 "error",
6709 ("Got error: %d when setting block size %u for table '%s'",
6710 error,
6711 block_size,
6712 table_name));
6713 goto exit;
6714 }
6715 }
6716 if (read_block_size != 0) {
6717 error = file->set_readpagesize(file, read_block_size);
6718 if (error != 0) {
6719 DBUG_PRINT(
6720 "error",
6721 ("Got error: %d when setting read block size %u for table '%s'",
6722 error,
6723 read_block_size,
6724 table_name));
6725 goto exit;
6726 }
6727 }
6728 if (fanout != 0) {
6729 error = file->set_fanout(file, fanout);
6730 if (error != 0) {
6731 DBUG_PRINT(
6732 "error",
6733 ("Got error: %d when setting fanout %u for table '%s'",
6734 error,
6735 fanout,
6736 table_name));
6737 goto exit;
6738 }
6739 }
6740 error = file->set_compression_method(file, compression_method);
6741 if (error != 0) {
6742 DBUG_PRINT(
6743 "error",
6744 ("Got error: %d when setting compression type %u for table '%s'",
6745 error,
6746 compression_method,
6747 table_name));
6748 goto exit;
6749 }
6750
6751 create_flags =
6752 DB_THREAD | DB_CREATE | DB_EXCL | (is_hot_index ? DB_IS_HOT_INDEX : 0);
6753 error =
6754 file->open(
6755 file,
6756 txn,
6757 table_name,
6758 NULL,
6759 DB_BTREE,
6760 create_flags,
6761 my_umask);
6762 if (error) {
6763 DBUG_PRINT(
6764 "error",
6765 ("Got error: %d when opening table '%s'", error, table_name));
6766 goto exit;
6767 }
6768
6769 error =
6770 file->change_descriptor(
6771 file,
6772 txn,
6773 row_descriptor,
6774 (is_hot_index ? DB_IS_HOT_INDEX |
6775 DB_UPDATE_CMP_DESCRIPTOR :
6776 DB_UPDATE_CMP_DESCRIPTOR));
6777 if (error) {
6778 DBUG_PRINT(
6779 "error",
6780 ("Got error: %d when setting row descriptor for table '%s'",
6781 error,
6782 table_name));
6783 goto exit;
6784 }
6785
6786 error = 0;
6787exit:
6788 if (file) {
6789 int r = file->close(file, 0);
6790 assert_always(r==0);
6791 }
6792 TOKUDB_DBUG_RETURN(error);
6793}
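
// Hypothetical invocation of create_sub_table() (the parameter values shown
// are common sysvar defaults, not guaranteed):
//   create_sub_table("<path>-main", &row_descriptor, txn,
//                    4 << 20 /* block_size */, 64 << 10 /* read_block_size */,
//                    TOKU_ZLIB_METHOD, false /* is_hot_index */,
//                    16 /* fanout */);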
6794
6795void ha_tokudb::update_create_info(HA_CREATE_INFO* create_info) {
6796 if (share->has_auto_inc) {
6797 info(HA_STATUS_AUTO);
6798 if (!(create_info->used_fields & HA_CREATE_USED_AUTO) ||
6799 create_info->auto_increment_value < stats.auto_increment_value) {
6800 create_info->auto_increment_value = stats.auto_increment_value;
6801 }
6802 }
6803#if TOKU_INCLUDE_ROW_TYPE_COMPRESSION
6804 if (!(create_info->used_fields & HA_CREATE_USED_ROW_FORMAT)) {
6805 // show create table asks us to update this create_info, this makes it
6806 // so we'll always show what compression type we're using
6807 create_info->row_type = get_row_type();
6808 if (create_info->row_type == ROW_TYPE_TOKU_ZLIB &&
6809 tokudb::sysvars::hide_default_row_format(ha_thd()) != 0) {
6810 create_info->row_type = ROW_TYPE_DEFAULT;
6811 }
6812 }
6813#endif
6814}
6815
6816//
6817// removes key name from status.tokudb.
// needed when we are dropping indexes, so that
// during drop table we do not attempt to remove already-dropped
// indexes; otherwise status.tokudb would be out of sync with the list of indexes.
6821//
6822int ha_tokudb::remove_key_name_from_status(DB* status_block, const char* key_name, DB_TXN* txn) {
6823 int error;
6824 uchar status_key_info[FN_REFLEN + sizeof(HA_METADATA_KEY)];
6825 HA_METADATA_KEY md_key = hatoku_key_name;
6826 memcpy(status_key_info, &md_key, sizeof(HA_METADATA_KEY));
6827 //
6828 // put index name in status.tokudb
6829 //
6830 memcpy(
6831 status_key_info + sizeof(HA_METADATA_KEY),
6832 key_name,
6833 strlen(key_name) + 1
6834 );
6835 error = remove_metadata(
6836 status_block,
6837 status_key_info,
6838 sizeof(HA_METADATA_KEY) + strlen(key_name) + 1,
6839 txn
6840 );
6841 return error;
6842}
6843
6844//
6845// writes the key name in status.tokudb, so that we may later delete or rename
6846// the dictionary associated with key_name
6847//
6848int ha_tokudb::write_key_name_to_status(DB* status_block, const char* key_name,
6849 DB_TXN* txn) {
6850 int error;
6851 uchar status_key_info[FN_REFLEN + sizeof(HA_METADATA_KEY)];
6852 HA_METADATA_KEY md_key = hatoku_key_name;
6853 memcpy(status_key_info, &md_key, sizeof(HA_METADATA_KEY));
6854 //
6855 // put index name in status.tokudb
6856 //
6857 memcpy(
6858 status_key_info + sizeof(HA_METADATA_KEY),
6859 key_name,
6860 strlen(key_name) + 1
6861 );
6862 error = write_metadata(
6863 status_block,
6864 status_key_info,
6865 sizeof(HA_METADATA_KEY) + strlen(key_name) + 1,
6866 NULL,
6867 0,
6868 txn
6869 );
6870 return error;
6871}
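
// Layout of the status.tokudb entry written/removed above (illustrative):
//   key:   [HA_METADATA_KEY = hatoku_key_name][key_name bytes]['\0']
//   value: empty (NULL, length 0)
// delete_or_rename_table() later scans these entries to find the
// "key-<name>" dictionaries belonging to the table.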
6872
6873//
6874// some tracing moved out of ha_tokudb::create, because ::create was
6875// getting cluttered
6876//
6877void ha_tokudb::trace_create_table_info(const char *name, TABLE * form) {
6878 uint i;
6879 //
6880 // tracing information about what type of table we are creating
6881 //
6882 if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_OPEN))) {
6883 for (i = 0; i < form->s->fields; i++) {
6884 Field *field = form->s->field[i];
6885 TOKUDB_HANDLER_TRACE(
6886 "field:%d:%s:type=%d:flags=%x",
6887 i,
6888 field->field_name.str,
6889 field->type(),
6890 field->flags);
6891 }
6892 for (i = 0; i < form->s->keys; i++) {
6893 KEY *key = &form->s->key_info[i];
6894 TOKUDB_HANDLER_TRACE(
6895 "key:%d:%s:%d",
6896 i,
6897 key->name.str,
6898 key->user_defined_key_parts);
6899 uint p;
6900 for (p = 0; p < key->user_defined_key_parts; p++) {
6901 KEY_PART_INFO* key_part = &key->key_part[p];
6902 Field* field = key_part->field;
6903 TOKUDB_HANDLER_TRACE(
6904 "key:%d:%d:length=%d:%s:type=%d:flags=%x",
6905 i,
6906 p,
6907 key_part->length,
6908 field->field_name.str,
6909 field->type(),
6910 field->flags);
6911 }
6912 }
6913 }
6914}
6915
6916static uint32_t get_max_desc_size(KEY_AND_COL_INFO* kc_info, TABLE* form) {
6917 uint32_t max_row_desc_buff_size;
6918 // upper bound of key comparison descriptor
6919 max_row_desc_buff_size = 2*(form->s->fields * 6)+10;
6920 // upper bound for sec. key part
6921 max_row_desc_buff_size += get_max_secondary_key_pack_desc_size(kc_info);
6922 // upper bound for clustering val part
6923 max_row_desc_buff_size += get_max_clustering_val_pack_desc_size(form->s);
6924 return max_row_desc_buff_size;
6925}
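
// Quick sanity arithmetic for the bound above (illustrative): a table with
// 10 fields reserves 2*(10*6)+10 = 130 bytes for the key comparison
// descriptor before the secondary-key and clustering-val bounds are added.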
6926
6927static uint32_t create_secondary_key_descriptor(
6928 uchar* buf,
6929 KEY* key_info,
6930 KEY* prim_key,
6931 uint hpk,
6932 TABLE* form,
6933 uint primary_key,
6934 uint32_t keynr,
6935 KEY_AND_COL_INFO* kc_info) {
6936
6937 uchar* ptr = NULL;
6938
6939 ptr = buf;
6940 ptr += create_toku_key_descriptor(
6941 ptr,
6942 false,
6943 key_info,
6944 hpk,
6945 prim_key
6946 );
6947
6948 ptr += create_toku_secondary_key_pack_descriptor(
6949 ptr,
6950 hpk,
6951 primary_key,
6952 form->s,
6953 form,
6954 kc_info,
6955 key_info,
6956 prim_key
6957 );
6958
6959 ptr += create_toku_clustering_val_pack_descriptor(
6960 ptr,
6961 primary_key,
6962 form->s,
6963 kc_info,
6964 keynr,
6965 key_is_clustering(key_info)
6966 );
6967 return ptr - buf;
6968}
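
// The descriptor assembled above is three contiguous sections, in order:
//   [key comparison descriptor][secondary key pack descriptor]
//   [clustering val pack descriptor]
// Its total size (ptr - buf) is what create_secondary_dictionary() checks
// against get_max_desc_size().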
6969
6970
6971//
6972// creates dictionary for secondary index, with key description key_info, all using txn
6973//
6974int ha_tokudb::create_secondary_dictionary(
6975 const char* name,
6976 TABLE* form,
6977 KEY* key_info,
6978 DB_TXN* txn,
6979 KEY_AND_COL_INFO* kc_info,
6980 uint32_t keynr,
6981 bool is_hot_index,
6982 toku_compression_method compression_method) {
6983
6984 int error;
6985 DBT row_descriptor;
6986 uchar* row_desc_buff = NULL;
6987 char* newname = NULL;
6988 size_t newname_len = 0;
6989 KEY* prim_key = NULL;
6990 char dict_name[MAX_DICT_NAME_LEN];
6991 uint32_t max_row_desc_buff_size;
6992 uint hpk= (form->s->primary_key >= MAX_KEY) ?
6993 TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH : 0;
6994 uint32_t block_size;
6995 uint32_t read_block_size;
6996 uint32_t fanout;
6997 THD* thd = ha_thd();
6998
6999 memset(&row_descriptor, 0, sizeof(row_descriptor));
7000
7001 max_row_desc_buff_size = get_max_desc_size(kc_info,form);
7002
7003 row_desc_buff = (uchar*)tokudb::memory::malloc(
7004 max_row_desc_buff_size,
7005 MYF(MY_WME));
7006 if (row_desc_buff == NULL) {
7007 error = ENOMEM;
7008 goto cleanup;
7009 }
7010
7011 newname_len = get_max_dict_name_path_length(name);
7012 newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
7013 if (newname == NULL) {
7014 error = ENOMEM;
7015 goto cleanup;
7016 }
7017
7018 sprintf(dict_name, "key-%s", key_info->name.str);
7019 make_name(newname, newname_len, name, dict_name);
7020
7021 prim_key = (hpk) ? NULL : &form->s->key_info[primary_key];
7022
7023 //
7024 // setup the row descriptor
7025 //
7026 row_descriptor.data = row_desc_buff;
7027 //
7028 // save data necessary for key comparisons
7029 //
7030 row_descriptor.size = create_secondary_key_descriptor(
7031 row_desc_buff,
7032 key_info,
7033 prim_key,
7034 hpk,
7035 form,
7036 primary_key,
7037 keynr,
7038 kc_info);
7039 assert_always(row_descriptor.size <= max_row_desc_buff_size);
7040
7041 block_size = tokudb::sysvars::block_size(thd);
7042 read_block_size = tokudb::sysvars::read_block_size(thd);
7043 fanout = tokudb::sysvars::fanout(thd);
7044
7045 error = create_sub_table(
7046 newname,
7047 &row_descriptor,
7048 txn,
7049 block_size,
7050 read_block_size,
7051 compression_method,
7052 is_hot_index,
7053 fanout);
7054cleanup:
7055 tokudb::memory::free(newname);
7056 tokudb::memory::free(row_desc_buff);
7057 return error;
7058}
7059
7060
7061static uint32_t create_main_key_descriptor(
7062 uchar* buf,
7063 KEY* prim_key,
7064 uint hpk,
7065 uint primary_key,
7066 TABLE* form,
7067 KEY_AND_COL_INFO* kc_info) {
7068
7069 uchar* ptr = buf;
7070 ptr += create_toku_key_descriptor(
7071 ptr,
7072 hpk,
7073 prim_key,
7074 false,
7075 NULL);
7076
7077 ptr += create_toku_main_key_pack_descriptor(ptr);
7078
7079 ptr += create_toku_clustering_val_pack_descriptor(
7080 ptr,
7081 primary_key,
7082 form->s,
7083 kc_info,
7084 primary_key,
7085 false);
7086 return ptr - buf;
7087}
7088
7089//
// create and close the main dictionary named "name" using table form, all within
// transaction txn.
7092//
7093int ha_tokudb::create_main_dictionary(
7094 const char* name,
7095 TABLE* form,
7096 DB_TXN* txn,
7097 KEY_AND_COL_INFO* kc_info,
7098 toku_compression_method compression_method) {
7099
7100 int error;
7101 DBT row_descriptor;
7102 uchar* row_desc_buff = NULL;
7103 char* newname = NULL;
7104 size_t newname_len = 0;
7105 KEY* prim_key = NULL;
7106 uint32_t max_row_desc_buff_size;
7107 uint hpk = (form->s->primary_key >= MAX_KEY) ? TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH : 0;
7108 uint32_t block_size;
7109 uint32_t read_block_size;
7110 uint32_t fanout;
7111 THD* thd = ha_thd();
7112
7113 memset(&row_descriptor, 0, sizeof(row_descriptor));
7114 max_row_desc_buff_size = get_max_desc_size(kc_info, form);
7115
7116 row_desc_buff = (uchar*)tokudb::memory::malloc(
7117 max_row_desc_buff_size,
7118 MYF(MY_WME));
7119 if (row_desc_buff == NULL) {
7120 error = ENOMEM;
7121 goto cleanup;
7122 }
7123
7124 newname_len = get_max_dict_name_path_length(name);
7125 newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
7126 if (newname == NULL) {
7127 error = ENOMEM;
7128 goto cleanup;
7129 }
7130
7131 make_name(newname, newname_len, name, "main");
7132
7133 prim_key = (hpk) ? NULL : &form->s->key_info[primary_key];
7134
7135 //
7136 // setup the row descriptor
7137 //
7138 row_descriptor.data = row_desc_buff;
7139 //
7140 // save data necessary for key comparisons
7141 //
7142 row_descriptor.size = create_main_key_descriptor(
7143 row_desc_buff,
7144 prim_key,
7145 hpk,
7146 primary_key,
7147 form,
7148 kc_info);
7149 assert_always(row_descriptor.size <= max_row_desc_buff_size);
7150
7151 block_size = tokudb::sysvars::block_size(thd);
7152 read_block_size = tokudb::sysvars::read_block_size(thd);
7153 fanout = tokudb::sysvars::fanout(thd);
7154
7155 /* Create the main table that will hold the real rows */
7156 error = create_sub_table(
7157 newname,
7158 &row_descriptor,
7159 txn,
7160 block_size,
7161 read_block_size,
7162 compression_method,
7163 false,
7164 fanout);
7165cleanup:
7166 tokudb::memory::free(newname);
7167 tokudb::memory::free(row_desc_buff);
7168 return error;
7169}
7170
7171//
7172// Creates a new table
7173// Parameters:
7174// [in] name - table name
7175// [in] form - info on table, columns and indexes
//      [in]    create_info - more info on table (row format, auto-increment value, table options)
7177// Returns:
7178// 0 on success
7179// error otherwise
7180//
7181int ha_tokudb::create(
7182 const char* name,
7183 TABLE* form,
7184 HA_CREATE_INFO* create_info) {
7185
7186 TOKUDB_HANDLER_DBUG_ENTER("%s", name);
7187
7188 int error;
7189 DB *status_block = NULL;
7190 uint version;
7191 uint capabilities;
7192 DB_TXN* txn = NULL;
7193 bool do_commit = false;
7194 char* newname = NULL;
7195 size_t newname_len = 0;
7196 KEY_AND_COL_INFO kc_info;
7197 tokudb_trx_data *trx = NULL;
7198 THD* thd = ha_thd();
7199
7200 memset(&kc_info, 0, sizeof(kc_info));
7201
7202#if 100000 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 100999
7203 // TokuDB does not support discover_table_names() and writes no files
7204 // in the database directory, so automatic filename-based
    // discover_table_names() doesn't work either. So it must force the
    // .frm file to disk.
7207 form->s->write_frm_image();
7208#endif
7209
7210#if TOKU_INCLUDE_OPTION_STRUCTS
7211 const tokudb::sysvars::row_format_t row_format =
7212 (tokudb::sysvars::row_format_t)form->s->option_struct->row_format;
7213#else
7214 // TDB-76 : CREATE TABLE ... LIKE ... does not use source row_format on
7215 // target table
7216 // Original code would only use create_info->row_type if
7217 // create_info->used_fields & HA_CREATE_USED_ROW_FORMAT was true. This
7218 // would cause us to skip transferring the row_format for a table created
7219 // via CREATE TABLE tn LIKE tn. We also take on more InnoDB like behavior
7220 // and throw a warning if we get a row_format that we can't translate into
7221 // a known TokuDB row_format.
7222 tokudb::sysvars::row_format_t row_format =
7223 tokudb::sysvars::row_format(thd);
7224
7225 if ((create_info->used_fields & HA_CREATE_USED_ROW_FORMAT) ||
7226 create_info->row_type != ROW_TYPE_DEFAULT) {
7227 row_format = row_type_to_row_format(create_info->row_type);
7228 if (row_format == tokudb::sysvars::SRV_ROW_FORMAT_DEFAULT &&
7229 create_info->row_type != ROW_TYPE_DEFAULT) {
7230 push_warning(thd,
7231 Sql_condition::WARN_LEVEL_WARN,
7232 ER_ILLEGAL_HA_CREATE_OPTION,
7233 "TokuDB: invalid ROW_FORMAT specifier.");
7234 }
7235 }
7236#endif
7237 const toku_compression_method compression_method =
7238 row_format_to_toku_compression_method(row_format);
7239
7240 bool create_from_engine = (create_info->table_options & HA_OPTION_CREATE_FROM_ENGINE);
7241 if (create_from_engine) {
7242 // table already exists, nothing to do
7243 error = 0;
7244 goto cleanup;
7245 }
7246
7247 // validate the fields in the table. If the table has fields
7248 // we do not support that came from an old version of MySQL,
7249 // gracefully return an error
7250 for (uint32_t i = 0; i < form->s->fields; i++) {
7251 Field* field = table_share->field[i];
7252 if (!field_valid_for_tokudb_table(field)) {
7253 sql_print_error("Table %s has an invalid field %s, that was created "
7254 "with an old version of MySQL. This field is no longer supported. "
7255 "This is probably due to an alter table engine=TokuDB. To load this "
7256 "table, do a dump and load",
7257 name,
7258 field->field_name.str
7259 );
7260 error = HA_ERR_UNSUPPORTED;
7261 goto cleanup;
7262 }
7263 }
7264
7265 newname_len = get_max_dict_name_path_length(name);
7266 newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
7267 if (newname == NULL) {
7268 error = ENOMEM;
7269 goto cleanup;
7270 }
7271
7272 trx = (tokudb_trx_data *) thd_get_ha_data(ha_thd(), tokudb_hton);
7273 if (trx && trx->sub_sp_level &&
7274 thd_sql_command(thd) == SQLCOM_CREATE_TABLE) {
7275 txn = trx->sub_sp_level;
7276 } else {
7277 do_commit = true;
7278 error = txn_begin(db_env, 0, &txn, 0, thd);
7279 if (error) {
7280 goto cleanup;
7281 }
7282 }
7283
7284 primary_key = form->s->primary_key;
7285 hidden_primary_key = (primary_key >= MAX_KEY) ? TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH : 0;
7286 if (hidden_primary_key) {
7287 primary_key = form->s->keys;
7288 }
7289
7290 /* do some tracing */
7291 trace_create_table_info(name,form);
7292
7293 /* Create status.tokudb and save relevant metadata */
7294 make_name(newname, newname_len, name, "status");
7295
7296 error = tokudb::metadata::create(db_env, &status_block, newname, txn);
7297 if (error) { goto cleanup; }
7298
7299 version = HA_TOKU_VERSION;
7300 error = write_to_status(
7301 status_block,
7302 hatoku_new_version,
7303 &version,
7304 sizeof(version),
7305 txn);
7306 if (error) {
7307 goto cleanup;
7308 }
7309
7310 capabilities = HA_TOKU_CAP;
7311 error = write_to_status(
7312 status_block,
7313 hatoku_capabilities,
7314 &capabilities,
7315 sizeof(capabilities),
7316 txn);
7317 if (error) {
7318 goto cleanup;
7319 }
7320
7321 error = write_auto_inc_create(
7322 status_block,
7323 create_info->auto_increment_value,
7324 txn);
7325 if (error) {
7326 goto cleanup;
7327 }
7328
7329#if WITH_PARTITION_STORAGE_ENGINE
7330 if (TOKU_PARTITION_WRITE_FRM_DATA || form->part_info == NULL) {
7331 error = write_frm_data(status_block, txn, form->s->path.str);
7332 if (error) {
7333 goto cleanup;
7334 }
7335 }
7336#else
7337 error = write_frm_data(status_block, txn, form->s->path.str);
7338 if (error) {
7339 goto cleanup;
7340 }
7341#endif
7342
7343 error = allocate_key_and_col_info(form->s, &kc_info);
7344 if (error) {
7345 goto cleanup;
7346 }
7347
7348 error = initialize_key_and_col_info(
7349 form->s,
7350 form,
7351 &kc_info,
7352 hidden_primary_key,
7353 primary_key);
7354 if (error) {
7355 goto cleanup;
7356 }
7357
7358 error = create_main_dictionary(
7359 name,
7360 form,
7361 txn,
7362 &kc_info,
7363 compression_method);
7364 if (error) {
7365 goto cleanup;
7366 }
7367
7368
7369 for (uint i = 0; i < form->s->keys; i++) {
7370 if (i != primary_key) {
7371 error = create_secondary_dictionary(
7372 name,
7373 form,
7374 &form->key_info[i],
7375 txn,
7376 &kc_info,
7377 i,
7378 false,
7379 compression_method);
7380 if (error) {
7381 goto cleanup;
7382 }
7383
7384 error = write_key_name_to_status(
7385 status_block,
7386 form->s->key_info[i].name.str,
7387 txn);
7388 if (error) {
7389 goto cleanup;
7390 }
7391 }
7392 }
7393
7394 error = 0;
7395cleanup:
7396 if (status_block != NULL) {
7397 int r = tokudb::metadata::close(&status_block);
7398 assert_always(r==0);
7399 }
7400 free_key_and_col_info(&kc_info);
7401 if (do_commit && txn) {
7402 if (error) {
7403 abort_txn(txn);
7404 } else {
7405 commit_txn(txn,0);
7406 }
7407 }
7408 tokudb::memory::free(newname);
7409 TOKUDB_HANDLER_DBUG_RETURN(error);
7410}
7411
7412int ha_tokudb::discard_or_import_tablespace(my_bool discard) {
7413 /*
7414 if (discard) {
7415 my_errno=HA_ERR_WRONG_COMMAND;
7416 return my_errno;
7417 }
7418 return add_table_to_metadata(share->table_name);
7419 */
7420 my_errno=HA_ERR_WRONG_COMMAND;
7421 return my_errno;
7422}
7423
7424
7425//
7426// deletes from_name or renames from_name to to_name, all using transaction txn.
7427// is_delete specifies which we are doing
// is_key specifies whether it is a secondary index (and hence a "key-" prefix
// needs to be prepended to the dictionary name)
7430//
7431int ha_tokudb::delete_or_rename_dictionary(
7432 const char* from_name,
7433 const char* to_name,
7434 const char* secondary_name,
7435 bool is_key,
7436 DB_TXN* txn,
7437 bool is_delete) {
7438
7439 int error;
7440 char dict_name[MAX_DICT_NAME_LEN];
7441 char* new_from_name = NULL;
7442 size_t new_from_name_len = 0;
7443 char* new_to_name = NULL;
7444 size_t new_to_name_len = 0;
7445 assert_always(txn);
7446
7447 new_from_name_len = get_max_dict_name_path_length(from_name);
7448 new_from_name = (char*)tokudb::memory::malloc(
7449 new_from_name_len,
7450 MYF(MY_WME));
7451 if (new_from_name == NULL) {
7452 error = ENOMEM;
7453 goto cleanup;
7454 }
7455 if (!is_delete) {
7456 assert_always(to_name);
7457 new_to_name_len = get_max_dict_name_path_length(to_name);
7458 new_to_name = (char*)tokudb::memory::malloc(
7459 new_to_name_len,
7460 MYF(MY_WME));
7461 if (new_to_name == NULL) {
7462 error = ENOMEM;
7463 goto cleanup;
7464 }
7465 }
7466
7467 if (is_key) {
7468 sprintf(dict_name, "key-%s", secondary_name);
7469 make_name(new_from_name, new_from_name_len, from_name, dict_name);
7470 } else {
7471 make_name(new_from_name, new_from_name_len, from_name, secondary_name);
7472 }
7473 if (!is_delete) {
7474 if (is_key) {
7475 sprintf(dict_name, "key-%s", secondary_name);
7476 make_name(new_to_name, new_to_name_len, to_name, dict_name);
7477 } else {
7478 make_name(new_to_name, new_to_name_len, to_name, secondary_name);
7479 }
7480 }
7481
7482 if (is_delete) {
7483 error = db_env->dbremove(db_env, txn, new_from_name, NULL, 0);
7484 } else {
7485 error = db_env->dbrename(
7486 db_env,
7487 txn,
7488 new_from_name,
7489 NULL,
7490 new_to_name,
7491 0);
7492 }
7493 if (error) {
7494 goto cleanup;
7495 }
7496
7497cleanup:
7498 tokudb::memory::free(new_from_name);
7499 tokudb::memory::free(new_to_name);
7500 return error;
7501}
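
// Naming sketch (illustrative; the exact path format comes from make_name()):
// a secondary index "idx" maps to the dictionary suffix "key-idx", while the
// row and metadata dictionaries use the fixed suffixes "main" and "status".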
7502
7503
7504//
// deletes or renames a table. If is_delete is true, then we delete, and to_name can be NULL;
// if is_delete is false, then to_name must be non-NULL, as we are renaming the table.
7507//
7508int ha_tokudb::delete_or_rename_table (const char* from_name, const char* to_name, bool is_delete) {
7509 THD *thd = ha_thd();
7510 int error;
7511 DB* status_db = NULL;
7512 DBC* status_cursor = NULL;
7513 DB_TXN* txn = NULL;
7514 DBT curr_key;
7515 DBT curr_val;
7516 memset(&curr_key, 0, sizeof(curr_key));
7517 memset(&curr_val, 0, sizeof(curr_val));
7518
7519 DB_TXN *parent_txn = NULL;
7520 tokudb_trx_data *trx = NULL;
    trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
    if (thd_sql_command(ha_thd()) == SQLCOM_CREATE_TABLE && trx && trx->sub_sp_level) {
        parent_txn = trx->sub_sp_level;
    }

    error = txn_begin(db_env, parent_txn, &txn, 0, thd);
    if (error) { goto cleanup; }

    //
    // open the status db, create a cursor, and for each dictionary name
    // read from it, delete or rename that dictionary
    //
    error = open_status_dictionary(&status_db, from_name, txn);
    if (error) { goto cleanup; }

    error = status_db->cursor(status_db, txn, &status_cursor, 0);
    if (error) { goto cleanup; }
    status_cursor->c_set_check_interrupt_callback(status_cursor, tokudb_killed_thd_callback, thd);

    while (error != DB_NOTFOUND) {
        error = status_cursor->c_get(status_cursor, &curr_key, &curr_val, DB_NEXT);
        if (error && error != DB_NOTFOUND) {
            error = map_to_handler_error(error);
            goto cleanup;
        }
        if (error == DB_NOTFOUND) {
            break;
        }
        HA_METADATA_KEY mk = *(HA_METADATA_KEY *)curr_key.data;
        if (mk != hatoku_key_name) {
            continue;
        }
        error = delete_or_rename_dictionary(from_name, to_name, (char *)((char *)curr_key.data + sizeof(HA_METADATA_KEY)), true, txn, is_delete);
        if (error) { goto cleanup; }
    }

    //
    // delete or rename main.tokudb
    //
    error = delete_or_rename_dictionary(from_name, to_name, "main", false, txn, is_delete);
    if (error) { goto cleanup; }

    error = status_cursor->c_close(status_cursor);
    assert_always(error == 0);
    status_cursor = NULL;

    error = status_db->close(status_db, 0);
    assert_always(error == 0);
    status_db = NULL;

    //
    // delete or rename status.tokudb
    //
    error = delete_or_rename_dictionary(from_name, to_name, "status", false, txn, is_delete);
    if (error) { goto cleanup; }

    my_errno = error;
cleanup:
    if (status_cursor) {
        int r = status_cursor->c_close(status_cursor);
        assert_always(r == 0);
    }
    if (status_db) {
        int r = status_db->close(status_db, 0);
        assert_always(r == 0);
    }
    if (txn) {
        if (error) {
            abort_txn(txn);
        }
        else {
            commit_txn(txn, 0);
        }
    }
    return error;
}


//
// Drops table
// Parameters:
//      [in]    name - name of table to be deleted
// Returns:
//      0 on success
//      error otherwise
//
int ha_tokudb::delete_table(const char *name) {
    TOKUDB_HANDLER_DBUG_ENTER("%s", name);
    TOKUDB_SHARE* share = TOKUDB_SHARE::get_share(name, NULL, NULL, false);
    if (share) {
        share->unlock();
        share->release();
        // this should be enough to handle locking as the higher level MDL
        // on this table should prevent any new analyze tasks.
        share->cancel_background_jobs();
        TOKUDB_SHARE::drop_share(share);
    }

    int error;
    error = delete_or_rename_table(name, NULL, true);
    if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
        error == DB_LOCK_NOTGRANTED) {
        sql_print_error(
            "Could not delete table %s because another transaction has "
            "accessed the table. To drop the table, make sure no "
            "transactions touch the table.",
            name);
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}

static bool tokudb_check_db_dir_exist_from_table_name(const char *table_name) {
    DBUG_ASSERT(table_name);
    bool mysql_dir_exists;
    char db_name[FN_REFLEN];
    const char *db_name_begin = strchr(table_name, FN_LIBCHAR);
    const char *db_name_end = strrchr(table_name, FN_LIBCHAR);
    DBUG_ASSERT(db_name_begin);
    DBUG_ASSERT(db_name_end);
    DBUG_ASSERT(db_name_begin != db_name_end);

    ++db_name_begin;
    size_t db_name_size = db_name_end - db_name_begin;

    DBUG_ASSERT(db_name_size < FN_REFLEN);

    memcpy(db_name, db_name_begin, db_name_size);
    db_name[db_name_size] = '\0';

    // At this point, db_name contains the MySQL formatted database name.
    // This is exactly the same format that would come into us through a
    // CREATE TABLE. Some characters (like ':' for example) might be expanded
    // into hex (':' would appear as "@003a").
    // We need to check that the MySQL destination database directory exists.
    mysql_dir_exists = (my_access(db_name, F_OK) == 0);

    return mysql_dir_exists;
}
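
// A worked example of the parsing above (the path is hypothetical; the exact
// prefix depends on how the server builds table paths): for a table_name of
// "./mydb/t1", the first and last FN_LIBCHAR separators bracket "mydb", so
// db_name becomes "mydb" and my_access() checks that the "mydb" directory
// exists:
//
//    bool ok = tokudb_check_db_dir_exist_from_table_name("./mydb/t1");
//    // ok is true only if the destination database directory exists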

//
// renames table from "from" to "to"
// Parameters:
//      [in]    from - old name of table
//      [in]    to - new name of table
// Returns:
//      0 on success
//      error otherwise
//
int ha_tokudb::rename_table(const char *from, const char *to) {
    TOKUDB_HANDLER_DBUG_ENTER("%s %s", from, to);
    TOKUDB_SHARE* share = TOKUDB_SHARE::get_share(from, NULL, NULL, false);
    if (share) {
        share->unlock();
        share->release();
        // this should be enough to handle locking as the higher level MDL
        // on this table should prevent any new analyze tasks.
        share->cancel_background_jobs();
        TOKUDB_SHARE::drop_share(share);
    }
    int error;
    bool to_db_dir_exist = tokudb_check_db_dir_exist_from_table_name(to);
    if (!to_db_dir_exist) {
        sql_print_error(
            "Could not rename table from %s to %s because "
            "destination db does not exist",
            from,
            to);
#ifndef __WIN__
        /* Small hack. tokudb_check_db_dir_exist_from_table_name calls
         * my_access, which sets my_errno on Windows, but doesn't on
         * unix. Set it for unix too.
         */
        my_errno= errno;
#endif
        error= my_errno;
    }
    else {
        error = delete_or_rename_table(from, to, false);
        if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
            error == DB_LOCK_NOTGRANTED) {
            sql_print_error(
                "Could not rename table from %s to %s because another transaction "
                "has accessed the table. To rename the table, make sure no "
                "transactions touch the table.",
                from,
                to);
        }
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}


/*
  Returns an estimate of the number of seeks it will take to read through
  the table. This is to be comparable to the number returned by
  records_in_range so that we can decide if we should scan the table or
  use keys.
*/
/// QQQ why divide by 3
double ha_tokudb::scan_time() {
    TOKUDB_HANDLER_DBUG_ENTER("");
    double ret_val = (double)stats.records / 3;
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_RETURN,
        "return %" PRIu64 " %f",
        (uint64_t)stats.records,
        ret_val);
    DBUG_RETURN(ret_val);
}
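
// A worked example of the heuristic above (hypothetical numbers): with
// stats.records = 30000, scan_time() returns 30000 / 3 = 10000. The value is
// in the same "seek" units as records_in_range(), so the optimizer can weigh
// a full scan against a range read.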

double ha_tokudb::keyread_time(uint index, uint ranges, ha_rows rows)
{
    TOKUDB_HANDLER_DBUG_ENTER("%u %u %" PRIu64, index, ranges, (uint64_t) rows);
    double ret_val;
    if (index == primary_key || key_is_clustering(&table->key_info[index])) {
        ret_val = read_time(index, ranges, rows);
        DBUG_RETURN(ret_val);
    }
    /*
      It is assumed that we will read through the whole key range and that all
      key blocks are half full (normally things are much better). It is also
      assumed that each time we read the next key from the index, the handler
      performs a random seek, thus the cost is proportional to the number of
      blocks read. This model does not take into account clustered indexes -
      engines that support them (e.g. InnoDB) may want to override this method.
    */
    double keys_per_block= (stats.block_size/2.0/
                            (table->key_info[index].key_length +
                             ref_length) + 1);
    ret_val = (rows + keys_per_block - 1)/ keys_per_block;
    TOKUDB_HANDLER_DBUG_RETURN_DOUBLE(ret_val);
}
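
// A worked example of the keys_per_block model above (hypothetical numbers):
// with stats.block_size = 16384 and key_length + ref_length = 100,
//
//    keys_per_block = 16384 / 2.0 / 100 + 1 = 82.92
//    keyread_time(idx, 1, 1000) = (1000 + 82.92 - 1) / 82.92 ~= 13.0 seeks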

//
// Calculate the time it takes to read a set of ranges through an index
// This enables us to optimize reads for clustered indexes.
// Implementation pulled from InnoDB
// Parameters:
//      index - index to use
//      ranges - number of ranges
//      rows - estimated number of rows in the range
// Returns:
//      estimated time measured in disk seeks
//
double ha_tokudb::read_time(
    uint index,
    uint ranges,
    ha_rows rows
    )
{
    TOKUDB_HANDLER_DBUG_ENTER("%u %u %" PRIu64, index, ranges, (uint64_t) rows);
    double total_scan;
    double ret_val;
    bool is_primary = (index == primary_key);
    bool is_clustering;

    //
    // in the case of a hidden primary key, this may be called with an index
    // number past table_share->keys, so fall back to the default estimate
    //
    if (index >= table_share->keys) {
        ret_val = handler::read_time(index, ranges, rows);
        goto cleanup;
    }

    is_clustering = key_is_clustering(&table->key_info[index]);


    //
    // if it is not the primary key, and it is not a clustering key, then return handler::read_time
    //
    if (!(is_primary || is_clustering)) {
        ret_val = handler::read_time(index, ranges, rows);
        goto cleanup;
    }

    //
    // for primary key and for clustered keys, return a fraction of scan_time()
    //
    total_scan = scan_time();

    if (stats.records <= rows) {
        ret_val = is_clustering ? total_scan + 0.00001 : total_scan;
        goto cleanup;
    }

    //
    // one disk seek per range plus the proportional scan time of the rows
    //
    ret_val = (ranges + (double) rows / (double) stats.records * total_scan);
    ret_val = is_clustering ? ret_val + 0.00001 : ret_val;

cleanup:
    TOKUDB_HANDLER_DBUG_RETURN_DOUBLE(ret_val);
}
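
// A worked example of the formula above (hypothetical numbers): with
// ranges = 1, rows = 100, stats.records = 10000, and total_scan = 3333.3,
//
//    ret_val = 1 + (100.0 / 10000) * 3333.3 ~= 34.3 seeks
//
// The extra 0.00001 added for clustering keys means a clustering secondary
// index is never estimated as strictly cheaper than the primary key.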

double ha_tokudb::index_only_read_time(uint keynr, double records) {
    TOKUDB_HANDLER_DBUG_ENTER("%u %f", keynr, records);
    double ret_val = keyread_time(keynr, 1, (ha_rows)records);
    TOKUDB_HANDLER_DBUG_RETURN_DOUBLE(ret_val);
}

//
// Estimates the number of index records in a range. In case of errors, return
// HA_TOKUDB_RANGE_COUNT instead of HA_POS_ERROR. This was the behavior
// when we got the handlerton from MySQL.
// Parameters:
//      keynr - index to use
//      [in]    start_key - low end of the range
//      [in]    end_key - high end of the range
// Returns:
//      0 - There are no matching keys in the given range
//      number > 0 - There are approximately number matching rows in the range
//      HA_POS_ERROR - Something is wrong with the index tree
//
ha_rows ha_tokudb::records_in_range(uint keynr, key_range* start_key, key_range* end_key) {
    TOKUDB_HANDLER_DBUG_ENTER("%d %p %p", keynr, start_key, end_key);
    DBT *pleft_key, *pright_key;
    DBT left_key, right_key;
    ha_rows ret_val = HA_TOKUDB_RANGE_COUNT;
    DB *kfile = share->key_file[keynr];
    uint64_t rows = 0;
    int error;

    // Estimate the number of rows in the range. The keys passed in may be
    // prefixes of keys in the DB, so we pack them with explicit infinity
    // bytes and call keys_range64, trusting only the "middle" estimate,
    // which counts the keys strictly between the packed endpoints.
    if (!start_key && !end_key) {
        error = estimate_num_rows(share->file, &rows, transaction);
        if (error) {
            ret_val = HA_TOKUDB_RANGE_COUNT;
            goto cleanup;
        }
        ret_val = (rows <= 1) ? 1 : rows;
        goto cleanup;
    }
    if (start_key) {
        uchar inf_byte = (start_key->flag == HA_READ_KEY_EXACT) ? COL_NEG_INF : COL_POS_INF;
        pack_key(&left_key, keynr, key_buff, start_key->key, start_key->length, inf_byte);
        pleft_key = &left_key;
    } else {
        pleft_key = NULL;
    }
    if (end_key) {
        uchar inf_byte = (end_key->flag == HA_READ_BEFORE_KEY) ? COL_NEG_INF : COL_POS_INF;
        pack_key(&right_key, keynr, key_buff2, end_key->key, end_key->length, inf_byte);
        pright_key = &right_key;
    } else {
        pright_key = NULL;
    }
    // keys_range64 cannot handle a degenerate range (left_key > right_key), so we filter that out here
    if (pleft_key && pright_key && tokudb_cmp_dbt_key(kfile, pleft_key, pright_key) > 0) {
        rows = 0;
    } else {
        uint64_t less, equal1, middle, equal2, greater;
        bool is_exact;
        error = kfile->keys_range64(kfile, transaction, pleft_key, pright_key,
                                    &less, &equal1, &middle, &equal2, &greater, &is_exact);
        if (error) {
            ret_val = HA_TOKUDB_RANGE_COUNT;
            goto cleanup;
        }
        rows = middle;
    }

    // MySQL thinks a return value of 0 means there are exactly 0 rows
    // Therefore, always return non-zero so this assumption is not made
    ret_val = (ha_rows) (rows <= 1 ? 1 : rows);

cleanup:
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_RETURN,
        "return %" PRIu64 " %" PRIu64,
        (uint64_t)ret_val,
        rows);
    DBUG_RETURN(ret_val);
}
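
// A sketch of the keys_range64 contract this function relies on, with
// hypothetical packed endpoints L and R (names are illustrative):
//
//    uint64_t less, equal1, middle, equal2, greater;
//    bool is_exact;
//    int r = kfile->keys_range64(kfile, txn, &L, &R,
//                                &less, &equal1, &middle, &equal2,
//                                &greater, &is_exact);
//    // less    : estimated keys < L         equal1 : keys == L
//    // middle  : keys strictly between      equal2 : keys == R
//    // greater : keys > R                   is_exact: counts are exact, not estimates
//
// Because the endpoints are packed with explicit infinity bytes, no stored
// key compares equal to them, so "middle" covers the whole requested range.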


//
// Initializes the auto-increment data in the local "share" object to the
// greater of two values: what's stored in the metadata or the last inserted
// auto-increment field (if auto-increment field is the first field of a key).
//
void ha_tokudb::init_auto_increment() {
    int error;
    DB_TXN* txn = NULL;

    error = txn_begin(db_env, 0, &txn, 0, ha_thd());
    if (error) {
        share->last_auto_increment = 0;
    } else {
        HA_METADATA_KEY key_val;
        DBT key;
        memset(&key, 0, sizeof(key));
        key.data = &key_val;
        key.size = sizeof(key_val);
        DBT value;
        memset(&value, 0, sizeof(value));
        value.flags = DB_DBT_USERMEM;

        // Retrieve the initial auto-increment value, as specified by the
        // create table statement; if a user does
        // "create table t1 (a int auto_increment, primary key (a)) auto_increment=100",
        // then the value 100 should be stored here
        key_val = hatoku_ai_create_value;
        value.ulen = sizeof(share->auto_inc_create_value);
        value.data = &share->auto_inc_create_value;
        error = share->status_block->get(share->status_block, txn, &key, &value, 0);

        if (error || value.size != sizeof(share->auto_inc_create_value)) {
            share->auto_inc_create_value = 0;
        }

        // Retrieve hatoku_max_ai, the largest value used by the auto-increment
        // column so far. That value could have been auto generated (e.g. insert (NULL))
        // or manually inserted by the user (e.g. insert (345))
        key_val = hatoku_max_ai;
        value.ulen = sizeof(share->last_auto_increment);
        value.data = &share->last_auto_increment;
        error = share->status_block->get(share->status_block, txn, &key, &value, 0);

        if (error || value.size != sizeof(share->last_auto_increment)) {
            if (share->auto_inc_create_value)
                share->last_auto_increment = share->auto_inc_create_value - 1;
            else
                share->last_auto_increment = 0;
        }

        commit_txn(txn, 0);
    }
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_AUTO_INCREMENT,
        "init auto increment:%lld",
        share->last_auto_increment);
}
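
// A minimal sketch of the status-dictionary read pattern used above ("out"
// stands in for the caller-owned destination field): each value is fetched
// with DB_DBT_USERMEM into caller memory, and an error or size mismatch is
// treated as "value not present":
//
//    HA_METADATA_KEY key_val = hatoku_max_ai;
//    DBT key, value;
//    memset(&key, 0, sizeof(key));
//    memset(&value, 0, sizeof(value));
//    key.data = &key_val;    key.size = sizeof(key_val);
//    value.flags = DB_DBT_USERMEM;
//    value.data = &out;      value.ulen = sizeof(out);
//    int r = status_block->get(status_block, txn, &key, &value, 0);
//    if (r || value.size != sizeof(out)) { /* fall back to a default */ }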

void ha_tokudb::get_auto_increment(
    ulonglong offset,
    ulonglong increment,
    ulonglong nb_desired_values,
    ulonglong* first_value,
    ulonglong* nb_reserved_values) {

    TOKUDB_HANDLER_DBUG_ENTER("");
    ulonglong nr;
    bool over;

    if (table->s->next_number_key_offset)
    {
        handler::get_auto_increment(offset, increment, nb_desired_values, first_value, nb_reserved_values);
        DBUG_VOID_RETURN;
    }

    share->lock();

    if (share->auto_inc_create_value > share->last_auto_increment) {
        nr = share->auto_inc_create_value;
        over = false;
        share->last_auto_increment = share->auto_inc_create_value;
    } else {
        nr = share->last_auto_increment + increment;
        over = nr < share->last_auto_increment;
        if (over)
            nr = ULONGLONG_MAX;
    }
    if (!over) {
        share->last_auto_increment = nr + (nb_desired_values - 1)*increment;
        if (delay_updating_ai_metadata) {
            ai_metadata_update_required = true;
        } else {
            update_max_auto_inc(
                share->status_block,
                share->last_auto_increment);
        }
    }
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_AUTO_INCREMENT,
        "get_auto_increment(%lld,%lld,%lld): got:%lld:%lld",
        offset,
        increment,
        nb_desired_values,
        nr,
        nb_desired_values);
    *first_value = nr;
    *nb_reserved_values = nb_desired_values;
    share->unlock();
    TOKUDB_HANDLER_DBUG_VOID_RETURN;
}
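
// A worked example of the reservation arithmetic above (hypothetical values):
// with share->last_auto_increment = 100, increment = 5, nb_desired_values = 3,
//
//    nr = 100 + 5 = 105                                // first value returned
//    last_auto_increment = 105 + (3 - 1) * 5 = 115     // end of reservation
//
// so the caller receives 105, 110, 115 and the next reservation starts
// after 115.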

bool ha_tokudb::is_optimize_blocking() {
    return false;
}

bool ha_tokudb::is_auto_inc_singleton(){
    return false;
}


//
// Adds indexes to the table. Takes the array of KEY passed in key_info, and creates
// DB's that will go at the end of share->key_file. THE IMPLICIT ASSUMPTION HERE is
// that the table will be modified and that these added keys will be appended to the end
// of the array table->key_info
// Parameters:
//      [in]    table_arg - table that is being modified, seems to be identical to this->table
//      [in]    key_info - array of KEY's to be added
//      num_of_keys - number of keys to be added, number of elements in key_info
// Returns:
//      0 on success, error otherwise
//
int ha_tokudb::tokudb_add_index(
    TABLE* table_arg,
    KEY* key_info,
    uint num_of_keys,
    DB_TXN* txn,
    bool* inc_num_DBs,
    bool* modified_DBs) {

    TOKUDB_HANDLER_DBUG_ENTER("");
    assert_always(txn);

    int error;
    uint curr_index = 0;
    DBC* tmp_cursor = NULL;
    int cursor_ret_val = 0;
    DBT curr_pk_key, curr_pk_val;
    THD* thd = ha_thd();
    DB_LOADER* loader = NULL;
    DB_INDEXER* indexer = NULL;
    bool loader_save_space = tokudb::sysvars::load_save_space(thd);
    bool use_hot_index = (lock.type == TL_WRITE_ALLOW_WRITE);
    uint32_t loader_flags = loader_save_space ? LOADER_COMPRESS_INTERMEDIATES : 0;
    uint32_t indexer_flags = 0;
    uint32_t mult_db_flags[MAX_KEY + 1] = {0};
    uint32_t mult_put_flags[MAX_KEY + 1];
    uint32_t mult_dbt_flags[MAX_KEY + 1];
    bool creating_hot_index = false;
    struct loader_context lc;
    memset(&lc, 0, sizeof lc);
    lc.thd = thd;
    lc.ha = this;
    loader_error = 0;
    bool rw_lock_taken = false;
    *inc_num_DBs = false;
    *modified_DBs = false;
    invalidate_bulk_fetch();
    unpack_entire_row = true; // for bulk fetching rows
    for (uint32_t i = 0; i < MAX_KEY+1; i++) {
        mult_put_flags[i] = 0;
        mult_dbt_flags[i] = DB_DBT_REALLOC;
    }
    //
    // number of DB files we have open currently, before add_index is executed
    //
    uint curr_num_DBs = table_arg->s->keys + tokudb_test(hidden_primary_key);

    //
    // get the row type to use for the indexes we're adding
    //
    toku_compression_method compression_method =
        get_compression_method(share->file);

    //
    // status message to be shown in "show process list"
    //
    const char *orig_proc_info = tokudb_thd_get_proc_info(thd);
    // buffer of 200 should be a good upper bound.
    char status_msg[MAX_ALIAS_NAME + 200];
    // variable that stores number of elements inserted thus far
    ulonglong num_processed = 0;
    thd_proc_info(thd, "Adding indexes");

    // zero the DBTs that the bulk-fetch callback fills in with primary rows
    memset((void *) &curr_pk_key, 0, sizeof(curr_pk_key));
    memset((void *) &curr_pk_val, 0, sizeof(curr_pk_val));

    //
    // The files for secondary tables are derived from the names of keys.
    // If we try to add a key with the same name as an already existing key,
    // we can crash. So here we check whether any of the added keys has the
    // same name as an existing key, and if so, we fail gracefully.
    //
    for (uint i = 0; i < num_of_keys; i++) {
        for (uint j = 0; j < table_arg->s->keys; j++) {
            if (strcmp(key_info[i].name.str,
                       table_arg->s->key_info[j].name.str) == 0) {
                error = HA_ERR_WRONG_COMMAND;
                goto cleanup;
            }
        }
    }

    rwlock_t_lock_write(share->_num_DBs_lock);
    rw_lock_taken = true;
    //
    // open all the DB files and set the appropriate variables in share
    // they go to the end of share->key_file
    //
    creating_hot_index =
        use_hot_index && num_of_keys == 1 &&
        (key_info[0].flags & HA_NOSAME) == 0;
    if (use_hot_index && (share->num_DBs > curr_num_DBs)) {
        //
        // already have hot index in progress, get out
        //
        error = HA_ERR_INTERNAL_ERROR;
        goto cleanup;
    }
    curr_index = curr_num_DBs;
    *modified_DBs = true;
    for (uint i = 0; i < num_of_keys; i++, curr_index++) {
        if (key_is_clustering(&key_info[i])) {
            set_key_filter(
                &share->kc_info.key_filters[curr_index],
                &key_info[i],
                table_arg,
                false);
            if (!hidden_primary_key) {
                set_key_filter(
                    &share->kc_info.key_filters[curr_index],
                    &table_arg->key_info[primary_key],
                    table_arg,
                    false);
            }

            error = initialize_col_pack_info(
                &share->kc_info,
                table_arg->s,
                curr_index);
            if (error) {
                goto cleanup;
            }
        }


        error = create_secondary_dictionary(
            share->full_table_name(),
            table_arg,
            &key_info[i],
            txn,
            &share->kc_info,
            curr_index,
            creating_hot_index,
            compression_method);
        if (error) {
            goto cleanup;
        }

        error = open_secondary_dictionary(
            &share->key_file[curr_index],
            &key_info[i],
            share->full_table_name(),
            false,
            txn);
        if (error) {
            goto cleanup;
        }
    }

    if (creating_hot_index) {
        share->num_DBs++;
        *inc_num_DBs = true;
        error = db_env->create_indexer(
            db_env,
            txn,
            &indexer,
            share->file,
            num_of_keys,
            &share->key_file[curr_num_DBs],
            mult_db_flags,
            indexer_flags);
        if (error) {
            goto cleanup;
        }

        error = indexer->set_poll_function(
            indexer, ha_tokudb::tokudb_add_index_poll, &lc);
        if (error) {
            goto cleanup;
        }

        error = indexer->set_error_callback(
            indexer, ha_tokudb::loader_add_index_err, &lc);
        if (error) {
            goto cleanup;
        }

        share->_num_DBs_lock.unlock();
        rw_lock_taken = false;

#ifdef HA_TOKUDB_HAS_THD_PROGRESS
        // initialize a one phase progress report.
        // incremental reports are done in the indexer's callback function.
        thd_progress_init(thd, 1);
#endif

        error = indexer->build(indexer);

        if (error) {
            goto cleanup;
        }

        rwlock_t_lock_write(share->_num_DBs_lock);
        error = indexer->close(indexer);
        share->_num_DBs_lock.unlock();
        if (error) {
            goto cleanup;
        }
        indexer = NULL;
    } else {
        DBUG_ASSERT(table->mdl_ticket->get_type() >= MDL_SHARED_NO_WRITE);
        share->_num_DBs_lock.unlock();
        rw_lock_taken = false;
        prelocked_right_range_size = 0;
        prelocked_left_range_size = 0;
        struct smart_dbt_bf_info bf_info;
        bf_info.ha = this;
        bf_info.direction = 1;
        bf_info.thd = ha_thd();
        // the val is needed if there is a clustering index and key_read is not 0
        bf_info.need_val = true;
        bf_info.key_to_compare = NULL;

        error = db_env->create_loader(
            db_env,
            txn,
            &loader,
            NULL, // no src_db needed
            num_of_keys,
            &share->key_file[curr_num_DBs],
            mult_put_flags,
            mult_dbt_flags,
            loader_flags);
        if (error) {
            goto cleanup;
        }

        error =
            loader->set_poll_function(loader, ha_tokudb::bulk_insert_poll, &lc);
        if (error) {
            goto cleanup;
        }

        error = loader->set_error_callback(
            loader, ha_tokudb::loader_add_index_err, &lc);
        if (error) {
            goto cleanup;
        }
        //
        // scan primary table, create each secondary key, add to each DB
        //
        error = share->file->cursor(
            share->file,
            txn,
            &tmp_cursor,
            DB_SERIALIZABLE);
        if (error) {
            tmp_cursor = NULL; // Safety
            goto cleanup;
        }

        //
        // grab some locks to make this go faster
        // first a global read lock on the main DB, because
        // we intend to scan the entire thing
        //
        error = tmp_cursor->c_set_bounds(
            tmp_cursor,
            share->file->dbt_neg_infty(),
            share->file->dbt_pos_infty(),
            true,
            0);
        if (error) {
            goto cleanup;
        }

        // set the bulk fetch iteration to its max so that adding an
        // index fills the bulk fetch buffer every time. we do not
        // want it to grow exponentially fast.
        rows_fetched_using_bulk_fetch = 0;
        bulk_fetch_iteration = HA_TOKU_BULK_FETCH_ITERATION_MAX;
        cursor_ret_val = tmp_cursor->c_getf_next(
            tmp_cursor,
            DB_PRELOCKED,
            smart_dbt_bf_callback,
            &bf_info);

#ifdef HA_TOKUDB_HAS_THD_PROGRESS
        // initialize a two phase progress report.
        // first phase: putting rows into the loader
        thd_progress_init(thd, 2);
#endif

        while (cursor_ret_val != DB_NOTFOUND ||
               ((bytes_used_in_range_query_buff -
                 curr_range_query_buff_offset) > 0)) {
            if ((bytes_used_in_range_query_buff -
                 curr_range_query_buff_offset) == 0) {
                invalidate_bulk_fetch(); // reset the buffers
                cursor_ret_val = tmp_cursor->c_getf_next(
                    tmp_cursor,
                    DB_PRELOCKED,
                    smart_dbt_bf_callback,
                    &bf_info);
                if (cursor_ret_val != DB_NOTFOUND && cursor_ret_val != 0) {
                    error = cursor_ret_val;
                    goto cleanup;
                }
            }
            // do this check in case the c_getf_next did not put anything
            // into the buffer because there was no more data
            if ((bytes_used_in_range_query_buff -
                 curr_range_query_buff_offset) == 0) {
                break;
            }
            // at this point, we know the range query buffer has at least one
            // key/val pair
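            // The buffer serializes pairs back-to-back; each entry is a
            // native-endian 4-byte length followed by that many bytes:
            //
            //    [key_size][key bytes][val_size][val bytes] ...
            //
            // so parsing just walks curr_pos forward through the buffer.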
            uchar* curr_pos = range_query_buff+curr_range_query_buff_offset;

            uint32_t key_size = *(uint32_t *)curr_pos;
            curr_pos += sizeof(key_size);
            uchar* curr_key_buff = curr_pos;
            curr_pos += key_size;
            curr_pk_key.data = curr_key_buff;
            curr_pk_key.size = key_size;

            uint32_t val_size = *(uint32_t *)curr_pos;
            curr_pos += sizeof(val_size);
            uchar* curr_val_buff = curr_pos;
            curr_pos += val_size;
            curr_pk_val.data = curr_val_buff;
            curr_pk_val.size = val_size;

            curr_range_query_buff_offset = curr_pos - range_query_buff;

            error = loader->put(loader, &curr_pk_key, &curr_pk_val);
            if (error) {
                goto cleanup;
            }

            num_processed++;

            if ((num_processed % 1000) == 0) {
                sprintf(
                    status_msg,
                    "Adding indexes: Fetched %llu of about %llu rows, loading "
                    "of data still remains.",
                    num_processed,
                    (long long unsigned)share->row_count());
                thd_proc_info(thd, status_msg);

#ifdef HA_TOKUDB_HAS_THD_PROGRESS
                thd_progress_report(
                    thd,
                    num_processed,
                    (long long unsigned)share->row_count());
#endif

                if (thd_kill_level(thd)) {
                    error = ER_ABORTING_CONNECTION;
                    goto cleanup;
                }
            }
        }
        error = tmp_cursor->c_close(tmp_cursor);
        assert_always(error==0);
        tmp_cursor = NULL;

#ifdef HA_TOKUDB_HAS_THD_PROGRESS
        // next progress report phase: closing the loader.
        // incremental reports are done in the loader's callback function.
        thd_progress_next_stage(thd);
#endif

        error = loader->close(loader);
        loader = NULL;

        if (error) goto cleanup;
    }
    curr_index = curr_num_DBs;
    for (uint i = 0; i < num_of_keys; i++, curr_index++) {
        if (key_info[i].flags & HA_NOSAME) {
            bool is_unique;
            error = is_index_unique(
                &is_unique,
                txn,
                share->key_file[curr_index],
                &key_info[i],
                creating_hot_index ? 0 : DB_PRELOCKED_WRITE);
            if (error)
                goto cleanup;
            if (!is_unique) {
                error = HA_ERR_FOUND_DUPP_KEY;
                last_dup_key = i;
                goto cleanup;
            }
        }
    }

    share->lock();
    //
    // We have an accurate row count, might as well update share->rows
    //
    if (!creating_hot_index) {
        share->set_row_count(num_processed, true);
    }
    //
    // now write stuff to status.tokudb
    //
    for (uint i = 0; i < num_of_keys; i++) {
        write_key_name_to_status(share->status_block, key_info[i].name.str, txn);
    }
    share->unlock();

    error = 0;
cleanup:
#ifdef HA_TOKUDB_HAS_THD_PROGRESS
    thd_progress_end(thd);
#endif
    if (rw_lock_taken) {
        share->_num_DBs_lock.unlock();
        rw_lock_taken = false;
    }
    if (tmp_cursor) {
        int r = tmp_cursor->c_close(tmp_cursor);
        assert_always(r==0);
        tmp_cursor = NULL;
    }
    if (loader != NULL) {
        sprintf(status_msg, "aborting creation of indexes.");
        thd_proc_info(thd, status_msg);
        loader->abort(loader);
    }
    if (indexer != NULL) {
        sprintf(status_msg, "aborting creation of indexes.");
        thd_proc_info(thd, status_msg);
        rwlock_t_lock_write(share->_num_DBs_lock);
        indexer->abort(indexer);
        share->_num_DBs_lock.unlock();
    }
    if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
        error == DB_LOCK_NOTGRANTED) {
        sql_print_error(
            "Could not add indexes to table %s because another transaction has "
            "accessed the table. To add indexes, make sure no transactions "
            "touch the table.",
            share->full_table_name());
    }
    thd_proc_info(thd, orig_proc_info);
    TOKUDB_HANDLER_DBUG_RETURN(error ? error : loader_error);
}

int ha_tokudb::tokudb_add_index_poll(void* extra, float progress) {
    LOADER_CONTEXT context = (LOADER_CONTEXT)extra;
    if (thd_killed(context->thd)) {
        sprintf(context->write_status_msg,
                "The process has been killed, aborting add index.");
        return ER_ABORTING_CONNECTION;
    }
    float percentage = progress * 100;
    sprintf(context->write_status_msg,
            "Adding of indexes to %s about %.1f%% done",
            context->ha->share->full_table_name(),
            percentage);
    thd_proc_info(context->thd, context->write_status_msg);
#ifdef HA_TOKUDB_HAS_THD_PROGRESS
    thd_progress_report(context->thd, (unsigned long long)percentage, 100);
#endif
    return 0;
}

//
// Internal function called by ha_tokudb::add_index and ha_tokudb::alter_table_phase2
// Closes the added indexes in the error path of add_index and alter_table_phase2
//
void ha_tokudb::restore_add_index(
    TABLE* table_arg,
    uint num_of_keys,
    bool incremented_numDBs,
    bool modified_DBs) {

    uint curr_num_DBs = table_arg->s->keys + tokudb_test(hidden_primary_key);
    uint curr_index = 0;

    //
    // need to restore num_DBs, and we have to do it before we close the dictionaries
    // so that there is not a window
    //
    if (incremented_numDBs) {
        rwlock_t_lock_write(share->_num_DBs_lock);
        share->num_DBs--;
    }
    if (modified_DBs) {
        curr_index = curr_num_DBs;
        for (uint i = 0; i < num_of_keys; i++, curr_index++) {
            reset_key_and_col_info(&share->kc_info, curr_index);
        }
        curr_index = curr_num_DBs;
        for (uint i = 0; i < num_of_keys; i++, curr_index++) {
            if (share->key_file[curr_index]) {
                int r = share->key_file[curr_index]->close(
                    share->key_file[curr_index],
                    0);
                assert_always(r==0);
                share->key_file[curr_index] = NULL;
            }
        }
    }
    if (incremented_numDBs) {
        share->_num_DBs_lock.unlock();
    }
}

//
// Internal function called by ha_tokudb::prepare_drop_index and ha_tokudb::alter_table_phase2
// With a transaction, drops dictionaries associated with indexes in key_num
//
int ha_tokudb::drop_indexes(
    TABLE* table_arg,
    uint* key_num,
    uint num_of_keys,
    KEY* key_info,
    DB_TXN* txn) {

    TOKUDB_HANDLER_DBUG_ENTER("");
    assert_always(txn);

    int error = 0;
    for (uint i = 0; i < num_of_keys; i++) {
        uint curr_index = key_num[i];
        error = share->key_file[curr_index]->pre_acquire_fileops_lock(
            share->key_file[curr_index],
            txn);
        if (error != 0) {
            goto cleanup;
        }
    }
    for (uint i = 0; i < num_of_keys; i++) {
        uint curr_index = key_num[i];
        int r = share->key_file[curr_index]->close(share->key_file[curr_index],0);
        assert_always(r==0);
        share->key_file[curr_index] = NULL;

        error = remove_key_name_from_status(
            share->status_block,
            key_info[curr_index].name.str,
            txn);
        if (error) {
            goto cleanup;
        }

        error = delete_or_rename_dictionary(
            share->full_table_name(),
            NULL,
            key_info[curr_index].name.str,
            true,
            txn,
            true);
        if (error) {
            goto cleanup;
        }
    }

cleanup:
    if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
        error == DB_LOCK_NOTGRANTED) {
        sql_print_error(
            "Could not drop indexes from table %s because another transaction "
            "has accessed the table. To drop indexes, make sure no "
            "transactions touch the table.",
            share->full_table_name());
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}

//
// Internal function called by ha_tokudb::prepare_drop_index and
// ha_tokudb::alter_table_phase2
// Restores dropped indexes in the error path of
// prepare_drop_index and alter_table_phase2
//
void ha_tokudb::restore_drop_indexes(
    TABLE* table_arg,
    uint* key_num,
    uint num_of_keys) {

    //
    // reopen closed dictionaries
    //
    for (uint i = 0; i < num_of_keys; i++) {
        int r;
        uint curr_index = key_num[i];
        if (share->key_file[curr_index] == NULL) {
            r = open_secondary_dictionary(
                &share->key_file[curr_index],
                &table_share->key_info[curr_index],
                share->full_table_name(),
                false,
                NULL);
            assert_always(!r);
        }
    }
}

int ha_tokudb::map_to_handler_error(int error) {
    switch (error) {
    case DB_LOCK_DEADLOCK:
        error = HA_ERR_LOCK_DEADLOCK;
        break;
    case DB_LOCK_NOTGRANTED:
        error = HA_ERR_LOCK_WAIT_TIMEOUT;
        break;
#if defined(HA_ERR_DISK_FULL)
    case ENOSPC:
        error = HA_ERR_DISK_FULL;
        break;
#endif
    case DB_KEYEXIST:
        error = HA_ERR_FOUND_DUPP_KEY;
        break;
#if defined(HA_ALTER_ERROR)
    case HA_ALTER_ERROR:
        error = HA_ERR_UNSUPPORTED;
        break;
#endif
    case TOKUDB_INTERRUPTED:
        error = ER_QUERY_INTERRUPTED;
        break;
    case TOKUDB_OUT_OF_LOCKS:
        error = HA_ERR_LOCK_TABLE_FULL;
        break;
    }
    return error;
}
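
// A minimal usage sketch (the call site is hypothetical): engine-level error
// codes are remapped just before they surface to the handler layer, e.g.
//
//    int r = share->file->get(share->file, txn, &key, &val, 0);
//    if (r != 0)
//        return map_to_handler_error(r);  // DB_LOCK_DEADLOCK -> HA_ERR_LOCK_DEADLOCK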

void ha_tokudb::print_error(int error, myf errflag) {
    error = map_to_handler_error(error);
    handler::print_error(error, errflag);
}

//
// truncates the dictionary associated with the keynr index using transaction txn;
// does so by deleting and then recreating the dictionary in the context
// of the transaction
//
int ha_tokudb::truncate_dictionary(uint keynr, DB_TXN* txn) {
    int error;
    bool is_pk = (keynr == primary_key);

    toku_compression_method compression_method =
        get_compression_method(share->key_file[keynr]);
    error = share->key_file[keynr]->close(share->key_file[keynr], 0);
    assert_always(error == 0);

    share->key_file[keynr] = NULL;
    if (is_pk) {
        share->file = NULL;
    }

    if (is_pk) {
        error = delete_or_rename_dictionary(
            share->full_table_name(),
            NULL,
            "main",
            false, //is_key
            txn,
            true); // is a delete
        if (error) {
            goto cleanup;
        }
    } else {
        error = delete_or_rename_dictionary(
            share->full_table_name(),
            NULL,
            table_share->key_info[keynr].name.str,
            true, //is_key
            txn,
            true); // is a delete
        if (error) {
            goto cleanup;
        }
    }

    if (is_pk) {
        error = create_main_dictionary(
            share->full_table_name(),
            table,
            txn,
            &share->kc_info,
            compression_method);
    } else {
        error = create_secondary_dictionary(
            share->full_table_name(),
            table,
            &table_share->key_info[keynr],
            txn,
            &share->kc_info,
            keynr,
            false,
            compression_method);
    }
    if (error) {
        goto cleanup;
    }

cleanup:
    return error;
}
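
// Note that on both success and failure the handles closed above are left
// NULL; callers such as delete_all_rows_internal() below are responsible for
// reopening share->key_file[keynr] (and share->file for the primary key).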

// for 5.5
int ha_tokudb::truncate() {
    TOKUDB_HANDLER_DBUG_ENTER("");
    int error = delete_all_rows_internal();
    TOKUDB_HANDLER_DBUG_RETURN(error);
}

// delete all rows from a table
//
// effects: delete all of the rows in the main dictionary and all of the
// indices. this must be atomic, so we use the statement transaction
// for all of the truncate operations.
// locks: if we have an exclusive table write lock, all of the concurrency
// issues go away.
// returns: 0 if success
int ha_tokudb::delete_all_rows() {
    TOKUDB_HANDLER_DBUG_ENTER("");
    int error = 0;
    if (thd_sql_command(ha_thd()) != SQLCOM_TRUNCATE) {
        share->try_table_lock = true;
        error = HA_ERR_WRONG_COMMAND;
    }
    if (error == 0)
        error = delete_all_rows_internal();
    TOKUDB_HANDLER_DBUG_RETURN(error);
}

int ha_tokudb::delete_all_rows_internal() {
    TOKUDB_HANDLER_DBUG_ENTER("");
    int error = 0;
    uint curr_num_DBs = 0;
    DB_TXN* txn = NULL;

    // this should be enough to handle locking as the higher level MDL
    // on this table should prevent any new analyze tasks.
    share->cancel_background_jobs();

    error = txn_begin(db_env, 0, &txn, 0, ha_thd());
    if (error) {
        goto cleanup;
    }

    curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
    for (uint i = 0; i < curr_num_DBs; i++) {
        error = share->key_file[i]->pre_acquire_fileops_lock(
            share->key_file[i],
            txn);
        if (error) {
            goto cleanup;
        }
        error = share->key_file[i]->pre_acquire_table_lock(
            share->key_file[i],
            txn);
        if (error) {
            goto cleanup;
        }
    }
    for (uint i = 0; i < curr_num_DBs; i++) {
        error = truncate_dictionary(i, txn);
        if (error) {
            goto cleanup;
        }
    }

    DEBUG_SYNC(ha_thd(), "tokudb_after_truncate_all_dictionarys");

    // zap the row count
    if (error == 0) {
        share->set_row_count(0, false);
        // update auto increment
        share->last_auto_increment = 0;
        // calling write_to_status directly because we need to use txn
        write_to_status(
            share->status_block,
            hatoku_max_ai,
            &share->last_auto_increment,
            sizeof(share->last_auto_increment),
            txn);
    }

    share->try_table_lock = true;
cleanup:
    if (txn) {
        if (error) {
            abort_txn(txn);
        } else {
            commit_txn(txn,0);
        }
    }

    if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(
            TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
        error == DB_LOCK_NOTGRANTED) {
        sql_print_error(
            "Could not truncate table %s because another transaction has "
            "accessed the table. To truncate the table, make sure no "
            "transactions touch the table.",
            share->full_table_name());
    }
    //
    // regardless of errors, need to reopen the DB's
    //
    for (uint i = 0; i < curr_num_DBs; i++) {
        int r = 0;
        if (share->key_file[i] == NULL) {
            if (i != primary_key) {
                r = open_secondary_dictionary(
                    &share->key_file[i],
                    &table_share->key_info[i],
                    share->full_table_name(),
                    false,
                    NULL);
                assert_always(!r);
            } else {
                r = open_main_dictionary(
                    share->full_table_name(),
                    false,
                    NULL);
                assert_always(!r);
            }
        }
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}

void ha_tokudb::set_loader_error(int err) {
    loader_error = err;
}

void ha_tokudb::set_dup_value_for_pk(DBT* key) {
    assert_always(!hidden_primary_key);
    unpack_key(table->record[0], key, primary_key);
    last_dup_key = primary_key;
}

void ha_tokudb::close_dsmrr() {
#ifdef MARIADB_BASE_VERSION
    ds_mrr.dsmrr_close();
#elif 50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699
    ds_mrr.dsmrr_close();
#endif
}

void ha_tokudb::reset_dsmrr() {
#ifdef MARIADB_BASE_VERSION
    ds_mrr.dsmrr_close();
#elif 50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699
    ds_mrr.reset();
#endif
}

// we cache the information so we can do the filtering ourselves,
// but as far as MySQL knows, we are not doing any filtering,
// so if we happen to miss filtering a row that does not match
// idx_cond_arg, MySQL will catch it.
// This lets us deal with only index_next and index_prev,
// without worrying about the other index_XXX functions
Item* ha_tokudb::idx_cond_push(uint keyno_arg, Item* idx_cond_arg) {
    toku_pushed_idx_cond_keyno = keyno_arg;
    toku_pushed_idx_cond = idx_cond_arg;
    return idx_cond_arg;
}
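
// Returning idx_cond_arg (rather than NULL) tells the server that the engine
// does not promise to evaluate the pushed condition completely, so the rows
// we return are re-checked above us; returning NULL would claim we filter
// every row ourselves.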

void ha_tokudb::cancel_pushed_idx_cond() {
    invalidate_icp();
    handler::cancel_pushed_idx_cond();
}

void ha_tokudb::cleanup_txn(DB_TXN *txn) {
    if (transaction == txn && cursor) {
        int r = cursor->c_close(cursor);
        assert_always(r == 0);
        cursor = NULL;
    }
}

void ha_tokudb::add_to_trx_handler_list() {
    tokudb_trx_data* trx =
        (tokudb_trx_data*)thd_get_ha_data(ha_thd(), tokudb_hton);
    trx->handlers = list_add(trx->handlers, &trx_handler_list);
}

void ha_tokudb::remove_from_trx_handler_list() {
    tokudb_trx_data* trx =
        (tokudb_trx_data*)thd_get_ha_data(ha_thd(), tokudb_hton);
    trx->handlers = list_delete(trx->handlers, &trx_handler_list);
}

void ha_tokudb::rpl_before_write_rows() {
    in_rpl_write_rows = true;
}

void ha_tokudb::rpl_after_write_rows() {
    in_rpl_write_rows = false;
}

void ha_tokudb::rpl_before_delete_rows() {
    in_rpl_delete_rows = true;
}

void ha_tokudb::rpl_after_delete_rows() {
    in_rpl_delete_rows = false;
}

void ha_tokudb::rpl_before_update_rows() {
    in_rpl_update_rows = true;
}

void ha_tokudb::rpl_after_update_rows() {
    in_rpl_update_rows = false;
}

bool ha_tokudb::rpl_lookup_rows() {
    if (!in_rpl_delete_rows && !in_rpl_update_rows)
        return true;
    else
        return tokudb::sysvars::rpl_lookup_rows(ha_thd());
}

// table admin
#include "ha_tokudb_admin.cc"

// update functions
#include "tokudb_update_fun.cc"

// fast updates
#include "ha_tokudb_update.cc"

// alter table code for various mysql distros
#include "ha_tokudb_alter_55.cc"
#include "ha_tokudb_alter_56.cc"

// mrr
#ifdef MARIADB_BASE_VERSION
#include "ha_tokudb_mrr_maria.cc"
#elif 50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699
#include "ha_tokudb_mrr_mysql.cc"
#endif

// key comparisons
#include "hatoku_cmp.cc"

// handlerton
#include "hatoku_hton.cc"

// generate template functions
namespace tokudb {
    template size_t vlq_encode_ui(uint32_t n, void *p, size_t s);
    template size_t vlq_decode_ui(uint32_t *np, void *p, size_t s);
    template size_t vlq_encode_ui(uint64_t n, void *p, size_t s);
    template size_t vlq_decode_ui(uint64_t *np, void *p, size_t s);
}  // namespace tokudb
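
// A minimal round-trip sketch for the instantiations above (buffer size is
// illustrative; a return of 0 is assumed to signal an undersized buffer):
//
//    uchar buf[10];
//    size_t n = tokudb::vlq_encode_ui<uint64_t>(300, buf, sizeof buf);
//    uint64_t v;
//    size_t m = tokudb::vlq_decode_ui<uint64_t>(&v, buf, n);
//    // on success, n == m and v == 300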