| 1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ |
| 2 | // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: |
| 3 | #ident "$Id$" |
| 4 | /*====== |
| 5 | This file is part of PerconaFT. |
| 6 | |
| 7 | |
| 8 | Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. |
| 9 | |
| 10 | PerconaFT is free software: you can redistribute it and/or modify |
| 11 | it under the terms of the GNU General Public License, version 2, |
| 12 | as published by the Free Software Foundation. |
| 13 | |
| 14 | PerconaFT is distributed in the hope that it will be useful, |
| 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 17 | GNU General Public License for more details. |
| 18 | |
| 19 | You should have received a copy of the GNU General Public License |
| 20 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
| 21 | |
| 22 | ---------------------------------------- |
| 23 | |
| 24 | PerconaFT is free software: you can redistribute it and/or modify |
| 25 | it under the terms of the GNU Affero General Public License, version 3, |
| 26 | as published by the Free Software Foundation. |
| 27 | |
| 28 | PerconaFT is distributed in the hope that it will be useful, |
| 29 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 30 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 31 | GNU Affero General Public License for more details. |
| 32 | |
| 33 | You should have received a copy of the GNU Affero General Public License |
| 34 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
| 35 | ======= */ |
| 36 | |
| 37 | #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." |
| 38 | |
| 39 | #include <my_global.h> |
| 40 | #include <ctype.h> |
| 41 | |
| 42 | #include <db.h> |
| 43 | #include <locktree/locktree.h> |
| 44 | #include <ft/ft.h> |
| 45 | #include <ft/ft-flusher.h> |
| 46 | #include <ft/cachetable/checkpoint.h> |
| 47 | |
| 48 | #include "ydb_cursor.h" |
| 49 | #include "ydb_row_lock.h" |
| 50 | #include "ydb_db.h" |
| 51 | #include "ydb_write.h" |
| 52 | #include "ydb-internal.h" |
| 53 | #include "ydb_load.h" |
| 54 | #include "indexer.h" |
| 55 | #include <portability/toku_atomic.h> |
| 56 | #include <util/status.h> |
| 57 | #include <ft/le-cursor.h> |
| 58 | |
| 59 | static YDB_DB_LAYER_STATUS_S ydb_db_layer_status; |
| 60 | #ifdef STATUS_VALUE |
| 61 | #undef STATUS_VALUE |
| 62 | #endif |
| 63 | #define STATUS_VALUE(x) ydb_db_layer_status.status[x].value.num |
| 64 | |
| 65 | #define STATUS_INIT(k,c,t,l,inc) TOKUFT_STATUS_INIT(ydb_db_layer_status, k, c, t, l, inc) |
| 66 | |
| 67 | static void |
| 68 | ydb_db_layer_status_init (void) { |
| 69 | // Note, this function initializes the keyname, type, and legend fields. |
| 70 | // Value fields are initialized to zero by compiler. |
| 71 | |
| 72 | STATUS_INIT(YDB_LAYER_DIRECTORY_WRITE_LOCKS, nullptr, UINT64, "directory write locks" , TOKU_ENGINE_STATUS); |
| 73 | STATUS_INIT(YDB_LAYER_DIRECTORY_WRITE_LOCKS_FAIL, nullptr, UINT64, "directory write locks fail" , TOKU_ENGINE_STATUS); |
| 74 | STATUS_INIT(YDB_LAYER_LOGSUPPRESS, nullptr, UINT64, "log suppress" , TOKU_ENGINE_STATUS); |
| 75 | STATUS_INIT(YDB_LAYER_LOGSUPPRESS_FAIL, nullptr, UINT64, "log suppress fail" , TOKU_ENGINE_STATUS); |
| 76 | ydb_db_layer_status.initialized = true; |
| 77 | } |
| 78 | #undef STATUS_INIT |
| 79 | |
| 80 | void |
| 81 | ydb_db_layer_get_status(YDB_DB_LAYER_STATUS statp) { |
| 82 | if (!ydb_db_layer_status.initialized) |
| 83 | ydb_db_layer_status_init(); |
| 84 | *statp = ydb_db_layer_status; |
| 85 | } |
| 86 | |
| 87 | void create_iname_hint(DB_ENV *env, const char *dname, char *hint) { |
| 88 | //Requires: size of hint array must be > strlen(dname) |
| 89 | //Copy alphanumeric characters only. |
| 90 | //Replace strings of non-alphanumeric characters with a single underscore. |
| 91 | if (env->get_dir_per_db(env) && !toku_os_is_absolute_name(dname)) { |
| 92 | assert(dname); |
| 93 | if (*dname == '.') |
| 94 | ++dname; |
| 95 | if (*dname == '/') |
| 96 | ++dname; |
| 97 | bool underscored = false; |
| 98 | bool dbdir_is_parsed = false; |
| 99 | // Do not change the first '/' because this is |
| 100 | // delimiter which splits name into database dir |
| 101 | // and table dir. |
| 102 | while (*dname) { |
| 103 | if (isalnum(*dname) || (*dname == '/' && !dbdir_is_parsed)) { |
| 104 | char c = *dname++; |
| 105 | *hint++ = c; |
| 106 | if (c == '/') |
| 107 | dbdir_is_parsed = true; |
| 108 | underscored = false; |
| 109 | } else if (!dbdir_is_parsed) { |
| 110 | char c = *dname++; |
| 111 | *hint++ = c; |
| 112 | } else { |
| 113 | if (!underscored) |
| 114 | *hint++ = '_'; |
| 115 | dname++; |
| 116 | underscored = true; |
| 117 | } |
| 118 | } |
| 119 | *hint = '\0'; |
| 120 | } else { |
| 121 | bool underscored = false; |
| 122 | while (*dname) { |
| 123 | if (isalnum(*dname)) { |
| 124 | char c = *dname++; |
| 125 | *hint++ = c; |
| 126 | underscored = false; |
| 127 | } |
| 128 | else { |
| 129 | if (!underscored) |
| 130 | *hint++ = '_'; |
| 131 | dname++; |
| 132 | underscored = true; |
| 133 | } |
| 134 | } |
| 135 | *hint = '\0'; |
| 136 | } |
| 137 | } |
| 138 | |
| 139 | // n < 0 means to ignore mark and ignore n |
| 140 | // n >= 0 means to include mark ("_B_" or "_P_") with hex value of n in iname |
| 141 | // (intended for use by loader, which will create many inames using one txnid). |
| 142 | char *create_iname(DB_ENV *env, |
| 143 | uint64_t id1, |
| 144 | uint64_t id2, |
| 145 | char *hint, |
| 146 | const char *mark, |
| 147 | int n) { |
| 148 | int bytes; |
| 149 | char inamebase[strlen(hint) + |
| 150 | 8 + // hex file format version |
| 151 | 24 + // hex id (normally the txnid's parent and child) |
| 152 | 8 + // hex value of n if non-neg |
| 153 | sizeof("_B___." ) + // extra pieces |
| 154 | strlen(toku_product_name)]; |
| 155 | if (n < 0) |
| 156 | bytes = snprintf(inamebase, sizeof(inamebase), |
| 157 | "%s_%" PRIx64 "_%" PRIx64 "_%" PRIx32 ".%s" , |
| 158 | hint, id1, id2, FT_LAYOUT_VERSION, toku_product_name); |
| 159 | else { |
| 160 | invariant(strlen(mark) == 1); |
| 161 | bytes = snprintf(inamebase, sizeof(inamebase), |
| 162 | "%s_%" PRIx64 "_%" PRIx64 "_%" PRIx32 "_%s_%" PRIx32 ".%s" , |
| 163 | hint, id1, id2, FT_LAYOUT_VERSION, mark, n, toku_product_name); |
| 164 | } |
| 165 | assert(bytes>0); |
| 166 | assert(bytes<=(int)sizeof(inamebase)-1); |
| 167 | char *rval; |
| 168 | if (env->i->data_dir) |
| 169 | rval = toku_construct_full_name(2, env->i->data_dir, inamebase); |
| 170 | else |
| 171 | rval = toku_construct_full_name(1, inamebase); |
| 172 | assert(rval); |
| 173 | return rval; |
| 174 | } |
| 175 | |
| 176 | static uint64_t nontransactional_open_id = 0; |
| 177 | |
| 178 | std::unique_ptr<char[], decltype(&toku_free)> generate_iname_for_rename_or_open( |
| 179 | DB_ENV *env, |
| 180 | DB_TXN *txn, |
| 181 | const char *dname, |
| 182 | bool is_open) { |
| 183 | std::unique_ptr<char[], decltype(&toku_free)> result(nullptr, &toku_free); |
| 184 | char hint[strlen(dname) + 1]; |
| 185 | uint64_t id1 = 0; |
| 186 | uint64_t id2 = 0; |
| 187 | |
| 188 | if (txn) { |
| 189 | id1 = toku_txn_get_txnid(db_txn_struct_i(txn)->tokutxn).parent_id64; |
| 190 | id2 = toku_txn_get_txnid(db_txn_struct_i(txn)->tokutxn).child_id64; |
| 191 | } else if (is_open) |
| 192 | id1 = toku_sync_fetch_and_add(&nontransactional_open_id, 1); |
| 193 | |
| 194 | create_iname_hint(env, dname, hint); |
| 195 | |
| 196 | result.reset(create_iname(env, id1, id2, hint, NULL, -1)); |
| 197 | |
| 198 | return result; |
| 199 | } |
| 200 | |
| 201 | static int toku_db_open(DB * db, DB_TXN * txn, const char *fname, const char *dbname, DBTYPE dbtype, uint32_t flags, int mode); |
| 202 | |
| 203 | // Effect: Do the work required of DB->close(). |
| 204 | // requires: the multi_operation client lock is held. |
| 205 | int |
| 206 | toku_db_close(DB * db) { |
| 207 | int r = 0; |
| 208 | if (db_opened(db) && db->i->dname) { |
| 209 | // internal (non-user) dictionary has no dname |
| 210 | env_note_db_closed(db->dbenv, db); // tell env that this db is no longer in use by the user of this api (user-closed, may still be in use by fractal tree internals) |
| 211 | } |
| 212 | // close the ft handle, and possibly close the locktree |
| 213 | toku_ft_handle_close(db->i->ft_handle); |
| 214 | if (db->i->lt) { |
| 215 | db->dbenv->i->ltm.release_lt(db->i->lt); |
| 216 | } |
| 217 | toku_sdbt_cleanup(&db->i->skey); |
| 218 | toku_sdbt_cleanup(&db->i->sval); |
| 219 | if (db->i->dname) { |
| 220 | toku_free(db->i->dname); |
| 221 | } |
| 222 | toku_free(db->i); |
| 223 | toku_free(db); |
| 224 | return r; |
| 225 | } |
| 226 | |
| 227 | /////////// |
| 228 | //db_getf_XXX is equivalent to c_getf_XXX, without a persistent cursor |
| 229 | |
| 230 | int |
| 231 | db_getf_set(DB *db, DB_TXN *txn, uint32_t flags, DBT *key, YDB_CALLBACK_FUNCTION f, void *) { |
| 232 | HANDLE_PANICKED_DB(db); |
| 233 | HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); |
| 234 | DBC c; |
| 235 | uint32_t create_flags = flags & (DB_ISOLATION_FLAGS | DB_RMW); |
| 236 | flags &= ~DB_ISOLATION_FLAGS; |
| 237 | int r = toku_db_cursor_internal(db, txn, &c, create_flags | DBC_DISABLE_PREFETCHING, 1); |
| 238 | if (r==0) { |
| 239 | r = toku_c_getf_set(&c, flags, key, f, extra); |
| 240 | int r2 = toku_c_close_internal(&c); |
| 241 | if (r==0) r = r2; |
| 242 | } |
| 243 | return r; |
| 244 | } |
| 245 | |
| 246 | static inline int |
| 247 | db_thread_need_flags(DBT *dbt) { |
| 248 | return (dbt->flags & (DB_DBT_MALLOC+DB_DBT_REALLOC+DB_DBT_USERMEM)) == 0; |
| 249 | } |
| 250 | |
| 251 | int |
| 252 | toku_db_get (DB * db, DB_TXN * txn, DBT * key, DBT * data, uint32_t flags) { |
| 253 | HANDLE_PANICKED_DB(db); |
| 254 | HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); |
| 255 | int r; |
| 256 | uint32_t iso_flags = flags & DB_ISOLATION_FLAGS; |
| 257 | |
| 258 | if ((db->i->open_flags & DB_THREAD) && db_thread_need_flags(data)) |
| 259 | return EINVAL; |
| 260 | |
| 261 | uint32_t lock_flags = flags & (DB_PRELOCKED | DB_PRELOCKED_WRITE); |
| 262 | flags &= ~lock_flags; |
| 263 | flags &= ~DB_ISOLATION_FLAGS; |
| 264 | // And DB_GET_BOTH is no longer supported. #2862. |
| 265 | if (flags != 0) return EINVAL; |
| 266 | |
| 267 | DBC dbc; |
| 268 | r = toku_db_cursor_internal(db, txn, &dbc, iso_flags | DBC_DISABLE_PREFETCHING, 1); |
| 269 | if (r!=0) return r; |
| 270 | uint32_t c_get_flags = DB_SET; |
| 271 | r = toku_c_get(&dbc, key, data, c_get_flags | lock_flags); |
| 272 | int r2 = toku_c_close_internal(&dbc); |
| 273 | return r ? r : r2; |
| 274 | } |
| 275 | |
| 276 | static int |
| 277 | db_open_subdb(DB * db, DB_TXN * txn, const char *fname, const char *dbname, DBTYPE dbtype, uint32_t flags, int mode) { |
| 278 | int r; |
| 279 | if (!fname || !dbname) r = EINVAL; |
| 280 | else { |
| 281 | char subdb_full_name[strlen(fname) + sizeof("/" ) + strlen(dbname)]; |
| 282 | int bytes = snprintf(subdb_full_name, sizeof(subdb_full_name), "%s/%s" , fname, dbname); |
| 283 | assert(bytes==(int)sizeof(subdb_full_name)-1); |
| 284 | const char *null_subdbname = NULL; |
| 285 | r = toku_db_open(db, txn, subdb_full_name, null_subdbname, dbtype, flags, mode); |
| 286 | } |
| 287 | return r; |
| 288 | } |
| 289 | |
| 290 | // inames are created here. |
| 291 | // algorithm: |
| 292 | // begin txn |
| 293 | // convert dname to iname (possibly creating new iname) |
| 294 | // open file (toku_ft_handle_open() will handle logging) |
| 295 | // close txn |
| 296 | // if created a new iname, take full range lock |
| 297 | // Requires: no checkpoint may take place during this function, which is enforced by holding the multi_operation_client_lock. |
| 298 | static int |
| 299 | toku_db_open(DB * db, DB_TXN * txn, const char *fname, const char *dbname, DBTYPE dbtype, uint32_t flags, int mode) { |
| 300 | HANDLE_PANICKED_DB(db); |
| 301 | HANDLE_READ_ONLY_TXN(txn); |
| 302 | if (dbname != NULL) { |
| 303 | return db_open_subdb(db, txn, fname, dbname, dbtype, flags, mode); |
| 304 | } |
| 305 | |
| 306 | // at this point fname is the dname |
| 307 | //This code ONLY supports single-db files. |
| 308 | assert(dbname == NULL); |
| 309 | const char * dname = fname; // db_open_subdb() converts (fname, dbname) to dname |
| 310 | |
| 311 | ////////////////////////////// do some level of parameter checking. |
| 312 | uint32_t unused_flags = flags; |
| 313 | int r; |
| 314 | if (dbtype!=DB_BTREE && dbtype!=DB_UNKNOWN) return EINVAL; |
| 315 | int is_db_excl = flags & DB_EXCL; unused_flags&=~DB_EXCL; |
| 316 | int is_db_create = flags & DB_CREATE; unused_flags&=~DB_CREATE; |
| 317 | int is_db_hot_index = flags & DB_IS_HOT_INDEX; unused_flags&=~DB_IS_HOT_INDEX; |
| 318 | |
| 319 | //We support READ_UNCOMMITTED and READ_COMMITTED whether or not the flag is provided. |
| 320 | unused_flags&=~DB_READ_UNCOMMITTED; |
| 321 | unused_flags&=~DB_READ_COMMITTED; |
| 322 | unused_flags&=~DB_SERIALIZABLE; |
| 323 | |
| 324 | // DB_THREAD is implicitly supported and DB_BLACKHOLE is supported at the ft-layer |
| 325 | unused_flags &= ~DB_THREAD; |
| 326 | unused_flags &= ~DB_BLACKHOLE; |
| 327 | |
| 328 | // check for unknown or conflicting flags |
| 329 | if (unused_flags) return EINVAL; // unknown flags |
| 330 | if (is_db_excl && !is_db_create) return EINVAL; |
| 331 | if (dbtype==DB_UNKNOWN && is_db_excl) return EINVAL; |
| 332 | |
| 333 | if (db_opened(db)) { |
| 334 | // it was already open |
| 335 | return EINVAL; |
| 336 | } |
| 337 | ////////////////////////////// |
| 338 | |
| 339 | // convert dname to iname |
| 340 | // - look up dname, get iname |
| 341 | // - if dname does not exist, create iname and make entry in directory |
| 342 | DBT dname_dbt; // holds dname |
| 343 | DBT iname_dbt; // holds iname_in_env |
| 344 | toku_fill_dbt(&dname_dbt, dname, strlen(dname)+1); |
| 345 | toku_init_dbt_flags(&iname_dbt, DB_DBT_REALLOC); |
| 346 | r = toku_db_get(db->dbenv->i->directory, txn, &dname_dbt, &iname_dbt, DB_SERIALIZABLE); // allocates memory for iname |
| 347 | std::unique_ptr<char[], decltype(&toku_free)> iname( |
| 348 | static_cast<char *>(iname_dbt.data), &toku_free); |
| 349 | if (r == DB_NOTFOUND && !is_db_create) { |
| 350 | r = ENOENT; |
| 351 | } else if (r==0 && is_db_excl) { |
| 352 | r = EEXIST; |
| 353 | } else if (r == DB_NOTFOUND) { |
| 354 | iname = generate_iname_for_rename_or_open(db->dbenv, txn, dname, true); |
| 355 | toku_fill_dbt(&iname_dbt, iname.get(), strlen(iname.get()) + 1); |
| 356 | // |
| 357 | // put_flags will be 0 for performance only, avoid unnecessary query |
| 358 | // if we are creating a hot index, per #3166, we do not want the write lock in directory grabbed. |
| 359 | // directory read lock is grabbed in toku_db_get above |
| 360 | // |
| 361 | uint32_t put_flags = 0 | ((is_db_hot_index) ? DB_PRELOCKED_WRITE : 0); |
| 362 | r = toku_db_put(db->dbenv->i->directory, txn, &dname_dbt, &iname_dbt, put_flags, true); |
| 363 | } |
| 364 | |
| 365 | // we now have an iname |
| 366 | if (r == 0) { |
| 367 | r = toku_db_open_iname(db, txn, iname.get(), flags, mode); |
| 368 | if (r == 0) { |
| 369 | db->i->dname = toku_xstrdup(dname); |
| 370 | env_note_db_opened(db->dbenv, db); // tell env that a new db handle is open (using dname) |
| 371 | } |
| 372 | } |
| 373 | |
| 374 | return r; |
| 375 | } |
| 376 | |
| 377 | // set the descriptor and cmp_descriptor to the |
| 378 | // descriptors from the given ft, updating the |
| 379 | // locktree's descriptor pointer if necessary |
| 380 | static void |
| 381 | db_set_descriptors(DB *db, FT_HANDLE ft_handle) { |
| 382 | const toku::comparator &cmp = toku_ft_get_comparator(ft_handle); |
| 383 | db->descriptor = toku_ft_get_descriptor(ft_handle); |
| 384 | db->cmp_descriptor = toku_ft_get_cmp_descriptor(ft_handle); |
| 385 | invariant(db->cmp_descriptor == cmp.get_descriptor()); |
| 386 | if (db->i->lt) { |
| 387 | db->i->lt->set_comparator(cmp); |
| 388 | } |
| 389 | } |
| 390 | |
| 391 | // callback that sets the descriptors when |
| 392 | // a dictionary is redirected at the ft layer |
| 393 | static void |
| 394 | db_on_redirect_callback(FT_HANDLE ft_handle, void* ) { |
| 395 | DB *db = (DB *) extra; |
| 396 | db_set_descriptors(db, ft_handle); |
| 397 | } |
| 398 | |
| 399 | // when a locktree is created, clone a ft handle and store it |
| 400 | // as userdata so we can close it later. |
| 401 | int toku_db_lt_on_create_callback(toku::locktree *lt, void *) { |
| 402 | int r; |
| 403 | struct lt_on_create_callback_extra *info = (struct lt_on_create_callback_extra *) extra; |
| 404 | TOKUTXN ttxn = info->txn ? db_txn_struct_i(info->txn)->tokutxn : NULL; |
| 405 | FT_HANDLE ft_handle = info->ft_handle; |
| 406 | |
| 407 | FT_HANDLE cloned_ft_handle; |
| 408 | r = toku_ft_handle_clone(&cloned_ft_handle, ft_handle, ttxn); |
| 409 | if (r == 0) { |
| 410 | assert(lt->get_userdata() == NULL); |
| 411 | lt->set_userdata(cloned_ft_handle); |
| 412 | } |
| 413 | return r; |
| 414 | } |
| 415 | |
| 416 | // when a locktree is about to be destroyed, |
| 417 | // close the ft handle stored as userdata. |
| 418 | void toku_db_lt_on_destroy_callback(toku::locktree *lt) { |
| 419 | FT_HANDLE ft_handle = (FT_HANDLE) lt->get_userdata(); |
| 420 | assert(ft_handle); |
| 421 | toku_ft_handle_close(ft_handle); |
| 422 | } |
| 423 | |
| 424 | // Instruct db to use the default (built-in) key comparison function |
| 425 | // by setting the flag bits in the db and ft structs |
| 426 | int toku_db_use_builtin_key_cmp(DB *db) { |
| 427 | HANDLE_PANICKED_DB(db); |
| 428 | int r = 0; |
| 429 | if (db_opened(db)) { |
| 430 | r = toku_ydb_do_error(db->dbenv, EINVAL, "Comparison functions cannot be set after DB open.\n" ); |
| 431 | } else if (db->i->key_compare_was_set) { |
| 432 | r = toku_ydb_do_error(db->dbenv, EINVAL, "Key comparison function already set.\n" ); |
| 433 | } else { |
| 434 | uint32_t tflags; |
| 435 | toku_ft_get_flags(db->i->ft_handle, &tflags); |
| 436 | |
| 437 | tflags |= TOKU_DB_KEYCMP_BUILTIN; |
| 438 | toku_ft_set_flags(db->i->ft_handle, tflags); |
| 439 | db->i->key_compare_was_set = true; |
| 440 | } |
| 441 | return r; |
| 442 | } |
| 443 | |
| 444 | int toku_db_open_iname(DB * db, DB_TXN * txn, const char *iname_in_env, uint32_t flags, int mode) { |
| 445 | //Set comparison functions if not yet set. |
| 446 | HANDLE_READ_ONLY_TXN(txn); |
| 447 | if (!db->i->key_compare_was_set && db->dbenv->i->bt_compare) { |
| 448 | toku_ft_set_bt_compare(db->i->ft_handle, db->dbenv->i->bt_compare); |
| 449 | db->i->key_compare_was_set = true; |
| 450 | } |
| 451 | if (db->dbenv->i->update_function) { |
| 452 | toku_ft_set_update(db->i->ft_handle,db->dbenv->i->update_function); |
| 453 | } |
| 454 | toku_ft_set_redirect_callback( |
| 455 | db->i->ft_handle, |
| 456 | db_on_redirect_callback, |
| 457 | db |
| 458 | ); |
| 459 | bool need_locktree = (bool)((db->dbenv->i->open_flags & DB_INIT_LOCK) && |
| 460 | (db->dbenv->i->open_flags & DB_INIT_TXN)); |
| 461 | |
| 462 | int is_db_excl = flags & DB_EXCL; flags&=~DB_EXCL; |
| 463 | int is_db_create = flags & DB_CREATE; flags&=~DB_CREATE; |
| 464 | //We support READ_UNCOMMITTED and READ_COMMITTED whether or not the flag is provided. |
| 465 | flags&=~DB_READ_UNCOMMITTED; |
| 466 | flags&=~DB_READ_COMMITTED; |
| 467 | flags&=~DB_SERIALIZABLE; |
| 468 | flags&=~DB_IS_HOT_INDEX; |
| 469 | // unknown or conflicting flags are bad |
| 470 | int unknown_flags = flags & ~DB_THREAD; |
| 471 | unknown_flags &= ~DB_BLACKHOLE; |
| 472 | if (unknown_flags || (is_db_excl && !is_db_create)) { |
| 473 | return EINVAL; |
| 474 | } |
| 475 | |
| 476 | if (db_opened(db)) { |
| 477 | return EINVAL; /* It was already open. */ |
| 478 | } |
| 479 | |
| 480 | db->i->open_flags = flags; |
| 481 | db->i->open_mode = mode; |
| 482 | |
| 483 | FT_HANDLE ft_handle = db->i->ft_handle; |
| 484 | int r = toku_ft_handle_open(ft_handle, iname_in_env, |
| 485 | is_db_create, is_db_excl, |
| 486 | db->dbenv->i->cachetable, |
| 487 | txn ? db_txn_struct_i(txn)->tokutxn : nullptr); |
| 488 | if (r != 0) { |
| 489 | goto out; |
| 490 | } |
| 491 | |
| 492 | // if the dictionary was opened as a blackhole, mark the |
| 493 | // fractal tree as blackhole too. |
| 494 | if (flags & DB_BLACKHOLE) { |
| 495 | toku_ft_set_blackhole(ft_handle); |
| 496 | } |
| 497 | |
| 498 | db->i->opened = 1; |
| 499 | |
| 500 | // now that the handle has successfully opened, a valid descriptor |
| 501 | // is in the ft. we need to set the db's descriptor pointers |
| 502 | db_set_descriptors(db, ft_handle); |
| 503 | |
| 504 | if (need_locktree) { |
| 505 | db->i->dict_id = toku_ft_get_dictionary_id(db->i->ft_handle); |
| 506 | struct lt_on_create_callback_extra = { |
| 507 | .txn = txn, |
| 508 | .ft_handle = db->i->ft_handle, |
| 509 | }; |
| 510 | db->i->lt = db->dbenv->i->ltm.get_lt(db->i->dict_id, |
| 511 | toku_ft_get_comparator(db->i->ft_handle), |
| 512 | &on_create_extra); |
| 513 | if (db->i->lt == nullptr) { |
| 514 | r = errno; |
| 515 | if (r == 0) { |
| 516 | r = EINVAL; |
| 517 | } |
| 518 | goto out; |
| 519 | } |
| 520 | } |
| 521 | r = 0; |
| 522 | |
| 523 | out: |
| 524 | if (r != 0) { |
| 525 | db->i->dict_id = DICTIONARY_ID_NONE; |
| 526 | db->i->opened = 0; |
| 527 | if (db->i->lt) { |
| 528 | db->dbenv->i->ltm.release_lt(db->i->lt); |
| 529 | db->i->lt = nullptr; |
| 530 | } |
| 531 | } |
| 532 | return r; |
| 533 | } |
| 534 | |
| 535 | // Return the maximum key and val size in |
| 536 | // *key_size and *val_size respectively |
| 537 | static void |
| 538 | toku_db_get_max_row_size(DB * UU(db), uint32_t * max_key_size, uint32_t * max_val_size) { |
| 539 | *max_key_size = 0; |
| 540 | *max_val_size = 0; |
| 541 | toku_ft_get_maximum_advised_key_value_lengths(max_key_size, max_val_size); |
| 542 | } |
| 543 | |
| 544 | int toku_db_pre_acquire_fileops_lock(DB *db, DB_TXN *txn) { |
| 545 | // bad hack because some environment dictionaries do not have a dname |
| 546 | char *dname = db->i->dname; |
| 547 | if (!dname) |
| 548 | return 0; |
| 549 | |
| 550 | DBT key_in_directory = { .data = dname, .size = (uint32_t) strlen(dname)+1 }; |
| 551 | //Left end of range == right end of range (point lock) |
| 552 | int r = toku_db_get_range_lock(db->dbenv->i->directory, txn, |
| 553 | &key_in_directory, &key_in_directory, |
| 554 | toku::lock_request::type::WRITE); |
| 555 | if (r == 0) |
| 556 | STATUS_VALUE(YDB_LAYER_DIRECTORY_WRITE_LOCKS)++; // accountability |
| 557 | else |
| 558 | STATUS_VALUE(YDB_LAYER_DIRECTORY_WRITE_LOCKS_FAIL)++; // accountability |
| 559 | return r; |
| 560 | } |
| 561 | |
| 562 | // |
| 563 | // This function is used both to set an initial descriptor of a DB and to |
| 564 | // change a descriptor. (only way to set a descriptor of a DB) |
| 565 | // |
| 566 | // Requires: |
| 567 | // - The caller must not call put_multiple, del_multiple, or update_multiple concurrently |
| 568 | // - The caller must not have a hot index running concurrently on db |
| 569 | // - If the caller has passed DB_UPDATE_CMP_DESCRIPTOR as a flag, then he is calling this function |
| 570 | // ONLY immediately after creating the dictionary and before doing any actual work on the dictionary. |
| 571 | // |
| 572 | static int |
| 573 | toku_db_change_descriptor(DB *db, DB_TXN* txn, const DBT* descriptor, uint32_t flags) { |
| 574 | HANDLE_PANICKED_DB(db); |
| 575 | HANDLE_READ_ONLY_TXN(txn); |
| 576 | HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); |
| 577 | int r = 0; |
| 578 | TOKUTXN ttxn = txn ? db_txn_struct_i(txn)->tokutxn : NULL; |
| 579 | bool is_db_hot_index = ((flags & DB_IS_HOT_INDEX) != 0); |
| 580 | bool update_cmp_descriptor = ((flags & DB_UPDATE_CMP_DESCRIPTOR) != 0); |
| 581 | |
| 582 | DBT old_descriptor_dbt; |
| 583 | toku_init_dbt(&old_descriptor_dbt); |
| 584 | |
| 585 | if (!db_opened(db) || !descriptor || (descriptor->size>0 && !descriptor->data)){ |
| 586 | r = EINVAL; |
| 587 | goto cleanup; |
| 588 | } |
| 589 | // For a hot index, this is an initial descriptor. |
| 590 | // We do not support (yet) hcad with hot index concurrently on a single table, which |
| 591 | // would require changing a descriptor for a hot index. |
| 592 | if (!is_db_hot_index) { |
| 593 | r = toku_db_pre_acquire_table_lock(db, txn); |
| 594 | if (r != 0) { goto cleanup; } |
| 595 | } |
| 596 | |
| 597 | toku_clone_dbt(&old_descriptor_dbt, db->descriptor->dbt); |
| 598 | toku_ft_change_descriptor(db->i->ft_handle, &old_descriptor_dbt, descriptor, |
| 599 | true, ttxn, update_cmp_descriptor); |
| 600 | |
| 601 | cleanup: |
| 602 | toku_destroy_dbt(&old_descriptor_dbt); |
| 603 | return r; |
| 604 | } |
| 605 | |
| 606 | static int |
| 607 | toku_db_set_flags(DB *db, uint32_t flags) { |
| 608 | HANDLE_PANICKED_DB(db); |
| 609 | |
| 610 | /* the following matches BDB */ |
| 611 | if (db_opened(db) && flags != 0) return EINVAL; |
| 612 | |
| 613 | return 0; |
| 614 | } |
| 615 | |
| 616 | static int |
| 617 | toku_db_get_flags(DB *db, uint32_t *pflags) { |
| 618 | HANDLE_PANICKED_DB(db); |
| 619 | if (!pflags) return EINVAL; |
| 620 | *pflags = 0; |
| 621 | return 0; |
| 622 | } |
| 623 | |
| 624 | static int |
| 625 | toku_db_change_pagesize(DB *db, uint32_t pagesize) { |
| 626 | HANDLE_PANICKED_DB(db); |
| 627 | if (!db_opened(db)) return EINVAL; |
| 628 | toku_ft_handle_set_nodesize(db->i->ft_handle, pagesize); |
| 629 | return 0; |
| 630 | } |
| 631 | |
| 632 | static int |
| 633 | toku_db_set_pagesize(DB *db, uint32_t pagesize) { |
| 634 | HANDLE_PANICKED_DB(db); |
| 635 | if (db_opened(db)) return EINVAL; |
| 636 | toku_ft_handle_set_nodesize(db->i->ft_handle, pagesize); |
| 637 | return 0; |
| 638 | } |
| 639 | |
| 640 | static int |
| 641 | toku_db_get_pagesize(DB *db, uint32_t *pagesize_ptr) { |
| 642 | HANDLE_PANICKED_DB(db); |
| 643 | toku_ft_handle_get_nodesize(db->i->ft_handle, pagesize_ptr); |
| 644 | return 0; |
| 645 | } |
| 646 | |
| 647 | static int |
| 648 | toku_db_change_readpagesize(DB *db, uint32_t readpagesize) { |
| 649 | HANDLE_PANICKED_DB(db); |
| 650 | if (!db_opened(db)) return EINVAL; |
| 651 | toku_ft_handle_set_basementnodesize(db->i->ft_handle, readpagesize); |
| 652 | return 0; |
| 653 | } |
| 654 | |
| 655 | static int |
| 656 | toku_db_set_readpagesize(DB *db, uint32_t readpagesize) { |
| 657 | HANDLE_PANICKED_DB(db); |
| 658 | if (db_opened(db)) return EINVAL; |
| 659 | toku_ft_handle_set_basementnodesize(db->i->ft_handle, readpagesize); |
| 660 | return 0; |
| 661 | } |
| 662 | |
| 663 | static int |
| 664 | toku_db_get_readpagesize(DB *db, uint32_t *readpagesize_ptr) { |
| 665 | HANDLE_PANICKED_DB(db); |
| 666 | toku_ft_handle_get_basementnodesize(db->i->ft_handle, readpagesize_ptr); |
| 667 | return 0; |
| 668 | } |
| 669 | |
| 670 | static int |
| 671 | toku_db_change_compression_method(DB *db, enum toku_compression_method compression_method) { |
| 672 | HANDLE_PANICKED_DB(db); |
| 673 | if (!db_opened(db)) return EINVAL; |
| 674 | toku_ft_handle_set_compression_method(db->i->ft_handle, compression_method); |
| 675 | return 0; |
| 676 | } |
| 677 | |
| 678 | static int |
| 679 | toku_db_set_compression_method(DB *db, enum toku_compression_method compression_method) { |
| 680 | HANDLE_PANICKED_DB(db); |
| 681 | if (db_opened(db)) return EINVAL; |
| 682 | toku_ft_handle_set_compression_method(db->i->ft_handle, compression_method); |
| 683 | return 0; |
| 684 | } |
| 685 | |
| 686 | static int |
| 687 | toku_db_get_compression_method(DB *db, enum toku_compression_method *compression_method_ptr) { |
| 688 | HANDLE_PANICKED_DB(db); |
| 689 | toku_ft_handle_get_compression_method(db->i->ft_handle, compression_method_ptr); |
| 690 | return 0; |
| 691 | } |
| 692 | |
| 693 | static int |
| 694 | toku_db_change_fanout(DB *db, unsigned int fanout) { |
| 695 | HANDLE_PANICKED_DB(db); |
| 696 | if (!db_opened(db)) return EINVAL; |
| 697 | toku_ft_handle_set_fanout(db->i->ft_handle, fanout); |
| 698 | return 0; |
| 699 | } |
| 700 | |
| 701 | static int |
| 702 | toku_db_set_fanout(DB *db, unsigned int fanout) { |
| 703 | HANDLE_PANICKED_DB(db); |
| 704 | if (db_opened(db)) return EINVAL; |
| 705 | toku_ft_handle_set_fanout(db->i->ft_handle, fanout); |
| 706 | return 0; |
| 707 | } |
| 708 | |
| 709 | static int |
| 710 | toku_db_get_fanout(DB *db, unsigned int *fanout) { |
| 711 | HANDLE_PANICKED_DB(db); |
| 712 | toku_ft_handle_get_fanout(db->i->ft_handle, fanout); |
| 713 | return 0; |
| 714 | } |
| 715 | |
| 716 | static int |
| 717 | toku_db_set_memcmp_magic(DB *db, uint8_t magic) { |
| 718 | HANDLE_PANICKED_DB(db); |
| 719 | if (db_opened(db)) { |
| 720 | return EINVAL; |
| 721 | } |
| 722 | return toku_ft_handle_set_memcmp_magic(db->i->ft_handle, magic); |
| 723 | } |
| 724 | |
| 725 | static int |
| 726 | toku_db_get_fractal_tree_info64(DB *db, uint64_t *num_blocks_allocated, uint64_t *num_blocks_in_use, uint64_t *size_allocated, uint64_t *size_in_use) { |
| 727 | HANDLE_PANICKED_DB(db); |
| 728 | struct ftinfo64 ftinfo; |
| 729 | toku_ft_handle_get_fractal_tree_info64(db->i->ft_handle, &ftinfo); |
| 730 | *num_blocks_allocated = ftinfo.num_blocks_allocated; |
| 731 | *num_blocks_in_use = ftinfo.num_blocks_in_use; |
| 732 | *size_allocated = ftinfo.size_allocated; |
| 733 | *size_in_use = ftinfo.size_in_use; |
| 734 | return 0; |
| 735 | } |
| 736 | |
| 737 | static int |
| 738 | toku_db_iterate_fractal_tree_block_map(DB *db, int (*iter)(uint64_t,int64_t,int64_t,int64_t,int64_t,void*), void *) { |
| 739 | HANDLE_PANICKED_DB(db); |
| 740 | return toku_ft_handle_iterate_fractal_tree_block_map(db->i->ft_handle, iter, iter_extra); |
| 741 | } |
| 742 | |
| 743 | static int |
| 744 | toku_db_stat64(DB * db, DB_TXN *txn, DB_BTREE_STAT64 *s) { |
| 745 | HANDLE_PANICKED_DB(db); |
| 746 | HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); |
| 747 | struct ftstat64_s ftstat; |
| 748 | TOKUTXN tokutxn = NULL; |
| 749 | if (txn != NULL) { |
| 750 | tokutxn = db_txn_struct_i(txn)->tokutxn; |
| 751 | } |
| 752 | toku_ft_handle_stat64(db->i->ft_handle, tokutxn, &ftstat); |
| 753 | s->bt_nkeys = ftstat.nkeys; |
| 754 | s->bt_ndata = ftstat.ndata; |
| 755 | s->bt_dsize = ftstat.dsize; |
| 756 | s->bt_fsize = ftstat.fsize; |
| 757 | s->bt_create_time_sec = ftstat.create_time_sec; |
| 758 | s->bt_modify_time_sec = ftstat.modify_time_sec; |
| 759 | s->bt_verify_time_sec = ftstat.verify_time_sec; |
| 760 | return 0; |
| 761 | } |
| 762 | |
| 763 | static const char * |
| 764 | toku_db_get_dname(DB *db) { |
| 765 | if (!db_opened(db)) { |
| 766 | return nullptr; |
| 767 | } |
| 768 | if (db->i->dname == nullptr) { |
| 769 | return "" ; |
| 770 | } |
| 771 | return db->i->dname; |
| 772 | } |
| 773 | |
| 774 | static int |
| 775 | toku_db_keys_range64(DB* db, DB_TXN* txn __attribute__((__unused__)), DBT* keyleft, DBT* keyright, uint64_t* less, uint64_t* left, uint64_t* between, uint64_t *right, uint64_t *greater, bool* middle_3_exact) { |
| 776 | HANDLE_PANICKED_DB(db); |
| 777 | HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); |
| 778 | |
| 779 | // note that we ignore the txn param. It would be more complicated to support it. |
| 780 | // TODO(yoni): Maybe add support for txns later? How would we do this? ydb lock comment about db_keyrange64 is obsolete. |
| 781 | toku_ft_keysrange(db->i->ft_handle, keyleft, keyright, less, left, between, right, greater, middle_3_exact); |
| 782 | return 0; |
| 783 | } |
| 784 | |
| 785 | static int |
| 786 | toku_db_key_range64(DB* db, DB_TXN* txn, DBT* key, uint64_t* less_p, uint64_t* equal_p, uint64_t* greater_p, int* is_exact) { |
| 787 | uint64_t less, equal_left, middle, equal_right, greater; |
| 788 | bool ignore; |
| 789 | int r = toku_db_keys_range64(db, txn, key, NULL, &less, &equal_left, &middle, &equal_right, &greater, &ignore); |
| 790 | if (r == 0) { |
| 791 | *less_p = less; |
| 792 | *equal_p = equal_left; |
| 793 | *greater_p = middle; |
| 794 | paranoid_invariant_zero(greater); // no keys are greater than positive infinity |
| 795 | paranoid_invariant_zero(equal_right); // no keys are equal to positive infinity |
| 796 | // toku_ft_keysrange does not know when all 3 are exact, so set is_exact to false |
| 797 | *is_exact = false; |
| 798 | } |
| 799 | return 0; |
| 800 | } |
| 801 | |
| 802 | static int toku_db_get_key_after_bytes(DB *db, DB_TXN *txn, const DBT *start_key, uint64_t skip_len, void (*callback)(const DBT *end_key, uint64_t actually_skipped, void *), void *, uint32_t UU(flags)) { |
| 803 | HANDLE_PANICKED_DB(db); |
| 804 | HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); |
| 805 | return toku_ft_get_key_after_bytes(db->i->ft_handle, start_key, skip_len, callback, cb_extra); |
| 806 | } |
| 807 | |
| 808 | // needed by loader.c |
| 809 | int |
| 810 | toku_db_pre_acquire_table_lock(DB *db, DB_TXN *txn) { |
| 811 | HANDLE_PANICKED_DB(db); |
| 812 | if (!db->i->lt || !txn) return 0; |
| 813 | int r; |
| 814 | r = toku_db_get_range_lock(db, txn, |
| 815 | toku_dbt_negative_infinity(), toku_dbt_positive_infinity(), |
| 816 | toku::lock_request::type::WRITE); |
| 817 | return r; |
| 818 | } |
| 819 | |
| 820 | static int |
| 821 | locked_db_close(DB * db, uint32_t UU(flags)) { |
| 822 | // cannot begin a checkpoint |
| 823 | toku_multi_operation_client_lock(); |
| 824 | int r = toku_db_close(db); |
| 825 | toku_multi_operation_client_unlock(); |
| 826 | return r; |
| 827 | } |
| 828 | |
| 829 | int |
| 830 | autotxn_db_get(DB* db, DB_TXN* txn, DBT* key, DBT* data, uint32_t flags) { |
| 831 | bool changed; int r; |
| 832 | r = toku_db_construct_autotxn(db, &txn, &changed, false); |
| 833 | if (r!=0) return r; |
| 834 | r = toku_db_get(db, txn, key, data, flags); |
| 835 | return toku_db_destruct_autotxn(txn, r, changed); |
| 836 | } |
| 837 | |
| 838 | static inline int |
| 839 | autotxn_db_getf_set (DB *db, DB_TXN *txn, uint32_t flags, DBT *key, YDB_CALLBACK_FUNCTION f, void *) { |
| 840 | bool changed; int r; |
| 841 | r = toku_db_construct_autotxn(db, &txn, &changed, false); |
| 842 | if (r!=0) return r; |
| 843 | r = db_getf_set(db, txn, flags, key, f, extra); |
| 844 | return toku_db_destruct_autotxn(txn, r, changed); |
| 845 | } |
| 846 | |
| 847 | static int |
| 848 | locked_db_open(DB *db, DB_TXN *txn, const char *fname, const char *dbname, DBTYPE dbtype, uint32_t flags, int mode) { |
| 849 | int ret, r; |
| 850 | HANDLE_READ_ONLY_TXN(txn); |
| 851 | HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); |
| 852 | |
| 853 | // |
| 854 | // Note that this function opens a db with a transaction. Should |
| 855 | // the transaction abort, the user is responsible for closing the DB |
| 856 | // before aborting the transaction. Not doing so results in undefined |
| 857 | // behavior. |
| 858 | // |
| 859 | DB_ENV *env = db->dbenv; |
| 860 | DB_TXN *child_txn = NULL; |
| 861 | int using_txns = env->i->open_flags & DB_INIT_TXN; |
| 862 | if (using_txns) { |
| 863 | ret = toku_txn_begin(env, txn, &child_txn, DB_TXN_NOSYNC); |
| 864 | invariant_zero(ret); |
| 865 | } |
| 866 | |
| 867 | // cannot begin a checkpoint |
| 868 | toku_multi_operation_client_lock(); |
| 869 | r = toku_db_open(db, child_txn, fname, dbname, dbtype, flags & ~DB_AUTO_COMMIT, mode); |
| 870 | toku_multi_operation_client_unlock(); |
| 871 | |
| 872 | if (using_txns) { |
| 873 | if (r == 0) { |
| 874 | ret = locked_txn_commit(child_txn, DB_TXN_NOSYNC); |
| 875 | invariant_zero(ret); |
| 876 | } else { |
| 877 | ret = locked_txn_abort(child_txn); |
| 878 | invariant_zero(ret); |
| 879 | } |
| 880 | } |
| 881 | return r; |
| 882 | } |
| 883 | |
| 884 | static int |
| 885 | locked_db_change_descriptor(DB *db, DB_TXN *txn, const DBT *descriptor, uint32_t flags) { |
| 886 | // cannot begin a checkpoint |
| 887 | toku_multi_operation_client_lock(); |
| 888 | int r = toku_db_change_descriptor(db, txn, descriptor, flags); |
| 889 | toku_multi_operation_client_unlock(); |
| 890 | return r; |
| 891 | } |
| 892 | |
| 893 | static int |
| 894 | autotxn_db_change_descriptor(DB *db, DB_TXN *txn, const DBT *descriptor, uint32_t flags) { |
| 895 | bool changed; int r; |
| 896 | r = toku_db_construct_autotxn(db, &txn, &changed, false); |
| 897 | if (r != 0) { return r; } |
| 898 | r = locked_db_change_descriptor(db, txn, descriptor, flags); |
| 899 | return toku_db_destruct_autotxn(txn, r, changed); |
| 900 | } |
| 901 | |
| 902 | static void |
| 903 | toku_db_set_errfile (DB *db, FILE *errfile) { |
| 904 | db->dbenv->set_errfile(db->dbenv, errfile); |
| 905 | } |
| 906 | |
| 907 | // TODO 2216 delete this |
| 908 | static int |
| 909 | toku_db_fd(DB * UU(db), int * UU(fdp)) { |
| 910 | return 0; |
| 911 | } |
| 912 | |
| 913 | static const DBT* toku_db_dbt_pos_infty(void) __attribute__((pure)); |
| 914 | static const DBT* |
| 915 | toku_db_dbt_pos_infty(void) { |
| 916 | return toku_dbt_positive_infinity(); |
| 917 | } |
| 918 | |
| 919 | static const DBT* toku_db_dbt_neg_infty(void) __attribute__((pure)); |
| 920 | static const DBT* |
| 921 | toku_db_dbt_neg_infty(void) { |
| 922 | return toku_dbt_negative_infinity(); |
| 923 | } |
| 924 | |
| 925 | static int |
| 926 | toku_db_optimize(DB *db) { |
| 927 | HANDLE_PANICKED_DB(db); |
| 928 | toku_ft_optimize(db->i->ft_handle); |
| 929 | return 0; |
| 930 | } |
| 931 | |
| 932 | static int |
| 933 | toku_db_hot_optimize(DB *db, DBT* left, DBT* right, |
| 934 | int (*progress_callback)(void *, float progress), |
| 935 | void *, uint64_t* loops_run) |
| 936 | { |
| 937 | HANDLE_PANICKED_DB(db); |
| 938 | int r = 0; |
| 939 | r = toku_ft_hot_optimize(db->i->ft_handle, left, right, |
| 940 | progress_callback, |
| 941 | progress_extra, loops_run); |
| 942 | |
| 943 | return r; |
| 944 | } |
| 945 | |
| 946 | static int |
| 947 | locked_db_optimize(DB *db) { |
| 948 | // need to protect from checkpointing because |
| 949 | // toku_db_optimize does a message injection |
| 950 | toku_multi_operation_client_lock(); //Cannot begin checkpoint |
| 951 | int r = toku_db_optimize(db); |
| 952 | toku_multi_operation_client_unlock(); |
| 953 | return r; |
| 954 | } |
| 955 | |
| 956 | |
| 957 | struct { |
| 958 | YDB_CALLBACK_FUNCTION ; |
| 959 | void* ; |
| 960 | }; |
| 961 | |
| 962 | static int |
| 963 | db_get_last_key_callback(uint32_t keylen, const void *key, uint32_t vallen UU(), const void *val UU(), void *, bool lock_only) { |
| 964 | if (!lock_only) { |
| 965 | DBT keydbt; |
| 966 | toku_fill_dbt(&keydbt, key, keylen); |
| 967 | struct last_key_extra * CAST_FROM_VOIDP(info, extra); |
| 968 | info->func(&keydbt, NULL, info->extra); |
| 969 | } |
| 970 | return 0; |
| 971 | } |
| 972 | |
| 973 | static int |
| 974 | toku_db_get_last_key(DB * db, DB_TXN *txn, YDB_CALLBACK_FUNCTION func, void* ) { |
| 975 | int r; |
| 976 | LE_CURSOR cursor = nullptr; |
| 977 | struct last_key_extra = { .func = func, .extra = extra }; |
| 978 | |
| 979 | r = toku_le_cursor_create(&cursor, db->i->ft_handle, db_txn_struct_i(txn)->tokutxn); |
| 980 | if (r != 0) { goto cleanup; } |
| 981 | |
| 982 | // Goes in reverse order. First key returned is last in dictionary. |
| 983 | r = toku_le_cursor_next(cursor, db_get_last_key_callback, &last_extra); |
| 984 | if (r != 0) { goto cleanup; } |
| 985 | |
| 986 | cleanup: |
| 987 | if (cursor) { |
| 988 | toku_le_cursor_close(cursor); |
| 989 | } |
| 990 | return r; |
| 991 | } |
| 992 | |
| 993 | static int |
| 994 | autotxn_db_get_last_key(DB* db, YDB_CALLBACK_FUNCTION func, void* ) { |
| 995 | bool changed; int r; |
| 996 | DB_TXN *txn = nullptr; |
| 997 | // Cursors inside require transactions, but this is _not_ a transactional function. |
| 998 | // Create transaction in a wrapper and then later close it. |
| 999 | r = toku_db_construct_autotxn(db, &txn, &changed, false); |
| 1000 | if (r!=0) return r; |
| 1001 | r = toku_db_get_last_key(db, txn, func, extra); |
| 1002 | return toku_db_destruct_autotxn(txn, r, changed); |
| 1003 | } |
| 1004 | |
| 1005 | static int |
| 1006 | toku_db_get_fragmentation(DB * db, TOKU_DB_FRAGMENTATION report) { |
| 1007 | HANDLE_PANICKED_DB(db); |
| 1008 | int r; |
| 1009 | if (!db_opened(db)) |
| 1010 | r = toku_ydb_do_error(db->dbenv, EINVAL, "Fragmentation report available only on open DBs.\n" ); |
| 1011 | else |
| 1012 | r = toku_ft_get_fragmentation(db->i->ft_handle, report); |
| 1013 | return r; |
| 1014 | } |
| 1015 | |
| 1016 | int |
| 1017 | toku_db_set_indexer(DB *db, DB_INDEXER * indexer) { |
| 1018 | int r = 0; |
| 1019 | if ( db->i->indexer != NULL && indexer != NULL ) { |
| 1020 | // you are trying to overwrite a valid indexer |
| 1021 | r = EINVAL; |
| 1022 | } |
| 1023 | else { |
| 1024 | db->i->indexer = indexer; |
| 1025 | } |
| 1026 | return r; |
| 1027 | } |
| 1028 | |
| 1029 | DB_INDEXER * |
| 1030 | toku_db_get_indexer(DB *db) { |
| 1031 | return db->i->indexer; |
| 1032 | } |
| 1033 | |
| 1034 | static void |
| 1035 | db_get_indexer(DB *db, DB_INDEXER **indexer_ptr) { |
| 1036 | *indexer_ptr = toku_db_get_indexer(db); |
| 1037 | } |
| 1038 | |
// Carries the caller's progress callback and its opaque argument through
// toku_verify_ft_with_progress to ydb_verify_progress_callback.
struct ydb_verify_context {
    int (*progress_callback)(void *extra, float progress);  // may be NULL
    void *progress_extra;                                   // passed back to progress_callback
};
| 1043 | |
| 1044 | static int |
| 1045 | ydb_verify_progress_callback(void *, float progress) { |
| 1046 | struct ydb_verify_context *context = (struct ydb_verify_context *) extra; |
| 1047 | int r = 0; |
| 1048 | if (context->progress_callback) { |
| 1049 | r = context->progress_callback(context->progress_extra, progress); |
| 1050 | } |
| 1051 | return r; |
| 1052 | } |
| 1053 | |
| 1054 | static int |
| 1055 | toku_db_verify_with_progress(DB *db, int (*progress_callback)(void *, float progress), void *, int verbose, int keep_going) { |
| 1056 | struct ydb_verify_context context = { progress_callback, progress_extra }; |
| 1057 | int r = toku_verify_ft_with_progress(db->i->ft_handle, ydb_verify_progress_callback, &context, verbose, keep_going); |
| 1058 | return r; |
| 1059 | } |
| 1060 | |
| 1061 | |
| 1062 | static int |
| 1063 | toku_db_recount_rows(DB* db, int (*progress_callback)(uint64_t count, |
| 1064 | uint64_t deleted, |
| 1065 | void* ), |
| 1066 | void* ) { |
| 1067 | |
| 1068 | HANDLE_PANICKED_DB(db); |
| 1069 | int r = 0; |
| 1070 | r = |
| 1071 | toku_ft_recount_rows( |
| 1072 | db->i->ft_handle, |
| 1073 | progress_callback, |
| 1074 | progress_extra); |
| 1075 | |
| 1076 | return r; |
| 1077 | } |
| 1078 | |
| 1079 | |
| 1080 | int toku_setup_db_internal (DB **dbp, DB_ENV *env, uint32_t flags, FT_HANDLE ft_handle, bool is_open) { |
| 1081 | if (flags || env == NULL) |
| 1082 | return EINVAL; |
| 1083 | |
| 1084 | if (!env_opened(env)) |
| 1085 | return EINVAL; |
| 1086 | |
| 1087 | DB *MALLOC(result); |
| 1088 | if (result == 0) { |
| 1089 | return ENOMEM; |
| 1090 | } |
| 1091 | memset(result, 0, sizeof *result); |
| 1092 | result->dbenv = env; |
| 1093 | MALLOC(result->i); |
| 1094 | if (result->i == 0) { |
| 1095 | toku_free(result); |
| 1096 | return ENOMEM; |
| 1097 | } |
| 1098 | memset(result->i, 0, sizeof *result->i); |
| 1099 | result->i->ft_handle = ft_handle; |
| 1100 | result->i->opened = is_open; |
| 1101 | *dbp = result; |
| 1102 | return 0; |
| 1103 | } |
| 1104 | |
| 1105 | int |
| 1106 | toku_db_create(DB ** db, DB_ENV * env, uint32_t flags) { |
| 1107 | if (flags || env == NULL) |
| 1108 | return EINVAL; |
| 1109 | |
| 1110 | if (!env_opened(env)) |
| 1111 | return EINVAL; |
| 1112 | |
| 1113 | |
| 1114 | FT_HANDLE ft_handle; |
| 1115 | toku_ft_handle_create(&ft_handle); |
| 1116 | |
| 1117 | int r = toku_setup_db_internal(db, env, flags, ft_handle, false); |
| 1118 | if (r != 0) return r; |
| 1119 | |
| 1120 | DB *result=*db; |
| 1121 | // methods that grab the ydb lock |
| 1122 | #define SDB(name) result->name = locked_db_ ## name |
| 1123 | SDB(close); |
| 1124 | SDB(open); |
| 1125 | SDB(optimize); |
| 1126 | #undef SDB |
| 1127 | // methods that do not take the ydb lock |
| 1128 | #define USDB(name) result->name = toku_db_ ## name |
| 1129 | USDB(set_errfile); |
| 1130 | USDB(set_pagesize); |
| 1131 | USDB(get_pagesize); |
| 1132 | USDB(change_pagesize); |
| 1133 | USDB(set_readpagesize); |
| 1134 | USDB(get_readpagesize); |
| 1135 | USDB(change_readpagesize); |
| 1136 | USDB(set_compression_method); |
| 1137 | USDB(get_compression_method); |
| 1138 | USDB(change_compression_method); |
| 1139 | USDB(set_fanout); |
| 1140 | USDB(get_fanout); |
| 1141 | USDB(set_memcmp_magic); |
| 1142 | USDB(change_fanout); |
| 1143 | USDB(set_flags); |
| 1144 | USDB(get_flags); |
| 1145 | USDB(fd); |
| 1146 | USDB(get_max_row_size); |
| 1147 | USDB(set_indexer); |
| 1148 | USDB(pre_acquire_table_lock); |
| 1149 | USDB(pre_acquire_fileops_lock); |
| 1150 | USDB(key_range64); |
| 1151 | USDB(keys_range64); |
| 1152 | USDB(get_key_after_bytes); |
| 1153 | USDB(hot_optimize); |
| 1154 | USDB(stat64); |
| 1155 | USDB(get_fractal_tree_info64); |
| 1156 | USDB(iterate_fractal_tree_block_map); |
| 1157 | USDB(get_dname); |
| 1158 | USDB(verify_with_progress); |
| 1159 | USDB(cursor); |
| 1160 | USDB(dbt_pos_infty); |
| 1161 | USDB(dbt_neg_infty); |
| 1162 | USDB(get_fragmentation); |
| 1163 | USDB(recount_rows); |
| 1164 | #undef USDB |
| 1165 | result->get_indexer = db_get_indexer; |
| 1166 | result->del = autotxn_db_del; |
| 1167 | result->put = autotxn_db_put; |
| 1168 | result->update = autotxn_db_update; |
| 1169 | result->update_broadcast = autotxn_db_update_broadcast; |
| 1170 | result->change_descriptor = autotxn_db_change_descriptor; |
| 1171 | result->get_last_key = autotxn_db_get_last_key; |
| 1172 | |
| 1173 | // unlocked methods |
| 1174 | result->get = autotxn_db_get; |
| 1175 | result->getf_set = autotxn_db_getf_set; |
| 1176 | |
| 1177 | result->i->dict_id = DICTIONARY_ID_NONE; |
| 1178 | result->i->opened = 0; |
| 1179 | result->i->open_flags = 0; |
| 1180 | result->i->open_mode = 0; |
| 1181 | result->i->indexer = NULL; |
| 1182 | *db = result; |
| 1183 | return 0; |
| 1184 | } |
| 1185 | |
| 1186 | // When the loader is created, it makes this call (toku_env_load_inames). |
| 1187 | // For each dictionary to be loaded, replace old iname in directory |
| 1188 | // with a newly generated iname. This will also take a write lock |
| 1189 | // on the directory entries. The write lock will be released when |
| 1190 | // the transaction of the loader is completed. |
| 1191 | // If the transaction commits, the new inames are in place. |
| 1192 | // If the transaction aborts, the old inames will be restored. |
| 1193 | // The new inames are returned to the caller. |
| 1194 | // It is the caller's responsibility to free them. |
| 1195 | // If "mark_as_loader" is true, then include a mark in the iname |
| 1196 | // to indicate that the file is created by the ft loader. |
| 1197 | // Return 0 on success (could fail if write lock not available). |
| 1198 | static int |
| 1199 | load_inames(DB_ENV * env, DB_TXN * txn, int N, DB * dbs[/*N*/], const char * new_inames_in_env[/*N*/], LSN *load_lsn, bool mark_as_loader) { |
| 1200 | int rval = 0; |
| 1201 | int i; |
| 1202 | |
| 1203 | TXNID_PAIR xid = TXNID_PAIR_NONE; |
| 1204 | DBT dname_dbt; // holds dname |
| 1205 | DBT iname_dbt; // holds new iname |
| 1206 | |
| 1207 | const char *mark; |
| 1208 | |
| 1209 | if (mark_as_loader) { |
| 1210 | mark = "B" ; |
| 1211 | } else { |
| 1212 | mark = "P" ; |
| 1213 | } |
| 1214 | |
| 1215 | for (i=0; i<N; i++) { |
| 1216 | new_inames_in_env[i] = NULL; |
| 1217 | } |
| 1218 | |
| 1219 | if (txn) { |
| 1220 | xid = toku_txn_get_txnid(db_txn_struct_i(txn)->tokutxn); |
| 1221 | } |
| 1222 | for (i = 0; i < N; i++) { |
| 1223 | char * dname = dbs[i]->i->dname; |
| 1224 | toku_fill_dbt(&dname_dbt, dname, strlen(dname)+1); |
| 1225 | |
| 1226 | // now create new iname |
| 1227 | char hint[strlen(dname) + 1]; |
| 1228 | create_iname_hint(env, dname, hint); |
| 1229 | |
| 1230 | // allocates memory for iname_in_env |
| 1231 | const char *new_iname = |
| 1232 | create_iname(env, xid.parent_id64, xid.child_id64, hint, mark, i); |
| 1233 | new_inames_in_env[i] = new_iname; |
| 1234 | |
| 1235 | // iname_in_env goes in directory |
| 1236 | toku_fill_dbt(&iname_dbt, new_iname, strlen(new_iname) + 1); |
| 1237 | rval = toku_db_put(env->i->directory, txn, &dname_dbt, &iname_dbt, 0, true); |
| 1238 | if (rval) break; |
| 1239 | } |
| 1240 | |
| 1241 | // Generate load log entries. |
| 1242 | if (!rval && txn) { |
| 1243 | TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn; |
| 1244 | int do_fsync = 0; |
| 1245 | LSN *get_lsn = NULL; |
| 1246 | for (i = 0; i < N; i++) { |
| 1247 | FT_HANDLE ft_handle = dbs[i]->i->ft_handle; |
| 1248 | //Fsync is necessary for the last one only. |
| 1249 | if (i==N-1) { |
| 1250 | do_fsync = 1; //We only need a single fsync of logs. |
| 1251 | get_lsn = load_lsn; //Set pointer to capture the last lsn. |
| 1252 | } |
| 1253 | toku_ft_load(ft_handle, ttxn, new_inames_in_env[i], do_fsync, get_lsn); |
| 1254 | } |
| 1255 | } |
| 1256 | return rval; |
| 1257 | } |
| 1258 | |
| 1259 | int |
| 1260 | locked_load_inames(DB_ENV * env, DB_TXN * txn, int N, DB * dbs[/*N*/], char * new_inames_in_env[/*N*/], LSN *load_lsn, bool mark_as_loader) { |
| 1261 | int r; |
| 1262 | HANDLE_READ_ONLY_TXN(txn); |
| 1263 | |
| 1264 | // cannot begin a checkpoint |
| 1265 | toku_multi_operation_client_lock(); |
| 1266 | r = load_inames(env, txn, N, dbs, (const char **) new_inames_in_env, load_lsn, mark_as_loader); |
| 1267 | toku_multi_operation_client_unlock(); |
| 1268 | |
| 1269 | return r; |
| 1270 | |
| 1271 | } |
| 1272 | |
| 1273 | #undef STATUS_VALUE |
| 1274 | |
| 1275 | #include <toku_race_tools.h> |
| 1276 | void __attribute__((constructor)) toku_ydb_db_helgrind_ignore(void); |
| 1277 | void |
| 1278 | toku_ydb_db_helgrind_ignore(void) { |
| 1279 | TOKU_VALGRIND_HG_DISABLE_CHECKING(&ydb_db_layer_status, sizeof ydb_db_layer_status); |
| 1280 | } |
| 1281 | |