| 1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ |
| 2 | // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: |
| 3 | #ident "$Id$" |
| 4 | /*====== |
| 5 | This file is part of PerconaFT. |
| 6 | |
| 7 | |
| 8 | Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. |
| 9 | |
| 10 | PerconaFT is free software: you can redistribute it and/or modify |
| 11 | it under the terms of the GNU General Public License, version 2, |
| 12 | as published by the Free Software Foundation. |
| 13 | |
| 14 | PerconaFT is distributed in the hope that it will be useful, |
| 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 17 | GNU General Public License for more details. |
| 18 | |
| 19 | You should have received a copy of the GNU General Public License |
| 20 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
| 21 | |
| 22 | ---------------------------------------- |
| 23 | |
| 24 | PerconaFT is free software: you can redistribute it and/or modify |
| 25 | it under the terms of the GNU Affero General Public License, version 3, |
| 26 | as published by the Free Software Foundation. |
| 27 | |
| 28 | PerconaFT is distributed in the hope that it will be useful, |
| 29 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 30 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 31 | GNU Affero General Public License for more details. |
| 32 | |
| 33 | You should have received a copy of the GNU Affero General Public License |
| 34 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
| 35 | ======= */ |
| 36 | |
| 37 | #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." |
| 38 | |
| 39 | #pragma once |
| 40 | |
| 41 | #include "cachetable/background_job_manager.h" |
| 42 | #include <portability/toku_random.h> |
| 43 | #include <util/frwlock.h> |
| 44 | #include <util/kibbutz.h> |
| 45 | #include <util/nb_mutex.h> |
| 46 | #include <util/partitioned_counter.h> |
| 47 | |
| 48 | ////////////////////////////////////////////////////////////////////////////// |
| 49 | // |
| 50 | // This file contains the classes and structs that make up the cachetable. |
| 51 | // The structs are: |
| 52 | // - cachefile |
| 53 | // - ctpair |
| 54 | // - pair_list |
| 55 | // - cachefile_list |
| 56 | // - checkpointer |
| 57 | // - evictor |
| 58 | // - cleaner |
| 59 | // |
| 60 | // The rest of this comment assumes familiarity with the locks used in these |
| 61 | // classes/structs and what the locks protect. Nevertheless, here is |
| 62 | // a list of the locks that we have: |
| 63 | // - pair_list->list_lock |
| 64 | // - pair_list->pending_lock_expensive |
| 65 | // - pair_list->pending_lock_cheap |
| 66 | // - cachefile_list->lock |
| 67 | // - PAIR->mutex |
| 68 | // - PAIR->value_rwlock |
| 69 | // - PAIR->disk_nb_mutex |
| 70 | // |
| 71 | // Here are rules for how the locks interact: |
| 72 | // - To grab any of the pair_list's locks, or the cachefile_list's lock, |
| 73 | // the cachetable must be in existence |
// - To grab the PAIR mutex, we must know the PAIR will not disappear:
| 75 | // - the PAIR must be pinned (value_rwlock or disk_nb_mutex is held) |
| 76 | // - OR, the pair_list's list lock is held |
| 77 | // - As a result, to get rid of a PAIR from the pair_list, we must hold |
| 78 | // both the pair_list's list_lock and the PAIR's mutex |
| 79 | // - To grab PAIR->value_rwlock, we must hold the PAIR's mutex |
| 80 | // - To grab PAIR->disk_nb_mutex, we must hold the PAIR's mutex |
| 81 | // and hold PAIR->value_rwlock |
| 82 | // |
| 83 | // Now let's talk about ordering. Here is an order from outer to inner (top locks must be grabbed first) |
| 84 | // - pair_list->pending_lock_expensive |
| 85 | // - pair_list->list_lock |
| 86 | // - cachefile_list->lock |
| 87 | // - PAIR->mutex |
| 88 | // - pair_list->pending_lock_cheap <-- after grabbing this lock, |
| 89 | // NO other locks |
| 90 | // should be grabbed. |
| 91 | // - when grabbing PAIR->value_rwlock or PAIR->disk_nb_mutex, |
| 92 | // if the acquisition will not block, then it does not matter if any other locks held, |
| 93 | // BUT if the acquisition will block, then NO other locks may be held besides |
| 94 | // PAIR->mutex. |
| 95 | // |
| 96 | // HERE ARE TWO EXAMPLES: |
| 97 | // To pin a PAIR on a client thread, the following must be done: |
| 98 | // - first grab the list lock and find the PAIR |
| 99 | // - with the list lock grabbed, grab PAIR->mutex |
| 100 | // - with PAIR->mutex held: |
| 101 | // - release list lock |
| 102 | // - pin PAIR |
| 103 | // - with PAIR pinned, grab pending_lock_cheap, |
| 104 | // - copy and clear PAIR->checkpoint_pending, |
| 105 | // - resolve checkpointing if necessary |
| 106 | // - return to user. |
//  The list lock may be held while pinning the PAIR if
//  the PAIR has no contention. Otherwise, we may get
//  a deadlock with another thread that has the PAIR pinned,
//  tries to pin some other PAIR, and in doing so, grabs the list lock.
| 111 | // |
| 112 | // To unpin a PAIR on a client thread: |
| 113 | // - because the PAIR is pinned, we don't need the pair_list's list_lock |
| 114 | // - so, simply acquire PAIR->mutex |
| 115 | // - unpin the PAIR |
| 116 | // - return |
| 117 | // |
| 118 | ////////////////////////////////////////////////////////////////////////////// |
| 119 | class evictor; |
| 120 | class pair_list; |
| 121 | |
| 122 | /////////////////////////////////////////////////////////////////////////////// |
| 123 | // |
| 124 | // Maps to a file on disk. |
| 125 | // |
struct cachefile {
    // these next two fields are protected by cachetable's list lock
    // they are managed whenever we add or remove a pair from
    // the cachetable. As of Riddler, this linked list is only used to
    // make cachetable_flush_cachefile more efficient
    PAIR cf_head; // doubly linked list that is NOT circular
    uint32_t num_pairs; // count on number of pairs in the cachetable belong to this cachefile

    bool for_checkpoint; //True if part of the in-progress checkpoint

    // If set and the cachefile closes, the file will be removed.
    // Clients must not operate on the cachefile after setting this,
    // nor attempt to open any cachefile with the same fname (dname)
    // until this cachefile has been fully closed and unlinked.
    bool unlink_on_close;
    // If set then fclose will not be logged in recovery log.
    bool skip_log_recover_on_close;
    int fd;       /* Bug: If a file is opened read-only, then it is stuck in read-only.  If it is opened read-write, then subsequent writers can write to it too. */
    CACHETABLE cachetable; // back pointer to the cachetable that owns this cachefile
    struct fileid fileid;  // unique on-disk identity (device + inode) of the underlying file
    // the filenum is used as an identifier of the cachefile
    // for logging and recovery
    FILENUM filenum;
    // number used to generate hashes for blocks in the cachefile
    // used in toku_cachetable_hash
    // this used to be the filenum.fileid, but now it is separate
    uint32_t hash_id;
    char *fname_in_env; /* Used for logging */

    // opaque client data passed to the userdata callbacks below
    void *userdata;
    void (*log_fassociate_during_checkpoint)(CACHEFILE cf, void *userdata); // When starting a checkpoint we must log all open files.
    void (*close_userdata)(CACHEFILE cf, int fd, void *userdata, bool lsnvalid, LSN); // when closing the last reference to a cachefile, first call this function.
    void (*free_userdata)(CACHEFILE cf, void *userdata); // when closing the last reference to a cachefile, first call this function.
    void (*begin_checkpoint_userdata)(LSN lsn_of_checkpoint, void *userdata); // before checkpointing cachefiles call this function.
    void (*checkpoint_userdata)(CACHEFILE cf, int fd, void *userdata); // when checkpointing a cachefile, call this function.
    void (*end_checkpoint_userdata)(CACHEFILE cf, int fd, void *userdata); // after checkpointing cachefiles call this function.
    void (*note_pin_by_checkpoint)(CACHEFILE cf, void *userdata); // add a reference to the userdata to prevent it from being removed from memory
    void (*note_unpin_by_checkpoint)(CACHEFILE cf, void *userdata); // add a reference to the userdata to prevent it from being removed from memory
    BACKGROUND_JOB_MANAGER bjm; // tracks outstanding background work (e.g. prefetches) against this cachefile
};
| 166 | |
| 167 | |
| 168 | /////////////////////////////////////////////////////////////////////////////// |
| 169 | // |
| 170 | // The pair represents the data stored in the cachetable. |
| 171 | // |
| 172 | struct ctpair { |
| 173 | // these fields are essentially constants. They do not change. |
| 174 | CACHEFILE cachefile; |
| 175 | CACHEKEY key; |
| 176 | uint32_t fullhash; |
| 177 | CACHETABLE_FLUSH_CALLBACK flush_callback; |
| 178 | CACHETABLE_PARTIAL_EVICTION_EST_CALLBACK pe_est_callback; |
| 179 | CACHETABLE_PARTIAL_EVICTION_CALLBACK pe_callback; |
| 180 | CACHETABLE_CLEANER_CALLBACK cleaner_callback; |
| 181 | CACHETABLE_CLONE_CALLBACK clone_callback; |
| 182 | CACHETABLE_CHECKPOINT_COMPLETE_CALLBACK checkpoint_complete_callback; |
| 183 | void *; |
| 184 | |
| 185 | // access to these fields are protected by disk_nb_mutex |
| 186 | void* cloned_value_data; // cloned copy of value_data used for checkpointing |
| 187 | long cloned_value_size; // size of cloned_value_data, used for accounting of size_current |
| 188 | void* disk_data; // data used to fetch/flush value_data to and from disk. |
| 189 | |
| 190 | // access to these fields are protected by value_rwlock |
| 191 | void* value_data; // data used by client threads, FTNODEs and ROLLBACK_LOG_NODEs |
| 192 | PAIR_ATTR attr; |
| 193 | enum cachetable_dirty dirty; |
| 194 | |
| 195 | // protected by PAIR->mutex |
| 196 | uint32_t count; // clock count |
| 197 | uint32_t refcount; // if > 0, then this PAIR is referenced by |
| 198 | // callers to the cachetable, and therefore cannot |
| 199 | // be evicted |
| 200 | uint32_t num_waiting_on_refs; // number of threads waiting on refcount to go to zero |
| 201 | toku_cond_t refcount_wait; // cond used to wait for refcount to go to zero |
| 202 | |
| 203 | // locks |
| 204 | toku::frwlock value_rwlock; |
| 205 | struct nb_mutex disk_nb_mutex; // single writer, protects disk_data, is used for writing cloned nodes for checkpoint |
| 206 | toku_mutex_t* mutex; // gotten from the pair list |
| 207 | |
| 208 | // Access to checkpoint_pending is protected by two mechanisms, |
| 209 | // the value_rwlock and the pair_list's pending locks (expensive and cheap). |
| 210 | // checkpoint_pending may be true of false. |
| 211 | // Here are the rules for reading/modifying this bit. |
| 212 | // - To transition this field from false to true during begin_checkpoint, |
| 213 | // we must be holding both of the pair_list's pending locks. |
| 214 | // - To transition this field from true to false during end_checkpoint, |
| 215 | // we must be holding the value_rwlock. |
| 216 | // - For a non-checkpoint thread to read the value, we must hold both the |
| 217 | // value_rwlock and one of the pair_list's pending locks |
| 218 | // - For the checkpoint thread to read the value, we must |
| 219 | // hold the value_rwlock |
| 220 | // |
| 221 | bool checkpoint_pending; // If this is on, then we have got to resolve checkpointing modifying it. |
| 222 | |
| 223 | // these are variables that are only used to transfer information to background threads |
| 224 | // we cache them here to avoid a malloc. In the future, we should investigate if this |
| 225 | // is necessary, as having these fields here is not technically necessary |
| 226 | long size_evicting_estimate; |
| 227 | evictor* ev; |
| 228 | pair_list* list; |
| 229 | |
| 230 | // A PAIR is stored in a pair_list (which happens to be PAIR->list). |
| 231 | // These variables are protected by the list lock in the pair_list |
| 232 | // |
| 233 | // clock_next,clock_prev represent a circular doubly-linked list. |
| 234 | PAIR clock_next,clock_prev; // In clock. |
| 235 | PAIR hash_chain; |
| 236 | |
| 237 | // pending_next,pending_next represent a non-circular doubly-linked list. |
| 238 | PAIR pending_next; |
| 239 | PAIR pending_prev; |
| 240 | |
| 241 | // cf_next, cf_prev represent a non-circular doubly-linked list. |
| 242 | // entries in linked list for PAIRs in a cachefile, these are protected |
| 243 | // by the list lock of the PAIR's pair_list. They are used to make |
| 244 | // cachetable_flush_cachefile cheaper so that we don't need |
| 245 | // to search the entire cachetable to find a particular cachefile's |
| 246 | // PAIRs |
| 247 | PAIR cf_next; |
| 248 | PAIR cf_prev; |
| 249 | }; |
| 250 | |
| 251 | // |
| 252 | // This initializes the fields and members of the pair. |
| 253 | // |
| 254 | void pair_init(PAIR p, |
| 255 | CACHEFILE cachefile, |
| 256 | CACHEKEY key, |
| 257 | void *value, |
| 258 | PAIR_ATTR attr, |
| 259 | enum cachetable_dirty dirty, |
| 260 | uint32_t fullhash, |
| 261 | CACHETABLE_WRITE_CALLBACK write_callback, |
| 262 | evictor *ev, |
| 263 | pair_list *list); |
| 264 | |
| 265 | |
| 266 | /////////////////////////////////////////////////////////////////////////////// |
| 267 | // |
| 268 | // The pair list maintains the set of PAIR's that make up |
| 269 | // the cachetable. |
| 270 | // |
class pair_list {
public:
    //
    // the following fields are protected by the list lock
    //
    uint32_t m_n_in_table; // number of pairs in the hash table
    uint32_t m_table_size; // number of buckets in the hash table
    uint32_t m_num_locks;  // number of mutexes in m_mutexes (PAIRs map to one via fullhash)
    PAIR *m_table; // hash table
    toku_mutex_aligned_t *m_mutexes; // array of mutexes handed out to PAIRs (PAIR->mutex)
    //
    // The following fields are the heads of various linked lists.
    // They also protected by the list lock, but their
    // usage is not as straightforward. For each of them,
    // only ONE thread is allowed iterate over them with
    // a read lock on the list lock. All other threads
    // that want to modify elements in the lists or iterate over
    // the lists must hold the write list lock. Here is the
    // association between what threads may hold a read lock
    // on the list lock while iterating:
    //  - clock_head -> eviction thread (evictor)
    //  - cleaner_head -> cleaner thread (cleaner)
    //  - pending_head -> checkpoint thread (checkpointer)
    //
    PAIR m_clock_head; // of clock . head is the next thing to be up for decrement.
    PAIR m_cleaner_head; // for cleaner thread. head is the next thing to look at for possible cleaning.
    PAIR m_checkpoint_head; // for begin checkpoint to iterate over PAIRs and mark as pending_checkpoint
    PAIR m_pending_head; // list of pairs marked with checkpoint_pending

    // this field is public so we are still POD

    // usage of this lock is described above
    toku_pthread_rwlock_t m_list_lock;
    //
    // these locks are the "pending locks" referenced
    // in comments about PAIR->checkpoint_pending. There
    // are two of them, but both serve the same purpose, which
    // is to protect the transition of a PAIR's checkpoint pending
    // value from false to true during begin_checkpoint.
    // We use two locks, because threads that want to read the
    // checkpoint_pending value may hold a lock for varying periods of time.
    // Threads running eviction may need to protect checkpoint_pending
    // while writing a node to disk, which is an expensive operation,
    // so it uses pending_lock_expensive. Client threads that
    // want to pin PAIRs will want to protect checkpoint_pending
    // just long enough to read the value and wipe it out. This is
    // a cheap operation, and as a result, uses pending_lock_cheap.
    //
    // By having two locks, and making begin_checkpoint first
    // grab pending_lock_expensive and then pending_lock_cheap,
    // we ensure that threads that want to pin nodes can grab
    // only pending_lock_cheap, and never block behind threads
    // holding pending_lock_expensive and writing a node out to disk
    //
    toku_pthread_rwlock_t m_pending_lock_expensive;
    toku_pthread_rwlock_t m_pending_lock_cheap;
    void init();
    void destroy();
    // eviction helpers: remove a PAIR from all structures (completely),
    // or just from the cachetable hash/clock, or just from its cachefile list
    void evict_completely(PAIR pair);
    void evict_from_cachetable(PAIR pair);
    void evict_from_cachefile(PAIR pair);
    void add_to_cachetable_only(PAIR p);
    void put(PAIR pair); // insert a fully-initialized PAIR into all lists
    PAIR find_pair(CACHEFILE file, CACHEKEY key, uint32_t hash); // lookup by (file, key); caller holds list lock
    void pending_pairs_remove (PAIR p);
    void verify();  // debugging: check invariants of the lists/table
    void get_state(int *num_entries, int *hash_size);
    // lock/unlock wrappers for m_list_lock and the two pending locks
    void read_list_lock();
    void read_list_unlock();
    void write_list_lock();
    void write_list_unlock();
    void read_pending_exp_lock();
    void read_pending_exp_unlock();
    void write_pending_exp_lock();
    void write_pending_exp_unlock();
    void read_pending_cheap_lock();
    void read_pending_cheap_unlock();
    void write_pending_cheap_lock();
    void write_pending_cheap_unlock();
    // PAIR-mutex access keyed by fullhash (maps into m_mutexes)
    toku_mutex_t* get_mutex_for_pair(uint32_t fullhash);
    void pair_lock_by_fullhash(uint32_t fullhash);
    void pair_unlock_by_fullhash(uint32_t fullhash);

private:
    void pair_remove (PAIR p);
    void remove_from_hash_chain(PAIR p);
    void add_to_cf_list (PAIR p);
    void add_to_clock (PAIR p);
    void add_to_hash_chain(PAIR p);
};
| 361 | |
| 362 | /////////////////////////////////////////////////////////////////////////////// |
| 363 | // |
| 364 | // Wrapper for the head of our cachefile list. |
| 365 | // |
class cachefile_list {
public:
    void init();
    void destroy();
    // lock/unlock wrappers for m_lock
    void read_lock();
    void read_unlock();
    void write_lock();
    void write_unlock();
    // lookups; return 0 on success and set *cf
    int cachefile_of_iname_in_env(const char *iname_in_env, CACHEFILE *cf);
    int cachefile_of_filenum(FILENUM filenum, CACHEFILE *cf);
    // "_unlocked" methods require the caller to already hold m_lock
    void add_cf_unlocked(CACHEFILE newcf);
    void add_stale_cf(CACHEFILE newcf);
    void remove_cf(CACHEFILE cf);
    void remove_stale_cf_unlocked(CACHEFILE cf);
    FILENUM reserve_filenum(); // hand out a filenum not currently in use
    uint32_t get_new_hash_id_unlocked();
    CACHEFILE find_cachefile_unlocked(struct fileid* fileid);
    CACHEFILE find_stale_cachefile_unlocked(struct fileid* fileid);
    void verify_unused_filenum(FILENUM filenum);
    // stale cachefiles hold evictable data; these let the evictor reclaim it
    bool evict_some_stale_pair(evictor* ev);
    void free_stale_data(evictor* ev);
    // access to these fields are protected by the lock
    FILENUM m_next_filenum_to_use;
    uint32_t m_next_hash_id_to_use;
    toku_pthread_rwlock_t m_lock; // this field is public so we are still POD
    toku::omt<CACHEFILE> m_active_filenum;
    toku::omt<CACHEFILE> m_active_fileid;
    toku::omt<CACHEFILE> m_stale_fileid;
private:
    CACHEFILE find_cachefile_in_list_unlocked(CACHEFILE start, struct fileid* fileid);
};
| 397 | |
| 398 | |
| 399 | /////////////////////////////////////////////////////////////////////////////// |
| 400 | // |
| 401 | // The checkpointer handles starting and finishing checkpoints of the |
| 402 | // cachetable's data. |
| 403 | // |
class checkpointer {
public:
    int init(pair_list *_pl, TOKULOGGER _logger, evictor *_ev, cachefile_list *files);
    void destroy();
    // period controls how often the minicron thread fires a checkpoint
    void set_checkpoint_period(uint32_t new_period);
    uint32_t get_checkpoint_period();
    int shutdown();
    bool has_been_shutdown();
    void begin_checkpoint();
    // background-job accounting used to coordinate with client threads
    void add_background_job();
    void remove_background_job();
    void end_checkpoint(void (*testcallback_f)(void*),  void* );
    TOKULOGGER get_logger();
    // used during begin_checkpoint
    void increment_num_txns();
private:
    uint32_t m_checkpoint_num_txns;   // how many transactions are in the checkpoint
    TOKULOGGER m_logger;
    LSN m_lsn_of_checkpoint_in_progress;
    uint32_t m_checkpoint_num_files; // how many cachefiles are in the checkpoint
    struct minicron m_checkpointer_cron; // the periodic checkpointing thread
    cachefile_list *m_cf_list;
    pair_list *m_list;
    evictor *m_ev;
    bool m_checkpointer_cron_init; // true once the minicron thread has been started
    bool m_checkpointer_init;      // true once init() has completed

    // variable used by the checkpoint thread to know
    // when all work induced by cloning on client threads is done
    BACKGROUND_JOB_MANAGER m_checkpoint_clones_bjm;
    // private methods for begin_checkpoint
    void update_cachefiles();
    void log_begin_checkpoint();
    void turn_on_pending_bits();
    // private methods for end_checkpoint
    void fill_checkpoint_cfs(CACHEFILE* checkpoint_cfs);
    void checkpoint_pending_pairs();
    void checkpoint_userdata(CACHEFILE* checkpoint_cfs);
    void log_end_checkpoint();
    void end_checkpoint_userdata(CACHEFILE* checkpoint_cfs);
    void remove_cachefiles(CACHEFILE* checkpoint_cfs);

    // Unit test struct needs access to private members.
    friend struct checkpointer_test;
};
| 449 | |
| 450 | // |
| 451 | // This is how often we want the eviction thread |
| 452 | // to run, in seconds. |
| 453 | // |
| 454 | const int EVICTION_PERIOD = 1; |
| 455 | |
| 456 | /////////////////////////////////////////////////////////////////////////////// |
| 457 | // |
| 458 | // The evictor handles the removal of pairs from the pair list/cachetable. |
| 459 | // |
class evictor {
public:
    int init(long _size_limit, pair_list* _pl, cachefile_list* _cf_list, KIBBUTZ _kibbutz, uint32_t eviction_period);
    void destroy();
    // size accounting: called when PAIRs are added/removed/resized
    void add_pair_attr(PAIR_ATTR attr);
    void remove_pair_attr(PAIR_ATTR attr);
    void change_pair_attr(PAIR_ATTR old_attr, PAIR_ATTR new_attr);
    void add_cloned_data_size(long size);
    void remove_cloned_data_size(long size);
    // carve out memory from the cachetable budget (e.g. for the loader)
    uint64_t reserve_memory(double fraction, uint64_t upper_bound);
    void release_reserved_memory(uint64_t reserved_memory);
    void run_eviction_thread();
    void do_partial_eviction(PAIR p);
    void evict_pair(PAIR p, bool checkpoint_pending);
    void wait_for_cache_pressure_to_subside();
    void signal_eviction_thread();
    void signal_eviction_thread_locked();
    // flow control: whether client threads should sleep / wake the evictor
    bool should_client_thread_sleep();
    bool should_client_wake_eviction_thread();
    // function needed for testing
    void get_state(long *size_current_ptr, long *size_limit_ptr);
    void fill_engine_status();
    void set_enable_partial_eviction(bool enabled);
    bool get_enable_partial_eviction(void) const;
private:
    void add_to_size_current(long size);
    void remove_from_size_current(long size);
    void run_eviction();
    bool run_eviction_on_pair(PAIR p);
    void try_evict_pair(PAIR p);
    void decrease_size_evicting(long size_evicting_estimate);
    bool should_sleeping_clients_wakeup();
    bool eviction_needed();

    // We have some intentional races with these variables because we're ok with reading something a little bit old.
    // Provide some hooks for reading variables in an unsafe way so that there are function names we can stick in a valgrind suppression.
    int64_t unsafe_read_size_current(void) const;
    int64_t unsafe_read_size_evicting(void) const;

    pair_list* m_pl;
    cachefile_list* m_cf_list;
    int64_t m_size_current;            // the sum of the sizes of the pairs in the cachetable
    int64_t m_size_cloned_data;        // stores amount of cloned data we have, only used for engine status
    // changes to these two values are protected
    // by ev_thread_lock
    int64_t m_size_reserved;           // How much memory is reserved (e.g., by the loader)
    int64_t m_size_evicting;           // the sum of the sizes of the pairs being written

    // these are constants
    int64_t m_low_size_watermark; // target max size of cachetable that eviction thread aims for
    int64_t m_low_size_hysteresis; // if cachetable grows to this size, client threads wake up eviction thread upon adding data
    int64_t m_high_size_watermark; // if cachetable grows to this size, client threads sleep upon adding data
    int64_t m_high_size_hysteresis; // if > cachetable size, then sleeping client threads may wake up

    bool m_enable_partial_eviction; // true if partial evictions are permitted

    // used to calculate random numbers
    struct random_data m_random_data;
    char m_random_statebuf[64];

    // mutex that protects fields listed immediately below
    toku_mutex_t m_ev_thread_lock;
    // the eviction thread
    toku_pthread_t m_ev_thread;
    // condition variable that controls the sleeping period
    // of the eviction thread
    toku_cond_t m_ev_thread_cond;
    // number of client threads that are currently sleeping
    // due to an over-subscribed cachetable
    uint32_t m_num_sleepers;
    // states if the eviction thread should run. set to true
    // in init, set to false during destroy
    bool m_run_thread;
    // bool that states if the eviction thread is currently running
    bool m_ev_thread_is_running;
    // period which the eviction thread sleeps
    uint32_t m_period_in_seconds;
    // condition variable on which client threads wait on when sleeping
    // due to an over-subscribed cachetable
    toku_cond_t m_flow_control_cond;

    // variables for engine status
    PARTITIONED_COUNTER m_size_nonleaf;
    PARTITIONED_COUNTER m_size_leaf;
    PARTITIONED_COUNTER m_size_rollback;
    PARTITIONED_COUNTER m_size_cachepressure;
    PARTITIONED_COUNTER m_wait_pressure_count;
    PARTITIONED_COUNTER m_wait_pressure_time;
    PARTITIONED_COUNTER m_long_wait_pressure_count;
    PARTITIONED_COUNTER m_long_wait_pressure_time;

    KIBBUTZ m_kibbutz; // worker-thread pool used to write pairs in the background

    // this variable is ONLY used for testing purposes
    uint64_t m_num_eviction_thread_runs;

    bool m_ev_thread_init; // true once the eviction thread has been started
    bool m_evictor_init;   // true once init() has completed

    friend class evictor_test_helpers;
    friend class evictor_unit_test;
};
| 562 | |
| 563 | /////////////////////////////////////////////////////////////////////////////// |
| 564 | // |
| 565 | // Iterates over the clean head in the pair list, calling the cleaner |
| 566 | // callback on each node in that list. |
| 567 | // |
class cleaner {
public:
    int init(uint32_t cleaner_iterations, pair_list* _pl, CACHETABLE _ct);
    void destroy(void);
    // iterations = how many PAIRs get the cleaner callback per period
    uint32_t get_iterations(void);
    void set_iterations(uint32_t new_iterations);
    // period = how often (seconds) the cleaner minicron fires
    uint32_t get_period_unlocked(void);
    void set_period(uint32_t new_period);
    // one pass: walk from m_cleaner_head and run cleaner callbacks
    int run_cleaner(void);

private:
    pair_list* m_pl;
    CACHETABLE m_ct;
    struct minicron m_cleaner_cron; // the periodic cleaner thread
    uint32_t m_cleaner_iterations; // how many times to run the cleaner per
                                   // cleaner period (minicron has a
                                   // minimum period of 1s so if you want
                                   // more frequent cleaner runs you must
                                   // use this)
    bool m_cleaner_cron_init; // true once the minicron thread has been started
    bool m_cleaner_init;      // true once init() has completed
};
| 590 | |
| 591 | /////////////////////////////////////////////////////////////////////////////// |
| 592 | // |
| 593 | // The cachetable is as close to an ENV as we get. |
| 594 | // |
struct cachetable {
    // the major subsystems described at the top of this file
    pair_list list;        // all PAIRs (hash table + clock/cleaner/pending lists)
    cleaner cl;            // periodic cleaner thread
    evictor ev;            // memory accounting and eviction
    checkpointer cp;       // checkpoint coordination
    cachefile_list cf_list; // all open cachefiles

    KIBBUTZ client_kibbutz; // pool of worker threads and jobs to do asynchronously for the client.
    KIBBUTZ ct_kibbutz; // pool of worker threads and jobs to do asynchronously for the cachetable
    KIBBUTZ checkpointing_kibbutz; // small pool for checkpointing cloned pairs

    char *env_dir; // environment directory used to resolve fname_in_env paths
};
| 608 | |