| 1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ |
| 2 | // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: |
| 3 | #ident "$Id$" |
| 4 | /*====== |
| 5 | This file is part of PerconaFT. |
| 6 | |
| 7 | |
| 8 | Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. |
| 9 | |
| 10 | PerconaFT is free software: you can redistribute it and/or modify |
| 11 | it under the terms of the GNU General Public License, version 2, |
| 12 | as published by the Free Software Foundation. |
| 13 | |
| 14 | PerconaFT is distributed in the hope that it will be useful, |
| 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 17 | GNU General Public License for more details. |
| 18 | |
| 19 | You should have received a copy of the GNU General Public License |
| 20 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
| 21 | |
| 22 | ---------------------------------------- |
| 23 | |
| 24 | PerconaFT is free software: you can redistribute it and/or modify |
| 25 | it under the terms of the GNU Affero General Public License, version 3, |
| 26 | as published by the Free Software Foundation. |
| 27 | |
| 28 | PerconaFT is distributed in the hope that it will be useful, |
| 29 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 30 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 31 | GNU Affero General Public License for more details. |
| 32 | |
| 33 | You should have received a copy of the GNU Affero General Public License |
| 34 | along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. |
| 35 | ======= */ |
| 36 | |
| 37 | #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." |
| 38 | |
| 39 | #pragma once |
| 40 | |
| 41 | #include <vector> |
| 42 | |
| 43 | #include "portability/memory.h" |
| 44 | #include "portability/toku_portability.h" |
| 45 | #include "portability/toku_race_tools.h" |
| 46 | #include "portability/toku_stdint.h" |
| 47 | |
| 48 | #include "ft/serialize/wbuf.h" |
| 49 | #include "util/growable_array.h" |
| 50 | #include "util/mempool.h" |
| 51 | |
| 52 | namespace toku { |
| 53 | typedef uint32_t node_offset; |
| 54 | |
| 55 | |
| 56 | /** |
| 57 | * Dynamic Order Maintenance Tree (DMT) |
| 58 | * |
| 59 | * Maintains a collection of totally ordered values, where each value has weight 1. |
| 60 | * A DMT supports variable sized values. |
| 61 | * The DMT is a mutable datatype. |
| 62 | * |
| 63 | * The Abstraction: |
| 64 | * |
| 65 | * An DMT is a vector of values, $V$, where $|V|$ is the length of the vector. |
| 66 | * The vector is numbered from $0$ to $|V|-1$. |
| 67 | * |
| 68 | * We can create a new DMT, which is the empty vector. |
| 69 | * |
| 70 | * We can insert a new element $x$ into slot $i$, changing $V$ into $V'$ where |
| 71 | * $|V'|=1+|V|$ and |
| 72 | * |
| 73 | * V'_j = V_j if $j<i$ |
| 74 | * x if $j=i$ |
| 75 | * V_{j-1} if $j>i$. |
| 76 | * |
| 77 | * We can specify $i$ using a kind of function instead of as an integer. |
| 78 | * Let $b$ be a function mapping from values to nonzero integers, such that |
| 79 | * the signum of $b$ is monotically increasing. |
| 80 | * We can specify $i$ as the minimum integer such that $b(V_i)>0$. |
| 81 | * |
| 82 | * We look up a value using its index, or using a Heaviside function. |
| 83 | * For lookups, we allow $b$ to be zero for some values, and again the signum of $b$ must be monotonically increasing. |
| 84 | * When lookup up values, we can look up |
| 85 | * $V_i$ where $i$ is the minimum integer such that $b(V_i)=0$. (With a special return code if no such value exists.) |
| 86 | * (Rationale: Ordinarily we want $i$ to be unique. But for various reasons we want to allow multiple zeros, and we want the smallest $i$ in that case.) |
| 87 | * $V_i$ where $i$ is the minimum integer such that $b(V_i)>0$. (Or an indication that no such value exists.) |
| 88 | * $V_i$ where $i$ is the maximum integer such that $b(V_i)<0$. (Or an indication that no such value exists.) |
| 89 | * |
| 90 | * When looking up a value using a Heaviside function, we get the value and its index. |
| 91 | * |
| 92 | * Performance: |
| 93 | * Insertion and deletion should run with $O(\log |V|)$ time and $O(\log |V|)$ calls to the Heaviside function. |
| 94 | * The memory required is O(|V|). |
| 95 | * |
| 96 | * Usage: |
| 97 | * The dmt is templated by three parameters: |
| 98 | * - dmtdata_t is what will be stored within the dmt. These could be pointers or real data types (ints, structs). |
| 99 | * - dmtdataout_t is what will be returned by find and related functions. By default, it is the same as dmtdata_t, but you can set it to (dmtdata_t *). |
| 100 | * - dmtwriter_t is a class that effectively handles (de)serialization between the value stored in the dmt and outside the dmt. |
| 101 | * To create an dmt which will store "TXNID"s, for example, it is a good idea to typedef the template: |
| 102 | * typedef dmt<TXNID, TXNID, txnid_writer_t> txnid_dmt_t; |
| 103 | * If you are storing structs (or you want to edit what is stored), you may want to be able to get a pointer to the data actually stored in the dmt (see find_zero). To do this, use the second template parameter: |
| 104 | * typedef dmt<struct foo, struct foo *, foo_writer_t> foo_dmt_t; |
| 105 | */ |
| 106 | |
| 107 | namespace dmt_internal { |
| 108 | |
| 109 | class subtree { |
| 110 | private: |
| 111 | uint32_t m_index; |
| 112 | public: |
| 113 | // The maximum mempool size for a dmt is 2**32-2 |
| 114 | static const uint32_t NODE_NULL = UINT32_MAX; |
| 115 | inline void set_to_null(void) { |
| 116 | m_index = NODE_NULL; |
| 117 | } |
| 118 | |
| 119 | inline bool is_null(void) const { |
| 120 | return NODE_NULL == this->get_offset(); |
| 121 | } |
| 122 | |
| 123 | inline node_offset get_offset(void) const { |
| 124 | return m_index; |
| 125 | } |
| 126 | |
| 127 | inline void set_offset(node_offset index) { |
| 128 | paranoid_invariant(index != NODE_NULL); |
| 129 | m_index = index; |
| 130 | } |
| 131 | } __attribute__((__packed__,__aligned__(4))); |
| 132 | |
| 133 | template<typename dmtdata_t> |
| 134 | class dmt_node_templated { |
| 135 | public: |
| 136 | uint32_t weight; |
| 137 | subtree left; |
| 138 | subtree right; |
| 139 | uint32_t value_length; |
| 140 | dmtdata_t value; |
| 141 | } __attribute__((__aligned__(4))); //NOTE: we cannot use attribute packed or dmtdata_t will call copy constructors (dmtdata_t might not be packed by default) |
| 142 | |
| 143 | } |
| 144 | |
| 145 | using namespace toku::dmt_internal; |
| 146 | |
| 147 | // Each data type used in a dmt requires a dmt_writer class (allows you to insert/etc with dynamic sized types). |
| 148 | // A dmt_writer can be thought of a (de)serializer |
| 149 | // There is no default implementation. |
| 150 | // A dmtwriter instance handles reading/writing 'dmtdata_t's to/from the dmt. |
| 151 | // The class must implement the following functions: |
| 152 | // The size required in a dmt for the dmtdata_t represented: |
| 153 | // size_t get_size(void) const; |
| 154 | // Write the dmtdata_t to memory owned by a dmt: |
| 155 | // void write_to(dmtdata_t *const dest) const; |
| 156 | // Constructor (others are allowed, but this one is required) |
| 157 | // dmtwriter(const uint32_t dmtdata_t_len, dmtdata_t *const src) |
| 158 | |
| 159 | template<typename dmtdata_t, |
| 160 | typename dmtdataout_t, |
| 161 | typename dmtwriter_t |
| 162 | > |
| 163 | class dmt { |
| 164 | private: |
| 165 | typedef dmt_node_templated<dmtdata_t> dmt_node; |
| 166 | |
| 167 | public: |
| 168 | static const uint8_t ALIGNMENT = 4; |
| 169 | |
| 170 | class builder { |
| 171 | public: |
| 172 | void append(const dmtwriter_t &value); |
| 173 | |
| 174 | // Create a dmt builder to build a dmt that will have at most n_values values and use |
| 175 | // at most n_value_bytes bytes in the mempool to store values (not counting node or alignment overhead). |
| 176 | void create(uint32_t n_values, uint32_t n_value_bytes); |
| 177 | |
| 178 | bool value_length_is_fixed(void); |
| 179 | |
| 180 | // Constructs a dmt that contains everything that was append()ed to this builder. |
| 181 | // Destroys this builder and frees associated memory. |
| 182 | void build(dmt<dmtdata_t, dmtdataout_t, dmtwriter_t> *dest); |
| 183 | private: |
| 184 | uint32_t max_values; |
| 185 | uint32_t max_value_bytes; |
| 186 | node_offset *sorted_node_offsets; |
| 187 | bool temp_valid; |
| 188 | dmt<dmtdata_t, dmtdataout_t, dmtwriter_t> temp; |
| 189 | }; |
| 190 | |
| 191 | /** |
| 192 | * Effect: Create an empty DMT. |
| 193 | * Performance: constant time. |
| 194 | */ |
| 195 | void create(void); |
| 196 | |
| 197 | /** |
| 198 | * Effect: Create a DMT containing values. The number of values is in numvalues. |
| 199 | * Each value is of a fixed (at runtime) length. |
| 200 | * mem contains the values in packed form (no alignment padding) |
| 201 | * Caller retains ownership of mem. |
| 202 | * Requires: this has not been created yet |
| 203 | * Rationale: Normally to insert N values takes O(N lg N) amortized time. |
| 204 | * If the N values are known in advance, are sorted, and |
| 205 | * the structure is empty, we can batch insert them much faster. |
| 206 | */ |
| 207 | __attribute__((nonnull)) |
| 208 | void create_from_sorted_memory_of_fixed_size_elements( |
| 209 | const void *mem, |
| 210 | const uint32_t numvalues, |
| 211 | const uint32_t mem_length, |
| 212 | const uint32_t fixed_value_length); |
| 213 | |
| 214 | /** |
| 215 | * Effect: Creates a copy of an dmt. |
| 216 | * Creates this as the clone. |
| 217 | * Each element is copied directly. If they are pointers, the underlying data is not duplicated. |
| 218 | * Performance: O(memory) (essentially a memdup) |
| 219 | * The underlying structures are memcpy'd. Only the values themselves are copied (shallow copy) |
| 220 | */ |
| 221 | void clone(const dmt &src); |
| 222 | |
| 223 | /** |
| 224 | * Effect: Set the tree to be empty. |
| 225 | * Note: Will not reallocate or resize any memory. |
| 226 | * Note: If this dmt had variable sized elements, it will start tracking again (until it gets values of two different sizes) |
| 227 | * Performance: time=O(1) |
| 228 | */ |
| 229 | void clear(void); |
| 230 | |
| 231 | /** |
| 232 | * Effect: Destroy an DMT, freeing all its memory. |
| 233 | * If the values being stored are pointers, their underlying data is not freed. |
| 234 | * Those values may be freed before or after calling ::destroy() |
| 235 | * Rationale: Returns no values since free() cannot fail. |
| 236 | * Rationale: Does not free the underlying pointers to reduce complexity/maintain abstraction layer |
| 237 | * Performance: time=O(1) |
| 238 | */ |
| 239 | void destroy(void); |
| 240 | |
| 241 | /** |
| 242 | * Effect: return |this| (number of values stored in this dmt). |
| 243 | * Performance: time=O(1) |
| 244 | */ |
| 245 | uint32_t size(void) const; |
| 246 | |
| 247 | /** |
| 248 | * Effect: Serialize all values contained in this dmt into a packed form (no alignment padding). |
| 249 | * We serialized to wb. expected_unpadded_memory is the size of memory reserved in the wbuf |
| 250 | * for serialization. (We assert that serialization requires exactly the expected amount) |
| 251 | * Requires: |
| 252 | * ::prepare_for_serialize() has been called and no non-const functions have been called since. |
| 253 | * This dmt has fixed-length values and is in array form. |
| 254 | * Performance: |
| 255 | * O(memory) |
| 256 | */ |
| 257 | void serialize_values(uint32_t expected_unpadded_memory, struct wbuf *wb) const; |
| 258 | |
| 259 | /** |
| 260 | * Effect: Insert value into the DMT. |
| 261 | * If there is some i such that $h(V_i, v)=0$ then returns DB_KEYEXIST. |
| 262 | * Otherwise, let i be the minimum value such that $h(V_i, v)>0$. |
| 263 | * If no such i exists, then let i be |V| |
| 264 | * Then this has the same effect as |
| 265 | * insert_at(tree, value, i); |
| 266 | * If idx!=NULL then i is stored in *idx |
| 267 | * Requires: The signum of h must be monotonically increasing. |
| 268 | * Returns: |
| 269 | * 0 success |
| 270 | * DB_KEYEXIST the key is present (h was equal to zero for some value) |
| 271 | * On nonzero return, dmt is unchanged. |
| 272 | * Performance: time=O(\log N) amortized. |
| 273 | * Rationale: Some future implementation may be O(\log N) worst-case time, but O(\log N) amortized is good enough for now. |
| 274 | */ |
| 275 | template<typename dmtcmp_t, int (*h)(const uint32_t size, const dmtdata_t &, const dmtcmp_t &)> |
| 276 | int insert(const dmtwriter_t &value, const dmtcmp_t &v, uint32_t *const idx); |
| 277 | |
| 278 | /** |
| 279 | * Effect: Increases indexes of all items at slot >= idx by 1. |
| 280 | * Insert value into the position at idx. |
| 281 | * Returns: |
| 282 | * 0 success |
| 283 | * EINVAL if idx > this->size() |
| 284 | * On error, dmt is unchanged. |
| 285 | * Performance: time=O(\log N) amortized time. |
| 286 | * Rationale: Some future implementation may be O(\log N) worst-case time, but O(\log N) amortized is good enough for now. |
| 287 | */ |
| 288 | int insert_at(const dmtwriter_t &value, const uint32_t idx); |
| 289 | |
| 290 | /** |
| 291 | * Effect: Delete the item in slot idx. |
| 292 | * Decreases indexes of all items at slot > idx by 1. |
| 293 | * Returns |
| 294 | * 0 success |
| 295 | * EINVAL if idx>=this->size() |
| 296 | * On error, dmt is unchanged. |
| 297 | * Rationale: To delete an item, first find its index using find or find_zero, then delete it. |
| 298 | * Performance: time=O(\log N) amortized. |
| 299 | */ |
| 300 | int delete_at(const uint32_t idx); |
| 301 | |
| 302 | /** |
| 303 | * Effect: Iterate over the values of the dmt, from left to right, calling f on each value. |
| 304 | * The first argument passed to f is a ref-to-const of the value stored in the dmt. |
| 305 | * The second argument passed to f is the index of the value. |
| 306 | * The third argument passed to f is iterate_extra. |
| 307 | * The indices run from 0 (inclusive) to this->size() (exclusive). |
| 308 | * Requires: f != NULL |
| 309 | * Returns: |
| 310 | * If f ever returns nonzero, then the iteration stops, and the value returned by f is returned by iterate. |
| 311 | * If f always returns zero, then iterate returns 0. |
| 312 | * Requires: Don't modify the dmt while running. (E.g., f may not insert or delete values from the dmt.) |
| 313 | * Performance: time=O(i+\log N) where i is the number of times f is called, and N is the number of elements in the dmt. |
| 314 | * Rationale: Although the functional iterator requires defining another function (as opposed to C++ style iterator), it is much easier to read. |
| 315 | * Rationale: We may at some point use functors, but for now this is a smaller change from the old DMT. |
| 316 | */ |
| 317 | template<typename iterate_extra_t, |
| 318 | int (*f)(const uint32_t, const dmtdata_t &, const uint32_t, iterate_extra_t *const)> |
| 319 | int iterate(iterate_extra_t *const ) const; |
| 320 | |
| 321 | /** |
| 322 | * Effect: Iterate over the values of the dmt, from left to right, calling f on each value. |
| 323 | * The first argument passed to f is a ref-to-const of the value stored in the dmt. |
| 324 | * The second argument passed to f is the index of the value. |
| 325 | * The third argument passed to f is iterate_extra. |
| 326 | * The indices run from 0 (inclusive) to this->size() (exclusive). |
| 327 | * We will iterate only over [left,right) |
| 328 | * |
| 329 | * Requires: left <= right |
| 330 | * Requires: f != NULL |
| 331 | * Returns: |
| 332 | * EINVAL if right > this->size() |
| 333 | * If f ever returns nonzero, then the iteration stops, and the value returned by f is returned by iterate_on_range. |
| 334 | * If f always returns zero, then iterate_on_range returns 0. |
| 335 | * Requires: Don't modify the dmt while running. (E.g., f may not insert or delete values from the dmt.) |
| 336 | * Performance: time=O(i+\log N) where i is the number of times f is called, and N is the number of elements in the dmt. |
| 337 | * Rational: Although the functional iterator requires defining another function (as opposed to C++ style iterator), it is much easier to read. |
| 338 | */ |
| 339 | template<typename iterate_extra_t, |
| 340 | int (*f)(const uint32_t, const dmtdata_t &, const uint32_t, iterate_extra_t *const)> |
| 341 | int iterate_on_range(const uint32_t left, const uint32_t right, iterate_extra_t *const ) const; |
| 342 | |
| 343 | // Attempt to verify this dmt is well formed. (Crashes/asserts/aborts if not well formed) |
| 344 | void verify(void) const; |
| 345 | |
| 346 | /** |
| 347 | * Effect: Iterate over the values of the dmt, from left to right, calling f on each value. |
| 348 | * The first argument passed to f is a pointer to the value stored in the dmt. |
| 349 | * The second argument passed to f is the index of the value. |
| 350 | * The third argument passed to f is iterate_extra. |
| 351 | * The indices run from 0 (inclusive) to this->size() (exclusive). |
| 352 | * Requires: same as for iterate() |
| 353 | * Returns: same as for iterate() |
| 354 | * Performance: same as for iterate() |
| 355 | * Rationale: In general, most iterators should use iterate() since they should not modify the data stored in the dmt. This function is for iterators which need to modify values (for example, free_items). |
| 356 | * Rationale: We assume if you are transforming the data in place, you want to do it to everything at once, so there is not yet an iterate_on_range_ptr (but there could be). |
| 357 | */ |
| 358 | template<typename iterate_extra_t, |
| 359 | int (*f)(const uint32_t, dmtdata_t *, const uint32_t, iterate_extra_t *const)> |
| 360 | void iterate_ptr(iterate_extra_t *const ); |
| 361 | |
| 362 | /** |
| 363 | * Effect: Set *value=V_idx |
| 364 | * Returns |
| 365 | * 0 success |
| 366 | * EINVAL if index>=toku_dmt_size(dmt) |
| 367 | * On nonzero return, *value is unchanged |
| 368 | * Performance: time=O(\log N) |
| 369 | */ |
| 370 | int fetch(const uint32_t idx, uint32_t *const value_size, dmtdataout_t *const value) const; |
| 371 | |
| 372 | /** |
| 373 | * Effect: Find the smallest i such that h(V_i, extra)>=0 |
| 374 | * If there is such an i and h(V_i,extra)==0 then set *idxp=i, set *value = V_i, and return 0. |
| 375 | * If there is such an i and h(V_i,extra)>0 then set *idxp=i and return DB_NOTFOUND. |
| 376 | * If there is no such i then set *idx=this->size() and return DB_NOTFOUND. |
| 377 | * Note: value is of type dmtdataout_t, which may be of type (dmtdata_t) or (dmtdata_t *) but is fixed by the instantiation. |
| 378 | * If it is the value type, then the value is copied out (even if the value type is a pointer to something else) |
| 379 | * If it is the pointer type, then *value is set to a pointer to the data within the dmt. |
| 380 | * This is determined by the type of the dmt as initially declared. |
| 381 | * If the dmt is declared as dmt<foo_t>, then foo_t's will be stored and foo_t's will be returned by find and related functions. |
| 382 | * If the dmt is declared as dmt<foo_t, foo_t *>, then foo_t's will be stored, and pointers to the stored items will be returned by find and related functions. |
| 383 | * Rationale: |
| 384 | * Structs too small for malloc should be stored directly in the dmt. |
| 385 | * These structs may need to be edited as they exist inside the dmt, so we need a way to get a pointer within the dmt. |
| 386 | * Using separate functions for returning pointers and values increases code duplication and reduces type-checking. |
| 387 | * That also reduces the ability of the creator of a data structure to give advice to its future users. |
| 388 | * Slight overloading in this case seemed to provide a better API and better type checking. |
| 389 | */ |
| 390 | template<typename dmtcmp_t, |
| 391 | int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)> |
| 392 | int find_zero(const dmtcmp_t &, uint32_t *const value_size, dmtdataout_t *const value, uint32_t *const idxp) const; |
| 393 | |
| 394 | /** |
| 395 | * Effect: |
| 396 | * If direction >0 then find the smallest i such that h(V_i,extra)>0. |
| 397 | * If direction <0 then find the largest i such that h(V_i,extra)<0. |
| 398 | * (Direction may not be equal to zero.) |
| 399 | * If value!=NULL then store V_i in *value |
| 400 | * If idxp!=NULL then store i in *idxp. |
| 401 | * Requires: The signum of h is monotically increasing. |
| 402 | * Returns |
| 403 | * 0 success |
| 404 | * DB_NOTFOUND no such value is found. |
| 405 | * On nonzero return, *value and *idxp are unchanged |
| 406 | * Performance: time=O(\log N) |
| 407 | * Rationale: |
| 408 | * Here's how to use the find function to find various things |
| 409 | * Cases for find: |
| 410 | * find first value: ( h(v)=+1, direction=+1 ) |
| 411 | * find last value ( h(v)=-1, direction=-1 ) |
| 412 | * find first X ( h(v)=(v< x) ? -1 : 1 direction=+1 ) |
| 413 | * find last X ( h(v)=(v<=x) ? -1 : 1 direction=-1 ) |
| 414 | * find X or successor to X ( same as find first X. ) |
| 415 | * |
| 416 | * Rationale: To help understand heaviside functions and behavor of find: |
| 417 | * There are 7 kinds of heaviside functions. |
| 418 | * The signus of the h must be monotonically increasing. |
| 419 | * Given a function of the following form, A is the element |
| 420 | * returned for direction>0, B is the element returned |
| 421 | * for direction<0, C is the element returned for |
| 422 | * direction==0 (see find_zero) (with a return of 0), and D is the element |
| 423 | * returned for direction==0 (see find_zero) with a return of DB_NOTFOUND. |
| 424 | * If any of A, B, or C are not found, then asking for the |
| 425 | * associated direction will return DB_NOTFOUND. |
| 426 | * See find_zero for more information. |
| 427 | * |
| 428 | * Let the following represent the signus of the heaviside function. |
| 429 | * |
| 430 | * -...- |
| 431 | * A |
| 432 | * D |
| 433 | * |
| 434 | * +...+ |
| 435 | * B |
| 436 | * D |
| 437 | * |
| 438 | * 0...0 |
| 439 | * C |
| 440 | * |
| 441 | * -...-0...0 |
| 442 | * AC |
| 443 | * |
| 444 | * 0...0+...+ |
| 445 | * C B |
| 446 | * |
| 447 | * -...-+...+ |
| 448 | * AB |
| 449 | * D |
| 450 | * |
| 451 | * -...-0...0+...+ |
| 452 | * AC B |
| 453 | */ |
| 454 | template<typename dmtcmp_t, |
| 455 | int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)> |
| 456 | int find(const dmtcmp_t &, int direction, uint32_t *const value_size, dmtdataout_t *const value, uint32_t *const idxp) const; |
| 457 | |
| 458 | /** |
| 459 | * Effect: Return the size (in bytes) of the dmt, as it resides in main memory. |
| 460 | * If the data stored are pointers, don't include the size of what they all point to. |
| 461 | * //TODO(leif or yoni): (maybe rename and) return memory footprint instead of allocated size |
| 462 | */ |
| 463 | size_t memory_size(void); |
| 464 | |
| 465 | // Returns whether all values in the dmt are known to be the same size. |
| 466 | // Note: |
| 467 | // There are no false positives, but false negatives are allowed. |
| 468 | // A false negative can happen if this dmt had 2 (or more) different size values, |
| 469 | // and then enough were deleted so that all the remaining ones are the same size. |
| 470 | // Once that happens, this dmt will never again return true for this function unless/until |
| 471 | // ::clear() is called |
| 472 | bool value_length_is_fixed(void) const; |
| 473 | |
| 474 | |
| 475 | // If this dmt is empty, return value is undefined. |
| 476 | // else if value_length_is_fixed() then it returns the fixed length. |
| 477 | // else returns 0 |
| 478 | uint32_t get_fixed_length(void) const; |
| 479 | |
| 480 | // Preprocesses the dmt so that serialization can happen quickly. |
| 481 | // After this call, serialize_values() can be called but no other mutator function can be called in between. |
| 482 | void prepare_for_serialize(void); |
| 483 | |
| 484 | private: |
| 485 | // Do a bit of verification that subtree and nodes act like packed c structs and do not introduce unnecessary padding for alignment. |
| 486 | ENSURE_POD(subtree); |
| 487 | static_assert(ALIGNMENT > 0, "ALIGNMENT <= 0" ); |
| 488 | static_assert((ALIGNMENT & (ALIGNMENT - 1)) == 0, "ALIGNMENT not a power of 2" ); |
| 489 | static_assert(sizeof(dmt_node) - sizeof(dmtdata_t) == __builtin_offsetof(dmt_node, value), "value is not last field in node" ); |
| 490 | static_assert(4 * sizeof(uint32_t) == __builtin_offsetof(dmt_node, value), "dmt_node is padded" ); |
| 491 | static_assert(__builtin_offsetof(dmt_node, value) % ALIGNMENT == 0, "dmt_node requires padding for alignment" ); |
| 492 | ENSURE_POD(dmt_node); |
| 493 | |
| 494 | struct dmt_array { |
| 495 | uint32_t num_values; |
| 496 | }; |
| 497 | |
| 498 | struct dmt_tree { |
| 499 | subtree root; |
| 500 | }; |
| 501 | |
| 502 | /* |
| 503 | Relationship between values_same_size, d.a.num_values, value_length, is_array: |
| 504 | In an empty dmt: |
| 505 | is_array is true |
| 506 | value_same_size is true |
| 507 | value_length is undefined |
| 508 | d.a.num_values is 0 |
| 509 | In a non-empty array dmt: |
| 510 | is_array is true |
| 511 | values_same_size is true |
| 512 | value_length is defined |
| 513 | d.a.num_values > 0 |
| 514 | In a non-empty tree dmt: |
| 515 | is_array = false |
| 516 | value_same_size is true iff all values have been the same size since the last time the dmt turned into a tree. |
| 517 | value_length is defined iff values_same_size is true |
| 518 | d.a.num_values is undefined (the memory is used for the tree) |
| 519 | Note that in tree form, the dmt keeps track of if all values are the same size until the first time they are not. |
| 520 | 'values_same_size' will not become true again (even if we change all values to be the same size) |
| 521 | until/unless the dmt becomes empty, at which point it becomes an array again. |
| 522 | */ |
| 523 | bool values_same_size; |
| 524 | uint32_t value_length; // valid iff values_same_size is true. |
| 525 | struct mempool mp; |
| 526 | bool is_array; |
| 527 | union { |
| 528 | struct dmt_array a; |
| 529 | struct dmt_tree t; |
| 530 | } d; |
| 531 | |
| 532 | // Returns pad bytes per element (for alignment) or 0 if not fixed length. |
| 533 | uint32_t get_fixed_length_alignment_overhead(void) const; |
| 534 | |
| 535 | void verify_internal(const subtree &subtree, std::vector<bool> *touched) const; |
| 536 | |
| 537 | // Retrieves the node for a given subtree. |
| 538 | // Requires: !subtree.is_null() |
| 539 | dmt_node & get_node(const subtree &subtree) const; |
| 540 | |
| 541 | // Retrieves the node at a given offset in the mempool. |
| 542 | dmt_node & get_node(const node_offset offset) const; |
| 543 | |
| 544 | // Returns the weight of a subtree rooted at st. |
| 545 | // if st.is_null(), returns 0 |
| 546 | // Perf: O(1) |
| 547 | uint32_t nweight(const subtree &st) const; |
| 548 | |
| 549 | // Allocates space for a node (in the mempool) and uses the dmtwriter to write the value into the node |
| 550 | node_offset node_malloc_and_set_value(const dmtwriter_t &value); |
| 551 | |
| 552 | // Uses the dmtwriter to write a value into node n |
| 553 | void node_set_value(dmt_node *n, const dmtwriter_t &value); |
| 554 | |
| 555 | // (mempool-)free the memory for a node |
| 556 | void node_free(const subtree &st); |
| 557 | |
| 558 | // Effect: Resizes the mempool (holding the array) if necessary to hold one more item of length: this->value_length |
| 559 | // Requires: |
| 560 | // This dmt is in array form (and thus this->values_same_length) |
| 561 | void maybe_resize_array_for_insert(void); |
| 562 | |
| 563 | // Effect: Converts a dmt from array form to tree form. |
| 564 | // Perf: O(n) |
| 565 | // Note: This does not clear the 'this->values_same_size' bit |
| 566 | void convert_to_tree(void); |
| 567 | |
| 568 | // Effect: Resizes the mempool holding a tree if necessary. If value==nullptr then it may shrink if overallocated, |
| 569 | // otherwise resize only happens if there is not enough free space for an insert of value |
| 570 | void maybe_resize_tree(const dmtwriter_t * value); |
| 571 | |
| 572 | // Returns true if the tree rooted at st would need rebalance after adding |
| 573 | // leftmod to the left subtree and rightmod to the right subtree |
| 574 | bool will_need_rebalance(const subtree &st, const int leftmod, const int rightmod) const; |
| 575 | |
| 576 | __attribute__((nonnull)) |
| 577 | void insert_internal(subtree *const subtreep, const dmtwriter_t &value, const uint32_t idx, subtree **const rebalance_subtree); |
| 578 | |
| 579 | template<bool with_resize> |
| 580 | int insert_at_array_end(const dmtwriter_t& value_in); |
| 581 | |
| 582 | dmtdata_t * alloc_array_value_end(void); |
| 583 | |
| 584 | dmtdata_t * get_array_value(const uint32_t idx) const; |
| 585 | |
| 586 | dmtdata_t * get_array_value_internal(const struct mempool *mempool, const uint32_t idx) const; |
| 587 | |
| 588 | void convert_from_array_to_tree(void); |
| 589 | |
| 590 | void convert_from_tree_to_array(void); |
| 591 | |
| 592 | void delete_internal(subtree *const subtreep, const uint32_t idx, subtree *const subtree_replace, subtree **const rebalance_subtree); |
| 593 | |
| 594 | template<typename iterate_extra_t, |
| 595 | int (*f)(const uint32_t, const dmtdata_t &, const uint32_t, iterate_extra_t *const)> |
| 596 | int iterate_internal_array(const uint32_t left, const uint32_t right, |
| 597 | iterate_extra_t *const ) const; |
| 598 | |
| 599 | template<typename iterate_extra_t, |
| 600 | int (*f)(const uint32_t, dmtdata_t *, const uint32_t, iterate_extra_t *const)> |
| 601 | void iterate_ptr_internal(const uint32_t left, const uint32_t right, |
| 602 | const subtree &subtree, const uint32_t idx, |
| 603 | iterate_extra_t *const ); |
| 604 | |
| 605 | template<typename iterate_extra_t, |
| 606 | int (*f)(const uint32_t, dmtdata_t *, const uint32_t, iterate_extra_t *const)> |
| 607 | void iterate_ptr_internal_array(const uint32_t left, const uint32_t right, |
| 608 | iterate_extra_t *const ); |
| 609 | |
| 610 | template<typename iterate_extra_t, |
| 611 | int (*f)(const uint32_t, const dmtdata_t &, const uint32_t, iterate_extra_t *const)> |
| 612 | int iterate_internal(const uint32_t left, const uint32_t right, |
| 613 | const subtree &subtree, const uint32_t idx, |
| 614 | iterate_extra_t *const ) const; |
| 615 | |
| 616 | void fetch_internal_array(const uint32_t i, uint32_t *const value_len, dmtdataout_t *const value) const; |
| 617 | |
| 618 | void fetch_internal(const subtree &subtree, const uint32_t i, uint32_t *const value_len, dmtdataout_t *const value) const; |
| 619 | |
| 620 | __attribute__((nonnull)) |
| 621 | void fill_array_with_subtree_offsets(node_offset *const array, const subtree &subtree) const; |
| 622 | |
| 623 | __attribute__((nonnull)) |
| 624 | void rebuild_subtree_from_offsets(subtree *const subtree, const node_offset *const offsets, const uint32_t numvalues); |
| 625 | |
| 626 | __attribute__((nonnull)) |
| 627 | void rebalance(subtree *const subtree); |
| 628 | |
| 629 | static void copyout(uint32_t *const outlen, dmtdata_t *const out, const dmt_node *const n); |
| 630 | |
| 631 | static void copyout(uint32_t *const outlen, dmtdata_t **const out, dmt_node *const n); |
| 632 | |
| 633 | static void copyout(uint32_t *const outlen, dmtdata_t *const out, const uint32_t len, const dmtdata_t *const stored_value_ptr); |
| 634 | |
| 635 | static void copyout(uint32_t *const outlen, dmtdata_t **const out, const uint32_t len, dmtdata_t *const stored_value_ptr); |
| 636 | |
| 637 | template<typename dmtcmp_t, |
| 638 | int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)> |
| 639 | int find_internal_zero_array(const dmtcmp_t &, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const; |
| 640 | |
| 641 | template<typename dmtcmp_t, |
| 642 | int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)> |
| 643 | int find_internal_zero(const subtree &subtree, const dmtcmp_t &, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const; |
| 644 | |
| 645 | template<typename dmtcmp_t, |
| 646 | int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)> |
| 647 | int find_internal_plus_array(const dmtcmp_t &, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const; |
| 648 | |
| 649 | template<typename dmtcmp_t, |
| 650 | int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)> |
| 651 | int find_internal_plus(const subtree &subtree, const dmtcmp_t &, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const; |
| 652 | |
| 653 | template<typename dmtcmp_t, |
| 654 | int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)> |
| 655 | int find_internal_minus_array(const dmtcmp_t &, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const; |
| 656 | |
| 657 | template<typename dmtcmp_t, |
| 658 | int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)> |
| 659 | int find_internal_minus(const subtree &subtree, const dmtcmp_t &, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const; |
| 660 | |
| 661 | // Allocate memory for an array: node_offset[num_idx] from pre-allocated contiguous free space in the mempool. |
| 662 | // If there is not enough space, returns nullptr. |
| 663 | node_offset* alloc_temp_node_offsets(uint32_t num_idxs); |
| 664 | |
| 665 | // Returns the aligned size of x. |
| 666 | // If x % ALIGNMENT == 0, returns x |
| 667 | // o.w. returns x + (ALIGNMENT - (x % ALIGNMENT)) |
| 668 | uint32_t align(const uint32_t x) const; |
| 669 | }; |
| 670 | |
| 671 | } // namespace toku |
| 672 | |
| 673 | // include the implementation here |
| 674 | #include "dmt.cc" |
| 675 | |
| 676 | |