// Copyright (c) 2018 Kenton Varda and contributors
// Licensed under the MIT License:
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#include "table.h"
#include "debug.h"
#include <stdlib.h>

namespace kj {
namespace _ {

static inline uint lg(uint value) {
  // Compute floor(log2(value)).
  //
  // Undefined for value = 0.
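  //
  // For reference: lg(1) == 0, lg(2) == lg(3) == 1, lg(4) == 2, lg(1024) == 10.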
#if _MSC_VER
  unsigned long i;
  auto found = _BitScanReverse(&i, value);
  KJ_DASSERT(found);  // !found means value = 0
  return i;
#else
  return sizeof(uint) * 8 - 1 - __builtin_clz(value);
#endif
}

void throwDuplicateTableRow() {
  KJ_FAIL_REQUIRE("inserted row already exists in table");
}

void logHashTableInconsistency() {
  KJ_LOG(ERROR,
      "HashIndex detected hash table inconsistency. This can happen if you create a kj::Table "
      "with a hash index and you modify the rows in the table post-indexing in a way that would "
      "change their hash. This is a serious bug which will lead to undefined behavior."
      "\nstack: ", kj::getStackTrace());
}

// List of primes where each element is roughly double the previous. Obtained
// from: http://planetmath.org/goodhashtableprimes
// Primes < 53 were added to ensure that small tables don't allocate excessive memory.
static const size_t PRIMES[] = {
           1,  // 2^ 0 = 1
           3,  // 2^ 1 = 2
           5,  // 2^ 2 = 4
          11,  // 2^ 3 = 8
          23,  // 2^ 4 = 16
          53,  // 2^ 5 = 32
          97,  // 2^ 6 = 64
         193,  // 2^ 7 = 128
         389,  // 2^ 8 = 256
         769,  // 2^ 9 = 512
        1543,  // 2^10 = 1024
        3079,  // 2^11 = 2048
        6151,  // 2^12 = 4096
       12289,  // 2^13 = 8192
       24593,  // 2^14 = 16384
       49157,  // 2^15 = 32768
       98317,  // 2^16 = 65536
      196613,  // 2^17 = 131072
      393241,  // 2^18 = 262144
      786433,  // 2^19 = 524288
     1572869,  // 2^20 = 1048576
     3145739,  // 2^21 = 2097152
     6291469,  // 2^22 = 4194304
    12582917,  // 2^23 = 8388608
    25165843,  // 2^24 = 16777216
    50331653,  // 2^25 = 33554432
   100663319,  // 2^26 = 67108864
   201326611,  // 2^27 = 134217728
   402653189,  // 2^28 = 268435456
   805306457,  // 2^29 = 536870912
  1610612741,  // 2^30 = 1073741824
};

uint chooseBucket(uint hash, uint count) {
  // Integer modulus is really, really slow. It turns out that the compiler can generate much
  // faster code if the denominator is a constant. Since we have a fixed set of possible
  // denominators, a big old switch() statement is a win.

  // TODO(perf): Consider using power-of-two bucket sizes. We can safely do so as long as we demand
  // high-quality hash functions -- kj::hashCode() needs good diffusion even for integers, can't
  // just be a cast. Also be sure to implement Robin Hood hashing to avoid extremely bad negative
  // lookup time when elements have sequential hashes (otherwise, it could be necessary to scan
  // the entire list to determine that an element isn't present).
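
  // To illustrate the constant-denominator win (a sketch of one published technique, Lemire's
  // "fastmod", not necessarily what any given compiler emits): for 32-bit n and a constant
  // divisor d, n % d can be computed with multiplies instead of a division instruction:
  //
  //   uint64_t M = ~0ull / d + 1;  // precompute once per divisor
  //   uint32_t mod = (uint32_t)(((unsigned __int128)(M * n) * d) >> 64);
  //
  // (unsigned __int128 is a GCC/Clang extension; MSVC would need _umul128 instead.)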

  switch (count) {
#define HANDLE(i) case i##u: return hash % i##u
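    // For example, HANDLE(53) expands to `case 53u: return hash % 53u`, handing the compiler a
    // constant denominator in every case.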
    HANDLE(         1);
    HANDLE(         3);
    HANDLE(         5);
    HANDLE(        11);
    HANDLE(        23);
    HANDLE(        53);
    HANDLE(        97);
    HANDLE(       193);
    HANDLE(       389);
    HANDLE(       769);
    HANDLE(      1543);
    HANDLE(      3079);
    HANDLE(      6151);
    HANDLE(     12289);
    HANDLE(     24593);
    HANDLE(     49157);
    HANDLE(     98317);
    HANDLE(    196613);
    HANDLE(    393241);
    HANDLE(    786433);
    HANDLE(   1572869);
    HANDLE(   3145739);
    HANDLE(   6291469);
    HANDLE(  12582917);
    HANDLE(  25165843);
    HANDLE(  50331653);
    HANDLE( 100663319);
    HANDLE( 201326611);
    HANDLE( 402653189);
    HANDLE( 805306457);
    HANDLE(1610612741);
#undef HANDLE
    default: return hash % count;
  }
}

size_t chooseHashTableSize(uint size) {
  if (size == 0) return 0;

  // Add 1 to compensate for lg() rounding down, then look up the best prime bucket size for
  // that target size.
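  //
  // E.g. (worked example): chooseHashTableSize(40) computes lg(40) == 5 and returns
  // PRIMES[6] == 97, comfortably above the requested size.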
  return PRIMES[lg(size) + 1];
}

kj::Array<HashBucket> rehash(kj::ArrayPtr<const HashBucket> oldBuckets, size_t targetSize) {
  // Rehash the whole table.
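  //
  // Each occupied bucket from the old table is re-inserted into the new array by open
  // addressing: start at hash % size and step via probeHash() until an empty slot is found.
  // Buckets for which isOccupied() is false (empty slots, and any erased-marker buckets that
  // isEmpty() would also reject) are simply not carried over.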

  KJ_REQUIRE(targetSize < (1 << 30), "hash table has reached maximum size");

  size_t size = chooseHashTableSize(targetSize);

  if (size < oldBuckets.size()) {
    size = oldBuckets.size();
  }

  auto newBuckets = kj::heapArray<HashBucket>(size);
  memset(newBuckets.begin(), 0, sizeof(HashBucket) * size);

  for (auto& oldBucket: oldBuckets) {
    if (oldBucket.isOccupied()) {
      for (uint i = oldBucket.hash % newBuckets.size();; i = probeHash(newBuckets, i)) {
        auto& newBucket = newBuckets[i];
        if (newBucket.isEmpty()) {
          newBucket = oldBucket;
          break;
        }
      }
    }
  }

  return newBuckets;
}

// =======================================================================================
// BTree

#if _WIN32
#define aligned_free _aligned_free
#else
#define aligned_free ::free
#endif

BTreeImpl::BTreeImpl()
    : tree(const_cast<NodeUnion*>(&EMPTY_NODE)),
      treeCapacity(1),
      height(0),
      freelistHead(1),
      freelistSize(0),
      beginLeaf(0),
      endLeaf(0) {}
BTreeImpl::~BTreeImpl() noexcept(false) {
  if (tree != &EMPTY_NODE) {
    aligned_free(tree);
  }
}

const BTreeImpl::NodeUnion BTreeImpl::EMPTY_NODE = {{{0, {0}}}};

void BTreeImpl::verify(size_t size, FunctionParam<bool(uint, uint)> f) {
  KJ_ASSERT(verifyNode(size, f, 0, height, nullptr) == size);
}
size_t BTreeImpl::verifyNode(size_t size, FunctionParam<bool(uint, uint)>& f,
                             uint pos, uint height, MaybeUint maxRow) {
  if (height > 0) {
    auto& parent = tree[pos].parent;

    auto n = parent.keyCount();
    size_t total = 0;
    for (auto i: kj::zeroTo(n)) {
      KJ_ASSERT(*parent.keys[i] < size);
      total += verifyNode(size, f, parent.children[i], height - 1, parent.keys[i]);
      KJ_ASSERT(i + 1 == n || f(*parent.keys[i], *parent.keys[i + 1]));
    }
    total += verifyNode(size, f, parent.children[n], height - 1, maxRow);
    KJ_ASSERT(maxRow == nullptr || f(*parent.keys[n-1], *maxRow));
    return total;
  } else {
    auto& leaf = tree[pos].leaf;
    auto n = leaf.size();
    for (auto i: kj::zeroTo(n)) {
      KJ_ASSERT(*leaf.rows[i] < size);
      if (i + 1 < n) {
        KJ_ASSERT(f(*leaf.rows[i], *leaf.rows[i + 1]));
      } else {
        KJ_ASSERT(maxRow == nullptr || leaf.rows[n-1] == maxRow);
      }
    }
    return n;
  }
}

void BTreeImpl::logInconsistency() const {
  KJ_LOG(ERROR,
      "BTreeIndex detected tree state inconsistency. This can happen if you create a kj::Table "
      "with a b-tree index and you modify the rows in the table post-indexing in a way that would "
      "change their ordering. This is a serious bug which will lead to undefined behavior."
      "\nstack: ", kj::getStackTrace());
}

void BTreeImpl::reserve(size_t size) {
  KJ_REQUIRE(size < (1u << 31), "b-tree has reached maximum size");

  // Calculate the worst-case number of leaves to cover the size, given that a leaf is always at
  // least half-full. (Note that it's correct for this calculation to round down, not up: The
  // remainder will necessarily be distributed among the non-full leaves, rather than creating a
  // new leaf, because if it went into a new leaf, that leaf would be less than half-full.)
  uint leaves = size / (Leaf::NROWS / 2);

  // Calculate the worst-case number of parents to cover the leaves, given that a parent is always
  // at least half-full. Since the parents form a tree with branching factor B, the size of the
  // tree is N/B + N/B^2 + N/B^3 + N/B^4 + ... = N / (B - 1). Math.
  constexpr uint branchingFactor = Parent::NCHILDREN / 2;
  uint parents = leaves / (branchingFactor - 1);

  // Height is log-base-branching-factor of leaves, plus 1 for the root node.
  uint height = lg(leaves | 1) / lg(branchingFactor) + 1;
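
  // Worked example (illustrative only; supposes Leaf::NROWS == 14 and Parent::NCHILDREN == 8):
  // reserve(1000) gives leaves = 1000 / 7 = 142, branchingFactor = 4, parents = 142 / 3 = 47,
  // height = lg(142) / lg(4) + 1 = 7 / 2 + 1 = 4, so newSize below is 142 + 47 + 1 + 4 + 2 = 196.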

  size_t newSize = leaves +
      parents + 1 +  // + 1 for the root
      height + 2;    // minimum freelist size needed by insert()

  if (treeCapacity < newSize) {
    growTree(newSize);
  }
}

void BTreeImpl::clear() {
  if (tree != &EMPTY_NODE) {
    azero(tree, treeCapacity);
    height = 0;
    freelistHead = 1;
    freelistSize = treeCapacity;
    beginLeaf = 0;
    endLeaf = 0;
  }
}

void BTreeImpl::growTree(uint minCapacity) {
  uint newCapacity = kj::max(kj::max(minCapacity, treeCapacity * 2), 4);
  freelistSize += newCapacity - treeCapacity;

  // Allocate some aligned memory! In theory this should be as simple as calling the C11 standard
  // aligned_alloc() function. Unfortunately, many platforms don't implement it. Luckily, there
  // are usually alternatives.

#if __APPLE__ || __BIONIC__
  // OSX and Android lack aligned_alloc(), but have posix_memalign(). Fine.
  void* allocPtr;
  int error = posix_memalign(&allocPtr,
      sizeof(BTreeImpl::NodeUnion), newCapacity * sizeof(BTreeImpl::NodeUnion));
  if (error != 0) {
    KJ_FAIL_SYSCALL("posix_memalign", error);
  }
  NodeUnion* newTree = reinterpret_cast<NodeUnion*>(allocPtr);
#elif _WIN32
  // Windows lacks aligned_alloc() but has its own _aligned_malloc() (which requires freeing using
  // _aligned_free()).
  // WATCH OUT: The argument order for _aligned_malloc() is opposite of aligned_alloc()!
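  // For reference, the two signatures:
  //   void* aligned_alloc(size_t alignment, size_t size);    // C11
  //   void* _aligned_malloc(size_t size, size_t alignment);  // MSVC CRT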
  NodeUnion* newTree = reinterpret_cast<NodeUnion*>(
      _aligned_malloc(newCapacity * sizeof(BTreeImpl::NodeUnion), sizeof(BTreeImpl::NodeUnion)));
  KJ_ASSERT(newTree != nullptr, "memory allocation failed", newCapacity);
#else
  // Let's use the C11 standard.
  NodeUnion* newTree = reinterpret_cast<NodeUnion*>(
      aligned_alloc(sizeof(BTreeImpl::NodeUnion), newCapacity * sizeof(BTreeImpl::NodeUnion)));
  KJ_ASSERT(newTree != nullptr, "memory allocation failed", newCapacity);
#endif

  acopy(newTree, tree, treeCapacity);
  azero(newTree + treeCapacity, newCapacity - treeCapacity);
  if (tree != &EMPTY_NODE) aligned_free(tree);
  tree = newTree;
  treeCapacity = newCapacity;
}

BTreeImpl::Iterator BTreeImpl::search(const SearchKey& searchKey) const {
  // Find the "first" row number (in sorted order) for which searchKey.isAfter(rowNumber) returns
  // false.

  uint pos = 0;

  for (auto i KJ_UNUSED: zeroTo(height)) {
    auto& parent = tree[pos].parent;
    pos = parent.children[searchKey.search(parent)];
  }

  auto& leaf = tree[pos].leaf;
  return { tree, &leaf, searchKey.search(leaf) };
}

template <typename T>
struct BTreeImpl::AllocResult {
  uint index;
  T& node;
};

template <typename T>
inline BTreeImpl::AllocResult<T> BTreeImpl::alloc() {
  // Allocate a new item from the freelist. Guaranteed to be zero'd except for the first member.
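  //
  // Note that freshly zeroed nodes have freelist.nextOffset == 0, so a head at index i
  // implicitly chains to i + 1; explicit offsets only appear when free() returns nodes out of
  // order. E.g. right after growTree(), successive alloc() calls simply hand out consecutive
  // indices.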
  uint i = freelistHead;
  NodeUnion* ptr = &tree[i];
  freelistHead = i + 1 + ptr->freelist.nextOffset;
  --freelistSize;
  return { i, *ptr };
}

inline void BTreeImpl::free(uint pos) {
  // Add the given node to the freelist.

  // HACK: This is typically called on a node immediately after copying its contents away, but the
  // pointer used to copy it away may be a different pointer pointing to a different union member
  // which the compiler may not recognize as aliasing with this object. Just to be extra-safe,
  // insert a compiler barrier.
  compilerBarrier();

  auto& node = tree[pos];
  node.freelist.nextOffset = freelistHead - pos - 1;
  azero(node.freelist.zero, kj::size(node.freelist.zero));
  freelistHead = pos;
  ++freelistSize;
}

BTreeImpl::Iterator BTreeImpl::insert(const SearchKey& searchKey) {
  // Like search() but ensures that there is room in the leaf node to insert a new row.

  // If we split the root node it will generate two new nodes. If we split any other node in the
  // path it will generate one new node. `height` doesn't count leaf nodes, but we can equivalently
  // think of it as not counting the root node, so in the worst case we may allocate height + 2
  // new nodes.
  //
  // (Also note that if the tree is currently empty, then `tree` points to a dummy root node in
  // read-only memory. We definitely need to allocate a real tree node array in this case, and
  // we'll start out allocating space for four nodes, which will be all we need up to 28 rows.)
  if (freelistSize < height + 2) {
    if (height > 0 && !tree[0].parent.isFull() && freelistSize >= height) {
      // Slight optimization: The root node is not full, so we're definitely not going to split it.
      // That means that the maximum allocations we might do is equal to `height`, not
      // `height + 2`, and we have that much space, so no need to grow yet.
      //
      // This optimization is particularly important for small trees, e.g. when treeCapacity is 4
      // and the tree so far consists of a root and two children, we definitely don't need to grow
      // the tree yet.
    } else {
      growTree();

      if (freelistHead == 0) {
        // We have no root yet. Allocate one.
        KJ_ASSERT(alloc<Parent>().index == 0);
      }
    }
  }

  uint pos = 0;

  // Track grandparent node and child index within grandparent.
  Parent* parent = nullptr;
  uint indexInParent = 0;

  for (auto i KJ_UNUSED: zeroTo(height)) {
    Parent& node = insertHelper(searchKey, tree[pos].parent, parent, indexInParent, pos);

    parent = &node;
    indexInParent = searchKey.search(node);
    pos = node.children[indexInParent];
  }

  Leaf& leaf = insertHelper(searchKey, tree[pos].leaf, parent, indexInParent, pos);

  // Fun fact: Unlike erase(), there's no need to climb back up the tree modifying keys, because
  // either the newly-inserted node will not be the last in the leaf (and thus parent keys aren't
  // modified), or the leaf is the last leaf in the tree (and thus there's no parent key to
  // modify).

  return { tree, &leaf, searchKey.search(leaf) };
}

template <typename Node>
Node& BTreeImpl::insertHelper(const SearchKey& searchKey,
    Node& node, Parent* parent, uint indexInParent, uint pos) {
  if (node.isFull()) {
    // This node is full. Need to split.
    if (parent == nullptr) {
      // This is the root node. We need to split into two nodes and create a new root.
      auto n1 = alloc<Node>();
      auto n2 = alloc<Node>();

      uint pivot = split(n2.node, n2.index, node, pos);
      move(n1.node, n1.index, node);

      // Rewrite root to have the two children.
      tree[0].parent.initRoot(pivot, n1.index, n2.index);

      // Increased height.
      ++height;

      // Decide which new branch has our search key.
      if (searchKey.isAfter(pivot)) {
        // the right one
        return n2.node;
      } else {
        // the left one
        return n1.node;
      }
    } else {
      // This is a non-root parent node. We need to split it into two and insert the new node
      // into the grandparent.
      auto n = alloc<Node>();
      uint pivot = split(n.node, n.index, node, pos);

      // Insert new child into grandparent.
      parent->insertAfter(indexInParent, pivot, n.index);

      // Decide which new branch has our search key.
      if (searchKey.isAfter(pivot)) {
        // the new one, which is right of the original
        return n.node;
      } else {
        // the original one, which is left of the new one
        return node;
      }
    }
  } else {
    // No split needed.
    return node;
  }
}

void BTreeImpl::erase(uint row, const SearchKey& searchKey) {
  // Erase the given row number from the tree. searchKey must locate the given row, i.e.
  // searchKey.isAfter() returns false for this row and all rows after it.

  uint pos = 0;

  // Track grandparent node and child index within grandparent.
  Parent* parent = nullptr;
  uint indexInParent = 0;

  MaybeUint* fixup = nullptr;

  for (auto i KJ_UNUSED: zeroTo(height)) {
    Parent& node = eraseHelper(tree[pos].parent, parent, indexInParent, pos, fixup);

    parent = &node;
    indexInParent = searchKey.search(node);
    pos = node.children[indexInParent];

    if (indexInParent < kj::size(node.keys) && node.keys[indexInParent] == row) {
      // Oh look, the row is a key in this node! We'll need to come back and fix this up later.
      // Note that any particular row can only appear as *one* key value anywhere in the tree, so
      // we only need one fixup pointer, which is nice.
      MaybeUint* newFixup = &node.keys[indexInParent];
      if (fixup == newFixup) {
        // The fixup pointer was already set while processing a parent node, and then a merge or
        // rotate caused it to be moved, but the fixup pointer was updated... so it's already set
        // to point at the slot we wanted it to point to, so nothing to see here.
      } else {
        KJ_DASSERT(fixup == nullptr);
        fixup = newFixup;
      }
    }
  }

  Leaf& leaf = eraseHelper(tree[pos].leaf, parent, indexInParent, pos, fixup);

  uint r = searchKey.search(leaf);
  if (leaf.rows[r] == row) {
    leaf.erase(r);

    if (fixup != nullptr) {
      // There's a key in a parent node that needs fixup. This is only possible if the removed
      // node is the last in its leaf.
      KJ_DASSERT(leaf.rows[r] == nullptr);
      KJ_DASSERT(r > 0);  // non-root nodes must be at least half full so this can't be item 0
      KJ_DASSERT(*fixup == row);
      *fixup = leaf.rows[r - 1];
    }
  } else {
    logInconsistency();
  }
}

template <typename Node>
Node& BTreeImpl::eraseHelper(
    Node& node, Parent* parent, uint indexInParent, uint pos, MaybeUint*& fixup) {
  if (parent != nullptr && !node.isMostlyFull()) {
    // This is not the root, but it's only half-full. Rebalance.
    KJ_DASSERT(node.isHalfFull());

    if (indexInParent > 0) {
      // There's a sibling to the left.
      uint sibPos = parent->children[indexInParent - 1];
      Node& sib = tree[sibPos];
      if (sib.isMostlyFull()) {
        // Left sibling is more than half full. Steal one member.
        rotateRight(sib, node, *parent, indexInParent - 1);
        return node;
      } else {
        // Left sibling is half full, too. Merge.
        KJ_ASSERT(sib.isHalfFull());
        merge(sib, sibPos, *parent->keys[indexInParent - 1], node);
        parent->eraseAfter(indexInParent - 1);
        free(pos);
        if (fixup == &parent->keys[indexInParent]) --fixup;

        if (parent->keys[0] == nullptr) {
          // Oh hah, the parent has no keys left. It must be the root. We can eliminate it.
          KJ_DASSERT(parent == &tree->parent);
          compilerBarrier();  // don't reorder any writes to parent below here
          move(tree[0], 0, sib);
          free(sibPos);
          --height;
          return tree[0];
        } else {
          return sib;
        }
      }
    } else if (indexInParent < Parent::NKEYS && parent->keys[indexInParent] != nullptr) {
      // There's a sibling to the right.
      uint sibPos = parent->children[indexInParent + 1];
      Node& sib = tree[sibPos];
      if (sib.isMostlyFull()) {
        // Right sibling is more than half full. Steal one member.
        rotateLeft(node, sib, *parent, indexInParent, fixup);
        return node;
      } else {
        // Right sibling is half full, too. Merge.
        KJ_ASSERT(sib.isHalfFull());
        merge(node, pos, *parent->keys[indexInParent], sib);
        parent->eraseAfter(indexInParent);
        free(sibPos);
        if (fixup == &parent->keys[indexInParent]) fixup = nullptr;

        if (parent->keys[0] == nullptr) {
          // Oh hah, the parent has no keys left. It must be the root. We can eliminate it.
          KJ_DASSERT(parent == &tree->parent);
          compilerBarrier();  // don't reorder any writes to parent below here
          move(tree[0], 0, node);
          free(pos);
          --height;
          return tree[0];
        } else {
          return node;
        }
      }
    } else {
      KJ_FAIL_ASSERT("inconsistent b-tree");
    }
  }

  return node;
}

void BTreeImpl::renumber(uint oldRow, uint newRow, const SearchKey& searchKey) {
  // Renumber the given row from oldRow to newRow. searchKey must locate oldRow, i.e.
  // searchKey.isAfter() returns false for oldRow and all rows after it. (It is never evaluated
  // against newRow.)

  uint pos = 0;

  for (auto i KJ_UNUSED: zeroTo(height)) {
    auto& node = tree[pos].parent;
    uint indexInParent = searchKey.search(node);
    pos = node.children[indexInParent];
    if (node.keys[indexInParent] == oldRow) {
      node.keys[indexInParent] = newRow;
    }
    KJ_DASSERT(pos != 0);
  }

  auto& leaf = tree[pos].leaf;
  uint r = searchKey.search(leaf);
  if (leaf.rows[r] == oldRow) {
    leaf.rows[r] = newRow;
  } else {
    logInconsistency();
  }
}

uint BTreeImpl::split(Parent& dst, uint dstPos, Parent& src, uint srcPos) {
  constexpr size_t mid = Parent::NKEYS / 2;
  uint pivot = *src.keys[mid];
  acopy(dst.keys, src.keys + mid + 1, Parent::NKEYS - mid - 1);
  azero(src.keys + mid, Parent::NKEYS - mid);
  acopy(dst.children, src.children + mid + 1, Parent::NCHILDREN - mid - 1);
  azero(src.children + mid + 1, Parent::NCHILDREN - mid - 1);
  return pivot;
}
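
// Note the asymmetry between the parent split above and the leaf split below: a parent's pivot
// key moves up into the new grandparent entry and is dropped from both halves, whereas a leaf's
// pivot row must stay behind as the last row of `src`, because every row has to remain present
// in exactly one leaf.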

uint BTreeImpl::split(Leaf& dst, uint dstPos, Leaf& src, uint srcPos) {
  constexpr size_t mid = Leaf::NROWS / 2;
  uint pivot = *src.rows[mid - 1];
  acopy(dst.rows, src.rows + mid, Leaf::NROWS - mid);
  azero(src.rows + mid, Leaf::NROWS - mid);

  if (src.next == 0) {
    endLeaf = dstPos;
  } else {
    tree[src.next].leaf.prev = dstPos;
  }
  dst.next = src.next;
  dst.prev = srcPos;
  src.next = dstPos;

  return pivot;
}

void BTreeImpl::merge(Parent& dst, uint dstPos, uint pivot, Parent& src) {
  // merge() is only legal if both nodes are half-empty. Meanwhile, B-tree invariants
  // guarantee that the node can't be more than half-empty, or we would have merged it sooner.
  // (The root can be more than half-empty, but it is never merged with anything.)
  KJ_DASSERT(src.isHalfFull());
  KJ_DASSERT(dst.isHalfFull());

  constexpr size_t mid = Parent::NKEYS/2;
  dst.keys[mid] = pivot;
  acopy(dst.keys + mid + 1, src.keys, mid);
  acopy(dst.children + mid + 1, src.children, mid + 1);
}

void BTreeImpl::merge(Leaf& dst, uint dstPos, uint pivot, Leaf& src) {
  // merge() is only legal if both nodes are half-empty. Meanwhile, B-tree invariants
  // guarantee that the node can't be more than half-empty, or we would have merged it sooner.
  // (The root can be more than half-empty, but it is never merged with anything.)
  KJ_DASSERT(src.isHalfFull());
  KJ_DASSERT(dst.isHalfFull());

  constexpr size_t mid = Leaf::NROWS/2;
  dst.rows[mid] = pivot;
  acopy(dst.rows + mid, src.rows, mid);

  dst.next = src.next;
  if (dst.next == 0) {
    endLeaf = dstPos;
  } else {
    tree[dst.next].leaf.prev = dstPos;
  }
}

void BTreeImpl::move(Parent& dst, uint dstPos, Parent& src) {
  dst = src;
}

void BTreeImpl::move(Leaf& dst, uint dstPos, Leaf& src) {
  dst = src;
  if (src.next == 0) {
    endLeaf = dstPos;
  } else {
    tree[src.next].leaf.prev = dstPos;
  }
  if (src.prev == 0) {
    beginLeaf = dstPos;
  } else {
    tree[src.prev].leaf.next = dstPos;
  }
}

void BTreeImpl::rotateLeft(
    Parent& left, Parent& right, Parent& parent, uint indexInParent, MaybeUint*& fixup) {
  // Steal one item from the right node and move it to the left node.

  // Like merge(), this is only called on an exactly-half-empty node.
  KJ_DASSERT(left.isHalfFull());
  KJ_DASSERT(right.isMostlyFull());

  constexpr size_t mid = Parent::NKEYS/2;
  left.keys[mid] = parent.keys[indexInParent];
  if (fixup == &parent.keys[indexInParent]) fixup = &left.keys[mid];
  parent.keys[indexInParent] = right.keys[0];
  left.children[mid + 1] = right.children[0];
  amove(right.keys, right.keys + 1, Parent::NKEYS - 1);
  right.keys[Parent::NKEYS - 1] = nullptr;
  amove(right.children, right.children + 1, Parent::NCHILDREN - 1);
  right.children[Parent::NCHILDREN - 1] = 0;
}

void BTreeImpl::rotateLeft(
    Leaf& left, Leaf& right, Parent& parent, uint indexInParent, MaybeUint*& fixup) {
  // Steal one item from the right node and move it to the left node.

  // Like merge(), this is only called on an exactly-half-empty node.
  KJ_DASSERT(left.isHalfFull());
  KJ_DASSERT(right.isMostlyFull());

  constexpr size_t mid = Leaf::NROWS/2;
  parent.keys[indexInParent] = left.rows[mid] = right.rows[0];
  if (fixup == &parent.keys[indexInParent]) fixup = nullptr;
  amove(right.rows, right.rows + 1, Leaf::NROWS - 1);
  right.rows[Leaf::NROWS - 1] = nullptr;
}

void BTreeImpl::rotateRight(Parent& left, Parent& right, Parent& parent, uint indexInParent) {
  // Steal one item from the left node and move it to the right node.

  // Like merge(), this is only called on an exactly-half-empty node.
  KJ_DASSERT(right.isHalfFull());
  KJ_DASSERT(left.isMostlyFull());

  constexpr size_t mid = Parent::NKEYS/2;
  amove(right.keys + 1, right.keys, mid);
  amove(right.children + 1, right.children, mid + 1);

  uint back = left.keyCount() - 1;

  right.keys[0] = parent.keys[indexInParent];
  parent.keys[indexInParent] = left.keys[back];
  right.children[0] = left.children[back + 1];
  left.keys[back] = nullptr;
  left.children[back + 1] = 0;
}

void BTreeImpl::rotateRight(Leaf& left, Leaf& right, Parent& parent, uint indexInParent) {
  // Steal one item from the left node and move it to the right node.

  // Like merge(), this is only called on an exactly-half-empty node.
  KJ_DASSERT(right.isHalfFull());
  KJ_DASSERT(left.isMostlyFull());

  constexpr size_t mid = Leaf::NROWS/2;
  amove(right.rows + 1, right.rows, mid);

  uint back = left.size() - 1;

  right.rows[0] = left.rows[back];
  parent.keys[indexInParent] = left.rows[back - 1];
  left.rows[back] = nullptr;
}

void BTreeImpl::Parent::initRoot(uint key, uint leftChild, uint rightChild) {
  // HACK: This is typically called on the root node immediately after copying its contents away,
  // but the pointer used to copy it away may be a different pointer pointing to a different
  // union member which the compiler may not recognize as aliasing with this object. Just to
  // be extra-safe, insert a compiler barrier.
  compilerBarrier();

  keys[0] = key;
  children[0] = leftChild;
  children[1] = rightChild;
  azero(keys + 1, Parent::NKEYS - 1);
  azero(children + 2, Parent::NCHILDREN - 2);
}

void BTreeImpl::Parent::insertAfter(uint i, uint splitKey, uint child) {
  KJ_IREQUIRE(children[Parent::NCHILDREN - 1] == 0);  // check not full

  amove(keys + i + 1, keys + i, Parent::NKEYS - (i + 1));
  keys[i] = splitKey;

  amove(children + i + 2, children + i + 1, Parent::NCHILDREN - (i + 2));
  children[i + 1] = child;
}

void BTreeImpl::Parent::eraseAfter(uint i) {
  amove(keys + i, keys + i + 1, Parent::NKEYS - (i + 1));
  keys[Parent::NKEYS - 1] = nullptr;
  amove(children + i + 1, children + i + 2, Parent::NCHILDREN - (i + 2));
  children[Parent::NCHILDREN - 1] = 0;
}

}  // namespace _

// =======================================================================================
// Insertion order

const InsertionOrderIndex::Link InsertionOrderIndex::EMPTY_LINK = { 0, 0 };

InsertionOrderIndex::InsertionOrderIndex(): capacity(0), links(const_cast<Link*>(&EMPTY_LINK)) {}
InsertionOrderIndex::~InsertionOrderIndex() noexcept(false) {
  if (links != &EMPTY_LINK) delete[] links;
}

void InsertionOrderIndex::reserve(size_t size) {
  KJ_ASSERT(size < (1u << 31), "Table too big for InsertionOrderIndex");

  if (size > capacity) {
    // Need to grow.
    // Note that `size` and `capacity` do not include the special link[0].

    // Round up to the next power of 2.
    size_t allocation = 1u << (_::lg(size) + 1);
    KJ_DASSERT(allocation > size);
    KJ_DASSERT(allocation <= size * 2);

    // Round first allocation up to 8.
    allocation = kj::max(allocation, 8);
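
    // E.g. reserve(5): lg(5) == 2, so allocation == 8, and capacity below becomes 7 (the extra
    // slot holds the sentinel link[0]).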

    Link* newLinks = new Link[allocation];
#ifdef KJ_DEBUG
    // To catch bugs, fill unused links with 0xff.
    memset(newLinks, 0xff, allocation * sizeof(Link));
#endif
    _::acopy(newLinks, links, capacity + 1);
    if (links != &EMPTY_LINK) delete[] links;
    links = newLinks;
    capacity = allocation - 1;
  }
}

void InsertionOrderIndex::clear() {
  links[0] = Link { 0, 0 };

#ifdef KJ_DEBUG
  // To catch bugs, fill unused links with 0xff.
  memset(links + 1, 0xff, capacity * sizeof(Link));
#endif
}

kj::Maybe<size_t> InsertionOrderIndex::insertImpl(size_t pos) {
  if (pos >= capacity) {
    reserve(pos + 1);
  }

  links[pos + 1].prev = links[0].prev;
  links[pos + 1].next = 0;
  links[links[0].prev].next = pos + 1;
  links[0].prev = pos + 1;
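
  // To picture the result (illustrative): starting from an empty index, insertImpl(0) followed
  // by insertImpl(1) leaves the sentinel links[0] as { next: 1, prev: 2 }, links[1] as
  // { next: 2, prev: 0 }, and links[2] as { next: 0, prev: 1 }: a circular doubly-linked list
  // threaded through the array in insertion order.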

  return nullptr;
}

void InsertionOrderIndex::eraseImpl(size_t pos) {
  Link& link = links[pos + 1];
  links[link.next].prev = link.prev;
  links[link.prev].next = link.next;

#ifdef KJ_DEBUG
  memset(&link, 0xff, sizeof(Link));
#endif
}

void InsertionOrderIndex::moveImpl(size_t oldPos, size_t newPos) {
  Link& link = links[oldPos + 1];
  Link& newLink = links[newPos + 1];

  newLink = link;

  KJ_DASSERT(links[link.next].prev == oldPos + 1);
  KJ_DASSERT(links[link.prev].next == oldPos + 1);
  links[link.next].prev = newPos + 1;
  links[link.prev].next = newPos + 1;

#ifdef KJ_DEBUG
  memset(&link, 0xff, sizeof(Link));
#endif
}

}  // namespace kj