1 | /* |
2 | * index.h |
3 | * |
4 | * Copyright (C) 2008-2016 Aerospike, Inc. |
5 | * |
6 | * Portions may be licensed to Aerospike, Inc. under one or more contributor |
7 | * license agreements. |
8 | * |
9 | * This program is free software: you can redistribute it and/or modify it under |
10 | * the terms of the GNU Affero General Public License as published by the Free |
11 | * Software Foundation, either version 3 of the License, or (at your option) any |
12 | * later version. |
13 | * |
14 | * This program is distributed in the hope that it will be useful, but WITHOUT |
15 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
16 | * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
17 | * details. |
18 | * |
19 | * You should have received a copy of the GNU Affero General Public License |
20 | * along with this program. If not, see http://www.gnu.org/licenses/ |
21 | */ |
22 | |
23 | #pragma once |
24 | |
25 | #include <stdbool.h> |
26 | #include <stddef.h> |
27 | #include <stdint.h> |
28 | |
29 | #include "aerospike/as_atomic.h" |
30 | #include "citrusleaf/cf_atomic.h" |
31 | #include "citrusleaf/cf_digest.h" |
32 | |
33 | #include "arenax.h" |
34 | #include "cf_mutex.h" |
35 | |
36 | #include "base/datamodel.h" |
37 | |
38 | |
39 | //========================================================== |
40 | // Index tree node - as_index, also known as as_record. |
41 | // |
42 | // There's one for every record. Contains metadata, and |
43 | // points to record data in memory and/or on storage device. |
44 | // |
45 | |
46 | typedef struct as_index_s { |
47 | |
48 | // offset: 0 |
49 | uint16_t rc; // for now, incremented & decremented only when reducing sprig |
50 | |
51 | // offset: 2 |
52 | uint8_t : 8; // reserved for bigger rc, if needed |
53 | |
54 | // offset: 3 |
55 | uint8_t tree_id: 6; |
56 | uint8_t color: 1; |
57 | uint8_t : 1; |
58 | |
59 | // offset: 4 |
60 | cf_digest keyd; |
61 | |
62 | // offset: 24 |
63 | uint64_t left_h: 40; |
64 | uint64_t right_h: 40; |
65 | |
66 | // offset: 34 |
67 | uint16_t set_id_bits: 10; |
68 | uint16_t : 6; |
69 | |
70 | // offset: 36 |
71 | uint32_t : 2; |
72 | uint32_t void_time: 30; |
73 | |
74 | // offset: 40 |
75 | uint64_t last_update_time: 40; |
76 | uint64_t generation: 16; |
77 | |
78 | // offset: 47 |
79 | // Used by the storage engines. |
80 | uint64_t rblock_id: 37; // can address 2^37 * 16b = 2Tb drive |
81 | uint64_t n_rblocks: 19; // is enough for 8Mb/16b = 512K rblocks |
82 | uint64_t file_id: 7; // can spec 2^7 = 128 drives |
83 | uint64_t key_stored: 1; |
84 | |
85 | // offset: 55 |
86 | // In single-bin mode for data-in-memory namespaces, this offset is cast to |
87 | // an as_bin, but only 4 bits get used (for the iparticle state). The other |
88 | // 4 bits are used for replication state and index flags. |
89 | uint8_t repl_state: 2; |
90 | uint8_t tombstone: 1; |
91 | uint8_t cenotaph: 1; |
92 | uint8_t single_bin_state: 4; // used indirectly, only in single-bin mode |
93 | |
94 | // offset: 56 |
95 | // For data-not-in-memory namespaces, these 8 bytes are currently unused. |
96 | // For data-in-memory namespaces: in single-bin mode the as_bin is embedded |
97 | // here (these 8 bytes plus 4 bits in flex_bits above), but in multi-bin |
98 | // mode this is a pointer to either of: |
99 | // - an as_bin_space containing n_bins and an array of as_bin structs |
100 | // - an as_rec_space containing an as_bin_space pointer and other metadata |
101 | void* dim; |
102 | |
103 | // final size: 64 |
104 | |
105 | } __attribute__ ((__packed__)) as_index; |
106 | |
107 | COMPILER_ASSERT(sizeof(as_index) == 64); |
108 | |
109 | #define AS_INDEX_SINGLE_BIN_OFFSET 55 // can't use offsetof() with bit fields |
110 | |
111 | |
112 | //========================================================== |
113 | // Accessor functions for bits in as_index. |
114 | // |
115 | |
116 | #define as_index_reserve(_r) ({ \ |
117 | uint16_t rc = as_aaf_uint16(&(_r->rc), 1); \ |
118 | cf_assert(rc != 0, AS_INDEX, "ref-count overflow"); \ |
119 | rc; \ |
120 | }) |
121 | |
122 | #define as_index_release(_r) ({ \ |
123 | uint16_t rc = as_aaf_uint16(&(_r->rc), -1); \ |
124 | cf_assert(rc != (uint16_t)-1, AS_INDEX, "ref-count underflow"); \ |
125 | rc; \ |
126 | }) |
127 | |
128 | #define TREE_ID_NUM_BITS 6 |
129 | #define MAX_NUM_TREE_IDS (1 << TREE_ID_NUM_BITS) // 64 |
130 | #define TREE_ID_MASK (MAX_NUM_TREE_IDS - 1) // 0x3F |
131 | |
132 | COMPILER_ASSERT(MAX_NUM_TREE_IDS <= 64); // must fit in 64-bit map |
133 | |
134 | // Fast way to clear the record portion of as_index. |
135 | // Note - relies on current layout and size of as_index! |
136 | // FIXME - won't be able to "rescue" with future sindex method - will go away. |
137 | static inline |
138 | void as_index_clear_record_info(as_index *index) |
139 | { |
140 | index->set_id_bits = 0; |
141 | |
142 | *(uint32_t*)((uint8_t*)index + 36) = 0; |
143 | |
144 | uint64_t *p_clear = (uint64_t*)((uint8_t*)index + 40); |
145 | |
146 | *p_clear++ = 0; |
147 | *p_clear++ = 0; |
148 | *p_clear = 0; |
149 | } |
150 | |
151 | // Generation 0 is never written, and generation plays no role in record |
152 | // destruction, so it works to flag deleted records. |
153 | static inline |
154 | void as_index_invalidate_record(as_index *index) { |
155 | index->generation = 0; |
156 | } |
157 | |
158 | static inline |
159 | bool as_index_is_valid_record(as_index *index) { |
160 | return index->generation != 0; |
161 | } |
162 | |
163 | |
164 | //------------------------------------------------ |
165 | // Single bin, as_bin_space & as_rec_space. |
166 | // |
167 | |
168 | static inline |
169 | as_bin *as_index_get_single_bin(const as_index *index) { |
170 | // We only use 4 bits of the first byte for the bin state. |
171 | return (as_bin*)((uint8_t *)index + AS_INDEX_SINGLE_BIN_OFFSET); |
172 | } |
173 | |
174 | static inline |
175 | as_bin_space* as_index_get_bin_space(const as_index *index) { |
176 | return index->key_stored == 1 ? |
177 | ((as_rec_space*)index->dim)->bin_space : (as_bin_space*)index->dim; |
178 | } |
179 | |
180 | static inline |
181 | void as_index_set_bin_space(as_index* index, as_bin_space* bin_space) { |
182 | if (index->key_stored == 1) { |
183 | ((as_rec_space*)index->dim)->bin_space = bin_space; |
184 | } |
185 | else { |
186 | index->dim = (void*)bin_space; |
187 | } |
188 | } |
189 | |
190 | |
191 | //------------------------------------------------ |
192 | // Set-ID bits. |
193 | // |
194 | |
195 | static inline |
196 | uint16_t as_index_get_set_id(const as_index *index) { |
197 | return index->set_id_bits; |
198 | } |
199 | |
200 | static inline |
201 | void as_index_set_set_id(as_index *index, uint16_t set_id) { |
202 | // TODO - check that it fits in the 10 bits ??? |
203 | index->set_id_bits = set_id; |
204 | } |
205 | |
206 | |
207 | //------------------------------------------------ |
208 | // Set-ID helpers. |
209 | // |
210 | |
211 | static inline |
212 | int as_index_set_set_w_len(as_index *index, as_namespace *ns, |
213 | const char *set_name, size_t len, bool apply_restrictions) { |
214 | uint16_t set_id; |
215 | int rv = as_namespace_set_set_w_len(ns, set_name, len, &set_id, |
216 | apply_restrictions); |
217 | |
218 | if (rv != 0) { |
219 | return rv; |
220 | } |
221 | |
222 | as_index_set_set_id(index, set_id); |
223 | return 0; |
224 | } |
225 | |
226 | static inline |
227 | const char *as_index_get_set_name(const as_index *index, as_namespace *ns) { |
228 | return as_namespace_get_set_name(ns, as_index_get_set_id(index)); |
229 | } |
230 | |
231 | |
232 | //========================================================== |
233 | // Handling as_index objects. |
234 | // |
235 | |
236 | // Container for as_index pointer with lock and location. |
237 | struct as_index_ref_s { |
238 | as_index *r; |
239 | cf_arenax_handle r_h; |
240 | cf_arenax_puddle *puddle; |
241 | cf_mutex *olock; |
242 | }; |
243 | |
244 | |
245 | //========================================================== |
246 | // Index tree. |
247 | // |
248 | |
249 | typedef void (*as_index_tree_done_fn) (uint8_t id, void *udata); |
250 | |
251 | typedef struct as_index_tree_s { |
252 | uint8_t id; |
253 | as_index_tree_done_fn done_cb; |
254 | void *udata; |
255 | |
256 | // Data common to all trees in a namespace. |
257 | as_index_tree_shared *shared; |
258 | |
259 | cf_atomic64 n_elements; |
260 | |
261 | // Variable length data, dependent on configuration. |
262 | uint8_t data[]; |
263 | } as_index_tree; |
264 | |
265 | |
266 | //========================================================== |
267 | // as_index_tree variable length data components. |
268 | // |
269 | |
270 | #define NUM_LOCK_PAIRS 256 // per partition |
271 | |
272 | typedef struct as_lock_pair_s { |
273 | // Note: reduce_lock's scope is always inside of lock's scope. |
274 | cf_mutex lock; // insert, delete vs. insert, delete, get |
275 | cf_mutex reduce_lock; // insert, delete vs. reduce |
276 | } as_lock_pair; |
277 | |
278 | #define NUM_SPRIG_BITS 28 // 3.5 bytes - yes, that's a lot of sprigs |
279 | |
280 | typedef struct as_sprig_s { |
281 | uint64_t n_elements: 24; // max 16M records per sprig |
282 | uint64_t root_h: 40; |
283 | } as_sprig; |
284 | |
285 | static inline as_lock_pair * |
286 | tree_locks(as_index_tree *tree) |
287 | { |
288 | return (as_lock_pair*)tree->data; |
289 | } |
290 | |
291 | static inline as_sprig * |
292 | tree_sprigs(as_index_tree *tree) |
293 | { |
294 | return (as_sprig*)(tree->data + tree->shared->sprigs_offset); |
295 | } |
296 | |
297 | static inline cf_arenax_puddle * |
298 | tree_puddle_for_sprig(as_index_tree *tree, int sprig_i) |
299 | { |
300 | uint32_t puddles_offset = tree->shared->puddles_offset; |
301 | return puddles_offset == 0 ? |
302 | NULL : (cf_arenax_puddle*)(tree->data + puddles_offset) + sprig_i; |
303 | } |
304 | |
305 | static inline cf_arenax_puddle * |
306 | tree_puddles(as_index_tree *tree) |
307 | { |
308 | return tree_puddle_for_sprig(tree, 0); |
309 | } |
310 | |
311 | static inline size_t |
312 | tree_puddles_size(as_index_tree_shared *shared) |
313 | { |
314 | return shared->puddles_offset == 0 ? |
315 | 0 : sizeof(cf_arenax_puddle) * shared->n_sprigs; |
316 | } |
317 | |
318 | static inline uint32_t |
319 | tree_puddles_count(as_index_tree_shared *shared) |
320 | { |
321 | return shared->puddles_offset == 0 ? 0 : shared->n_sprigs; |
322 | } |
323 | |
324 | |
325 | //------------------------------------------------ |
326 | // as_index_tree public API. |
327 | // |
328 | |
329 | void as_index_tree_gc_init(); |
330 | int as_index_tree_gc_queue_size(); |
331 | |
332 | as_index_tree *as_index_tree_create(as_index_tree_shared *shared, uint8_t id, as_index_tree_done_fn cb, void *udata); |
333 | as_index_tree *as_index_tree_resume(as_index_tree_shared *shared, as_treex* xmem_trees, uint32_t pid, as_index_tree_done_fn cb, void *udata); |
334 | void as_index_tree_block(as_index_tree *tree); |
335 | void as_index_tree_reserve(as_index_tree *tree); |
336 | int as_index_tree_release(as_index_tree *tree); |
337 | uint64_t as_index_tree_size(as_index_tree *tree); |
338 | |
339 | typedef void (*as_index_reduce_fn) (as_index_ref *value, void *udata); |
340 | |
341 | void as_index_reduce(as_index_tree *tree, as_index_reduce_fn cb, void *udata); |
342 | void as_index_reduce_partial(as_index_tree *tree, uint64_t sample_count, as_index_reduce_fn cb, void *udata); |
343 | |
344 | void as_index_reduce_live(as_index_tree *tree, as_index_reduce_fn cb, void *udata); |
345 | void as_index_reduce_partial_live(as_index_tree *tree, uint64_t sample_count, as_index_reduce_fn cb, void *udata); |
346 | |
347 | int as_index_try_exists(as_index_tree *tree, const cf_digest *keyd); |
348 | int as_index_try_get_vlock(as_index_tree *tree, const cf_digest *keyd, as_index_ref *index_ref); |
349 | int as_index_get_vlock(as_index_tree *tree, const cf_digest *keyd, as_index_ref *index_ref); |
350 | int as_index_get_insert_vlock(as_index_tree *tree, const cf_digest *keyd, as_index_ref *index_ref); |
351 | int as_index_delete(as_index_tree *tree, const cf_digest *keyd); |
352 | |
353 | |
354 | //------------------------------------------------ |
355 | // Private API - for enterprise separation only. |
356 | // |
357 | |
358 | // Container for sprig-level function parameters. |
359 | typedef struct as_index_sprig_s { |
360 | as_index_value_destructor destructor; |
361 | void *destructor_udata; |
362 | |
363 | cf_arenax *arena; |
364 | |
365 | as_lock_pair *pair; |
366 | as_sprig *sprig; |
367 | cf_arenax_puddle *puddle; |
368 | } as_index_sprig; |
369 | |
370 | uint64_t as_index_sprig_keyd_reduce_partial(as_index_sprig *isprig, uint64_t sample_count, as_index_reduce_fn cb, void *udata); |
371 | |
372 | int as_index_sprig_get_vlock(as_index_sprig *isprig, const cf_digest *keyd, as_index_ref *index_ref); |
373 | int as_index_sprig_delete(as_index_sprig *isprig, const cf_digest *keyd); |
374 | |
375 | static inline void |
376 | as_index_sprig_from_keyd(as_index_tree *tree, as_index_sprig *isprig, |
377 | const cf_digest *keyd) |
378 | { |
379 | // Get the 28 most significant non-pid bits in the digest. Note - this is |
380 | // hardwired around the way we currently extract the (12 bit) partition-ID |
381 | // from the digest. |
382 | uint32_t bits = (((uint32_t)keyd->digest[1] & 0xF0) << 20) | |
383 | ((uint32_t)keyd->digest[2] << 16) | |
384 | ((uint32_t)keyd->digest[3] << 8) | |
385 | (uint32_t)keyd->digest[4]; |
386 | |
387 | uint32_t lock_i = bits >> tree->shared->locks_shift; |
388 | uint32_t sprig_i = bits >> tree->shared->sprigs_shift; |
389 | |
390 | isprig->destructor = tree->shared->destructor; |
391 | isprig->destructor_udata = tree->shared->destructor_udata; |
392 | isprig->arena = tree->shared->arena; |
393 | isprig->pair = tree_locks(tree) + lock_i; |
394 | isprig->sprig = tree_sprigs(tree) + sprig_i; |
395 | isprig->puddle = tree_puddle_for_sprig(tree, sprig_i); |
396 | } |
397 | |
398 | #define SENTINEL_H 0 |
399 | |
400 | #define RESOLVE_H(__h) ((as_index*)cf_arenax_resolve(isprig->arena, __h)) |
401 | |
402 | // Flag to indicate full index reduce. |
403 | #define AS_REDUCE_ALL (-1L) |
404 | |