1/*
2 * index.h
3 *
4 * Copyright (C) 2008-2016 Aerospike, Inc.
5 *
6 * Portions may be licensed to Aerospike, Inc. under one or more contributor
7 * license agreements.
8 *
9 * This program is free software: you can redistribute it and/or modify it under
10 * the terms of the GNU Affero General Public License as published by the Free
11 * Software Foundation, either version 3 of the License, or (at your option) any
12 * later version.
13 *
14 * This program is distributed in the hope that it will be useful, but WITHOUT
15 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
16 * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
17 * details.
18 *
19 * You should have received a copy of the GNU Affero General Public License
20 * along with this program. If not, see http://www.gnu.org/licenses/
21 */
22
23#pragma once
24
25#include <stdbool.h>
26#include <stddef.h>
27#include <stdint.h>
28
29#include "aerospike/as_atomic.h"
30#include "citrusleaf/cf_atomic.h"
31#include "citrusleaf/cf_digest.h"
32
33#include "arenax.h"
34#include "cf_mutex.h"
35
36#include "base/datamodel.h"
37
38
39//==========================================================
40// Index tree node - as_index, also known as as_record.
41//
42// There's one for every record. Contains metadata, and
43// points to record data in memory and/or on storage device.
44//
45
46typedef struct as_index_s {
47
48 // offset: 0
49 uint16_t rc; // for now, incremented & decremented only when reducing sprig
50
51 // offset: 2
52 uint8_t : 8; // reserved for bigger rc, if needed
53
54 // offset: 3
55 uint8_t tree_id: 6;
56 uint8_t color: 1;
57 uint8_t : 1;
58
59 // offset: 4
60 cf_digest keyd;
61
62 // offset: 24
63 uint64_t left_h: 40;
64 uint64_t right_h: 40;
65
66 // offset: 34
67 uint16_t set_id_bits: 10;
68 uint16_t : 6;
69
70 // offset: 36
71 uint32_t : 2;
72 uint32_t void_time: 30;
73
74 // offset: 40
75 uint64_t last_update_time: 40;
76 uint64_t generation: 16;
77
78 // offset: 47
79 // Used by the storage engines.
80 uint64_t rblock_id: 37; // can address 2^37 * 16b = 2Tb drive
81 uint64_t n_rblocks: 19; // is enough for 8Mb/16b = 512K rblocks
82 uint64_t file_id: 7; // can spec 2^7 = 128 drives
83 uint64_t key_stored: 1;
84
85 // offset: 55
86 // In single-bin mode for data-in-memory namespaces, this offset is cast to
87 // an as_bin, but only 4 bits get used (for the iparticle state). The other
88 // 4 bits are used for replication state and index flags.
89 uint8_t repl_state: 2;
90 uint8_t tombstone: 1;
91 uint8_t cenotaph: 1;
92 uint8_t single_bin_state: 4; // used indirectly, only in single-bin mode
93
94 // offset: 56
95 // For data-not-in-memory namespaces, these 8 bytes are currently unused.
96 // For data-in-memory namespaces: in single-bin mode the as_bin is embedded
97 // here (these 8 bytes plus 4 bits in flex_bits above), but in multi-bin
98 // mode this is a pointer to either of:
99 // - an as_bin_space containing n_bins and an array of as_bin structs
100 // - an as_rec_space containing an as_bin_space pointer and other metadata
101 void* dim;
102
103 // final size: 64
104
105} __attribute__ ((__packed__)) as_index;
106
107COMPILER_ASSERT(sizeof(as_index) == 64);
108
109#define AS_INDEX_SINGLE_BIN_OFFSET 55 // can't use offsetof() with bit fields
110
111
112//==========================================================
113// Accessor functions for bits in as_index.
114//
115
116#define as_index_reserve(_r) ({ \
117 uint16_t rc = as_aaf_uint16(&(_r->rc), 1); \
118 cf_assert(rc != 0, AS_INDEX, "ref-count overflow"); \
119 rc; \
120})
121
122#define as_index_release(_r) ({ \
123 uint16_t rc = as_aaf_uint16(&(_r->rc), -1); \
124 cf_assert(rc != (uint16_t)-1, AS_INDEX, "ref-count underflow"); \
125 rc; \
126})
127
128#define TREE_ID_NUM_BITS 6
129#define MAX_NUM_TREE_IDS (1 << TREE_ID_NUM_BITS) // 64
130#define TREE_ID_MASK (MAX_NUM_TREE_IDS - 1) // 0x3F
131
132COMPILER_ASSERT(MAX_NUM_TREE_IDS <= 64); // must fit in 64-bit map
133
134// Fast way to clear the record portion of as_index.
135// Note - relies on current layout and size of as_index!
136// FIXME - won't be able to "rescue" with future sindex method - will go away.
137static inline
138void as_index_clear_record_info(as_index *index)
139{
140 index->set_id_bits = 0;
141
142 *(uint32_t*)((uint8_t*)index + 36) = 0;
143
144 uint64_t *p_clear = (uint64_t*)((uint8_t*)index + 40);
145
146 *p_clear++ = 0;
147 *p_clear++ = 0;
148 *p_clear = 0;
149}
150
151// Generation 0 is never written, and generation plays no role in record
152// destruction, so it works to flag deleted records.
153static inline
154void as_index_invalidate_record(as_index *index) {
155 index->generation = 0;
156}
157
158static inline
159bool as_index_is_valid_record(as_index *index) {
160 return index->generation != 0;
161}
162
163
164//------------------------------------------------
165// Single bin, as_bin_space & as_rec_space.
166//
167
168static inline
169as_bin *as_index_get_single_bin(const as_index *index) {
170 // We only use 4 bits of the first byte for the bin state.
171 return (as_bin*)((uint8_t *)index + AS_INDEX_SINGLE_BIN_OFFSET);
172}
173
174static inline
175as_bin_space* as_index_get_bin_space(const as_index *index) {
176 return index->key_stored == 1 ?
177 ((as_rec_space*)index->dim)->bin_space : (as_bin_space*)index->dim;
178}
179
180static inline
181void as_index_set_bin_space(as_index* index, as_bin_space* bin_space) {
182 if (index->key_stored == 1) {
183 ((as_rec_space*)index->dim)->bin_space = bin_space;
184 }
185 else {
186 index->dim = (void*)bin_space;
187 }
188}
189
190
191//------------------------------------------------
192// Set-ID bits.
193//
194
195static inline
196uint16_t as_index_get_set_id(const as_index *index) {
197 return index->set_id_bits;
198}
199
200static inline
201void as_index_set_set_id(as_index *index, uint16_t set_id) {
202 // TODO - check that it fits in the 10 bits ???
203 index->set_id_bits = set_id;
204}
205
206
207//------------------------------------------------
208// Set-ID helpers.
209//
210
211static inline
212int as_index_set_set_w_len(as_index *index, as_namespace *ns,
213 const char *set_name, size_t len, bool apply_restrictions) {
214 uint16_t set_id;
215 int rv = as_namespace_set_set_w_len(ns, set_name, len, &set_id,
216 apply_restrictions);
217
218 if (rv != 0) {
219 return rv;
220 }
221
222 as_index_set_set_id(index, set_id);
223 return 0;
224}
225
226static inline
227const char *as_index_get_set_name(const as_index *index, as_namespace *ns) {
228 return as_namespace_get_set_name(ns, as_index_get_set_id(index));
229}
230
231
232//==========================================================
233// Handling as_index objects.
234//
235
236// Container for as_index pointer with lock and location.
237struct as_index_ref_s {
238 as_index *r;
239 cf_arenax_handle r_h;
240 cf_arenax_puddle *puddle;
241 cf_mutex *olock;
242};
243
244
245//==========================================================
246// Index tree.
247//
248
249typedef void (*as_index_tree_done_fn) (uint8_t id, void *udata);
250
251typedef struct as_index_tree_s {
252 uint8_t id;
253 as_index_tree_done_fn done_cb;
254 void *udata;
255
256 // Data common to all trees in a namespace.
257 as_index_tree_shared *shared;
258
259 cf_atomic64 n_elements;
260
261 // Variable length data, dependent on configuration.
262 uint8_t data[];
263} as_index_tree;
264
265
266//==========================================================
267// as_index_tree variable length data components.
268//
269
270#define NUM_LOCK_PAIRS 256 // per partition
271
272typedef struct as_lock_pair_s {
273 // Note: reduce_lock's scope is always inside of lock's scope.
274 cf_mutex lock; // insert, delete vs. insert, delete, get
275 cf_mutex reduce_lock; // insert, delete vs. reduce
276} as_lock_pair;
277
278#define NUM_SPRIG_BITS 28 // 3.5 bytes - yes, that's a lot of sprigs
279
280typedef struct as_sprig_s {
281 uint64_t n_elements: 24; // max 16M records per sprig
282 uint64_t root_h: 40;
283} as_sprig;
284
285static inline as_lock_pair *
286tree_locks(as_index_tree *tree)
287{
288 return (as_lock_pair*)tree->data;
289}
290
291static inline as_sprig *
292tree_sprigs(as_index_tree *tree)
293{
294 return (as_sprig*)(tree->data + tree->shared->sprigs_offset);
295}
296
297static inline cf_arenax_puddle *
298tree_puddle_for_sprig(as_index_tree *tree, int sprig_i)
299{
300 uint32_t puddles_offset = tree->shared->puddles_offset;
301 return puddles_offset == 0 ?
302 NULL : (cf_arenax_puddle*)(tree->data + puddles_offset) + sprig_i;
303}
304
305static inline cf_arenax_puddle *
306tree_puddles(as_index_tree *tree)
307{
308 return tree_puddle_for_sprig(tree, 0);
309}
310
311static inline size_t
312tree_puddles_size(as_index_tree_shared *shared)
313{
314 return shared->puddles_offset == 0 ?
315 0 : sizeof(cf_arenax_puddle) * shared->n_sprigs;
316}
317
318static inline uint32_t
319tree_puddles_count(as_index_tree_shared *shared)
320{
321 return shared->puddles_offset == 0 ? 0 : shared->n_sprigs;
322}
323
324
325//------------------------------------------------
326// as_index_tree public API.
327//
328
329void as_index_tree_gc_init();
330int as_index_tree_gc_queue_size();
331
332as_index_tree *as_index_tree_create(as_index_tree_shared *shared, uint8_t id, as_index_tree_done_fn cb, void *udata);
333as_index_tree *as_index_tree_resume(as_index_tree_shared *shared, as_treex* xmem_trees, uint32_t pid, as_index_tree_done_fn cb, void *udata);
334void as_index_tree_block(as_index_tree *tree);
335void as_index_tree_reserve(as_index_tree *tree);
336int as_index_tree_release(as_index_tree *tree);
337uint64_t as_index_tree_size(as_index_tree *tree);
338
339typedef void (*as_index_reduce_fn) (as_index_ref *value, void *udata);
340
341void as_index_reduce(as_index_tree *tree, as_index_reduce_fn cb, void *udata);
342void as_index_reduce_partial(as_index_tree *tree, uint64_t sample_count, as_index_reduce_fn cb, void *udata);
343
344void as_index_reduce_live(as_index_tree *tree, as_index_reduce_fn cb, void *udata);
345void as_index_reduce_partial_live(as_index_tree *tree, uint64_t sample_count, as_index_reduce_fn cb, void *udata);
346
347int as_index_try_exists(as_index_tree *tree, const cf_digest *keyd);
348int as_index_try_get_vlock(as_index_tree *tree, const cf_digest *keyd, as_index_ref *index_ref);
349int as_index_get_vlock(as_index_tree *tree, const cf_digest *keyd, as_index_ref *index_ref);
350int as_index_get_insert_vlock(as_index_tree *tree, const cf_digest *keyd, as_index_ref *index_ref);
351int as_index_delete(as_index_tree *tree, const cf_digest *keyd);
352
353
354//------------------------------------------------
355// Private API - for enterprise separation only.
356//
357
358// Container for sprig-level function parameters.
359typedef struct as_index_sprig_s {
360 as_index_value_destructor destructor;
361 void *destructor_udata;
362
363 cf_arenax *arena;
364
365 as_lock_pair *pair;
366 as_sprig *sprig;
367 cf_arenax_puddle *puddle;
368} as_index_sprig;
369
370uint64_t as_index_sprig_keyd_reduce_partial(as_index_sprig *isprig, uint64_t sample_count, as_index_reduce_fn cb, void *udata);
371
372int as_index_sprig_get_vlock(as_index_sprig *isprig, const cf_digest *keyd, as_index_ref *index_ref);
373int as_index_sprig_delete(as_index_sprig *isprig, const cf_digest *keyd);
374
375static inline void
376as_index_sprig_from_keyd(as_index_tree *tree, as_index_sprig *isprig,
377 const cf_digest *keyd)
378{
379 // Get the 28 most significant non-pid bits in the digest. Note - this is
380 // hardwired around the way we currently extract the (12 bit) partition-ID
381 // from the digest.
382 uint32_t bits = (((uint32_t)keyd->digest[1] & 0xF0) << 20) |
383 ((uint32_t)keyd->digest[2] << 16) |
384 ((uint32_t)keyd->digest[3] << 8) |
385 (uint32_t)keyd->digest[4];
386
387 uint32_t lock_i = bits >> tree->shared->locks_shift;
388 uint32_t sprig_i = bits >> tree->shared->sprigs_shift;
389
390 isprig->destructor = tree->shared->destructor;
391 isprig->destructor_udata = tree->shared->destructor_udata;
392 isprig->arena = tree->shared->arena;
393 isprig->pair = tree_locks(tree) + lock_i;
394 isprig->sprig = tree_sprigs(tree) + sprig_i;
395 isprig->puddle = tree_puddle_for_sprig(tree, sprig_i);
396}
397
398#define SENTINEL_H 0
399
400#define RESOLVE_H(__h) ((as_index*)cf_arenax_resolve(isprig->arena, __h))
401
402// Flag to indicate full index reduce.
403#define AS_REDUCE_ALL (-1L)
404