1/*
2 * datamodel.h
3 *
4 * Copyright (C) 2008-2016 Aerospike, Inc.
5 *
6 * Portions may be licensed to Aerospike, Inc. under one or more contributor
7 * license agreements.
8 *
9 * This program is free software: you can redistribute it and/or modify it under
10 * the terms of the GNU Affero General Public License as published by the Free
11 * Software Foundation, either version 3 of the License, or (at your option) any
12 * later version.
13 *
14 * This program is distributed in the hope that it will be useful, but WITHOUT
15 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
16 * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
17 * details.
18 *
19 * You should have received a copy of the GNU Affero General Public License
20 * along with this program. If not, see http://www.gnu.org/licenses/
21 */
22
23/*
24 * core data model structures and definitions
25 */
26
27#pragma once
28
29#include <limits.h>
30#include <stdbool.h>
31#include <stddef.h>
32#include <stdint.h>
33#include <string.h>
34
35#include "aerospike/as_val.h"
36#include "citrusleaf/cf_atomic.h"
37#include "citrusleaf/cf_clock.h"
38#include "citrusleaf/cf_digest.h"
39
40#include "arenax.h"
41#include "cf_mutex.h"
42#include "dynbuf.h"
43#include "hist.h"
44#include "hist_track.h"
45#include "linear_hist.h"
46#include "msg.h"
47#include "node.h"
48#include "shash.h"
49#include "vmapx.h"
50#include "xmem.h"
51
52#include "base/cfg.h"
53#include "base/proto.h"
54#include "base/transaction_policy.h"
55#include "base/truncate.h"
56#include "fabric/hb.h"
57#include "fabric/partition.h"
58#include "storage/flat.h"
59#include "storage/storage.h"
60
61
// Histogram bucket counts - presumably for the object-size and TTL histograms
// respectively (TODO confirm where the histograms are created).
#define OBJ_SIZE_HIST_NUM_BUCKETS 1024
#define TTL_HIST_NUM_BUCKETS 100

// Largest allowed TTL. The factors (3600 * 24 * 365 * 10) indicate the unit is
// seconds.
#define MAX_ALLOWED_TTL (3600 * 24 * 365 * 10) // 10 years

// Which bytes of a digest are used for what (byte index ranges):
// [0-1] for partition-id
// [1-4] for tree sprigs and locks
// [5-7] unused
// [8-11] for SSD device hash
#define DIGEST_STORAGE_BASE_BYTE 8
// [12-15] for rw_request hash
#define DIGEST_HASH_BASE_BYTE 12
// [16-19] for pred-exp filter
75
76
/* Forward declarations */
// Declared up front so the prototypes and structs that follow can use these
// types by pointer before (or without) seeing the full definitions.
typedef struct as_namespace_s as_namespace;
typedef struct as_index_s as_record; // as_record is an alias for the as_index_s struct
typedef struct as_bin_s as_bin;
typedef struct as_index_ref_s as_index_ref;
typedef struct as_set_s as_set;

// Tag only - no typedef is introduced here for the index tree.
struct as_index_tree_s;
85
86
// Size of the 'name' character array in struct as_namespace_s.
#define AS_ID_NAMESPACE_SZ 32

// Presumably the buffer size for secondary index names - TODO confirm.
#define AS_ID_INAME_SZ 256

#define AS_BIN_NAME_MAX_SZ 16 // changing this breaks warm restart
#define MAX_BIN_NAMES 0x10000 // no need for more - numeric ID is 16 bits
#define BIN_NAMES_QUOTA (MAX_BIN_NAMES / 2) // don't add more names than this via client transactions
94
/*
 * Wrap-around aware "less than" for 16-bit generation counts.
 *
 * Returns true if lhs is considered to be behind rhs. The answer is
 * meaningful only while the two counts are close enough together:
 *
 * - rhs leads lhs by no more than 32,768, or
 * - lhs leads rhs by no more than 32,767.
 */

static inline bool
as_gen_less_than(uint16_t lhs, uint16_t rhs)
{
	// Distance from rhs to lhs, modulo 2^16.
	const uint16_t delta = (uint16_t)(lhs - rhs);

	// Top bit set means the distance is in the upper half of the range, i.e.
	// lhs is actually behind rhs.
	return (delta & 0x8000) != 0;
}
108
109
/* as_particle_type
 * Particles are typed, which reflects their contents:
 * NULL: no associated content (not sure I really need this internally?)
 * INTEGER: a signed, 64-bit integer
 * FLOAT: a floating point
 * STRING: a null-terminated UTF-8 string
 * BLOB: arbitrary-length binary data
 * JAVA/CSHARP/PYTHON/RUBY/PHP/ERLANG_BLOB: language-tagged blob variants -
 *   presumably serialized client-language objects (TODO confirm)
 * MAP, LIST: collection types (see the CDT declarations in this header)
 * GEOJSON: geospatial data (see the geojson declarations in this header)
 *
 * Note - this comment used to describe TIMESTAMP and DIGEST types as well, but
 * the enum has no enumerators for them.
 *
 * The numeric values are not contiguous. MAX is one more than the highest
 * assigned value, and BAD is an alias for MAX. */
typedef enum {
	AS_PARTICLE_TYPE_NULL = 0,
	AS_PARTICLE_TYPE_INTEGER = 1,
	AS_PARTICLE_TYPE_FLOAT = 2,
	AS_PARTICLE_TYPE_STRING = 3,
	AS_PARTICLE_TYPE_BLOB = 4,
	AS_PARTICLE_TYPE_JAVA_BLOB = 7,
	AS_PARTICLE_TYPE_CSHARP_BLOB = 8,
	AS_PARTICLE_TYPE_PYTHON_BLOB = 9,
	AS_PARTICLE_TYPE_RUBY_BLOB = 10,
	AS_PARTICLE_TYPE_PHP_BLOB = 11,
	AS_PARTICLE_TYPE_ERLANG_BLOB = 12,
	AS_PARTICLE_TYPE_MAP = 19,
	AS_PARTICLE_TYPE_LIST = 20,
	AS_PARTICLE_TYPE_GEOJSON = 23,
	AS_PARTICLE_TYPE_MAX = 24,
	AS_PARTICLE_TYPE_BAD = AS_PARTICLE_TYPE_MAX
} as_particle_type;
137
/* as_particle
 * The common part of a particle
 * this is poor man's subclassing - IE, how to do a subclassed interface in C
 * Go look in particle.c to see all the subclass implementation and structure
 *
 * The struct is packed: a single metadata byte, followed directly by the
 * zero-length data[] member which marks where type-specific content starts. */
typedef struct as_particle_s {
	uint8_t metadata; // used by the iparticle for is_integer and inuse, as well as version in multi bin mode only
	                  // used by *particle for type
	uint8_t data[0];
} __attribute__ ((__packed__)) as_particle;
147
// Values for the 4-bit 'state' field of as_particle_iparticle (4 bits, so at
// most 16 values). Despite the historical "bit flag" description, these are
// mutually exclusive values that the helpers in this header compare for
// equality - they are not OR-ed together.
#define AS_BIN_STATE_UNUSED 0
#define AS_BIN_STATE_INUSE_INTEGER 1
#define AS_BIN_STATE_RECYCLE_ME 2 // was - hidden bin
#define AS_BIN_STATE_INUSE_OTHER 3
#define AS_BIN_STATE_INUSE_FLOAT 4
154
// Bit-field view of the one-byte particle header. The as_bin helpers in this
// header cast an as_bin pointer to this type to read or write 'state', i.e.
// this overlays the first byte of an as_bin (its embedded 'iparticle').
typedef struct as_particle_iparticle_s {
	uint8_t version: 4; // now unused - and can't be used in single-bin config
	uint8_t state: 4; // see AS_BIN_STATE_...
	uint8_t data[0];
} __attribute__ ((__packed__)) as_particle_iparticle;
160
161/* Particle function declarations */
162
163static inline bool
164is_embedded_particle_type(as_particle_type type)
165{
166 return type == AS_PARTICLE_TYPE_INTEGER || type == AS_PARTICLE_TYPE_FLOAT;
167}
168
// Particle-level helpers. Declarations only - the definitions are not in this
// header.

// Derive a particle type from an as_val, or from a msgpack buffer of
// packed_size bytes.
extern as_particle_type as_particle_type_from_asval(const as_val *val);
extern as_particle_type as_particle_type_from_msgpack(const uint8_t *packed, uint32_t packed_size);

extern uint32_t as_particle_size_from_asval(const as_val *val);

// as_val to client (wire) op conversion, and the size query that goes with it.
extern uint32_t as_particle_asval_client_value_size(const as_val *val);
extern uint32_t as_particle_asval_to_client(const as_val *val, as_msg_op *op);

// Takes a position in a flat buffer bounded by 'end' and returns a position.
// NOTE(review): presumably returns the position just past one flat particle -
// the failure convention is not visible here, confirm at the definition.
extern const uint8_t *as_particle_skip_flat(const uint8_t *flat, const uint8_t *end);
178
// as_bin particle function declarations
// (Declarations only - the definitions are not in this header.)

extern void as_bin_particle_destroy(as_bin *b, bool free_particle);
extern uint32_t as_bin_particle_size(as_bin *b);

// wire:
// The "alloc" and "stack" variants come in pairs - the "stack" ones take an
// extra cf_ll_buf (particles_llb). NOTE(review): presumably "stack" variants
// place particle memory in that buffer while "alloc" variants heap-allocate -
// confirm at the definitions. The int return conventions are not visible here.
extern int as_bin_particle_alloc_modify_from_client(as_bin *b, const as_msg_op *op);
extern int as_bin_particle_stack_modify_from_client(as_bin *b, cf_ll_buf *particles_llb, const as_msg_op *op);
extern int as_bin_particle_alloc_from_client(as_bin *b, const as_msg_op *op);
extern int as_bin_particle_stack_from_client(as_bin *b, cf_ll_buf *particles_llb, const as_msg_op *op);
// The pickled readers take a pointer to the read position (p_pickled), so they
// can update it, plus the buffer end.
extern int as_bin_particle_alloc_from_pickled(as_bin *b, const uint8_t **p_pickled, const uint8_t *end);
extern int as_bin_particle_stack_from_pickled(as_bin *b, cf_ll_buf *particles_llb, const uint8_t **p_pickled, const uint8_t *end);
extern uint32_t as_bin_particle_client_value_size(const as_bin *b);
extern uint32_t as_bin_particle_to_client(const as_bin *b, as_msg_op *op);
extern uint32_t as_bin_particle_pickled_size(const as_bin *b);
extern uint32_t as_bin_particle_to_pickled(const as_bin *b, uint8_t *pickled);
195
// Different for blob bitwise operations - we don't use the normal APIs and
// particle table functions.
// Note - unlike the normal wire functions, these take a non-const as_msg_op.
extern int as_bin_bits_read_from_client(const as_bin *b, as_msg_op *op, as_bin *result);
extern int as_bin_bits_alloc_modify_from_client(as_bin *b, as_msg_op *op);
extern int as_bin_bits_stack_modify_from_client(as_bin *b, cf_ll_buf *particles_llb, as_msg_op *op);

// Different for CDTs - the operations may return results, so we don't use the
// normal APIs and particle table functions.
// Every CDT variant, modify included, takes a 'result' bin.
extern int as_bin_cdt_read_from_client(const as_bin *b, as_msg_op *op, as_bin *result);
extern int as_bin_cdt_alloc_modify_from_client(as_bin *b, as_msg_op *op, as_bin *result);
extern int as_bin_cdt_stack_modify_from_client(as_bin *b, cf_ll_buf *particles_llb, as_msg_op *op, as_bin *result);
207
// as_val:
// Conversions between a bin's particle and an as_val.
extern int as_bin_particle_replace_from_asval(as_bin *b, const as_val *val);
// Caller supplies the 'stack' memory. NOTE(review): presumably it must be at
// least as_particle_size_from_asval(val) bytes - confirm at the definition.
extern void as_bin_particle_stack_from_asval(as_bin *b, uint8_t* stack, const as_val *val);
extern as_val *as_bin_particle_to_asval(const as_bin *b);

// msgpack:
extern int as_bin_particle_alloc_from_msgpack(as_bin *b, const uint8_t *packed, uint32_t packed_size);

// flat:
// The readers take a position in a flat buffer bounded by 'end' and return a
// position. NOTE(review): presumably the position just past the particle that
// was consumed - failure convention not visible here.
extern const uint8_t *as_bin_particle_cast_from_flat(as_bin *b, const uint8_t *flat, const uint8_t *end);
extern const uint8_t *as_bin_particle_replace_from_flat(as_bin *b, const uint8_t *flat, const uint8_t *end);
extern uint32_t as_bin_particle_flat_size(as_bin *b);
extern uint32_t as_bin_particle_to_flat(const as_bin *b, uint8_t *flat);
221
// odd as_bin particle functions for specific particle types
// (Declarations only - the definitions are not in this header.)

// integer:
extern int64_t as_bin_particle_integer_value(const as_bin *b);
extern void as_bin_particle_integer_set(as_bin *b, int64_t i);

// string:
// Hands back a pointer via p_value. NOTE(review): the uint32_t return is
// presumably the string's length - confirm at the definition.
extern uint32_t as_bin_particle_string_ptr(const as_bin *b, char **p_value);

// blob:
extern int as_bin_bits_packed_read(const as_bin *b, const as_msg_op *msg_op, as_bin *result);
extern int as_bin_bits_packed_modify(as_bin *b, const as_msg_op *msg_op, cf_ll_buf *particles_llb);

// geojson:
// geo_region_t is an opaque handle - note the typedef hides a pointer.
typedef void * geo_region_t;
#define MAX_REGION_CELLS 256
#define MAX_REGION_LEVELS 30
// Hands back a cell ID array via pp_cells. NOTE(review): the size_t return is
// presumably the number of cell IDs - confirm at the definition.
extern size_t as_bin_particle_geojson_cellids(const as_bin *b, uint64_t **pp_cells);
extern bool as_particle_geojson_match(as_particle *p, uint64_t cellid, geo_region_t region, bool is_strict);
extern bool as_particle_geojson_match_asval(const as_val *val, uint64_t cellid, geo_region_t region, bool is_strict);
// (Declared without 'extern', unlike its neighbors - no difference in linkage.)
char const *as_geojson_mem_jsonstr(const as_particle *p, size_t *p_jsonsz);

// list:
// Only used by pointer here, so tag declarations are sufficient.
struct cdt_payload_s;
struct rollback_alloc_s;
extern void as_bin_particle_list_get_packed_val(const as_bin *b, struct cdt_payload_s *packed);

extern int as_bin_cdt_packed_read(const as_bin *b, const as_msg_op *op, as_bin *result);
extern int as_bin_cdt_packed_modify(as_bin *b, const as_msg_op *op, as_bin *result, cf_ll_buf *particles_llb);
251
252
/* as_bin
 * A bin container. Whether a bin is in use is determined by the state held in
 * its first byte (see AS_BIN_STATE_... and as_bin_inuse()) - a state of
 * AS_BIN_STATE_UNUSED means unused.
 *
 * The struct is packed. The helpers in this header access the first byte by
 * casting the as_bin pointer to as_particle_iparticle. */
struct as_bin_s {
	as_particle iparticle; // 1 byte
	as_particle *particle; // for embedded particle this is value, not pointer

	// Never read or write these bytes in single-bin configuration:
	uint16_t id; // ID of bin name
	uint8_t unused; // pad to 12 bytes (multiple of 4) - legacy
} __attribute__ ((__packed__)) ;
263
// For data-in-memory namespaces in multi-bin mode, we keep an array of as_bin
// structs in memory, accessed via this struct.
// Packed - the bins array starts immediately after the 2-byte count.
typedef struct as_bin_space_s {
	uint16_t n_bins; // number of elements in bins[]
	as_bin bins[0];
} __attribute__ ((__packed__)) as_bin_space;
270
271// TODO - Do we really need to pad as_bin to 12 bytes for thread safety?
272// Do we ever write & read adjacent as_bin structures in a bins array from
273// different threads when not under the record lock? And if we're worried about
274// 4-byte alignment for that or any other reason, wouldn't we also have to pad
275// after n_bins in as_bin_space?
276
// For data-in-memory namespaces in multi-bin mode, if we're storing extra
// record metadata, we access it via this struct. In this case the index points
// here instead of directly to an as_bin_space.
// Packed - the key bytes start immediately after key_size.
typedef struct as_rec_space_s {
	as_bin_space* bin_space;

	// So far the key is the only extra record metadata we store in memory.
	uint32_t key_size; // number of bytes in key[]
	uint8_t key[0];
} __attribute__ ((__packed__)) as_rec_space;
287
288// For copying as_bin structs without the last 3 bytes.
289static inline void
290as_single_bin_copy(as_bin *to, const as_bin *from)
291{
292 to->iparticle = from->iparticle;
293 to->particle = from->particle;
294}
295
296static inline bool
297as_bin_inuse(const as_bin *b)
298{
299 return (((as_particle_iparticle *)b)->state);
300}
301
302static inline uint8_t
303as_bin_state(const as_bin *b)
304{
305 return ((as_particle_iparticle *)b)->state;
306}
307
308static inline void
309as_bin_state_set(as_bin *b, uint8_t val)
310{
311 ((as_particle_iparticle *)b)->state = val;
312}
313
314static inline void
315as_bin_state_set_from_type(as_bin *b, as_particle_type type)
316{
317 switch (type) {
318 case AS_PARTICLE_TYPE_NULL:
319 ((as_particle_iparticle *)b)->state = AS_BIN_STATE_UNUSED;
320 break;
321 case AS_PARTICLE_TYPE_INTEGER:
322 ((as_particle_iparticle *)b)->state = AS_BIN_STATE_INUSE_INTEGER;
323 break;
324 case AS_PARTICLE_TYPE_FLOAT:
325 ((as_particle_iparticle *)b)->state = AS_BIN_STATE_INUSE_FLOAT;
326 break;
327 default:
328 ((as_particle_iparticle *)b)->state = AS_BIN_STATE_INUSE_OTHER;
329 break;
330 }
331}
332
333static inline bool
334as_bin_inuse_has(const as_storage_rd *rd)
335{
336 // In-use bins are at the beginning - only need to check the first bin.
337 return rd->n_bins != 0 && (rd->pickle != NULL || as_bin_inuse(rd->bins));
338}
339
340static inline uint16_t
341as_bin_inuse_count(const as_storage_rd *rd)
342{
343 for (uint16_t i = 0; i < rd->n_bins; i++) {
344 if (! as_bin_inuse(&rd->bins[i])) {
345 return i;
346 }
347 }
348
349 return rd->n_bins;
350}
351
352static inline void
353as_bin_set_empty(as_bin *b)
354{
355 as_bin_state_set(b, AS_BIN_STATE_UNUSED);
356}
357
358static inline void
359as_bin_set_empty_shift(as_storage_rd *rd, uint32_t i)
360{
361 // Shift the bins over, so there's no space between used bins.
362 // This can overwrite the "emptied" bin, and that's fine.
363
364 uint16_t j;
365
366 for (j = i + 1; j < rd->n_bins; j++) {
367 if (! as_bin_inuse(&rd->bins[j])) {
368 break;
369 }
370 }
371
372 uint16_t n = j - (i + 1);
373
374 if (n) {
375 memmove(&rd->bins[i], &rd->bins[i + 1], n * sizeof(as_bin));
376 }
377
378 // Mark the last bin that was *formerly* in use as null.
379 as_bin_set_empty(&rd->bins[j - 1]);
380}
381
382static inline void
383as_bin_set_empty_from(as_storage_rd *rd, uint16_t from) {
384 for (uint16_t i = from; i < rd->n_bins; i++) {
385 as_bin_set_empty(&rd->bins[i]);
386 }
387}
388
389static inline void
390as_bin_set_all_empty(as_storage_rd *rd) {
391 as_bin_set_empty_from(rd, 0);
392}
393
394static inline bool
395as_bin_is_embedded_particle(const as_bin *b) {
396 return ((as_particle_iparticle *)b)->state == AS_BIN_STATE_INUSE_INTEGER ||
397 ((as_particle_iparticle *)b)->state == AS_BIN_STATE_INUSE_FLOAT;
398}
399
400static inline bool
401as_bin_is_external_particle(const as_bin *b) {
402 return ((as_particle_iparticle *)b)->state == AS_BIN_STATE_INUSE_OTHER;
403}
404
405static inline as_particle *
406as_bin_get_particle(as_bin *b) {
407 return as_bin_is_embedded_particle(b) ? &b->iparticle : b->particle;
408}
409
410// "Embedded" types like integer are stored directly, but other bin types
411// ("other") must follow an indirection to get the actual type.
412static inline uint8_t
413as_bin_get_particle_type(const as_bin *b) {
414 switch (((as_particle_iparticle *)b)->state) {
415 case AS_BIN_STATE_INUSE_INTEGER:
416 return AS_PARTICLE_TYPE_INTEGER;
417 case AS_BIN_STATE_INUSE_FLOAT:
418 return AS_PARTICLE_TYPE_FLOAT;
419 case AS_BIN_STATE_INUSE_OTHER:
420 return b->particle->metadata;
421 default:
422 return AS_PARTICLE_TYPE_NULL;
423 }
424}
425
426
/* Bin function declarations */
// (Declarations only - the definitions are not in this header.)

// Bin name / numeric ID mapping, per namespace.
// NOTE(review): as_bin_get_id() returns int16_t while bin IDs elsewhere in
// this header are uint16_t - presumably a negative value means "not found",
// which would leave IDs above INT16_MAX unrepresentable. Confirm at the
// definition.
extern int16_t as_bin_get_id(as_namespace *ns, const char *name);
extern bool as_bin_get_or_assign_id_w_len(as_namespace *ns, const char *name, size_t len, uint16_t *id);
extern const char* as_bin_get_name_from_id(as_namespace *ns, uint16_t id);
extern bool as_bin_name_within_quota(as_namespace *ns, const char *name);
extern void as_bin_copy(as_namespace *ns, as_bin *to, const as_bin *from);

// Loading a record's bins into a storage record descriptor.
extern int as_storage_rd_load_n_bins(as_storage_rd *rd);
extern int as_storage_rd_load_bins(as_storage_rd *rd, as_bin *stack_bins);

// Bin lookup and creation. The "_from_buf" variants take the name as a byte
// buffer plus length rather than as a C string. The creating "_from_buf"
// variants also report a status through 'result'.
extern void as_bin_get_all_p(as_storage_rd *rd, as_bin **bin_ptrs);
extern as_bin *as_bin_get_by_id(as_storage_rd *rd, uint32_t id);
extern as_bin *as_bin_get(as_storage_rd *rd, const char *name);
extern as_bin *as_bin_get_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len);
extern as_bin *as_bin_create_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len, int *result);
extern as_bin *as_bin_get_or_create(as_storage_rd *rd, const char *name);
extern as_bin *as_bin_get_or_create_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len, int *result);
// Signed return - NOTE(review): presumably negative when there's no such bin.
extern int32_t as_bin_get_index(as_storage_rd *rd, const char *name);
extern int32_t as_bin_get_index_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len);
extern void as_bin_destroy(as_storage_rd *rd, uint16_t i);
// 'delta' is signed - NOTE(review): presumably the change in bin count, which
// may be negative. Confirm at the definition.
extern void as_bin_allocate_bin_space(as_storage_rd *rd, int32_t delta);
446
447
// Policies for resolving a conflict between two versions of a record. A value
// of this type is the first parameter of as_record_resolve_conflict(), and one
// is stored per namespace (as_namespace_s.conflict_resolution_policy).
typedef enum {
	AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_UNDEF = 0,
	AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_GENERATION = 1,
	AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_LAST_UPDATE_TIME = 2,
	AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_CP = 3
} conflict_resolution_pol;
454
/* Record function declarations */

// Takes no arguments. Declared with (void) so that this is a true prototype -
// an empty parameter list would leave the arguments unspecified and let calls
// that wrongly pass arguments compile without complaint.
// NOTE(review): judging by the name, returns a clock skew threshold in seconds
// beyond which writes are stopped - confirm at the definition.
extern uint32_t clock_skew_stop_writes_sec(void);
// Record-level API. Declarations only - the definitions are not in this
// header, and the int return conventions are not visible here.

// Clock skew, generation and last-update-time helpers.
extern bool as_record_handle_clock_skew(as_namespace* ns, uint64_t skew_ms);
extern uint16_t plain_generation(uint16_t regime_generation, const as_namespace* ns);
extern void as_record_set_lut(as_record *r, uint32_t regime, uint64_t now_ms, const as_namespace* ns);
extern void as_record_increment_generation(as_record *r, const as_namespace* ns);
extern bool as_record_is_live(const as_record *r);

// Lookup (and creation) of a record in an index tree by digest. The "get"
// variants hand the record back through r_ref, the "exists" variants don't
// take an r_ref.
extern int as_record_get_create(struct as_index_tree_s *tree, const cf_digest *keyd, as_index_ref *r_ref, as_namespace *ns);
extern int as_record_get(struct as_index_tree_s *tree, const cf_digest *keyd, as_index_ref *r_ref);
extern int as_record_get_live(struct as_index_tree_s *tree, const cf_digest *keyd, as_index_ref *r_ref, as_namespace *ns);
extern int as_record_exists(struct as_index_tree_s *tree, const cf_digest *keyd);
extern int as_record_exists_live(struct as_index_tree_s *tree, const cf_digest *keyd, as_namespace *ns);
extern void as_record_rescue(as_index_ref *r_ref, as_namespace *ns);

// Bin cleanup.
extern void as_record_destroy_bins_from(as_storage_rd *rd, uint16_t from);
extern void as_record_destroy_bins(as_storage_rd *rd);
extern void as_record_free_bin_space(as_record *r);

// Record teardown / release.
extern void as_record_destroy(as_record *r, as_namespace *ns);
extern void as_record_done(as_index_ref *r_ref, as_namespace *ns);

// (Declared without 'extern', unlike its neighbors - no difference in linkage.)
void as_record_drop_stats(as_record* r, as_namespace* ns);

// Stored key handling, conflict resolution, pickling, and set assignment.
extern void as_record_finalize_key(as_record* r, as_namespace* ns, const uint8_t* key, uint32_t key_size);
extern void as_record_allocate_key(as_record* r, const uint8_t* key, uint32_t key_size);
extern int as_record_resolve_conflict(conflict_resolution_pol policy, uint16_t left_gen, uint64_t left_lut, uint16_t right_gen, uint64_t right_lut);
// Returns a buffer and reports its length via len_r. NOTE(review): ownership
// of the returned buffer is not visible here - confirm who frees it.
extern uint8_t *as_record_pickle(as_storage_rd *rd, size_t *len_r);
extern int as_record_write_from_pickle(as_storage_rd *rd);
extern int as_record_set_set_from_msg(as_record *r, as_namespace *ns, as_msg *m);
484
/*
 * Check whether a record pickle starts with a zero 16-bit value - presumably
 * the pickle's leading bin count, so that zero means the record has no bins
 * (TODO confirm against the pickle format).
 *
 * buf - start of the pickle. At least two bytes must be readable.
 *
 * Returns true if both of the first two bytes are zero.
 *
 * The bytes are tested individually rather than via a uint16_t pointer cast.
 * For a comparison against zero the result is identical whatever the byte
 * order, but this avoids reading through a possibly misaligned, type-punned
 * pointer, and avoids casting away const.
 */
static inline bool
as_record_pickle_is_binless(const uint8_t *buf)
{
	return buf[0] == 0 && buf[1] == 0;
}
490
// For enterprise split only.
// Takes the same generation / last-update-time pairs for the left and right
// records as as_record_resolve_conflict(), without the policy parameter.
int record_resolve_conflict_cp(uint16_t left_gen, uint64_t left_lut, uint16_t right_gen, uint64_t right_lut);
493
// Three-way comparison of two last-update-times. Returns 1 if right is later
// than left, -1 if left is later than right, and 0 if they are equal.
static inline int
resolve_last_update_time(uint64_t left, uint64_t right)
{
	if (right > left) {
		return 1;
	}

	if (right < left) {
		return -1;
	}

	return 0;
}
499
// Everything describing a record that arrived from another node ('src'). This
// is the single input bundle taken by as_record_replace_if_better().
// NOTE(review): which fields the sender fills in versus which are derived
// locally is not visible here - confirm at the call sites.
typedef struct as_remote_record_s {
	cf_node src; // node the record came from
	as_partition_reservation *rsv;
	cf_digest *keyd; // the record's digest

	// The record in pickled form, and its size in bytes.
	uint8_t *pickle;
	size_t pickle_sz;

	// Record metadata. Note - generation is 32 bits wide here, though the
	// generation helpers in this header work with uint16_t.
	uint32_t generation;
	uint32_t void_time;
	uint64_t last_update_time;

	// Set name, with explicit length.
	const char *set_name;
	size_t set_name_len;

	// Stored key, with explicit size.
	const uint8_t *key;
	size_t key_size;

	bool is_old_pickle; // TODO - old pickle - remove in "six months"

	uint16_t n_bins;
	as_flat_comp_meta cm;
	uint32_t meta_sz;

	uint8_t repl_state; // relevant only for enterprise edition
} as_remote_record;
526
// Takes a remote record plus flags describing how to apply it.
// NOTE(review): judging by the name, the local record is replaced only if the
// remote one wins conflict resolution - the int return convention is not
// visible here, confirm at the definition.
int as_record_replace_if_better(as_remote_record *rr, bool is_repl_write, bool skip_sindex, bool do_xdr_write);

// a simpler call that gives seconds in the right epoch
// (A plain alias for cf_clepoch_seconds().)
#define as_record_void_time_get() cf_clepoch_seconds()
bool as_record_is_expired(const as_record *r); // TODO - eventually inline
532
533static inline bool
534as_record_is_doomed(const as_record *r, struct as_namespace_s *ns)
535{
536 return as_record_is_expired(r) || as_truncate_record_is_truncated(r, ns);
537}
538
// Presumably the maximum number of secondary indexes per namespace - TODO
// confirm.
#define AS_SINDEX_MAX 256

// Limits and default for the number of partitions per secondary index.
#define MIN_PARTITIONS_PER_INDEX 1
#define MAX_PARTITIONS_PER_INDEX 256
#define DEFAULT_PARTITIONS_PER_INDEX 32
#define MAX_PARTITIONS_PER_INDEX_CHAR 3 // Number of characters in max partitions per index

// as_sindex structure which hangs from the ns.
// Secondary index states (1 through 4):
#define AS_SINDEX_INACTIVE 1 // On init, pre-loading
#define AS_SINDEX_ACTIVE 2 // On creation and afterwards
#define AS_SINDEX_DESTROY 3 // On destroy
// dummy sindex state when ai_btree_create() returns error this
// sindex is not available for any of the DML operations
#define AS_SINDEX_NOTCREATED 4 // Un-used flag
// Secondary index flags - each is a distinct bit, so they can be combined:
#define AS_SINDEX_FLAG_WACTIVE 0x01 // On ai btree create of sindex, never reset
#define AS_SINDEX_FLAG_RACTIVE 0x02 // When sindex scan of database is completed
#define AS_SINDEX_FLAG_DESTROY_CLEANUP 0x04 // Called for AI clean-up during si deletion
#define AS_SINDEX_FLAG_MIGRATE_CLEANUP 0x08 // Un-used
#define AS_SINDEX_FLAG_POPULATING 0x10 // Indicates current si scan job, reset when scan is done.

// Tag declarations only - these structs are defined outside this header.
struct as_sindex_s;
struct as_sindex_config_s;
561
#define AS_SET_MAX_COUNT 0x3FF // ID's 10 bits worth minus 1 (ID 0 means no set)

// Number of uint32_t words needed for a bitmap with one bit per possible bin
// name ID (MAX_BIN_NAMES bits in total). Used to size the binid_has_sindex
// array in struct as_namespace_s.
// The replacement list is fully parenthesized so the macro always expands as a
// single operand - without the outer parentheses an expression such as
// (x / AS_BINID_HAS_SINDEX_SIZE) would associate the wrong way.
#define AS_BINID_HAS_SINDEX_SIZE (MAX_BIN_NAMES / (sizeof(uint32_t) * CHAR_BIT))
564
565
// TODO - would be nice to put this in as_index.h:
// Callback invoked when as_index is destroyed.
// v - the as_index being destroyed.
// udata - opaque user data (see destructor_udata in as_index_tree_shared).
typedef void (*as_index_value_destructor) (struct as_index_s* v, void* udata);
569
// TODO - would be nice to put this in as_index.h:
// Index tree information kept once per namespace (as_namespace_s.tree_shared),
// rather than in each partition tree.
typedef struct as_index_tree_shared_s {
	cf_arenax* arena;

	// Destructor callback, and the udata to go with it.
	as_index_value_destructor destructor;
	void* destructor_udata;

	// Number of sprigs per partition tree.
	uint32_t n_sprigs;

	// Bit-shifts used to calculate indexes from digest bits.
	uint32_t locks_shift;
	uint32_t sprigs_shift;

	// Offsets into as_index_tree struct's variable-sized data.
	uint32_t sprigs_offset;
	uint32_t puddles_offset;
} as_index_tree_shared;
588
589
// One sprig's entry in an as_treex - a single 40-bit field. The struct is
// packed, presumably so that each entry occupies 5 bytes rather than 8 (TODO
// confirm with sizeof on the target compiler).
typedef struct as_sprigx_s {
	uint64_t root_h: 40; // presumably the arena handle of the sprig's root - TODO confirm
} __attribute__ ((__packed__)) as_sprigx;
593
// Partition tree info, reached via as_namespace_s.xmem_trees (which points
// into the persistent memory "treex" block).
typedef struct as_treex_s {
	int block_ix[AS_PARTITIONS]; // one entry per partition
	as_sprigx sprigxs[0]; // variable-length array of sprig entries follows
} as_treex;
598
599
600struct as_namespace_s {
601 //--------------------------------------------
602 // Data partitions - first, to 64-byte align.
603 //
604
605 as_partition partitions[AS_PARTITIONS];
606
607 //--------------------------------------------
608 // Name & ID.
609 //
610
611 char name[AS_ID_NAMESPACE_SZ];
612 uint32_t id; // this is 1-based
613 uint32_t namehash;
614
615 //--------------------------------------------
616 // Persistent memory.
617 //
618
619 // Persistent memory type (default is shmem).
620 cf_xmem_type xmem_type;
621 const void* xmem_type_cfg;
622
623 // Persistent memory "base" block ID for this namespace.
624 uint32_t xmem_id;
625
626 // Pointer to the persistent memory "base" block.
627 uint8_t* xmem_base;
628
629 // Pointer to partition tree info in persistent memory "treex" block.
630 as_treex* xmem_trees;
631
632 // Pointer to arena structure (not stages) in persistent memory base block.
633 cf_arenax* arena;
634
635 // Pointer to bin name vmap in persistent memory base block.
636 cf_vmapx* p_bin_name_vmap;
637
638 // Pointer to set information vmap in persistent memory base block.
639 cf_vmapx* p_sets_vmap;
640
641 // Temporary array of sets to hold config values until sets vmap is ready.
642 as_set* sets_cfg_array;
643 uint32_t sets_cfg_count;
644
645 // Configuration flags relevant for warm or cool restart.
646 uint32_t xmem_flags;
647
648 //--------------------------------------------
649 // Cold start.
650 //
651
652 // If true, read storage devices to build index at startup.
653 bool cold_start;
654
655 // If true, device headers indicate previous shutdown was not clean.
656 bool dirty_restart;
657
658 // Flag for ticker during initial loading of records from device.
659 bool loading_records;
660
661 // For cold start eviction.
662 cf_mutex cold_start_evict_lock;
663 uint32_t cold_start_record_add_count;
664 uint32_t cold_start_now;
665
666 // For sanity checking at startup (also used during warm or cool restart).
667 uint32_t startup_max_void_time;
668
669 //--------------------------------------------
670 // Memory management.
671 //
672
673 // JEMalloc arena to be used for long-term storage in this namespace (-1 if nonexistent.)
674 int jem_arena;
675
676 // Cached partition ownership info for clients.
677 client_replica_map* replica_maps;
678
679 // Common partition tree information. Contains two configuration items.
680 as_index_tree_shared tree_shared;
681
682 //--------------------------------------------
683 // Storage management.
684 //
685
686 // This is typecast to (drv_ssds*) in storage code.
687 void* storage_private;
688
689 uint64_t ssd_size; // discovered (and rounded) size of drive
690 int storage_last_avail_pct; // most recently calculated available percent
691 int storage_max_write_q; // storage_max_write_cache is converted to this
692 uint32_t saved_defrag_sleep; // restore after defrag at startup is done
693 uint32_t defrag_lwm_size; // storage_defrag_lwm_pct % of storage_write_block_size
694
695 // For data-not-in-memory, we optionally cache swbs after writing to device.
696 // To track fraction of reads from cache:
697 cf_atomic32 n_reads_from_cache;
698 cf_atomic32 n_reads_from_device;
699
700 uint8_t storage_encryption_key[64];
701
702 //--------------------------------------------
703 // Eviction.
704 //
705
706 uint32_t smd_evict_void_time;
707 uint32_t evict_void_time;
708
709 //--------------------------------------------
710 // Truncate records.
711 //
712
713 as_truncate truncate;
714
715 //--------------------------------------------
716 // Secondary index.
717 //
718
719 int sindex_cnt;
720 uint32_t n_setless_sindexes;
721 struct as_sindex_s* sindex; // array with AS_MAX_SINDEX metadata
722 cf_shash* sindex_set_binid_hash;
723 cf_shash* sindex_iname_hash;
724 uint32_t binid_has_sindex[AS_BINID_HAS_SINDEX_SIZE];
725
726 //--------------------------------------------
727 // Configuration.
728 //
729
730 uint32_t cfg_replication_factor;
731 uint32_t replication_factor; // indirect config - can become less than cfg_replication_factor
732 uint64_t memory_size;
733 uint32_t default_ttl;
734
735 bool enable_xdr;
736 bool sets_enable_xdr; // namespace-level flag to enable set-based xdr shipping
737 bool ns_forward_xdr_writes; // namespace-level flag to enable forwarding of xdr writes
738 bool ns_allow_nonxdr_writes; // namespace-level flag to allow nonxdr writes or not
739 bool ns_allow_xdr_writes; // namespace-level flag to allow xdr writes or not
740
741 uint32_t background_scan_max_rps;
742 conflict_resolution_pol conflict_resolution_policy;
743 bool cp; // relevant only for enterprise edition
744 bool cp_allow_drops; // relevant only for enterprise edition
745 bool data_in_index; // with single-bin, allows warm restart for data-in-memory (with storage-engine device)
746 bool cold_start_eviction_disabled;
747 bool write_dup_res_disabled;
748 bool disallow_null_setname;
749 bool batch_sub_benchmarks_enabled;
750 bool ops_sub_benchmarks_enabled;
751 bool read_benchmarks_enabled;
752 bool udf_benchmarks_enabled;
753 bool udf_sub_benchmarks_enabled;
754 bool write_benchmarks_enabled;
755 bool proxy_hist_enabled;
756 uint32_t evict_hist_buckets;
757 uint32_t evict_tenths_pct;
758 uint32_t hwm_disk_pct;
759 uint32_t hwm_memory_pct;
760 uint64_t index_stage_size;
761 uint32_t migrate_order;
762 uint32_t migrate_retransmit_ms;
763 uint32_t migrate_sleep;
764 uint32_t nsup_hist_period;
765 uint32_t nsup_period;
766 uint32_t n_nsup_threads;
767 bool cfg_prefer_uniform_balance; // relevant only for enterprise edition
768 bool prefer_uniform_balance; // indirect config - can become disabled if any other node reports disabled
769 uint32_t rack_id;
770 as_read_consistency_level read_consistency_level;
771 bool single_bin; // restrict the namespace to objects with exactly one bin
772 uint32_t n_single_scan_threads;
773 uint32_t stop_writes_pct;
774 uint32_t tomb_raider_eligible_age; // relevant only for enterprise edition
775 uint32_t tomb_raider_period; // relevant only for enterprise edition
776 uint32_t transaction_pending_limit; // 0 means no limit
777 uint32_t n_truncate_threads;
778 as_write_commit_level write_commit_level;
779 cf_vector xdr_dclist_v;
780
781 const char* xmem_mounts[CF_XMEM_MAX_MOUNTS];
782 uint32_t n_xmem_mounts; // indirect config
783 uint32_t mounts_hwm_pct;
784 uint64_t mounts_size_limit;
785
786 as_storage_type storage_type;
787
788 const char* storage_devices[AS_STORAGE_MAX_DEVICES];
789 uint32_t n_storage_devices; // indirect config - if devices array contains raw devices (or partitions)
790 uint32_t n_storage_files; // indirect config - if devices array contains files
791 const char* storage_shadows[AS_STORAGE_MAX_DEVICES];
792 uint32_t n_storage_shadows; // indirect config
793 uint64_t storage_filesize;
794 char* storage_scheduler_mode; // relevant for devices only, not files
795 uint32_t storage_write_block_size;
796 bool storage_data_in_memory;
797
798 bool storage_cold_start_empty;
799 bool storage_commit_to_device; // relevant only for enterprise edition
800 uint32_t storage_commit_min_size; // relevant only for enterprise edition
801 as_compression_method storage_compression; // relevant only for enterprise edition
802 uint32_t storage_compression_level; // relevant only for enterprise edition
803 uint32_t storage_defrag_lwm_pct;
804 uint32_t storage_defrag_queue_min;
805 uint32_t storage_defrag_sleep;
806 int storage_defrag_startup_minimum;
807 bool storage_direct_files;
808 bool storage_disable_odsync;
809 bool storage_benchmarks_enabled; // histograms are per-drive except device-read-size & device-write-size
810 as_encryption_method storage_encryption; // relevant only for enterprise edition
811 char* storage_encryption_key_file; // relevant only for enterprise edition
812 uint64_t storage_flush_max_us;
813 uint64_t storage_max_write_cache;
814 uint32_t storage_min_avail_pct;
815 cf_atomic32 storage_post_write_queue; // number of swbs/device held after writing to device
816 bool storage_read_page_cache;
817 bool storage_serialize_tomb_raider; // relevant only for enterprise edition
818 uint32_t storage_tomb_raider_sleep; // relevant only for enterprise edition
819
820 uint32_t sindex_num_partitions;
821
822 bool geo2dsphere_within_strict;
823 uint16_t geo2dsphere_within_min_level;
824 uint16_t geo2dsphere_within_max_level;
825 uint16_t geo2dsphere_within_max_cells;
826 uint16_t geo2dsphere_within_level_mod;
827 uint32_t geo2dsphere_within_earth_radius_meters;
828
829 //--------------------------------------------
830 // Statistics and histograms.
831 //
832
833 // Object counts.
834
835 cf_atomic64 n_objects;
836 cf_atomic64 n_tombstones; // relevant only for enterprise edition
837
838 // Consistency info.
839
840 uint32_t n_dead_partitions;
841 uint32_t n_unavailable_partitions;
842 bool clock_skew_stop_writes;
843
844 // Expiration & eviction (nsup) stats.
845
846 bool stop_writes;
847 bool hwm_breached;
848
849 uint64_t non_expirable_objects;
850
851 uint64_t n_expired_objects;
852 uint64_t n_evicted_objects;
853
854 int32_t evict_ttl; // signed - possible (but weird) it's negative
855
856 uint32_t nsup_cycle_duration; // seconds taken for most recent nsup cycle
857
858 // Memory usage stats.
859
860 cf_atomic_int n_bytes_memory;
861 cf_atomic64 n_bytes_sindex_memory;
862
863 // Persistent storage stats.
864
865 double comp_avg_orig_sz; // relevant only for enterprise edition
866 double comp_avg_comp_sz; // relevant only for enterprise edition
867 float cache_read_pct;
868
869 // Migration stats.
870
871 cf_atomic_int migrate_tx_partitions_imbalance; // debug only
872 cf_atomic_int migrate_tx_instance_count; // debug only
873 cf_atomic_int migrate_rx_instance_count; // debug only
874 cf_atomic_int migrate_tx_partitions_active;
875 cf_atomic_int migrate_rx_partitions_active;
876 cf_atomic_int migrate_tx_partitions_initial;
877 cf_atomic_int migrate_tx_partitions_remaining;
878 cf_atomic_int migrate_tx_partitions_lead_remaining;
879 cf_atomic_int migrate_rx_partitions_initial;
880 cf_atomic_int migrate_rx_partitions_remaining;
881 cf_atomic_int migrate_signals_active;
882 cf_atomic_int migrate_signals_remaining;
883 cf_atomic_int appeals_tx_active; // relevant only for enterprise edition
884 cf_atomic_int appeals_rx_active; // relevant only for enterprise edition
885 cf_atomic_int appeals_tx_remaining; // relevant only for enterprise edition
886
887 // Per-record migration stats:
888 cf_atomic_int migrate_records_skipped; // relevant only for enterprise edition
889 cf_atomic_int migrate_records_transmitted;
890 cf_atomic_int migrate_record_retransmits;
891 cf_atomic_int migrate_record_receives;
892 cf_atomic_int appeals_records_exonerated; // relevant only for enterprise edition
893
894 // From-client transaction stats.
895
896 cf_atomic64 n_client_tsvc_error;
897 cf_atomic64 n_client_tsvc_timeout;
898
899 cf_atomic64 n_client_proxy_complete;
900 cf_atomic64 n_client_proxy_error;
901 cf_atomic64 n_client_proxy_timeout;
902
903 cf_atomic64 n_client_read_success;
904 cf_atomic64 n_client_read_error;
905 cf_atomic64 n_client_read_timeout;
906 cf_atomic64 n_client_read_not_found;
907 cf_atomic64 n_client_read_filtered_out;
908
909 cf_atomic64 n_client_write_success;
910 cf_atomic64 n_client_write_error;
911 cf_atomic64 n_client_write_timeout;
912 cf_atomic64 n_client_write_filtered_out;
913
914 // Subset of n_client_write_... above, respectively.
915 cf_atomic64 n_xdr_client_write_success;
916 cf_atomic64 n_xdr_client_write_error;
917 cf_atomic64 n_xdr_client_write_timeout;
918
919 cf_atomic64 n_client_delete_success;
920 cf_atomic64 n_client_delete_error;
921 cf_atomic64 n_client_delete_timeout;
922 cf_atomic64 n_client_delete_not_found;
923 cf_atomic64 n_client_delete_filtered_out;
924
925 // Subset of n_client_delete_... above, respectively.
926 cf_atomic64 n_xdr_client_delete_success;
927 cf_atomic64 n_xdr_client_delete_error;
928 cf_atomic64 n_xdr_client_delete_timeout;
929 cf_atomic64 n_xdr_client_delete_not_found;
930
931 cf_atomic64 n_client_udf_complete;
932 cf_atomic64 n_client_udf_error;
933 cf_atomic64 n_client_udf_timeout;
934 cf_atomic64 n_client_udf_filtered_out;
935
936 cf_atomic64 n_client_lang_read_success;
937 cf_atomic64 n_client_lang_write_success;
938 cf_atomic64 n_client_lang_delete_success;
939 cf_atomic64 n_client_lang_error;
940
941 // From-proxy transaction stats.
942
943 cf_atomic64 n_from_proxy_tsvc_error;
944 cf_atomic64 n_from_proxy_tsvc_timeout;
945
946 cf_atomic64 n_from_proxy_read_success;
947 cf_atomic64 n_from_proxy_read_error;
948 cf_atomic64 n_from_proxy_read_timeout;
949 cf_atomic64 n_from_proxy_read_not_found;
950 cf_atomic64 n_from_proxy_read_filtered_out;
951
952 cf_atomic64 n_from_proxy_write_success;
953 cf_atomic64 n_from_proxy_write_error;
954 cf_atomic64 n_from_proxy_write_timeout;
955 cf_atomic64 n_from_proxy_write_filtered_out;
956
957 // Subset of n_from_proxy_write_... above, respectively.
958 cf_atomic64 n_xdr_from_proxy_write_success;
959 cf_atomic64 n_xdr_from_proxy_write_error;
960 cf_atomic64 n_xdr_from_proxy_write_timeout;
961
962 cf_atomic64 n_from_proxy_delete_success;
963 cf_atomic64 n_from_proxy_delete_error;
964 cf_atomic64 n_from_proxy_delete_timeout;
965 cf_atomic64 n_from_proxy_delete_not_found;
966 cf_atomic64 n_from_proxy_delete_filtered_out;
967
968 // Subset of n_from_proxy_delete_... above, respectively.
969 cf_atomic64 n_xdr_from_proxy_delete_success;
970 cf_atomic64 n_xdr_from_proxy_delete_error;
971 cf_atomic64 n_xdr_from_proxy_delete_timeout;
972 cf_atomic64 n_xdr_from_proxy_delete_not_found;
973
974 cf_atomic64 n_from_proxy_udf_complete;
975 cf_atomic64 n_from_proxy_udf_error;
976 cf_atomic64 n_from_proxy_udf_timeout;
977 cf_atomic64 n_from_proxy_udf_filtered_out;
978
979 cf_atomic64 n_from_proxy_lang_read_success;
980 cf_atomic64 n_from_proxy_lang_write_success;
981 cf_atomic64 n_from_proxy_lang_delete_success;
982 cf_atomic64 n_from_proxy_lang_error;
983
984 // Batch sub-transaction stats.
985
986 cf_atomic64 n_batch_sub_tsvc_error;
987 cf_atomic64 n_batch_sub_tsvc_timeout;
988
989 cf_atomic64 n_batch_sub_proxy_complete;
990 cf_atomic64 n_batch_sub_proxy_error;
991 cf_atomic64 n_batch_sub_proxy_timeout;
992
993 cf_atomic64 n_batch_sub_read_success;
994 cf_atomic64 n_batch_sub_read_error;
995 cf_atomic64 n_batch_sub_read_timeout;
996 cf_atomic64 n_batch_sub_read_not_found;
997 cf_atomic64 n_batch_sub_read_filtered_out;
998
999 // From-proxy batch sub-transaction stats.
1000
1001 cf_atomic64 n_from_proxy_batch_sub_tsvc_error;
1002 cf_atomic64 n_from_proxy_batch_sub_tsvc_timeout;
1003
1004 cf_atomic64 n_from_proxy_batch_sub_read_success;
1005 cf_atomic64 n_from_proxy_batch_sub_read_error;
1006 cf_atomic64 n_from_proxy_batch_sub_read_timeout;
1007 cf_atomic64 n_from_proxy_batch_sub_read_not_found;
1008 cf_atomic64 n_from_proxy_batch_sub_read_filtered_out;
1009
1010 // Internal-UDF sub-transaction stats.
1011
1012 cf_atomic64 n_udf_sub_tsvc_error;
1013 cf_atomic64 n_udf_sub_tsvc_timeout;
1014
1015 cf_atomic64 n_udf_sub_udf_complete;
1016 cf_atomic64 n_udf_sub_udf_error;
1017 cf_atomic64 n_udf_sub_udf_timeout;
1018 uint64_t n_udf_sub_udf_filtered_out;
1019
1020 cf_atomic64 n_udf_sub_lang_read_success;
1021 cf_atomic64 n_udf_sub_lang_write_success;
1022 cf_atomic64 n_udf_sub_lang_delete_success;
1023 cf_atomic64 n_udf_sub_lang_error;
1024
1025 // Internal-ops sub-transaction stats.
1026
1027 cf_atomic64 n_ops_sub_tsvc_error;
1028 cf_atomic64 n_ops_sub_tsvc_timeout;
1029
1030 cf_atomic64 n_ops_sub_write_success;
1031 cf_atomic64 n_ops_sub_write_error;
1032 cf_atomic64 n_ops_sub_write_timeout;
1033 uint64_t n_ops_sub_write_filtered_out;
1034
1035 // Transaction retransmit stats - 'all' means both client & proxy origins.
1036
1037 uint64_t n_retransmit_all_read_dup_res;
1038
1039 uint64_t n_retransmit_all_write_dup_res;
1040 uint64_t n_retransmit_all_write_repl_write;
1041
1042 uint64_t n_retransmit_all_delete_dup_res;
1043 uint64_t n_retransmit_all_delete_repl_write;
1044
1045 uint64_t n_retransmit_all_udf_dup_res;
1046 uint64_t n_retransmit_all_udf_repl_write;
1047
1048 uint64_t n_retransmit_all_batch_sub_dup_res;
1049
1050 uint64_t n_retransmit_udf_sub_dup_res;
1051 uint64_t n_retransmit_udf_sub_repl_write;
1052
1053 uint64_t n_retransmit_ops_sub_dup_res;
1054 uint64_t n_retransmit_ops_sub_repl_write;
1055
1056 // Scan stats.
1057
1058 uint64_t n_scan_basic_complete;
1059 uint64_t n_scan_basic_error;
1060 uint64_t n_scan_basic_abort;
1061
1062 uint64_t n_scan_aggr_complete;
1063 uint64_t n_scan_aggr_error;
1064 uint64_t n_scan_aggr_abort;
1065
1066 uint64_t n_scan_udf_bg_complete;
1067 uint64_t n_scan_udf_bg_error;
1068 uint64_t n_scan_udf_bg_abort;
1069
1070 uint64_t n_scan_ops_bg_complete;
1071 uint64_t n_scan_ops_bg_error;
1072 uint64_t n_scan_ops_bg_abort;
1073
1074 // Query stats.
1075
1076 cf_atomic64 query_reqs;
1077 cf_atomic64 query_fail;
1078 cf_atomic64 query_short_queue_full;
1079 cf_atomic64 query_long_queue_full;
1080 cf_atomic64 query_short_reqs;
1081 cf_atomic64 query_long_reqs;
1082
1083 cf_atomic64 n_lookup;
1084 cf_atomic64 n_lookup_success;
1085 cf_atomic64 n_lookup_abort;
1086 cf_atomic64 n_lookup_errs;
1087 cf_atomic64 lookup_response_size;
1088 cf_atomic64 lookup_num_records;
1089
1090 cf_atomic64 n_aggregation;
1091 cf_atomic64 n_agg_success;
1092 cf_atomic64 n_agg_abort;
1093 cf_atomic64 n_agg_errs;
1094 cf_atomic64 agg_response_size;
1095 cf_atomic64 agg_num_records;
1096
1097 cf_atomic64 n_query_udf_bg_success;
1098 cf_atomic64 n_query_udf_bg_failure;
1099
1100 cf_atomic64 n_query_ops_bg_success;
1101 cf_atomic64 n_query_ops_bg_failure;
1102
1103 // Geospatial query stats:
1104 cf_atomic64 geo_region_query_count; // number of region queries
1105 cf_atomic64 geo_region_query_cells; // number of cells used by region queries
1106 cf_atomic64 geo_region_query_points; // number of valid points found
1107 cf_atomic64 geo_region_query_falsepos; // number of false positives found
1108
1109 // Re-replication stats - relevant only for enterprise edition.
1110
1111 cf_atomic64 n_re_repl_success;
1112 cf_atomic64 n_re_repl_error;
1113 cf_atomic64 n_re_repl_timeout;
1114
1115 // Special errors that deserve their own counters:
1116
1117 cf_atomic64 n_fail_xdr_forbidden;
1118 cf_atomic64 n_fail_key_busy;
1119 cf_atomic64 n_fail_generation;
1120 cf_atomic64 n_fail_record_too_big;
1121
1122 // Special non-error counters:
1123
1124 cf_atomic64 n_deleted_last_bin;
1125
1126 // One-way automatically activated histograms.
1127
1128 cf_hist_track* read_hist;
1129 cf_hist_track* write_hist;
1130 cf_hist_track* udf_hist;
1131 cf_hist_track* query_hist;
1132 histogram* query_rec_count_hist;
1133 histogram* re_repl_hist; // relevant only for enterprise edition
1134
1135 bool read_hist_active;
1136 bool write_hist_active;
1137 bool udf_hist_active;
1138 bool query_hist_active;
1139 bool query_rec_count_hist_active;
1140 bool re_repl_hist_active; // relevant only for enterprise edition
1141
1142 // Activate-by-config histograms.
1143
1144 histogram* proxy_hist;
1145
1146 histogram* read_start_hist;
1147 histogram* read_restart_hist;
1148 histogram* read_dup_res_hist;
1149 histogram* read_repl_ping_hist;
1150 histogram* read_local_hist;
1151 histogram* read_response_hist;
1152
1153 histogram* write_start_hist;
1154 histogram* write_restart_hist;
1155 histogram* write_dup_res_hist;
1156 histogram* write_master_hist; // split this?
1157 histogram* write_repl_write_hist;
1158 histogram* write_response_hist;
1159
1160 histogram* udf_start_hist;
1161 histogram* udf_restart_hist;
1162 histogram* udf_dup_res_hist;
1163 histogram* udf_master_hist; // split this?
1164 histogram* udf_repl_write_hist;
1165 histogram* udf_response_hist;
1166
1167 histogram* batch_sub_start_hist;
1168 histogram* batch_sub_restart_hist;
1169 histogram* batch_sub_dup_res_hist;
1170 histogram* batch_sub_repl_ping_hist;
1171 histogram* batch_sub_read_local_hist;
1172 histogram* batch_sub_response_hist;
1173
1174 histogram* udf_sub_start_hist;
1175 histogram* udf_sub_restart_hist;
1176 histogram* udf_sub_dup_res_hist;
1177 histogram* udf_sub_master_hist; // split this?
1178 histogram* udf_sub_repl_write_hist;
1179 histogram* udf_sub_response_hist;
1180
1181 histogram* ops_sub_start_hist;
1182 histogram* ops_sub_restart_hist;
1183 histogram* ops_sub_dup_res_hist;
1184 histogram* ops_sub_master_hist; // split this?
1185 histogram* ops_sub_repl_write_hist;
1186 histogram* ops_sub_response_hist;
1187
1188 histogram* device_read_size_hist;
1189 histogram* device_write_size_hist;
1190
1191 // Histograms of object storage sizes. (Meaningful for drive-backed
1192 // namespaces only.)
1193 histogram* obj_size_log_hist;
1194 histogram* set_obj_size_log_hists[AS_SET_MAX_COUNT + 1];
1195 linear_hist* obj_size_lin_hist;
1196 linear_hist* set_obj_size_lin_hists[AS_SET_MAX_COUNT + 1];
1197
1198 // Histograms used for general eviction and expiration.
1199 linear_hist* evict_hist; // not just for info
1200 linear_hist* ttl_hist;
1201 linear_hist* set_ttl_hists[AS_SET_MAX_COUNT + 1];
1202
1203 //--------------------------------------------
1204 // Information for rebalancing.
1205 //
1206
1207 uint32_t cluster_size;
1208 cf_node succession[AS_CLUSTER_SZ];
1209 as_partition_version cluster_versions[AS_CLUSTER_SZ][AS_PARTITIONS];
1210 uint32_t rack_ids[AS_CLUSTER_SZ]; // is observed-rack-ids in CP mode
1211
1212 // Quiescence - relevant only for enterprise edition.
1213 uint32_t active_size;
1214 bool pending_quiesce;
1215 bool is_quiesced;
1216 bool quiesced[AS_CLUSTER_SZ];
1217
1218 // Observed nodes - relevant only for enterprise edition.
1219 uint32_t observed_cluster_size;
1220 cf_node observed_succession[AS_CLUSTER_SZ];
1221
1222 // Roster management - relevant only for enterprise edition.
1223 uint32_t smd_roster_generation;
1224 uint32_t smd_roster_count;
1225 cf_node smd_roster[AS_CLUSTER_SZ];
1226 uint32_t smd_roster_rack_ids[AS_CLUSTER_SZ];
1227 uint32_t roster_generation;
1228 uint32_t roster_count;
1229 cf_node roster[AS_CLUSTER_SZ];
1230 uint32_t roster_rack_ids[AS_CLUSTER_SZ];
1231
1232 // Master regimes - relevant only for enterprise edition.
1233 uint32_t eventual_regime;
1234 uint32_t rebalance_regime;
1235 uint32_t rebalance_regimes[AS_CLUSTER_SZ];
1236};
1237
// Size in bytes of a set-name buffer - includes space for null-terminator.
#define AS_SET_NAME_MAX_SIZE 64

#define INVALID_SET_ID 0

// Accessors for the disable_eviction flag of the set pointed to by p_set.
//
// Both macro arguments are parenthesized in the expansion, so any pointer
// expression (e.g. &set, base + i) and any boolean expression (including
// assignments or conditionals) can be passed without precedence surprises.
//
// IS_SET_EVICTION_DISABLED() - true if the flag's value is 1.
// DISABLE_SET_EVICTION()     - stores 1 if on_off is non-zero, otherwise 0.
#define IS_SET_EVICTION_DISABLED(p_set) (cf_atomic32_get((p_set)->disable_eviction) == 1)
#define DISABLE_SET_EVICTION(p_set, on_off) (cf_atomic32_set(&(p_set)->disable_eviction, (on_off) ? 1 : 0))
1244
// Values for a set's enable_xdr setting. The numeric values are spelled out
// explicitly so they stay fixed if constants are ever added or reordered.
typedef enum {
	// Neither of the two explicit settings below.
	AS_SET_ENABLE_XDR_DEFAULT = 0,

	// White-list the set for XDR replication.
	AS_SET_ENABLE_XDR_TRUE = 1,

	// Black-list the set for XDR replication.
	AS_SET_ENABLE_XDR_FALSE = 2
} as_set_enable_xdr_flag;
1250
1251// Caution - changing this struct could break warm or cool restart.
1252struct as_set_s {
1253 char name[AS_SET_NAME_MAX_SIZE];
1254 cf_atomic64 n_objects;
1255 cf_atomic64 n_tombstones; // relevant only for enterprise edition
1256 cf_atomic64 n_bytes_memory; // for data-in-memory only - sets's total record data size
1257 cf_atomic64 stop_writes_count; // restrict number of records in a set
1258 uint64_t truncate_lut; // records with last-update-time less than this are truncated
1259 cf_atomic32 disable_eviction; // don't evict anything in this set (note - expiration still works)
1260 cf_atomic32 enable_xdr; // white-list (AS_SET_ENABLE_XDR_TRUE) or black-list (AS_SET_ENABLE_XDR_FALSE) a set for XDR replication
1261 uint32_t n_sindexes;
1262 uint8_t padding[12];
1263};
1264
1265static inline bool
1266as_set_stop_writes(as_set *p_set) {
1267 uint64_t n_objects = cf_atomic64_get(p_set->n_objects);
1268 uint64_t stop_writes_count = cf_atomic64_get(p_set->stop_writes_count);
1269
1270 return stop_writes_count != 0 && n_objects >= stop_writes_count;
1271}
1272
// These bin functions must be below the definition of struct as_namespace_s:
1274
1275static inline bool
1276as_bin_set_id_from_name_w_len(as_namespace *ns, as_bin *b, const uint8_t *buf,
1277 size_t len) {
1278 return as_bin_get_or_assign_id_w_len(ns, (const char *)buf, len, &b->id);
1279}
1280
1281static inline size_t
1282as_bin_memcpy_name(as_namespace *ns, uint8_t *buf, as_bin *b) {
1283 size_t len = 0;
1284
1285 if (! ns->single_bin) {
1286 const char *name = as_bin_get_name_from_id(ns, b->id);
1287
1288 len = strlen(name);
1289 memcpy(buf, name, len);
1290 }
1291
1292 return len;
1293}
1294
1295// forward ref
1296struct as_msg_field_s;
1297
1298/* Namespace function declarations */
1299extern as_namespace *as_namespace_create(char *name);
1300extern void as_namespaces_init(bool cold_start_cmd, uint32_t instance);
1301extern void as_namespaces_setup(bool cold_start_cmd, uint32_t instance);
1302extern void as_namespace_finish_setup(as_namespace *ns, uint32_t instance);
1303extern bool as_namespace_configure_sets(as_namespace *ns);
1304extern as_namespace *as_namespace_get_byname(char *name);
1305extern as_namespace *as_namespace_get_byid(uint32_t id);
1306extern as_namespace *as_namespace_get_bybuf(uint8_t *name, size_t len);
1307extern as_namespace *as_namespace_get_bymsgfield(struct as_msg_field_s *fp);
1308extern const char *as_namespace_get_set_name(as_namespace *ns, uint16_t set_id);
1309extern uint16_t as_namespace_get_set_id(as_namespace *ns, const char *set_name);
1310extern uint16_t as_namespace_get_create_set_id(as_namespace *ns, const char *set_name);
1311extern int as_namespace_set_set_w_len(as_namespace *ns, const char *set_name, size_t len, uint16_t *p_set_id, bool apply_restrictions);
1312extern int as_namespace_get_create_set_w_len(as_namespace *ns, const char *set_name, size_t len, as_set **pp_set, uint16_t *p_set_id);
1313extern as_set *as_namespace_get_set_by_name(as_namespace *ns, const char *set_name);
1314extern as_set* as_namespace_get_set_by_id(as_namespace* ns, uint16_t set_id);
1315extern as_set* as_namespace_get_record_set(as_namespace *ns, const as_record *r);
1316extern void as_namespace_get_set_info(as_namespace *ns, const char *set_name, cf_dyn_buf *db);
1317extern void as_namespace_adjust_set_memory(as_namespace *ns, uint16_t set_id, int64_t delta_bytes);
1318extern void as_namespace_release_set_id(as_namespace *ns, uint16_t set_id);
1319extern void as_namespace_get_bins_info(as_namespace *ns, cf_dyn_buf *db, bool show_ns);
1320extern void as_namespace_get_hist_info(as_namespace *ns, char *set_name, char *hist_name, cf_dyn_buf *db);
1321
1322static inline bool
1323as_namespace_cool_restarts(const as_namespace *ns)
1324{
1325 return ns->storage_data_in_memory && ! ns->data_in_index;
1326}
1327
1328static inline uint32_t
1329as_namespace_device_count(const as_namespace *ns)
1330{
1331 // Only one of them will ever be non-zero.
1332 return ns->n_storage_devices + ns->n_storage_files;
1333}
1334
1335static inline const char*
1336as_namespace_start_mode_str(const as_namespace *ns)
1337{
1338 return as_namespace_cool_restarts(ns) ? "cool" : "warm";
1339}
1340
1341static inline bool
1342as_namespace_index_persisted(const as_namespace *ns)
1343{
1344 return ns->xmem_type == CF_XMEM_TYPE_PMEM ||
1345 ns->xmem_type == CF_XMEM_TYPE_FLASH;
1346}
1347
1348// Persistent Memory Management
1349void as_namespace_xmem_shutdown(as_namespace *ns, uint32_t instance);
1350