| 1 | /* |
| 2 | * datamodel.h |
| 3 | * |
| 4 | * Copyright (C) 2008-2016 Aerospike, Inc. |
| 5 | * |
| 6 | * Portions may be licensed to Aerospike, Inc. under one or more contributor |
| 7 | * license agreements. |
| 8 | * |
| 9 | * This program is free software: you can redistribute it and/or modify it under |
| 10 | * the terms of the GNU Affero General Public License as published by the Free |
| 11 | * Software Foundation, either version 3 of the License, or (at your option) any |
| 12 | * later version. |
| 13 | * |
| 14 | * This program is distributed in the hope that it will be useful, but WITHOUT |
| 15 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| 16 | * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
| 17 | * details. |
| 18 | * |
| 19 | * You should have received a copy of the GNU Affero General Public License |
| 20 | * along with this program. If not, see http://www.gnu.org/licenses/ |
| 21 | */ |
| 22 | |
| 23 | /* |
| 24 | * core data model structures and definitions |
| 25 | */ |
| 26 | |
| 27 | #pragma once |
| 28 | |
| 29 | #include <limits.h> |
| 30 | #include <stdbool.h> |
| 31 | #include <stddef.h> |
| 32 | #include <stdint.h> |
| 33 | #include <string.h> |
| 34 | |
| 35 | #include "aerospike/as_val.h" |
| 36 | #include "citrusleaf/cf_atomic.h" |
| 37 | #include "citrusleaf/cf_clock.h" |
| 38 | #include "citrusleaf/cf_digest.h" |
| 39 | |
| 40 | #include "arenax.h" |
| 41 | #include "cf_mutex.h" |
| 42 | #include "dynbuf.h" |
| 43 | #include "hist.h" |
| 44 | #include "hist_track.h" |
| 45 | #include "linear_hist.h" |
| 46 | #include "msg.h" |
| 47 | #include "node.h" |
| 48 | #include "shash.h" |
| 49 | #include "vmapx.h" |
| 50 | #include "xmem.h" |
| 51 | |
| 52 | #include "base/cfg.h" |
| 53 | #include "base/proto.h" |
| 54 | #include "base/transaction_policy.h" |
| 55 | #include "base/truncate.h" |
| 56 | #include "fabric/hb.h" |
| 57 | #include "fabric/partition.h" |
| 58 | #include "storage/flat.h" |
| 59 | #include "storage/storage.h" |
| 60 | |
| 61 | |
| 62 | #define OBJ_SIZE_HIST_NUM_BUCKETS 1024 |
| 63 | #define TTL_HIST_NUM_BUCKETS 100 |
| 64 | |
| 65 | #define MAX_ALLOWED_TTL (3600 * 24 * 365 * 10) // 10 years |
| 66 | |
| 67 | // [0-1] for partition-id |
| 68 | // [1-4] for tree sprigs and locks |
| 69 | // [5-7] unused |
| 70 | // [8-11] for SSD device hash |
| 71 | #define DIGEST_STORAGE_BASE_BYTE 8 |
| 72 | // [12-15] for rw_request hash |
| 73 | #define DIGEST_HASH_BASE_BYTE 12 |
| 74 | // [16-19] for pred-exp filter |
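// Illustrative sketch - extracting the 4 bytes reserved for the rw_request
// hash from a digest 'keyd', assuming cf_digest exposes its bytes as the
// 'digest' array:
//
//   uint32_t rw_hash_bits;
//   memcpy(&rw_hash_bits, &keyd->digest[DIGEST_HASH_BASE_BYTE],
//           sizeof(rw_hash_bits));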
| 75 | |
| 76 | |
| 77 | /* Forward declarations */ |
| 78 | typedef struct as_namespace_s as_namespace; |
| 79 | typedef struct as_index_s as_record; |
| 80 | typedef struct as_bin_s as_bin; |
| 81 | typedef struct as_index_ref_s as_index_ref; |
| 82 | typedef struct as_set_s as_set; |
| 83 | |
| 84 | struct as_index_tree_s; |
| 85 | |
| 86 | |
| 87 | #define AS_ID_NAMESPACE_SZ 32 |
| 88 | |
| 89 | #define AS_ID_INAME_SZ 256 |
| 90 | |
| 91 | #define AS_BIN_NAME_MAX_SZ 16 // changing this breaks warm restart |
| 92 | #define MAX_BIN_NAMES 0x10000 // no need for more - numeric ID is 16 bits |
| 93 | #define BIN_NAMES_QUOTA (MAX_BIN_NAMES / 2) // don't add more names than this via client transactions |
| 94 | |
| 95 | /* |
| 96 | * Compare two 16-bit generation counts, allowing wrap-arounds. |
 * Works correctly if:
 *
 * - rhs is ahead of lhs, but by no more than 32,768.
 * - lhs is ahead of rhs, but by no more than 32,767.
| 101 | */ |
| 102 | |
| 103 | static inline bool |
| 104 | as_gen_less_than(uint16_t lhs, uint16_t rhs) |
| 105 | { |
| 106 | return (uint16_t)(lhs - rhs) >= 32768; |
| 107 | } |
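// Illustrative examples of the wrap-around behavior:
//
//   as_gen_less_than(1, 2);      // true  - rhs one ahead
//   as_gen_less_than(65535, 0);  // true  - rhs one ahead, across the wrap
//   as_gen_less_than(2, 1);      // false - lhs ahead
//   as_gen_less_than(7, 7);      // false - equal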
| 108 | |
| 109 | |
| 110 | /* as_particle_type |
| 111 | * Particles are typed, which reflects their contents: |
| 112 | * NULL: no associated content (not sure I really need this internally?) |
| 113 | * INTEGER: a signed, 64-bit integer |
 * FLOAT: a floating-point value
| 115 | * STRING: a null-terminated UTF-8 string |
| 116 | * BLOB: arbitrary-length binary data |
| 117 | * TIMESTAMP: milliseconds since 1 January 1970, 00:00:00 GMT |
| 118 | * DIGEST: an internal Aerospike key digest */ |
| 119 | typedef enum { |
| 120 | AS_PARTICLE_TYPE_NULL = 0, |
| 121 | AS_PARTICLE_TYPE_INTEGER = 1, |
| 122 | AS_PARTICLE_TYPE_FLOAT = 2, |
| 123 | AS_PARTICLE_TYPE_STRING = 3, |
| 124 | AS_PARTICLE_TYPE_BLOB = 4, |
| 125 | AS_PARTICLE_TYPE_JAVA_BLOB = 7, |
| 126 | AS_PARTICLE_TYPE_CSHARP_BLOB = 8, |
| 127 | AS_PARTICLE_TYPE_PYTHON_BLOB = 9, |
| 128 | AS_PARTICLE_TYPE_RUBY_BLOB = 10, |
| 129 | AS_PARTICLE_TYPE_PHP_BLOB = 11, |
| 130 | AS_PARTICLE_TYPE_ERLANG_BLOB = 12, |
| 131 | AS_PARTICLE_TYPE_MAP = 19, |
| 132 | AS_PARTICLE_TYPE_LIST = 20, |
| 133 | AS_PARTICLE_TYPE_GEOJSON = 23, |
| 134 | AS_PARTICLE_TYPE_MAX = 24, |
| 135 | AS_PARTICLE_TYPE_BAD = AS_PARTICLE_TYPE_MAX |
| 136 | } as_particle_type; |
| 137 | |
/* as_particle
 * The common part of a particle.
 * This is poor man's subclassing - i.e. a subclassed interface in C.
 * See particle.c for the subclass implementations and structures. */
| 142 | typedef struct as_particle_s { |
| 143 | uint8_t metadata; // used by the iparticle for is_integer and inuse, as well as version in multi bin mode only |
| 144 | // used by *particle for type |
| 145 | uint8_t data[0]; |
| 146 | } __attribute__ ((__packed__)) as_particle; |
| 147 | |
| 148 | // Bit Flag constants used for the particle state value (4 bits, 16 values) |
| 149 | #define AS_BIN_STATE_UNUSED 0 |
| 150 | #define AS_BIN_STATE_INUSE_INTEGER 1 |
#define AS_BIN_STATE_RECYCLE_ME 2 // was the 'hidden bin' state
| 152 | #define AS_BIN_STATE_INUSE_OTHER 3 |
| 153 | #define AS_BIN_STATE_INUSE_FLOAT 4 |
| 154 | |
| 155 | typedef struct as_particle_iparticle_s { |
| 156 | uint8_t version: 4; // now unused - and can't be used in single-bin config |
| 157 | uint8_t state: 4; // see AS_BIN_STATE_... |
| 158 | uint8_t data[0]; |
| 159 | } __attribute__ ((__packed__)) as_particle_iparticle; |
| 160 | |
| 161 | /* Particle function declarations */ |
| 162 | |
| 163 | static inline bool |
| 164 | is_embedded_particle_type(as_particle_type type) |
| 165 | { |
| 166 | return type == AS_PARTICLE_TYPE_INTEGER || type == AS_PARTICLE_TYPE_FLOAT; |
| 167 | } |
| 168 | |
| 169 | extern as_particle_type as_particle_type_from_asval(const as_val *val); |
| 170 | extern as_particle_type as_particle_type_from_msgpack(const uint8_t *packed, uint32_t packed_size); |
| 171 | |
| 172 | extern uint32_t as_particle_size_from_asval(const as_val *val); |
| 173 | |
| 174 | extern uint32_t as_particle_asval_client_value_size(const as_val *val); |
| 175 | extern uint32_t as_particle_asval_to_client(const as_val *val, as_msg_op *op); |
| 176 | |
| 177 | extern const uint8_t *as_particle_skip_flat(const uint8_t *flat, const uint8_t *end); |
| 178 | |
| 179 | // as_bin particle function declarations |
| 180 | |
| 181 | extern void as_bin_particle_destroy(as_bin *b, bool free_particle); |
| 182 | extern uint32_t as_bin_particle_size(as_bin *b); |
| 183 | |
| 184 | // wire: |
| 185 | extern int as_bin_particle_alloc_modify_from_client(as_bin *b, const as_msg_op *op); |
| 186 | extern int as_bin_particle_stack_modify_from_client(as_bin *b, cf_ll_buf *particles_llb, const as_msg_op *op); |
| 187 | extern int as_bin_particle_alloc_from_client(as_bin *b, const as_msg_op *op); |
| 188 | extern int as_bin_particle_stack_from_client(as_bin *b, cf_ll_buf *particles_llb, const as_msg_op *op); |
| 189 | extern int as_bin_particle_alloc_from_pickled(as_bin *b, const uint8_t **p_pickled, const uint8_t *end); |
| 190 | extern int as_bin_particle_stack_from_pickled(as_bin *b, cf_ll_buf *particles_llb, const uint8_t **p_pickled, const uint8_t *end); |
| 191 | extern uint32_t as_bin_particle_client_value_size(const as_bin *b); |
| 192 | extern uint32_t as_bin_particle_to_client(const as_bin *b, as_msg_op *op); |
| 193 | extern uint32_t as_bin_particle_pickled_size(const as_bin *b); |
| 194 | extern uint32_t as_bin_particle_to_pickled(const as_bin *b, uint8_t *pickled); |
| 195 | |
| 196 | // Different for blob bitwise operations - we don't use the normal APIs and |
| 197 | // particle table functions. |
| 198 | extern int as_bin_bits_read_from_client(const as_bin *b, as_msg_op *op, as_bin *result); |
| 199 | extern int as_bin_bits_alloc_modify_from_client(as_bin *b, as_msg_op *op); |
| 200 | extern int as_bin_bits_stack_modify_from_client(as_bin *b, cf_ll_buf *particles_llb, as_msg_op *op); |
| 201 | |
| 202 | // Different for CDTs - the operations may return results, so we don't use the |
| 203 | // normal APIs and particle table functions. |
| 204 | extern int as_bin_cdt_read_from_client(const as_bin *b, as_msg_op *op, as_bin *result); |
| 205 | extern int as_bin_cdt_alloc_modify_from_client(as_bin *b, as_msg_op *op, as_bin *result); |
| 206 | extern int as_bin_cdt_stack_modify_from_client(as_bin *b, cf_ll_buf *particles_llb, as_msg_op *op, as_bin *result); |
| 207 | |
| 208 | // as_val: |
| 209 | extern int as_bin_particle_replace_from_asval(as_bin *b, const as_val *val); |
| 210 | extern void as_bin_particle_stack_from_asval(as_bin *b, uint8_t* stack, const as_val *val); |
| 211 | extern as_val *as_bin_particle_to_asval(const as_bin *b); |
| 212 | |
| 213 | // msgpack: |
| 214 | extern int as_bin_particle_alloc_from_msgpack(as_bin *b, const uint8_t *packed, uint32_t packed_size); |
| 215 | |
| 216 | // flat: |
| 217 | extern const uint8_t *as_bin_particle_cast_from_flat(as_bin *b, const uint8_t *flat, const uint8_t *end); |
| 218 | extern const uint8_t *as_bin_particle_replace_from_flat(as_bin *b, const uint8_t *flat, const uint8_t *end); |
| 219 | extern uint32_t as_bin_particle_flat_size(as_bin *b); |
| 220 | extern uint32_t as_bin_particle_to_flat(const as_bin *b, uint8_t *flat); |
| 221 | |
| 222 | // odd as_bin particle functions for specific particle types |
| 223 | |
| 224 | // integer: |
| 225 | extern int64_t as_bin_particle_integer_value(const as_bin *b); |
| 226 | extern void as_bin_particle_integer_set(as_bin *b, int64_t i); |
| 227 | |
| 228 | // string: |
| 229 | extern uint32_t as_bin_particle_string_ptr(const as_bin *b, char **p_value); |
| 230 | |
| 231 | // blob: |
| 232 | extern int as_bin_bits_packed_read(const as_bin *b, const as_msg_op *msg_op, as_bin *result); |
| 233 | extern int as_bin_bits_packed_modify(as_bin *b, const as_msg_op *msg_op, cf_ll_buf *particles_llb); |
| 234 | |
| 235 | // geojson: |
| 236 | typedef void * geo_region_t; |
| 237 | #define MAX_REGION_CELLS 256 |
| 238 | #define MAX_REGION_LEVELS 30 |
| 239 | extern size_t as_bin_particle_geojson_cellids(const as_bin *b, uint64_t **pp_cells); |
| 240 | extern bool as_particle_geojson_match(as_particle *p, uint64_t cellid, geo_region_t region, bool is_strict); |
| 241 | extern bool as_particle_geojson_match_asval(const as_val *val, uint64_t cellid, geo_region_t region, bool is_strict); |
extern const char *as_geojson_mem_jsonstr(const as_particle *p, size_t *p_jsonsz);
| 243 | |
| 244 | // list: |
| 245 | struct cdt_payload_s; |
| 246 | struct rollback_alloc_s; |
| 247 | extern void as_bin_particle_list_get_packed_val(const as_bin *b, struct cdt_payload_s *packed); |
| 248 | |
| 249 | extern int as_bin_cdt_packed_read(const as_bin *b, const as_msg_op *op, as_bin *result); |
| 250 | extern int as_bin_cdt_packed_modify(as_bin *b, const as_msg_op *op, as_bin *result, cf_ll_buf *particles_llb); |
| 251 | |
| 252 | |
| 253 | /* as_bin |
| 254 | * A bin container - null name means unused */ |
| 255 | struct as_bin_s { |
| 256 | as_particle iparticle; // 1 byte |
| 257 | as_particle *particle; // for embedded particle this is value, not pointer |
| 258 | |
| 259 | // Never read or write these bytes in single-bin configuration: |
| 260 | uint16_t id; // ID of bin name |
| 261 | uint8_t unused; // pad to 12 bytes (multiple of 4) - legacy |
| 262 | } __attribute__ ((__packed__)) ; |
| 263 | |
| 264 | // For data-in-memory namespaces in multi-bin mode, we keep an array of as_bin |
| 265 | // structs in memory, accessed via this struct. |
| 266 | typedef struct as_bin_space_s { |
| 267 | uint16_t n_bins; |
| 268 | as_bin bins[0]; |
| 269 | } __attribute__ ((__packed__)) as_bin_space; |
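// Size sketch (illustrative, 64-bit build): with the packed attributes above,
// sizeof(as_bin) is 12 (1 + 8 + 2 + 1) and sizeof(as_bin_space) is 2, so the
// space needed for n_bins bins is:
//
//   size_t sz = sizeof(as_bin_space) + n_bins * sizeof(as_bin);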
| 270 | |
| 271 | // TODO - Do we really need to pad as_bin to 12 bytes for thread safety? |
| 272 | // Do we ever write & read adjacent as_bin structures in a bins array from |
| 273 | // different threads when not under the record lock? And if we're worried about |
| 274 | // 4-byte alignment for that or any other reason, wouldn't we also have to pad |
| 275 | // after n_bins in as_bin_space? |
| 276 | |
| 277 | // For data-in-memory namespaces in multi-bin mode, if we're storing extra |
| 278 | // record metadata, we access it via this struct. In this case the index points |
| 279 | // here instead of directly to an as_bin_space. |
| 280 | typedef struct as_rec_space_s { |
| 281 | as_bin_space* bin_space; |
| 282 | |
| 283 | // So far the key is the only extra record metadata we store in memory. |
| 284 | uint32_t key_size; |
| 285 | uint8_t key[0]; |
| 286 | } __attribute__ ((__packed__)) as_rec_space; |
| 287 | |
| 288 | // For copying as_bin structs without the last 3 bytes. |
| 289 | static inline void |
| 290 | as_single_bin_copy(as_bin *to, const as_bin *from) |
| 291 | { |
| 292 | to->iparticle = from->iparticle; |
| 293 | to->particle = from->particle; |
| 294 | } |
| 295 | |
| 296 | static inline bool |
| 297 | as_bin_inuse(const as_bin *b) |
| 298 | { |
| 299 | return (((as_particle_iparticle *)b)->state); |
| 300 | } |
| 301 | |
| 302 | static inline uint8_t |
| 303 | as_bin_state(const as_bin *b) |
| 304 | { |
| 305 | return ((as_particle_iparticle *)b)->state; |
| 306 | } |
| 307 | |
| 308 | static inline void |
| 309 | as_bin_state_set(as_bin *b, uint8_t val) |
| 310 | { |
| 311 | ((as_particle_iparticle *)b)->state = val; |
| 312 | } |
| 313 | |
| 314 | static inline void |
| 315 | as_bin_state_set_from_type(as_bin *b, as_particle_type type) |
| 316 | { |
| 317 | switch (type) { |
| 318 | case AS_PARTICLE_TYPE_NULL: |
| 319 | ((as_particle_iparticle *)b)->state = AS_BIN_STATE_UNUSED; |
| 320 | break; |
| 321 | case AS_PARTICLE_TYPE_INTEGER: |
| 322 | ((as_particle_iparticle *)b)->state = AS_BIN_STATE_INUSE_INTEGER; |
| 323 | break; |
| 324 | case AS_PARTICLE_TYPE_FLOAT: |
| 325 | ((as_particle_iparticle *)b)->state = AS_BIN_STATE_INUSE_FLOAT; |
| 326 | break; |
| 327 | default: |
| 328 | ((as_particle_iparticle *)b)->state = AS_BIN_STATE_INUSE_OTHER; |
| 329 | break; |
| 330 | } |
| 331 | } |
| 332 | |
| 333 | static inline bool |
| 334 | as_bin_inuse_has(const as_storage_rd *rd) |
| 335 | { |
| 336 | // In-use bins are at the beginning - only need to check the first bin. |
| 337 | return rd->n_bins != 0 && (rd->pickle != NULL || as_bin_inuse(rd->bins)); |
| 338 | } |
| 339 | |
| 340 | static inline uint16_t |
| 341 | as_bin_inuse_count(const as_storage_rd *rd) |
| 342 | { |
| 343 | for (uint16_t i = 0; i < rd->n_bins; i++) { |
| 344 | if (! as_bin_inuse(&rd->bins[i])) { |
| 345 | return i; |
| 346 | } |
| 347 | } |
| 348 | |
| 349 | return rd->n_bins; |
| 350 | } |
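// Example (illustrative): with rd->n_bins == 4 and only the first two bins in
// use, the loop stops at i == 2 and the function returns 2.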
| 351 | |
| 352 | static inline void |
| 353 | as_bin_set_empty(as_bin *b) |
| 354 | { |
| 355 | as_bin_state_set(b, AS_BIN_STATE_UNUSED); |
| 356 | } |
| 357 | |
| 358 | static inline void |
| 359 | as_bin_set_empty_shift(as_storage_rd *rd, uint32_t i) |
| 360 | { |
| 361 | // Shift the bins over, so there's no space between used bins. |
| 362 | // This can overwrite the "emptied" bin, and that's fine. |
| 363 | |
| 364 | uint16_t j; |
| 365 | |
| 366 | for (j = i + 1; j < rd->n_bins; j++) { |
| 367 | if (! as_bin_inuse(&rd->bins[j])) { |
| 368 | break; |
| 369 | } |
| 370 | } |
| 371 | |
| 372 | uint16_t n = j - (i + 1); |
| 373 | |
| 374 | if (n) { |
| 375 | memmove(&rd->bins[i], &rd->bins[i + 1], n * sizeof(as_bin)); |
| 376 | } |
| 377 | |
| 378 | // Mark the last bin that was *formerly* in use as null. |
| 379 | as_bin_set_empty(&rd->bins[j - 1]); |
| 380 | } |
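// Example (illustrative): with in-use bins [A][B][C] followed by an unused
// slot, emptying index 0 finds j == 3, moves B and C down one slot (n == 2),
// then empties index 2 - leaving [B][C][-][-].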
| 381 | |
| 382 | static inline void |
| 383 | as_bin_set_empty_from(as_storage_rd *rd, uint16_t from) { |
| 384 | for (uint16_t i = from; i < rd->n_bins; i++) { |
| 385 | as_bin_set_empty(&rd->bins[i]); |
| 386 | } |
| 387 | } |
| 388 | |
| 389 | static inline void |
| 390 | as_bin_set_all_empty(as_storage_rd *rd) { |
| 391 | as_bin_set_empty_from(rd, 0); |
| 392 | } |
| 393 | |
| 394 | static inline bool |
| 395 | as_bin_is_embedded_particle(const as_bin *b) { |
| 396 | return ((as_particle_iparticle *)b)->state == AS_BIN_STATE_INUSE_INTEGER || |
| 397 | ((as_particle_iparticle *)b)->state == AS_BIN_STATE_INUSE_FLOAT; |
| 398 | } |
| 399 | |
| 400 | static inline bool |
| 401 | as_bin_is_external_particle(const as_bin *b) { |
| 402 | return ((as_particle_iparticle *)b)->state == AS_BIN_STATE_INUSE_OTHER; |
| 403 | } |
| 404 | |
| 405 | static inline as_particle * |
| 406 | as_bin_get_particle(as_bin *b) { |
| 407 | return as_bin_is_embedded_particle(b) ? &b->iparticle : b->particle; |
| 408 | } |
| 409 | |
| 410 | // "Embedded" types like integer are stored directly, but other bin types |
| 411 | // ("other") must follow an indirection to get the actual type. |
| 412 | static inline uint8_t |
| 413 | as_bin_get_particle_type(const as_bin *b) { |
| 414 | switch (((as_particle_iparticle *)b)->state) { |
| 415 | case AS_BIN_STATE_INUSE_INTEGER: |
| 416 | return AS_PARTICLE_TYPE_INTEGER; |
| 417 | case AS_BIN_STATE_INUSE_FLOAT: |
| 418 | return AS_PARTICLE_TYPE_FLOAT; |
| 419 | case AS_BIN_STATE_INUSE_OTHER: |
| 420 | return b->particle->metadata; |
| 421 | default: |
| 422 | return AS_PARTICLE_TYPE_NULL; |
| 423 | } |
| 424 | } |
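// Usage sketch (illustrative): callers typically switch on the returned type to
// decide whether the value is embedded in the bin or behind the particle
// pointer:
//
//   switch (as_bin_get_particle_type(b)) {
//   case AS_PARTICLE_TYPE_INTEGER:
//   case AS_PARTICLE_TYPE_FLOAT:
//       // embedded - as_bin_get_particle(b) returns &b->iparticle
//       break;
//   case AS_PARTICLE_TYPE_NULL:
//       // bin not in use
//       break;
//   default:
//       // external - as_bin_get_particle(b) returns b->particle
//       break;
//   }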
| 425 | |
| 426 | |
| 427 | /* Bin function declarations */ |
| 428 | extern int16_t as_bin_get_id(as_namespace *ns, const char *name); |
| 429 | extern bool as_bin_get_or_assign_id_w_len(as_namespace *ns, const char *name, size_t len, uint16_t *id); |
| 430 | extern const char* as_bin_get_name_from_id(as_namespace *ns, uint16_t id); |
| 431 | extern bool as_bin_name_within_quota(as_namespace *ns, const char *name); |
| 432 | extern void as_bin_copy(as_namespace *ns, as_bin *to, const as_bin *from); |
| 433 | extern int as_storage_rd_load_n_bins(as_storage_rd *rd); |
| 434 | extern int as_storage_rd_load_bins(as_storage_rd *rd, as_bin *stack_bins); |
| 435 | extern void as_bin_get_all_p(as_storage_rd *rd, as_bin **bin_ptrs); |
| 436 | extern as_bin *as_bin_get_by_id(as_storage_rd *rd, uint32_t id); |
| 437 | extern as_bin *as_bin_get(as_storage_rd *rd, const char *name); |
| 438 | extern as_bin *as_bin_get_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len); |
| 439 | extern as_bin *as_bin_create_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len, int *result); |
| 440 | extern as_bin *as_bin_get_or_create(as_storage_rd *rd, const char *name); |
| 441 | extern as_bin *as_bin_get_or_create_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len, int *result); |
| 442 | extern int32_t as_bin_get_index(as_storage_rd *rd, const char *name); |
| 443 | extern int32_t as_bin_get_index_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len); |
| 444 | extern void as_bin_destroy(as_storage_rd *rd, uint16_t i); |
| 445 | extern void as_bin_allocate_bin_space(as_storage_rd *rd, int32_t delta); |
| 446 | |
| 447 | |
| 448 | typedef enum { |
| 449 | AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_UNDEF = 0, |
| 450 | AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_GENERATION = 1, |
| 451 | AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_LAST_UPDATE_TIME = 2, |
| 452 | AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_CP = 3 |
| 453 | } conflict_resolution_pol; |
| 454 | |
| 455 | /* Record function declarations */ |
extern uint32_t clock_skew_stop_writes_sec(void);
| 457 | extern bool as_record_handle_clock_skew(as_namespace* ns, uint64_t skew_ms); |
| 458 | extern uint16_t plain_generation(uint16_t regime_generation, const as_namespace* ns); |
| 459 | extern void as_record_set_lut(as_record *r, uint32_t regime, uint64_t now_ms, const as_namespace* ns); |
| 460 | extern void as_record_increment_generation(as_record *r, const as_namespace* ns); |
| 461 | extern bool as_record_is_live(const as_record *r); |
| 462 | extern int as_record_get_create(struct as_index_tree_s *tree, const cf_digest *keyd, as_index_ref *r_ref, as_namespace *ns); |
| 463 | extern int as_record_get(struct as_index_tree_s *tree, const cf_digest *keyd, as_index_ref *r_ref); |
| 464 | extern int as_record_get_live(struct as_index_tree_s *tree, const cf_digest *keyd, as_index_ref *r_ref, as_namespace *ns); |
| 465 | extern int as_record_exists(struct as_index_tree_s *tree, const cf_digest *keyd); |
| 466 | extern int as_record_exists_live(struct as_index_tree_s *tree, const cf_digest *keyd, as_namespace *ns); |
| 467 | extern void as_record_rescue(as_index_ref *r_ref, as_namespace *ns); |
| 468 | |
| 469 | extern void as_record_destroy_bins_from(as_storage_rd *rd, uint16_t from); |
| 470 | extern void as_record_destroy_bins(as_storage_rd *rd); |
| 471 | extern void as_record_free_bin_space(as_record *r); |
| 472 | |
| 473 | extern void as_record_destroy(as_record *r, as_namespace *ns); |
| 474 | extern void as_record_done(as_index_ref *r_ref, as_namespace *ns); |
| 475 | |
| 476 | void as_record_drop_stats(as_record* r, as_namespace* ns); |
| 477 | |
| 478 | extern void as_record_finalize_key(as_record* r, as_namespace* ns, const uint8_t* key, uint32_t key_size); |
| 479 | extern void as_record_allocate_key(as_record* r, const uint8_t* key, uint32_t key_size); |
| 480 | extern int as_record_resolve_conflict(conflict_resolution_pol policy, uint16_t left_gen, uint64_t left_lut, uint16_t right_gen, uint64_t right_lut); |
| 481 | extern uint8_t *as_record_pickle(as_storage_rd *rd, size_t *len_r); |
| 482 | extern int as_record_write_from_pickle(as_storage_rd *rd); |
| 483 | extern int as_record_set_set_from_msg(as_record *r, as_namespace *ns, as_msg *m); |
| 484 | |
| 485 | static inline bool |
| 486 | as_record_pickle_is_binless(const uint8_t *buf) |
| 487 | { |
| 488 | return *(uint16_t *)buf == 0; |
| 489 | } |
| 490 | |
| 491 | // For enterprise split only. |
| 492 | int record_resolve_conflict_cp(uint16_t left_gen, uint64_t left_lut, uint16_t right_gen, uint64_t right_lut); |
| 493 | |
| 494 | static inline int |
| 495 | resolve_last_update_time(uint64_t left, uint64_t right) |
| 496 | { |
| 497 | return left == right ? 0 : (right > left ? 1 : -1); |
| 498 | } |
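// Illustrative return values:
//
//   resolve_last_update_time(100, 200);  // 1  - right is later
//   resolve_last_update_time(200, 100);  // -1 - left is later
//   resolve_last_update_time(100, 100);  // 0  - tie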
| 499 | |
| 500 | typedef struct as_remote_record_s { |
| 501 | cf_node src; |
| 502 | as_partition_reservation *rsv; |
| 503 | cf_digest *keyd; |
| 504 | |
| 505 | uint8_t *pickle; |
| 506 | size_t pickle_sz; |
| 507 | |
| 508 | uint32_t generation; |
| 509 | uint32_t void_time; |
| 510 | uint64_t last_update_time; |
| 511 | |
| 512 | const char *set_name; |
| 513 | size_t set_name_len; |
| 514 | |
| 515 | const uint8_t *key; |
| 516 | size_t key_size; |
| 517 | |
| 518 | bool is_old_pickle; // TODO - old pickle - remove in "six months" |
| 519 | |
| 520 | uint16_t n_bins; |
| 521 | as_flat_comp_meta cm; |
| 522 | uint32_t meta_sz; |
| 523 | |
| 524 | uint8_t repl_state; // relevant only for enterprise edition |
| 525 | } as_remote_record; |
| 526 | |
| 527 | int as_record_replace_if_better(as_remote_record *rr, bool is_repl_write, bool skip_sindex, bool do_xdr_write); |
| 528 | |
| 529 | // a simpler call that gives seconds in the right epoch |
| 530 | #define as_record_void_time_get() cf_clepoch_seconds() |
| 531 | bool as_record_is_expired(const as_record *r); // TODO - eventually inline |
| 532 | |
| 533 | static inline bool |
| 534 | as_record_is_doomed(const as_record *r, struct as_namespace_s *ns) |
| 535 | { |
| 536 | return as_record_is_expired(r) || as_truncate_record_is_truncated(r, ns); |
| 537 | } |
| 538 | |
| 539 | #define AS_SINDEX_MAX 256 |
| 540 | |
| 541 | #define MIN_PARTITIONS_PER_INDEX 1 |
| 542 | #define MAX_PARTITIONS_PER_INDEX 256 |
| 543 | #define DEFAULT_PARTITIONS_PER_INDEX 32 |
#define MAX_PARTITIONS_PER_INDEX_CHAR 3 // number of characters in max partitions per index
| 545 | |
| 546 | // as_sindex structure which hangs from the ns. |
| 547 | #define AS_SINDEX_INACTIVE 1 // On init, pre-loading |
| 548 | #define AS_SINDEX_ACTIVE 2 // On creation and afterwards |
| 549 | #define AS_SINDEX_DESTROY 3 // On destroy |
// Dummy sindex state for when ai_btree_create() returns an error - such a
// sindex is not available for any DML operations.
#define AS_SINDEX_NOTCREATED 4 // unused flag
#define AS_SINDEX_FLAG_WACTIVE 0x01 // set on AI btree creation of the sindex, never reset
#define AS_SINDEX_FLAG_RACTIVE 0x02 // set when the sindex scan of the database is completed
#define AS_SINDEX_FLAG_DESTROY_CLEANUP 0x04 // set for AI clean-up during sindex deletion
#define AS_SINDEX_FLAG_MIGRATE_CLEANUP 0x08 // unused
#define AS_SINDEX_FLAG_POPULATING 0x10 // indicates a current sindex scan job - reset when the scan is done
| 558 | |
| 559 | struct as_sindex_s; |
| 560 | struct as_sindex_config_s; |
| 561 | |
#define AS_SET_MAX_COUNT 0x3FF // 10 bits' worth of IDs, minus 1 (ID 0 means no set)
#define AS_BINID_HAS_SINDEX_SIZE (MAX_BIN_NAMES / (sizeof(uint32_t) * CHAR_BIT))
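// Usage sketch (illustrative, assuming the conventional bitmap layout - one bit
// per bin ID, 32 IDs per uint32_t word):
//
//   bool bin_may_have_sindex =
//           (ns->binid_has_sindex[id >> 5] & (1u << (id & 31))) != 0;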
| 564 | |
| 565 | |
| 566 | // TODO - would be nice to put this in as_index.h: |
| 567 | // Callback invoked when as_index is destroyed. |
| 568 | typedef void (*as_index_value_destructor) (struct as_index_s* v, void* udata); |
| 569 | |
| 570 | // TODO - would be nice to put this in as_index.h: |
| 571 | typedef struct as_index_tree_shared_s { |
| 572 | cf_arenax* arena; |
| 573 | |
| 574 | as_index_value_destructor destructor; |
| 575 | void* destructor_udata; |
| 576 | |
| 577 | // Number of sprigs per partition tree. |
| 578 | uint32_t n_sprigs; |
| 579 | |
| 580 | // Bit-shifts used to calculate indexes from digest bits. |
| 581 | uint32_t locks_shift; |
| 582 | uint32_t sprigs_shift; |
| 583 | |
| 584 | // Offsets into as_index_tree struct's variable-sized data. |
| 585 | uint32_t sprigs_offset; |
| 586 | uint32_t puddles_offset; |
| 587 | } as_index_tree_shared; |
| 588 | |
| 589 | |
| 590 | typedef struct as_sprigx_s { |
| 591 | uint64_t root_h: 40; |
| 592 | } __attribute__ ((__packed__)) as_sprigx; |
| 593 | |
| 594 | typedef struct as_treex_s { |
| 595 | int block_ix[AS_PARTITIONS]; |
| 596 | as_sprigx sprigxs[0]; |
| 597 | } as_treex; |
| 598 | |
| 599 | |
| 600 | struct as_namespace_s { |
| 601 | //-------------------------------------------- |
| 602 | // Data partitions - first, to 64-byte align. |
| 603 | // |
| 604 | |
| 605 | as_partition partitions[AS_PARTITIONS]; |
| 606 | |
| 607 | //-------------------------------------------- |
| 608 | // Name & ID. |
| 609 | // |
| 610 | |
| 611 | char name[AS_ID_NAMESPACE_SZ]; |
| 612 | uint32_t id; // this is 1-based |
| 613 | uint32_t namehash; |
| 614 | |
| 615 | //-------------------------------------------- |
| 616 | // Persistent memory. |
| 617 | // |
| 618 | |
| 619 | // Persistent memory type (default is shmem). |
| 620 | cf_xmem_type xmem_type; |
| 621 | const void* xmem_type_cfg; |
| 622 | |
| 623 | // Persistent memory "base" block ID for this namespace. |
| 624 | uint32_t xmem_id; |
| 625 | |
| 626 | // Pointer to the persistent memory "base" block. |
| 627 | uint8_t* xmem_base; |
| 628 | |
| 629 | // Pointer to partition tree info in persistent memory "treex" block. |
| 630 | as_treex* xmem_trees; |
| 631 | |
| 632 | // Pointer to arena structure (not stages) in persistent memory base block. |
| 633 | cf_arenax* arena; |
| 634 | |
| 635 | // Pointer to bin name vmap in persistent memory base block. |
| 636 | cf_vmapx* p_bin_name_vmap; |
| 637 | |
| 638 | // Pointer to set information vmap in persistent memory base block. |
| 639 | cf_vmapx* p_sets_vmap; |
| 640 | |
| 641 | // Temporary array of sets to hold config values until sets vmap is ready. |
| 642 | as_set* sets_cfg_array; |
| 643 | uint32_t sets_cfg_count; |
| 644 | |
| 645 | // Configuration flags relevant for warm or cool restart. |
| 646 | uint32_t xmem_flags; |
| 647 | |
| 648 | //-------------------------------------------- |
| 649 | // Cold start. |
| 650 | // |
| 651 | |
| 652 | // If true, read storage devices to build index at startup. |
| 653 | bool cold_start; |
| 654 | |
| 655 | // If true, device headers indicate previous shutdown was not clean. |
| 656 | bool dirty_restart; |
| 657 | |
| 658 | // Flag for ticker during initial loading of records from device. |
| 659 | bool loading_records; |
| 660 | |
| 661 | // For cold start eviction. |
| 662 | cf_mutex cold_start_evict_lock; |
| 663 | uint32_t cold_start_record_add_count; |
| 664 | uint32_t cold_start_now; |
| 665 | |
| 666 | // For sanity checking at startup (also used during warm or cool restart). |
| 667 | uint32_t startup_max_void_time; |
| 668 | |
| 669 | //-------------------------------------------- |
| 670 | // Memory management. |
| 671 | // |
| 672 | |
| 673 | // JEMalloc arena to be used for long-term storage in this namespace (-1 if nonexistent.) |
| 674 | int jem_arena; |
| 675 | |
| 676 | // Cached partition ownership info for clients. |
| 677 | client_replica_map* replica_maps; |
| 678 | |
| 679 | // Common partition tree information. Contains two configuration items. |
| 680 | as_index_tree_shared tree_shared; |
| 681 | |
| 682 | //-------------------------------------------- |
| 683 | // Storage management. |
| 684 | // |
| 685 | |
| 686 | // This is typecast to (drv_ssds*) in storage code. |
| 687 | void* storage_private; |
| 688 | |
| 689 | uint64_t ssd_size; // discovered (and rounded) size of drive |
| 690 | int storage_last_avail_pct; // most recently calculated available percent |
| 691 | int storage_max_write_q; // storage_max_write_cache is converted to this |
| 692 | uint32_t saved_defrag_sleep; // restore after defrag at startup is done |
| 693 | uint32_t defrag_lwm_size; // storage_defrag_lwm_pct % of storage_write_block_size |
| 694 | |
| 695 | // For data-not-in-memory, we optionally cache swbs after writing to device. |
| 696 | // To track fraction of reads from cache: |
| 697 | cf_atomic32 n_reads_from_cache; |
| 698 | cf_atomic32 n_reads_from_device; |
| 699 | |
| 700 | uint8_t storage_encryption_key[64]; |
| 701 | |
| 702 | //-------------------------------------------- |
| 703 | // Eviction. |
| 704 | // |
| 705 | |
| 706 | uint32_t smd_evict_void_time; |
| 707 | uint32_t evict_void_time; |
| 708 | |
| 709 | //-------------------------------------------- |
| 710 | // Truncate records. |
| 711 | // |
| 712 | |
| 713 | as_truncate truncate; |
| 714 | |
| 715 | //-------------------------------------------- |
| 716 | // Secondary index. |
| 717 | // |
| 718 | |
| 719 | int sindex_cnt; |
| 720 | uint32_t n_setless_sindexes; |
	struct as_sindex_s* sindex; // array with AS_SINDEX_MAX metadata
| 722 | cf_shash* sindex_set_binid_hash; |
| 723 | cf_shash* sindex_iname_hash; |
| 724 | uint32_t binid_has_sindex[AS_BINID_HAS_SINDEX_SIZE]; |
| 725 | |
| 726 | //-------------------------------------------- |
| 727 | // Configuration. |
| 728 | // |
| 729 | |
| 730 | uint32_t cfg_replication_factor; |
| 731 | uint32_t replication_factor; // indirect config - can become less than cfg_replication_factor |
| 732 | uint64_t memory_size; |
| 733 | uint32_t default_ttl; |
| 734 | |
| 735 | bool enable_xdr; |
| 736 | bool sets_enable_xdr; // namespace-level flag to enable set-based xdr shipping |
| 737 | bool ns_forward_xdr_writes; // namespace-level flag to enable forwarding of xdr writes |
| 738 | bool ns_allow_nonxdr_writes; // namespace-level flag to allow nonxdr writes or not |
| 739 | bool ns_allow_xdr_writes; // namespace-level flag to allow xdr writes or not |
| 740 | |
| 741 | uint32_t background_scan_max_rps; |
| 742 | conflict_resolution_pol conflict_resolution_policy; |
| 743 | bool cp; // relevant only for enterprise edition |
| 744 | bool cp_allow_drops; // relevant only for enterprise edition |
| 745 | bool data_in_index; // with single-bin, allows warm restart for data-in-memory (with storage-engine device) |
| 746 | bool cold_start_eviction_disabled; |
| 747 | bool write_dup_res_disabled; |
| 748 | bool disallow_null_setname; |
| 749 | bool batch_sub_benchmarks_enabled; |
| 750 | bool ops_sub_benchmarks_enabled; |
| 751 | bool read_benchmarks_enabled; |
| 752 | bool udf_benchmarks_enabled; |
| 753 | bool udf_sub_benchmarks_enabled; |
| 754 | bool write_benchmarks_enabled; |
| 755 | bool proxy_hist_enabled; |
| 756 | uint32_t evict_hist_buckets; |
| 757 | uint32_t evict_tenths_pct; |
| 758 | uint32_t hwm_disk_pct; |
| 759 | uint32_t hwm_memory_pct; |
| 760 | uint64_t index_stage_size; |
| 761 | uint32_t migrate_order; |
| 762 | uint32_t migrate_retransmit_ms; |
| 763 | uint32_t migrate_sleep; |
| 764 | uint32_t nsup_hist_period; |
| 765 | uint32_t nsup_period; |
| 766 | uint32_t n_nsup_threads; |
| 767 | bool cfg_prefer_uniform_balance; // relevant only for enterprise edition |
| 768 | bool prefer_uniform_balance; // indirect config - can become disabled if any other node reports disabled |
| 769 | uint32_t rack_id; |
| 770 | as_read_consistency_level read_consistency_level; |
| 771 | bool single_bin; // restrict the namespace to objects with exactly one bin |
| 772 | uint32_t n_single_scan_threads; |
| 773 | uint32_t stop_writes_pct; |
| 774 | uint32_t tomb_raider_eligible_age; // relevant only for enterprise edition |
| 775 | uint32_t tomb_raider_period; // relevant only for enterprise edition |
| 776 | uint32_t transaction_pending_limit; // 0 means no limit |
| 777 | uint32_t n_truncate_threads; |
| 778 | as_write_commit_level write_commit_level; |
| 779 | cf_vector xdr_dclist_v; |
| 780 | |
| 781 | const char* xmem_mounts[CF_XMEM_MAX_MOUNTS]; |
| 782 | uint32_t n_xmem_mounts; // indirect config |
| 783 | uint32_t mounts_hwm_pct; |
| 784 | uint64_t mounts_size_limit; |
| 785 | |
| 786 | as_storage_type storage_type; |
| 787 | |
| 788 | const char* storage_devices[AS_STORAGE_MAX_DEVICES]; |
| 789 | uint32_t n_storage_devices; // indirect config - if devices array contains raw devices (or partitions) |
| 790 | uint32_t n_storage_files; // indirect config - if devices array contains files |
| 791 | const char* storage_shadows[AS_STORAGE_MAX_DEVICES]; |
| 792 | uint32_t n_storage_shadows; // indirect config |
| 793 | uint64_t storage_filesize; |
| 794 | char* storage_scheduler_mode; // relevant for devices only, not files |
| 795 | uint32_t storage_write_block_size; |
| 796 | bool storage_data_in_memory; |
| 797 | |
| 798 | bool storage_cold_start_empty; |
| 799 | bool storage_commit_to_device; // relevant only for enterprise edition |
| 800 | uint32_t storage_commit_min_size; // relevant only for enterprise edition |
| 801 | as_compression_method storage_compression; // relevant only for enterprise edition |
| 802 | uint32_t storage_compression_level; // relevant only for enterprise edition |
| 803 | uint32_t storage_defrag_lwm_pct; |
| 804 | uint32_t storage_defrag_queue_min; |
| 805 | uint32_t storage_defrag_sleep; |
| 806 | int storage_defrag_startup_minimum; |
| 807 | bool storage_direct_files; |
| 808 | bool storage_disable_odsync; |
| 809 | bool storage_benchmarks_enabled; // histograms are per-drive except device-read-size & device-write-size |
| 810 | as_encryption_method storage_encryption; // relevant only for enterprise edition |
| 811 | char* storage_encryption_key_file; // relevant only for enterprise edition |
| 812 | uint64_t storage_flush_max_us; |
| 813 | uint64_t storage_max_write_cache; |
| 814 | uint32_t storage_min_avail_pct; |
| 815 | cf_atomic32 storage_post_write_queue; // number of swbs/device held after writing to device |
| 816 | bool storage_read_page_cache; |
| 817 | bool storage_serialize_tomb_raider; // relevant only for enterprise edition |
| 818 | uint32_t storage_tomb_raider_sleep; // relevant only for enterprise edition |
| 819 | |
| 820 | uint32_t sindex_num_partitions; |
| 821 | |
| 822 | bool geo2dsphere_within_strict; |
| 823 | uint16_t geo2dsphere_within_min_level; |
| 824 | uint16_t geo2dsphere_within_max_level; |
| 825 | uint16_t geo2dsphere_within_max_cells; |
| 826 | uint16_t geo2dsphere_within_level_mod; |
| 827 | uint32_t geo2dsphere_within_earth_radius_meters; |
| 828 | |
| 829 | //-------------------------------------------- |
| 830 | // Statistics and histograms. |
| 831 | // |
| 832 | |
| 833 | // Object counts. |
| 834 | |
| 835 | cf_atomic64 n_objects; |
| 836 | cf_atomic64 n_tombstones; // relevant only for enterprise edition |
| 837 | |
| 838 | // Consistency info. |
| 839 | |
| 840 | uint32_t n_dead_partitions; |
| 841 | uint32_t n_unavailable_partitions; |
| 842 | bool clock_skew_stop_writes; |
| 843 | |
| 844 | // Expiration & eviction (nsup) stats. |
| 845 | |
| 846 | bool stop_writes; |
| 847 | bool hwm_breached; |
| 848 | |
| 849 | uint64_t non_expirable_objects; |
| 850 | |
| 851 | uint64_t n_expired_objects; |
| 852 | uint64_t n_evicted_objects; |
| 853 | |
| 854 | int32_t evict_ttl; // signed - possible (but weird) it's negative |
| 855 | |
| 856 | uint32_t nsup_cycle_duration; // seconds taken for most recent nsup cycle |
| 857 | |
| 858 | // Memory usage stats. |
| 859 | |
| 860 | cf_atomic_int n_bytes_memory; |
| 861 | cf_atomic64 n_bytes_sindex_memory; |
| 862 | |
| 863 | // Persistent storage stats. |
| 864 | |
| 865 | double comp_avg_orig_sz; // relevant only for enterprise edition |
| 866 | double comp_avg_comp_sz; // relevant only for enterprise edition |
| 867 | float cache_read_pct; |
| 868 | |
| 869 | // Migration stats. |
| 870 | |
| 871 | cf_atomic_int migrate_tx_partitions_imbalance; // debug only |
| 872 | cf_atomic_int migrate_tx_instance_count; // debug only |
| 873 | cf_atomic_int migrate_rx_instance_count; // debug only |
| 874 | cf_atomic_int migrate_tx_partitions_active; |
| 875 | cf_atomic_int migrate_rx_partitions_active; |
| 876 | cf_atomic_int migrate_tx_partitions_initial; |
| 877 | cf_atomic_int migrate_tx_partitions_remaining; |
| 878 | cf_atomic_int migrate_tx_partitions_lead_remaining; |
| 879 | cf_atomic_int migrate_rx_partitions_initial; |
| 880 | cf_atomic_int migrate_rx_partitions_remaining; |
| 881 | cf_atomic_int migrate_signals_active; |
| 882 | cf_atomic_int migrate_signals_remaining; |
| 883 | cf_atomic_int appeals_tx_active; // relevant only for enterprise edition |
| 884 | cf_atomic_int appeals_rx_active; // relevant only for enterprise edition |
| 885 | cf_atomic_int appeals_tx_remaining; // relevant only for enterprise edition |
| 886 | |
| 887 | // Per-record migration stats: |
| 888 | cf_atomic_int migrate_records_skipped; // relevant only for enterprise edition |
| 889 | cf_atomic_int migrate_records_transmitted; |
| 890 | cf_atomic_int migrate_record_retransmits; |
| 891 | cf_atomic_int migrate_record_receives; |
| 892 | cf_atomic_int appeals_records_exonerated; // relevant only for enterprise edition |
| 893 | |
| 894 | // From-client transaction stats. |
| 895 | |
| 896 | cf_atomic64 n_client_tsvc_error; |
| 897 | cf_atomic64 n_client_tsvc_timeout; |
| 898 | |
| 899 | cf_atomic64 n_client_proxy_complete; |
| 900 | cf_atomic64 n_client_proxy_error; |
| 901 | cf_atomic64 n_client_proxy_timeout; |
| 902 | |
| 903 | cf_atomic64 n_client_read_success; |
| 904 | cf_atomic64 n_client_read_error; |
| 905 | cf_atomic64 n_client_read_timeout; |
| 906 | cf_atomic64 n_client_read_not_found; |
| 907 | cf_atomic64 n_client_read_filtered_out; |
| 908 | |
| 909 | cf_atomic64 n_client_write_success; |
| 910 | cf_atomic64 n_client_write_error; |
| 911 | cf_atomic64 n_client_write_timeout; |
| 912 | cf_atomic64 n_client_write_filtered_out; |
| 913 | |
| 914 | // Subset of n_client_write_... above, respectively. |
| 915 | cf_atomic64 n_xdr_client_write_success; |
| 916 | cf_atomic64 n_xdr_client_write_error; |
| 917 | cf_atomic64 n_xdr_client_write_timeout; |
| 918 | |
| 919 | cf_atomic64 n_client_delete_success; |
| 920 | cf_atomic64 n_client_delete_error; |
| 921 | cf_atomic64 n_client_delete_timeout; |
| 922 | cf_atomic64 n_client_delete_not_found; |
| 923 | cf_atomic64 n_client_delete_filtered_out; |
| 924 | |
| 925 | // Subset of n_client_delete_... above, respectively. |
| 926 | cf_atomic64 n_xdr_client_delete_success; |
| 927 | cf_atomic64 n_xdr_client_delete_error; |
| 928 | cf_atomic64 n_xdr_client_delete_timeout; |
| 929 | cf_atomic64 n_xdr_client_delete_not_found; |
| 930 | |
| 931 | cf_atomic64 n_client_udf_complete; |
| 932 | cf_atomic64 n_client_udf_error; |
| 933 | cf_atomic64 n_client_udf_timeout; |
| 934 | cf_atomic64 n_client_udf_filtered_out; |
| 935 | |
| 936 | cf_atomic64 n_client_lang_read_success; |
| 937 | cf_atomic64 n_client_lang_write_success; |
| 938 | cf_atomic64 n_client_lang_delete_success; |
| 939 | cf_atomic64 n_client_lang_error; |
| 940 | |
| 941 | // From-proxy transaction stats. |
| 942 | |
| 943 | cf_atomic64 n_from_proxy_tsvc_error; |
| 944 | cf_atomic64 n_from_proxy_tsvc_timeout; |
| 945 | |
| 946 | cf_atomic64 n_from_proxy_read_success; |
| 947 | cf_atomic64 n_from_proxy_read_error; |
| 948 | cf_atomic64 n_from_proxy_read_timeout; |
| 949 | cf_atomic64 n_from_proxy_read_not_found; |
| 950 | cf_atomic64 n_from_proxy_read_filtered_out; |
| 951 | |
| 952 | cf_atomic64 n_from_proxy_write_success; |
| 953 | cf_atomic64 n_from_proxy_write_error; |
| 954 | cf_atomic64 n_from_proxy_write_timeout; |
| 955 | cf_atomic64 n_from_proxy_write_filtered_out; |
| 956 | |
| 957 | // Subset of n_from_proxy_write_... above, respectively. |
| 958 | cf_atomic64 n_xdr_from_proxy_write_success; |
| 959 | cf_atomic64 n_xdr_from_proxy_write_error; |
| 960 | cf_atomic64 n_xdr_from_proxy_write_timeout; |
| 961 | |
| 962 | cf_atomic64 n_from_proxy_delete_success; |
| 963 | cf_atomic64 n_from_proxy_delete_error; |
| 964 | cf_atomic64 n_from_proxy_delete_timeout; |
| 965 | cf_atomic64 n_from_proxy_delete_not_found; |
| 966 | cf_atomic64 n_from_proxy_delete_filtered_out; |
| 967 | |
| 968 | // Subset of n_from_proxy_delete_... above, respectively. |
| 969 | cf_atomic64 n_xdr_from_proxy_delete_success; |
| 970 | cf_atomic64 n_xdr_from_proxy_delete_error; |
| 971 | cf_atomic64 n_xdr_from_proxy_delete_timeout; |
| 972 | cf_atomic64 n_xdr_from_proxy_delete_not_found; |
| 973 | |
| 974 | cf_atomic64 n_from_proxy_udf_complete; |
| 975 | cf_atomic64 n_from_proxy_udf_error; |
| 976 | cf_atomic64 n_from_proxy_udf_timeout; |
| 977 | cf_atomic64 n_from_proxy_udf_filtered_out; |
| 978 | |
| 979 | cf_atomic64 n_from_proxy_lang_read_success; |
| 980 | cf_atomic64 n_from_proxy_lang_write_success; |
| 981 | cf_atomic64 n_from_proxy_lang_delete_success; |
| 982 | cf_atomic64 n_from_proxy_lang_error; |
| 983 | |
| 984 | // Batch sub-transaction stats. |
| 985 | |
| 986 | cf_atomic64 n_batch_sub_tsvc_error; |
| 987 | cf_atomic64 n_batch_sub_tsvc_timeout; |
| 988 | |
| 989 | cf_atomic64 n_batch_sub_proxy_complete; |
| 990 | cf_atomic64 n_batch_sub_proxy_error; |
| 991 | cf_atomic64 n_batch_sub_proxy_timeout; |
| 992 | |
| 993 | cf_atomic64 n_batch_sub_read_success; |
| 994 | cf_atomic64 n_batch_sub_read_error; |
| 995 | cf_atomic64 n_batch_sub_read_timeout; |
| 996 | cf_atomic64 n_batch_sub_read_not_found; |
| 997 | cf_atomic64 n_batch_sub_read_filtered_out; |
| 998 | |
| 999 | // From-proxy batch sub-transaction stats. |
| 1000 | |
| 1001 | cf_atomic64 n_from_proxy_batch_sub_tsvc_error; |
| 1002 | cf_atomic64 n_from_proxy_batch_sub_tsvc_timeout; |
| 1003 | |
| 1004 | cf_atomic64 n_from_proxy_batch_sub_read_success; |
| 1005 | cf_atomic64 n_from_proxy_batch_sub_read_error; |
| 1006 | cf_atomic64 n_from_proxy_batch_sub_read_timeout; |
| 1007 | cf_atomic64 n_from_proxy_batch_sub_read_not_found; |
| 1008 | cf_atomic64 n_from_proxy_batch_sub_read_filtered_out; |
| 1009 | |
| 1010 | // Internal-UDF sub-transaction stats. |
| 1011 | |
| 1012 | cf_atomic64 n_udf_sub_tsvc_error; |
| 1013 | cf_atomic64 n_udf_sub_tsvc_timeout; |
| 1014 | |
| 1015 | cf_atomic64 n_udf_sub_udf_complete; |
| 1016 | cf_atomic64 n_udf_sub_udf_error; |
| 1017 | cf_atomic64 n_udf_sub_udf_timeout; |
| 1018 | uint64_t n_udf_sub_udf_filtered_out; |
| 1019 | |
| 1020 | cf_atomic64 n_udf_sub_lang_read_success; |
| 1021 | cf_atomic64 n_udf_sub_lang_write_success; |
| 1022 | cf_atomic64 n_udf_sub_lang_delete_success; |
| 1023 | cf_atomic64 n_udf_sub_lang_error; |
| 1024 | |
| 1025 | // Internal-ops sub-transaction stats. |
| 1026 | |
| 1027 | cf_atomic64 n_ops_sub_tsvc_error; |
| 1028 | cf_atomic64 n_ops_sub_tsvc_timeout; |
| 1029 | |
| 1030 | cf_atomic64 n_ops_sub_write_success; |
| 1031 | cf_atomic64 n_ops_sub_write_error; |
| 1032 | cf_atomic64 n_ops_sub_write_timeout; |
| 1033 | uint64_t n_ops_sub_write_filtered_out; |
| 1034 | |
| 1035 | // Transaction retransmit stats - 'all' means both client & proxy origins. |
| 1036 | |
| 1037 | uint64_t n_retransmit_all_read_dup_res; |
| 1038 | |
| 1039 | uint64_t n_retransmit_all_write_dup_res; |
| 1040 | uint64_t n_retransmit_all_write_repl_write; |
| 1041 | |
| 1042 | uint64_t n_retransmit_all_delete_dup_res; |
| 1043 | uint64_t n_retransmit_all_delete_repl_write; |
| 1044 | |
| 1045 | uint64_t n_retransmit_all_udf_dup_res; |
| 1046 | uint64_t n_retransmit_all_udf_repl_write; |
| 1047 | |
| 1048 | uint64_t n_retransmit_all_batch_sub_dup_res; |
| 1049 | |
| 1050 | uint64_t n_retransmit_udf_sub_dup_res; |
| 1051 | uint64_t n_retransmit_udf_sub_repl_write; |
| 1052 | |
| 1053 | uint64_t n_retransmit_ops_sub_dup_res; |
| 1054 | uint64_t n_retransmit_ops_sub_repl_write; |
| 1055 | |
| 1056 | // Scan stats. |
| 1057 | |
| 1058 | uint64_t n_scan_basic_complete; |
| 1059 | uint64_t n_scan_basic_error; |
| 1060 | uint64_t n_scan_basic_abort; |
| 1061 | |
| 1062 | uint64_t n_scan_aggr_complete; |
| 1063 | uint64_t n_scan_aggr_error; |
| 1064 | uint64_t n_scan_aggr_abort; |
| 1065 | |
| 1066 | uint64_t n_scan_udf_bg_complete; |
| 1067 | uint64_t n_scan_udf_bg_error; |
| 1068 | uint64_t n_scan_udf_bg_abort; |
| 1069 | |
| 1070 | uint64_t n_scan_ops_bg_complete; |
| 1071 | uint64_t n_scan_ops_bg_error; |
| 1072 | uint64_t n_scan_ops_bg_abort; |
| 1073 | |
| 1074 | // Query stats. |
| 1075 | |
| 1076 | cf_atomic64 query_reqs; |
| 1077 | cf_atomic64 query_fail; |
| 1078 | cf_atomic64 query_short_queue_full; |
| 1079 | cf_atomic64 query_long_queue_full; |
| 1080 | cf_atomic64 query_short_reqs; |
| 1081 | cf_atomic64 query_long_reqs; |
| 1082 | |
| 1083 | cf_atomic64 n_lookup; |
| 1084 | cf_atomic64 n_lookup_success; |
| 1085 | cf_atomic64 n_lookup_abort; |
| 1086 | cf_atomic64 n_lookup_errs; |
| 1087 | cf_atomic64 lookup_response_size; |
| 1088 | cf_atomic64 lookup_num_records; |
| 1089 | |
| 1090 | cf_atomic64 n_aggregation; |
| 1091 | cf_atomic64 n_agg_success; |
| 1092 | cf_atomic64 n_agg_abort; |
| 1093 | cf_atomic64 n_agg_errs; |
| 1094 | cf_atomic64 agg_response_size; |
| 1095 | cf_atomic64 agg_num_records; |
| 1096 | |
| 1097 | cf_atomic64 n_query_udf_bg_success; |
| 1098 | cf_atomic64 n_query_udf_bg_failure; |
| 1099 | |
| 1100 | cf_atomic64 n_query_ops_bg_success; |
| 1101 | cf_atomic64 n_query_ops_bg_failure; |
| 1102 | |
| 1103 | // Geospatial query stats: |
| 1104 | cf_atomic64 geo_region_query_count; // number of region queries |
| 1105 | cf_atomic64 geo_region_query_cells; // number of cells used by region queries |
| 1106 | cf_atomic64 geo_region_query_points; // number of valid points found |
| 1107 | cf_atomic64 geo_region_query_falsepos; // number of false positives found |
| 1108 | |
| 1109 | // Re-replication stats - relevant only for enterprise edition. |
| 1110 | |
| 1111 | cf_atomic64 n_re_repl_success; |
| 1112 | cf_atomic64 n_re_repl_error; |
| 1113 | cf_atomic64 n_re_repl_timeout; |
| 1114 | |
| 1115 | // Special errors that deserve their own counters: |
| 1116 | |
| 1117 | cf_atomic64 n_fail_xdr_forbidden; |
| 1118 | cf_atomic64 n_fail_key_busy; |
| 1119 | cf_atomic64 n_fail_generation; |
| 1120 | cf_atomic64 n_fail_record_too_big; |
| 1121 | |
| 1122 | // Special non-error counters: |
| 1123 | |
| 1124 | cf_atomic64 n_deleted_last_bin; |
| 1125 | |
| 1126 | // One-way automatically activated histograms. |
| 1127 | |
| 1128 | cf_hist_track* read_hist; |
| 1129 | cf_hist_track* write_hist; |
| 1130 | cf_hist_track* udf_hist; |
| 1131 | cf_hist_track* query_hist; |
| 1132 | histogram* query_rec_count_hist; |
| 1133 | histogram* re_repl_hist; // relevant only for enterprise edition |
| 1134 | |
| 1135 | bool read_hist_active; |
| 1136 | bool write_hist_active; |
| 1137 | bool udf_hist_active; |
| 1138 | bool query_hist_active; |
| 1139 | bool query_rec_count_hist_active; |
| 1140 | bool re_repl_hist_active; // relevant only for enterprise edition |
| 1141 | |
| 1142 | // Activate-by-config histograms. |
| 1143 | |
| 1144 | histogram* proxy_hist; |
| 1145 | |
| 1146 | histogram* read_start_hist; |
| 1147 | histogram* read_restart_hist; |
| 1148 | histogram* read_dup_res_hist; |
| 1149 | histogram* read_repl_ping_hist; |
| 1150 | histogram* read_local_hist; |
| 1151 | histogram* read_response_hist; |
| 1152 | |
| 1153 | histogram* write_start_hist; |
| 1154 | histogram* write_restart_hist; |
| 1155 | histogram* write_dup_res_hist; |
| 1156 | histogram* write_master_hist; // split this? |
| 1157 | histogram* write_repl_write_hist; |
| 1158 | histogram* write_response_hist; |
| 1159 | |
| 1160 | histogram* udf_start_hist; |
| 1161 | histogram* udf_restart_hist; |
| 1162 | histogram* udf_dup_res_hist; |
| 1163 | histogram* udf_master_hist; // split this? |
| 1164 | histogram* udf_repl_write_hist; |
| 1165 | histogram* udf_response_hist; |
| 1166 | |
| 1167 | histogram* batch_sub_start_hist; |
| 1168 | histogram* batch_sub_restart_hist; |
| 1169 | histogram* batch_sub_dup_res_hist; |
| 1170 | histogram* batch_sub_repl_ping_hist; |
| 1171 | histogram* batch_sub_read_local_hist; |
| 1172 | histogram* batch_sub_response_hist; |
| 1173 | |
| 1174 | histogram* udf_sub_start_hist; |
| 1175 | histogram* udf_sub_restart_hist; |
| 1176 | histogram* udf_sub_dup_res_hist; |
| 1177 | histogram* udf_sub_master_hist; // split this? |
| 1178 | histogram* udf_sub_repl_write_hist; |
| 1179 | histogram* udf_sub_response_hist; |
| 1180 | |
| 1181 | histogram* ops_sub_start_hist; |
| 1182 | histogram* ops_sub_restart_hist; |
| 1183 | histogram* ops_sub_dup_res_hist; |
| 1184 | histogram* ops_sub_master_hist; // split this? |
| 1185 | histogram* ops_sub_repl_write_hist; |
| 1186 | histogram* ops_sub_response_hist; |
| 1187 | |
| 1188 | histogram* device_read_size_hist; |
| 1189 | histogram* device_write_size_hist; |
| 1190 | |
| 1191 | // Histograms of object storage sizes. (Meaningful for drive-backed |
| 1192 | // namespaces only.) |
| 1193 | histogram* obj_size_log_hist; |
| 1194 | histogram* set_obj_size_log_hists[AS_SET_MAX_COUNT + 1]; |
| 1195 | linear_hist* obj_size_lin_hist; |
| 1196 | linear_hist* set_obj_size_lin_hists[AS_SET_MAX_COUNT + 1]; |
| 1197 | |
| 1198 | // Histograms used for general eviction and expiration. |
| 1199 | linear_hist* evict_hist; // not just for info |
| 1200 | linear_hist* ttl_hist; |
| 1201 | linear_hist* set_ttl_hists[AS_SET_MAX_COUNT + 1]; |
| 1202 | |
| 1203 | //-------------------------------------------- |
| 1204 | // Information for rebalancing. |
| 1205 | // |
| 1206 | |
| 1207 | uint32_t cluster_size; |
| 1208 | cf_node succession[AS_CLUSTER_SZ]; |
| 1209 | as_partition_version cluster_versions[AS_CLUSTER_SZ][AS_PARTITIONS]; |
| 1210 | uint32_t rack_ids[AS_CLUSTER_SZ]; // is observed-rack-ids in CP mode |
| 1211 | |
| 1212 | // Quiescence - relevant only for enterprise edition. |
| 1213 | uint32_t active_size; |
| 1214 | bool pending_quiesce; |
| 1215 | bool is_quiesced; |
| 1216 | bool quiesced[AS_CLUSTER_SZ]; |
| 1217 | |
| 1218 | // Observed nodes - relevant only for enterprise edition. |
| 1219 | uint32_t observed_cluster_size; |
| 1220 | cf_node observed_succession[AS_CLUSTER_SZ]; |
| 1221 | |
| 1222 | // Roster management - relevant only for enterprise edition. |
| 1223 | uint32_t smd_roster_generation; |
| 1224 | uint32_t smd_roster_count; |
| 1225 | cf_node smd_roster[AS_CLUSTER_SZ]; |
| 1226 | uint32_t smd_roster_rack_ids[AS_CLUSTER_SZ]; |
| 1227 | uint32_t roster_generation; |
| 1228 | uint32_t roster_count; |
| 1229 | cf_node roster[AS_CLUSTER_SZ]; |
| 1230 | uint32_t roster_rack_ids[AS_CLUSTER_SZ]; |
| 1231 | |
| 1232 | // Master regimes - relevant only for enterprise edition. |
| 1233 | uint32_t eventual_regime; |
| 1234 | uint32_t rebalance_regime; |
| 1235 | uint32_t rebalance_regimes[AS_CLUSTER_SZ]; |
| 1236 | }; |
| 1237 | |
| 1238 | #define AS_SET_NAME_MAX_SIZE 64 // includes space for null-terminator |
| 1239 | |
| 1240 | #define INVALID_SET_ID 0 |
| 1241 | |
#define IS_SET_EVICTION_DISABLED(p_set) (cf_atomic32_get((p_set)->disable_eviction) == 1)
#define DISABLE_SET_EVICTION(p_set, on_off) (cf_atomic32_set(&(p_set)->disable_eviction, (on_off) ? 1 : 0))
| 1244 | |
| 1245 | typedef enum { |
| 1246 | AS_SET_ENABLE_XDR_DEFAULT = 0, |
| 1247 | AS_SET_ENABLE_XDR_TRUE = 1, |
| 1248 | AS_SET_ENABLE_XDR_FALSE = 2 |
| 1249 | } as_set_enable_xdr_flag; |
| 1250 | |
| 1251 | // Caution - changing this struct could break warm or cool restart. |
| 1252 | struct as_set_s { |
| 1253 | char name[AS_SET_NAME_MAX_SIZE]; |
| 1254 | cf_atomic64 n_objects; |
| 1255 | cf_atomic64 n_tombstones; // relevant only for enterprise edition |
	cf_atomic64 n_bytes_memory; // for data-in-memory only - set's total record data size
| 1257 | cf_atomic64 stop_writes_count; // restrict number of records in a set |
| 1258 | uint64_t truncate_lut; // records with last-update-time less than this are truncated |
| 1259 | cf_atomic32 disable_eviction; // don't evict anything in this set (note - expiration still works) |
| 1260 | cf_atomic32 enable_xdr; // white-list (AS_SET_ENABLE_XDR_TRUE) or black-list (AS_SET_ENABLE_XDR_FALSE) a set for XDR replication |
| 1261 | uint32_t n_sindexes; |
| 1262 | uint8_t padding[12]; |
| 1263 | }; |
| 1264 | |
| 1265 | static inline bool |
| 1266 | as_set_stop_writes(as_set *p_set) { |
| 1267 | uint64_t n_objects = cf_atomic64_get(p_set->n_objects); |
| 1268 | uint64_t stop_writes_count = cf_atomic64_get(p_set->stop_writes_count); |
| 1269 | |
| 1270 | return stop_writes_count != 0 && n_objects >= stop_writes_count; |
| 1271 | } |
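// Usage sketch (illustrative): a write transaction creating a record in set
// p_set might check the quota first:
//
//   if (p_set != NULL && as_set_stop_writes(p_set)) {
//       // fail the write - the set has reached its stop_writes_count
//   }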
| 1272 | |
| 1273 | // These bin functions must be below definition of struct as_namespace_s: |
| 1274 | |
| 1275 | static inline bool |
| 1276 | as_bin_set_id_from_name_w_len(as_namespace *ns, as_bin *b, const uint8_t *buf, |
| 1277 | size_t len) { |
| 1278 | return as_bin_get_or_assign_id_w_len(ns, (const char *)buf, len, &b->id); |
| 1279 | } |
| 1280 | |
| 1281 | static inline size_t |
| 1282 | as_bin_memcpy_name(as_namespace *ns, uint8_t *buf, as_bin *b) { |
| 1283 | size_t len = 0; |
| 1284 | |
| 1285 | if (! ns->single_bin) { |
| 1286 | const char *name = as_bin_get_name_from_id(ns, b->id); |
| 1287 | |
| 1288 | len = strlen(name); |
| 1289 | memcpy(buf, name, len); |
| 1290 | } |
| 1291 | |
| 1292 | return len; |
| 1293 | } |
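// Usage sketch (illustrative): the name is copied without a null terminator, so
// callers keep the returned length alongside the buffer:
//
//   uint8_t name_buf[AS_BIN_NAME_MAX_SZ];
//   size_t name_len = as_bin_memcpy_name(ns, name_buf, b);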
| 1294 | |
| 1295 | // forward ref |
| 1296 | struct as_msg_field_s; |
| 1297 | |
| 1298 | /* Namespace function declarations */ |
| 1299 | extern as_namespace *as_namespace_create(char *name); |
| 1300 | extern void as_namespaces_init(bool cold_start_cmd, uint32_t instance); |
| 1301 | extern void as_namespaces_setup(bool cold_start_cmd, uint32_t instance); |
| 1302 | extern void as_namespace_finish_setup(as_namespace *ns, uint32_t instance); |
| 1303 | extern bool as_namespace_configure_sets(as_namespace *ns); |
| 1304 | extern as_namespace *as_namespace_get_byname(char *name); |
| 1305 | extern as_namespace *as_namespace_get_byid(uint32_t id); |
| 1306 | extern as_namespace *as_namespace_get_bybuf(uint8_t *name, size_t len); |
| 1307 | extern as_namespace *as_namespace_get_bymsgfield(struct as_msg_field_s *fp); |
| 1308 | extern const char *as_namespace_get_set_name(as_namespace *ns, uint16_t set_id); |
| 1309 | extern uint16_t as_namespace_get_set_id(as_namespace *ns, const char *set_name); |
| 1310 | extern uint16_t as_namespace_get_create_set_id(as_namespace *ns, const char *set_name); |
| 1311 | extern int as_namespace_set_set_w_len(as_namespace *ns, const char *set_name, size_t len, uint16_t *p_set_id, bool apply_restrictions); |
| 1312 | extern int as_namespace_get_create_set_w_len(as_namespace *ns, const char *set_name, size_t len, as_set **pp_set, uint16_t *p_set_id); |
| 1313 | extern as_set *as_namespace_get_set_by_name(as_namespace *ns, const char *set_name); |
| 1314 | extern as_set* as_namespace_get_set_by_id(as_namespace* ns, uint16_t set_id); |
| 1315 | extern as_set* as_namespace_get_record_set(as_namespace *ns, const as_record *r); |
| 1316 | extern void as_namespace_get_set_info(as_namespace *ns, const char *set_name, cf_dyn_buf *db); |
| 1317 | extern void as_namespace_adjust_set_memory(as_namespace *ns, uint16_t set_id, int64_t delta_bytes); |
| 1318 | extern void as_namespace_release_set_id(as_namespace *ns, uint16_t set_id); |
| 1319 | extern void as_namespace_get_bins_info(as_namespace *ns, cf_dyn_buf *db, bool show_ns); |
| 1320 | extern void as_namespace_get_hist_info(as_namespace *ns, char *set_name, char *hist_name, cf_dyn_buf *db); |
| 1321 | |
| 1322 | static inline bool |
| 1323 | as_namespace_cool_restarts(const as_namespace *ns) |
| 1324 | { |
| 1325 | return ns->storage_data_in_memory && ! ns->data_in_index; |
| 1326 | } |
| 1327 | |
| 1328 | static inline uint32_t |
| 1329 | as_namespace_device_count(const as_namespace *ns) |
| 1330 | { |
| 1331 | // Only one of them will ever be non-zero. |
| 1332 | return ns->n_storage_devices + ns->n_storage_files; |
| 1333 | } |
| 1334 | |
| 1335 | static inline const char* |
| 1336 | as_namespace_start_mode_str(const as_namespace *ns) |
| 1337 | { |
	return as_namespace_cool_restarts(ns) ? "cool" : "warm";
| 1339 | } |
| 1340 | |
| 1341 | static inline bool |
| 1342 | as_namespace_index_persisted(const as_namespace *ns) |
| 1343 | { |
| 1344 | return ns->xmem_type == CF_XMEM_TYPE_PMEM || |
| 1345 | ns->xmem_type == CF_XMEM_TYPE_FLASH; |
| 1346 | } |
| 1347 | |
| 1348 | // Persistent Memory Management |
| 1349 | void as_namespace_xmem_shutdown(as_namespace *ns, uint32_t instance); |
| 1350 | |