1 | /* |
2 | * datamodel.h |
3 | * |
4 | * Copyright (C) 2008-2016 Aerospike, Inc. |
5 | * |
6 | * Portions may be licensed to Aerospike, Inc. under one or more contributor |
7 | * license agreements. |
8 | * |
9 | * This program is free software: you can redistribute it and/or modify it under |
10 | * the terms of the GNU Affero General Public License as published by the Free |
11 | * Software Foundation, either version 3 of the License, or (at your option) any |
12 | * later version. |
13 | * |
14 | * This program is distributed in the hope that it will be useful, but WITHOUT |
15 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
16 | * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
17 | * details. |
18 | * |
19 | * You should have received a copy of the GNU Affero General Public License |
20 | * along with this program. If not, see http://www.gnu.org/licenses/ |
21 | */ |
22 | |
23 | /* |
24 | * core data model structures and definitions |
25 | */ |
26 | |
27 | #pragma once |
28 | |
29 | #include <limits.h> |
30 | #include <stdbool.h> |
31 | #include <stddef.h> |
32 | #include <stdint.h> |
33 | #include <string.h> |
34 | |
35 | #include "aerospike/as_val.h" |
36 | #include "citrusleaf/cf_atomic.h" |
37 | #include "citrusleaf/cf_clock.h" |
38 | #include "citrusleaf/cf_digest.h" |
39 | |
40 | #include "arenax.h" |
41 | #include "cf_mutex.h" |
42 | #include "dynbuf.h" |
43 | #include "hist.h" |
44 | #include "hist_track.h" |
45 | #include "linear_hist.h" |
46 | #include "msg.h" |
47 | #include "node.h" |
48 | #include "shash.h" |
49 | #include "vmapx.h" |
50 | #include "xmem.h" |
51 | |
52 | #include "base/cfg.h" |
53 | #include "base/proto.h" |
54 | #include "base/transaction_policy.h" |
55 | #include "base/truncate.h" |
56 | #include "fabric/hb.h" |
57 | #include "fabric/partition.h" |
58 | #include "storage/flat.h" |
59 | #include "storage/storage.h" |
60 | |
61 | |
62 | #define OBJ_SIZE_HIST_NUM_BUCKETS 1024 |
63 | #define TTL_HIST_NUM_BUCKETS 100 |
64 | |
65 | #define MAX_ALLOWED_TTL (3600 * 24 * 365 * 10) // 10 years |
66 | |
67 | // [0-1] for partition-id |
68 | // [1-4] for tree sprigs and locks |
69 | // [5-7] unused |
70 | // [8-11] for SSD device hash |
71 | #define DIGEST_STORAGE_BASE_BYTE 8 |
72 | // [12-15] for rw_request hash |
73 | #define DIGEST_HASH_BASE_BYTE 12 |
74 | // [16-19] for pred-exp filter |
75 | |
76 | |
77 | /* Forward declarations */ |
78 | typedef struct as_namespace_s as_namespace; |
79 | typedef struct as_index_s as_record; |
80 | typedef struct as_bin_s as_bin; |
81 | typedef struct as_index_ref_s as_index_ref; |
82 | typedef struct as_set_s as_set; |
83 | |
84 | struct as_index_tree_s; |
85 | |
86 | |
87 | #define AS_ID_NAMESPACE_SZ 32 |
88 | |
89 | #define AS_ID_INAME_SZ 256 |
90 | |
91 | #define AS_BIN_NAME_MAX_SZ 16 // changing this breaks warm restart |
92 | #define MAX_BIN_NAMES 0x10000 // no need for more - numeric ID is 16 bits |
93 | #define BIN_NAMES_QUOTA (MAX_BIN_NAMES / 2) // don't add more names than this via client transactions |
94 | |
95 | /* |
96 | * Compare two 16-bit generation counts, allowing wrap-arounds. |
97 | * Works correctly, if: |
98 | * |
99 | * - rhs is ahead of lhs, but rhs isn't ahead more than 32,768. |
100 | * - lhs is ahead of rhs, but lhs isn't ahead more than 32,767. |
101 | */ |
102 | |
// Wrap-aware "lhs < rhs" for 16-bit generation counts.
//
// The 16-bit modular difference lhs - rhs has its top bit set exactly when it
// lies in [32768, 65535]; in that case lhs is treated as being behind rhs.
static inline bool
as_gen_less_than(uint16_t lhs, uint16_t rhs)
{
	const uint16_t delta = (uint16_t)(lhs - rhs);

	return (delta & 0x8000) != 0;
}
108 | |
109 | |
/* as_particle_type
 * Particles are typed, which reflects their contents:
 *  NULL: no associated content (not sure I really need this internally?)
 *  INTEGER: a signed, 64-bit integer
 *  FLOAT: a floating point
 *  STRING: a null-terminated UTF-8 string
 *  BLOB: arbitrary-length binary data
 *  TIMESTAMP: milliseconds since 1 January 1970, 00:00:00 GMT
 *  DIGEST: an internal Aerospike key digest
 *
 * NOTE(review): TIMESTAMP and DIGEST above have no matching enumerator in the
 * enum below, and the language-specific blob types, MAP, LIST and GEOJSON are
 * not described here - this list looks out of date; confirm and refresh.
 *
 * The numeric values are not contiguous (5-6, 13-18 and 21-22 are not
 * assigned here). */
typedef enum {
	AS_PARTICLE_TYPE_NULL = 0,
	AS_PARTICLE_TYPE_INTEGER = 1,
	AS_PARTICLE_TYPE_FLOAT = 2,
	AS_PARTICLE_TYPE_STRING = 3,
	AS_PARTICLE_TYPE_BLOB = 4,
	AS_PARTICLE_TYPE_JAVA_BLOB = 7,
	AS_PARTICLE_TYPE_CSHARP_BLOB = 8,
	AS_PARTICLE_TYPE_PYTHON_BLOB = 9,
	AS_PARTICLE_TYPE_RUBY_BLOB = 10,
	AS_PARTICLE_TYPE_PHP_BLOB = 11,
	AS_PARTICLE_TYPE_ERLANG_BLOB = 12,
	AS_PARTICLE_TYPE_MAP = 19,
	AS_PARTICLE_TYPE_LIST = 20,
	AS_PARTICLE_TYPE_GEOJSON = 23,
	// One past the highest assigned type value.
	AS_PARTICLE_TYPE_MAX = 24,
	// Alias of AS_PARTICLE_TYPE_MAX - same numeric value, different name.
	AS_PARTICLE_TYPE_BAD = AS_PARTICLE_TYPE_MAX
} as_particle_type;
137 | |
138 | /* as_particle |
139 | * The common part of a particle |
140 | * this is poor man's subclassing - IE, how to do a subclassed interface in C |
141 | * Go look in particle.c to see all the subclass implementation and structure */ |
// Packed, so the struct is exactly its one leading byte (the trailing array
// has zero length).
typedef struct as_particle_s {
	uint8_t metadata; // used by the iparticle for is_integer and inuse, as well as version in multi bin mode only
	// used by *particle for type
	// Zero-length trailing array - presumably marks where the type-specific
	// payload starts; confirm against particle.c.
	uint8_t data[0];
} __attribute__ ((__packed__)) as_particle;
147 | |
148 | // Bit Flag constants used for the particle state value (4 bits, 16 values) |
149 | #define AS_BIN_STATE_UNUSED 0 |
150 | #define AS_BIN_STATE_INUSE_INTEGER 1 |
151 | #define AS_BIN_STATE_RECYCLE_ME 2 // was - hidden bin |
152 | #define AS_BIN_STATE_INUSE_OTHER 3 |
153 | #define AS_BIN_STATE_INUSE_FLOAT 4 |
154 | |
// Bit-field view of an as_particle's single metadata byte. The as_bin
// accessors in this file cast an as_bin pointer to this type to read and
// write the 4-bit state.
typedef struct as_particle_iparticle_s {
	uint8_t version: 4; // now unused - and can't be used in single-bin config
	uint8_t state: 4; // see AS_BIN_STATE_...
	uint8_t data[0];
} __attribute__ ((__packed__)) as_particle_iparticle;
160 | |
161 | /* Particle function declarations */ |
162 | |
163 | static inline bool |
164 | is_embedded_particle_type(as_particle_type type) |
165 | { |
166 | return type == AS_PARTICLE_TYPE_INTEGER || type == AS_PARTICLE_TYPE_FLOAT; |
167 | } |
168 | |
169 | extern as_particle_type as_particle_type_from_asval(const as_val *val); |
170 | extern as_particle_type as_particle_type_from_msgpack(const uint8_t *packed, uint32_t packed_size); |
171 | |
172 | extern uint32_t as_particle_size_from_asval(const as_val *val); |
173 | |
174 | extern uint32_t as_particle_asval_client_value_size(const as_val *val); |
175 | extern uint32_t as_particle_asval_to_client(const as_val *val, as_msg_op *op); |
176 | |
177 | extern const uint8_t *as_particle_skip_flat(const uint8_t *flat, const uint8_t *end); |
178 | |
179 | // as_bin particle function declarations |
180 | |
181 | extern void as_bin_particle_destroy(as_bin *b, bool free_particle); |
182 | extern uint32_t as_bin_particle_size(as_bin *b); |
183 | |
184 | // wire: |
185 | extern int as_bin_particle_alloc_modify_from_client(as_bin *b, const as_msg_op *op); |
186 | extern int as_bin_particle_stack_modify_from_client(as_bin *b, cf_ll_buf *particles_llb, const as_msg_op *op); |
187 | extern int as_bin_particle_alloc_from_client(as_bin *b, const as_msg_op *op); |
188 | extern int as_bin_particle_stack_from_client(as_bin *b, cf_ll_buf *particles_llb, const as_msg_op *op); |
189 | extern int as_bin_particle_alloc_from_pickled(as_bin *b, const uint8_t **p_pickled, const uint8_t *end); |
190 | extern int as_bin_particle_stack_from_pickled(as_bin *b, cf_ll_buf *particles_llb, const uint8_t **p_pickled, const uint8_t *end); |
191 | extern uint32_t as_bin_particle_client_value_size(const as_bin *b); |
192 | extern uint32_t as_bin_particle_to_client(const as_bin *b, as_msg_op *op); |
193 | extern uint32_t as_bin_particle_pickled_size(const as_bin *b); |
194 | extern uint32_t as_bin_particle_to_pickled(const as_bin *b, uint8_t *pickled); |
195 | |
196 | // Different for blob bitwise operations - we don't use the normal APIs and |
197 | // particle table functions. |
198 | extern int as_bin_bits_read_from_client(const as_bin *b, as_msg_op *op, as_bin *result); |
199 | extern int as_bin_bits_alloc_modify_from_client(as_bin *b, as_msg_op *op); |
200 | extern int as_bin_bits_stack_modify_from_client(as_bin *b, cf_ll_buf *particles_llb, as_msg_op *op); |
201 | |
202 | // Different for CDTs - the operations may return results, so we don't use the |
203 | // normal APIs and particle table functions. |
204 | extern int as_bin_cdt_read_from_client(const as_bin *b, as_msg_op *op, as_bin *result); |
205 | extern int as_bin_cdt_alloc_modify_from_client(as_bin *b, as_msg_op *op, as_bin *result); |
206 | extern int as_bin_cdt_stack_modify_from_client(as_bin *b, cf_ll_buf *particles_llb, as_msg_op *op, as_bin *result); |
207 | |
208 | // as_val: |
209 | extern int as_bin_particle_replace_from_asval(as_bin *b, const as_val *val); |
210 | extern void as_bin_particle_stack_from_asval(as_bin *b, uint8_t* stack, const as_val *val); |
211 | extern as_val *as_bin_particle_to_asval(const as_bin *b); |
212 | |
213 | // msgpack: |
214 | extern int as_bin_particle_alloc_from_msgpack(as_bin *b, const uint8_t *packed, uint32_t packed_size); |
215 | |
216 | // flat: |
217 | extern const uint8_t *as_bin_particle_cast_from_flat(as_bin *b, const uint8_t *flat, const uint8_t *end); |
218 | extern const uint8_t *as_bin_particle_replace_from_flat(as_bin *b, const uint8_t *flat, const uint8_t *end); |
219 | extern uint32_t as_bin_particle_flat_size(as_bin *b); |
220 | extern uint32_t as_bin_particle_to_flat(const as_bin *b, uint8_t *flat); |
221 | |
222 | // odd as_bin particle functions for specific particle types |
223 | |
224 | // integer: |
225 | extern int64_t as_bin_particle_integer_value(const as_bin *b); |
226 | extern void as_bin_particle_integer_set(as_bin *b, int64_t i); |
227 | |
228 | // string: |
229 | extern uint32_t as_bin_particle_string_ptr(const as_bin *b, char **p_value); |
230 | |
231 | // blob: |
232 | extern int as_bin_bits_packed_read(const as_bin *b, const as_msg_op *msg_op, as_bin *result); |
233 | extern int as_bin_bits_packed_modify(as_bin *b, const as_msg_op *msg_op, cf_ll_buf *particles_llb); |
234 | |
235 | // geojson: |
236 | typedef void * geo_region_t; |
237 | #define MAX_REGION_CELLS 256 |
238 | #define MAX_REGION_LEVELS 30 |
239 | extern size_t as_bin_particle_geojson_cellids(const as_bin *b, uint64_t **pp_cells); |
240 | extern bool as_particle_geojson_match(as_particle *p, uint64_t cellid, geo_region_t region, bool is_strict); |
241 | extern bool as_particle_geojson_match_asval(const as_val *val, uint64_t cellid, geo_region_t region, bool is_strict); |
242 | char const *as_geojson_mem_jsonstr(const as_particle *p, size_t *p_jsonsz); |
243 | |
244 | // list: |
245 | struct cdt_payload_s; |
246 | struct rollback_alloc_s; |
247 | extern void as_bin_particle_list_get_packed_val(const as_bin *b, struct cdt_payload_s *packed); |
248 | |
249 | extern int as_bin_cdt_packed_read(const as_bin *b, const as_msg_op *op, as_bin *result); |
250 | extern int as_bin_cdt_packed_modify(as_bin *b, const as_msg_op *op, as_bin *result, cf_ll_buf *particles_llb); |
251 | |
252 | |
253 | /* as_bin |
254 | * A bin container - null name means unused */ |
// Packed layout: a 1-byte particle header, then the particle pointer/value,
// then the multi-bin-only tail (id + pad byte).
struct as_bin_s {
	as_particle iparticle; // 1 byte
	as_particle *particle; // for embedded particle this is value, not pointer

	// Never read or write these bytes in single-bin configuration:
	uint16_t id; // ID of bin name
	uint8_t unused; // pad to 12 bytes (multiple of 4) - legacy
} __attribute__ ((__packed__)) ;
263 | |
264 | // For data-in-memory namespaces in multi-bin mode, we keep an array of as_bin |
265 | // structs in memory, accessed via this struct. |
// Packed: a 16-bit count immediately followed by the as_bin array, with no
// padding between them.
typedef struct as_bin_space_s {
	uint16_t n_bins; // presumably the number of elements in bins[] - confirm against allocator
	as_bin bins[0]; // zero-length trailing array of bins
} __attribute__ ((__packed__)) as_bin_space;
270 | |
271 | // TODO - Do we really need to pad as_bin to 12 bytes for thread safety? |
272 | // Do we ever write & read adjacent as_bin structures in a bins array from |
273 | // different threads when not under the record lock? And if we're worried about |
274 | // 4-byte alignment for that or any other reason, wouldn't we also have to pad |
275 | // after n_bins in as_bin_space? |
276 | |
277 | // For data-in-memory namespaces in multi-bin mode, if we're storing extra |
278 | // record metadata, we access it via this struct. In this case the index points |
279 | // here instead of directly to an as_bin_space. |
// Packed: pointer, then 32-bit key size, then the key bytes, with no padding.
typedef struct as_rec_space_s {
	as_bin_space* bin_space; // the bins this record would otherwise point to directly

	// So far the key is the only extra record metadata we store in memory.
	uint32_t key_size; // presumably the length in bytes of key[] - confirm
	uint8_t key[0]; // zero-length trailing array holding the key
} __attribute__ ((__packed__)) as_rec_space;
287 | |
288 | // For copying as_bin structs without the last 3 bytes. |
289 | static inline void |
290 | as_single_bin_copy(as_bin *to, const as_bin *from) |
291 | { |
292 | to->iparticle = from->iparticle; |
293 | to->particle = from->particle; |
294 | } |
295 | |
296 | static inline bool |
297 | as_bin_inuse(const as_bin *b) |
298 | { |
299 | return (((as_particle_iparticle *)b)->state); |
300 | } |
301 | |
302 | static inline uint8_t |
303 | as_bin_state(const as_bin *b) |
304 | { |
305 | return ((as_particle_iparticle *)b)->state; |
306 | } |
307 | |
308 | static inline void |
309 | as_bin_state_set(as_bin *b, uint8_t val) |
310 | { |
311 | ((as_particle_iparticle *)b)->state = val; |
312 | } |
313 | |
314 | static inline void |
315 | as_bin_state_set_from_type(as_bin *b, as_particle_type type) |
316 | { |
317 | switch (type) { |
318 | case AS_PARTICLE_TYPE_NULL: |
319 | ((as_particle_iparticle *)b)->state = AS_BIN_STATE_UNUSED; |
320 | break; |
321 | case AS_PARTICLE_TYPE_INTEGER: |
322 | ((as_particle_iparticle *)b)->state = AS_BIN_STATE_INUSE_INTEGER; |
323 | break; |
324 | case AS_PARTICLE_TYPE_FLOAT: |
325 | ((as_particle_iparticle *)b)->state = AS_BIN_STATE_INUSE_FLOAT; |
326 | break; |
327 | default: |
328 | ((as_particle_iparticle *)b)->state = AS_BIN_STATE_INUSE_OTHER; |
329 | break; |
330 | } |
331 | } |
332 | |
333 | static inline bool |
334 | as_bin_inuse_has(const as_storage_rd *rd) |
335 | { |
336 | // In-use bins are at the beginning - only need to check the first bin. |
337 | return rd->n_bins != 0 && (rd->pickle != NULL || as_bin_inuse(rd->bins)); |
338 | } |
339 | |
340 | static inline uint16_t |
341 | as_bin_inuse_count(const as_storage_rd *rd) |
342 | { |
343 | for (uint16_t i = 0; i < rd->n_bins; i++) { |
344 | if (! as_bin_inuse(&rd->bins[i])) { |
345 | return i; |
346 | } |
347 | } |
348 | |
349 | return rd->n_bins; |
350 | } |
351 | |
352 | static inline void |
353 | as_bin_set_empty(as_bin *b) |
354 | { |
355 | as_bin_state_set(b, AS_BIN_STATE_UNUSED); |
356 | } |
357 | |
358 | static inline void |
359 | as_bin_set_empty_shift(as_storage_rd *rd, uint32_t i) |
360 | { |
361 | // Shift the bins over, so there's no space between used bins. |
362 | // This can overwrite the "emptied" bin, and that's fine. |
363 | |
364 | uint16_t j; |
365 | |
366 | for (j = i + 1; j < rd->n_bins; j++) { |
367 | if (! as_bin_inuse(&rd->bins[j])) { |
368 | break; |
369 | } |
370 | } |
371 | |
372 | uint16_t n = j - (i + 1); |
373 | |
374 | if (n) { |
375 | memmove(&rd->bins[i], &rd->bins[i + 1], n * sizeof(as_bin)); |
376 | } |
377 | |
378 | // Mark the last bin that was *formerly* in use as null. |
379 | as_bin_set_empty(&rd->bins[j - 1]); |
380 | } |
381 | |
382 | static inline void |
383 | as_bin_set_empty_from(as_storage_rd *rd, uint16_t from) { |
384 | for (uint16_t i = from; i < rd->n_bins; i++) { |
385 | as_bin_set_empty(&rd->bins[i]); |
386 | } |
387 | } |
388 | |
389 | static inline void |
390 | as_bin_set_all_empty(as_storage_rd *rd) { |
391 | as_bin_set_empty_from(rd, 0); |
392 | } |
393 | |
394 | static inline bool |
395 | as_bin_is_embedded_particle(const as_bin *b) { |
396 | return ((as_particle_iparticle *)b)->state == AS_BIN_STATE_INUSE_INTEGER || |
397 | ((as_particle_iparticle *)b)->state == AS_BIN_STATE_INUSE_FLOAT; |
398 | } |
399 | |
400 | static inline bool |
401 | as_bin_is_external_particle(const as_bin *b) { |
402 | return ((as_particle_iparticle *)b)->state == AS_BIN_STATE_INUSE_OTHER; |
403 | } |
404 | |
405 | static inline as_particle * |
406 | as_bin_get_particle(as_bin *b) { |
407 | return as_bin_is_embedded_particle(b) ? &b->iparticle : b->particle; |
408 | } |
409 | |
410 | // "Embedded" types like integer are stored directly, but other bin types |
411 | // ("other") must follow an indirection to get the actual type. |
412 | static inline uint8_t |
413 | as_bin_get_particle_type(const as_bin *b) { |
414 | switch (((as_particle_iparticle *)b)->state) { |
415 | case AS_BIN_STATE_INUSE_INTEGER: |
416 | return AS_PARTICLE_TYPE_INTEGER; |
417 | case AS_BIN_STATE_INUSE_FLOAT: |
418 | return AS_PARTICLE_TYPE_FLOAT; |
419 | case AS_BIN_STATE_INUSE_OTHER: |
420 | return b->particle->metadata; |
421 | default: |
422 | return AS_PARTICLE_TYPE_NULL; |
423 | } |
424 | } |
425 | |
426 | |
427 | /* Bin function declarations */ |
428 | extern int16_t as_bin_get_id(as_namespace *ns, const char *name); |
429 | extern bool as_bin_get_or_assign_id_w_len(as_namespace *ns, const char *name, size_t len, uint16_t *id); |
430 | extern const char* as_bin_get_name_from_id(as_namespace *ns, uint16_t id); |
431 | extern bool as_bin_name_within_quota(as_namespace *ns, const char *name); |
432 | extern void as_bin_copy(as_namespace *ns, as_bin *to, const as_bin *from); |
433 | extern int as_storage_rd_load_n_bins(as_storage_rd *rd); |
434 | extern int as_storage_rd_load_bins(as_storage_rd *rd, as_bin *stack_bins); |
435 | extern void as_bin_get_all_p(as_storage_rd *rd, as_bin **bin_ptrs); |
436 | extern as_bin *as_bin_get_by_id(as_storage_rd *rd, uint32_t id); |
437 | extern as_bin *as_bin_get(as_storage_rd *rd, const char *name); |
438 | extern as_bin *as_bin_get_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len); |
439 | extern as_bin *as_bin_create_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len, int *result); |
440 | extern as_bin *as_bin_get_or_create(as_storage_rd *rd, const char *name); |
441 | extern as_bin *as_bin_get_or_create_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len, int *result); |
442 | extern int32_t as_bin_get_index(as_storage_rd *rd, const char *name); |
443 | extern int32_t as_bin_get_index_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len); |
444 | extern void as_bin_destroy(as_storage_rd *rd, uint16_t i); |
445 | extern void as_bin_allocate_bin_space(as_storage_rd *rd, int32_t delta); |
446 | |
447 | |
// Policy selector passed to as_record_resolve_conflict() and stored in the
// namespace's conflict_resolution_policy field.
typedef enum {
	AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_UNDEF = 0,
	AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_GENERATION = 1, // presumably compare generations - confirm in record.c
	AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_LAST_UPDATE_TIME = 2, // presumably compare last-update-times - confirm
	AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_CP = 3 // NOTE(review): likely pairs with record_resolve_conflict_cp() (enterprise split) - confirm
} conflict_resolution_pol;
454 | |
455 | /* Record function declarations */ |
456 | extern uint32_t clock_skew_stop_writes_sec(); |
457 | extern bool as_record_handle_clock_skew(as_namespace* ns, uint64_t skew_ms); |
458 | extern uint16_t plain_generation(uint16_t regime_generation, const as_namespace* ns); |
459 | extern void as_record_set_lut(as_record *r, uint32_t regime, uint64_t now_ms, const as_namespace* ns); |
460 | extern void as_record_increment_generation(as_record *r, const as_namespace* ns); |
461 | extern bool as_record_is_live(const as_record *r); |
462 | extern int as_record_get_create(struct as_index_tree_s *tree, const cf_digest *keyd, as_index_ref *r_ref, as_namespace *ns); |
463 | extern int as_record_get(struct as_index_tree_s *tree, const cf_digest *keyd, as_index_ref *r_ref); |
464 | extern int as_record_get_live(struct as_index_tree_s *tree, const cf_digest *keyd, as_index_ref *r_ref, as_namespace *ns); |
465 | extern int as_record_exists(struct as_index_tree_s *tree, const cf_digest *keyd); |
466 | extern int as_record_exists_live(struct as_index_tree_s *tree, const cf_digest *keyd, as_namespace *ns); |
467 | extern void as_record_rescue(as_index_ref *r_ref, as_namespace *ns); |
468 | |
469 | extern void as_record_destroy_bins_from(as_storage_rd *rd, uint16_t from); |
470 | extern void as_record_destroy_bins(as_storage_rd *rd); |
471 | extern void as_record_free_bin_space(as_record *r); |
472 | |
473 | extern void as_record_destroy(as_record *r, as_namespace *ns); |
474 | extern void as_record_done(as_index_ref *r_ref, as_namespace *ns); |
475 | |
476 | void as_record_drop_stats(as_record* r, as_namespace* ns); |
477 | |
478 | extern void as_record_finalize_key(as_record* r, as_namespace* ns, const uint8_t* key, uint32_t key_size); |
479 | extern void as_record_allocate_key(as_record* r, const uint8_t* key, uint32_t key_size); |
480 | extern int as_record_resolve_conflict(conflict_resolution_pol policy, uint16_t left_gen, uint64_t left_lut, uint16_t right_gen, uint64_t right_lut); |
481 | extern uint8_t *as_record_pickle(as_storage_rd *rd, size_t *len_r); |
482 | extern int as_record_write_from_pickle(as_storage_rd *rd); |
483 | extern int as_record_set_set_from_msg(as_record *r, as_namespace *ns, as_msg *m); |
484 | |
// Report whether the first two bytes of a pickle buffer are both zero, i.e.
// the leading 16-bit field (presumably the bin count - confirm against the
// pickle format) is 0.
//
// buf must point to at least 2 readable bytes.
//
// The check is done byte-wise rather than through a uint16_t pointer: buf is
// a plain byte pointer with no alignment guarantee, and a 16-bit zero is two
// zero bytes in any byte order, so the result is the same without a
// misaligned or type-punned load.
static inline bool
as_record_pickle_is_binless(const uint8_t *buf)
{
	return buf[0] == 0 && buf[1] == 0;
}
490 | |
491 | // For enterprise split only. |
492 | int record_resolve_conflict_cp(uint16_t left_gen, uint64_t left_lut, uint16_t right_gen, uint64_t right_lut); |
493 | |
// Three-way compare of two last-update-times.
// Returns 1 if right is newer (greater), -1 if left is newer, 0 if equal.
static inline int
resolve_last_update_time(uint64_t left, uint64_t right)
{
	if (right > left) {
		return 1;
	}

	if (right < left) {
		return -1;
	}

	return 0;
}
499 | |
// Argument bundle for as_record_replace_if_better(). Presumably describes a
// record received from another node (see 'src') - confirm against callers.
typedef struct as_remote_record_s {
	cf_node src; // presumably the originating node - confirm
	as_partition_reservation *rsv;
	cf_digest *keyd;

	// Pickled record buffer and, presumably, its size in bytes.
	uint8_t *pickle;
	size_t pickle_sz;

	// Record metadata.
	uint32_t generation;
	uint32_t void_time;
	uint64_t last_update_time;

	// Set name is carried with an explicit length.
	// NOTE(review): may not be null-terminated - confirm.
	const char *set_name;
	size_t set_name_len;

	// Optional stored key and, presumably, its size in bytes.
	const uint8_t *key;
	size_t key_size;

	bool is_old_pickle; // TODO - old pickle - remove in "six months"

	uint16_t n_bins;
	as_flat_comp_meta cm;
	uint32_t meta_sz;

	uint8_t repl_state; // relevant only for enterprise edition
} as_remote_record;
526 | |
527 | int as_record_replace_if_better(as_remote_record *rr, bool is_repl_write, bool skip_sindex, bool do_xdr_write); |
528 | |
529 | // a simpler call that gives seconds in the right epoch |
530 | #define as_record_void_time_get() cf_clepoch_seconds() |
531 | bool as_record_is_expired(const as_record *r); // TODO - eventually inline |
532 | |
533 | static inline bool |
534 | as_record_is_doomed(const as_record *r, struct as_namespace_s *ns) |
535 | { |
536 | return as_record_is_expired(r) || as_truncate_record_is_truncated(r, ns); |
537 | } |
538 | |
539 | #define AS_SINDEX_MAX 256 |
540 | |
// Bounds and default for the number of partitions per secondary index.
#define MIN_PARTITIONS_PER_INDEX 1
#define MAX_PARTITIONS_PER_INDEX 256
#define DEFAULT_PARTITIONS_PER_INDEX 32
#define MAX_PARTITIONS_PER_INDEX_CHAR 3 // Number of characters in max partitions per index
545 | |
546 | // as_sindex structure which hangs from the ns. |
547 | #define AS_SINDEX_INACTIVE 1 // On init, pre-loading |
548 | #define AS_SINDEX_ACTIVE 2 // On creation and afterwards |
549 | #define AS_SINDEX_DESTROY 3 // On destroy |
550 | // dummy sindex state when ai_btree_create() returns error this |
551 | // sindex is not available for any of the DML operations |
552 | #define AS_SINDEX_NOTCREATED 4 // Un-used flag |
553 | #define AS_SINDEX_FLAG_WACTIVE 0x01 // On ai btree create of sindex, never reset |
554 | #define AS_SINDEX_FLAG_RACTIVE 0x02 // When sindex scan of database is completed |
555 | #define AS_SINDEX_FLAG_DESTROY_CLEANUP 0x04 // Called for AI clean-up during si deletion |
556 | #define AS_SINDEX_FLAG_MIGRATE_CLEANUP 0x08 // Un-used |
557 | #define AS_SINDEX_FLAG_POPULATING 0x10 // Indicates current si scan job, reset when scan is done. |
558 | |
559 | struct as_sindex_s; |
560 | struct as_sindex_config_s; |
561 | |
562 | #define AS_SET_MAX_COUNT 0x3FF // ID's 10 bits worth minus 1 (ID 0 means no set) |
// Number of uint32_t words needed to hold one bit per possible bin ID
// (MAX_BIN_NAMES bits). Used to size the namespace's binid_has_sindex array.
// The expansion is fully parenthesized so it behaves as a single operand
// inside any larger expression (e.g. x / AS_BINID_HAS_SINDEX_SIZE).
#define AS_BINID_HAS_SINDEX_SIZE (MAX_BIN_NAMES / (sizeof(uint32_t) * CHAR_BIT))
564 | |
565 | |
566 | // TODO - would be nice to put this in as_index.h: |
567 | // Callback invoked when as_index is destroyed. |
568 | typedef void (*as_index_value_destructor) (struct as_index_s* v, void* udata); |
569 | |
570 | // TODO - would be nice to put this in as_index.h: |
// Partition tree information common to a whole namespace - embedded in
// as_namespace as 'tree_shared'.
typedef struct as_index_tree_shared_s {
	cf_arenax* arena;

	// Callback (and its opaque argument) invoked when an as_index is
	// destroyed.
	as_index_value_destructor destructor;
	void* destructor_udata;

	// Number of sprigs per partition tree.
	uint32_t n_sprigs;

	// Bit-shifts used to calculate indexes from digest bits.
	uint32_t locks_shift;
	uint32_t sprigs_shift;

	// Offsets into as_index_tree struct's variable-sized data.
	uint32_t sprigs_offset;
	uint32_t puddles_offset;
} as_index_tree_shared;
588 | |
589 | |
// Packed single-field struct: one 40-bit value.
// NOTE(review): root_h is presumably an arena handle for a sprig's root
// element - confirm against the index code.
typedef struct as_sprigx_s {
	uint64_t root_h: 40;
} __attribute__ ((__packed__)) as_sprigx;
593 | |
// Partition tree info kept in the persistent memory "treex" block (the
// namespace points to it via xmem_trees).
typedef struct as_treex_s {
	int block_ix[AS_PARTITIONS]; // one entry per partition; meaning of the index not visible here - TODO confirm
	as_sprigx sprigxs[0]; // zero-length trailing array of as_sprigx entries
} as_treex;
598 | |
599 | |
600 | struct as_namespace_s { |
601 | //-------------------------------------------- |
602 | // Data partitions - first, to 64-byte align. |
603 | // |
604 | |
605 | as_partition partitions[AS_PARTITIONS]; |
606 | |
607 | //-------------------------------------------- |
608 | // Name & ID. |
609 | // |
610 | |
611 | char name[AS_ID_NAMESPACE_SZ]; |
612 | uint32_t id; // this is 1-based |
613 | uint32_t namehash; |
614 | |
615 | //-------------------------------------------- |
616 | // Persistent memory. |
617 | // |
618 | |
619 | // Persistent memory type (default is shmem). |
620 | cf_xmem_type xmem_type; |
621 | const void* xmem_type_cfg; |
622 | |
623 | // Persistent memory "base" block ID for this namespace. |
624 | uint32_t xmem_id; |
625 | |
626 | // Pointer to the persistent memory "base" block. |
627 | uint8_t* xmem_base; |
628 | |
629 | // Pointer to partition tree info in persistent memory "treex" block. |
630 | as_treex* xmem_trees; |
631 | |
632 | // Pointer to arena structure (not stages) in persistent memory base block. |
633 | cf_arenax* arena; |
634 | |
635 | // Pointer to bin name vmap in persistent memory base block. |
636 | cf_vmapx* p_bin_name_vmap; |
637 | |
638 | // Pointer to set information vmap in persistent memory base block. |
639 | cf_vmapx* p_sets_vmap; |
640 | |
641 | // Temporary array of sets to hold config values until sets vmap is ready. |
642 | as_set* sets_cfg_array; |
643 | uint32_t sets_cfg_count; |
644 | |
645 | // Configuration flags relevant for warm or cool restart. |
646 | uint32_t xmem_flags; |
647 | |
648 | //-------------------------------------------- |
649 | // Cold start. |
650 | // |
651 | |
652 | // If true, read storage devices to build index at startup. |
653 | bool cold_start; |
654 | |
655 | // If true, device headers indicate previous shutdown was not clean. |
656 | bool dirty_restart; |
657 | |
658 | // Flag for ticker during initial loading of records from device. |
659 | bool loading_records; |
660 | |
661 | // For cold start eviction. |
662 | cf_mutex cold_start_evict_lock; |
663 | uint32_t cold_start_record_add_count; |
664 | uint32_t cold_start_now; |
665 | |
666 | // For sanity checking at startup (also used during warm or cool restart). |
667 | uint32_t startup_max_void_time; |
668 | |
669 | //-------------------------------------------- |
670 | // Memory management. |
671 | // |
672 | |
673 | // JEMalloc arena to be used for long-term storage in this namespace (-1 if nonexistent.) |
674 | int jem_arena; |
675 | |
676 | // Cached partition ownership info for clients. |
677 | client_replica_map* replica_maps; |
678 | |
679 | // Common partition tree information. Contains two configuration items. |
680 | as_index_tree_shared tree_shared; |
681 | |
682 | //-------------------------------------------- |
683 | // Storage management. |
684 | // |
685 | |
686 | // This is typecast to (drv_ssds*) in storage code. |
687 | void* storage_private; |
688 | |
689 | uint64_t ssd_size; // discovered (and rounded) size of drive |
690 | int storage_last_avail_pct; // most recently calculated available percent |
691 | int storage_max_write_q; // storage_max_write_cache is converted to this |
692 | uint32_t saved_defrag_sleep; // restore after defrag at startup is done |
693 | uint32_t defrag_lwm_size; // storage_defrag_lwm_pct % of storage_write_block_size |
694 | |
695 | // For data-not-in-memory, we optionally cache swbs after writing to device. |
696 | // To track fraction of reads from cache: |
697 | cf_atomic32 n_reads_from_cache; |
698 | cf_atomic32 n_reads_from_device; |
699 | |
700 | uint8_t storage_encryption_key[64]; |
701 | |
702 | //-------------------------------------------- |
703 | // Eviction. |
704 | // |
705 | |
706 | uint32_t smd_evict_void_time; |
707 | uint32_t evict_void_time; |
708 | |
709 | //-------------------------------------------- |
710 | // Truncate records. |
711 | // |
712 | |
713 | as_truncate truncate; |
714 | |
715 | //-------------------------------------------- |
716 | // Secondary index. |
717 | // |
718 | |
719 | int sindex_cnt; |
720 | uint32_t n_setless_sindexes; |
721 | struct as_sindex_s* sindex; // array with AS_MAX_SINDEX metadata |
722 | cf_shash* sindex_set_binid_hash; |
723 | cf_shash* sindex_iname_hash; |
724 | uint32_t binid_has_sindex[AS_BINID_HAS_SINDEX_SIZE]; |
725 | |
726 | //-------------------------------------------- |
727 | // Configuration. |
728 | // |
729 | |
730 | uint32_t cfg_replication_factor; |
731 | uint32_t replication_factor; // indirect config - can become less than cfg_replication_factor |
732 | uint64_t memory_size; |
733 | uint32_t default_ttl; |
734 | |
735 | bool enable_xdr; |
736 | bool sets_enable_xdr; // namespace-level flag to enable set-based xdr shipping |
737 | bool ns_forward_xdr_writes; // namespace-level flag to enable forwarding of xdr writes |
738 | bool ns_allow_nonxdr_writes; // namespace-level flag to allow nonxdr writes or not |
739 | bool ns_allow_xdr_writes; // namespace-level flag to allow xdr writes or not |
740 | |
741 | uint32_t background_scan_max_rps; |
742 | conflict_resolution_pol conflict_resolution_policy; |
743 | bool cp; // relevant only for enterprise edition |
744 | bool cp_allow_drops; // relevant only for enterprise edition |
745 | bool data_in_index; // with single-bin, allows warm restart for data-in-memory (with storage-engine device) |
746 | bool cold_start_eviction_disabled; |
747 | bool write_dup_res_disabled; |
748 | bool disallow_null_setname; |
749 | bool batch_sub_benchmarks_enabled; |
750 | bool ops_sub_benchmarks_enabled; |
751 | bool read_benchmarks_enabled; |
752 | bool udf_benchmarks_enabled; |
753 | bool udf_sub_benchmarks_enabled; |
754 | bool write_benchmarks_enabled; |
755 | bool proxy_hist_enabled; |
756 | uint32_t evict_hist_buckets; |
757 | uint32_t evict_tenths_pct; |
758 | uint32_t hwm_disk_pct; |
759 | uint32_t hwm_memory_pct; |
760 | uint64_t index_stage_size; |
761 | uint32_t migrate_order; |
762 | uint32_t migrate_retransmit_ms; |
763 | uint32_t migrate_sleep; |
764 | uint32_t nsup_hist_period; |
765 | uint32_t nsup_period; |
766 | uint32_t n_nsup_threads; |
767 | bool cfg_prefer_uniform_balance; // relevant only for enterprise edition |
768 | bool prefer_uniform_balance; // indirect config - can become disabled if any other node reports disabled |
769 | uint32_t rack_id; |
770 | as_read_consistency_level read_consistency_level; |
771 | bool single_bin; // restrict the namespace to objects with exactly one bin |
772 | uint32_t n_single_scan_threads; |
773 | uint32_t stop_writes_pct; |
774 | uint32_t tomb_raider_eligible_age; // relevant only for enterprise edition |
775 | uint32_t tomb_raider_period; // relevant only for enterprise edition |
776 | uint32_t transaction_pending_limit; // 0 means no limit |
777 | uint32_t n_truncate_threads; |
778 | as_write_commit_level write_commit_level; |
779 | cf_vector xdr_dclist_v; |
780 | |
781 | const char* xmem_mounts[CF_XMEM_MAX_MOUNTS]; |
782 | uint32_t n_xmem_mounts; // indirect config |
783 | uint32_t mounts_hwm_pct; |
784 | uint64_t mounts_size_limit; |
785 | |
786 | as_storage_type storage_type; |
787 | |
788 | const char* storage_devices[AS_STORAGE_MAX_DEVICES]; |
789 | uint32_t n_storage_devices; // indirect config - if devices array contains raw devices (or partitions) |
790 | uint32_t n_storage_files; // indirect config - if devices array contains files |
791 | const char* storage_shadows[AS_STORAGE_MAX_DEVICES]; |
792 | uint32_t n_storage_shadows; // indirect config |
793 | uint64_t storage_filesize; |
794 | char* storage_scheduler_mode; // relevant for devices only, not files |
795 | uint32_t storage_write_block_size; |
796 | bool storage_data_in_memory; |
797 | |
798 | bool storage_cold_start_empty; |
799 | bool storage_commit_to_device; // relevant only for enterprise edition |
800 | uint32_t storage_commit_min_size; // relevant only for enterprise edition |
801 | as_compression_method storage_compression; // relevant only for enterprise edition |
802 | uint32_t storage_compression_level; // relevant only for enterprise edition |
803 | uint32_t storage_defrag_lwm_pct; |
804 | uint32_t storage_defrag_queue_min; |
805 | uint32_t storage_defrag_sleep; |
806 | int storage_defrag_startup_minimum; |
807 | bool storage_direct_files; |
808 | bool storage_disable_odsync; |
809 | bool storage_benchmarks_enabled; // histograms are per-drive except device-read-size & device-write-size |
810 | as_encryption_method storage_encryption; // relevant only for enterprise edition |
811 | char* storage_encryption_key_file; // relevant only for enterprise edition |
812 | uint64_t storage_flush_max_us; |
813 | uint64_t storage_max_write_cache; |
814 | uint32_t storage_min_avail_pct; |
815 | cf_atomic32 storage_post_write_queue; // number of swbs/device held after writing to device |
816 | bool storage_read_page_cache; |
817 | bool storage_serialize_tomb_raider; // relevant only for enterprise edition |
818 | uint32_t storage_tomb_raider_sleep; // relevant only for enterprise edition |
819 | |
820 | uint32_t sindex_num_partitions; |
821 | |
822 | bool geo2dsphere_within_strict; |
823 | uint16_t geo2dsphere_within_min_level; |
824 | uint16_t geo2dsphere_within_max_level; |
825 | uint16_t geo2dsphere_within_max_cells; |
826 | uint16_t geo2dsphere_within_level_mod; |
827 | uint32_t geo2dsphere_within_earth_radius_meters; |
828 | |
829 | //-------------------------------------------- |
830 | // Statistics and histograms. |
831 | // |
832 | |
833 | // Object counts. |
834 | |
835 | cf_atomic64 n_objects; |
836 | cf_atomic64 n_tombstones; // relevant only for enterprise edition |
837 | |
838 | // Consistency info. |
839 | |
840 | uint32_t n_dead_partitions; |
841 | uint32_t n_unavailable_partitions; |
842 | bool clock_skew_stop_writes; |
843 | |
844 | // Expiration & eviction (nsup) stats. |
845 | |
846 | bool stop_writes; |
847 | bool hwm_breached; |
848 | |
849 | uint64_t non_expirable_objects; |
850 | |
851 | uint64_t n_expired_objects; |
852 | uint64_t n_evicted_objects; |
853 | |
854 | int32_t evict_ttl; // signed - possible (but weird) it's negative |
855 | |
856 | uint32_t nsup_cycle_duration; // seconds taken for most recent nsup cycle |
857 | |
858 | // Memory usage stats. |
859 | |
860 | cf_atomic_int n_bytes_memory; |
861 | cf_atomic64 n_bytes_sindex_memory; |
862 | |
863 | // Persistent storage stats. |
864 | |
865 | double comp_avg_orig_sz; // relevant only for enterprise edition |
866 | double comp_avg_comp_sz; // relevant only for enterprise edition |
867 | float cache_read_pct; |
868 | |
869 | // Migration stats. |
870 | |
871 | cf_atomic_int migrate_tx_partitions_imbalance; // debug only |
872 | cf_atomic_int migrate_tx_instance_count; // debug only |
873 | cf_atomic_int migrate_rx_instance_count; // debug only |
874 | cf_atomic_int migrate_tx_partitions_active; |
875 | cf_atomic_int migrate_rx_partitions_active; |
876 | cf_atomic_int migrate_tx_partitions_initial; |
877 | cf_atomic_int migrate_tx_partitions_remaining; |
878 | cf_atomic_int migrate_tx_partitions_lead_remaining; |
879 | cf_atomic_int migrate_rx_partitions_initial; |
880 | cf_atomic_int migrate_rx_partitions_remaining; |
881 | cf_atomic_int migrate_signals_active; |
882 | cf_atomic_int migrate_signals_remaining; |
883 | cf_atomic_int appeals_tx_active; // relevant only for enterprise edition |
884 | cf_atomic_int appeals_rx_active; // relevant only for enterprise edition |
885 | cf_atomic_int appeals_tx_remaining; // relevant only for enterprise edition |
886 | |
887 | // Per-record migration stats: |
888 | cf_atomic_int migrate_records_skipped; // relevant only for enterprise edition |
889 | cf_atomic_int migrate_records_transmitted; |
890 | cf_atomic_int migrate_record_retransmits; |
891 | cf_atomic_int migrate_record_receives; |
892 | cf_atomic_int appeals_records_exonerated; // relevant only for enterprise edition |
893 | |
894 | // From-client transaction stats. |
895 | |
896 | cf_atomic64 n_client_tsvc_error; |
897 | cf_atomic64 n_client_tsvc_timeout; |
898 | |
899 | cf_atomic64 n_client_proxy_complete; |
900 | cf_atomic64 n_client_proxy_error; |
901 | cf_atomic64 n_client_proxy_timeout; |
902 | |
903 | cf_atomic64 n_client_read_success; |
904 | cf_atomic64 n_client_read_error; |
905 | cf_atomic64 n_client_read_timeout; |
906 | cf_atomic64 n_client_read_not_found; |
907 | cf_atomic64 n_client_read_filtered_out; |
908 | |
909 | cf_atomic64 n_client_write_success; |
910 | cf_atomic64 n_client_write_error; |
911 | cf_atomic64 n_client_write_timeout; |
912 | cf_atomic64 n_client_write_filtered_out; |
913 | |
914 | // Subset of n_client_write_... above, respectively. |
915 | cf_atomic64 n_xdr_client_write_success; |
916 | cf_atomic64 n_xdr_client_write_error; |
917 | cf_atomic64 n_xdr_client_write_timeout; |
918 | |
919 | cf_atomic64 n_client_delete_success; |
920 | cf_atomic64 n_client_delete_error; |
921 | cf_atomic64 n_client_delete_timeout; |
922 | cf_atomic64 n_client_delete_not_found; |
923 | cf_atomic64 n_client_delete_filtered_out; |
924 | |
925 | // Subset of n_client_delete_... above, respectively. |
926 | cf_atomic64 n_xdr_client_delete_success; |
927 | cf_atomic64 n_xdr_client_delete_error; |
928 | cf_atomic64 n_xdr_client_delete_timeout; |
929 | cf_atomic64 n_xdr_client_delete_not_found; |
930 | |
931 | cf_atomic64 n_client_udf_complete; |
932 | cf_atomic64 n_client_udf_error; |
933 | cf_atomic64 n_client_udf_timeout; |
934 | cf_atomic64 n_client_udf_filtered_out; |
935 | |
936 | cf_atomic64 n_client_lang_read_success; |
937 | cf_atomic64 n_client_lang_write_success; |
938 | cf_atomic64 n_client_lang_delete_success; |
939 | cf_atomic64 n_client_lang_error; |
940 | |
941 | // From-proxy transaction stats. |
942 | |
943 | cf_atomic64 n_from_proxy_tsvc_error; |
944 | cf_atomic64 n_from_proxy_tsvc_timeout; |
945 | |
946 | cf_atomic64 n_from_proxy_read_success; |
947 | cf_atomic64 n_from_proxy_read_error; |
948 | cf_atomic64 n_from_proxy_read_timeout; |
949 | cf_atomic64 n_from_proxy_read_not_found; |
950 | cf_atomic64 n_from_proxy_read_filtered_out; |
951 | |
952 | cf_atomic64 n_from_proxy_write_success; |
953 | cf_atomic64 n_from_proxy_write_error; |
954 | cf_atomic64 n_from_proxy_write_timeout; |
955 | cf_atomic64 n_from_proxy_write_filtered_out; |
956 | |
957 | // Subset of n_from_proxy_write_... above, respectively. |
958 | cf_atomic64 n_xdr_from_proxy_write_success; |
959 | cf_atomic64 n_xdr_from_proxy_write_error; |
960 | cf_atomic64 n_xdr_from_proxy_write_timeout; |
961 | |
962 | cf_atomic64 n_from_proxy_delete_success; |
963 | cf_atomic64 n_from_proxy_delete_error; |
964 | cf_atomic64 n_from_proxy_delete_timeout; |
965 | cf_atomic64 n_from_proxy_delete_not_found; |
966 | cf_atomic64 n_from_proxy_delete_filtered_out; |
967 | |
968 | // Subset of n_from_proxy_delete_... above, respectively. |
969 | cf_atomic64 n_xdr_from_proxy_delete_success; |
970 | cf_atomic64 n_xdr_from_proxy_delete_error; |
971 | cf_atomic64 n_xdr_from_proxy_delete_timeout; |
972 | cf_atomic64 n_xdr_from_proxy_delete_not_found; |
973 | |
974 | cf_atomic64 n_from_proxy_udf_complete; |
975 | cf_atomic64 n_from_proxy_udf_error; |
976 | cf_atomic64 n_from_proxy_udf_timeout; |
977 | cf_atomic64 n_from_proxy_udf_filtered_out; |
978 | |
979 | cf_atomic64 n_from_proxy_lang_read_success; |
980 | cf_atomic64 n_from_proxy_lang_write_success; |
981 | cf_atomic64 n_from_proxy_lang_delete_success; |
982 | cf_atomic64 n_from_proxy_lang_error; |
983 | |
984 | // Batch sub-transaction stats. |
985 | |
986 | cf_atomic64 n_batch_sub_tsvc_error; |
987 | cf_atomic64 n_batch_sub_tsvc_timeout; |
988 | |
989 | cf_atomic64 n_batch_sub_proxy_complete; |
990 | cf_atomic64 n_batch_sub_proxy_error; |
991 | cf_atomic64 n_batch_sub_proxy_timeout; |
992 | |
993 | cf_atomic64 n_batch_sub_read_success; |
994 | cf_atomic64 n_batch_sub_read_error; |
995 | cf_atomic64 n_batch_sub_read_timeout; |
996 | cf_atomic64 n_batch_sub_read_not_found; |
997 | cf_atomic64 n_batch_sub_read_filtered_out; |
998 | |
999 | // From-proxy batch sub-transaction stats. |
1000 | |
1001 | cf_atomic64 n_from_proxy_batch_sub_tsvc_error; |
1002 | cf_atomic64 n_from_proxy_batch_sub_tsvc_timeout; |
1003 | |
1004 | cf_atomic64 n_from_proxy_batch_sub_read_success; |
1005 | cf_atomic64 n_from_proxy_batch_sub_read_error; |
1006 | cf_atomic64 n_from_proxy_batch_sub_read_timeout; |
1007 | cf_atomic64 n_from_proxy_batch_sub_read_not_found; |
1008 | cf_atomic64 n_from_proxy_batch_sub_read_filtered_out; |
1009 | |
1010 | // Internal-UDF sub-transaction stats. |
1011 | |
1012 | cf_atomic64 n_udf_sub_tsvc_error; |
1013 | cf_atomic64 n_udf_sub_tsvc_timeout; |
1014 | |
1015 | cf_atomic64 n_udf_sub_udf_complete; |
1016 | cf_atomic64 n_udf_sub_udf_error; |
1017 | cf_atomic64 n_udf_sub_udf_timeout; |
1018 | uint64_t n_udf_sub_udf_filtered_out; |
1019 | |
1020 | cf_atomic64 n_udf_sub_lang_read_success; |
1021 | cf_atomic64 n_udf_sub_lang_write_success; |
1022 | cf_atomic64 n_udf_sub_lang_delete_success; |
1023 | cf_atomic64 n_udf_sub_lang_error; |
1024 | |
1025 | // Internal-ops sub-transaction stats. |
1026 | |
1027 | cf_atomic64 n_ops_sub_tsvc_error; |
1028 | cf_atomic64 n_ops_sub_tsvc_timeout; |
1029 | |
1030 | cf_atomic64 n_ops_sub_write_success; |
1031 | cf_atomic64 n_ops_sub_write_error; |
1032 | cf_atomic64 n_ops_sub_write_timeout; |
1033 | uint64_t n_ops_sub_write_filtered_out; |
1034 | |
1035 | // Transaction retransmit stats - 'all' means both client & proxy origins. |
1036 | |
1037 | uint64_t n_retransmit_all_read_dup_res; |
1038 | |
1039 | uint64_t n_retransmit_all_write_dup_res; |
1040 | uint64_t n_retransmit_all_write_repl_write; |
1041 | |
1042 | uint64_t n_retransmit_all_delete_dup_res; |
1043 | uint64_t n_retransmit_all_delete_repl_write; |
1044 | |
1045 | uint64_t n_retransmit_all_udf_dup_res; |
1046 | uint64_t n_retransmit_all_udf_repl_write; |
1047 | |
1048 | uint64_t n_retransmit_all_batch_sub_dup_res; |
1049 | |
1050 | uint64_t n_retransmit_udf_sub_dup_res; |
1051 | uint64_t n_retransmit_udf_sub_repl_write; |
1052 | |
1053 | uint64_t n_retransmit_ops_sub_dup_res; |
1054 | uint64_t n_retransmit_ops_sub_repl_write; |
1055 | |
1056 | // Scan stats. |
1057 | |
1058 | uint64_t n_scan_basic_complete; |
1059 | uint64_t n_scan_basic_error; |
1060 | uint64_t n_scan_basic_abort; |
1061 | |
1062 | uint64_t n_scan_aggr_complete; |
1063 | uint64_t n_scan_aggr_error; |
1064 | uint64_t n_scan_aggr_abort; |
1065 | |
1066 | uint64_t n_scan_udf_bg_complete; |
1067 | uint64_t n_scan_udf_bg_error; |
1068 | uint64_t n_scan_udf_bg_abort; |
1069 | |
1070 | uint64_t n_scan_ops_bg_complete; |
1071 | uint64_t n_scan_ops_bg_error; |
1072 | uint64_t n_scan_ops_bg_abort; |
1073 | |
1074 | // Query stats. |
1075 | |
1076 | cf_atomic64 query_reqs; |
1077 | cf_atomic64 query_fail; |
1078 | cf_atomic64 query_short_queue_full; |
1079 | cf_atomic64 query_long_queue_full; |
1080 | cf_atomic64 query_short_reqs; |
1081 | cf_atomic64 query_long_reqs; |
1082 | |
1083 | cf_atomic64 n_lookup; |
1084 | cf_atomic64 n_lookup_success; |
1085 | cf_atomic64 n_lookup_abort; |
1086 | cf_atomic64 n_lookup_errs; |
1087 | cf_atomic64 lookup_response_size; |
1088 | cf_atomic64 lookup_num_records; |
1089 | |
1090 | cf_atomic64 n_aggregation; |
1091 | cf_atomic64 n_agg_success; |
1092 | cf_atomic64 n_agg_abort; |
1093 | cf_atomic64 n_agg_errs; |
1094 | cf_atomic64 agg_response_size; |
1095 | cf_atomic64 agg_num_records; |
1096 | |
1097 | cf_atomic64 n_query_udf_bg_success; |
1098 | cf_atomic64 n_query_udf_bg_failure; |
1099 | |
1100 | cf_atomic64 n_query_ops_bg_success; |
1101 | cf_atomic64 n_query_ops_bg_failure; |
1102 | |
1103 | // Geospatial query stats: |
1104 | cf_atomic64 geo_region_query_count; // number of region queries |
1105 | cf_atomic64 geo_region_query_cells; // number of cells used by region queries |
1106 | cf_atomic64 geo_region_query_points; // number of valid points found |
1107 | cf_atomic64 geo_region_query_falsepos; // number of false positives found |
1108 | |
1109 | // Re-replication stats - relevant only for enterprise edition. |
1110 | |
1111 | cf_atomic64 n_re_repl_success; |
1112 | cf_atomic64 n_re_repl_error; |
1113 | cf_atomic64 n_re_repl_timeout; |
1114 | |
1115 | // Special errors that deserve their own counters: |
1116 | |
1117 | cf_atomic64 n_fail_xdr_forbidden; |
1118 | cf_atomic64 n_fail_key_busy; |
1119 | cf_atomic64 n_fail_generation; |
1120 | cf_atomic64 n_fail_record_too_big; |
1121 | |
1122 | // Special non-error counters: |
1123 | |
1124 | cf_atomic64 n_deleted_last_bin; |
1125 | |
1126 | // One-way automatically activated histograms. |
1127 | |
1128 | cf_hist_track* read_hist; |
1129 | cf_hist_track* write_hist; |
1130 | cf_hist_track* udf_hist; |
1131 | cf_hist_track* query_hist; |
1132 | histogram* query_rec_count_hist; |
1133 | histogram* re_repl_hist; // relevant only for enterprise edition |
1134 | |
1135 | bool read_hist_active; |
1136 | bool write_hist_active; |
1137 | bool udf_hist_active; |
1138 | bool query_hist_active; |
1139 | bool query_rec_count_hist_active; |
1140 | bool re_repl_hist_active; // relevant only for enterprise edition |
1141 | |
1142 | // Activate-by-config histograms. |
1143 | |
1144 | histogram* proxy_hist; |
1145 | |
1146 | histogram* read_start_hist; |
1147 | histogram* read_restart_hist; |
1148 | histogram* read_dup_res_hist; |
1149 | histogram* read_repl_ping_hist; |
1150 | histogram* read_local_hist; |
1151 | histogram* read_response_hist; |
1152 | |
1153 | histogram* write_start_hist; |
1154 | histogram* write_restart_hist; |
1155 | histogram* write_dup_res_hist; |
1156 | histogram* write_master_hist; // split this? |
1157 | histogram* write_repl_write_hist; |
1158 | histogram* write_response_hist; |
1159 | |
1160 | histogram* udf_start_hist; |
1161 | histogram* udf_restart_hist; |
1162 | histogram* udf_dup_res_hist; |
1163 | histogram* udf_master_hist; // split this? |
1164 | histogram* udf_repl_write_hist; |
1165 | histogram* udf_response_hist; |
1166 | |
1167 | histogram* batch_sub_start_hist; |
1168 | histogram* batch_sub_restart_hist; |
1169 | histogram* batch_sub_dup_res_hist; |
1170 | histogram* batch_sub_repl_ping_hist; |
1171 | histogram* batch_sub_read_local_hist; |
1172 | histogram* batch_sub_response_hist; |
1173 | |
1174 | histogram* udf_sub_start_hist; |
1175 | histogram* udf_sub_restart_hist; |
1176 | histogram* udf_sub_dup_res_hist; |
1177 | histogram* udf_sub_master_hist; // split this? |
1178 | histogram* udf_sub_repl_write_hist; |
1179 | histogram* udf_sub_response_hist; |
1180 | |
1181 | histogram* ops_sub_start_hist; |
1182 | histogram* ops_sub_restart_hist; |
1183 | histogram* ops_sub_dup_res_hist; |
1184 | histogram* ops_sub_master_hist; // split this? |
1185 | histogram* ops_sub_repl_write_hist; |
1186 | histogram* ops_sub_response_hist; |
1187 | |
1188 | histogram* device_read_size_hist; |
1189 | histogram* device_write_size_hist; |
1190 | |
1191 | // Histograms of object storage sizes. (Meaningful for drive-backed |
1192 | // namespaces only.) |
1193 | histogram* obj_size_log_hist; |
1194 | histogram* set_obj_size_log_hists[AS_SET_MAX_COUNT + 1]; |
1195 | linear_hist* obj_size_lin_hist; |
1196 | linear_hist* set_obj_size_lin_hists[AS_SET_MAX_COUNT + 1]; |
1197 | |
1198 | // Histograms used for general eviction and expiration. |
1199 | linear_hist* evict_hist; // not just for info |
1200 | linear_hist* ttl_hist; |
1201 | linear_hist* set_ttl_hists[AS_SET_MAX_COUNT + 1]; |
1202 | |
1203 | //-------------------------------------------- |
1204 | // Information for rebalancing. |
1205 | // |
1206 | |
1207 | uint32_t cluster_size; |
1208 | cf_node succession[AS_CLUSTER_SZ]; |
1209 | as_partition_version cluster_versions[AS_CLUSTER_SZ][AS_PARTITIONS]; |
1210 | uint32_t rack_ids[AS_CLUSTER_SZ]; // is observed-rack-ids in CP mode |
1211 | |
1212 | // Quiescence - relevant only for enterprise edition. |
1213 | uint32_t active_size; |
1214 | bool pending_quiesce; |
1215 | bool is_quiesced; |
1216 | bool quiesced[AS_CLUSTER_SZ]; |
1217 | |
1218 | // Observed nodes - relevant only for enterprise edition. |
1219 | uint32_t observed_cluster_size; |
1220 | cf_node observed_succession[AS_CLUSTER_SZ]; |
1221 | |
1222 | // Roster management - relevant only for enterprise edition. |
1223 | uint32_t smd_roster_generation; |
1224 | uint32_t smd_roster_count; |
1225 | cf_node smd_roster[AS_CLUSTER_SZ]; |
1226 | uint32_t smd_roster_rack_ids[AS_CLUSTER_SZ]; |
1227 | uint32_t roster_generation; |
1228 | uint32_t roster_count; |
1229 | cf_node roster[AS_CLUSTER_SZ]; |
1230 | uint32_t roster_rack_ids[AS_CLUSTER_SZ]; |
1231 | |
1232 | // Master regimes - relevant only for enterprise edition. |
1233 | uint32_t eventual_regime; |
1234 | uint32_t rebalance_regime; |
1235 | uint32_t rebalance_regimes[AS_CLUSTER_SZ]; |
1236 | }; |
1237 | |
// Size of a set name buffer.
#define AS_SET_NAME_MAX_SIZE	64		// includes space for null-terminator

// Set-id value that never refers to a real set.
#define INVALID_SET_ID 0

// Accessors for a set's disable_eviction flag. p_set is a pointer to a struct
// with a disable_eviction member. Arguments are fully parenthesized so that
// expression arguments (e.g. "sets + i", "a ? b : c") expand correctly.
// Note - each argument is still evaluated exactly once.
#define IS_SET_EVICTION_DISABLED(p_set)		(cf_atomic32_get((p_set)->disable_eviction) == 1)
#define DISABLE_SET_EVICTION(p_set, on_off)	(cf_atomic32_set(&(p_set)->disable_eviction, (on_off) ? 1 : 0))
1244 | |
// Values for a set's enable_xdr field. The numeric values are spelled out
// explicitly and must not be renumbered.
typedef enum {
	// No set-level choice has been made.
	AS_SET_ENABLE_XDR_DEFAULT	= 0,
	// Set is white-listed for XDR replication.
	AS_SET_ENABLE_XDR_TRUE		= 1,
	// Set is black-listed for XDR replication.
	AS_SET_ENABLE_XDR_FALSE		= 2,
} as_set_enable_xdr_flag;
1250 | |
// Per-set counters and settings.
// Caution - changing this struct could break warm or cool restart. Do not
// reorder, resize, add or remove members without accounting for that.
struct as_set_s {
	char			name[AS_SET_NAME_MAX_SIZE];	// set name - buffer size includes the null-terminator
	cf_atomic64		n_objects;
	cf_atomic64		n_tombstones;		// relevant only for enterprise edition
	cf_atomic64		n_bytes_memory;		// for data-in-memory only - set's total record data size
	cf_atomic64		stop_writes_count;	// restrict number of records in a set - 0 means no limit (see as_set_stop_writes())
	uint64_t		truncate_lut;		// records with last-update-time less than this are truncated
	cf_atomic32		disable_eviction;	// don't evict anything in this set (note - expiration still works) - 1 means disabled, see IS_SET_EVICTION_DISABLED()
	cf_atomic32		enable_xdr;			// white-list (AS_SET_ENABLE_XDR_TRUE) or black-list (AS_SET_ENABLE_XDR_FALSE) a set for XDR replication
	uint32_t		n_sindexes;			// NOTE(review): presumably the number of secondary indexes on this set - confirm
	uint8_t			padding[12];		// explicit padding - NOTE(review): presumably rounds the struct up to 128 bytes, assuming 8-byte cf_atomic64 and 4-byte cf_atomic32 - confirm
};
1264 | |
1265 | static inline bool |
1266 | as_set_stop_writes(as_set *p_set) { |
1267 | uint64_t n_objects = cf_atomic64_get(p_set->n_objects); |
1268 | uint64_t stop_writes_count = cf_atomic64_get(p_set->stop_writes_count); |
1269 | |
1270 | return stop_writes_count != 0 && n_objects >= stop_writes_count; |
1271 | } |
1272 | |
1273 | // These bin functions must be below definition of struct as_namespace_s: |
1274 | |
1275 | static inline bool |
1276 | as_bin_set_id_from_name_w_len(as_namespace *ns, as_bin *b, const uint8_t *buf, |
1277 | size_t len) { |
1278 | return as_bin_get_or_assign_id_w_len(ns, (const char *)buf, len, &b->id); |
1279 | } |
1280 | |
1281 | static inline size_t |
1282 | as_bin_memcpy_name(as_namespace *ns, uint8_t *buf, as_bin *b) { |
1283 | size_t len = 0; |
1284 | |
1285 | if (! ns->single_bin) { |
1286 | const char *name = as_bin_get_name_from_id(ns, b->id); |
1287 | |
1288 | len = strlen(name); |
1289 | memcpy(buf, name, len); |
1290 | } |
1291 | |
1292 | return len; |
1293 | } |
1294 | |
1295 | // forward ref |
1296 | struct as_msg_field_s; |
1297 | |
1298 | /* Namespace function declarations */ |
1299 | extern as_namespace *as_namespace_create(char *name); |
1300 | extern void as_namespaces_init(bool cold_start_cmd, uint32_t instance); |
1301 | extern void as_namespaces_setup(bool cold_start_cmd, uint32_t instance); |
1302 | extern void as_namespace_finish_setup(as_namespace *ns, uint32_t instance); |
1303 | extern bool as_namespace_configure_sets(as_namespace *ns); |
1304 | extern as_namespace *as_namespace_get_byname(char *name); |
1305 | extern as_namespace *as_namespace_get_byid(uint32_t id); |
1306 | extern as_namespace *as_namespace_get_bybuf(uint8_t *name, size_t len); |
1307 | extern as_namespace *as_namespace_get_bymsgfield(struct as_msg_field_s *fp); |
1308 | extern const char *as_namespace_get_set_name(as_namespace *ns, uint16_t set_id); |
1309 | extern uint16_t as_namespace_get_set_id(as_namespace *ns, const char *set_name); |
1310 | extern uint16_t as_namespace_get_create_set_id(as_namespace *ns, const char *set_name); |
1311 | extern int as_namespace_set_set_w_len(as_namespace *ns, const char *set_name, size_t len, uint16_t *p_set_id, bool apply_restrictions); |
1312 | extern int as_namespace_get_create_set_w_len(as_namespace *ns, const char *set_name, size_t len, as_set **pp_set, uint16_t *p_set_id); |
1313 | extern as_set *as_namespace_get_set_by_name(as_namespace *ns, const char *set_name); |
1314 | extern as_set* as_namespace_get_set_by_id(as_namespace* ns, uint16_t set_id); |
1315 | extern as_set* as_namespace_get_record_set(as_namespace *ns, const as_record *r); |
1316 | extern void as_namespace_get_set_info(as_namespace *ns, const char *set_name, cf_dyn_buf *db); |
1317 | extern void as_namespace_adjust_set_memory(as_namespace *ns, uint16_t set_id, int64_t delta_bytes); |
1318 | extern void as_namespace_release_set_id(as_namespace *ns, uint16_t set_id); |
1319 | extern void as_namespace_get_bins_info(as_namespace *ns, cf_dyn_buf *db, bool show_ns); |
1320 | extern void as_namespace_get_hist_info(as_namespace *ns, char *set_name, char *hist_name, cf_dyn_buf *db); |
1321 | |
1322 | static inline bool |
1323 | as_namespace_cool_restarts(const as_namespace *ns) |
1324 | { |
1325 | return ns->storage_data_in_memory && ! ns->data_in_index; |
1326 | } |
1327 | |
1328 | static inline uint32_t |
1329 | as_namespace_device_count(const as_namespace *ns) |
1330 | { |
1331 | // Only one of them will ever be non-zero. |
1332 | return ns->n_storage_devices + ns->n_storage_files; |
1333 | } |
1334 | |
1335 | static inline const char* |
1336 | as_namespace_start_mode_str(const as_namespace *ns) |
1337 | { |
1338 | return as_namespace_cool_restarts(ns) ? "cool" : "warm" ; |
1339 | } |
1340 | |
1341 | static inline bool |
1342 | as_namespace_index_persisted(const as_namespace *ns) |
1343 | { |
1344 | return ns->xmem_type == CF_XMEM_TYPE_PMEM || |
1345 | ns->xmem_type == CF_XMEM_TYPE_FLASH; |
1346 | } |
1347 | |
// Persistent Memory Management
// NOTE(review): presumably shuts down the namespace's xmem state for the given
// server instance - the implementation is not visible here, confirm before
// relying on this.
void as_namespace_xmem_shutdown(as_namespace *ns, uint32_t instance);
1350 | |