| 1 | /* |
| 2 | * partition.h |
| 3 | * |
| 4 | * Copyright (C) 2016 Aerospike, Inc. |
| 5 | * |
| 6 | * Portions may be licensed to Aerospike, Inc. under one or more contributor |
| 7 | * license agreements. |
| 8 | * |
| 9 | * This program is free software: you can redistribute it and/or modify it under |
| 10 | * the terms of the GNU Affero General Public License as published by the Free |
| 11 | * Software Foundation, either version 3 of the License, or (at your option) any |
| 12 | * later version. |
| 13 | * |
| 14 | * This program is distributed in the hope that it will be useful, but WITHOUT |
| 15 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| 16 | * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
| 17 | * details. |
| 18 | * |
| 19 | * You should have received a copy of the GNU Affero General Public License |
| 20 | * along with this program. If not, see http://www.gnu.org/licenses/ |
| 21 | */ |
| 22 | |
| 23 | #pragma once |
| 24 | |
| 25 | //========================================================== |
| 26 | // Includes. |
| 27 | // |
| 28 | |
| 29 | #include <stdbool.h> |
| 30 | #include <stddef.h> |
| 31 | #include <stdint.h> |
| 32 | #include <stdio.h> |
| 33 | #include <string.h> |
| 34 | |
| 35 | #include "citrusleaf/cf_atomic.h" |
| 36 | #include "citrusleaf/cf_digest.h" |
| 37 | |
| 38 | #include "cf_mutex.h" |
| 39 | #include "dynbuf.h" |
| 40 | #include "node.h" |
| 41 | |
| 42 | #include "base/cfg.h" |
| 43 | #include "fabric/hb.h" |
| 44 | |
| 45 | |
| 46 | //========================================================== |
| 47 | // Forward declarations. |
| 48 | // |
| 49 | |
| 50 | struct as_index_tree_s; |
| 51 | struct as_namespace_s; |
| 52 | struct as_transaction_s; |
| 53 | |
| 54 | |
| 55 | //========================================================== |
| 56 | // Typedefs & constants. |
| 57 | // |
| 58 | |
// Number of partitions. AS_PARTITION_MASK is AS_PARTITIONS - 1 (0xFFF), which
// works as a bit mask only because AS_PARTITIONS is a power of 2 -
// as_partition_getid() ANDs a digest word with it to get a partition id.
#define AS_PARTITIONS 4096
#define AS_PARTITION_MASK (AS_PARTITIONS - 1)

// Width in bits of the 'family' field of as_partition_version.
// VERSION_FAMILY_UNIQUE is the all-ones value of that field (15 for 4 bits),
// which as_partition_version_as_string() prints as 'U'.
// AS_PARTITION_N_FAMILIES is defined as the same number.
#define VERSION_FAMILY_BITS 4
#define VERSION_FAMILY_UNIQUE ((1 << VERSION_FAMILY_BITS) - 1)
#define AS_PARTITION_N_FAMILIES VERSION_FAMILY_UNIQUE
| 65 | |
// Partition version, packed into bit fields that total exactly 64 bits
// (48 + 4 + 8 + 1 + 1 + 1 + 1). The COMPILER_ASSERT below enforces the size,
// which the inline helpers in this file rely on when they treat a version as a
// single 64-bit value.
typedef struct as_partition_version_s {
	uint64_t ckey:48; // non-zero means the version "has data"
	uint64_t family:VERSION_FAMILY_BITS; // VERSION_FAMILY_UNIQUE prints as 'U'
	uint64_t unused:8;
	uint64_t revived:1; // enterprise only
	uint64_t master:1; // printed as 'm' when set
	uint64_t subset:1; // printed as 's' when set, 'p' when clear
	uint64_t evade:1; // printed as 'e' when set
} as_partition_version;

COMPILER_ASSERT(sizeof(as_partition_version) == sizeof(uint64_t));
| 77 | |
// Fixed-size buffer for a formatted partition version. Wrapping the array in a
// struct lets as_partition_version_as_string() return it by value. The current
// format needs 18 characters plus the terminator - one byte is spare.
typedef struct as_partition_version_string_s {
	char s[19 + 1]; // format CCCCccccCCCC.F.mse - F may someday be 2 characters
} as_partition_version_string;
| 81 | |
// State kept for one partition. Members are grouped by how often they are
// touched and padded by hand (align_1, align_2, align_3) so each group starts
// on a 64-byte boundary. The COMPILER_ASSERT below checks only that the total
// size is a multiple of 64.
//
// NOTE(review): the "@ 48 bytes" and "@ 40 bytes" offsets quoted below depend
// on the sizes of cf_atomic64, cf_atomic32, cf_mutex and cf_node, which are
// defined elsewhere - re-check them if any of those types change. Do not
// reorder members without redoing the padding arithmetic.
typedef struct as_partition_s {
	//--------------------------------------------
	// Used during every transaction.
	//

	cf_atomic64 n_tombstones; // relevant only for enterprise edition
	cf_atomic32 max_void_time; // TODO - do we really need this?

	cf_mutex lock;

	struct as_index_tree_s* tree;

	cf_node working_master;

	uint32_t regime; // relevant only for enterprise edition

	uint32_t n_nodes; // relevant only for enterprise edition
	uint32_t n_replicas; // count of valid entries in replicas[]
	uint32_t n_dupl;

	// @ 48 bytes - room for 2 replicas within above 64-byte cache line.
	cf_node replicas[AS_CLUSTER_SZ];

	uint8_t align_1[16];
	// @ 64-byte-aligned boundary.

	//--------------------------------------------
	// Used only when cluster is in flux.
	//

	uint32_t id;

	bool must_appeal; // relevant only for enterprise edition

	uint8_t tree_id;
	uint64_t tree_ids_used; // bit map

	as_partition_version final_version;
	as_partition_version version;

	uint16_t pending_emigrations;
	uint16_t pending_lead_emigrations;
	uint16_t pending_immigrations;

	uint16_t n_witnesses;

	// @ 40 bytes - room for 3 duplicates within above 64-byte cache line.
	cf_node dupls[AS_CLUSTER_SZ];

	uint8_t align_2[24];
	// @ 64-byte-aligned boundary.

	bool immigrators[AS_CLUSTER_SZ];
	// Byte alignment depends on AS_CLUSTER_SZ - pad below to realign.

	// Only AS_CLUSTER_SZ == 8 is padded here (8 + 56 = 64). Any other value
	// yields a zero-length array (a GNU C extension), so presumably the other
	// supported cluster sizes are already multiples of 64 - otherwise the
	// COMPILER_ASSERT below fails.
	uint8_t align_3[AS_CLUSTER_SZ == 8 ? 56 : 0];
	// @ 64-byte-aligned boundary.

	cf_node witnesses[AS_CLUSTER_SZ];
} as_partition;

COMPILER_ASSERT(sizeof(as_partition) % 64 == 0);
| 144 | |
// Caller-owned handle filled in by the as_partition_reserve*() functions and
// handed back to as_partition_release(). AS_PARTITION_RESERVATION_INIT()
// resets every member except dupl_nodes.
//
// NOTE(review): regime and n_dupl share names with as_partition members and
// presumably hold a snapshot of them taken at reservation time - confirm
// against the implementation.
typedef struct as_partition_reservation_s {
	struct as_namespace_s* ns;
	as_partition* p;
	struct as_index_tree_s* tree;
	uint32_t regime;
	uint32_t n_dupl; // presumably the count of valid entries in dupl_nodes[]
	cf_node dupl_nodes[AS_CLUSTER_SZ];
} as_partition_reservation;
| 153 | |
// Object and tombstone counters, each split three ways: master, prole and
// non-replica. Passed by pointer to as_partition_get_replica_stats(), which
// presumably fills it in.
typedef struct repl_stats_s {
	uint64_t n_master_objects;
	uint64_t n_prole_objects;
	uint64_t n_non_replica_objects;
	uint64_t n_master_tombstones;
	uint64_t n_prole_tombstones;
	uint64_t n_non_replica_tombstones;
} repl_stats;
| 162 | |
// One bit per partition, rounded up to whole bytes: (4096 + 7) / 8 = 512.
#define CLIENT_BITMAP_BYTES ((AS_PARTITIONS + 7) / 8)
// 4 output characters for every 3 bitmap bytes, rounded up: 684. This is
// exactly the encoded length - there is no extra byte for a NUL terminator.
#define CLIENT_B64MAP_BYTES (((CLIENT_BITMAP_BYTES + 2) / 3) * 4)

// A partition bitmap kept alongside a same-content text form (b64map -
// presumably its base64 encoding, going by the name and the size formula).
//
// NOTE(review): write_lock presumably serializes writers only. The arrays are
// volatile, which by itself gives readers neither atomicity nor ordering -
// confirm how readers are expected to cope with concurrent updates.
typedef struct client_replica_map_s {
	cf_mutex write_lock;

	volatile uint8_t bitmap[CLIENT_BITMAP_BYTES];
	volatile char b64map[CLIENT_B64MAP_BYTES];
} client_replica_map;
| 172 | |
// Migration result codes. With no explicit values given, the enumerators are
// 0, 1 and 2 in declaration order.
// NOTE(review): the exact meaning of FAIL versus AGAIN is decided by the
// migration code that uses this type, not visible here.
typedef enum {
	AS_MIGRATE_OK,
	AS_MIGRATE_FAIL,
	AS_MIGRATE_AGAIN
} as_migrate_result;
| 178 | |
| 179 | |
| 180 | //========================================================== |
| 181 | // Public API. |
| 182 | // |
| 183 | |
// The implementations live outside this header. The notes below say only what
// the signatures show - anything about behavior is marked as an assumption.

// Per-partition setup and teardown, addressed by namespace and partition id.
void as_partition_init(struct as_namespace_s* ns, uint32_t pid);
void as_partition_shutdown(struct as_namespace_s* ns, uint32_t pid);

// Operations on a partition object the caller already holds. The namespace is
// taken as const; the partition is not.
void as_partition_isolate_version(const struct as_namespace_s* ns, as_partition* p);
int as_partition_check_source(const struct as_namespace_s* ns, as_partition* p, cf_node src, bool* from_replica);
void as_partition_freeze(as_partition* p);

// Returns a count. nv is presumably a caller-supplied array that receives the
// nodes - confirm the required capacity against the implementation.
uint32_t as_partition_get_other_replicas(as_partition* p, cf_node* nv);

// Each returns a single node for the given namespace and partition id.
cf_node as_partition_writable_node(struct as_namespace_s* ns, uint32_t pid);
cf_node as_partition_proxyee_redirect(struct as_namespace_s* ns, uint32_t pid);

// Take a dynamic buffer to write into; no namespace argument.
void as_partition_get_replicas_master_str(cf_dyn_buf* db);
void as_partition_get_replicas_all_str(cf_dyn_buf* db, bool include_regime);

void as_partition_get_replica_stats(struct as_namespace_s* ns, repl_stats* p_stats);

// Reservation functions. The int-returning variants presumably report success
// or failure through the return value, and a cf_node* parameter presumably
// receives a node to go to instead - confirm against the implementation.
void as_partition_reserve(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv);
int as_partition_reserve_replica(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv);
int as_partition_reserve_write(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv, cf_node* node);
int as_partition_reserve_read_tr(struct as_namespace_s* ns, uint32_t pid, struct as_transaction_s* tr, cf_node* node);
int as_partition_prereserve_query(struct as_namespace_s* ns, bool can_partition_query[], as_partition_reservation rsv[]);
int as_partition_reserve_query(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv);
int as_partition_reserve_xdr_read(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv);
void as_partition_reservation_copy(as_partition_reservation* dst, as_partition_reservation* src);

// Counterpart to the reserve functions above.
void as_partition_release(as_partition_reservation* rsv);

// Tree id management - see tree_id and tree_ids_used in as_partition. The
// (uint8_t id, void* udata) shape of as_partition_tree_done() suggests it is
// used as a callback - confirm against the implementation.
void as_partition_advance_tree_id(as_partition* p, const char* ns_name);
void as_partition_tree_done(uint8_t id, void* udata);

void as_partition_getinfo_str(cf_dyn_buf* db);
| 216 | |
| 217 | // Use VERSION_AS_STRING() - see below. |
| 218 | static inline as_partition_version_string |
| 219 | as_partition_version_as_string(const as_partition_version* version) |
| 220 | { |
| 221 | as_partition_version_string str; |
| 222 | |
| 223 | if (version->family == VERSION_FAMILY_UNIQUE) { |
| 224 | sprintf(str.s, "%012lx.U.%c%c%c" , (uint64_t)version->ckey, |
| 225 | version->master == 0 ? '-' : 'm', |
| 226 | version->subset == 0 ? 'p' : 's', |
| 227 | version->evade == 0 ? '-' : 'e'); |
| 228 | } |
| 229 | else { |
| 230 | sprintf(str.s, "%012lx.%X.%c%c%c" , (uint64_t)version->ckey, |
| 231 | (uint32_t)version->family, |
| 232 | version->master == 0 ? '-' : 'm', |
| 233 | version->subset == 0 ? 'p' : 's', |
| 234 | version->evade == 0 ? |
| 235 | (version->revived == 0 ? '-' : 'r') : 'e'); |
| 236 | } |
| 237 | |
| 238 | return str; |
| 239 | } |
| 240 | |
| 241 | static inline bool |
| 242 | as_partition_version_is_null(const as_partition_version* version) |
| 243 | { |
| 244 | return *(uint64_t*)version == 0; |
| 245 | } |
| 246 | |
| 247 | static inline bool |
| 248 | as_partition_version_has_data(const as_partition_version* version) |
| 249 | { |
| 250 | return version->ckey != 0; |
| 251 | } |
| 252 | |
| 253 | static inline bool |
| 254 | as_partition_version_same(const as_partition_version* v1, const as_partition_version* v2) |
| 255 | { |
| 256 | return *(uint64_t*)v1 == *(uint64_t*)v2; |
| 257 | } |
| 258 | |
| 259 | static inline uint32_t |
| 260 | as_partition_getid(const cf_digest* d) |
| 261 | { |
| 262 | return *(uint32_t*)d & AS_PARTITION_MASK; |
| 263 | } |
| 264 | |
| 265 | static inline int |
| 266 | find_self_in_replicas(const as_partition* p) |
| 267 | { |
| 268 | return index_of_node(p->replicas, p->n_replicas, g_config.self_node); |
| 269 | } |
| 270 | |
| 271 | static inline bool |
| 272 | is_self_replica(const as_partition* p) |
| 273 | { |
| 274 | return contains_node(p->replicas, p->n_replicas, g_config.self_node); |
| 275 | } |
| 276 | |
| 277 | static inline bool |
| 278 | contains_self(const cf_node* nodes, uint32_t n_nodes) |
| 279 | { |
| 280 | return contains_node(nodes, n_nodes, g_config.self_node); |
| 281 | } |
| 282 | |
// Undefined partition id sentinel - a uint16_t with all 16 bits set.
#define AS_PARTITION_ID_UNDEF ((uint16_t)0xFFFF)

// Reset a reservation to its empty state. __rsv is the reservation itself (an
// lvalue), not a pointer to it. The dupl_nodes array is left untouched - with
// n_dupl set to 0 it has no valid entries.
//
// Wrapped in do/while (0) so the macro expands to exactly one statement: it is
// then safe as the body of an unbraced if/else or loop. Invoke it with a
// trailing semicolon. The argument is parenthesized so any lvalue expression
// (e.g. *rsv_ptr) works.
#define AS_PARTITION_RESERVATION_INIT(__rsv) \
	do { \
		(__rsv).ns = NULL; \
		(__rsv).p = NULL; \
		(__rsv).tree = NULL; \
		(__rsv).regime = 0; \
		(__rsv).n_dupl = 0; \
	} while (0)

// Shorthand yielding a char* to the formatted version, for direct use as a
// printf-style argument. The pointer refers to the temporary struct returned by
// as_partition_version_as_string(), so it is valid only until the end of the
// enclosing full expression - never store it.
#define VERSION_AS_STRING(v_ptr) (as_partition_version_as_string(v_ptr).s)
| 293 | |
| 294 | |
| 295 | //========================================================== |
| 296 | // Public API - client view replica maps. |
| 297 | // |
| 298 | |
// The implementations live outside this header. All four operate on a
// namespace. The update and queryable calls also take a partition id and
// return a bool.
// NOTE(review): client_replica_maps_update() presumably returns true when the
// maps actually changed - confirm against the implementation.
void client_replica_maps_create(struct as_namespace_s* ns);
void client_replica_maps_clear(struct as_namespace_s* ns);
bool client_replica_maps_update(struct as_namespace_s* ns, uint32_t pid);
bool client_replica_maps_is_partition_queryable(const struct as_namespace_s* ns, uint32_t pid);
| 303 | |
| 304 | |
| 305 | //========================================================== |
| 306 | // Private API - for enterprise separation only. |
| 307 | // |
| 308 | |
// Hooks with separate implementations per edition (see the section heading).
// Both take the namespace and partition as const and a transaction that is not
// const. NOTE(review): the node out-parameter and the return values presumably
// follow the conventions of the as_partition_reserve_*() functions - confirm
// against the implementations.
int partition_reserve_unavailable(const struct as_namespace_s* ns, const as_partition* p, struct as_transaction_s* tr, cf_node* node);
bool partition_reserve_promote(const struct as_namespace_s* ns, const as_partition* p, struct as_transaction_s* tr);
| 311 | |