| 1 | /* |
| 2 | * partition_balance.h |
| 3 | * |
| 4 | * Copyright (C) 2016-2018 Aerospike, Inc. |
| 5 | * |
| 6 | * Portions may be licensed to Aerospike, Inc. under one or more contributor |
| 7 | * license agreements. |
| 8 | * |
| 9 | * This program is free software: you can redistribute it and/or modify it under |
| 10 | * the terms of the GNU Affero General Public License as published by the Free |
| 11 | * Software Foundation, either version 3 of the License, or (at your option) any |
| 12 | * later version. |
| 13 | * |
| 14 | * This program is distributed in the hope that it will be useful, but WITHOUT |
| 15 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| 16 | * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
| 17 | * details. |
| 18 | * |
| 19 | * You should have received a copy of the GNU Affero General Public License |
| 20 | * along with this program. If not, see http://www.gnu.org/licenses/ |
| 21 | */ |
| 22 | |
| 23 | #pragma once |
| 24 | |
| 25 | //========================================================== |
| 26 | // Includes. |
| 27 | // |
| 28 | |
| 29 | #include <stdbool.h> |
| 30 | #include <stdint.h> |
| 31 | |
| 32 | #include "citrusleaf/cf_atomic.h" |
| 33 | #include "citrusleaf/cf_queue.h" |
| 34 | |
| 35 | #include "dynbuf.h" |
| 36 | #include "fault.h" |
| 37 | #include "node.h" |
| 38 | |
| 39 | #include "fabric/hb.h" |
| 40 | #include "fabric/partition.h" |
| 41 | |
| 42 | |
| 43 | //========================================================== |
| 44 | // Forward declarations. |
| 45 | // |
| 46 | |
| 47 | struct as_namespace_s; |
| 48 | |
| 49 | |
| 50 | //========================================================== |
| 51 | // Typedefs & constants. |
| 52 | // |
| 53 | |
| 54 | typedef enum { |
| 55 | PB_TASK_EMIG_TRANSFER, |
| 56 | PB_TASK_EMIG_SIGNAL_ALL_DONE, |
| 57 | PB_TASK_APPEAL |
| 58 | } pb_task_type; |
| 59 | |
| 60 | typedef struct pb_task_s { |
| 61 | cf_node dest; |
| 62 | struct as_namespace_s* ns; |
| 63 | uint32_t pid; |
| 64 | uint64_t cluster_key; |
| 65 | pb_task_type type; |
| 66 | uint32_t tx_flags; |
| 67 | } pb_task; |
| 68 | |
| 69 | #define MAX_RACK_ID 1000000 |
| 70 | #define MAX_RACK_ID_LEN 7 // number of decimal characters |
| 71 | |
| 72 | |
| 73 | //========================================================== |
| 74 | // Public API - regulate migrations. |
| 75 | // |
| 76 | |
| 77 | void as_partition_balance_disallow_migrations(); |
| 78 | bool as_partition_balance_are_migrations_allowed(); |
| 79 | void as_partition_balance_synchronize_migrations(); |
| 80 | void as_partition_balance_emigration_yield(); |
| 81 | |
| 82 | |
| 83 | //========================================================== |
| 84 | // Public API - balance partitions. |
| 85 | // |
| 86 | |
| 87 | void as_partition_balance_init(); |
| 88 | bool as_partition_balance_is_init_resolved(); |
| 89 | void as_partition_balance_revert_to_orphan(); |
| 90 | void as_partition_balance(); |
| 91 | |
| 92 | uint64_t as_partition_balance_remaining_migrations(); |
| 93 | bool as_partition_balance_revive(struct as_namespace_s* ns); |
| 94 | void as_partition_balance_protect_roster_set(struct as_namespace_s* ns); |
| 95 | void as_partition_balance_effective_rack_ids(cf_dyn_buf* db); |
| 96 | |
| 97 | |
| 98 | //========================================================== |
| 99 | // Public API - migration-related as_partition methods. |
| 100 | // |
| 101 | |
| 102 | bool as_partition_pending_migrations(as_partition* p); |
| 103 | |
| 104 | bool as_partition_pre_emigrate_done(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key, uint32_t tx_flags); |
| 105 | void as_partition_emigrate_done(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key, cf_node dest_node, uint32_t tx_flags); |
| 106 | as_migrate_result as_partition_immigrate_start(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key, cf_node source_node); |
| 107 | as_migrate_result as_partition_immigrate_done(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key, cf_node source_node); |
| 108 | as_migrate_result as_partition_migrations_all_done(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key); |
| 109 | void as_partition_signal_done(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key); |
| 110 | |
| 111 | // Counter that tells clients partition ownership has changed. |
| 112 | extern cf_atomic32 g_partition_generation; |
| 113 | |
| 114 | // Time of last rebalance. |
| 115 | extern uint64_t g_rebalance_sec; |
| 116 | |
| 117 | // Count rebalances. |
| 118 | extern uint64_t g_rebalance_generation; |
| 119 | |
| 120 | |
| 121 | //========================================================== |
| 122 | // Private API - for enterprise separation only. |
| 123 | // |
| 124 | |
| 125 | //------------------------------------------------ |
| 126 | // Typedefs & constants. |
| 127 | // |
| 128 | |
| 129 | COMPILER_ASSERT((AS_CLUSTER_SZ & (AS_CLUSTER_SZ - 1)) == 0); |
| 130 | |
| 131 | #define AS_CLUSTER_SZ_MASKP (-(uint64_t)AS_CLUSTER_SZ) |
| 132 | #define AS_CLUSTER_SZ_MASKN ((uint64_t)AS_CLUSTER_SZ - 1) |
| 133 | |
| 134 | typedef uint8_t sl_ix_t; |
| 135 | |
| 136 | COMPILER_ASSERT(AS_CLUSTER_SZ_MASKN >> (sizeof(sl_ix_t) * 8) == 0); |
| 137 | |
| 138 | typedef struct inter_hash_s { |
| 139 | uint64_t hashed_node; |
| 140 | uint64_t hashed_pid; |
| 141 | } inter_hash; |
| 142 | |
| 143 | extern const as_partition_version ZERO_VERSION; |
| 144 | |
| 145 | #define REBALANCE_FLUSH_SIZE 4096 |
| 146 | #define PMETA_SIZE 16 // sizeof(ssd_common_pmeta) without including drv_ssd.h |
| 147 | |
| 148 | #define PIDS_PER_GROUP (REBALANCE_FLUSH_SIZE / PMETA_SIZE) // 256 |
| 149 | #define NUM_PID_GROUPS (AS_PARTITIONS / PIDS_PER_GROUP) // 16 |
| 150 | |
| 151 | |
| 152 | //------------------------------------------------ |
| 153 | // Globals. |
| 154 | // |
| 155 | |
| 156 | extern volatile int g_allow_migrations; |
| 157 | |
| 158 | extern uint64_t g_hashed_pids[AS_PARTITIONS]; |
| 159 | |
| 160 | // Shortcuts to values set by as_exchange, for use in partition balance only. |
| 161 | extern uint32_t g_cluster_size; |
| 162 | extern cf_node* g_succession; |
| 163 | |
| 164 | extern cf_node g_full_node_seq_table[AS_CLUSTER_SZ * AS_PARTITIONS]; |
| 165 | extern sl_ix_t g_full_sl_ix_table[AS_CLUSTER_SZ * AS_PARTITIONS]; |
| 166 | |
| 167 | |
| 168 | //------------------------------------------------ |
| 169 | // Forward declarations. |
| 170 | // |
| 171 | |
| 172 | void partition_balance_init(); |
| 173 | |
| 174 | void pb_task_init(pb_task* task, cf_node dest, struct as_namespace_s* ns, uint32_t pid, uint64_t cluster_key, pb_task_type type, uint32_t tx_flags); |
| 175 | |
| 176 | void balance_namespace(struct as_namespace_s* ns, cf_queue* mq); |
| 177 | void prepare_for_appeals(); |
| 178 | void process_pb_tasks(cf_queue* tq); |
| 179 | void balance_namespace_ap(struct as_namespace_s* ns, cf_queue* mq); |
| 180 | void set_active_size(struct as_namespace_s* ns); |
| 181 | uint32_t rack_count(const struct as_namespace_s* ns); |
| 182 | void fill_translation(int translation[], const struct as_namespace_s* ns); |
| 183 | void init_target_claims_ap(const struct as_namespace_s* ns, const int translation[], uint32_t* target_claims); |
| 184 | void fill_namespace_rows(const cf_node* full_node_seq, const sl_ix_t* full_sl_ix, cf_node* ns_node_seq, sl_ix_t* ns_sl_ix, const struct as_namespace_s* ns, const int translation[]); |
| 185 | void quiesce_adjust_row(cf_node* ns_node_seq, sl_ix_t* ns_sl_ix, struct as_namespace_s* ns); |
| 186 | void uniform_adjust_row(cf_node* node_seq, uint32_t n_nodes, sl_ix_t* ns_sl_ix, uint32_t n_replicas, uint32_t* claims, const uint32_t* target_claims, const uint32_t* rack_ids, uint32_t n_racks); |
| 187 | void rack_aware_adjust_row(cf_node* ns_node_seq, sl_ix_t* ns_sl_ix, uint32_t replication_factor, const uint32_t* rack_ids, uint32_t n_ids, uint32_t n_racks, uint32_t start_n); |
| 188 | uint32_t find_self(const cf_node* ns_node_seq, const struct as_namespace_s* ns); |
| 189 | int shift_working_master(const as_partition* p, const sl_ix_t* ns_sl_ix, const struct as_namespace_s* ns, int working_master_n, const as_partition_version* working_master_version); |
| 190 | uint32_t fill_immigrators(as_partition* p, const sl_ix_t* ns_sl_ix, struct as_namespace_s* ns, uint32_t working_master_n, uint32_t n_dupl); |
| 191 | void emig_lead_flags_ap(const as_partition* p, const sl_ix_t* ns_sl_ix, const struct as_namespace_s* ns, uint32_t lead_flags[]); |
| 192 | void queue_namespace_migrations(as_partition* p, struct as_namespace_s* ns, uint32_t self_n, cf_node working_master, uint32_t n_dupl, cf_node dupls[], const uint32_t lead_flags[], cf_queue* mq); |
| 193 | bool drop_superfluous_version(as_partition* p, struct as_namespace_s* ns); |
| 194 | bool adjust_superfluous_version(as_partition* p, struct as_namespace_s* ns); |
| 195 | void fill_witnesses(as_partition* p, const cf_node* ns_node_seq, const sl_ix_t* ns_sl_ix, struct as_namespace_s* ns); |
| 196 | void handle_version_change(as_partition* p, struct as_namespace_s* ns, as_partition_version* orig_version); |
| 197 | |
| 198 | void emigrate_done_advance_non_master_version(struct as_namespace_s* ns, as_partition* p, uint32_t tx_flags); |
| 199 | void emigrate_done_advance_non_master_version_ap(struct as_namespace_s* ns, as_partition* p, uint32_t tx_flags); |
| 200 | void immigrate_start_advance_non_master_version(struct as_namespace_s* ns, as_partition* p); |
| 201 | void immigrate_start_advance_non_master_version_ap(as_partition* p); |
| 202 | void immigrate_done_advance_final_master_version(struct as_namespace_s* ns, as_partition* p); |
| 203 | void immigrate_done_advance_final_master_version_ap(struct as_namespace_s* ns, as_partition* p); |
| 204 | bool immigrate_yield(); |
| 205 | |
| 206 | |
| 207 | //------------------------------------------------ |
| 208 | // Inlines and macros. |
| 209 | // |
| 210 | |
| 211 | static inline bool |
| 212 | is_same_as_full_master(const as_partition_version* mv, const as_partition_version* v) |
| 213 | { |
| 214 | // Works for CP too, even with family check. |
| 215 | return v->subset == 0 && mv->ckey == v->ckey && mv->family == v->family && |
| 216 | mv->family != VERSION_FAMILY_UNIQUE; |
| 217 | } |
| 218 | |
| 219 | // Define macros for accessing the full node-seq and sl-ix arrays. |
| 220 | #define FULL_NODE_SEQ(x, y) g_full_node_seq_table[(x * g_cluster_size) + y] |
| 221 | #define FULL_SL_IX(x, y) g_full_sl_ix_table[(x * g_cluster_size) + y] |
| 222 | |
| 223 | // Get the partition version that was input by exchange. |
| 224 | #define INPUT_VERSION(_n) (&ns->cluster_versions[ns_sl_ix[_n]][p->id]) |
| 225 | |