1 | /* |
2 | * partition_balance.h |
3 | * |
4 | * Copyright (C) 2016-2018 Aerospike, Inc. |
5 | * |
6 | * Portions may be licensed to Aerospike, Inc. under one or more contributor |
7 | * license agreements. |
8 | * |
9 | * This program is free software: you can redistribute it and/or modify it under |
10 | * the terms of the GNU Affero General Public License as published by the Free |
11 | * Software Foundation, either version 3 of the License, or (at your option) any |
12 | * later version. |
13 | * |
14 | * This program is distributed in the hope that it will be useful, but WITHOUT |
15 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
16 | * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
17 | * details. |
18 | * |
19 | * You should have received a copy of the GNU Affero General Public License |
20 | * along with this program. If not, see http://www.gnu.org/licenses/ |
21 | */ |
22 | |
23 | #pragma once |
24 | |
25 | //========================================================== |
26 | // Includes. |
27 | // |
28 | |
29 | #include <stdbool.h> |
30 | #include <stdint.h> |
31 | |
32 | #include "citrusleaf/cf_atomic.h" |
33 | #include "citrusleaf/cf_queue.h" |
34 | |
35 | #include "dynbuf.h" |
36 | #include "fault.h" |
37 | #include "node.h" |
38 | |
39 | #include "fabric/hb.h" |
40 | #include "fabric/partition.h" |
41 | |
42 | |
43 | //========================================================== |
44 | // Forward declarations. |
45 | // |
46 | |
47 | struct as_namespace_s; |
48 | |
49 | |
50 | //========================================================== |
51 | // Typedefs & constants. |
52 | // |
53 | |
54 | typedef enum { |
55 | PB_TASK_EMIG_TRANSFER, |
56 | PB_TASK_EMIG_SIGNAL_ALL_DONE, |
57 | PB_TASK_APPEAL |
58 | } pb_task_type; |
59 | |
60 | typedef struct pb_task_s { |
61 | cf_node dest; |
62 | struct as_namespace_s* ns; |
63 | uint32_t pid; |
64 | uint64_t cluster_key; |
65 | pb_task_type type; |
66 | uint32_t tx_flags; |
67 | } pb_task; |
68 | |
69 | #define MAX_RACK_ID 1000000 |
70 | #define MAX_RACK_ID_LEN 7 // number of decimal characters |
71 | |
72 | |
73 | //========================================================== |
74 | // Public API - regulate migrations. |
75 | // |
76 | |
77 | void as_partition_balance_disallow_migrations(); |
78 | bool as_partition_balance_are_migrations_allowed(); |
79 | void as_partition_balance_synchronize_migrations(); |
80 | void as_partition_balance_emigration_yield(); |
81 | |
82 | |
83 | //========================================================== |
84 | // Public API - balance partitions. |
85 | // |
86 | |
87 | void as_partition_balance_init(); |
88 | bool as_partition_balance_is_init_resolved(); |
89 | void as_partition_balance_revert_to_orphan(); |
90 | void as_partition_balance(); |
91 | |
92 | uint64_t as_partition_balance_remaining_migrations(); |
93 | bool as_partition_balance_revive(struct as_namespace_s* ns); |
94 | void as_partition_balance_protect_roster_set(struct as_namespace_s* ns); |
95 | void as_partition_balance_effective_rack_ids(cf_dyn_buf* db); |
96 | |
97 | |
98 | //========================================================== |
99 | // Public API - migration-related as_partition methods. |
100 | // |
101 | |
102 | bool as_partition_pending_migrations(as_partition* p); |
103 | |
104 | bool as_partition_pre_emigrate_done(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key, uint32_t tx_flags); |
105 | void as_partition_emigrate_done(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key, cf_node dest_node, uint32_t tx_flags); |
106 | as_migrate_result as_partition_immigrate_start(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key, cf_node source_node); |
107 | as_migrate_result as_partition_immigrate_done(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key, cf_node source_node); |
108 | as_migrate_result as_partition_migrations_all_done(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key); |
109 | void as_partition_signal_done(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key); |
110 | |
111 | // Counter that tells clients partition ownership has changed. |
112 | extern cf_atomic32 g_partition_generation; |
113 | |
114 | // Time of last rebalance. |
115 | extern uint64_t g_rebalance_sec; |
116 | |
117 | // Count rebalances. |
118 | extern uint64_t g_rebalance_generation; |
119 | |
120 | |
121 | //========================================================== |
122 | // Private API - for enterprise separation only. |
123 | // |
124 | |
125 | //------------------------------------------------ |
126 | // Typedefs & constants. |
127 | // |
128 | |
129 | COMPILER_ASSERT((AS_CLUSTER_SZ & (AS_CLUSTER_SZ - 1)) == 0); |
130 | |
131 | #define AS_CLUSTER_SZ_MASKP (-(uint64_t)AS_CLUSTER_SZ) |
132 | #define AS_CLUSTER_SZ_MASKN ((uint64_t)AS_CLUSTER_SZ - 1) |
133 | |
134 | typedef uint8_t sl_ix_t; |
135 | |
136 | COMPILER_ASSERT(AS_CLUSTER_SZ_MASKN >> (sizeof(sl_ix_t) * 8) == 0); |
137 | |
138 | typedef struct inter_hash_s { |
139 | uint64_t hashed_node; |
140 | uint64_t hashed_pid; |
141 | } inter_hash; |
142 | |
143 | extern const as_partition_version ZERO_VERSION; |
144 | |
145 | #define REBALANCE_FLUSH_SIZE 4096 |
146 | #define PMETA_SIZE 16 // sizeof(ssd_common_pmeta) without including drv_ssd.h |
147 | |
148 | #define PIDS_PER_GROUP (REBALANCE_FLUSH_SIZE / PMETA_SIZE) // 256 |
149 | #define NUM_PID_GROUPS (AS_PARTITIONS / PIDS_PER_GROUP) // 16 |
150 | |
151 | |
152 | //------------------------------------------------ |
153 | // Globals. |
154 | // |
155 | |
156 | extern volatile int g_allow_migrations; |
157 | |
158 | extern uint64_t g_hashed_pids[AS_PARTITIONS]; |
159 | |
160 | // Shortcuts to values set by as_exchange, for use in partition balance only. |
161 | extern uint32_t g_cluster_size; |
162 | extern cf_node* g_succession; |
163 | |
164 | extern cf_node g_full_node_seq_table[AS_CLUSTER_SZ * AS_PARTITIONS]; |
165 | extern sl_ix_t g_full_sl_ix_table[AS_CLUSTER_SZ * AS_PARTITIONS]; |
166 | |
167 | |
168 | //------------------------------------------------ |
169 | // Forward declarations. |
170 | // |
171 | |
172 | void partition_balance_init(); |
173 | |
174 | void pb_task_init(pb_task* task, cf_node dest, struct as_namespace_s* ns, uint32_t pid, uint64_t cluster_key, pb_task_type type, uint32_t tx_flags); |
175 | |
176 | void balance_namespace(struct as_namespace_s* ns, cf_queue* mq); |
177 | void prepare_for_appeals(); |
178 | void process_pb_tasks(cf_queue* tq); |
179 | void balance_namespace_ap(struct as_namespace_s* ns, cf_queue* mq); |
180 | void set_active_size(struct as_namespace_s* ns); |
181 | uint32_t rack_count(const struct as_namespace_s* ns); |
182 | void fill_translation(int translation[], const struct as_namespace_s* ns); |
183 | void init_target_claims_ap(const struct as_namespace_s* ns, const int translation[], uint32_t* target_claims); |
184 | void fill_namespace_rows(const cf_node* full_node_seq, const sl_ix_t* full_sl_ix, cf_node* ns_node_seq, sl_ix_t* ns_sl_ix, const struct as_namespace_s* ns, const int translation[]); |
185 | void quiesce_adjust_row(cf_node* ns_node_seq, sl_ix_t* ns_sl_ix, struct as_namespace_s* ns); |
186 | void uniform_adjust_row(cf_node* node_seq, uint32_t n_nodes, sl_ix_t* ns_sl_ix, uint32_t n_replicas, uint32_t* claims, const uint32_t* target_claims, const uint32_t* rack_ids, uint32_t n_racks); |
187 | void rack_aware_adjust_row(cf_node* ns_node_seq, sl_ix_t* ns_sl_ix, uint32_t replication_factor, const uint32_t* rack_ids, uint32_t n_ids, uint32_t n_racks, uint32_t start_n); |
188 | uint32_t find_self(const cf_node* ns_node_seq, const struct as_namespace_s* ns); |
189 | int shift_working_master(const as_partition* p, const sl_ix_t* ns_sl_ix, const struct as_namespace_s* ns, int working_master_n, const as_partition_version* working_master_version); |
190 | uint32_t fill_immigrators(as_partition* p, const sl_ix_t* ns_sl_ix, struct as_namespace_s* ns, uint32_t working_master_n, uint32_t n_dupl); |
191 | void emig_lead_flags_ap(const as_partition* p, const sl_ix_t* ns_sl_ix, const struct as_namespace_s* ns, uint32_t lead_flags[]); |
192 | void queue_namespace_migrations(as_partition* p, struct as_namespace_s* ns, uint32_t self_n, cf_node working_master, uint32_t n_dupl, cf_node dupls[], const uint32_t lead_flags[], cf_queue* mq); |
193 | bool drop_superfluous_version(as_partition* p, struct as_namespace_s* ns); |
194 | bool adjust_superfluous_version(as_partition* p, struct as_namespace_s* ns); |
195 | void fill_witnesses(as_partition* p, const cf_node* ns_node_seq, const sl_ix_t* ns_sl_ix, struct as_namespace_s* ns); |
196 | void handle_version_change(as_partition* p, struct as_namespace_s* ns, as_partition_version* orig_version); |
197 | |
198 | void emigrate_done_advance_non_master_version(struct as_namespace_s* ns, as_partition* p, uint32_t tx_flags); |
199 | void emigrate_done_advance_non_master_version_ap(struct as_namespace_s* ns, as_partition* p, uint32_t tx_flags); |
200 | void immigrate_start_advance_non_master_version(struct as_namespace_s* ns, as_partition* p); |
201 | void immigrate_start_advance_non_master_version_ap(as_partition* p); |
202 | void immigrate_done_advance_final_master_version(struct as_namespace_s* ns, as_partition* p); |
203 | void immigrate_done_advance_final_master_version_ap(struct as_namespace_s* ns, as_partition* p); |
204 | bool immigrate_yield(); |
205 | |
206 | |
207 | //------------------------------------------------ |
208 | // Inlines and macros. |
209 | // |
210 | |
211 | static inline bool |
212 | is_same_as_full_master(const as_partition_version* mv, const as_partition_version* v) |
213 | { |
214 | // Works for CP too, even with family check. |
215 | return v->subset == 0 && mv->ckey == v->ckey && mv->family == v->family && |
216 | mv->family != VERSION_FAMILY_UNIQUE; |
217 | } |
218 | |
219 | // Define macros for accessing the full node-seq and sl-ix arrays. |
220 | #define FULL_NODE_SEQ(x, y) g_full_node_seq_table[(x * g_cluster_size) + y] |
221 | #define FULL_SL_IX(x, y) g_full_sl_ix_table[(x * g_cluster_size) + y] |
222 | |
223 | // Get the partition version that was input by exchange. |
224 | #define INPUT_VERSION(_n) (&ns->cluster_versions[ns_sl_ix[_n]][p->id]) |
225 | |