1/*
2 * partition_balance.h
3 *
4 * Copyright (C) 2016-2018 Aerospike, Inc.
5 *
6 * Portions may be licensed to Aerospike, Inc. under one or more contributor
7 * license agreements.
8 *
9 * This program is free software: you can redistribute it and/or modify it under
10 * the terms of the GNU Affero General Public License as published by the Free
11 * Software Foundation, either version 3 of the License, or (at your option) any
12 * later version.
13 *
14 * This program is distributed in the hope that it will be useful, but WITHOUT
15 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
16 * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
17 * details.
18 *
19 * You should have received a copy of the GNU Affero General Public License
20 * along with this program. If not, see http://www.gnu.org/licenses/
21 */
22
23#pragma once
24
25//==========================================================
26// Includes.
27//
28
29#include <stdbool.h>
30#include <stdint.h>
31
32#include "citrusleaf/cf_atomic.h"
33#include "citrusleaf/cf_queue.h"
34
35#include "dynbuf.h"
36#include "fault.h"
37#include "node.h"
38
39#include "fabric/hb.h"
40#include "fabric/partition.h"
41
42
43//==========================================================
44// Forward declarations.
45//
46
47struct as_namespace_s;
48
49
50//==========================================================
51// Typedefs & constants.
52//
53
54typedef enum {
55 PB_TASK_EMIG_TRANSFER,
56 PB_TASK_EMIG_SIGNAL_ALL_DONE,
57 PB_TASK_APPEAL
58} pb_task_type;
59
60typedef struct pb_task_s {
61 cf_node dest;
62 struct as_namespace_s* ns;
63 uint32_t pid;
64 uint64_t cluster_key;
65 pb_task_type type;
66 uint32_t tx_flags;
67} pb_task;
68
69#define MAX_RACK_ID 1000000
70#define MAX_RACK_ID_LEN 7 // number of decimal characters
71
72
73//==========================================================
74// Public API - regulate migrations.
75//
76
77void as_partition_balance_disallow_migrations();
78bool as_partition_balance_are_migrations_allowed();
79void as_partition_balance_synchronize_migrations();
80void as_partition_balance_emigration_yield();
81
82
83//==========================================================
84// Public API - balance partitions.
85//
86
87void as_partition_balance_init();
88bool as_partition_balance_is_init_resolved();
89void as_partition_balance_revert_to_orphan();
90void as_partition_balance();
91
92uint64_t as_partition_balance_remaining_migrations();
93bool as_partition_balance_revive(struct as_namespace_s* ns);
94void as_partition_balance_protect_roster_set(struct as_namespace_s* ns);
95void as_partition_balance_effective_rack_ids(cf_dyn_buf* db);
96
97
98//==========================================================
99// Public API - migration-related as_partition methods.
100//
101
102bool as_partition_pending_migrations(as_partition* p);
103
104bool as_partition_pre_emigrate_done(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key, uint32_t tx_flags);
105void as_partition_emigrate_done(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key, cf_node dest_node, uint32_t tx_flags);
106as_migrate_result as_partition_immigrate_start(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key, cf_node source_node);
107as_migrate_result as_partition_immigrate_done(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key, cf_node source_node);
108as_migrate_result as_partition_migrations_all_done(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key);
109void as_partition_signal_done(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key);
110
111// Counter that tells clients partition ownership has changed.
112extern cf_atomic32 g_partition_generation;
113
114// Time of last rebalance.
115extern uint64_t g_rebalance_sec;
116
117// Count rebalances.
118extern uint64_t g_rebalance_generation;
119
120
121//==========================================================
122// Private API - for enterprise separation only.
123//
124
125//------------------------------------------------
126// Typedefs & constants.
127//
128
129COMPILER_ASSERT((AS_CLUSTER_SZ & (AS_CLUSTER_SZ - 1)) == 0);
130
131#define AS_CLUSTER_SZ_MASKP (-(uint64_t)AS_CLUSTER_SZ)
132#define AS_CLUSTER_SZ_MASKN ((uint64_t)AS_CLUSTER_SZ - 1)
133
134typedef uint8_t sl_ix_t;
135
136COMPILER_ASSERT(AS_CLUSTER_SZ_MASKN >> (sizeof(sl_ix_t) * 8) == 0);
137
138typedef struct inter_hash_s {
139 uint64_t hashed_node;
140 uint64_t hashed_pid;
141} inter_hash;
142
143extern const as_partition_version ZERO_VERSION;
144
145#define REBALANCE_FLUSH_SIZE 4096
146#define PMETA_SIZE 16 // sizeof(ssd_common_pmeta) without including drv_ssd.h
147
148#define PIDS_PER_GROUP (REBALANCE_FLUSH_SIZE / PMETA_SIZE) // 256
149#define NUM_PID_GROUPS (AS_PARTITIONS / PIDS_PER_GROUP) // 16
150
151
152//------------------------------------------------
153// Globals.
154//
155
156extern volatile int g_allow_migrations;
157
158extern uint64_t g_hashed_pids[AS_PARTITIONS];
159
160// Shortcuts to values set by as_exchange, for use in partition balance only.
161extern uint32_t g_cluster_size;
162extern cf_node* g_succession;
163
164extern cf_node g_full_node_seq_table[AS_CLUSTER_SZ * AS_PARTITIONS];
165extern sl_ix_t g_full_sl_ix_table[AS_CLUSTER_SZ * AS_PARTITIONS];
166
167
168//------------------------------------------------
169// Forward declarations.
170//
171
172void partition_balance_init();
173
174void pb_task_init(pb_task* task, cf_node dest, struct as_namespace_s* ns, uint32_t pid, uint64_t cluster_key, pb_task_type type, uint32_t tx_flags);
175
176void balance_namespace(struct as_namespace_s* ns, cf_queue* mq);
177void prepare_for_appeals();
178void process_pb_tasks(cf_queue* tq);
179void balance_namespace_ap(struct as_namespace_s* ns, cf_queue* mq);
180void set_active_size(struct as_namespace_s* ns);
181uint32_t rack_count(const struct as_namespace_s* ns);
182void fill_translation(int translation[], const struct as_namespace_s* ns);
183void init_target_claims_ap(const struct as_namespace_s* ns, const int translation[], uint32_t* target_claims);
184void fill_namespace_rows(const cf_node* full_node_seq, const sl_ix_t* full_sl_ix, cf_node* ns_node_seq, sl_ix_t* ns_sl_ix, const struct as_namespace_s* ns, const int translation[]);
185void quiesce_adjust_row(cf_node* ns_node_seq, sl_ix_t* ns_sl_ix, struct as_namespace_s* ns);
186void uniform_adjust_row(cf_node* node_seq, uint32_t n_nodes, sl_ix_t* ns_sl_ix, uint32_t n_replicas, uint32_t* claims, const uint32_t* target_claims, const uint32_t* rack_ids, uint32_t n_racks);
187void rack_aware_adjust_row(cf_node* ns_node_seq, sl_ix_t* ns_sl_ix, uint32_t replication_factor, const uint32_t* rack_ids, uint32_t n_ids, uint32_t n_racks, uint32_t start_n);
188uint32_t find_self(const cf_node* ns_node_seq, const struct as_namespace_s* ns);
189int shift_working_master(const as_partition* p, const sl_ix_t* ns_sl_ix, const struct as_namespace_s* ns, int working_master_n, const as_partition_version* working_master_version);
190uint32_t fill_immigrators(as_partition* p, const sl_ix_t* ns_sl_ix, struct as_namespace_s* ns, uint32_t working_master_n, uint32_t n_dupl);
191void emig_lead_flags_ap(const as_partition* p, const sl_ix_t* ns_sl_ix, const struct as_namespace_s* ns, uint32_t lead_flags[]);
192void queue_namespace_migrations(as_partition* p, struct as_namespace_s* ns, uint32_t self_n, cf_node working_master, uint32_t n_dupl, cf_node dupls[], const uint32_t lead_flags[], cf_queue* mq);
193bool drop_superfluous_version(as_partition* p, struct as_namespace_s* ns);
194bool adjust_superfluous_version(as_partition* p, struct as_namespace_s* ns);
195void fill_witnesses(as_partition* p, const cf_node* ns_node_seq, const sl_ix_t* ns_sl_ix, struct as_namespace_s* ns);
196void handle_version_change(as_partition* p, struct as_namespace_s* ns, as_partition_version* orig_version);
197
198void emigrate_done_advance_non_master_version(struct as_namespace_s* ns, as_partition* p, uint32_t tx_flags);
199void emigrate_done_advance_non_master_version_ap(struct as_namespace_s* ns, as_partition* p, uint32_t tx_flags);
200void immigrate_start_advance_non_master_version(struct as_namespace_s* ns, as_partition* p);
201void immigrate_start_advance_non_master_version_ap(as_partition* p);
202void immigrate_done_advance_final_master_version(struct as_namespace_s* ns, as_partition* p);
203void immigrate_done_advance_final_master_version_ap(struct as_namespace_s* ns, as_partition* p);
204bool immigrate_yield();
205
206
207//------------------------------------------------
208// Inlines and macros.
209//
210
211static inline bool
212is_same_as_full_master(const as_partition_version* mv, const as_partition_version* v)
213{
214 // Works for CP too, even with family check.
215 return v->subset == 0 && mv->ckey == v->ckey && mv->family == v->family &&
216 mv->family != VERSION_FAMILY_UNIQUE;
217}
218
219// Define macros for accessing the full node-seq and sl-ix arrays.
220#define FULL_NODE_SEQ(x, y) g_full_node_seq_table[(x * g_cluster_size) + y]
221#define FULL_SL_IX(x, y) g_full_sl_ix_table[(x * g_cluster_size) + y]
222
223// Get the partition version that was input by exchange.
224#define INPUT_VERSION(_n) (&ns->cluster_versions[ns_sl_ix[_n]][p->id])
225