1/*
2 * partition.h
3 *
4 * Copyright (C) 2016 Aerospike, Inc.
5 *
6 * Portions may be licensed to Aerospike, Inc. under one or more contributor
7 * license agreements.
8 *
9 * This program is free software: you can redistribute it and/or modify it under
10 * the terms of the GNU Affero General Public License as published by the Free
11 * Software Foundation, either version 3 of the License, or (at your option) any
12 * later version.
13 *
14 * This program is distributed in the hope that it will be useful, but WITHOUT
15 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
16 * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
17 * details.
18 *
19 * You should have received a copy of the GNU Affero General Public License
20 * along with this program. If not, see http://www.gnu.org/licenses/
21 */
22
23#pragma once
24
25//==========================================================
26// Includes.
27//
28
29#include <stdbool.h>
30#include <stddef.h>
31#include <stdint.h>
32#include <stdio.h>
33#include <string.h>
34
35#include "citrusleaf/cf_atomic.h"
36#include "citrusleaf/cf_digest.h"
37
38#include "cf_mutex.h"
39#include "dynbuf.h"
40#include "node.h"
41
42#include "base/cfg.h"
43#include "fabric/hb.h"
44
45
46//==========================================================
47// Forward declarations.
48//
49
50struct as_index_tree_s;
51struct as_namespace_s;
52struct as_transaction_s;
53
54
55//==========================================================
56// Typedefs & constants.
57//
58
// Number of partitions. This is a power of 2, so AS_PARTITION_MASK
// (AS_PARTITIONS - 1) is a contiguous low-bit mask - as_partition_getid()
// applies it to the leading bytes of a digest to obtain a partition id.
#define AS_PARTITIONS 4096
#define AS_PARTITION_MASK (AS_PARTITIONS - 1)

// Width in bits of the 'family' bit-field in as_partition_version.
#define VERSION_FAMILY_BITS 4
// All family bits set (15 for a 4-bit field). as_partition_version_as_string()
// renders this family value as 'U' instead of as a hex digit.
#define VERSION_FAMILY_UNIQUE ((1 << VERSION_FAMILY_BITS) - 1)
// Same value as VERSION_FAMILY_UNIQUE - presumably because family values
// 0 .. VERSION_FAMILY_UNIQUE - 1 are the regular families (confirm at usage).
#define AS_PARTITION_N_FAMILIES VERSION_FAMILY_UNIQUE
65
// Partition version, packed into bit-fields that total exactly 64 bits
// (48 + 4 + 8 + 1 + 1 + 1 + 1). The size assert below keeps it that way, which
// is what allows the inline helpers in this file to examine or compare a whole
// version as one 64-bit value. Do not reorder or resize fields casually.
typedef struct as_partition_version_s {
	uint64_t ckey:48; // printed as 12 hex digits; non-zero means "has data"
	uint64_t family:VERSION_FAMILY_BITS; // VERSION_FAMILY_UNIQUE prints as 'U'
	uint64_t unused:8;
	uint64_t revived:1; // enterprise only
	uint64_t master:1; // printed as 'm' when set, '-' otherwise
	uint64_t subset:1; // printed as 's' when set, 'p' otherwise
	uint64_t evade:1; // printed as 'e' when set
} as_partition_version;

// The version must occupy exactly one 64-bit word.
COMPILER_ASSERT(sizeof(as_partition_version) == sizeof(uint64_t));
77
// Fixed-size text buffer wrapped in a struct so that it can be returned by
// value from as_partition_version_as_string(). The current format needs 18
// characters plus the terminating null; the buffer leaves one spare character.
typedef struct as_partition_version_string_s {
	char s[19 + 1]; // format CCCCccccCCCC.F.mse - F may someday be 2 characters
} as_partition_version_string;
81
// Per-partition state.
//
// The layout is hand-tuned: members are grouped by how often they are touched,
// and explicit align_N padding arrays push each group to a 64-byte boundary.
// The byte offsets quoted in the "@" comments depend on the sizes of cf_mutex,
// cf_atomic32/64 and cf_node, which are declared elsewhere. The size assert
// after the struct only checks that the total size is a multiple of 64 - it
// does not verify the individual offsets, so re-check them by hand when adding
// or resizing members.
typedef struct as_partition_s {
	//--------------------------------------------
	// Used during every transaction.
	//

	cf_atomic64 n_tombstones; // relevant only for enterprise edition
	cf_atomic32 max_void_time; // TODO - do we really need this?

	cf_mutex lock;

	struct as_index_tree_s* tree;

	cf_node working_master;

	uint32_t regime; // relevant only for enterprise edition

	uint32_t n_nodes; // relevant only for enterprise edition
	uint32_t n_replicas; // count of valid entries in replicas[]
	uint32_t n_dupl; // presumably count of valid entries in dupls[] - confirm

	// @ 48 bytes - room for 2 replicas within above 64-byte cache line.
	cf_node replicas[AS_CLUSTER_SZ];

	uint8_t align_1[16];
	// @ 64-byte-aligned boundary.

	//--------------------------------------------
	// Used only when cluster is in flux.
	//

	uint32_t id;

	bool must_appeal; // relevant only for enterprise edition

	uint8_t tree_id;
	uint64_t tree_ids_used; // bit map

	as_partition_version final_version;
	as_partition_version version;

	uint16_t pending_emigrations;
	uint16_t pending_lead_emigrations;
	uint16_t pending_immigrations;

	uint16_t n_witnesses; // presumably count of valid entries in witnesses[] - confirm

	// @ 40 bytes - room for 3 duplicates within above 64-byte cache line.
	cf_node dupls[AS_CLUSTER_SZ];

	uint8_t align_2[24];
	// @ 64-byte-aligned boundary.

	bool immigrators[AS_CLUSTER_SZ];
	// Byte alignment depends on AS_CLUSTER_SZ - pad below to realign.

	// Pads only when AS_CLUSTER_SZ is 8 (8 + 56 = 64). Any other value gets no
	// padding here, so it presumably must itself be a multiple of 64 - confirm.
	// The zero-length case relies on a compiler extension.
	uint8_t align_3[AS_CLUSTER_SZ == 8 ? 56 : 0];
	// @ 64-byte-aligned boundary.

	cf_node witnesses[AS_CLUSTER_SZ];
} as_partition;

// Total size must be a whole number of 64-byte cache lines.
COMPILER_ASSERT(sizeof(as_partition) % 64 == 0);
144
// Handle filled in by the as_partition_reserve*() calls and handed back to
// as_partition_release(). AS_PARTITION_RESERVATION_INIT() resets every member
// except dupl_nodes[].
typedef struct as_partition_reservation_s {
	struct as_namespace_s* ns;
	as_partition* p;
	struct as_index_tree_s* tree;
	uint32_t regime;
	uint32_t n_dupl; // presumably count of valid entries in dupl_nodes[] - confirm
	cf_node dupl_nodes[AS_CLUSTER_SZ];
} as_partition_reservation;
153
// Output of as_partition_get_replica_stats(): object and tombstone counters,
// each split three ways - master, prole and non-replica.
typedef struct repl_stats_s {
	uint64_t n_master_objects;
	uint64_t n_prole_objects;
	uint64_t n_non_replica_objects;
	uint64_t n_master_tombstones;
	uint64_t n_prole_tombstones;
	uint64_t n_non_replica_tombstones;
} repl_stats;
162
// Bytes needed for one bit per partition, rounded up (512 for 4096 partitions).
#define CLIENT_BITMAP_BYTES ((AS_PARTITIONS + 7) / 8)
// Length of the bitmap once base64-encoded: 4 output characters per group of
// 3 input bytes, rounded up (684 for 512 bytes). No room for a null terminator
// is included.
#define CLIENT_B64MAP_BYTES (((CLIENT_BITMAP_BYTES + 2) / 3) * 4)

// A partition bitmap kept alongside a second array sized for its base64
// encoding. Presumably b64map holds the encoded form of bitmap and is kept in
// sync under write_lock - confirm in the implementation.
typedef struct client_replica_map_s {
	cf_mutex write_lock;

	// NOTE(review): both arrays are volatile - presumably they are read without
	// taking write_lock; verify against readers.
	volatile uint8_t bitmap[CLIENT_BITMAP_BYTES];
	volatile char b64map[CLIENT_B64MAP_BYTES];
} client_replica_map;
172
// Migration result codes. Values are spelled out explicitly; they match the
// implicit numbering (0, 1, 2) and must not change.
typedef enum {
	AS_MIGRATE_OK = 0,
	AS_MIGRATE_FAIL = 1,
	AS_MIGRATE_AGAIN = 2
} as_migrate_result;
178
179
180//==========================================================
181// Public API.
182//
183
// Implementations are not in this header - the grouping notes below describe
// signatures only.

// Per-partition setup and teardown, addressed by namespace and partition id.
void as_partition_init(struct as_namespace_s* ns, uint32_t pid);
void as_partition_shutdown(struct as_namespace_s* ns, uint32_t pid);

// Operations on a single partition object.
void as_partition_isolate_version(const struct as_namespace_s* ns, as_partition* p);
// Reports through the from_replica out-parameter as well as the int return.
int as_partition_check_source(const struct as_namespace_s* ns, as_partition* p, cf_node src, bool* from_replica);
void as_partition_freeze(as_partition* p);

// Writes nodes into nv - presumably the return value is how many were written,
// and nv must hold at least AS_CLUSTER_SZ entries (confirm).
uint32_t as_partition_get_other_replicas(as_partition* p, cf_node* nv);

// Node lookups for a namespace/partition pair.
cf_node as_partition_writable_node(struct as_namespace_s* ns, uint32_t pid);
cf_node as_partition_proxyee_redirect(struct as_namespace_s* ns, uint32_t pid);

// Text output appended to a caller-supplied dynamic buffer.
void as_partition_get_replicas_master_str(cf_dyn_buf* db);
void as_partition_get_replicas_all_str(cf_dyn_buf* db, bool include_regime);

// Fills the caller-supplied repl_stats.
void as_partition_get_replica_stats(struct as_namespace_s* ns, repl_stats* p_stats);

// Reservations. Each call fills a caller-supplied as_partition_reservation
// (or, for the _tr variant, works through the transaction). The int-returning
// variants presumably signal success or failure - confirm the convention in
// the implementation. The cf_node* node parameters are out-parameters.
void as_partition_reserve(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv);
int as_partition_reserve_replica(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv);
int as_partition_reserve_write(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv, cf_node* node);
int as_partition_reserve_read_tr(struct as_namespace_s* ns, uint32_t pid, struct as_transaction_s* tr, cf_node* node);
// Array parameters - presumably indexed by partition id with AS_PARTITIONS
// entries each (confirm).
int as_partition_prereserve_query(struct as_namespace_s* ns, bool can_partition_query[], as_partition_reservation rsv[]);
int as_partition_reserve_query(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv);
int as_partition_reserve_xdr_read(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv);
void as_partition_reservation_copy(as_partition_reservation* dst, as_partition_reservation* src);

// Counterpart to the reserve calls above.
void as_partition_release(as_partition_reservation* rsv);

// Tree id management - see tree_id and tree_ids_used in as_partition.
void as_partition_advance_tree_id(as_partition* p, const char* ns_name);
// Signature has the shape of a callback (id plus opaque udata).
void as_partition_tree_done(uint8_t id, void* udata);

// Text output appended to a caller-supplied dynamic buffer.
void as_partition_getinfo_str(cf_dyn_buf* db);
216
217// Use VERSION_AS_STRING() - see below.
218static inline as_partition_version_string
219as_partition_version_as_string(const as_partition_version* version)
220{
221 as_partition_version_string str;
222
223 if (version->family == VERSION_FAMILY_UNIQUE) {
224 sprintf(str.s, "%012lx.U.%c%c%c", (uint64_t)version->ckey,
225 version->master == 0 ? '-' : 'm',
226 version->subset == 0 ? 'p' : 's',
227 version->evade == 0 ? '-' : 'e');
228 }
229 else {
230 sprintf(str.s, "%012lx.%X.%c%c%c", (uint64_t)version->ckey,
231 (uint32_t)version->family,
232 version->master == 0 ? '-' : 'm',
233 version->subset == 0 ? 'p' : 's',
234 version->evade == 0 ?
235 (version->revived == 0 ? '-' : 'r') : 'e');
236 }
237
238 return str;
239}
240
241static inline bool
242as_partition_version_is_null(const as_partition_version* version)
243{
244 return *(uint64_t*)version == 0;
245}
246
247static inline bool
248as_partition_version_has_data(const as_partition_version* version)
249{
250 return version->ckey != 0;
251}
252
253static inline bool
254as_partition_version_same(const as_partition_version* v1, const as_partition_version* v2)
255{
256 return *(uint64_t*)v1 == *(uint64_t*)v2;
257}
258
259static inline uint32_t
260as_partition_getid(const cf_digest* d)
261{
262 return *(uint32_t*)d & AS_PARTITION_MASK;
263}
264
265static inline int
266find_self_in_replicas(const as_partition* p)
267{
268 return index_of_node(p->replicas, p->n_replicas, g_config.self_node);
269}
270
271static inline bool
272is_self_replica(const as_partition* p)
273{
274 return contains_node(p->replicas, p->n_replicas, g_config.self_node);
275}
276
277static inline bool
278contains_self(const cf_node* nodes, uint32_t n_nodes)
279{
280 return contains_node(nodes, n_nodes, g_config.self_node);
281}
282
// 16-bit "no partition" value. 0xFFFF is above the largest id that
// as_partition_getid() can produce (AS_PARTITIONS - 1), so it cannot collide
// with a real partition id.
#define AS_PARTITION_ID_UNDEF ((uint16_t)0xFFFF)
284
// Reset the pointer and counter members of a reservation (ns, p, tree, regime,
// n_dupl). The dupl_nodes array is left untouched.
//
// Takes the reservation itself (an lvalue), not a pointer to it. Invoke it as
// a statement, followed by a semicolon:
//
//     AS_PARTITION_RESERVATION_INIT(rsv);
//
// The body is wrapped in do { } while (0) so that the whole reset is one
// statement - it stays intact under an unbraced if/else. The argument is
// parenthesized so that expressions such as *rsv_ptr work.
#define AS_PARTITION_RESERVATION_INIT(rsv_) \
	do { \
		(rsv_).ns = NULL; \
		(rsv_).p = NULL; \
		(rsv_).tree = NULL; \
		(rsv_).regime = 0; \
		(rsv_).n_dupl = 0; \
	} while (0)
291
// Convenience wrapper yielding a char* for a version pointer, intended for
// direct use as a printf-style argument. The pointer refers to the array
// inside a temporary struct returned by value, so it is only valid within the
// full expression containing the macro - do not store it for later use.
#define VERSION_AS_STRING(v_ptr) (as_partition_version_as_string(v_ptr).s)
293
294
295//==========================================================
296// Public API - client view replica maps.
297//
298
// Implementations are not in this header. All four operate on a namespace's
// client replica maps (see client_replica_map); update and is_partition_queryable
// address a single partition by id. The meaning of update's bool return is
// presumably "the map changed" - confirm in the implementation.
void client_replica_maps_create(struct as_namespace_s* ns);
void client_replica_maps_clear(struct as_namespace_s* ns);
bool client_replica_maps_update(struct as_namespace_s* ns, uint32_t pid);
bool client_replica_maps_is_partition_queryable(const struct as_namespace_s* ns, uint32_t pid);
303
304
305//==========================================================
306// Private API - for enterprise separation only.
307//
308
// Hooks whose implementations live outside this header - per the section title
// they exist so that editions can supply different behavior. Both take the
// namespace and partition as read-only; the cf_node* node parameter is an
// out-parameter. Return conventions are not visible here - confirm in the
// implementation.
int partition_reserve_unavailable(const struct as_namespace_s* ns, const as_partition* p, struct as_transaction_s* tr, cf_node* node);
bool partition_reserve_promote(const struct as_namespace_s* ns, const as_partition* p, struct as_transaction_s* tr);
311