1 | /* |
2 | * partition.h |
3 | * |
4 | * Copyright (C) 2016 Aerospike, Inc. |
5 | * |
6 | * Portions may be licensed to Aerospike, Inc. under one or more contributor |
7 | * license agreements. |
8 | * |
9 | * This program is free software: you can redistribute it and/or modify it under |
10 | * the terms of the GNU Affero General Public License as published by the Free |
11 | * Software Foundation, either version 3 of the License, or (at your option) any |
12 | * later version. |
13 | * |
14 | * This program is distributed in the hope that it will be useful, but WITHOUT |
15 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
16 | * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
17 | * details. |
18 | * |
19 | * You should have received a copy of the GNU Affero General Public License |
20 | * along with this program. If not, see http://www.gnu.org/licenses/ |
21 | */ |
22 | |
23 | #pragma once |
24 | |
25 | //========================================================== |
26 | // Includes. |
27 | // |
28 | |
29 | #include <stdbool.h> |
30 | #include <stddef.h> |
31 | #include <stdint.h> |
32 | #include <stdio.h> |
33 | #include <string.h> |
34 | |
35 | #include "citrusleaf/cf_atomic.h" |
36 | #include "citrusleaf/cf_digest.h" |
37 | |
38 | #include "cf_mutex.h" |
39 | #include "dynbuf.h" |
40 | #include "node.h" |
41 | |
42 | #include "base/cfg.h" |
43 | #include "fabric/hb.h" |
44 | |
45 | |
46 | //========================================================== |
47 | // Forward declarations. |
48 | // |
49 | |
50 | struct as_index_tree_s; |
51 | struct as_namespace_s; |
52 | struct as_transaction_s; |
53 | |
54 | |
55 | //========================================================== |
56 | // Typedefs & constants. |
57 | // |
58 | |
// Partition count. as_partition_getid() derives a partition id by masking with
// AS_PARTITION_MASK, which is only a valid mask while AS_PARTITIONS is a power
// of 2.
#define AS_PARTITIONS 4096
#define AS_PARTITION_MASK (AS_PARTITIONS - 1)

// Width of the 'family' bit field in as_partition_version. The all-ones value
// of that field is VERSION_FAMILY_UNIQUE, which leaves the values
// 0 .. VERSION_FAMILY_UNIQUE - 1 (AS_PARTITION_N_FAMILIES of them) for
// non-unique families.
#define VERSION_FAMILY_BITS 4
#define VERSION_FAMILY_UNIQUE ((1 << VERSION_FAMILY_BITS) - 1)
#define AS_PARTITION_N_FAMILIES VERSION_FAMILY_UNIQUE
65 | |
// Partition version, packed into bit fields totalling exactly 64 bits
// (48 + 4 + 8 + 1 + 1 + 1 + 1). The COMPILER_ASSERT below enforces the size,
// which the helpers in this header rely on to treat a version as one 64-bit
// word. Do not add or resize fields without keeping the total at 64 bits.
typedef struct as_partition_version_s {
	uint64_t ckey:48; // non-zero means "has data" - see as_partition_version_has_data()
	uint64_t family:VERSION_FAMILY_BITS; // VERSION_FAMILY_UNIQUE is shown as 'U'
	uint64_t unused:8;
	uint64_t revived:1; // enterprise only
	uint64_t master:1; // shown as 'm' when set, '-' otherwise
	uint64_t subset:1; // shown as 's' when set, 'p' otherwise
	uint64_t evade:1; // shown as 'e' when set
} as_partition_version;

COMPILER_ASSERT(sizeof(as_partition_version) == sizeof(uint64_t));
77 | |
// Fixed-size text form of an as_partition_version. Wrapping the array in a
// struct lets as_partition_version_as_string() return it by value.
// The current format needs 18 characters plus the null terminator; the buffer
// reserves one spare character.
typedef struct as_partition_version_string_s {
	char s[19 + 1]; // format CCCCccccCCCC.F.mse - F may someday be 2 characters
} as_partition_version_string;
81 | |
// Per-partition state.
//
// Field order and the align_N pads are deliberate: the struct is laid out in
// 64-byte cache-line units, with the fields touched by every transaction
// grouped first. The "@ N bytes" comments record the running offset within the
// current 64-byte unit and must be re-derived if any field is added, removed,
// resized or reordered. The COMPILER_ASSERT below only checks that the total
// size is a multiple of 64 - it does not verify the intermediate boundaries.
typedef struct as_partition_s {
	//--------------------------------------------
	// Used during every transaction.
	//

	cf_atomic64 n_tombstones; // relevant only for enterprise edition
	cf_atomic32 max_void_time; // TODO - do we really need this?

	cf_mutex lock;

	struct as_index_tree_s* tree;

	cf_node working_master;

	uint32_t regime; // relevant only for enterprise edition

	uint32_t n_nodes; // relevant only for enterprise edition
	uint32_t n_replicas; // number of valid entries in replicas[]
	uint32_t n_dupl;

	// @ 48 bytes - room for 2 replicas within above 64-byte cache line.
	cf_node replicas[AS_CLUSTER_SZ];

	// Pad to the next 64-byte boundary (48 + 16 = 64, and replicas[] itself is
	// a multiple of 64 bytes when AS_CLUSTER_SZ is a multiple of 8).
	uint8_t align_1[16];
	// @ 64-byte-aligned boundary.

	//--------------------------------------------
	// Used only when cluster is in flux.
	//

	uint32_t id;

	bool must_appeal; // relevant only for enterprise edition

	uint8_t tree_id;
	uint64_t tree_ids_used; // bit map

	as_partition_version final_version;
	as_partition_version version;

	uint16_t pending_emigrations;
	uint16_t pending_lead_emigrations;
	uint16_t pending_immigrations;

	uint16_t n_witnesses;

	// @ 40 bytes - room for 3 duplicates within above 64-byte cache line.
	cf_node dupls[AS_CLUSTER_SZ];

	// Pad to the next 64-byte boundary (40 + 24 = 64).
	uint8_t align_2[24];
	// @ 64-byte-aligned boundary.

	bool immigrators[AS_CLUSTER_SZ];
	// Byte alignment depends on AS_CLUSTER_SZ - pad below to realign.

	// Only an 8-node build leaves immigrators[] short of a 64-byte unit
	// (8 + 56 = 64). For any other size this is a zero-length array, which is
	// a compiler extension rather than standard C.
	uint8_t align_3[AS_CLUSTER_SZ == 8 ? 56 : 0];
	// @ 64-byte-aligned boundary.

	cf_node witnesses[AS_CLUSTER_SZ];
} as_partition;

COMPILER_ASSERT(sizeof(as_partition) % 64 == 0);
144 | |
// Caller-owned record filled in by the as_partition_reserve*() functions and
// handed back via as_partition_release(). AS_PARTITION_RESERVATION_INIT()
// resets every member except dupl_nodes[].
typedef struct as_partition_reservation_s {
	struct as_namespace_s* ns;
	as_partition* p;
	struct as_index_tree_s* tree;
	uint32_t regime;
	uint32_t n_dupl; // presumably the number of valid entries in dupl_nodes[] - TODO confirm
	cf_node dupl_nodes[AS_CLUSTER_SZ];
} as_partition_reservation;
153 | |
// Output record for as_partition_get_replica_stats(): object and tombstone
// counts, each split into master / prole / non-replica buckets.
typedef struct repl_stats_s {
	uint64_t n_master_objects;
	uint64_t n_prole_objects;
	uint64_t n_non_replica_objects;
	uint64_t n_master_tombstones;
	uint64_t n_prole_tombstones;
	uint64_t n_non_replica_tombstones;
} repl_stats;
162 | |
// One bit per partition, rounded up to whole bytes.
#define CLIENT_BITMAP_BYTES ((AS_PARTITIONS + 7) / 8)
// 4 characters per 3 bitmap bytes, rounded up - this matches the length of a
// padded base64 encoding of the bitmap. There is no extra byte for a null
// terminator.
#define CLIENT_B64MAP_BYTES (((CLIENT_BITMAP_BYTES + 2) / 3) * 4)

// A partition bitmap kept in two forms: raw bits and encoded text.
// NOTE(review): write_lock presumably serializes writers while readers access
// the volatile arrays without taking it - confirm against the implementation.
typedef struct client_replica_map_s {
	cf_mutex write_lock;

	volatile uint8_t bitmap[CLIENT_BITMAP_BYTES];
	volatile char b64map[CLIENT_B64MAP_BYTES]; // exactly sized - not null-terminated
} client_replica_map;
172 | |
// Migration outcome codes. The values are implicit (0, 1, 2 in declaration
// order), so reordering or inserting enumerators changes them.
typedef enum {
	AS_MIGRATE_OK,
	AS_MIGRATE_FAIL,
	AS_MIGRATE_AGAIN
} as_migrate_result;
178 | |
179 | |
180 | //========================================================== |
181 | // Public API. |
182 | // |
183 | |
// Per-partition setup and teardown, addressed by namespace and partition id.
void as_partition_init(struct as_namespace_s* ns, uint32_t pid);
void as_partition_shutdown(struct as_namespace_s* ns, uint32_t pid);

// Operations on a partition the caller already holds a pointer to.
// NOTE(review): whether the caller must hold p->lock is not visible from this
// header - confirm in the implementation.
void as_partition_isolate_version(const struct as_namespace_s* ns, as_partition* p);
int as_partition_check_source(const struct as_namespace_s* ns, as_partition* p, cf_node src, bool* from_replica);
void as_partition_freeze(as_partition* p);

// Writes node ids to the caller-provided array nv and returns a count.
// NOTE(review): nv presumably needs room for AS_CLUSTER_SZ entries - confirm.
uint32_t as_partition_get_other_replicas(as_partition* p, cf_node* nv);

cf_node as_partition_writable_node(struct as_namespace_s* ns, uint32_t pid);
cf_node as_partition_proxyee_redirect(struct as_namespace_s* ns, uint32_t pid);

// Text output appended to a caller-provided dynamic buffer.
void as_partition_get_replicas_master_str(cf_dyn_buf* db);
void as_partition_get_replicas_all_str(cf_dyn_buf* db, bool include_regime);

// Fills the caller-provided repl_stats.
void as_partition_get_replica_stats(struct as_namespace_s* ns, repl_stats* p_stats);

// Reservations. Each call fills a caller-provided as_partition_reservation (or
// works through the transaction, for the _tr variant). The variants returning
// int can evidently fail; those taking a cf_node* have an extra node output.
// NOTE(review): return code values and the meaning of *node are defined in the
// implementation - confirm there before relying on them.
void as_partition_reserve(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv);
int as_partition_reserve_replica(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv);
int as_partition_reserve_write(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv, cf_node* node);
int as_partition_reserve_read_tr(struct as_namespace_s* ns, uint32_t pid, struct as_transaction_s* tr, cf_node* node);
// NOTE(review): the array parameters are presumably indexed by partition id
// and sized AS_PARTITIONS - confirm.
int as_partition_prereserve_query(struct as_namespace_s* ns, bool can_partition_query[], as_partition_reservation rsv[]);
int as_partition_reserve_query(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv);
int as_partition_reserve_xdr_read(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv);
void as_partition_reservation_copy(as_partition_reservation* dst, as_partition_reservation* src);

// Counterpart of the reserve calls above.
void as_partition_release(as_partition_reservation* rsv);

// Tree id management - see tree_id and tree_ids_used in as_partition.
void as_partition_advance_tree_id(as_partition* p, const char* ns_name);
// Signature has the shape of a callback (opaque udata).
void as_partition_tree_done(uint8_t id, void* udata);

void as_partition_getinfo_str(cf_dyn_buf* db);
216 | |
217 | // Use VERSION_AS_STRING() - see below. |
218 | static inline as_partition_version_string |
219 | as_partition_version_as_string(const as_partition_version* version) |
220 | { |
221 | as_partition_version_string str; |
222 | |
223 | if (version->family == VERSION_FAMILY_UNIQUE) { |
224 | sprintf(str.s, "%012lx.U.%c%c%c" , (uint64_t)version->ckey, |
225 | version->master == 0 ? '-' : 'm', |
226 | version->subset == 0 ? 'p' : 's', |
227 | version->evade == 0 ? '-' : 'e'); |
228 | } |
229 | else { |
230 | sprintf(str.s, "%012lx.%X.%c%c%c" , (uint64_t)version->ckey, |
231 | (uint32_t)version->family, |
232 | version->master == 0 ? '-' : 'm', |
233 | version->subset == 0 ? 'p' : 's', |
234 | version->evade == 0 ? |
235 | (version->revived == 0 ? '-' : 'r') : 'e'); |
236 | } |
237 | |
238 | return str; |
239 | } |
240 | |
241 | static inline bool |
242 | as_partition_version_is_null(const as_partition_version* version) |
243 | { |
244 | return *(uint64_t*)version == 0; |
245 | } |
246 | |
247 | static inline bool |
248 | as_partition_version_has_data(const as_partition_version* version) |
249 | { |
250 | return version->ckey != 0; |
251 | } |
252 | |
253 | static inline bool |
254 | as_partition_version_same(const as_partition_version* v1, const as_partition_version* v2) |
255 | { |
256 | return *(uint64_t*)v1 == *(uint64_t*)v2; |
257 | } |
258 | |
259 | static inline uint32_t |
260 | as_partition_getid(const cf_digest* d) |
261 | { |
262 | return *(uint32_t*)d & AS_PARTITION_MASK; |
263 | } |
264 | |
265 | static inline int |
266 | find_self_in_replicas(const as_partition* p) |
267 | { |
268 | return index_of_node(p->replicas, p->n_replicas, g_config.self_node); |
269 | } |
270 | |
271 | static inline bool |
272 | is_self_replica(const as_partition* p) |
273 | { |
274 | return contains_node(p->replicas, p->n_replicas, g_config.self_node); |
275 | } |
276 | |
277 | static inline bool |
278 | contains_self(const cf_node* nodes, uint32_t n_nodes) |
279 | { |
280 | return contains_node(nodes, n_nodes, g_config.self_node); |
281 | } |
282 | |
// 16-bit all-ones value, named as the "undefined" partition id. It lies outside
// the valid id range 0 .. AS_PARTITIONS - 1.
#define AS_PARTITION_ID_UNDEF ((uint16_t)0xFFFF)

// Reset the pointer and counter members of an as_partition_reservation lvalue.
// dupl_nodes[] is left untouched.
//
// The argument is parenthesized so that expressions such as *rsv_ptr expand
// correctly. Cautions:
// - This expands to several statements, each with its own semicolon - do not
//   use it as the unbraced body of an if/else/loop.
// - The argument is evaluated five times - pass an expression without side
//   effects.
#define AS_PARTITION_RESERVATION_INIT(rsv_) \
	(rsv_).ns = NULL; \
	(rsv_).p = NULL; \
	(rsv_).tree = NULL; \
	(rsv_).regime = 0; \
	(rsv_).n_dupl = 0;

// Convenience wrapper yielding the char array inside the struct returned by
// as_partition_version_as_string(). That struct is a temporary - use the result
// only within the enclosing full expression (e.g. directly as a printf-style
// argument), and never store the pointer.
#define VERSION_AS_STRING(v_ptr) (as_partition_version_as_string(v_ptr).s)
293 | |
294 | |
295 | //========================================================== |
296 | // Public API - client view replica maps. |
297 | // |
298 | |
// Create and clear operate on a whole namespace; update and the queryable
// check address a single partition id.
// NOTE(review): the bool from client_replica_maps_update() presumably reports
// whether anything changed - confirm in the implementation.
void client_replica_maps_create(struct as_namespace_s* ns);
void client_replica_maps_clear(struct as_namespace_s* ns);
bool client_replica_maps_update(struct as_namespace_s* ns, uint32_t pid);
bool client_replica_maps_is_partition_queryable(const struct as_namespace_s* ns, uint32_t pid);
303 | |
304 | |
305 | //========================================================== |
306 | // Private API - for enterprise separation only. |
307 | // |
308 | |
// Hooks declared here so that edition-specific code can supply them. Both take
// the namespace and partition as const and work through the transaction.
// NOTE(review): return code values and the meaning of *node are not visible
// from this header - confirm in the implementations.
int partition_reserve_unavailable(const struct as_namespace_s* ns, const as_partition* p, struct as_transaction_s* tr, cf_node* node);
bool partition_reserve_promote(const struct as_namespace_s* ns, const as_partition* p, struct as_transaction_s* tr);
311 | |