1 | /* |
2 | * drv_ssd.h |
3 | * |
4 | * Copyright (C) 2014 Aerospike, Inc. |
5 | * |
6 | * Portions may be licensed to Aerospike, Inc. under one or more contributor |
7 | * license agreements. |
8 | * |
9 | * This program is free software: you can redistribute it and/or modify it under |
10 | * the terms of the GNU Affero General Public License as published by the Free |
11 | * Software Foundation, either version 3 of the License, or (at your option) any |
12 | * later version. |
13 | * |
14 | * This program is distributed in the hope that it will be useful, but WITHOUT |
15 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
16 | * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
17 | * details. |
18 | * |
19 | * You should have received a copy of the GNU Affero General Public License |
20 | * along with this program. If not, see http://www.gnu.org/licenses/ |
21 | */ |
22 | |
23 | #pragma once |
24 | |
25 | //========================================================== |
26 | // Includes. |
27 | // |
28 | |
29 | #include <errno.h> |
30 | #include <stdbool.h> |
31 | #include <stddef.h> |
32 | #include <stdint.h> |
33 | #include <sys/types.h> |
34 | #include <unistd.h> |
35 | |
36 | #include "citrusleaf/cf_atomic.h" |
37 | #include "citrusleaf/cf_queue.h" |
38 | |
39 | #include "cf_mutex.h" |
40 | #include "cf_thread.h" |
41 | #include "fault.h" |
42 | #include "hist.h" |
43 | |
44 | #include "base/datamodel.h" |
45 | #include "fabric/partition.h" |
46 | #include "storage/flat.h" |
47 | #include "storage/storage.h" |
48 | |
49 | |
50 | //========================================================== |
51 | // Forward declarations. |
52 | // |
53 | |
54 | struct as_flat_opt_meta_s; |
55 | struct as_flat_record_s; |
56 | struct as_index_s; |
57 | struct as_namespace_s; |
58 | struct as_storage_rd_s; |
59 | struct drv_ssd_s; |
60 | |
61 | |
62 | //========================================================== |
63 | // Typedefs & constants. |
64 | // |
65 | |
66 | #define (0x4349747275730707L) |
67 | #define (0x4349747275730322L) |
68 | #define SSD_VERSION 3 |
69 | // SSD_VERSION history: |
70 | // 1 - original |
71 | // 2 - minimum storage increment (RBLOCK_SIZE) from 512 to 128 bytes |
72 | // 3 - total overhaul including changed magic and moved version |
73 | |
74 | // Device header flags. |
75 | #define 0x01 |
76 | #define 0x02 |
77 | #define 0x04 |
78 | #define 0x08 |
79 | #define 0x10 |
80 | |
81 | // Used when determining a device's io_min_size. |
82 | #define LO_IO_MIN_SIZE 512 |
83 | #define HI_IO_MIN_SIZE 4096 |
84 | |
// Upper bound on the size of the device header - sizeof(ssd_device_header) is
// checked against it at compile time.
// SSD_HEADER_SIZE must be a power of 2 and >= MAX_WRITE_BLOCK_SIZE.
// Do NOT change SSD_HEADER_SIZE!
#define SSD_HEADER_SIZE (8 * 1024 * 1024)
88 | |
89 | |
90 | //------------------------------------------------ |
91 | // Device header. |
92 | // |
93 | |
// Leading portion of ssd_device_common. It is explicitly padded out to
// HI_IO_MIN_SIZE by the containing struct, so it must never outgrow that.
// TODO - were we going to change 'prefix' to 'base'?
typedef struct ssd_common_prefix_s {
	uint64_t magic;
	uint32_t version;
	char namespace[32];
	uint32_t n_devices;

	// Identify matching set of devices.
	uint64_t random;

	uint32_t flags;
	uint32_t write_block_size;
	uint32_t eventual_regime;
	uint32_t last_evict_void_time;
	uint32_t roster_generation;
} ssd_common_prefix;
107 | |
108 | // Because we pad explicitly: |
109 | COMPILER_ASSERT(sizeof(ssd_common_prefix) <= HI_IO_MIN_SIZE); |
110 | |
111 | // TODO - deal with the name and the name of as_storage_info_set/get! |
112 | typedef struct ssd_common_pmeta_s { |
113 | as_partition_version version; |
114 | uint8_t tree_id; |
115 | uint8_t unused[7]; |
116 | } ssd_common_pmeta; |
117 | |
118 | // Make sure a ssd_common_pmeta never unnecessarily crosses an IO size boundary. |
119 | COMPILER_ASSERT((sizeof(ssd_common_pmeta) & (sizeof(ssd_common_pmeta) - 1)) == 0); |
120 | |
121 | typedef struct ssd_device_common_s { |
122 | ssd_common_prefix prefix; |
123 | uint8_t pad_prefix[HI_IO_MIN_SIZE - sizeof(ssd_common_prefix)]; |
124 | ssd_common_pmeta pmeta[AS_PARTITIONS]; |
125 | } ssd_device_common; |
126 | |
// The 'unique' section of the device header - it follows the (rounded up)
// common section within ssd_device_header.
typedef struct ssd_device_unique_s {
	uint32_t device_id;

	// Not used.
	uint32_t unused;

	uint8_t encrypted_key[64];
	uint8_t canary[16];
	uint64_t pristine_offset;
} ssd_device_unique;
134 | |
135 | #define ROUND_UP_COMMON \ |
136 | ((sizeof(ssd_device_common) + (HI_IO_MIN_SIZE - 1)) & -HI_IO_MIN_SIZE) |
137 | |
138 | typedef struct { |
139 | ssd_device_common ; |
140 | uint8_t [ROUND_UP_COMMON - sizeof(ssd_device_common)]; |
141 | ssd_device_unique ; |
142 | } ; |
143 | |
144 | COMPILER_ASSERT(sizeof(ssd_device_header) <= SSD_HEADER_SIZE); |
145 | |
146 | COMPILER_ASSERT(offsetof(ssd_device_header, common) == 0); |
147 | COMPILER_ASSERT(offsetof(ssd_device_header, common.prefix) == 0); |
148 | |
149 | #define SSD_OFFSET_UNIQUE (offsetof(ssd_device_header, unique)) |
150 | |
151 | |
//------------------------------------------------
// A defragged wblock waiting to be freed -
// identified by file-id plus wblock-id.
//
typedef struct vacated_wblock_s {
	uint32_t file_id;
	uint32_t wblock_id;
} vacated_wblock;
159 | |
160 | |
161 | //------------------------------------------------ |
162 | // Write buffer - where records accumulate until |
163 | // (the full buffer is) flushed to a device. |
164 | // |
165 | typedef struct { |
166 | cf_atomic32 rc; |
167 | cf_atomic32 n_writers; // number of concurrent writers |
168 | bool dirty; // written to since last flushed |
169 | bool skip_post_write_q; |
170 | uint32_t n_vacated; |
171 | uint32_t vacated_capacity; |
172 | vacated_wblock *vacated_wblocks; |
173 | struct drv_ssd_s *ssd; |
174 | uint32_t wblock_id; |
175 | uint32_t pos; |
176 | uint8_t *buf; |
177 | } ssd_write_buf; |
178 | |
179 | |
180 | //------------------------------------------------ |
181 | // Per-wblock information. |
182 | // |
183 | typedef struct ssd_wblock_state_s { |
184 | cf_atomic32 inuse_sz; // number of bytes currently used in the wblock |
185 | cf_mutex LOCK; // transactions, write_worker, and defrag all are interested in wblock_state |
186 | ssd_write_buf *swb; // pending writes for the wblock, also treated as a cache for reads |
187 | uint32_t state; // for now just a defrag flag |
188 | cf_atomic32 n_vac_dests; // number of wblocks into which this wblock defragged |
189 | } ssd_wblock_state; |
190 | |
191 | // wblock state |
192 | // |
193 | // Ultimately this may become a full-blown state, but for now it's effectively |
194 | // just a defrag flag. |
195 | #define WBLOCK_STATE_NONE 0 |
196 | #define WBLOCK_STATE_DEFRAG 1 |
197 | |
198 | |
199 | //------------------------------------------------ |
200 | // Per-device information. |
201 | // |
202 | typedef struct drv_ssd_s { |
203 | struct as_namespace_s *ns; |
204 | |
205 | const char *name; // this device's name |
206 | const char *shadow_name; // this device's shadow's name, if any |
207 | |
208 | uint32_t running; |
209 | |
210 | cf_mutex write_lock; // lock protects writes to current swb |
211 | ssd_write_buf *current_swb; // swb currently being filled by writes |
212 | |
213 | int commit_fd; // relevant for enterprise edition only |
214 | int shadow_commit_fd; // relevant for enterprise edition only |
215 | |
216 | cf_mutex defrag_lock; // lock protects writes to defrag swb |
217 | ssd_write_buf *defrag_swb; // swb currently being filled by defrag |
218 | |
219 | cf_queue *fd_q; // queue of open fds |
220 | cf_queue *fd_cache_q; // queue of open fds that use page cache |
221 | cf_queue *shadow_fd_q; // queue of open fds on shadow, if any |
222 | |
223 | cf_queue *free_wblock_q; // IDs of free wblocks |
224 | cf_queue *defrag_wblock_q; // IDs of wblocks to defrag |
225 | |
226 | cf_queue *swb_write_q; // pointers to swbs ready to write |
227 | cf_queue *swb_shadow_q; // pointers to swbs ready to write to shadow, if any |
228 | cf_queue *swb_free_q; // pointers to swbs free and waiting |
229 | cf_queue *post_write_q; // pointers to swbs that have been written but are cached |
230 | |
231 | uint8_t encryption_key[64]; // relevant for enterprise edition only |
232 | |
233 | cf_atomic64 n_defrag_wblock_reads; // total number of wblocks added to the defrag_wblock_q |
234 | cf_atomic64 n_defrag_wblock_writes; // total number of swbs added to the swb_write_q by defrag |
235 | cf_atomic64 n_wblock_writes; // total number of swbs added to the swb_write_q by writes |
236 | |
237 | cf_atomic64 n_wblock_defrag_io_skips; // total number of wblocks empty on defrag_wblock_q pop |
238 | cf_atomic64 n_wblock_direct_frees; // total number of wblocks freed by other than defrag |
239 | |
240 | volatile uint64_t n_tomb_raider_reads; // relevant for enterprise edition only |
241 | |
242 | cf_atomic32 defrag_sweep; // defrag sweep flag |
243 | |
244 | uint64_t file_size; |
245 | int file_id; |
246 | |
247 | uint32_t open_flag; |
248 | |
249 | uint64_t io_min_size; // device IO operations are aligned and sized in multiples of this |
250 | uint64_t shadow_io_min_size; // shadow device IO operations are aligned and sized in multiples of this |
251 | |
252 | uint64_t commit_min_size; // commit (write) operations are aligned and sized in multiples of this |
253 | uint64_t shadow_commit_min_size; // shadow commit (write) operations are aligned and sized in multiples of this |
254 | |
255 | cf_atomic64 inuse_size; // number of bytes in actual use on this device |
256 | |
257 | uint32_t write_block_size; // number of bytes to write at a time |
258 | uint32_t first_wblock_id; // wblock-id of first non-header wblock |
259 | |
260 | uint32_t pristine_wblock_id; // minimum wblock-id of "pristine" region |
261 | |
262 | uint32_t n_wblocks; // number of wblocks on this device |
263 | ssd_wblock_state *wblock_state; // array of info per wblock on this device |
264 | |
265 | uint32_t sweep_wblock_id; // wblocks read at startup |
266 | uint64_t record_add_older_counter; // records not inserted due to better existing one |
267 | uint64_t record_add_expired_counter; // records not inserted due to expiration |
268 | uint64_t record_add_evicted_counter; // records not inserted due to eviction |
269 | uint64_t record_add_replace_counter; // records reinserted |
270 | uint64_t record_add_unique_counter; // records inserted |
271 | |
272 | cf_tid write_tid; |
273 | cf_tid shadow_tid; |
274 | |
275 | histogram *hist_read; |
276 | histogram *hist_large_block_read; |
277 | histogram *hist_write; |
278 | histogram *hist_shadow_write; |
279 | } drv_ssd; |
280 | |
281 | |
282 | //------------------------------------------------ |
283 | // Per-namespace storage information. |
284 | // |
285 | typedef struct drv_ssds_s { |
286 | struct as_namespace_s *ns; |
287 | ssd_device_common *common; |
288 | |
289 | // Not a great place for this - used only at startup to determine whether to |
290 | // load a record. |
291 | bool get_state_from_storage[AS_PARTITIONS]; |
292 | |
293 | // Indexed by previous device-id to get new device-id. -1 means device is |
294 | // "fresh" or absent. Used only at startup to fix index elements' file-id. |
295 | int8_t device_translation[AS_STORAGE_MAX_DEVICES]; |
296 | |
297 | // Used only at startup, set true if all devices are fresh. |
298 | bool all_fresh; |
299 | |
300 | cf_mutex flush_lock; |
301 | |
302 | int n_ssds; |
303 | drv_ssd ssds[]; |
304 | } drv_ssds; |
305 | |
306 | |
307 | //========================================================== |
308 | // Private API - for enterprise separation only. |
309 | // |
310 | |
311 | typedef struct ssd_load_records_info_s { |
312 | drv_ssds *ssds; |
313 | drv_ssd *ssd; |
314 | cf_queue *complete_q; |
315 | void *complete_rc; |
316 | } ssd_load_records_info; |
317 | |
318 | // Warm and cool restart. |
319 | void ssd_resume_devices(drv_ssds *ssds); |
320 | void *run_ssd_cool_start(void *udata); |
321 | void ssd_load_wblock_queues(drv_ssds *ssds); |
322 | void ssd_start_maintenance_threads(drv_ssds *ssds); |
323 | void ssd_start_write_threads(drv_ssds *ssds); |
324 | void ssd_start_defrag_threads(drv_ssds *ssds); |
325 | void apply_opt_meta(struct as_index_s *r, struct as_namespace_s *ns, const struct as_flat_opt_meta_s *opt_meta); |
326 | |
327 | // Tomb raider. |
328 | void ssd_cold_start_adjust_cenotaph(struct as_namespace_s *ns, bool block_has_bins, uint32_t block_void_time, struct as_index_s *r); |
329 | void ssd_cold_start_transition_record(struct as_namespace_s *ns, const struct as_flat_record_s *flat, struct as_index_s *r, bool is_create); |
330 | void ssd_cold_start_drop_cenotaphs(struct as_namespace_s *ns); |
331 | |
332 | // Record encryption. |
333 | void ssd_encrypt(drv_ssd *ssd, uint64_t off, struct as_flat_record_s *flat); |
334 | void ssd_decrypt(drv_ssd *ssd, uint64_t off, struct as_flat_record_s *flat); |
335 | void ssd_decrypt_whole(drv_ssd *ssd, uint64_t off, uint32_t n_rblocks, struct as_flat_record_s *flat); |
336 | |
337 | // CP. |
338 | void ssd_adjust_versions(struct as_namespace_s *ns, ssd_common_pmeta* pmeta); |
339 | conflict_resolution_pol ssd_cold_start_policy(struct as_namespace_s *ns); |
340 | void ssd_cold_start_init_repl_state(struct as_namespace_s *ns, struct as_index_s* r); |
341 | |
342 | // Miscellaneous. |
343 | int ssd_fd_get(drv_ssd *ssd); |
344 | int ssd_shadow_fd_get(drv_ssd *ssd); |
345 | void ssd_fd_put(drv_ssd *ssd, int fd); |
346 | void (const struct as_namespace_s *ns, drv_ssd* ssd, ssd_device_header *); |
347 | void (const struct as_namespace_s *ns, drv_ssd* ssd, const ssd_device_header *); |
348 | void ssd_flush_final_cfg(struct as_namespace_s *ns); |
349 | bool ssd_cold_start_is_valid_n_bins(uint32_t n_bins); |
350 | void (drv_ssd *ssd, uint8_t *, uint8_t *from, size_t size); |
351 | void ssd_prefetch_wblock(drv_ssd *ssd, uint64_t file_offset, uint8_t *read_buf); |
352 | |
353 | // Durability. |
354 | void ssd_init_commit(drv_ssd *ssd); |
355 | uint64_t ssd_flush_max_us(const struct as_namespace_s *ns); |
356 | void ssd_post_write(drv_ssd *ssd, ssd_write_buf *swb); |
357 | int ssd_write_bins(struct as_storage_rd_s *rd); |
358 | int ssd_buffer_bins(struct as_storage_rd_s *rd); |
359 | ssd_write_buf *swb_get(drv_ssd *ssd); |
360 | |
361 | // Called in (enterprise-split) storage table function. |
362 | int ssd_write(struct as_storage_rd_s *rd); |
363 | |
364 | |
365 | // |
366 | // Conversions between offsets and rblocks. |
367 | // |
368 | |
// An rblock-id of 0 is treated as invalid, any other value as valid.
// TODO - make checks stricter (exclude drive header, consider drive size) ???
#define STORAGE_RBLOCK_IS_VALID(rblock_id) ((rblock_id) != 0)
#define STORAGE_RBLOCK_IS_INVALID(rblock_id) ((rblock_id) == 0)
372 | |
373 | // Convert byte offset to rblock_id, as long as offset is already a multiple of |
374 | // rblock size. |
375 | static inline uint64_t OFFSET_TO_RBLOCK_ID(uint64_t offset) { |
376 | return offset >> LOG_2_RBLOCK_SIZE; |
377 | } |
378 | |
379 | // Convert rblock_id to byte offset. |
380 | static inline uint64_t RBLOCK_ID_TO_OFFSET(uint64_t rblocks) { |
381 | return rblocks << LOG_2_RBLOCK_SIZE; |
382 | } |
383 | |
384 | |
385 | // |
386 | // Conversions between bytes/rblocks and wblocks. |
387 | // |
388 | |
389 | #define STORAGE_INVALID_WBLOCK 0xFFFFffff |
390 | |
391 | // Convert byte offset to wblock_id. |
392 | static inline uint32_t OFFSET_TO_WBLOCK_ID(drv_ssd *ssd, uint64_t offset) { |
393 | return (uint32_t)(offset / ssd->write_block_size); |
394 | } |
395 | |
396 | // Convert wblock_id to byte offset. |
397 | static inline uint64_t WBLOCK_ID_TO_OFFSET(drv_ssd *ssd, uint32_t wblock_id) { |
398 | return (uint64_t)wblock_id * (uint64_t)ssd->write_block_size; |
399 | } |
400 | |
401 | // Convert rblock_id to wblock_id. |
402 | static inline uint32_t RBLOCK_ID_TO_WBLOCK_ID(drv_ssd *ssd, uint64_t rblock_id) { |
403 | return (uint32_t)((rblock_id << LOG_2_RBLOCK_SIZE) / ssd->write_block_size); |
404 | } |
405 | |
406 | |
407 | // |
408 | // Size rounding needed for sanity checking. |
409 | // |
410 | |
411 | #define SSD_RECORD_MIN_SIZE \ |
412 | (((uint32_t)sizeof(as_flat_record) + (RBLOCK_SIZE - 1)) & -RBLOCK_SIZE) |
413 | |
414 | |
415 | // |
416 | // Size rounding needed for direct IO. |
417 | // |
418 | |
419 | // Round bytes down to a multiple of device's minimum IO operation size. |
420 | static inline uint64_t BYTES_DOWN_TO_IO_MIN(drv_ssd *ssd, uint64_t bytes) { |
421 | return bytes & -ssd->io_min_size; |
422 | } |
423 | |
424 | // Round bytes up to a multiple of device's minimum IO operation size. |
425 | static inline uint64_t BYTES_UP_TO_IO_MIN(drv_ssd *ssd, uint64_t bytes) { |
426 | return (bytes + (ssd->io_min_size - 1)) & -ssd->io_min_size; |
427 | } |
428 | |
429 | // Round bytes down to a multiple of shadow device's minimum IO operation size. |
430 | static inline uint64_t |
431 | BYTES_DOWN_TO_SHADOW_IO_MIN(drv_ssd *ssd, uint64_t bytes) { |
432 | return bytes & -ssd->shadow_io_min_size; |
433 | } |
434 | |
435 | // Round bytes up to a multiple of shadow device's minimum IO operation size. |
436 | static inline uint64_t |
437 | BYTES_UP_TO_SHADOW_IO_MIN(drv_ssd *ssd, uint64_t bytes) { |
438 | return (bytes + (ssd->shadow_io_min_size - 1)) & -ssd->shadow_io_min_size; |
439 | } |
440 | |
441 | |
442 | // |
443 | // Device IO. |
444 | // |
445 | |
// Read exactly 'size' bytes from 'fd', starting at file offset 'offset', into
// 'buf' - re-issuing pread() for the remainder after every short read.
//
// Returns true once a pread() call delivers everything still outstanding
// (which includes a successful zero-size request). Returns false if pread()
// fails, leaving errno as pread() set it, or if pread() returns 0 while bytes
// are still outstanding (end of file), in which case errno is set to EINVAL.
// Nothing is logged here - that is left to the caller.
static inline bool
pread_all(int fd, void* buf, size_t size, off_t offset)
{
	// Advance a byte pointer - arithmetic on void* is a compiler extension,
	// not standard C.
	uint8_t* at = (uint8_t*)buf;
	ssize_t result;

	while ((result = pread(fd, at, size, offset)) != (ssize_t)size) {
		if (result < 0) {
			return false; // let the caller log errors
		}

		// A zero-size request never gets here (0 == size ends the loop), so 0
		// means end of file was hit with bytes still outstanding.
		if (result == 0) {
			errno = EINVAL;
			return false;
		}

		// Short read - continue from where it stopped.
		at += result;
		offset += result;
		size -= (size_t)result;
	}

	return true;
}
468 | |
// Write exactly 'size' bytes from 'buf' to 'fd', starting at file offset
// 'offset' - re-issuing pwrite() for the remainder after every short write.
//
// Returns true once a pwrite() call accepts everything still outstanding
// (which includes a successful zero-size request). Returns false if pwrite()
// fails, leaving errno as pwrite() set it, or if pwrite() returns 0 while
// bytes are still outstanding, in which case errno is set to EINVAL. Nothing
// is logged here - that is left to the caller.
static inline bool
pwrite_all(int fd, const void* buf, size_t size, off_t offset)
{
	// Advance a byte pointer - arithmetic on void* is a compiler extension,
	// not standard C. The source buffer is never modified, hence const.
	const uint8_t* at = (const uint8_t*)buf;
	ssize_t result;

	while ((result = pwrite(fd, at, size, offset)) != (ssize_t)size) {
		if (result < 0) {
			return false; // let the caller log errors
		}

		// A zero-size request never gets here (0 == size ends the loop), so 0
		// means no progress was made - fail rather than retry forever.
		if (result == 0) {
			errno = EINVAL;
			return false;
		}

		// Short write - continue from where it stopped.
		at += result;
		offset += result;
		size -= (size_t)result;
	}

	return true;
}
491 | |