1/*
2 * drv_ssd.h
3 *
4 * Copyright (C) 2014 Aerospike, Inc.
5 *
6 * Portions may be licensed to Aerospike, Inc. under one or more contributor
7 * license agreements.
8 *
9 * This program is free software: you can redistribute it and/or modify it under
10 * the terms of the GNU Affero General Public License as published by the Free
11 * Software Foundation, either version 3 of the License, or (at your option) any
12 * later version.
13 *
14 * This program is distributed in the hope that it will be useful, but WITHOUT
15 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
16 * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
17 * details.
18 *
19 * You should have received a copy of the GNU Affero General Public License
20 * along with this program. If not, see http://www.gnu.org/licenses/
21 */
22
23#pragma once
24
25//==========================================================
26// Includes.
27//
28
29#include <errno.h>
30#include <stdbool.h>
31#include <stddef.h>
32#include <stdint.h>
33#include <sys/types.h>
34#include <unistd.h>
35
36#include "citrusleaf/cf_atomic.h"
37#include "citrusleaf/cf_queue.h"
38
39#include "cf_mutex.h"
40#include "cf_thread.h"
41#include "fault.h"
42#include "hist.h"
43
44#include "base/datamodel.h"
45#include "fabric/partition.h"
46#include "storage/flat.h"
47#include "storage/storage.h"
48
49
50//==========================================================
51// Forward declarations.
52//
53
54struct as_flat_opt_meta_s;
55struct as_flat_record_s;
56struct as_index_s;
57struct as_namespace_s;
58struct as_storage_rd_s;
59struct drv_ssd_s;
60
61
62//==========================================================
63// Typedefs & constants.
64//
65
// Magic values for a device header. The magic was changed as part of the
// version 3 overhaul (see history below) - presumably SSD_HEADER_OLD_MAGIC is
// the value used before that.
#define SSD_HEADER_OLD_MAGIC (0x4349747275730707L)
#define SSD_HEADER_MAGIC (0x4349747275730322L)

// Current device header version. History:
//   1 - original
//   2 - minimum storage increment (RBLOCK_SIZE) from 512 to 128 bytes
//   3 - total overhaul including changed magic and moved version
#define SSD_VERSION 3

// Device header flags - each one is a distinct bit.
#define SSD_HEADER_FLAG_TRUSTED (1 << 0) // 0x01
#define SSD_HEADER_FLAG_SINGLE_BIN (1 << 1) // 0x02
#define SSD_HEADER_FLAG_ENCRYPTED (1 << 2) // 0x04
#define SSD_HEADER_FLAG_CP (1 << 3) // 0x08
#define SSD_HEADER_FLAG_COMMIT_TO_DEVICE (1 << 4) // 0x10

// Lower and upper bounds used when determining a device's io_min_size.
#define LO_IO_MIN_SIZE 512
#define HI_IO_MIN_SIZE 4096

// Number of bytes set aside for the device header. Must be a power of 2 and
// >= MAX_WRITE_BLOCK_SIZE.
// Do NOT change SSD_HEADER_SIZE!
#define SSD_HEADER_SIZE (8 * 1024 * 1024)
88
89
90//------------------------------------------------
91// Device header.
92//
93
94// TODO - were we going to change 'prefix' to 'base'?
// Leading, fixed-size part of the device header's common section.
// NOTE(review): presumably stored on the device as-is, so member order and
// sizes should be treated as fixed - confirm before touching.
typedef struct ssd_common_prefix_s {
	// Identification.
	uint64_t magic;
	uint32_t version;
	char namespace[32];

	// Device set.
	uint32_t n_devices;
	uint64_t random; // identify matching set of devices

	// Configuration and state.
	uint32_t flags;
	uint32_t write_block_size;
	uint32_t eventual_regime;
	uint32_t last_evict_void_time;
	uint32_t roster_generation;
} ssd_common_prefix;
107
108// Because we pad explicitly:
109COMPILER_ASSERT(sizeof(ssd_common_prefix) <= HI_IO_MIN_SIZE);
110
111// TODO - deal with the name and the name of as_storage_info_set/get!
112typedef struct ssd_common_pmeta_s {
113 as_partition_version version;
114 uint8_t tree_id;
115 uint8_t unused[7];
116} ssd_common_pmeta;
117
118// Make sure a ssd_common_pmeta never unnecessarily crosses an IO size boundary.
119COMPILER_ASSERT((sizeof(ssd_common_pmeta) & (sizeof(ssd_common_pmeta) - 1)) == 0);
120
121typedef struct ssd_device_common_s {
122 ssd_common_prefix prefix;
123 uint8_t pad_prefix[HI_IO_MIN_SIZE - sizeof(ssd_common_prefix)];
124 ssd_common_pmeta pmeta[AS_PARTITIONS];
125} ssd_device_common;
126
// Section of the device header that is specific to one device (as opposed to
// the common section).
// NOTE(review): presumably stored on the device as-is, so member order and
// sizes should be treated as fixed - confirm before touching.
typedef struct ssd_device_unique_s {
	uint32_t device_id;
	uint32_t unused; // explicit padding

	uint8_t encrypted_key[64];
	uint8_t canary[16];

	uint64_t pristine_offset;
} ssd_device_unique;
134
// sizeof(ssd_device_common) rounded up to the next multiple of
// HI_IO_MIN_SIZE. The masking relies on HI_IO_MIN_SIZE being a power of 2.
#define ROUND_UP_COMMON \
	((sizeof(ssd_device_common) + ((size_t)HI_IO_MIN_SIZE - 1)) & \
			~((size_t)HI_IO_MIN_SIZE - 1))
137
138typedef struct ssd_device_header_s {
139 ssd_device_common common;
140 uint8_t pad_common[ROUND_UP_COMMON - sizeof(ssd_device_common)];
141 ssd_device_unique unique;
142} ssd_device_header;
143
144COMPILER_ASSERT(sizeof(ssd_device_header) <= SSD_HEADER_SIZE);
145
146COMPILER_ASSERT(offsetof(ssd_device_header, common) == 0);
147COMPILER_ASSERT(offsetof(ssd_device_header, common.prefix) == 0);
148
149#define SSD_OFFSET_UNIQUE (offsetof(ssd_device_header, unique))
150
151
152//------------------------------------------------
153// A defragged wblock waiting to be freed.
154//
// Identifies one wblock by the file (device) it is on plus its id there.
typedef struct vacated_wblock_s {
	uint32_t file_id; // which device
	uint32_t wblock_id; // which wblock on that device
} vacated_wblock;
159
160
161//------------------------------------------------
162// Write buffer - where records accumulate until
163// (the full buffer is) flushed to a device.
164//
165typedef struct {
166 cf_atomic32 rc;
167 cf_atomic32 n_writers; // number of concurrent writers
168 bool dirty; // written to since last flushed
169 bool skip_post_write_q;
170 uint32_t n_vacated;
171 uint32_t vacated_capacity;
172 vacated_wblock *vacated_wblocks;
173 struct drv_ssd_s *ssd;
174 uint32_t wblock_id;
175 uint32_t pos;
176 uint8_t *buf;
177} ssd_write_buf;
178
179
180//------------------------------------------------
181// Per-wblock information.
182//
183typedef struct ssd_wblock_state_s {
184 cf_atomic32 inuse_sz; // number of bytes currently used in the wblock
185 cf_mutex LOCK; // transactions, write_worker, and defrag all are interested in wblock_state
186 ssd_write_buf *swb; // pending writes for the wblock, also treated as a cache for reads
187 uint32_t state; // for now just a defrag flag
188 cf_atomic32 n_vac_dests; // number of wblocks into which this wblock defragged
189} ssd_wblock_state;
190
// Values for ssd_wblock_state's 'state' member.
//
// This may ultimately grow into a full-blown state, but for now it is
// effectively just a defrag flag.
#define WBLOCK_STATE_NONE 0 // defrag flag not set
#define WBLOCK_STATE_DEFRAG 1 // defrag flag set
197
198
199//------------------------------------------------
200// Per-device information.
201//
202typedef struct drv_ssd_s {
203 struct as_namespace_s *ns;
204
205 const char *name; // this device's name
206 const char *shadow_name; // this device's shadow's name, if any
207
208 uint32_t running;
209
210 cf_mutex write_lock; // lock protects writes to current swb
211 ssd_write_buf *current_swb; // swb currently being filled by writes
212
213 int commit_fd; // relevant for enterprise edition only
214 int shadow_commit_fd; // relevant for enterprise edition only
215
216 cf_mutex defrag_lock; // lock protects writes to defrag swb
217 ssd_write_buf *defrag_swb; // swb currently being filled by defrag
218
219 cf_queue *fd_q; // queue of open fds
220 cf_queue *fd_cache_q; // queue of open fds that use page cache
221 cf_queue *shadow_fd_q; // queue of open fds on shadow, if any
222
223 cf_queue *free_wblock_q; // IDs of free wblocks
224 cf_queue *defrag_wblock_q; // IDs of wblocks to defrag
225
226 cf_queue *swb_write_q; // pointers to swbs ready to write
227 cf_queue *swb_shadow_q; // pointers to swbs ready to write to shadow, if any
228 cf_queue *swb_free_q; // pointers to swbs free and waiting
229 cf_queue *post_write_q; // pointers to swbs that have been written but are cached
230
231 uint8_t encryption_key[64]; // relevant for enterprise edition only
232
233 cf_atomic64 n_defrag_wblock_reads; // total number of wblocks added to the defrag_wblock_q
234 cf_atomic64 n_defrag_wblock_writes; // total number of swbs added to the swb_write_q by defrag
235 cf_atomic64 n_wblock_writes; // total number of swbs added to the swb_write_q by writes
236
237 cf_atomic64 n_wblock_defrag_io_skips; // total number of wblocks empty on defrag_wblock_q pop
238 cf_atomic64 n_wblock_direct_frees; // total number of wblocks freed by other than defrag
239
240 volatile uint64_t n_tomb_raider_reads; // relevant for enterprise edition only
241
242 cf_atomic32 defrag_sweep; // defrag sweep flag
243
244 uint64_t file_size;
245 int file_id;
246
247 uint32_t open_flag;
248
249 uint64_t io_min_size; // device IO operations are aligned and sized in multiples of this
250 uint64_t shadow_io_min_size; // shadow device IO operations are aligned and sized in multiples of this
251
252 uint64_t commit_min_size; // commit (write) operations are aligned and sized in multiples of this
253 uint64_t shadow_commit_min_size; // shadow commit (write) operations are aligned and sized in multiples of this
254
255 cf_atomic64 inuse_size; // number of bytes in actual use on this device
256
257 uint32_t write_block_size; // number of bytes to write at a time
258 uint32_t first_wblock_id; // wblock-id of first non-header wblock
259
260 uint32_t pristine_wblock_id; // minimum wblock-id of "pristine" region
261
262 uint32_t n_wblocks; // number of wblocks on this device
263 ssd_wblock_state *wblock_state; // array of info per wblock on this device
264
265 uint32_t sweep_wblock_id; // wblocks read at startup
266 uint64_t record_add_older_counter; // records not inserted due to better existing one
267 uint64_t record_add_expired_counter; // records not inserted due to expiration
268 uint64_t record_add_evicted_counter; // records not inserted due to eviction
269 uint64_t record_add_replace_counter; // records reinserted
270 uint64_t record_add_unique_counter; // records inserted
271
272 cf_tid write_tid;
273 cf_tid shadow_tid;
274
275 histogram *hist_read;
276 histogram *hist_large_block_read;
277 histogram *hist_write;
278 histogram *hist_shadow_write;
279} drv_ssd;
280
281
282//------------------------------------------------
283// Per-namespace storage information.
284//
285typedef struct drv_ssds_s {
286 struct as_namespace_s *ns;
287 ssd_device_common *common;
288
289 // Not a great place for this - used only at startup to determine whether to
290 // load a record.
291 bool get_state_from_storage[AS_PARTITIONS];
292
293 // Indexed by previous device-id to get new device-id. -1 means device is
294 // "fresh" or absent. Used only at startup to fix index elements' file-id.
295 int8_t device_translation[AS_STORAGE_MAX_DEVICES];
296
297 // Used only at startup, set true if all devices are fresh.
298 bool all_fresh;
299
300 cf_mutex flush_lock;
301
302 int n_ssds;
303 drv_ssd ssds[];
304} drv_ssds;
305
306
307//==========================================================
308// Private API - for enterprise separation only.
309//
310
311typedef struct ssd_load_records_info_s {
312 drv_ssds *ssds;
313 drv_ssd *ssd;
314 cf_queue *complete_q;
315 void *complete_rc;
316} ssd_load_records_info;
317
318// Warm and cool restart.
319void ssd_resume_devices(drv_ssds *ssds);
320void *run_ssd_cool_start(void *udata);
321void ssd_load_wblock_queues(drv_ssds *ssds);
322void ssd_start_maintenance_threads(drv_ssds *ssds);
323void ssd_start_write_threads(drv_ssds *ssds);
324void ssd_start_defrag_threads(drv_ssds *ssds);
325void apply_opt_meta(struct as_index_s *r, struct as_namespace_s *ns, const struct as_flat_opt_meta_s *opt_meta);
326
327// Tomb raider.
328void ssd_cold_start_adjust_cenotaph(struct as_namespace_s *ns, bool block_has_bins, uint32_t block_void_time, struct as_index_s *r);
329void ssd_cold_start_transition_record(struct as_namespace_s *ns, const struct as_flat_record_s *flat, struct as_index_s *r, bool is_create);
330void ssd_cold_start_drop_cenotaphs(struct as_namespace_s *ns);
331
332// Record encryption.
333void ssd_encrypt(drv_ssd *ssd, uint64_t off, struct as_flat_record_s *flat);
334void ssd_decrypt(drv_ssd *ssd, uint64_t off, struct as_flat_record_s *flat);
335void ssd_decrypt_whole(drv_ssd *ssd, uint64_t off, uint32_t n_rblocks, struct as_flat_record_s *flat);
336
337// CP.
338void ssd_adjust_versions(struct as_namespace_s *ns, ssd_common_pmeta* pmeta);
339conflict_resolution_pol ssd_cold_start_policy(struct as_namespace_s *ns);
340void ssd_cold_start_init_repl_state(struct as_namespace_s *ns, struct as_index_s* r);
341
342// Miscellaneous.
343int ssd_fd_get(drv_ssd *ssd);
344int ssd_shadow_fd_get(drv_ssd *ssd);
345void ssd_fd_put(drv_ssd *ssd, int fd);
346void ssd_header_init_cfg(const struct as_namespace_s *ns, drv_ssd* ssd, ssd_device_header *header);
347void ssd_header_validate_cfg(const struct as_namespace_s *ns, drv_ssd* ssd, const ssd_device_header *header);
348void ssd_flush_final_cfg(struct as_namespace_s *ns);
349bool ssd_cold_start_is_valid_n_bins(uint32_t n_bins);
350void ssd_write_header(drv_ssd *ssd, uint8_t *header, uint8_t *from, size_t size);
351void ssd_prefetch_wblock(drv_ssd *ssd, uint64_t file_offset, uint8_t *read_buf);
352
353// Durability.
354void ssd_init_commit(drv_ssd *ssd);
355uint64_t ssd_flush_max_us(const struct as_namespace_s *ns);
356void ssd_post_write(drv_ssd *ssd, ssd_write_buf *swb);
357int ssd_write_bins(struct as_storage_rd_s *rd);
358int ssd_buffer_bins(struct as_storage_rd_s *rd);
359ssd_write_buf *swb_get(drv_ssd *ssd);
360
361// Called in (enterprise-split) storage table function.
362int ssd_write(struct as_storage_rd_s *rd);
363
364
365//
366// Conversions between offsets and rblocks.
367//
368
// TODO - make checks stricter (exclude drive header, consider drive size) ???
//
// For now an rblock-id is considered invalid only if it is 0. Both macros
// evaluate their argument exactly once and yield 1 or 0.
//
// (The parameter is deliberately not spelled with a leading double underscore -
// such identifiers are reserved for the implementation.)
#define STORAGE_RBLOCK_IS_VALID(rblock_id) ((rblock_id) != 0)
#define STORAGE_RBLOCK_IS_INVALID(rblock_id) ((rblock_id) == 0)
372
373// Convert byte offset to rblock_id, as long as offset is already a multiple of
374// rblock size.
375static inline uint64_t OFFSET_TO_RBLOCK_ID(uint64_t offset) {
376 return offset >> LOG_2_RBLOCK_SIZE;
377}
378
379// Convert rblock_id to byte offset.
380static inline uint64_t RBLOCK_ID_TO_OFFSET(uint64_t rblocks) {
381 return rblocks << LOG_2_RBLOCK_SIZE;
382}
383
384
385//
386// Conversions between bytes/rblocks and wblocks.
387//
388
// Sentinel wblock-id - all 32 bits set.
#define STORAGE_INVALID_WBLOCK 0xFFFFFFFFU
390
391// Convert byte offset to wblock_id.
392static inline uint32_t OFFSET_TO_WBLOCK_ID(drv_ssd *ssd, uint64_t offset) {
393 return (uint32_t)(offset / ssd->write_block_size);
394}
395
396// Convert wblock_id to byte offset.
397static inline uint64_t WBLOCK_ID_TO_OFFSET(drv_ssd *ssd, uint32_t wblock_id) {
398 return (uint64_t)wblock_id * (uint64_t)ssd->write_block_size;
399}
400
401// Convert rblock_id to wblock_id.
402static inline uint32_t RBLOCK_ID_TO_WBLOCK_ID(drv_ssd *ssd, uint64_t rblock_id) {
403 return (uint32_t)((rblock_id << LOG_2_RBLOCK_SIZE) / ssd->write_block_size);
404}
405
406
407//
408// Size rounding needed for sanity checking.
409//
410
// sizeof(as_flat_record) rounded up to a multiple of RBLOCK_SIZE. The masking
// relies on RBLOCK_SIZE being a power of 2.
#define SSD_RECORD_MIN_SIZE \
	(((uint32_t)sizeof(as_flat_record) + (RBLOCK_SIZE - 1)) & \
			~(RBLOCK_SIZE - 1))
413
414
415//
416// Size rounding needed for direct IO.
417//
418
419// Round bytes down to a multiple of device's minimum IO operation size.
420static inline uint64_t BYTES_DOWN_TO_IO_MIN(drv_ssd *ssd, uint64_t bytes) {
421 return bytes & -ssd->io_min_size;
422}
423
424// Round bytes up to a multiple of device's minimum IO operation size.
425static inline uint64_t BYTES_UP_TO_IO_MIN(drv_ssd *ssd, uint64_t bytes) {
426 return (bytes + (ssd->io_min_size - 1)) & -ssd->io_min_size;
427}
428
429// Round bytes down to a multiple of shadow device's minimum IO operation size.
430static inline uint64_t
431BYTES_DOWN_TO_SHADOW_IO_MIN(drv_ssd *ssd, uint64_t bytes) {
432 return bytes & -ssd->shadow_io_min_size;
433}
434
435// Round bytes up to a multiple of shadow device's minimum IO operation size.
436static inline uint64_t
437BYTES_UP_TO_SHADOW_IO_MIN(drv_ssd *ssd, uint64_t bytes) {
438 return (bytes + (ssd->shadow_io_min_size - 1)) & -ssd->shadow_io_min_size;
439}
440
441
442//
443// Device IO.
444//
445
// Read exactly 'size' bytes from 'fd', starting at file offset 'offset', into
// 'buf'. After a short read, pread() is re-issued for the remainder.
//
// Returns true if all 'size' bytes were read (trivially so if size is 0).
// Returns false if pread() fails, in which case errno is as set by pread(), or
// if pread() returns 0 before everything was read (e.g. end of file), in which
// case errno is set to EINVAL. Nothing is logged here - that's up to the
// caller.
static inline bool
pread_all(int fd, void* buf, size_t size, off_t offset)
{
	// Step through the buffer with a byte pointer - arithmetic directly on a
	// void* is a GNU extension, not standard C.
	uint8_t* at = (uint8_t*)buf;
	ssize_t result;

	while ((result = pread(fd, at, size, offset)) != (ssize_t)size) {
		if (result < 0) {
			return false; // let the caller log errors
		}

		// Note - a caller passing 0 size never gets here, since then a
		// result of 0 satisfies the loop condition.
		if (result == 0) { // no progress, e.g. hit end of file
			errno = EINVAL;
			return false;
		}

		// Short read - go again for the rest.
		at += result;
		offset += result;
		size -= (size_t)result;
	}

	return true;
}
468
// Write exactly 'size' bytes from 'buf' to 'fd', starting at file offset
// 'offset'. After a short write, pwrite() is re-issued for the remainder.
// 'buf' is only read from, hence const.
//
// Returns true if all 'size' bytes were written (trivially so if size is 0).
// Returns false if pwrite() fails, in which case errno is as set by pwrite(),
// or if pwrite() returns 0 before everything was written, in which case errno
// is set to EINVAL. Nothing is logged here - that's up to the caller.
static inline bool
pwrite_all(int fd, const void* buf, size_t size, off_t offset)
{
	// Step through the buffer with a byte pointer - arithmetic directly on a
	// void* is a GNU extension, not standard C.
	const uint8_t* at = (const uint8_t*)buf;
	ssize_t result;

	while ((result = pwrite(fd, at, size, offset)) != (ssize_t)size) {
		if (result < 0) {
			return false; // let the caller log errors
		}

		// Note - a caller passing 0 size never gets here, since then a
		// result of 0 satisfies the loop condition.
		if (result == 0) { // no progress - bail rather than spin
			errno = EINVAL;
			return false;
		}

		// Short write - go again for the rest.
		at += result;
		offset += result;
		size -= (size_t)result;
	}

	return true;
}
491