| 1 | // Copyright (c) Microsoft Corporation. All rights reserved. | 
|---|
| 2 | // Licensed under the MIT license. | 
|---|
| 3 |  | 
|---|
| 4 | #pragma once | 
|---|
| 5 |  | 
|---|
| 6 | #include <atomic> | 
|---|
| 7 | #include <cassert> | 
|---|
| 8 | #include <cstdint> | 
|---|
| 9 | #include <cstring> | 
|---|
| 10 | #include <mutex> | 
|---|
| 11 | #include <thread> | 
|---|
| 12 |  | 
|---|
| 13 | #include "device/file_system_disk.h" | 
|---|
| 14 | #include "address.h" | 
|---|
| 15 | #include "async_result_types.h" | 
|---|
| 16 | #include "gc_state.h" | 
|---|
| 17 | #include "light_epoch.h" | 
|---|
| 18 | #include "native_buffer_pool.h" | 
|---|
| 19 | #include "recovery_status.h" | 
|---|
| 20 | #include "status.h" | 
|---|
| 21 |  | 
|---|
| 22 | namespace FASTER { | 
|---|
| 23 | namespace core { | 
|---|
| 24 |  | 
|---|
| 25 | /// The log allocator, used by FASTER to store records. | 
|---|
| 26 |  | 
|---|
| 27 | enum class FlushStatus : uint8_t { | 
|---|
| 28 | Flushed, | 
|---|
| 29 | InProgress | 
|---|
| 30 | }; | 
|---|
| 31 |  | 
|---|
| 32 | enum class CloseStatus : uint8_t { | 
|---|
| 33 | Closed, | 
|---|
| 34 | Open | 
|---|
| 35 | }; | 
|---|
| 36 |  | 
|---|
| 37 | /// Pack flush- and close-status into a single 16-bit value. | 
|---|
| 38 | /// State transitions are: | 
|---|
| 39 | /// { Flushed, Closed } (default state) | 
|---|
| 40 | /// --> { InProgress, Open } (when issuing the flush to disk) | 
|---|
| 41 | /// --> either { . , Closed} (when moving the head address forward) | 
|---|
| 42 | ///     or     { Flushed, . } (when the flush completes). | 
|---|
| 43 | struct FlushCloseStatus { | 
|---|
| 44 | FlushCloseStatus() | 
|---|
| 45 | : flush{ FlushStatus::Flushed } | 
|---|
| 46 | , close{ CloseStatus::Closed } { | 
|---|
| 47 | } | 
|---|
| 48 |  | 
|---|
| 49 | FlushCloseStatus(FlushStatus flush_, CloseStatus close_) | 
|---|
| 50 | : flush{ flush_ } | 
|---|
| 51 | , close{ close_ } { | 
|---|
| 52 | } | 
|---|
| 53 |  | 
|---|
| 54 | FlushCloseStatus(uint16_t control_) | 
|---|
| 55 | : control{ control_ } { | 
|---|
| 56 | } | 
|---|
| 57 |  | 
|---|
| 58 | /// Is the page ready for use? | 
|---|
| 59 | inline bool Ready() const { | 
|---|
| 60 | return flush == FlushStatus::Flushed && close == CloseStatus::Open; | 
|---|
| 61 | } | 
|---|
| 62 |  | 
|---|
| 63 | union { | 
|---|
| 64 | struct { | 
|---|
| 65 | FlushStatus flush; | 
|---|
| 66 | CloseStatus close; | 
|---|
| 67 | }; | 
|---|
| 68 | uint16_t control; | 
|---|
| 69 | }; | 
|---|
| 70 | }; | 
|---|
| 71 | static_assert(sizeof(FlushCloseStatus) == 2, "sizeof(FlushCloseStatus) != 2"); | 
|---|
| 72 |  | 
|---|
| 73 | /// Atomic version of FlushCloseStatus. Can set and get flush- and close- status, together, | 
|---|
| 74 | /// atomically. | 
|---|
| 75 | class AtomicFlushCloseStatus { | 
|---|
| 76 | public: | 
|---|
| 77 | AtomicFlushCloseStatus() | 
|---|
| 78 | : status_{} { | 
|---|
| 79 | } | 
|---|
| 80 |  | 
|---|
| 81 | inline void store(FlushStatus flush, CloseStatus close) { | 
|---|
| 82 | // Sets flush and close statuses, atomically. | 
|---|
| 83 | FlushCloseStatus status{ flush, close }; | 
|---|
| 84 | control_.store(status.control); | 
|---|
| 85 | } | 
|---|
| 86 |  | 
|---|
| 87 | inline FlushCloseStatus load() const { | 
|---|
| 88 | // Gets flush and close statuses, atomically. | 
|---|
| 89 | return FlushCloseStatus{ control_.load() }; | 
|---|
| 90 | } | 
|---|
| 91 |  | 
|---|
| 92 | inline bool compare_exchange_weak(FlushCloseStatus& expected, FlushCloseStatus value) { | 
|---|
| 93 | uint16_t expected_control = expected.control; | 
|---|
| 94 | bool result = control_.compare_exchange_weak(expected_control, value.control); | 
|---|
| 95 | expected.control = expected_control; | 
|---|
| 96 | return result; | 
|---|
| 97 | } | 
|---|
| 98 | inline bool compare_exchange_strong(FlushCloseStatus& expected, FlushCloseStatus value) { | 
|---|
| 99 | uint16_t expected_control = expected.control; | 
|---|
| 100 | bool result = control_.compare_exchange_strong(expected_control, value.control); | 
|---|
| 101 | expected.control = expected_control; | 
|---|
| 102 | return result; | 
|---|
| 103 | } | 
|---|
| 104 |  | 
|---|
| 105 | union { | 
|---|
| 106 | FlushCloseStatus status_; | 
|---|
| 107 | std::atomic<uint16_t> control_; | 
|---|
| 108 | }; | 
|---|
| 109 | }; | 
|---|
| 110 | static_assert(sizeof(AtomicFlushCloseStatus) == 2, "sizeof(FlushCloseStatus) != 2"); | 
|---|
| 111 |  | 
|---|
| 112 | struct FullPageStatus { | 
|---|
| 113 | FullPageStatus() | 
|---|
| 114 | : LastFlushedUntilAddress{ 0 } | 
|---|
| 115 | , status{} { | 
|---|
| 116 | } | 
|---|
| 117 |  | 
|---|
| 118 | AtomicAddress LastFlushedUntilAddress; | 
|---|
| 119 | AtomicFlushCloseStatus status; | 
|---|
| 120 | }; | 
|---|
| 121 | static_assert(sizeof(FullPageStatus) == 16, "sizeof(FullPageStatus) != 16"); | 
|---|
| 122 |  | 
|---|
| 123 | /// Page and offset of the tail of the log. Can reserve space within the current page or move to a | 
|---|
| 124 | /// new page. | 
|---|
| 125 | class PageOffset { | 
|---|
| 126 | public: | 
|---|
| 127 | PageOffset(uint32_t page, uint64_t offset) | 
|---|
| 128 | : offset_{ offset } | 
|---|
| 129 | , page_{ page } { | 
|---|
| 130 | assert(page <= Address::kMaxPage); | 
|---|
| 131 | } | 
|---|
| 132 |  | 
|---|
| 133 | PageOffset(uint64_t control) | 
|---|
| 134 | : control_{ control } { | 
|---|
| 135 | } | 
|---|
| 136 |  | 
|---|
| 137 | PageOffset(const Address& address) | 
|---|
| 138 | : offset_{ address.offset() } | 
|---|
| 139 | , page_{ address.page() } { | 
|---|
| 140 | } | 
|---|
| 141 |  | 
|---|
| 142 | /// Accessors. | 
|---|
| 143 | inline uint64_t offset() const { | 
|---|
| 144 | return offset_; | 
|---|
| 145 | } | 
|---|
| 146 | inline uint32_t page() const { | 
|---|
| 147 | return static_cast<uint32_t>(page_); | 
|---|
| 148 | } | 
|---|
| 149 | inline uint64_t control() const { | 
|---|
| 150 | return control_; | 
|---|
| 151 | } | 
|---|
| 152 |  | 
|---|
| 153 | /// Conversion operator. | 
|---|
| 154 | inline operator Address() const { | 
|---|
| 155 | assert(offset_ <= Address::kMaxOffset); | 
|---|
| 156 | return Address{ page(), static_cast<uint32_t>(offset()) }; | 
|---|
| 157 | } | 
|---|
| 158 |  | 
|---|
| 159 | private: | 
|---|
| 160 | /// Use 41 bits for offset, which gives us approximately 2 PB of overflow space, for | 
|---|
| 161 | /// Reserve(). | 
|---|
| 162 | union { | 
|---|
| 163 | struct { | 
|---|
| 164 | uint64_t offset_ : 64 - Address::kPageBits; | 
|---|
| 165 | uint64_t page_ : Address::kPageBits; | 
|---|
| 166 | }; | 
|---|
| 167 | uint64_t control_; | 
|---|
| 168 | }; | 
|---|
| 169 | }; | 
|---|
| 170 | static_assert(sizeof(PageOffset) == 8, "sizeof(PageOffset) != 8"); | 
|---|
| 171 |  | 
|---|
| 172 | /// Atomic page + offset marker. Can Reserve() space from current page, or move to NewPage(). | 
|---|
| 173 | class AtomicPageOffset { | 
|---|
| 174 | public: | 
|---|
| 175 | AtomicPageOffset() | 
|---|
| 176 | : control_{ 0 } { | 
|---|
| 177 | } | 
|---|
| 178 |  | 
|---|
| 179 | AtomicPageOffset(uint32_t page, uint64_t offset) | 
|---|
| 180 | : control_{ PageOffset{ page, offset } .control() } { | 
|---|
| 181 | } | 
|---|
| 182 |  | 
|---|
| 183 | AtomicPageOffset(const Address& address) { | 
|---|
| 184 | PageOffset page_offset{ address }; | 
|---|
| 185 | control_.store(page_offset.control()); | 
|---|
| 186 | } | 
|---|
| 187 |  | 
|---|
| 188 | /// Reserve space within the current page. Can overflow the page boundary (so result offset > | 
|---|
| 189 | /// Address::kMaxOffset). | 
|---|
| 190 | inline PageOffset Reserve(uint32_t num_slots) { | 
|---|
| 191 | assert(num_slots <= Address::kMaxOffset); | 
|---|
| 192 | PageOffset offset{ 0, num_slots }; | 
|---|
| 193 | return PageOffset{ control_.fetch_add(offset.control()) }; | 
|---|
| 194 | } | 
|---|
| 195 |  | 
|---|
| 196 | /// Move to the next page. The compare-and-swap can fail. Returns "true" if some thread advanced | 
|---|
| 197 | /// the thread; sets "won_cas" = "true" if this thread won the CAS, which means it has been | 
|---|
| 198 | /// chosen to set up the new page. | 
|---|
| 199 | inline bool NewPage(uint32_t old_page, bool& won_cas) { | 
|---|
| 200 | assert(old_page < Address::kMaxPage); | 
|---|
| 201 | won_cas = false; | 
|---|
| 202 | PageOffset expected_page_offset = load(); | 
|---|
| 203 | if(old_page != expected_page_offset.page()) { | 
|---|
| 204 | // Another thread already moved to the new page. | 
|---|
| 205 | assert(old_page < expected_page_offset.page()); | 
|---|
| 206 | return true; | 
|---|
| 207 | } | 
|---|
| 208 | PageOffset new_page{ old_page + 1, 0 }; | 
|---|
| 209 | uint64_t expected = expected_page_offset.control(); | 
|---|
| 210 | // Try to move to a new page. | 
|---|
| 211 | won_cas = control_.compare_exchange_strong(expected, new_page.control()); | 
|---|
| 212 | return PageOffset{ expected } .page() > old_page; | 
|---|
| 213 | } | 
|---|
| 214 |  | 
|---|
| 215 | inline PageOffset load() const { | 
|---|
| 216 | return PageOffset{ control_.load() }; | 
|---|
| 217 | } | 
|---|
| 218 | inline void store(Address address) { | 
|---|
| 219 | PageOffset page_offset{ address.page(), address.offset() }; | 
|---|
| 220 | control_.store(page_offset.control()); | 
|---|
| 221 | } | 
|---|
| 222 |  | 
|---|
| 223 | private: | 
|---|
| 224 | union { | 
|---|
| 225 | /// Atomic access to the page+offset. | 
|---|
| 226 | std::atomic<uint64_t> control_; | 
|---|
| 227 | }; | 
|---|
| 228 | }; | 
|---|
| 229 | static_assert(sizeof(AtomicPageOffset) == 8, "sizeof(AtomicPageOffset) != 8"); | 
|---|
| 230 |  | 
|---|
| 231 | /// The main allocator. | 
|---|
| 232 | template <class D> | 
|---|
| 233 | class PersistentMemoryMalloc { | 
|---|
| 234 | public: | 
|---|
| 235 | typedef D disk_t; | 
|---|
| 236 | typedef typename D::file_t file_t; | 
|---|
| 237 | typedef typename D::log_file_t log_file_t; | 
|---|
| 238 | typedef PersistentMemoryMalloc<disk_t> alloc_t; | 
|---|
| 239 |  | 
|---|
| 240 | /// Each page in the buffer is 2^25 bytes (= 32 MB). | 
|---|
| 241 | static constexpr uint64_t kPageSize = Address::kMaxOffset + 1; | 
|---|
| 242 |  | 
|---|
| 243 | /// The first 4 HLOG pages should be below the head (i.e., being flushed to disk). | 
|---|
| 244 | static constexpr uint32_t kNumHeadPages = 4; | 
|---|
| 245 |  | 
|---|
| 246 | PersistentMemoryMalloc(uint64_t log_size, LightEpoch& epoch, disk_t& disk_, log_file_t& file_, | 
|---|
| 247 | Address start_address, double log_mutable_fraction) | 
|---|
| 248 | : sector_size{ static_cast<uint32_t>(file_.alignment()) } | 
|---|
| 249 | , epoch_{ &epoch } | 
|---|
| 250 | , disk{ &disk_ } | 
|---|
| 251 | , file{ &file_ } | 
|---|
| 252 | , read_buffer_pool{ 1, sector_size } | 
|---|
| 253 | , io_buffer_pool{ 1, sector_size } | 
|---|
| 254 | , read_only_address{ start_address } | 
|---|
| 255 | , safe_read_only_address{ start_address } | 
|---|
| 256 | , head_address{ start_address } | 
|---|
| 257 | , safe_head_address{ start_address } | 
|---|
| 258 | , flushed_until_address{ start_address } | 
|---|
| 259 | , begin_address{ start_address } | 
|---|
| 260 | , tail_page_offset_{ start_address } | 
|---|
| 261 | , buffer_size_{ 0 } | 
|---|
| 262 | , pages_{ nullptr } | 
|---|
| 263 | , page_status_{ nullptr } { | 
|---|
| 264 | assert(start_address.page() <= Address::kMaxPage); | 
|---|
| 265 |  | 
|---|
| 266 | if(log_size % kPageSize != 0) { | 
|---|
| 267 | throw std::invalid_argument{ "Log size must be a multiple of 32 MB"}; | 
|---|
| 268 | } | 
|---|
| 269 | if(log_size % kPageSize > UINT32_MAX) { | 
|---|
| 270 | throw std::invalid_argument{ "Log size must be <= 128 PB"}; | 
|---|
| 271 | } | 
|---|
| 272 | buffer_size_ = static_cast<uint32_t>(log_size / kPageSize); | 
|---|
| 273 |  | 
|---|
| 274 | if(buffer_size_ <= kNumHeadPages + 1) { | 
|---|
| 275 | throw std::invalid_argument{ "Must have at least 2 non-head pages"}; | 
|---|
| 276 | } | 
|---|
| 277 | // The latest N pages should be mutable. | 
|---|
| 278 | num_mutable_pages_ = static_cast<uint32_t>(log_mutable_fraction * buffer_size_); | 
|---|
| 279 | if(num_mutable_pages_ <= 1) { | 
|---|
| 280 | // Need at least two mutable pages: one to write to, and one to open up when the previous | 
|---|
| 281 | // mutable page is full. | 
|---|
| 282 | throw std::invalid_argument{ "Must have at least 2 mutable pages"}; | 
|---|
| 283 | } | 
|---|
| 284 |  | 
|---|
| 285 | pages_ = new uint8_t* [buffer_size_]; | 
|---|
| 286 | for(uint32_t idx = 0; idx < buffer_size_; ++idx) { | 
|---|
| 287 | pages_[idx] = nullptr; | 
|---|
| 288 | } | 
|---|
| 289 |  | 
|---|
| 290 | page_status_ = new FullPageStatus[buffer_size_]; | 
|---|
| 291 |  | 
|---|
| 292 | PageOffset tail_page_offset = tail_page_offset_.load(); | 
|---|
| 293 | AllocatePage(tail_page_offset.page()); | 
|---|
| 294 | AllocatePage(tail_page_offset.page() + 1); | 
|---|
| 295 | } | 
|---|
| 296 |  | 
|---|
| 297 | PersistentMemoryMalloc(uint64_t log_size, LightEpoch& epoch, disk_t& disk_, log_file_t& file_, | 
|---|
| 298 | double log_mutable_fraction) | 
|---|
| 299 | : PersistentMemoryMalloc(log_size, epoch, disk_, file_, Address{ 0 }, log_mutable_fraction) { | 
|---|
| 300 | /// Allocate the invalid page. Supports allocations aligned up to kCacheLineBytes. | 
|---|
| 301 | uint32_t discard; | 
|---|
| 302 | Allocate(Constants::kCacheLineBytes, discard); | 
|---|
| 303 | assert(discard == UINT32_MAX); | 
|---|
| 304 | /// Move the head and read-only address past the invalid page. | 
|---|
| 305 | Address tail_address = tail_page_offset_.load(); | 
|---|
| 306 | begin_address.store(tail_address); | 
|---|
| 307 | read_only_address.store(tail_address); | 
|---|
| 308 | safe_read_only_address.store(tail_address); | 
|---|
| 309 | head_address.store(tail_address); | 
|---|
| 310 | safe_head_address.store(tail_address); | 
|---|
| 311 | } | 
|---|
| 312 |  | 
|---|
| 313 | ~PersistentMemoryMalloc() { | 
|---|
| 314 | if(pages_) { | 
|---|
| 315 | for(uint32_t idx = 0; idx < buffer_size_; ++idx) { | 
|---|
| 316 | if(pages_[idx]) { | 
|---|
| 317 | aligned_free(pages_[idx]); | 
|---|
| 318 | } | 
|---|
| 319 | } | 
|---|
| 320 | delete[] pages_; | 
|---|
| 321 | } | 
|---|
| 322 | if(page_status_) { | 
|---|
| 323 | delete[] page_status_; | 
|---|
| 324 | } | 
|---|
| 325 | } | 
|---|
| 326 |  | 
|---|
| 327 | inline const uint8_t* Page(uint32_t page) const { | 
|---|
| 328 | assert(page <= Address::kMaxPage); | 
|---|
| 329 | return pages_[page % buffer_size_]; | 
|---|
| 330 | } | 
|---|
| 331 | inline uint8_t* Page(uint32_t page) { | 
|---|
| 332 | assert(page <= Address::kMaxPage); | 
|---|
| 333 | return pages_[page % buffer_size_]; | 
|---|
| 334 | } | 
|---|
| 335 |  | 
|---|
| 336 | inline const FullPageStatus& PageStatus(uint32_t page) const { | 
|---|
| 337 | assert(page <= Address::kMaxPage); | 
|---|
| 338 | return page_status_[page % buffer_size_]; | 
|---|
| 339 | } | 
|---|
| 340 | inline FullPageStatus& PageStatus(uint32_t page) { | 
|---|
| 341 | assert(page <= Address::kMaxPage); | 
|---|
| 342 | return page_status_[page % buffer_size_]; | 
|---|
| 343 | } | 
|---|
| 344 |  | 
|---|
| 345 | inline uint32_t buffer_size() const { | 
|---|
| 346 | return buffer_size_; | 
|---|
| 347 | } | 
|---|
| 348 |  | 
|---|
| 349 | /// Read the tail page + offset, atomically, and convert it to an address. | 
|---|
| 350 | inline Address GetTailAddress() const { | 
|---|
| 351 | PageOffset tail_page_offset = tail_page_offset_.load(); | 
|---|
| 352 | return Address{ tail_page_offset.page(), std::min(Address::kMaxOffset, | 
|---|
| 353 | static_cast<uint32_t>(tail_page_offset.offset())) }; | 
|---|
| 354 | } | 
|---|
| 355 |  | 
|---|
| 356 | inline const uint8_t* Get(Address address) const { | 
|---|
| 357 | return Page(address.page()) + address.offset(); | 
|---|
| 358 | } | 
|---|
| 359 | inline uint8_t* Get(Address address) { | 
|---|
| 360 | return Page(address.page()) + address.offset(); | 
|---|
| 361 | } | 
|---|
| 362 |  | 
|---|
| 363 | /// Key function used to allocate memory for a specified number of items. If the current page is | 
|---|
| 364 | /// full, returns Address::kInvalidAddress and sets closed_page to the current page index. The | 
|---|
| 365 | /// caller should Refresh() the epoch and call NewPage() until successful, before trying to | 
|---|
| 366 | /// Allocate() again. | 
|---|
| 367 | inline Address Allocate(uint32_t num_slots, uint32_t& closed_page); | 
|---|
| 368 |  | 
|---|
| 369 | /// Tries to move the allocator to a new page; used when the current page is full. Returns "true" | 
|---|
| 370 | /// if the page advanced (so the caller can try to allocate, again). | 
|---|
| 371 | inline bool NewPage(uint32_t old_page); | 
|---|
| 372 |  | 
|---|
| 373 | /// Invoked by users to obtain a record from disk. It uses sector aligned memory to read | 
|---|
| 374 | /// the record efficiently into memory. | 
|---|
| 375 | inline void AsyncGetFromDisk(Address address, uint32_t num_records, AsyncIOCallback callback, | 
|---|
| 376 | AsyncIOContext& context); | 
|---|
| 377 |  | 
|---|
| 378 | /// Used by applications to make the current state of the database immutable quickly | 
|---|
| 379 | Address ShiftReadOnlyToTail(); | 
|---|
| 380 |  | 
|---|
| 381 | void Truncate(GcState::truncate_callback_t callback); | 
|---|
| 382 |  | 
|---|
| 383 | /// Action to be performed for when all threads have agreed that a page range is closed. | 
|---|
| 384 | class OnPagesClosed_Context : public IAsyncContext { | 
|---|
| 385 | public: | 
|---|
| 386 | OnPagesClosed_Context(alloc_t* allocator_, | 
|---|
| 387 | Address new_safe_head_address_, | 
|---|
| 388 | bool replace_with_clean_page_) | 
|---|
| 389 | : allocator{ allocator_ } | 
|---|
| 390 | , new_safe_head_address{ new_safe_head_address_ } | 
|---|
| 391 | , replace_with_clean_page{ replace_with_clean_page_ } { | 
|---|
| 392 | } | 
|---|
| 393 |  | 
|---|
| 394 | /// The deep-copy constructor. | 
|---|
| 395 | OnPagesClosed_Context(const OnPagesClosed_Context& other) | 
|---|
| 396 | : allocator{ other.allocator } | 
|---|
| 397 | , new_safe_head_address{ other.new_safe_head_address } | 
|---|
| 398 | , replace_with_clean_page{ other.replace_with_clean_page } { | 
|---|
| 399 | } | 
|---|
| 400 |  | 
|---|
| 401 | protected: | 
|---|
| 402 | Status DeepCopy_Internal(IAsyncContext*& context_copy) final { | 
|---|
| 403 | return IAsyncContext::DeepCopy_Internal(*this, context_copy); | 
|---|
| 404 | } | 
|---|
| 405 |  | 
|---|
| 406 | public: | 
|---|
| 407 | alloc_t* allocator; | 
|---|
| 408 | Address new_safe_head_address; | 
|---|
| 409 | bool replace_with_clean_page; | 
|---|
| 410 | }; | 
|---|
| 411 |  | 
|---|
| 412 | static void OnPagesClosed(IAsyncContext* ctxt); | 
|---|
| 413 |  | 
|---|
| 414 | /// Seal: make sure there are no longer any threads writing to the page | 
|---|
| 415 | /// Flush: send page to secondary store | 
|---|
| 416 | class OnPagesMarkedReadOnly_Context : public IAsyncContext { | 
|---|
| 417 | public: | 
|---|
| 418 | OnPagesMarkedReadOnly_Context(alloc_t* allocator_, | 
|---|
| 419 | Address new_safe_read_only_address_, | 
|---|
| 420 | bool wait_for_pending_flush_complete_) | 
|---|
| 421 | : allocator{ allocator_ } | 
|---|
| 422 | , new_safe_read_only_address{ new_safe_read_only_address_ } | 
|---|
| 423 | , wait_for_pending_flush_complete{ wait_for_pending_flush_complete_ } { | 
|---|
| 424 | } | 
|---|
| 425 |  | 
|---|
| 426 | /// The deep-copy constructor. | 
|---|
| 427 | OnPagesMarkedReadOnly_Context(const OnPagesMarkedReadOnly_Context& other) | 
|---|
| 428 | : allocator{ other.allocator } | 
|---|
| 429 | , new_safe_read_only_address{ other.new_safe_read_only_address } | 
|---|
| 430 | , wait_for_pending_flush_complete{ other.wait_for_pending_flush_complete } { | 
|---|
| 431 | } | 
|---|
| 432 |  | 
|---|
| 433 | protected: | 
|---|
| 434 | Status DeepCopy_Internal(IAsyncContext*& context_copy) final { | 
|---|
| 435 | return IAsyncContext::DeepCopy_Internal(*this, context_copy); | 
|---|
| 436 | } | 
|---|
| 437 |  | 
|---|
| 438 | public: | 
|---|
| 439 | alloc_t* allocator; | 
|---|
| 440 | Address new_safe_read_only_address; | 
|---|
| 441 | bool wait_for_pending_flush_complete; | 
|---|
| 442 | }; | 
|---|
| 443 |  | 
|---|
| 444 | static void OnPagesMarkedReadOnly(IAsyncContext* ctxt); | 
|---|
| 445 |  | 
|---|
| 446 | private: | 
|---|
| 447 | inline void GetFileReadBoundaries(Address read_offset, uint32_t read_length, | 
|---|
| 448 | uint64_t& begin_read, uint64_t& end_read, uint32_t& offset, | 
|---|
| 449 | uint32_t& length) const { | 
|---|
| 450 | assert(sector_size > 0); | 
|---|
| 451 | assert(Utility::IsPowerOfTwo(sector_size)); | 
|---|
| 452 | assert(sector_size <= UINT32_MAX); | 
|---|
| 453 | size_t alignment_mask = sector_size - 1; | 
|---|
| 454 | // Align read to sector boundary. | 
|---|
| 455 | begin_read = read_offset.control() & ~alignment_mask; | 
|---|
| 456 | end_read = (read_offset.control() + read_length + alignment_mask) & ~alignment_mask; | 
|---|
| 457 | offset = static_cast<uint32_t>(read_offset.control() & alignment_mask); | 
|---|
| 458 | assert(end_read - begin_read <= UINT32_MAX); | 
|---|
| 459 | length = static_cast<uint32_t>(end_read - begin_read); | 
|---|
| 460 | } | 
|---|
| 461 |  | 
|---|
| 462 | /// Allocate memory page, in sector aligned form | 
|---|
| 463 | inline void AllocatePage(uint32_t index); | 
|---|
| 464 |  | 
|---|
| 465 | /// Used by several functions to update the variable to newValue. Ignores if newValue is smaller | 
|---|
| 466 | /// than the current value. | 
|---|
| 467 | template <typename A, typename T> | 
|---|
| 468 | inline bool MonotonicUpdate(A& variable, T new_value, | 
|---|
| 469 | T& old_value) { | 
|---|
| 470 | old_value = variable.load(); | 
|---|
| 471 | while(old_value < new_value) { | 
|---|
| 472 | if(variable.compare_exchange_strong(old_value, new_value)) { | 
|---|
| 473 | return true; | 
|---|
| 474 | } | 
|---|
| 475 | } | 
|---|
| 476 | return false; | 
|---|
| 477 | } | 
|---|
| 478 |  | 
|---|
| 479 | Status AsyncFlushPages(uint32_t start_page, Address until_address, | 
|---|
| 480 | bool serialize_objects = false); | 
|---|
| 481 |  | 
|---|
| 482 | public: | 
|---|
| 483 | Status AsyncFlushPagesToFile(uint32_t start_page, Address until_address, file_t& file, | 
|---|
| 484 | std::atomic<uint32_t>& flush_pending); | 
|---|
| 485 |  | 
|---|
| 486 | /// Recovery. | 
|---|
| 487 | Status AsyncReadPagesFromLog(uint32_t start_page, uint32_t num_pages, | 
|---|
| 488 | RecoveryStatus& recovery_status); | 
|---|
| 489 | Status AsyncReadPagesFromSnapshot(file_t& snapshot_file, uint32_t file_start_page, | 
|---|
| 490 | uint32_t start_page, uint32_t num_pages, | 
|---|
| 491 | RecoveryStatus& recovery_status); | 
|---|
| 492 |  | 
|---|
| 493 | Status AsyncFlushPage(uint32_t page, RecoveryStatus& recovery_status, | 
|---|
| 494 | AsyncCallback caller_callback, IAsyncContext* caller_context); | 
|---|
| 495 | void RecoveryReset(Address begin_address_, Address head_address_, Address tail_address); | 
|---|
| 496 |  | 
|---|
| 497 | private: | 
|---|
| 498 | template <class F> | 
|---|
| 499 | Status AsyncReadPages(F& read_file, uint32_t file_start_page, uint32_t start_page, | 
|---|
| 500 | uint32_t num_pages, RecoveryStatus& recovery_status); | 
|---|
| 501 | inline void PageAlignedShiftHeadAddress(uint32_t tail_page); | 
|---|
| 502 | inline void PageAlignedShiftReadOnlyAddress(uint32_t tail_page); | 
|---|
| 503 |  | 
|---|
| 504 | /// Every async flush callback tries to update the flushed until address to the latest value | 
|---|
| 505 | /// possible | 
|---|
| 506 | /// Is there a better way to do this with enabling fine-grained addresses (not necessarily at | 
|---|
| 507 | /// page boundaries)? | 
|---|
| 508 | inline void ShiftFlushedUntilAddress() { | 
|---|
| 509 | Address current_flushed_until_address = flushed_until_address.load(); | 
|---|
| 510 | uint32_t page = current_flushed_until_address.page(); | 
|---|
| 511 |  | 
|---|
| 512 | bool update = false; | 
|---|
| 513 | Address page_last_flushed_address = PageStatus(page).LastFlushedUntilAddress.load(); | 
|---|
| 514 | while(page_last_flushed_address >= current_flushed_until_address) { | 
|---|
| 515 | current_flushed_until_address = page_last_flushed_address; | 
|---|
| 516 | update = true; | 
|---|
| 517 | ++page; | 
|---|
| 518 | page_last_flushed_address = PageStatus(page).LastFlushedUntilAddress.load(); | 
|---|
| 519 | } | 
|---|
| 520 |  | 
|---|
| 521 | if(update) { | 
|---|
| 522 | Address discard; | 
|---|
| 523 | MonotonicUpdate(flushed_until_address, current_flushed_until_address, discard); | 
|---|
| 524 | } | 
|---|
| 525 | } | 
|---|
| 526 |  | 
|---|
| 527 | public: | 
|---|
| 528 | uint32_t sector_size; | 
|---|
| 529 |  | 
|---|
| 530 | private: | 
|---|
| 531 | LightEpoch* epoch_; | 
|---|
| 532 | disk_t* disk; | 
|---|
| 533 |  | 
|---|
| 534 | public: | 
|---|
| 535 | log_file_t* file; | 
|---|
| 536 | // Read buffer pool | 
|---|
| 537 | NativeSectorAlignedBufferPool read_buffer_pool; | 
|---|
| 538 | NativeSectorAlignedBufferPool io_buffer_pool; | 
|---|
| 539 |  | 
|---|
| 540 | /// Every address < ReadOnlyAddress is read-only. | 
|---|
| 541 | AtomicAddress read_only_address; | 
|---|
| 542 | /// The minimum ReadOnlyAddress that every thread has seen. | 
|---|
| 543 | AtomicAddress safe_read_only_address; | 
|---|
| 544 |  | 
|---|
| 545 | /// The circular buffer can drop any page < HeadAddress.page()--must read those pages from disk. | 
|---|
| 546 | AtomicAddress head_address; | 
|---|
| 547 | /// The minimum HeadPage that every thread has seen. | 
|---|
| 548 | AtomicAddress safe_head_address; | 
|---|
| 549 |  | 
|---|
| 550 | AtomicAddress flushed_until_address; | 
|---|
| 551 |  | 
|---|
| 552 | /// The address of the true head of the log--everything before this address has been truncated | 
|---|
| 553 | /// by garbage collection. | 
|---|
| 554 | AtomicAddress begin_address; | 
|---|
| 555 |  | 
|---|
| 556 | private: | 
|---|
| 557 | uint32_t buffer_size_; | 
|---|
| 558 |  | 
|---|
| 559 | /// -- the latest N pages should be mutable. | 
|---|
| 560 | uint32_t num_mutable_pages_; | 
|---|
| 561 |  | 
|---|
| 562 | // Circular buffer definition | 
|---|
| 563 | uint8_t** pages_; | 
|---|
| 564 |  | 
|---|
| 565 | // Array that indicates the status of each buffer page | 
|---|
| 566 | FullPageStatus* page_status_; | 
|---|
| 567 |  | 
|---|
| 568 | // Global address of the current tail (next element to be allocated from the circular buffer) | 
|---|
| 569 | AtomicPageOffset tail_page_offset_; | 
|---|
| 570 | }; | 
|---|
| 571 |  | 
|---|
| 572 | /// Implementations. | 
|---|
| 573 | template <class D> | 
|---|
| 574 | inline void PersistentMemoryMalloc<D>::AllocatePage(uint32_t index) { | 
|---|
| 575 | index = index % buffer_size_; | 
|---|
| 576 | assert(pages_[index] == nullptr); | 
|---|
| 577 | pages_[index] = reinterpret_cast<uint8_t*>(aligned_alloc(sector_size, kPageSize));; | 
|---|
| 578 | std::memset(pages_[index], 0, kPageSize); | 
|---|
| 579 |  | 
|---|
| 580 | // Mark the page as accessible. | 
|---|
| 581 | page_status_[index].status.store(FlushStatus::Flushed, CloseStatus::Open); | 
|---|
| 582 | } | 
|---|
| 583 |  | 
|---|
| 584 | template <class D> | 
|---|
| 585 | inline Address PersistentMemoryMalloc<D>::Allocate(uint32_t num_slots, uint32_t& closed_page) { | 
|---|
| 586 | closed_page = UINT32_MAX; | 
|---|
| 587 | PageOffset page_offset = tail_page_offset_.Reserve(num_slots); | 
|---|
| 588 |  | 
|---|
| 589 | if(page_offset.offset() + num_slots > kPageSize) { | 
|---|
| 590 | // The current page is full. The caller should Refresh() the epoch and wait until | 
|---|
| 591 | // NewPage() is successful before trying to Allocate() again. | 
|---|
| 592 | closed_page = page_offset.page(); | 
|---|
| 593 | return Address::kInvalidAddress; | 
|---|
| 594 | } else { | 
|---|
| 595 | assert(Page(page_offset.page())); | 
|---|
| 596 | return static_cast<Address>(page_offset); | 
|---|
| 597 | } | 
|---|
| 598 | } | 
|---|
| 599 |  | 
|---|
| 600 | template <class D> | 
|---|
| 601 | inline bool PersistentMemoryMalloc<D>::NewPage(uint32_t old_page) { | 
|---|
| 602 | assert(old_page < Address::kMaxPage); | 
|---|
| 603 | PageOffset new_tail_offset{ old_page + 1, 0 }; | 
|---|
| 604 | // When the tail advances to page k+1, we clear page k+2. | 
|---|
| 605 | if(old_page + 2 >= safe_head_address.page() + buffer_size_) { | 
|---|
| 606 | // No room in the circular buffer for a new page; try to advance the head address, to make | 
|---|
| 607 | // more room available. | 
|---|
| 608 | disk->TryComplete(); | 
|---|
| 609 | PageAlignedShiftReadOnlyAddress(old_page + 1); | 
|---|
| 610 | PageAlignedShiftHeadAddress(old_page + 1); | 
|---|
| 611 | return false; | 
|---|
| 612 | } | 
|---|
| 613 | FlushCloseStatus status = PageStatus(old_page + 1).status.load(); | 
|---|
| 614 | if(!status.Ready()) { | 
|---|
| 615 | // Can't access the next page yet; try to advance the head address, to make the page | 
|---|
| 616 | // available. | 
|---|
| 617 | disk->TryComplete(); | 
|---|
| 618 | PageAlignedShiftReadOnlyAddress(old_page + 1); | 
|---|
| 619 | PageAlignedShiftHeadAddress(old_page + 1); | 
|---|
| 620 | return false; | 
|---|
| 621 | } | 
|---|
| 622 | bool won_cas; | 
|---|
| 623 | bool retval = tail_page_offset_.NewPage(old_page, won_cas); | 
|---|
| 624 | if(won_cas) { | 
|---|
| 625 | // We moved the tail to (page + 1), so we are responsible for moving the head and | 
|---|
| 626 | // read-only addresses. | 
|---|
| 627 | PageAlignedShiftReadOnlyAddress(old_page + 1); | 
|---|
| 628 | PageAlignedShiftHeadAddress(old_page + 1); | 
|---|
| 629 | if(!Page(old_page + 2)) { | 
|---|
| 630 | // We are also responsible for allocating (page + 2). | 
|---|
| 631 | AllocatePage(old_page + 2); | 
|---|
| 632 | } | 
|---|
| 633 | } | 
|---|
| 634 | return retval; | 
|---|
| 635 | } | 
|---|
| 636 |  | 
|---|
| 637 | template <class D> | 
|---|
| 638 | inline void PersistentMemoryMalloc<D>::AsyncGetFromDisk(Address address, uint32_t num_records, | 
|---|
| 639 | AsyncIOCallback callback, AsyncIOContext& context) { | 
|---|
| 640 | uint64_t begin_read, end_read; | 
|---|
| 641 | uint32_t offset, length; | 
|---|
| 642 | GetFileReadBoundaries(address, num_records, begin_read, end_read, offset, length); | 
|---|
| 643 | context.record = read_buffer_pool.Get(length); | 
|---|
| 644 | context.record.valid_offset = offset; | 
|---|
| 645 | context.record.available_bytes = length - offset; | 
|---|
| 646 | context.record.required_bytes = num_records; | 
|---|
| 647 |  | 
|---|
| 648 | file->ReadAsync(begin_read, context.record.buffer(), length, callback, context); | 
|---|
| 649 | } | 
|---|
| 650 |  | 
|---|
| 651 | template <class D> | 
|---|
| 652 | Address PersistentMemoryMalloc<D>::ShiftReadOnlyToTail() { | 
|---|
| 653 | Address tail_address = GetTailAddress(); | 
|---|
| 654 | Address old_read_only_address; | 
|---|
| 655 | if(MonotonicUpdate(read_only_address, tail_address, old_read_only_address)) { | 
|---|
| 656 | OnPagesMarkedReadOnly_Context context{ this, tail_address, false }; | 
|---|
| 657 | IAsyncContext* context_copy; | 
|---|
| 658 | Status result = context.DeepCopy(context_copy); | 
|---|
| 659 | assert(result == Status::Ok); | 
|---|
| 660 | epoch_->BumpCurrentEpoch(OnPagesMarkedReadOnly, context_copy); | 
|---|
| 661 | } | 
|---|
| 662 | return tail_address; | 
|---|
| 663 | } | 
|---|
| 664 |  | 
|---|
| 665 | template <class D> | 
|---|
| 666 | void PersistentMemoryMalloc<D>::Truncate(GcState::truncate_callback_t callback) { | 
|---|
| 667 | assert(sector_size > 0); | 
|---|
| 668 | assert(Utility::IsPowerOfTwo(sector_size)); | 
|---|
| 669 | assert(sector_size <= UINT32_MAX); | 
|---|
| 670 | size_t alignment_mask = sector_size - 1; | 
|---|
| 671 | // Align read to sector boundary. | 
|---|
| 672 | uint64_t begin_offset = begin_address.control() & ~alignment_mask; | 
|---|
| 673 | file->Truncate(begin_offset, callback); | 
|---|
| 674 | } | 
|---|
| 675 |  | 
|---|
| 676 | template <class D> | 
|---|
| 677 | void PersistentMemoryMalloc<D>::OnPagesClosed(IAsyncContext* ctxt) { | 
|---|
| 678 | CallbackContext<OnPagesClosed_Context> context{ ctxt }; | 
|---|
| 679 | Address old_safe_head_address; | 
|---|
| 680 | if(context->allocator->MonotonicUpdate(context->allocator->safe_head_address, | 
|---|
| 681 | context->new_safe_head_address, | 
|---|
| 682 | old_safe_head_address)) { | 
|---|
| 683 | for(uint32_t idx = old_safe_head_address.page(); idx < context->new_safe_head_address.page(); | 
|---|
| 684 | ++idx) { | 
|---|
| 685 | FlushCloseStatus old_status = context->allocator->PageStatus(idx).status.load(); | 
|---|
| 686 | FlushCloseStatus new_status; | 
|---|
| 687 | do { | 
|---|
| 688 | new_status = FlushCloseStatus{ old_status.flush, CloseStatus::Closed }; | 
|---|
| 689 | } while(!context->allocator->PageStatus(idx).status.compare_exchange_weak(old_status, | 
|---|
| 690 | new_status)); | 
|---|
| 691 |  | 
|---|
| 692 | if(old_status.flush == FlushStatus::Flushed) { | 
|---|
| 693 | // We closed the page after it was flushed, so we are responsible for clearing and | 
|---|
| 694 | // reopening it. | 
|---|
| 695 | std::memset(context->allocator->Page(idx), 0, kPageSize); | 
|---|
| 696 | context->allocator->PageStatus(idx).status.store(FlushStatus::Flushed, CloseStatus::Open); | 
|---|
| 697 | } | 
|---|
| 698 | } | 
|---|
| 699 | } | 
|---|
| 700 | } | 
|---|
| 701 |  | 
|---|
| 702 | template <class D> | 
|---|
| 703 | void PersistentMemoryMalloc<D>::OnPagesMarkedReadOnly(IAsyncContext* ctxt) { | 
|---|
| 704 | CallbackContext<OnPagesMarkedReadOnly_Context> context{ ctxt }; | 
|---|
| 705 | Address old_safe_read_only_address; | 
|---|
| 706 | if(context->allocator->MonotonicUpdate(context->allocator->safe_read_only_address, | 
|---|
| 707 | context->new_safe_read_only_address, | 
|---|
| 708 | old_safe_read_only_address)) { | 
|---|
| 709 | context->allocator->AsyncFlushPages(old_safe_read_only_address.page(), | 
|---|
| 710 | context->new_safe_read_only_address); | 
|---|
| 711 | } | 
|---|
| 712 | } | 
|---|
| 713 |  | 
|---|
| 714 | template <class D> | 
|---|
| 715 | Status PersistentMemoryMalloc<D>::AsyncFlushPages(uint32_t start_page, Address until_address, | 
|---|
| 716 | bool serialize_objects) { | 
|---|
| 717 | class Context : public IAsyncContext { | 
|---|
| 718 | public: | 
|---|
| 719 | Context(alloc_t* allocator_, uint32_t page_, Address until_address_) | 
|---|
| 720 | : allocator{ allocator_ } | 
|---|
| 721 | , page{ page_ } | 
|---|
| 722 | , until_address{ until_address_ } { | 
|---|
| 723 | } | 
|---|
| 724 | /// The deep-copy constructor | 
|---|
| 725 | Context(const Context& other) | 
|---|
| 726 | : allocator{ other.allocator } | 
|---|
| 727 | , page{ other.page } | 
|---|
| 728 | , until_address{ other.until_address } { | 
|---|
| 729 | } | 
|---|
| 730 | protected: | 
|---|
| 731 | Status DeepCopy_Internal(IAsyncContext*& context_copy) final { | 
|---|
| 732 | return IAsyncContext::DeepCopy_Internal(*this, context_copy); | 
|---|
| 733 | } | 
|---|
| 734 | public: | 
|---|
| 735 | alloc_t* allocator; | 
|---|
| 736 | uint32_t page; | 
|---|
| 737 | Address until_address; | 
|---|
| 738 | }; | 
|---|
| 739 |  | 
|---|
| 740 | auto callback = [](IAsyncContext* ctxt, Status result, size_t bytes_transferred) { | 
|---|
| 741 | CallbackContext<Context> context{ ctxt }; | 
|---|
| 742 | if(result != Status::Ok) { | 
|---|
| 743 | fprintf(stderr, "AsyncFlushPages(), error: %u\n", static_cast<uint8_t>(result)); | 
|---|
| 744 | } | 
|---|
| 745 | context->allocator->PageStatus(context->page).LastFlushedUntilAddress.store( | 
|---|
| 746 | context->until_address); | 
|---|
| 747 | //Set the page status to flushed | 
|---|
| 748 | FlushCloseStatus old_status = context->allocator->PageStatus(context->page).status.load(); | 
|---|
| 749 | FlushCloseStatus new_status; | 
|---|
| 750 | do { | 
|---|
| 751 | new_status = FlushCloseStatus{ FlushStatus::Flushed, old_status.close }; | 
|---|
| 752 | } while(!context->allocator->PageStatus(context->page).status.compare_exchange_weak( | 
|---|
| 753 | old_status, new_status)); | 
|---|
| 754 | if(old_status.close == CloseStatus::Closed) { | 
|---|
| 755 | // We finished flushing the page after it was closed, so we are responsible for clearing and | 
|---|
| 756 | // reopening it. | 
|---|
| 757 | std::memset(context->allocator->Page(context->page), 0, kPageSize); | 
|---|
| 758 | context->allocator->PageStatus(context->page).status.store(FlushStatus::Flushed, | 
|---|
| 759 | CloseStatus::Open); | 
|---|
| 760 | } | 
|---|
| 761 | context->allocator->ShiftFlushedUntilAddress(); | 
|---|
| 762 | }; | 
|---|
| 763 |  | 
|---|
| 764 | uint32_t num_pages = until_address.page() - start_page; | 
|---|
| 765 | if(until_address.offset() > 0) { | 
|---|
| 766 | ++num_pages; | 
|---|
| 767 | } | 
|---|
| 768 | assert(num_pages > 0); | 
|---|
| 769 |  | 
|---|
| 770 | for(uint32_t flush_page = start_page; flush_page < start_page + num_pages; ++flush_page) { | 
|---|
| 771 | Address page_start_address{ flush_page, 0 }; | 
|---|
| 772 | Address page_end_address{ flush_page + 1, 0 }; | 
|---|
| 773 |  | 
|---|
| 774 | Context context{ this, flush_page, std::min(page_end_address, until_address) }; | 
|---|
| 775 |  | 
|---|
| 776 | //Set status to in-progress | 
|---|
| 777 | FlushCloseStatus old_status = PageStatus(flush_page).status.load(); | 
|---|
| 778 | FlushCloseStatus new_status; | 
|---|
| 779 | do { | 
|---|
| 780 | new_status = FlushCloseStatus{ FlushStatus::InProgress, old_status.close }; | 
|---|
| 781 | } while(!PageStatus(flush_page).status.compare_exchange_weak(old_status, new_status)); | 
|---|
| 782 | PageStatus(flush_page).LastFlushedUntilAddress.store(0); | 
|---|
| 783 |  | 
|---|
| 784 | RETURN_NOT_OK(file->WriteAsync(Page(flush_page), kPageSize * flush_page, kPageSize, callback, | 
|---|
| 785 | context)); | 
|---|
| 786 | } | 
|---|
| 787 | return Status::Ok; | 
|---|
| 788 | } | 
|---|
| 789 |  | 
|---|
| 790 | template <class D> | 
|---|
| 791 | Status PersistentMemoryMalloc<D>::AsyncFlushPagesToFile(uint32_t start_page, Address until_address, | 
|---|
| 792 | file_t& file, std::atomic<uint32_t>& flush_pending) { | 
|---|
| 793 | class Context : public IAsyncContext { | 
|---|
| 794 | public: | 
|---|
| 795 | Context(std::atomic<uint32_t>& flush_pending_) | 
|---|
| 796 | : flush_pending{ flush_pending_ } { | 
|---|
| 797 | } | 
|---|
| 798 | /// The deep-copy constructor | 
|---|
| 799 | Context(Context& other) | 
|---|
| 800 | : flush_pending{ other.flush_pending } { | 
|---|
| 801 | } | 
|---|
| 802 | protected: | 
|---|
| 803 | Status DeepCopy_Internal(IAsyncContext*& context_copy) final { | 
|---|
| 804 | return IAsyncContext::DeepCopy_Internal(*this, context_copy); | 
|---|
| 805 | } | 
|---|
| 806 | public: | 
|---|
| 807 | std::atomic<uint32_t>& flush_pending; | 
|---|
| 808 | }; | 
|---|
| 809 |  | 
|---|
| 810 | auto callback = [](IAsyncContext* ctxt, Status result, size_t bytes_transferred) { | 
|---|
| 811 | CallbackContext<Context> context{ ctxt }; | 
|---|
| 812 | if(result != Status::Ok) { | 
|---|
| 813 | fprintf(stderr, "AsyncFlushPagesToFile(), error: %u\n", static_cast<uint8_t>(result)); | 
|---|
| 814 | } | 
|---|
| 815 | assert(context->flush_pending > 0); | 
|---|
| 816 | --context->flush_pending; | 
|---|
| 817 | }; | 
|---|
| 818 |  | 
|---|
| 819 | uint32_t num_pages = until_address.page() - start_page; | 
|---|
| 820 | if(until_address.offset() > 0) { | 
|---|
| 821 | ++num_pages; | 
|---|
| 822 | } | 
|---|
| 823 | assert(num_pages > 0); | 
|---|
| 824 | flush_pending = num_pages; | 
|---|
| 825 |  | 
|---|
| 826 | for(uint32_t flush_page = start_page; flush_page < start_page + num_pages; ++flush_page) { | 
|---|
| 827 | Address page_start_address{ flush_page, 0 }; | 
|---|
| 828 | Address page_end_address{ flush_page + 1, 0 }; | 
|---|
| 829 | Context context{ flush_pending }; | 
|---|
| 830 | RETURN_NOT_OK(file.WriteAsync(Page(flush_page), kPageSize * (flush_page - start_page), | 
|---|
| 831 | kPageSize, callback, context)); | 
|---|
| 832 | } | 
|---|
| 833 | return Status::Ok; | 
|---|
| 834 | } | 
|---|
| 835 |  | 
|---|
| 836 | template <class D> | 
|---|
| 837 | Status PersistentMemoryMalloc<D>::AsyncReadPagesFromLog(uint32_t start_page, uint32_t num_pages, | 
|---|
| 838 | RecoveryStatus& recovery_status) { | 
|---|
| 839 | return AsyncReadPages(*file, 0, start_page, num_pages, recovery_status); | 
|---|
| 840 | } | 
|---|
| 841 |  | 
|---|
| 842 | template <class D> | 
|---|
| 843 | Status PersistentMemoryMalloc<D>::AsyncReadPagesFromSnapshot(file_t& snapshot_file, | 
|---|
| 844 | uint32_t file_start_page, uint32_t start_page, uint32_t num_pages, | 
|---|
| 845 | RecoveryStatus& recovery_status) { | 
|---|
| 846 | return AsyncReadPages(snapshot_file, file_start_page, start_page, num_pages, recovery_status); | 
|---|
| 847 | } | 
|---|
| 848 |  | 
|---|
| 849 | template <class D> | 
|---|
| 850 | template <class F> | 
|---|
| 851 | Status PersistentMemoryMalloc<D>::AsyncReadPages(F& read_file, uint32_t file_start_page, | 
|---|
| 852 | uint32_t start_page, uint32_t num_pages, RecoveryStatus& recovery_status) { | 
|---|
| 853 | class Context : public IAsyncContext { | 
|---|
| 854 | public: | 
|---|
| 855 | Context(std::atomic<PageRecoveryStatus>& page_status_) | 
|---|
| 856 | : page_status{ &page_status_ } { | 
|---|
| 857 | } | 
|---|
| 858 | /// The deep-copy constructor | 
|---|
| 859 | Context(const Context& other) | 
|---|
| 860 | : page_status{ other.page_status } { | 
|---|
| 861 | } | 
|---|
| 862 | protected: | 
|---|
| 863 | Status DeepCopy_Internal(IAsyncContext*& context_copy) final { | 
|---|
| 864 | return IAsyncContext::DeepCopy_Internal(*this, context_copy); | 
|---|
| 865 | } | 
|---|
| 866 | public: | 
|---|
| 867 | std::atomic<PageRecoveryStatus>* page_status; | 
|---|
| 868 | }; | 
|---|
| 869 |  | 
|---|
| 870 | auto callback = [](IAsyncContext* ctxt, Status result, size_t bytes_transferred) { | 
|---|
| 871 | CallbackContext<Context> context{ ctxt }; | 
|---|
| 872 | if(result != Status::Ok) { | 
|---|
| 873 | fprintf(stderr, "Error: %u\n", static_cast<uint8_t>(result)); | 
|---|
| 874 | } | 
|---|
| 875 | assert(context->page_status->load() == PageRecoveryStatus::IssuedRead); | 
|---|
| 876 | context->page_status->store(PageRecoveryStatus::ReadDone); | 
|---|
| 877 | }; | 
|---|
| 878 |  | 
|---|
| 879 | for(uint32_t read_page = start_page; read_page < start_page + num_pages; ++read_page) { | 
|---|
| 880 | if(!Page(read_page)) { | 
|---|
| 881 | // Allocate a new page. | 
|---|
| 882 | AllocatePage(read_page); | 
|---|
| 883 | } else { | 
|---|
| 884 | // Clear an old used page. | 
|---|
| 885 | std::memset(Page(read_page), 0, kPageSize); | 
|---|
| 886 | } | 
|---|
| 887 | assert(recovery_status.page_status(read_page) == PageRecoveryStatus::NotStarted); | 
|---|
| 888 | recovery_status.page_status(read_page).store(PageRecoveryStatus::IssuedRead); | 
|---|
| 889 | PageStatus(read_page).LastFlushedUntilAddress.store(Address{ read_page + 1, 0 }); | 
|---|
| 890 | Context context{ recovery_status.page_status(read_page) }; | 
|---|
| 891 | RETURN_NOT_OK(read_file.ReadAsync(kPageSize * (read_page - file_start_page), Page(read_page), | 
|---|
| 892 | kPageSize, callback, context)); | 
|---|
| 893 | } | 
|---|
| 894 | return Status::Ok; | 
|---|
| 895 | } | 
|---|
| 896 |  | 
|---|
| 897 | template <class D> | 
|---|
| 898 | Status PersistentMemoryMalloc<D>::AsyncFlushPage(uint32_t page, RecoveryStatus& recovery_status, | 
|---|
| 899 | AsyncCallback caller_callback, IAsyncContext* caller_context) { | 
|---|
| 900 | class Context : public IAsyncContext { | 
|---|
| 901 | public: | 
|---|
| 902 | Context(std::atomic<PageRecoveryStatus>& page_status_, AsyncCallback caller_callback_, | 
|---|
| 903 | IAsyncContext* caller_context_) | 
|---|
| 904 | : page_status{ &page_status_ } | 
|---|
| 905 | , caller_callback{ caller_callback_ } | 
|---|
| 906 | , caller_context{ caller_context_ } { | 
|---|
| 907 | } | 
|---|
| 908 | /// The deep-copy constructor | 
|---|
| 909 | Context(const Context& other, IAsyncContext* caller_context_copy) | 
|---|
| 910 | : page_status{ other.page_status } | 
|---|
| 911 | , caller_callback{ other.caller_callback } | 
|---|
| 912 | , caller_context{ caller_context_copy } { | 
|---|
| 913 | } | 
|---|
| 914 | protected: | 
|---|
| 915 | Status DeepCopy_Internal(IAsyncContext*& context_copy) final { | 
|---|
| 916 | if(caller_callback) { | 
|---|
| 917 | return IAsyncContext::DeepCopy_Internal(*this, caller_context, context_copy); | 
|---|
| 918 | } else { | 
|---|
| 919 | return IAsyncContext::DeepCopy_Internal(*this, context_copy); | 
|---|
| 920 | } | 
|---|
| 921 | } | 
|---|
| 922 | public: | 
|---|
| 923 | std::atomic<PageRecoveryStatus>* page_status; | 
|---|
| 924 | AsyncCallback caller_callback; | 
|---|
| 925 | IAsyncContext* caller_context; | 
|---|
| 926 | }; | 
|---|
| 927 |  | 
|---|
| 928 | auto callback = [](IAsyncContext* ctxt, Status result, size_t bytes_transferred) { | 
|---|
| 929 | CallbackContext<Context> context{ ctxt }; | 
|---|
| 930 | if(result != Status::Ok) { | 
|---|
| 931 | fprintf(stderr, "Error: %u\n", static_cast<uint8_t>(result)); | 
|---|
| 932 | } | 
|---|
| 933 | assert(context->page_status->load() == PageRecoveryStatus::IssuedFlush); | 
|---|
| 934 | context->page_status->store(PageRecoveryStatus::FlushDone); | 
|---|
| 935 | if(context->caller_callback) { | 
|---|
| 936 | context->caller_callback(context->caller_context, result); | 
|---|
| 937 | } | 
|---|
| 938 | }; | 
|---|
| 939 |  | 
|---|
| 940 | assert(recovery_status.page_status(page) == PageRecoveryStatus::ReadDone); | 
|---|
| 941 | recovery_status.page_status(page).store(PageRecoveryStatus::IssuedFlush); | 
|---|
| 942 | PageStatus(page).LastFlushedUntilAddress.store(Address{ page + 1, 0 }); | 
|---|
| 943 | Context context{ recovery_status.page_status(page), caller_callback, caller_context }; | 
|---|
| 944 | return file->WriteAsync(Page(page), kPageSize * page, kPageSize, callback, context); | 
|---|
| 945 | } | 
|---|
| 946 |  | 
|---|
| 947 | template <class D> | 
|---|
| 948 | void PersistentMemoryMalloc<D>::RecoveryReset(Address begin_address_, Address head_address_, | 
|---|
| 949 | Address tail_address) { | 
|---|
| 950 | begin_address.store(begin_address_); | 
|---|
| 951 | tail_page_offset_.store(tail_address); | 
|---|
| 952 | // issue read request to all pages until head lag | 
|---|
| 953 | head_address.store(head_address_); | 
|---|
| 954 | safe_head_address.store(head_address_); | 
|---|
| 955 |  | 
|---|
| 956 | flushed_until_address.store(Address{ tail_address.page(), 0 }); | 
|---|
| 957 | read_only_address.store(tail_address); | 
|---|
| 958 | safe_read_only_address.store(tail_address); | 
|---|
| 959 |  | 
|---|
| 960 | uint32_t start_page = head_address_.page(); | 
|---|
| 961 | uint32_t end_page = tail_address.offset() == 0 ? tail_address.page() : tail_address.page() + 1; | 
|---|
| 962 | if(!Page(end_page)) { | 
|---|
| 963 | AllocatePage(end_page); | 
|---|
| 964 | } | 
|---|
| 965 | if(!Page(end_page + 1)) { | 
|---|
| 966 | AllocatePage(end_page + 1); | 
|---|
| 967 | } | 
|---|
| 968 |  | 
|---|
| 969 | for(uint32_t idx = 0; idx < buffer_size_; ++idx) { | 
|---|
| 970 | PageStatus(idx).status.store(FlushStatus::Flushed, CloseStatus::Open); | 
|---|
| 971 | } | 
|---|
| 972 | } | 
|---|
| 973 |  | 
|---|
| 974 | template <class D> | 
|---|
| 975 | inline void PersistentMemoryMalloc<D>::PageAlignedShiftHeadAddress(uint32_t tail_page) { | 
|---|
| 976 | //obtain local values of variables that can change | 
|---|
| 977 | Address current_head_address = head_address.load(); | 
|---|
| 978 | Address current_flushed_until_address = flushed_until_address.load(); | 
|---|
| 979 |  | 
|---|
| 980 | if(tail_page <= (buffer_size_ - kNumHeadPages)) { | 
|---|
| 981 | // Desired head address is <= 0. | 
|---|
| 982 | return; | 
|---|
| 983 | } | 
|---|
| 984 |  | 
|---|
| 985 | Address desired_head_address{ tail_page - (buffer_size_ - kNumHeadPages), 0 }; | 
|---|
| 986 |  | 
|---|
| 987 | if(current_flushed_until_address < desired_head_address) { | 
|---|
| 988 | desired_head_address = Address{ current_flushed_until_address.page(), 0 }; | 
|---|
| 989 | } | 
|---|
| 990 |  | 
|---|
| 991 | Address old_head_address; | 
|---|
| 992 | if(MonotonicUpdate(head_address, desired_head_address, old_head_address)) { | 
|---|
| 993 | OnPagesClosed_Context context{ this, desired_head_address, false }; | 
|---|
| 994 | IAsyncContext* context_copy; | 
|---|
| 995 | Status result = context.DeepCopy(context_copy); | 
|---|
| 996 | assert(result == Status::Ok); | 
|---|
| 997 | epoch_->BumpCurrentEpoch(OnPagesClosed, context_copy); | 
|---|
| 998 | } | 
|---|
| 999 | } | 
|---|
| 1000 |  | 
|---|
| 1001 | template <class D> | 
|---|
| 1002 | inline void PersistentMemoryMalloc<D>::PageAlignedShiftReadOnlyAddress(uint32_t tail_page) { | 
|---|
| 1003 | Address current_read_only_address = read_only_address.load(); | 
|---|
| 1004 | if(tail_page <= num_mutable_pages_) { | 
|---|
| 1005 | // Desired read-only address is <= 0. | 
|---|
| 1006 | return; | 
|---|
| 1007 | } | 
|---|
| 1008 |  | 
|---|
| 1009 | Address desired_read_only_address{ tail_page - num_mutable_pages_, 0 }; | 
|---|
| 1010 | Address old_read_only_address; | 
|---|
| 1011 | if(MonotonicUpdate(read_only_address, desired_read_only_address, old_read_only_address)) { | 
|---|
| 1012 | OnPagesMarkedReadOnly_Context context{ this, desired_read_only_address, false }; | 
|---|
| 1013 | IAsyncContext* context_copy; | 
|---|
| 1014 | Status result = context.DeepCopy(context_copy); | 
|---|
| 1015 | assert(result == Status::Ok); | 
|---|
| 1016 | epoch_->BumpCurrentEpoch(OnPagesMarkedReadOnly, context_copy); | 
|---|
| 1017 | } | 
|---|
| 1018 | } | 
|---|
| 1019 |  | 
|---|
| 1020 | } | 
|---|
| 1021 | } // namespace FASTER::core | 
|---|
| 1022 |  | 
|---|