/*
 * Copyright 2013-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <folly/system/MemoryMapping.h>

#include <algorithm>
#include <functional>
#include <utility>

#include <folly/Format.h>
#include <folly/portability/GFlags.h>
#include <folly/portability/SysMman.h>

#ifdef __linux__
#include <folly/experimental/io/HugePages.h>
#endif

#include <fcntl.h>
#include <sys/types.h>
#include <system_error>
static constexpr ssize_t kDefaultMlockChunkSize =
#ifndef _MSC_VER
    // Linux implementations of munmap/mlock/munlock take a kernel
    // semaphore and block other threads from doing other memory
    // operations. Split the operations into chunks.
    (1 << 20) // 1MB
#else // _MSC_VER
    // MSVC doesn't have this problem, and calling munmap many times
    // with the same address is a bad idea with the Windows implementation.
    (-1)
#endif // _MSC_VER
    ;

DEFINE_int64(
    mlock_chunk_size,
    kDefaultMlockChunkSize,
    "Maximum bytes to mlock/munlock/munmap at once "
    "(will be rounded up to PAGESIZE). Ignored if negative.");

#ifndef MAP_POPULATE
#define MAP_POPULATE 0
#endif

namespace folly {

MemoryMapping::MemoryMapping(MemoryMapping&& other) noexcept {
  swap(other);
}

MemoryMapping::MemoryMapping(
    File file,
    off_t offset,
    off_t length,
    Options options)
    : file_(std::move(file)), options_(std::move(options)) {
  CHECK(file_);
  init(offset, length);
}

MemoryMapping::MemoryMapping(
    const char* name,
    off_t offset,
    off_t length,
    Options options)
    : MemoryMapping(
          File(name, options.writable ? O_RDWR : O_RDONLY),
          offset,
          length,
          options) {}

MemoryMapping::MemoryMapping(
    int fd,
    off_t offset,
    off_t length,
    Options options)
    : MemoryMapping(File(fd), offset, length, options) {}

MemoryMapping::MemoryMapping(AnonymousType, off_t length, Options options)
    : options_(std::move(options)) {
  init(0, length);
}
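
// A minimal usage sketch of the constructors above (illustrative only, not
// compiled here); it assumes the kAnonymous tag and the fluent Options
// setters declared in MemoryMapping.h, and uses a hypothetical file path:
//
//   folly::MemoryMapping fileMap("/tmp/example.dat"); // read-only by default
//   folly::ByteRange contents = fileMap.range();
//
//   folly::MemoryMapping anonMap(
//       folly::MemoryMapping::kAnonymous,
//       1 << 20, // 1MB, mapped as a whole number of pages
//       folly::MemoryMapping::Options().setWritable(true));
//   folly::MutableByteRange scratch = anonMap.writableRange();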

namespace {

#ifdef __linux__
// If the file lives on a hugetlbfs file system, use that file system's huge
// page size and allow the mapping to extend past EOF without ftruncate().
void getDeviceOptions(dev_t device, off_t& pageSize, bool& autoExtend) {
  auto ps = getHugePageSizeForDevice(device);
  if (ps) {
    pageSize = ps->size;
    autoExtend = true;
  }
}
#else
inline void getDeviceOptions(dev_t, off_t&, bool&) {}
#endif

} // namespace

void MemoryMapping::init(off_t offset, off_t length) {
  const bool grow = options_.grow;
  const bool anon = !file_;
  CHECK(!(grow && anon));

  off_t& pageSize = options_.pageSize;

  struct stat st;

  // On Linux, hugetlbfs file systems don't require ftruncate() to grow the
  // file, and (on kernels before 2.6.24) don't even allow it. Also, the file
  // size is always a multiple of the page size.
  bool autoExtend = false;

  if (!anon) {
    // Stat the file
    CHECK_ERR(fstat(file_.fd(), &st));

    if (pageSize == 0) {
      getDeviceOptions(st.st_dev, pageSize, autoExtend);
    }
  } else {
    DCHECK(!file_);
    DCHECK_EQ(offset, 0);
    CHECK_EQ(pageSize, 0);
    CHECK_GE(length, 0);
  }

  if (pageSize == 0) {
    pageSize = off_t(sysconf(_SC_PAGESIZE));
  }

  CHECK_GT(pageSize, 0);
  CHECK_EQ(pageSize & (pageSize - 1), 0); // power of two
  CHECK_GE(offset, 0);

  // Round down the start of the mapped region
  off_t skipStart = offset % pageSize;
  offset -= skipStart;

  mapLength_ = length;
  if (mapLength_ != -1) {
    mapLength_ += skipStart;

    // Round up the end of the mapped region
    mapLength_ = (mapLength_ + pageSize - 1) / pageSize * pageSize;
  }
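
  // Worked example of the rounding above: with pageSize 4096, offset 5000,
  // and length 10000, skipStart is 904, offset is rounded down to 4096, and
  // mapLength_ becomes 10904 rounded up to 12288 (three pages). data_ is
  // later set 904 bytes past mapStart_, at the caller's requested offset.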

  off_t remaining = anon ? length : st.st_size - offset;

  if (mapLength_ == -1) {
    length = mapLength_ = remaining;
  } else {
    if (length > remaining) {
      if (grow) {
        if (!autoExtend) {
          PCHECK(0 == ftruncate(file_.fd(), offset + length))
              << "ftruncate() failed, couldn't grow file to "
              << offset + length;
          remaining = length;
        } else {
          // Extend mapping to multiple of page size, don't use ftruncate
          remaining = mapLength_;
        }
      } else {
        length = remaining;
      }
    }
    if (mapLength_ > remaining) {
      mapLength_ = remaining;
    }
  }

  if (length == 0) {
    mapLength_ = 0;
    mapStart_ = nullptr;
  } else {
    int flags = options_.shared ? MAP_SHARED : MAP_PRIVATE;
    if (anon) {
      flags |= MAP_ANONYMOUS;
    }
    if (options_.prefault) {
      flags |= MAP_POPULATE;
    }

    // The standard doesn't actually require PROT_NONE to be zero...
    int prot = PROT_NONE;
    if (options_.readable || options_.writable) {
      prot =
          ((options_.readable ? PROT_READ : 0) |
           (options_.writable ? PROT_WRITE : 0));
    }

    unsigned char* start = static_cast<unsigned char*>(mmap(
        options_.address, size_t(mapLength_), prot, flags, file_.fd(), offset));
    PCHECK(start != MAP_FAILED)
        << " offset=" << offset << " length=" << mapLength_;
    mapStart_ = start;
    data_.reset(start + skipStart, size_t(length));
  }
}

namespace {

off_t memOpChunkSize(off_t length, off_t pageSize) {
  off_t chunkSize = length;
  if (FLAGS_mlock_chunk_size <= 0) {
    return chunkSize;
  }

  chunkSize = off_t(FLAGS_mlock_chunk_size);
  off_t r = chunkSize % pageSize;
  if (r) {
    chunkSize += (pageSize - r);
  }
  return chunkSize;
}
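
// For example, with the default --mlock_chunk_size of 1MB and a 2MB huge
// page size, the chunk size is rounded up to one whole 2MB page; with a 4KB
// page size, 1MB is already a multiple of the page size and is used as-is.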

/**
 * Run @op in chunks over the buffer @mem of @bufSize length.
 *
 * Return:
 * - success: true + amountSucceeded == bufSize (op succeeded on whole buffer)
 * - failure: false + amountSucceeded == number of bytes on which op succeeded.
 */
bool memOpInChunks(
    std::function<int(void*, size_t)> op,
    void* mem,
    size_t bufSize,
    off_t pageSize,
    size_t& amountSucceeded) {
  // Linux's munmap/mlock/munlock take a kernel semaphore and block other
  // threads from doing other memory operations. If the buffer is large, the
  // semaphore can be held for seconds (for benchmarks see
  // http://kostja-osipov.livejournal.com/42963.html). Doing the operations in
  // chunks breaks the locking into intervals and lets other threads do memory
  // operations of their own.

  size_t chunkSize = size_t(memOpChunkSize(off_t(bufSize), pageSize));

  char* addr = static_cast<char*>(mem);
  amountSucceeded = 0;

  while (amountSucceeded < bufSize) {
    size_t size = std::min(chunkSize, bufSize - amountSucceeded);
    if (op(addr + amountSucceeded, size) != 0) {
      return false;
    }
    amountSucceeded += size;
  }

  return true;
}

} // namespace

bool MemoryMapping::mlock(LockMode lock) {
  size_t amountSucceeded = 0;
  locked_ = memOpInChunks(
      ::mlock,
      mapStart_,
      size_t(mapLength_),
      options_.pageSize,
      amountSucceeded);
  if (locked_) {
    return true;
  }

  auto msg =
      folly::format("mlock({}) failed at {}", mapLength_, amountSucceeded);
  if (lock == LockMode::TRY_LOCK && errno == EPERM) {
    PLOG(WARNING) << msg;
  } else if (lock == LockMode::TRY_LOCK && errno == ENOMEM) {
    VLOG(1) << msg;
  } else {
    PLOG(FATAL) << msg;
  }

  // Only part of the buffer was mlocked; unlock the part that succeeded.
  if (!memOpInChunks(
          ::munlock,
          mapStart_,
          amountSucceeded,
          options_.pageSize,
          amountSucceeded)) {
    PLOG(WARNING) << "munlock()";
  }

  return false;
}
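
// Illustrative sketch (not part of the implementation): pinning a mapping's
// pages so they can't be swapped out, tolerating failure under TRY_LOCK.
// The file path is hypothetical; LockMode comes from MemoryMapping.h.
//
//   folly::MemoryMapping map("/tmp/example.dat");
//   if (map.mlock(folly::MemoryMapping::LockMode::TRY_LOCK)) {
//     // Pages are resident and pinned until munlock() or destruction.
//   }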

void MemoryMapping::munlock(bool dontneed) {
  if (!locked_) {
    return;
  }

  size_t amountSucceeded = 0;
  if (!memOpInChunks(
          ::munlock,
          mapStart_,
          size_t(mapLength_),
          options_.pageSize,
          amountSucceeded)) {
    PLOG(WARNING) << "munlock()";
  }
  if (mapLength_ && dontneed &&
      ::madvise(mapStart_, size_t(mapLength_), MADV_DONTNEED)) {
    PLOG(WARNING) << "madvise()";
  }
  locked_ = false;
}

void MemoryMapping::hintLinearScan() {
  advise(MADV_SEQUENTIAL);
}

MemoryMapping::~MemoryMapping() {
  if (mapLength_) {
    size_t amountSucceeded = 0;
    if (!memOpInChunks(
            ::munmap,
            mapStart_,
            size_t(mapLength_),
            options_.pageSize,
            amountSucceeded)) {
      PLOG(FATAL) << folly::format(
          "munmap({}) failed at {}", mapLength_, amountSucceeded);
    }
  }
}

void MemoryMapping::advise(int advice) const {
  advise(advice, 0, size_t(mapLength_));
}

void MemoryMapping::advise(int advice, size_t offset, size_t length) const {
  CHECK_LE(offset + length, size_t(mapLength_))
      << " offset: " << offset << " length: " << length
      << " mapLength_: " << mapLength_;

  // Include the entire start page: round down to page boundary.
  const auto offMisalign = offset % options_.pageSize;
  offset -= offMisalign;
  length += offMisalign;

  // Round the end of the range down to a page boundary, unless it coincides
  // with the end of the mapping.
  if (offset + length != size_t(mapLength_)) {
    length -= length % options_.pageSize;
  }

  if (length == 0) {
    return;
  }

  char* mapStart = static_cast<char*>(mapStart_) + offset;
  PLOG_IF(WARNING, ::madvise(mapStart, length, advice)) << "madvise";
}

MemoryMapping& MemoryMapping::operator=(MemoryMapping other) {
  swap(other);
  return *this;
}

void MemoryMapping::swap(MemoryMapping& other) noexcept {
  using std::swap;
  swap(this->file_, other.file_);
  swap(this->mapStart_, other.mapStart_);
  swap(this->mapLength_, other.mapLength_);
  swap(this->options_, other.options_);
  swap(this->locked_, other.locked_);
  swap(this->data_, other.data_);
}

void swap(MemoryMapping& a, MemoryMapping& b) noexcept {
  a.swap(b);
}

void alignedForwardMemcpy(void* dst, const void* src, size_t size) {
  assert(reinterpret_cast<uintptr_t>(src) % alignof(unsigned long) == 0);
  assert(reinterpret_cast<uintptr_t>(dst) % alignof(unsigned long) == 0);

  auto srcl = static_cast<const unsigned long*>(src);
  auto dstl = static_cast<unsigned long*>(dst);

  // Copy whole words first, then the byte-sized tail.
  while (size >= sizeof(unsigned long)) {
    *dstl++ = *srcl++;
    size -= sizeof(unsigned long);
  }

  auto srcc = reinterpret_cast<const unsigned char*>(srcl);
  auto dstc = reinterpret_cast<unsigned char*>(dstl);

  while (size != 0) {
    *dstc++ = *srcc++;
    --size;
  }
}
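
// For example, copying 13 bytes with 8-byte words moves one whole word and
// then a 5-byte tail; both pointers must be word-aligned, as the asserts
// above require.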

void mmapFileCopy(const char* src, const char* dest, mode_t mode) {
  MemoryMapping srcMap(src);
  srcMap.hintLinearScan();

  MemoryMapping destMap(
      File(dest, O_RDWR | O_CREAT | O_TRUNC, mode),
      0,
      off_t(srcMap.range().size()),
      MemoryMapping::writable());

  alignedForwardMemcpy(
      destMap.writableRange().data(),
      srcMap.range().data(),
      srcMap.range().size());
}
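
// Illustrative usage (hypothetical paths): copy a file via two mappings,
// relying on MemoryMapping::writable() to create and grow the destination.
//
//   folly::mmapFileCopy("/tmp/in.dat", "/tmp/out.dat", 0644);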

} // namespace folly