1 | /* |
2 | * Copyright 2013-present Facebook, Inc. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | #include <folly/system/MemoryMapping.h> |
18 | |
19 | #include <algorithm> |
20 | #include <functional> |
21 | #include <utility> |
22 | |
23 | #include <folly/Format.h> |
24 | #include <folly/portability/GFlags.h> |
25 | #include <folly/portability/SysMman.h> |
26 | |
27 | #ifdef __linux__ |
28 | #include <folly/experimental/io/HugePages.h> |
29 | #endif |
30 | |
31 | #include <fcntl.h> |
32 | #include <sys/types.h> |
33 | #include <system_error> |
34 | |
// Default value of the mlock_chunk_size flag: the largest number of bytes a
// single mlock/munlock/munmap call is asked to operate on.
#ifdef _MSC_VER
// MSVC doesn't have the problem described in the other branch, and calling
// munmap many times with the same address is a bad idea with the windows
// implementation, so the default is negative (chunking is not applied).
static constexpr ssize_t kDefaultMlockChunkSize = -1;
#else // !_MSC_VER
// Linux implementations of unmap/mlock/munlock take a kernel semaphore and
// block other threads from doing other memory operations. Split the
// operations in chunks.
static constexpr ssize_t kDefaultMlockChunkSize = 1 << 20; // 1MB
#endif // _MSC_VER
47 | |
48 | DEFINE_int64( |
49 | mlock_chunk_size, |
50 | kDefaultMlockChunkSize, |
51 | "Maximum bytes to mlock/munlock/munmap at once " |
52 | "(will be rounded up to PAGESIZE). Ignored if negative." ); |
53 | |
// Fallback for platforms whose headers do not provide MAP_POPULATE: defining
// it as 0 turns `flags |= MAP_POPULATE` into a no-op there.
#if !defined(MAP_POPULATE)
#define MAP_POPULATE 0
#endif
57 | |
58 | namespace folly { |
59 | |
60 | MemoryMapping::MemoryMapping(MemoryMapping&& other) noexcept { |
61 | swap(other); |
62 | } |
63 | |
64 | MemoryMapping::MemoryMapping( |
65 | File file, |
66 | off_t offset, |
67 | off_t length, |
68 | Options options) |
69 | : file_(std::move(file)), options_(std::move(options)) { |
70 | CHECK(file_); |
71 | init(offset, length); |
72 | } |
73 | |
74 | MemoryMapping::MemoryMapping( |
75 | const char* name, |
76 | off_t offset, |
77 | off_t length, |
78 | Options options) |
79 | : MemoryMapping( |
80 | File(name, options.writable ? O_RDWR : O_RDONLY), |
81 | offset, |
82 | length, |
83 | options) {} |
84 | |
85 | MemoryMapping::MemoryMapping( |
86 | int fd, |
87 | off_t offset, |
88 | off_t length, |
89 | Options options) |
90 | : MemoryMapping(File(fd), offset, length, options) {} |
91 | |
92 | MemoryMapping::MemoryMapping(AnonymousType, off_t length, Options options) |
93 | : options_(std::move(options)) { |
94 | init(0, length); |
95 | } |
96 | |
97 | namespace { |
98 | |
99 | #ifdef __linux__ |
100 | void getDeviceOptions(dev_t device, off_t& pageSize, bool& autoExtend) { |
101 | auto ps = getHugePageSizeForDevice(device); |
102 | if (ps) { |
103 | pageSize = ps->size; |
104 | autoExtend = true; |
105 | } |
106 | } |
107 | #else |
108 | inline void getDeviceOptions(dev_t, off_t&, bool&) {} |
109 | #endif |
110 | |
111 | } // namespace |
112 | |
113 | void MemoryMapping::init(off_t offset, off_t length) { |
114 | const bool grow = options_.grow; |
115 | const bool anon = !file_; |
116 | CHECK(!(grow && anon)); |
117 | |
118 | off_t& pageSize = options_.pageSize; |
119 | |
120 | struct stat st; |
121 | |
122 | // On Linux, hugetlbfs file systems don't require ftruncate() to grow the |
123 | // file, and (on kernels before 2.6.24) don't even allow it. Also, the file |
124 | // size is always a multiple of the page size. |
125 | bool autoExtend = false; |
126 | |
127 | if (!anon) { |
128 | // Stat the file |
129 | CHECK_ERR(fstat(file_.fd(), &st)); |
130 | |
131 | if (pageSize == 0) { |
132 | getDeviceOptions(st.st_dev, pageSize, autoExtend); |
133 | } |
134 | } else { |
135 | DCHECK(!file_); |
136 | DCHECK_EQ(offset, 0); |
137 | CHECK_EQ(pageSize, 0); |
138 | CHECK_GE(length, 0); |
139 | } |
140 | |
141 | if (pageSize == 0) { |
142 | pageSize = off_t(sysconf(_SC_PAGESIZE)); |
143 | } |
144 | |
145 | CHECK_GT(pageSize, 0); |
146 | CHECK_EQ(pageSize & (pageSize - 1), 0); // power of two |
147 | CHECK_GE(offset, 0); |
148 | |
149 | // Round down the start of the mapped region |
150 | off_t skipStart = offset % pageSize; |
151 | offset -= skipStart; |
152 | |
153 | mapLength_ = length; |
154 | if (mapLength_ != -1) { |
155 | mapLength_ += skipStart; |
156 | |
157 | // Round up the end of the mapped region |
158 | mapLength_ = (mapLength_ + pageSize - 1) / pageSize * pageSize; |
159 | } |
160 | |
161 | off_t remaining = anon ? length : st.st_size - offset; |
162 | |
163 | if (mapLength_ == -1) { |
164 | length = mapLength_ = remaining; |
165 | } else { |
166 | if (length > remaining) { |
167 | if (grow) { |
168 | if (!autoExtend) { |
169 | PCHECK(0 == ftruncate(file_.fd(), offset + length)) |
170 | << "ftruncate() failed, couldn't grow file to " |
171 | << offset + length; |
172 | remaining = length; |
173 | } else { |
174 | // Extend mapping to multiple of page size, don't use ftruncate |
175 | remaining = mapLength_; |
176 | } |
177 | } else { |
178 | length = remaining; |
179 | } |
180 | } |
181 | if (mapLength_ > remaining) { |
182 | mapLength_ = remaining; |
183 | } |
184 | } |
185 | |
186 | if (length == 0) { |
187 | mapLength_ = 0; |
188 | mapStart_ = nullptr; |
189 | } else { |
190 | int flags = options_.shared ? MAP_SHARED : MAP_PRIVATE; |
191 | if (anon) { |
192 | flags |= MAP_ANONYMOUS; |
193 | } |
194 | if (options_.prefault) { |
195 | flags |= MAP_POPULATE; |
196 | } |
197 | |
198 | // The standard doesn't actually require PROT_NONE to be zero... |
199 | int prot = PROT_NONE; |
200 | if (options_.readable || options_.writable) { |
201 | prot = |
202 | ((options_.readable ? PROT_READ : 0) | |
203 | (options_.writable ? PROT_WRITE : 0)); |
204 | } |
205 | |
206 | unsigned char* start = static_cast<unsigned char*>(mmap( |
207 | options_.address, size_t(mapLength_), prot, flags, file_.fd(), offset)); |
208 | PCHECK(start != MAP_FAILED) |
209 | << " offset=" << offset << " length=" << mapLength_; |
210 | mapStart_ = start; |
211 | data_.reset(start + skipStart, size_t(length)); |
212 | } |
213 | } |
214 | |
215 | namespace { |
216 | |
217 | off_t memOpChunkSize(off_t length, off_t pageSize) { |
218 | off_t chunkSize = length; |
219 | if (FLAGS_mlock_chunk_size <= 0) { |
220 | return chunkSize; |
221 | } |
222 | |
223 | chunkSize = off_t(FLAGS_mlock_chunk_size); |
224 | off_t r = chunkSize % pageSize; |
225 | if (r) { |
226 | chunkSize += (pageSize - r); |
227 | } |
228 | return chunkSize; |
229 | } |
230 | |
231 | /** |
232 | * Run @op in chunks over the buffer @mem of @bufSize length. |
233 | * |
234 | * Return: |
235 | * - success: true + amountSucceeded == bufSize (op success on whole buffer) |
236 | * - failure: false + amountSucceeded == nr bytes on which op succeeded. |
237 | */ |
238 | bool memOpInChunks( |
239 | std::function<int(void*, size_t)> op, |
240 | void* mem, |
241 | size_t bufSize, |
242 | off_t pageSize, |
243 | size_t& amountSucceeded) { |
244 | // Linux' unmap/mlock/munlock take a kernel semaphore and block other threads |
245 | // from doing other memory operations. If the size of the buffer is big the |
246 | // semaphore can be down for seconds (for benchmarks see |
247 | // http://kostja-osipov.livejournal.com/42963.html). Doing the operations in |
248 | // chunks breaks the locking into intervals and lets other threads do memory |
249 | // operations of their own. |
250 | |
251 | size_t chunkSize = size_t(memOpChunkSize(off_t(bufSize), pageSize)); |
252 | |
253 | char* addr = static_cast<char*>(mem); |
254 | amountSucceeded = 0; |
255 | |
256 | while (amountSucceeded < bufSize) { |
257 | size_t size = std::min(chunkSize, bufSize - amountSucceeded); |
258 | if (op(addr + amountSucceeded, size) != 0) { |
259 | return false; |
260 | } |
261 | amountSucceeded += size; |
262 | } |
263 | |
264 | return true; |
265 | } |
266 | |
267 | } // namespace |
268 | |
269 | bool MemoryMapping::mlock(LockMode lock) { |
270 | size_t amountSucceeded = 0; |
271 | locked_ = memOpInChunks( |
272 | ::mlock, |
273 | mapStart_, |
274 | size_t(mapLength_), |
275 | options_.pageSize, |
276 | amountSucceeded); |
277 | if (locked_) { |
278 | return true; |
279 | } |
280 | |
281 | auto msg = |
282 | folly::format("mlock({}) failed at {}" , mapLength_, amountSucceeded); |
283 | if (lock == LockMode::TRY_LOCK && errno == EPERM) { |
284 | PLOG(WARNING) << msg; |
285 | } else if (lock == LockMode::TRY_LOCK && errno == ENOMEM) { |
286 | VLOG(1) << msg; |
287 | } else { |
288 | PLOG(FATAL) << msg; |
289 | } |
290 | |
291 | // only part of the buffer was mlocked, unlock it back |
292 | if (!memOpInChunks( |
293 | ::munlock, |
294 | mapStart_, |
295 | amountSucceeded, |
296 | options_.pageSize, |
297 | amountSucceeded)) { |
298 | PLOG(WARNING) << "munlock()" ; |
299 | } |
300 | |
301 | return false; |
302 | } |
303 | |
304 | void MemoryMapping::munlock(bool dontneed) { |
305 | if (!locked_) { |
306 | return; |
307 | } |
308 | |
309 | size_t amountSucceeded = 0; |
310 | if (!memOpInChunks( |
311 | ::munlock, |
312 | mapStart_, |
313 | size_t(mapLength_), |
314 | options_.pageSize, |
315 | amountSucceeded)) { |
316 | PLOG(WARNING) << "munlock()" ; |
317 | } |
318 | if (mapLength_ && dontneed && |
319 | ::madvise(mapStart_, size_t(mapLength_), MADV_DONTNEED)) { |
320 | PLOG(WARNING) << "madvise()" ; |
321 | } |
322 | locked_ = false; |
323 | } |
324 | |
325 | void MemoryMapping::hintLinearScan() { |
326 | advise(MADV_SEQUENTIAL); |
327 | } |
328 | |
329 | MemoryMapping::~MemoryMapping() { |
330 | if (mapLength_) { |
331 | size_t amountSucceeded = 0; |
332 | if (!memOpInChunks( |
333 | ::munmap, |
334 | mapStart_, |
335 | size_t(mapLength_), |
336 | options_.pageSize, |
337 | amountSucceeded)) { |
338 | PLOG(FATAL) << folly::format( |
339 | "munmap({}) failed at {}" , mapLength_, amountSucceeded); |
340 | } |
341 | } |
342 | } |
343 | |
344 | void MemoryMapping::advise(int advice) const { |
345 | advise(advice, 0, size_t(mapLength_)); |
346 | } |
347 | |
348 | void MemoryMapping::advise(int advice, size_t offset, size_t length) const { |
349 | CHECK_LE(offset + length, size_t(mapLength_)) |
350 | << " offset: " << offset << " length: " << length |
351 | << " mapLength_: " << mapLength_; |
352 | |
353 | // Include the entire start page: round down to page boundary. |
354 | const auto offMisalign = offset % options_.pageSize; |
355 | offset -= offMisalign; |
356 | length += offMisalign; |
357 | |
358 | // Round the last page down to page boundary. |
359 | if (offset + length != size_t(mapLength_)) { |
360 | length -= length % options_.pageSize; |
361 | } |
362 | |
363 | if (length == 0) { |
364 | return; |
365 | } |
366 | |
367 | char* mapStart = static_cast<char*>(mapStart_) + offset; |
368 | PLOG_IF(WARNING, ::madvise(mapStart, length, advice)) << "madvise" ; |
369 | } |
370 | |
371 | MemoryMapping& MemoryMapping::operator=(MemoryMapping other) { |
372 | swap(other); |
373 | return *this; |
374 | } |
375 | |
376 | void MemoryMapping::swap(MemoryMapping& other) noexcept { |
377 | using std::swap; |
378 | swap(this->file_, other.file_); |
379 | swap(this->mapStart_, other.mapStart_); |
380 | swap(this->mapLength_, other.mapLength_); |
381 | swap(this->options_, other.options_); |
382 | swap(this->locked_, other.locked_); |
383 | swap(this->data_, other.data_); |
384 | } |
385 | |
386 | void swap(MemoryMapping& a, MemoryMapping& b) noexcept { |
387 | a.swap(b); |
388 | } |
389 | |
/**
 * Copies `size` bytes from `src` to `dst` strictly front to back: first in
 * `unsigned long` sized words, then the remaining tail byte by byte.
 *
 * Both pointers must be aligned for `unsigned long` (checked with assert).
 */
void alignedForwardMemcpy(void* dst, const void* src, size_t size) {
  using Word = unsigned long;

  assert(reinterpret_cast<uintptr_t>(src) % alignof(Word) == 0);
  assert(reinterpret_cast<uintptr_t>(dst) % alignof(Word) == 0);

  // Whole words first.
  const Word* wordIn = static_cast<const Word*>(src);
  Word* wordOut = static_cast<Word*>(dst);
  for (; size >= sizeof(Word); size -= sizeof(Word)) {
    *wordOut++ = *wordIn++;
  }

  // Then whatever is left over, one byte at a time.
  const auto* byteIn = reinterpret_cast<const unsigned char*>(wordIn);
  auto* byteOut = reinterpret_cast<unsigned char*>(wordOut);
  for (; size != 0; --size) {
    *byteOut++ = *byteIn++;
  }
}
410 | |
411 | void mmapFileCopy(const char* src, const char* dest, mode_t mode) { |
412 | MemoryMapping srcMap(src); |
413 | srcMap.hintLinearScan(); |
414 | |
415 | MemoryMapping destMap( |
416 | File(dest, O_RDWR | O_CREAT | O_TRUNC, mode), |
417 | 0, |
418 | off_t(srcMap.range().size()), |
419 | MemoryMapping::writable()); |
420 | |
421 | alignedForwardMemcpy( |
422 | destMap.writableRange().data(), |
423 | srcMap.range().data(), |
424 | srcMap.range().size()); |
425 | } |
426 | |
427 | } // namespace folly |
428 | |