1 | /* |
2 | * Copyright (c) 2015, 2019, Oracle and/or its affiliates. All rights reserved. |
3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
4 | * |
5 | * This code is free software; you can redistribute it and/or modify it |
6 | * under the terms of the GNU General Public License version 2 only, as |
7 | * published by the Free Software Foundation. |
8 | * |
9 | * This code is distributed in the hope that it will be useful, but WITHOUT |
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
12 | * version 2 for more details (a copy is included in the LICENSE file that |
13 | * accompanied this code). |
14 | * |
15 | * You should have received a copy of the GNU General Public License version |
16 | * 2 along with this work; if not, write to the Free Software Foundation, |
17 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
18 | * |
19 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
20 | * or visit www.oracle.com if you need additional information or have any |
21 | * questions. |
22 | */ |
23 | |
24 | #include "precompiled.hpp" |
25 | #include "gc/z/zArray.inline.hpp" |
26 | #include "gc/z/zBackingFile_linux_x86.hpp" |
27 | #include "gc/z/zBackingPath_linux_x86.hpp" |
28 | #include "gc/z/zErrno.hpp" |
29 | #include "gc/z/zGlobals.hpp" |
30 | #include "gc/z/zLargePages.inline.hpp" |
31 | #include "logging/log.hpp" |
32 | #include "runtime/init.hpp" |
33 | #include "runtime/os.hpp" |
34 | #include "utilities/align.hpp" |
35 | #include "utilities/debug.hpp" |
36 | |
37 | #include <fcntl.h> |
38 | #include <sys/mman.h> |
39 | #include <sys/stat.h> |
40 | #include <sys/statfs.h> |
41 | #include <sys/syscall.h> |
42 | #include <sys/types.h> |
43 | #include <unistd.h> |
44 | |
//
// Support for building on older Linux systems
//
// Every definition in this section is wrapped in #ifndef, so it only takes
// effect when the headers used for the build do not already provide it.
//

// System calls
// NOTE(review): these hard-coded syscall numbers look like the x86_64
// values, consistent with the linux_x86 headers included above - confirm
// before reusing them on another architecture.
#ifndef SYS_fallocate
#define SYS_fallocate 285
#endif
#ifndef SYS_memfd_create
#define SYS_memfd_create 319
#endif

// memfd_create(2) flags
#ifndef MFD_CLOEXEC
#define MFD_CLOEXEC 0x0001U
#endif
#ifndef MFD_HUGETLB
#define MFD_HUGETLB 0x0004U
#endif

// open(2) flags
#ifndef O_CLOEXEC
#define O_CLOEXEC 02000000
#endif
#ifndef O_TMPFILE
#define O_TMPFILE (020000000 | O_DIRECTORY)
#endif

// fallocate(2) flags
#ifndef FALLOC_FL_KEEP_SIZE
#define FALLOC_FL_KEEP_SIZE 0x01
#endif
#ifndef FALLOC_FL_PUNCH_HOLE
#define FALLOC_FL_PUNCH_HOLE 0x02
#endif

// Filesystem types, see statfs(2). These are compared against the f_type
// field returned by fstatfs() in is_tmpfs() and is_hugetlbfs().
#ifndef TMPFS_MAGIC
#define TMPFS_MAGIC 0x01021994
#endif
#ifndef HUGETLBFS_MAGIC
#define HUGETLBFS_MAGIC 0x958458f6
#endif

// Filesystem names, used when searching for a mount point and in
// user-facing log messages
#define ZFILESYSTEM_TMPFS "tmpfs"
#define ZFILESYSTEM_HUGETLBFS "hugetlbfs"

// Sysfs file for transparent huge page on tmpfs. Only its readability is
// checked (see tmpfs_supports_transparent_huge_pages()), not its contents.
#define ZFILENAME_SHMEM_ENABLED "/sys/kernel/mm/transparent_hugepage/shmem_enabled"

// Java heap filename. Used as the memfd name and as the base name of the
// file created by the open/unlink fallback.
#define ZFILENAME_HEAP "java_heap"
98 | |
// Preferred tmpfs mount points, ordered by priority.
// The list is NULL-terminated and handed to ZBackingPath by create_file_fd()
// when explicit large pages are not in use.
static const char* z_preferred_tmpfs_mountpoints[] = {
  "/dev/shm",
  "/run/shm",
  NULL
};

// Preferred hugetlbfs mount points, ordered by priority.
// The list is NULL-terminated and handed to ZBackingPath by create_file_fd()
// when explicit large pages are in use.
static const char* z_preferred_hugetlbfs_mountpoints[] = {
  "/dev/hugepages",
  "/hugepages",
  NULL
};
112 | |
// Remaining number of ENOSPC retries that commit_inner() may perform on
// hugetlbfs before VM initialization has completed. Decremented on each retry.
static int z_fallocate_hugetlbfs_attempts = 3;
// Cleared by fallocate_fill_hole() the first time the fallocate syscall
// reports ENOSYS or EOPNOTSUPP; afterwards only the compatibility path is used.
static bool z_fallocate_supported = true;
115 | |
// Issues the fallocate system call directly through syscall(2) and hands
// back its result. Callers in this file treat -1 as failure and read errno.
static int z_fallocate(int fd, int mode, size_t offset, size_t length) {
  const long result = syscall(SYS_fallocate, fd, mode, offset, length);
  return static_cast<int>(result);
}
119 | |
// Issues the memfd_create system call directly through syscall(2) and hands
// back its result. Callers in this file treat -1 as failure and read errno.
static int z_memfd_create(const char *name, unsigned int flags) {
  const long result = syscall(SYS_memfd_create, name, flags);
  return static_cast<int>(result);
}
123 | |
124 | ZBackingFile::ZBackingFile() : |
125 | _fd(-1), |
126 | _size(0), |
127 | _filesystem(0), |
128 | _block_size(0), |
129 | _available(0), |
130 | _initialized(false) { |
131 | |
132 | // Create backing file |
133 | _fd = create_fd(ZFILENAME_HEAP); |
134 | if (_fd == -1) { |
135 | return; |
136 | } |
137 | |
138 | // Get filesystem statistics |
139 | struct statfs buf; |
140 | if (fstatfs(_fd, &buf) == -1) { |
141 | ZErrno err; |
142 | log_error(gc)("Failed to determine filesystem type for backing file (%s)" , err.to_string()); |
143 | return; |
144 | } |
145 | |
146 | _filesystem = buf.f_type; |
147 | _block_size = buf.f_bsize; |
148 | _available = buf.f_bavail * _block_size; |
149 | |
150 | // Make sure we're on a supported filesystem |
151 | if (!is_tmpfs() && !is_hugetlbfs()) { |
152 | log_error(gc)("Backing file must be located on a %s or a %s filesystem" , |
153 | ZFILESYSTEM_TMPFS, ZFILESYSTEM_HUGETLBFS); |
154 | return; |
155 | } |
156 | |
157 | // Make sure the filesystem type matches requested large page type |
158 | if (ZLargePages::is_transparent() && !is_tmpfs()) { |
159 | log_error(gc)("-XX:+UseTransparentHugePages can only be enable when using a %s filesystem" , |
160 | ZFILESYSTEM_TMPFS); |
161 | return; |
162 | } |
163 | |
164 | if (ZLargePages::is_transparent() && !tmpfs_supports_transparent_huge_pages()) { |
165 | log_error(gc)("-XX:+UseTransparentHugePages on a %s filesystem not supported by kernel" , |
166 | ZFILESYSTEM_TMPFS); |
167 | return; |
168 | } |
169 | |
170 | if (ZLargePages::is_explicit() && !is_hugetlbfs()) { |
171 | log_error(gc)("-XX:+UseLargePages (without -XX:+UseTransparentHugePages) can only be enabled " |
172 | "when using a %s filesystem" , ZFILESYSTEM_HUGETLBFS); |
173 | return; |
174 | } |
175 | |
176 | if (!ZLargePages::is_explicit() && is_hugetlbfs()) { |
177 | log_error(gc)("-XX:+UseLargePages must be enabled when using a %s filesystem" , |
178 | ZFILESYSTEM_HUGETLBFS); |
179 | return; |
180 | } |
181 | |
182 | const size_t expected_block_size = is_tmpfs() ? os::vm_page_size() : os::large_page_size(); |
183 | if (expected_block_size != _block_size) { |
184 | log_error(gc)("%s filesystem has unexpected block size " SIZE_FORMAT " (expected " SIZE_FORMAT ")" , |
185 | is_tmpfs() ? ZFILESYSTEM_TMPFS : ZFILESYSTEM_HUGETLBFS, _block_size, expected_block_size); |
186 | return; |
187 | } |
188 | |
189 | // Successfully initialized |
190 | _initialized = true; |
191 | } |
192 | |
193 | int ZBackingFile::create_mem_fd(const char* name) const { |
194 | // Create file name |
195 | char filename[PATH_MAX]; |
196 | snprintf(filename, sizeof(filename), "%s%s" , name, ZLargePages::is_explicit() ? ".hugetlb" : "" ); |
197 | |
198 | // Create file |
199 | const int = ZLargePages::is_explicit() ? MFD_HUGETLB : 0; |
200 | const int fd = z_memfd_create(filename, MFD_CLOEXEC | extra_flags); |
201 | if (fd == -1) { |
202 | ZErrno err; |
203 | log_debug(gc, init)("Failed to create memfd file (%s)" , |
204 | ((ZLargePages::is_explicit() && err == EINVAL) ? "Hugepages not supported" : err.to_string())); |
205 | return -1; |
206 | } |
207 | |
208 | log_info(gc, init)("Heap backed by file: /memfd:%s" , filename); |
209 | |
210 | return fd; |
211 | } |
212 | |
213 | int ZBackingFile::create_file_fd(const char* name) const { |
214 | const char* const filesystem = ZLargePages::is_explicit() |
215 | ? ZFILESYSTEM_HUGETLBFS |
216 | : ZFILESYSTEM_TMPFS; |
217 | const char** const preferred_mountpoints = ZLargePages::is_explicit() |
218 | ? z_preferred_hugetlbfs_mountpoints |
219 | : z_preferred_tmpfs_mountpoints; |
220 | |
221 | // Find mountpoint |
222 | ZBackingPath path(filesystem, preferred_mountpoints); |
223 | if (path.get() == NULL) { |
224 | log_error(gc)("Use -XX:ZPath to specify the path to a %s filesystem" , filesystem); |
225 | return -1; |
226 | } |
227 | |
228 | // Try to create an anonymous file using the O_TMPFILE flag. Note that this |
229 | // flag requires kernel >= 3.11. If this fails we fall back to open/unlink. |
230 | const int fd_anon = os::open(path.get(), O_TMPFILE|O_EXCL|O_RDWR|O_CLOEXEC, S_IRUSR|S_IWUSR); |
231 | if (fd_anon == -1) { |
232 | ZErrno err; |
233 | log_debug(gc, init)("Failed to create anonymous file in %s (%s)" , path.get(), |
234 | (err == EINVAL ? "Not supported" : err.to_string())); |
235 | } else { |
236 | // Get inode number for anonymous file |
237 | struct stat stat_buf; |
238 | if (fstat(fd_anon, &stat_buf) == -1) { |
239 | ZErrno err; |
240 | log_error(gc)("Failed to determine inode number for anonymous file (%s)" , err.to_string()); |
241 | return -1; |
242 | } |
243 | |
244 | log_info(gc, init)("Heap backed by file: %s/#" UINT64_FORMAT, path.get(), (uint64_t)stat_buf.st_ino); |
245 | |
246 | return fd_anon; |
247 | } |
248 | |
249 | log_debug(gc, init)("Falling back to open/unlink" ); |
250 | |
251 | // Create file name |
252 | char filename[PATH_MAX]; |
253 | snprintf(filename, sizeof(filename), "%s/%s.%d" , path.get(), name, os::current_process_id()); |
254 | |
255 | // Create file |
256 | const int fd = os::open(filename, O_CREAT|O_EXCL|O_RDWR|O_CLOEXEC, S_IRUSR|S_IWUSR); |
257 | if (fd == -1) { |
258 | ZErrno err; |
259 | log_error(gc)("Failed to create file %s (%s)" , filename, err.to_string()); |
260 | return -1; |
261 | } |
262 | |
263 | // Unlink file |
264 | if (unlink(filename) == -1) { |
265 | ZErrno err; |
266 | log_error(gc)("Failed to unlink file %s (%s)" , filename, err.to_string()); |
267 | return -1; |
268 | } |
269 | |
270 | log_info(gc, init)("Heap backed by file: %s" , filename); |
271 | |
272 | return fd; |
273 | } |
274 | |
275 | int ZBackingFile::create_fd(const char* name) const { |
276 | if (ZPath == NULL) { |
277 | // If the path is not explicitly specified, then we first try to create a memfd file |
278 | // instead of looking for a tmpfd/hugetlbfs mount point. Note that memfd_create() might |
279 | // not be supported at all (requires kernel >= 3.17), or it might not support large |
280 | // pages (requires kernel >= 4.14). If memfd_create() fails, then we try to create a |
281 | // file on an accessible tmpfs or hugetlbfs mount point. |
282 | const int fd = create_mem_fd(name); |
283 | if (fd != -1) { |
284 | return fd; |
285 | } |
286 | |
287 | log_debug(gc, init)("Falling back to searching for an accessible mount point" ); |
288 | } |
289 | |
290 | return create_file_fd(name); |
291 | } |
292 | |
// Returns true only if the constructor passed all of its checks.
bool ZBackingFile::is_initialized() const {
  return _initialized;
}
296 | |
// Returns the backing file descriptor, or -1 if the file could not be created.
int ZBackingFile::fd() const {
  return _fd;
}
300 | |
// Returns the recorded file size in bytes, i.e. the highest end offset that
// has been successfully allocated through the fill-hole paths so far.
size_t ZBackingFile::size() const {
  return _size;
}
304 | |
// Returns the number of bytes that were available on the filesystem when the
// constructor queried it with fstatfs(). The value is not refreshed later.
size_t ZBackingFile::available() const {
  return _available;
}
308 | |
// Returns true if the filesystem type recorded by the constructor is tmpfs.
bool ZBackingFile::is_tmpfs() const {
  return _filesystem == TMPFS_MAGIC;
}
312 | |
// Returns true if the filesystem type recorded by the constructor is hugetlbfs.
bool ZBackingFile::is_hugetlbfs() const {
  return _filesystem == HUGETLBFS_MAGIC;
}
316 | |
317 | bool ZBackingFile::tmpfs_supports_transparent_huge_pages() const { |
318 | // If the shmem_enabled file exists and is readable then we |
319 | // know the kernel supports transparent huge pages for tmpfs. |
320 | return access(ZFILENAME_SHMEM_ENABLED, R_OK) == 0; |
321 | } |
322 | |
323 | ZErrno ZBackingFile::fallocate_compat_ftruncate(size_t size) const { |
324 | while (ftruncate(_fd, size) == -1) { |
325 | if (errno != EINTR) { |
326 | // Failed |
327 | return errno; |
328 | } |
329 | } |
330 | |
331 | // Success |
332 | return 0; |
333 | } |
334 | |
335 | ZErrno ZBackingFile::fallocate_compat_mmap(size_t offset, size_t length, bool touch) const { |
336 | // On hugetlbfs, mapping a file segment will fail immediately, without |
337 | // the need to touch the mapped pages first, if there aren't enough huge |
338 | // pages available to back the mapping. |
339 | void* const addr = mmap(0, length, PROT_READ|PROT_WRITE, MAP_SHARED, _fd, offset); |
340 | if (addr == MAP_FAILED) { |
341 | // Failed |
342 | return errno; |
343 | } |
344 | |
345 | // Once mapped, the huge pages are only reserved. We need to touch them |
346 | // to associate them with the file segment. Note that we can not punch |
347 | // hole in file segments which only have reserved pages. |
348 | if (touch) { |
349 | char* const start = (char*)addr; |
350 | char* const end = start + length; |
351 | os::pretouch_memory(start, end, _block_size); |
352 | } |
353 | |
354 | // Unmap again. From now on, the huge pages that were mapped are allocated |
355 | // to this file. There's no risk in getting SIGBUS when touching them. |
356 | if (munmap(addr, length) == -1) { |
357 | // Failed |
358 | return errno; |
359 | } |
360 | |
361 | // Success |
362 | return 0; |
363 | } |
364 | |
365 | ZErrno ZBackingFile::fallocate_compat_pwrite(size_t offset, size_t length) const { |
366 | uint8_t data = 0; |
367 | |
368 | // Allocate backing memory by writing to each block |
369 | for (size_t pos = offset; pos < offset + length; pos += _block_size) { |
370 | if (pwrite(_fd, &data, sizeof(data), pos) == -1) { |
371 | // Failed |
372 | return errno; |
373 | } |
374 | } |
375 | |
376 | // Success |
377 | return 0; |
378 | } |
379 | |
380 | ZErrno ZBackingFile::fallocate_fill_hole_compat(size_t offset, size_t length) { |
381 | // fallocate(2) is only supported by tmpfs since Linux 3.5, and by hugetlbfs |
382 | // since Linux 4.3. When fallocate(2) is not supported we emulate it using |
383 | // ftruncate/pwrite (for tmpfs) or ftruncate/mmap/munmap (for hugetlbfs). |
384 | |
385 | const size_t end = offset + length; |
386 | if (end > _size) { |
387 | // Increase file size |
388 | const ZErrno err = fallocate_compat_ftruncate(end); |
389 | if (err) { |
390 | // Failed |
391 | return err; |
392 | } |
393 | } |
394 | |
395 | // Allocate backing memory |
396 | const ZErrno err = is_hugetlbfs() ? fallocate_compat_mmap(offset, length, false /* touch */) |
397 | : fallocate_compat_pwrite(offset, length); |
398 | if (err) { |
399 | if (end > _size) { |
400 | // Restore file size |
401 | fallocate_compat_ftruncate(_size); |
402 | } |
403 | |
404 | // Failed |
405 | return err; |
406 | } |
407 | |
408 | if (end > _size) { |
409 | // Record new file size |
410 | _size = end; |
411 | } |
412 | |
413 | // Success |
414 | return 0; |
415 | } |
416 | |
417 | ZErrno ZBackingFile::fallocate_fill_hole_syscall(size_t offset, size_t length) { |
418 | const int mode = 0; // Allocate |
419 | const int res = z_fallocate(_fd, mode, offset, length); |
420 | if (res == -1) { |
421 | // Failed |
422 | return errno; |
423 | } |
424 | |
425 | const size_t end = offset + length; |
426 | if (end > _size) { |
427 | // Record new file size |
428 | _size = end; |
429 | } |
430 | |
431 | // Success |
432 | return 0; |
433 | } |
434 | |
435 | ZErrno ZBackingFile::fallocate_fill_hole(size_t offset, size_t length) { |
436 | // Using compat mode is more efficient when allocating space on hugetlbfs. |
437 | // Note that allocating huge pages this way will only reserve them, and not |
438 | // associate them with segments of the file. We must guarantee that we at |
439 | // some point touch these segments, otherwise we can not punch hole in them. |
440 | if (z_fallocate_supported && !is_hugetlbfs()) { |
441 | const ZErrno err = fallocate_fill_hole_syscall(offset, length); |
442 | if (!err) { |
443 | // Success |
444 | return 0; |
445 | } |
446 | |
447 | if (err != ENOSYS && err != EOPNOTSUPP) { |
448 | // Failed |
449 | return err; |
450 | } |
451 | |
452 | // Not supported |
453 | log_debug(gc)("Falling back to fallocate() compatibility mode" ); |
454 | z_fallocate_supported = false; |
455 | } |
456 | |
457 | return fallocate_fill_hole_compat(offset, length); |
458 | } |
459 | |
460 | ZErrno ZBackingFile::fallocate_punch_hole(size_t offset, size_t length) { |
461 | if (is_hugetlbfs()) { |
462 | // We can only punch hole in pages that have been touched. Non-touched |
463 | // pages are only reserved, and not associated with any specific file |
464 | // segment. We don't know which pages have been previously touched, so |
465 | // we always touch them here to guarantee that we can punch hole. |
466 | const ZErrno err = fallocate_compat_mmap(offset, length, true /* touch */); |
467 | if (err) { |
468 | // Failed |
469 | return err; |
470 | } |
471 | } |
472 | |
473 | const int mode = FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE; |
474 | if (z_fallocate(_fd, mode, offset, length) == -1) { |
475 | // Failed |
476 | return errno; |
477 | } |
478 | |
479 | // Success |
480 | return 0; |
481 | } |
482 | |
483 | ZErrno ZBackingFile::split_and_fallocate(bool punch_hole, size_t offset, size_t length) { |
484 | // Try first half |
485 | const size_t offset0 = offset; |
486 | const size_t length0 = align_up(length / 2, _block_size); |
487 | const ZErrno err0 = fallocate(punch_hole, offset0, length0); |
488 | if (err0) { |
489 | return err0; |
490 | } |
491 | |
492 | // Try second half |
493 | const size_t offset1 = offset0 + length0; |
494 | const size_t length1 = length - length0; |
495 | const ZErrno err1 = fallocate(punch_hole, offset1, length1); |
496 | if (err1) { |
497 | return err1; |
498 | } |
499 | |
500 | // Success |
501 | return 0; |
502 | } |
503 | |
504 | ZErrno ZBackingFile::fallocate(bool punch_hole, size_t offset, size_t length) { |
505 | assert(is_aligned(offset, _block_size), "Invalid offset" ); |
506 | assert(is_aligned(length, _block_size), "Invalid length" ); |
507 | |
508 | const ZErrno err = punch_hole ? fallocate_punch_hole(offset, length) : fallocate_fill_hole(offset, length); |
509 | if (err == EINTR && length > _block_size) { |
510 | // Calling fallocate(2) with a large length can take a long time to |
511 | // complete. When running profilers, such as VTune, this syscall will |
512 | // be constantly interrupted by signals. Expanding the file in smaller |
513 | // steps avoids this problem. |
514 | return split_and_fallocate(punch_hole, offset, length); |
515 | } |
516 | |
517 | return err; |
518 | } |
519 | |
520 | bool ZBackingFile::commit_inner(size_t offset, size_t length) { |
521 | log_trace(gc, heap)("Committing memory: " SIZE_FORMAT "M-" SIZE_FORMAT "M (" SIZE_FORMAT "M)" , |
522 | offset / M, (offset + length) / M, length / M); |
523 | |
524 | retry: |
525 | const ZErrno err = fallocate(false /* punch_hole */, offset, length); |
526 | if (err) { |
527 | if (err == ENOSPC && !is_init_completed() && is_hugetlbfs() && z_fallocate_hugetlbfs_attempts-- > 0) { |
528 | // If we fail to allocate during initialization, due to lack of space on |
529 | // the hugetlbfs filesystem, then we wait and retry a few times before |
530 | // giving up. Otherwise there is a risk that running JVMs back-to-back |
531 | // will fail, since there is a delay between process termination and the |
532 | // huge pages owned by that process being returned to the huge page pool |
533 | // and made available for new allocations. |
534 | log_debug(gc, init)("Failed to commit memory (%s), retrying" , err.to_string()); |
535 | |
536 | // Wait and retry in one second, in the hope that huge pages will be |
537 | // available by then. |
538 | sleep(1); |
539 | goto retry; |
540 | } |
541 | |
542 | // Failed |
543 | log_error(gc)("Failed to commit memory (%s)" , err.to_string()); |
544 | return false; |
545 | } |
546 | |
547 | // Success |
548 | return true; |
549 | } |
550 | |
551 | size_t ZBackingFile::commit(size_t offset, size_t length) { |
552 | // Try to commit the whole region |
553 | if (commit_inner(offset, length)) { |
554 | // Success |
555 | return length; |
556 | } |
557 | |
558 | // Failed, try to commit as much as possible |
559 | size_t start = offset; |
560 | size_t end = offset + length; |
561 | |
562 | for (;;) { |
563 | length = align_down((end - start) / 2, ZGranuleSize); |
564 | if (length < ZGranuleSize) { |
565 | // Done, don't commit more |
566 | return start - offset; |
567 | } |
568 | |
569 | if (commit_inner(start, length)) { |
570 | // Success, try commit more |
571 | start += length; |
572 | } else { |
573 | // Failed, try commit less |
574 | end -= length; |
575 | } |
576 | } |
577 | } |
578 | |
579 | size_t ZBackingFile::uncommit(size_t offset, size_t length) { |
580 | log_trace(gc, heap)("Uncommitting memory: " SIZE_FORMAT "M-" SIZE_FORMAT "M (" SIZE_FORMAT "M)" , |
581 | offset / M, (offset + length) / M, length / M); |
582 | |
583 | const ZErrno err = fallocate(true /* punch_hole */, offset, length); |
584 | if (err) { |
585 | log_error(gc)("Failed to uncommit memory (%s)" , err.to_string()); |
586 | return 0; |
587 | } |
588 | |
589 | return length; |
590 | } |
591 | |