1/*
2 * Copyright (c) 2015, 2019, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 */
23
24#include "precompiled.hpp"
25#include "gc/z/zArray.inline.hpp"
26#include "gc/z/zBackingFile_linux_x86.hpp"
27#include "gc/z/zBackingPath_linux_x86.hpp"
28#include "gc/z/zErrno.hpp"
29#include "gc/z/zGlobals.hpp"
30#include "gc/z/zLargePages.inline.hpp"
31#include "logging/log.hpp"
32#include "runtime/init.hpp"
33#include "runtime/os.hpp"
34#include "utilities/align.hpp"
35#include "utilities/debug.hpp"
36
37#include <fcntl.h>
38#include <sys/mman.h>
39#include <sys/stat.h>
40#include <sys/statfs.h>
41#include <sys/syscall.h>
42#include <sys/types.h>
43#include <unistd.h>
44
//
// Fallback definitions so that this file also builds against older Linux
// system headers. Each one is guarded, so a definition provided by the
// system headers always takes precedence.
//

// System call numbers
#if !defined(SYS_fallocate)
#define SYS_fallocate            285
#endif
#if !defined(SYS_memfd_create)
#define SYS_memfd_create         319
#endif

// Flags accepted by memfd_create(2)
#if !defined(MFD_CLOEXEC)
#define MFD_CLOEXEC              0x0001U
#endif
#if !defined(MFD_HUGETLB)
#define MFD_HUGETLB              0x0004U
#endif

// Flags accepted by open(2)
#if !defined(O_CLOEXEC)
#define O_CLOEXEC                02000000
#endif
#if !defined(O_TMPFILE)
#define O_TMPFILE                (020000000 | O_DIRECTORY)
#endif

// Flags accepted by fallocate(2)
#if !defined(FALLOC_FL_KEEP_SIZE)
#define FALLOC_FL_KEEP_SIZE      0x01
#endif
#if !defined(FALLOC_FL_PUNCH_HOLE)
#define FALLOC_FL_PUNCH_HOLE     0x02
#endif

// Filesystem type identifiers, as reported by statfs(2)
#if !defined(TMPFS_MAGIC)
#define TMPFS_MAGIC              0x01021994
#endif
#if !defined(HUGETLBFS_MAGIC)
#define HUGETLBFS_MAGIC          0x958458f6
#endif
88
// Names of the supported filesystems
#define ZFILESYSTEM_TMPFS        "tmpfs"
#define ZFILESYSTEM_HUGETLBFS    "hugetlbfs"

// Sysfs file related to transparent huge pages on tmpfs
#define ZFILENAME_SHMEM_ENABLED  "/sys/kernel/mm/transparent_hugepage/shmem_enabled"

// Name used for the file backing the Java heap
#define ZFILENAME_HEAP           "java_heap"
98
// NULL-terminated list of preferred tmpfs mount points.
// Earlier entries have higher priority.
static const char* z_preferred_tmpfs_mountpoints[] = { "/dev/shm", "/run/shm", NULL };
105
// NULL-terminated list of preferred hugetlbfs mount points.
// Earlier entries have higher priority.
static const char* z_preferred_hugetlbfs_mountpoints[] = { "/dev/hugepages", "/hugepages", NULL };
112
// Number of commit retries still allowed when hugetlbfs reports ENOSPC
// during VM initialization
static int z_fallocate_hugetlbfs_attempts = 3;

// Cleared once fallocate(2) has been found to be unsupported, after which
// the compatibility implementation is used instead
static bool z_fallocate_supported = true;
115
// Thin wrapper that invokes fallocate(2) directly through syscall(2)
static int z_fallocate(int fd, int mode, size_t offset, size_t length) {
  const long result = syscall(SYS_fallocate, fd, mode, offset, length);
  return static_cast<int>(result);
}
119
// Thin wrapper that invokes memfd_create(2) directly through syscall(2)
static int z_memfd_create(const char *name, unsigned int flags) {
  const long result = syscall(SYS_memfd_create, name, flags);
  return static_cast<int>(result);
}
123
124ZBackingFile::ZBackingFile() :
125 _fd(-1),
126 _size(0),
127 _filesystem(0),
128 _block_size(0),
129 _available(0),
130 _initialized(false) {
131
132 // Create backing file
133 _fd = create_fd(ZFILENAME_HEAP);
134 if (_fd == -1) {
135 return;
136 }
137
138 // Get filesystem statistics
139 struct statfs buf;
140 if (fstatfs(_fd, &buf) == -1) {
141 ZErrno err;
142 log_error(gc)("Failed to determine filesystem type for backing file (%s)", err.to_string());
143 return;
144 }
145
146 _filesystem = buf.f_type;
147 _block_size = buf.f_bsize;
148 _available = buf.f_bavail * _block_size;
149
150 // Make sure we're on a supported filesystem
151 if (!is_tmpfs() && !is_hugetlbfs()) {
152 log_error(gc)("Backing file must be located on a %s or a %s filesystem",
153 ZFILESYSTEM_TMPFS, ZFILESYSTEM_HUGETLBFS);
154 return;
155 }
156
157 // Make sure the filesystem type matches requested large page type
158 if (ZLargePages::is_transparent() && !is_tmpfs()) {
159 log_error(gc)("-XX:+UseTransparentHugePages can only be enable when using a %s filesystem",
160 ZFILESYSTEM_TMPFS);
161 return;
162 }
163
164 if (ZLargePages::is_transparent() && !tmpfs_supports_transparent_huge_pages()) {
165 log_error(gc)("-XX:+UseTransparentHugePages on a %s filesystem not supported by kernel",
166 ZFILESYSTEM_TMPFS);
167 return;
168 }
169
170 if (ZLargePages::is_explicit() && !is_hugetlbfs()) {
171 log_error(gc)("-XX:+UseLargePages (without -XX:+UseTransparentHugePages) can only be enabled "
172 "when using a %s filesystem", ZFILESYSTEM_HUGETLBFS);
173 return;
174 }
175
176 if (!ZLargePages::is_explicit() && is_hugetlbfs()) {
177 log_error(gc)("-XX:+UseLargePages must be enabled when using a %s filesystem",
178 ZFILESYSTEM_HUGETLBFS);
179 return;
180 }
181
182 const size_t expected_block_size = is_tmpfs() ? os::vm_page_size() : os::large_page_size();
183 if (expected_block_size != _block_size) {
184 log_error(gc)("%s filesystem has unexpected block size " SIZE_FORMAT " (expected " SIZE_FORMAT ")",
185 is_tmpfs() ? ZFILESYSTEM_TMPFS : ZFILESYSTEM_HUGETLBFS, _block_size, expected_block_size);
186 return;
187 }
188
189 // Successfully initialized
190 _initialized = true;
191}
192
193int ZBackingFile::create_mem_fd(const char* name) const {
194 // Create file name
195 char filename[PATH_MAX];
196 snprintf(filename, sizeof(filename), "%s%s", name, ZLargePages::is_explicit() ? ".hugetlb" : "");
197
198 // Create file
199 const int extra_flags = ZLargePages::is_explicit() ? MFD_HUGETLB : 0;
200 const int fd = z_memfd_create(filename, MFD_CLOEXEC | extra_flags);
201 if (fd == -1) {
202 ZErrno err;
203 log_debug(gc, init)("Failed to create memfd file (%s)",
204 ((ZLargePages::is_explicit() && err == EINVAL) ? "Hugepages not supported" : err.to_string()));
205 return -1;
206 }
207
208 log_info(gc, init)("Heap backed by file: /memfd:%s", filename);
209
210 return fd;
211}
212
213int ZBackingFile::create_file_fd(const char* name) const {
214 const char* const filesystem = ZLargePages::is_explicit()
215 ? ZFILESYSTEM_HUGETLBFS
216 : ZFILESYSTEM_TMPFS;
217 const char** const preferred_mountpoints = ZLargePages::is_explicit()
218 ? z_preferred_hugetlbfs_mountpoints
219 : z_preferred_tmpfs_mountpoints;
220
221 // Find mountpoint
222 ZBackingPath path(filesystem, preferred_mountpoints);
223 if (path.get() == NULL) {
224 log_error(gc)("Use -XX:ZPath to specify the path to a %s filesystem", filesystem);
225 return -1;
226 }
227
228 // Try to create an anonymous file using the O_TMPFILE flag. Note that this
229 // flag requires kernel >= 3.11. If this fails we fall back to open/unlink.
230 const int fd_anon = os::open(path.get(), O_TMPFILE|O_EXCL|O_RDWR|O_CLOEXEC, S_IRUSR|S_IWUSR);
231 if (fd_anon == -1) {
232 ZErrno err;
233 log_debug(gc, init)("Failed to create anonymous file in %s (%s)", path.get(),
234 (err == EINVAL ? "Not supported" : err.to_string()));
235 } else {
236 // Get inode number for anonymous file
237 struct stat stat_buf;
238 if (fstat(fd_anon, &stat_buf) == -1) {
239 ZErrno err;
240 log_error(gc)("Failed to determine inode number for anonymous file (%s)", err.to_string());
241 return -1;
242 }
243
244 log_info(gc, init)("Heap backed by file: %s/#" UINT64_FORMAT, path.get(), (uint64_t)stat_buf.st_ino);
245
246 return fd_anon;
247 }
248
249 log_debug(gc, init)("Falling back to open/unlink");
250
251 // Create file name
252 char filename[PATH_MAX];
253 snprintf(filename, sizeof(filename), "%s/%s.%d", path.get(), name, os::current_process_id());
254
255 // Create file
256 const int fd = os::open(filename, O_CREAT|O_EXCL|O_RDWR|O_CLOEXEC, S_IRUSR|S_IWUSR);
257 if (fd == -1) {
258 ZErrno err;
259 log_error(gc)("Failed to create file %s (%s)", filename, err.to_string());
260 return -1;
261 }
262
263 // Unlink file
264 if (unlink(filename) == -1) {
265 ZErrno err;
266 log_error(gc)("Failed to unlink file %s (%s)", filename, err.to_string());
267 return -1;
268 }
269
270 log_info(gc, init)("Heap backed by file: %s", filename);
271
272 return fd;
273}
274
275int ZBackingFile::create_fd(const char* name) const {
276 if (ZPath == NULL) {
277 // If the path is not explicitly specified, then we first try to create a memfd file
278 // instead of looking for a tmpfd/hugetlbfs mount point. Note that memfd_create() might
279 // not be supported at all (requires kernel >= 3.17), or it might not support large
280 // pages (requires kernel >= 4.14). If memfd_create() fails, then we try to create a
281 // file on an accessible tmpfs or hugetlbfs mount point.
282 const int fd = create_mem_fd(name);
283 if (fd != -1) {
284 return fd;
285 }
286
287 log_debug(gc, init)("Falling back to searching for an accessible mount point");
288 }
289
290 return create_file_fd(name);
291}
292
293bool ZBackingFile::is_initialized() const {
294 return _initialized;
295}
296
297int ZBackingFile::fd() const {
298 return _fd;
299}
300
301size_t ZBackingFile::size() const {
302 return _size;
303}
304
305size_t ZBackingFile::available() const {
306 return _available;
307}
308
309bool ZBackingFile::is_tmpfs() const {
310 return _filesystem == TMPFS_MAGIC;
311}
312
313bool ZBackingFile::is_hugetlbfs() const {
314 return _filesystem == HUGETLBFS_MAGIC;
315}
316
317bool ZBackingFile::tmpfs_supports_transparent_huge_pages() const {
318 // If the shmem_enabled file exists and is readable then we
319 // know the kernel supports transparent huge pages for tmpfs.
320 return access(ZFILENAME_SHMEM_ENABLED, R_OK) == 0;
321}
322
323ZErrno ZBackingFile::fallocate_compat_ftruncate(size_t size) const {
324 while (ftruncate(_fd, size) == -1) {
325 if (errno != EINTR) {
326 // Failed
327 return errno;
328 }
329 }
330
331 // Success
332 return 0;
333}
334
335ZErrno ZBackingFile::fallocate_compat_mmap(size_t offset, size_t length, bool touch) const {
336 // On hugetlbfs, mapping a file segment will fail immediately, without
337 // the need to touch the mapped pages first, if there aren't enough huge
338 // pages available to back the mapping.
339 void* const addr = mmap(0, length, PROT_READ|PROT_WRITE, MAP_SHARED, _fd, offset);
340 if (addr == MAP_FAILED) {
341 // Failed
342 return errno;
343 }
344
345 // Once mapped, the huge pages are only reserved. We need to touch them
346 // to associate them with the file segment. Note that we can not punch
347 // hole in file segments which only have reserved pages.
348 if (touch) {
349 char* const start = (char*)addr;
350 char* const end = start + length;
351 os::pretouch_memory(start, end, _block_size);
352 }
353
354 // Unmap again. From now on, the huge pages that were mapped are allocated
355 // to this file. There's no risk in getting SIGBUS when touching them.
356 if (munmap(addr, length) == -1) {
357 // Failed
358 return errno;
359 }
360
361 // Success
362 return 0;
363}
364
365ZErrno ZBackingFile::fallocate_compat_pwrite(size_t offset, size_t length) const {
366 uint8_t data = 0;
367
368 // Allocate backing memory by writing to each block
369 for (size_t pos = offset; pos < offset + length; pos += _block_size) {
370 if (pwrite(_fd, &data, sizeof(data), pos) == -1) {
371 // Failed
372 return errno;
373 }
374 }
375
376 // Success
377 return 0;
378}
379
380ZErrno ZBackingFile::fallocate_fill_hole_compat(size_t offset, size_t length) {
381 // fallocate(2) is only supported by tmpfs since Linux 3.5, and by hugetlbfs
382 // since Linux 4.3. When fallocate(2) is not supported we emulate it using
383 // ftruncate/pwrite (for tmpfs) or ftruncate/mmap/munmap (for hugetlbfs).
384
385 const size_t end = offset + length;
386 if (end > _size) {
387 // Increase file size
388 const ZErrno err = fallocate_compat_ftruncate(end);
389 if (err) {
390 // Failed
391 return err;
392 }
393 }
394
395 // Allocate backing memory
396 const ZErrno err = is_hugetlbfs() ? fallocate_compat_mmap(offset, length, false /* touch */)
397 : fallocate_compat_pwrite(offset, length);
398 if (err) {
399 if (end > _size) {
400 // Restore file size
401 fallocate_compat_ftruncate(_size);
402 }
403
404 // Failed
405 return err;
406 }
407
408 if (end > _size) {
409 // Record new file size
410 _size = end;
411 }
412
413 // Success
414 return 0;
415}
416
417ZErrno ZBackingFile::fallocate_fill_hole_syscall(size_t offset, size_t length) {
418 const int mode = 0; // Allocate
419 const int res = z_fallocate(_fd, mode, offset, length);
420 if (res == -1) {
421 // Failed
422 return errno;
423 }
424
425 const size_t end = offset + length;
426 if (end > _size) {
427 // Record new file size
428 _size = end;
429 }
430
431 // Success
432 return 0;
433}
434
435ZErrno ZBackingFile::fallocate_fill_hole(size_t offset, size_t length) {
436 // Using compat mode is more efficient when allocating space on hugetlbfs.
437 // Note that allocating huge pages this way will only reserve them, and not
438 // associate them with segments of the file. We must guarantee that we at
439 // some point touch these segments, otherwise we can not punch hole in them.
440 if (z_fallocate_supported && !is_hugetlbfs()) {
441 const ZErrno err = fallocate_fill_hole_syscall(offset, length);
442 if (!err) {
443 // Success
444 return 0;
445 }
446
447 if (err != ENOSYS && err != EOPNOTSUPP) {
448 // Failed
449 return err;
450 }
451
452 // Not supported
453 log_debug(gc)("Falling back to fallocate() compatibility mode");
454 z_fallocate_supported = false;
455 }
456
457 return fallocate_fill_hole_compat(offset, length);
458}
459
460ZErrno ZBackingFile::fallocate_punch_hole(size_t offset, size_t length) {
461 if (is_hugetlbfs()) {
462 // We can only punch hole in pages that have been touched. Non-touched
463 // pages are only reserved, and not associated with any specific file
464 // segment. We don't know which pages have been previously touched, so
465 // we always touch them here to guarantee that we can punch hole.
466 const ZErrno err = fallocate_compat_mmap(offset, length, true /* touch */);
467 if (err) {
468 // Failed
469 return err;
470 }
471 }
472
473 const int mode = FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE;
474 if (z_fallocate(_fd, mode, offset, length) == -1) {
475 // Failed
476 return errno;
477 }
478
479 // Success
480 return 0;
481}
482
483ZErrno ZBackingFile::split_and_fallocate(bool punch_hole, size_t offset, size_t length) {
484 // Try first half
485 const size_t offset0 = offset;
486 const size_t length0 = align_up(length / 2, _block_size);
487 const ZErrno err0 = fallocate(punch_hole, offset0, length0);
488 if (err0) {
489 return err0;
490 }
491
492 // Try second half
493 const size_t offset1 = offset0 + length0;
494 const size_t length1 = length - length0;
495 const ZErrno err1 = fallocate(punch_hole, offset1, length1);
496 if (err1) {
497 return err1;
498 }
499
500 // Success
501 return 0;
502}
503
504ZErrno ZBackingFile::fallocate(bool punch_hole, size_t offset, size_t length) {
505 assert(is_aligned(offset, _block_size), "Invalid offset");
506 assert(is_aligned(length, _block_size), "Invalid length");
507
508 const ZErrno err = punch_hole ? fallocate_punch_hole(offset, length) : fallocate_fill_hole(offset, length);
509 if (err == EINTR && length > _block_size) {
510 // Calling fallocate(2) with a large length can take a long time to
511 // complete. When running profilers, such as VTune, this syscall will
512 // be constantly interrupted by signals. Expanding the file in smaller
513 // steps avoids this problem.
514 return split_and_fallocate(punch_hole, offset, length);
515 }
516
517 return err;
518}
519
520bool ZBackingFile::commit_inner(size_t offset, size_t length) {
521 log_trace(gc, heap)("Committing memory: " SIZE_FORMAT "M-" SIZE_FORMAT "M (" SIZE_FORMAT "M)",
522 offset / M, (offset + length) / M, length / M);
523
524retry:
525 const ZErrno err = fallocate(false /* punch_hole */, offset, length);
526 if (err) {
527 if (err == ENOSPC && !is_init_completed() && is_hugetlbfs() && z_fallocate_hugetlbfs_attempts-- > 0) {
528 // If we fail to allocate during initialization, due to lack of space on
529 // the hugetlbfs filesystem, then we wait and retry a few times before
530 // giving up. Otherwise there is a risk that running JVMs back-to-back
531 // will fail, since there is a delay between process termination and the
532 // huge pages owned by that process being returned to the huge page pool
533 // and made available for new allocations.
534 log_debug(gc, init)("Failed to commit memory (%s), retrying", err.to_string());
535
536 // Wait and retry in one second, in the hope that huge pages will be
537 // available by then.
538 sleep(1);
539 goto retry;
540 }
541
542 // Failed
543 log_error(gc)("Failed to commit memory (%s)", err.to_string());
544 return false;
545 }
546
547 // Success
548 return true;
549}
550
551size_t ZBackingFile::commit(size_t offset, size_t length) {
552 // Try to commit the whole region
553 if (commit_inner(offset, length)) {
554 // Success
555 return length;
556 }
557
558 // Failed, try to commit as much as possible
559 size_t start = offset;
560 size_t end = offset + length;
561
562 for (;;) {
563 length = align_down((end - start) / 2, ZGranuleSize);
564 if (length < ZGranuleSize) {
565 // Done, don't commit more
566 return start - offset;
567 }
568
569 if (commit_inner(start, length)) {
570 // Success, try commit more
571 start += length;
572 } else {
573 // Failed, try commit less
574 end -= length;
575 }
576 }
577}
578
579size_t ZBackingFile::uncommit(size_t offset, size_t length) {
580 log_trace(gc, heap)("Uncommitting memory: " SIZE_FORMAT "M-" SIZE_FORMAT "M (" SIZE_FORMAT "M)",
581 offset / M, (offset + length) / M, length / M);
582
583 const ZErrno err = fallocate(true /* punch_hole */, offset, length);
584 if (err) {
585 log_error(gc)("Failed to uncommit memory (%s)", err.to_string());
586 return 0;
587 }
588
589 return length;
590}
591