1// Copyright (c) 2015 Sandstorm Development Group, Inc. and contributors
2// Licensed under the MIT License:
3//
4// Permission is hereby granted, free of charge, to any person obtaining a copy
5// of this software and associated documentation files (the "Software"), to deal
6// in the Software without restriction, including without limitation the rights
7// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8// copies of the Software, and to permit persons to whom the Software is
9// furnished to do so, subject to the following conditions:
10//
11// The above copyright notice and this permission notice shall be included in
12// all copies or substantial portions of the Software.
13//
14// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20// THE SOFTWARE.
21
22#if !_WIN32
23
24#ifndef _GNU_SOURCE
25#define _GNU_SOURCE
26#endif
27
28#include "filesystem.h"
29#include "debug.h"
30#include <sys/types.h>
31#include <sys/stat.h>
32#include <sys/ioctl.h>
33#include <fcntl.h>
34#include <unistd.h>
35#include <stdio.h>
36#include <sys/mman.h>
37#include <errno.h>
38#include <dirent.h>
39#include <stdlib.h>
40#include "vector.h"
41#include "miniposix.h"
42#include <algorithm>
43
44#if __linux__
45#include <syscall.h>
46#include <linux/fs.h>
47#include <sys/sendfile.h>
48#endif
49
50namespace kj {
51namespace {
52
// Name prefix for temp files which should be hidden when listing a directory.
//
// If you change this, make sure to update the unit test.
#define HIDDEN_PREFIX ".kj-tmp."

// O_CLOEXEC where the platform defines it, otherwise a flag value with no effect.
#ifndef O_CLOEXEC
#define MAYBE_O_CLOEXEC 0
#else
#define MAYBE_O_CLOEXEC O_CLOEXEC
#endif

// O_DIRECTORY where the platform defines it, otherwise a flag value with no effect.
#ifndef O_DIRECTORY
#define MAYBE_O_DIRECTORY 0
#else
#define MAYBE_O_DIRECTORY O_DIRECTORY
#endif

#if __APPLE__
// Mac OSX defines SEEK_HOLE, but it doesn't work. ("Inappropriate ioctl for device", it says.)
#undef SEEK_HOLE
#endif

#if __BIONIC__
// Bionic has no DTTOIF() function, so take the code paths used when d_type is unavailable.
#undef DT_UNKNOWN
#endif
79
80static void setCloexec(int fd) KJ_UNUSED;
81static void setCloexec(int fd) {
82 // Set the O_CLOEXEC flag on the given fd.
83 //
84 // We try to avoid the need to call this by taking advantage of syscall flags that set it
85 // atomically on new file descriptors. Unfortunately some platforms do not support such syscalls.
86
87#ifdef FIOCLEX
88 // Yay, we can set the flag in one call.
89 KJ_SYSCALL_HANDLE_ERRORS(ioctl(fd, FIOCLEX)) {
90 case EINVAL:
91 case EOPNOTSUPP:
92 break;
93 default:
94 KJ_FAIL_SYSCALL("ioctl(fd, FIOCLEX)", error) { break; }
95 break;
96 } else {
97 // success
98 return;
99 }
100#endif
101
102 // Sadness, we must resort to read/modify/write.
103 //
104 // (On many platforms, FD_CLOEXEC is the only flag modifiable via F_SETFD and therefore we could
105 // skip the read... but it seems dangerous to assume that's true of all platforms, and anyway
106 // most platforms support FIOCLEX.)
107 int flags;
108 KJ_SYSCALL(flags = fcntl(fd, F_GETFD));
109 if (!(flags & FD_CLOEXEC)) {
110 KJ_SYSCALL(fcntl(fd, F_SETFD, flags | FD_CLOEXEC));
111 }
112}
113
114static Date toKjDate(struct timespec tv) {
115 return tv.tv_sec * SECONDS + tv.tv_nsec * NANOSECONDS + UNIX_EPOCH;
116}
117
118static FsNode::Type modeToType(mode_t mode) {
119 switch (mode & S_IFMT) {
120 case S_IFREG : return FsNode::Type::FILE;
121 case S_IFDIR : return FsNode::Type::DIRECTORY;
122 case S_IFLNK : return FsNode::Type::SYMLINK;
123 case S_IFBLK : return FsNode::Type::BLOCK_DEVICE;
124 case S_IFCHR : return FsNode::Type::CHARACTER_DEVICE;
125 case S_IFIFO : return FsNode::Type::NAMED_PIPE;
126 case S_IFSOCK: return FsNode::Type::SOCKET;
127 default: return FsNode::Type::OTHER;
128 }
129}
130
131static FsNode::Metadata statToMetadata(struct stat& stats) {
132 // Probably st_ino and st_dev are usually under 32 bits, so mix by rotating st_dev left 32 bits
133 // and XOR.
134 uint64_t d = stats.st_dev;
135 uint64_t hash = ((d << 32) | (d >> 32)) ^ stats.st_ino;
136
137 return FsNode::Metadata {
138 modeToType(stats.st_mode),
139 implicitCast<uint64_t>(stats.st_size),
140 implicitCast<uint64_t>(stats.st_blocks * 512u),
141#if __APPLE__
142 toKjDate(stats.st_mtimespec),
143#else
144 toKjDate(stats.st_mtim),
145#endif
146 implicitCast<uint>(stats.st_nlink),
147 hash
148 };
149}
150
151static bool rmrf(int fd, StringPtr path);
152
153static void rmrfChildrenAndClose(int fd) {
154 // Assumes fd is seeked to beginning.
155
156 DIR* dir = fdopendir(fd);
157 if (dir == nullptr) {
158 close(fd);
159 KJ_FAIL_SYSCALL("fdopendir", errno);
160 };
161 KJ_DEFER(closedir(dir));
162
163 for (;;) {
164 errno = 0;
165 struct dirent* entry = readdir(dir);
166 if (entry == nullptr) {
167 int error = errno;
168 if (error == 0) {
169 break;
170 } else {
171 KJ_FAIL_SYSCALL("readdir", error);
172 }
173 }
174
175 if (entry->d_name[0] == '.' &&
176 (entry->d_name[1] == '\0' ||
177 (entry->d_name[1] == '.' &&
178 entry->d_name[2] == '\0'))) {
179 // ignore . and ..
180 } else {
181#ifdef DT_UNKNOWN // d_type is not available on all platforms.
182 if (entry->d_type == DT_DIR) {
183 int subdirFd;
184 KJ_SYSCALL(subdirFd = openat(
185 fd, entry->d_name, O_RDONLY | MAYBE_O_DIRECTORY | MAYBE_O_CLOEXEC));
186 rmrfChildrenAndClose(subdirFd);
187 KJ_SYSCALL(unlinkat(fd, entry->d_name, AT_REMOVEDIR));
188 } else if (entry->d_type != DT_UNKNOWN) {
189 KJ_SYSCALL(unlinkat(fd, entry->d_name, 0));
190 } else {
191#endif
192 KJ_ASSERT(rmrf(fd, entry->d_name));
193#ifdef DT_UNKNOWN
194 }
195#endif
196 }
197 }
198}
199
200static bool rmrf(int fd, StringPtr path) {
201 struct stat stats;
202 KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd, path.cStr(), &stats, AT_SYMLINK_NOFOLLOW)) {
203 case ENOENT:
204 case ENOTDIR:
205 // Doesn't exist.
206 return false;
207 default:
208 KJ_FAIL_SYSCALL("lstat(path)", error, path) { return false; }
209 }
210
211 if (S_ISDIR(stats.st_mode)) {
212 int subdirFd;
213 KJ_SYSCALL(subdirFd = openat(
214 fd, path.cStr(), O_RDONLY | MAYBE_O_DIRECTORY | MAYBE_O_CLOEXEC)) { return false; }
215 rmrfChildrenAndClose(subdirFd);
216 KJ_SYSCALL(unlinkat(fd, path.cStr(), AT_REMOVEDIR)) { return false; }
217 } else {
218 KJ_SYSCALL(unlinkat(fd, path.cStr(), 0)) { return false; }
219 }
220
221 return true;
222}
223
struct MmapRange {
  // An (offset, size) pair suitable for passing to mmap() and related calls.
  uint64_t offset;
  uint64_t size;
};

static MmapRange getMmapRange(uint64_t offset, uint64_t size) {
  // Given the offset and size a caller asked for, computes the offset and size to hand to
  // mmap(), which requires mappings to begin on a page boundary.
  //
  // The start is moved down to the nearest page boundary and the size grows by the same amount.
  // The end of the mapping is deliberately *not* rounded up to a page boundary: mmap() does not
  // require it, and doing so causes trouble on some systems (notably Cygwin).

#ifndef _SC_PAGESIZE
#define _SC_PAGESIZE _SC_PAGE_SIZE
#endif
  static const uint64_t pageSize = sysconf(_SC_PAGESIZE);

  // Number of bytes by which `offset` sits past the start of its page.
  const uint64_t misalignment = offset & (pageSize - 1);

  MmapRange range;
  range.offset = offset - misalignment;
  range.size = size + misalignment;
  return range;
}
247
248class MmapDisposer: public ArrayDisposer {
249protected:
250 void disposeImpl(void* firstElement, size_t elementSize, size_t elementCount,
251 size_t capacity, void (*destroyElement)(void*)) const {
252 auto range = getMmapRange(reinterpret_cast<uintptr_t>(firstElement),
253 elementSize * elementCount);
254 KJ_SYSCALL(munmap(reinterpret_cast<byte*>(range.offset), range.size)) { break; }
255 }
256};
257
258constexpr MmapDisposer mmapDisposer = MmapDisposer();
259
260class DiskHandle {
261 // We need to implement each of ReadableFile, AppendableFile, File, ReadableDirectory, and
262 // Directory for disk handles. There is a lot of implementation overlap between these, especially
263 // stat(), sync(), etc. We can't have everything inherit from a common DiskFsNode that implements
264 // these because then we get diamond inheritance which means we need to make all our inheritance
265 // virtual which means downcasting requires RTTI which violates our goal of supporting compiling
266 // with no RTTI. So instead we have the DiskHandle class which implements all the methods without
267 // inheriting anything, and then we have DiskFile, DiskDirectory, etc. hold this and delegate to
268 // it. Ugly, but works.
269
270public:
271 DiskHandle(AutoCloseFd&& fd): fd(kj::mv(fd)) {}
272
273 // OsHandle ------------------------------------------------------------------
274
275 AutoCloseFd clone() const {
276 int fd2;
277#ifdef F_DUPFD_CLOEXEC
278 KJ_SYSCALL_HANDLE_ERRORS(fd2 = fcntl(fd, F_DUPFD_CLOEXEC, 3)) {
279 case EINVAL:
280 case EOPNOTSUPP:
281 // fall back
282 break;
283 default:
284 KJ_FAIL_SYSCALL("fnctl(fd, F_DUPFD_CLOEXEC, 3)", error) { break; }
285 break;
286 } else {
287 return AutoCloseFd(fd2);
288 }
289#endif
290
291 KJ_SYSCALL(fd2 = ::dup(fd));
292 AutoCloseFd result(fd2);
293 setCloexec(result);
294 return result;
295 }
296
297 int getFd() const {
298 return fd.get();
299 }
300
301 // FsNode --------------------------------------------------------------------
302
303 FsNode::Metadata stat() const {
304 struct stat stats;
305 KJ_SYSCALL(::fstat(fd, &stats));
306 return statToMetadata(stats);
307 }
308
309 void sync() const {
310#if __APPLE__
311 // For whatever reason, fsync() on OSX only flushes kernel buffers. It does not flush hardware
312 // disk buffers. This makes it not very useful. But OSX documents fcntl F_FULLFSYNC which does
313 // the right thing. Why they don't just make fsync() do the right thing, I do not know.
314 KJ_SYSCALL(fcntl(fd, F_FULLFSYNC));
315#else
316 KJ_SYSCALL(fsync(fd));
317#endif
318 }
319
320 void datasync() const {
321 // The presence of the _POSIX_SYNCHRONIZED_IO define is supposed to tell us that fdatasync()
322 // exists. But Apple defines this yet doesn't offer fdatasync(). Thanks, Apple.
323#if _POSIX_SYNCHRONIZED_IO && !__APPLE__
324 KJ_SYSCALL(fdatasync(fd));
325#else
326 this->sync();
327#endif
328 }
329
330 // ReadableFile --------------------------------------------------------------
331
332 size_t read(uint64_t offset, ArrayPtr<byte> buffer) const {
333 // pread() probably never returns short reads unless it hits EOF. Unfortunately, though, per
334 // spec we are not allowed to assume this.
335
336 size_t total = 0;
337 while (buffer.size() > 0) {
338 ssize_t n;
339 KJ_SYSCALL(n = pread(fd, buffer.begin(), buffer.size(), offset));
340 if (n == 0) break;
341 total += n;
342 offset += n;
343 buffer = buffer.slice(n, buffer.size());
344 }
345 return total;
346 }
347
348 Array<const byte> mmap(uint64_t offset, uint64_t size) const {
349 auto range = getMmapRange(offset, size);
350 const void* mapping = ::mmap(NULL, range.size, PROT_READ, MAP_SHARED, fd, range.offset);
351 if (mapping == MAP_FAILED) {
352 KJ_FAIL_SYSCALL("mmap", errno);
353 }
354 return Array<const byte>(reinterpret_cast<const byte*>(mapping) + (offset - range.offset),
355 size, mmapDisposer);
356 }
357
358 Array<byte> mmapPrivate(uint64_t offset, uint64_t size) const {
359 auto range = getMmapRange(offset, size);
360 void* mapping = ::mmap(NULL, range.size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, range.offset);
361 if (mapping == MAP_FAILED) {
362 KJ_FAIL_SYSCALL("mmap", errno);
363 }
364 return Array<byte>(reinterpret_cast<byte*>(mapping) + (offset - range.offset),
365 size, mmapDisposer);
366 }
367
368 // File ----------------------------------------------------------------------
369
370 void write(uint64_t offset, ArrayPtr<const byte> data) const {
371 // pwrite() probably never returns short writes unless there's no space left on disk.
372 // Unfortunately, though, per spec we are not allowed to assume this.
373
374 while (data.size() > 0) {
375 ssize_t n;
376 KJ_SYSCALL(n = pwrite(fd, data.begin(), data.size(), offset));
377 KJ_ASSERT(n > 0, "pwrite() returned zero?");
378 offset += n;
379 data = data.slice(n, data.size());
380 }
381 }
382
383 void zero(uint64_t offset, uint64_t size) const {
384#ifdef FALLOC_FL_PUNCH_HOLE
385 KJ_SYSCALL_HANDLE_ERRORS(
386 fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, size)) {
387 case EOPNOTSUPP:
388 // fall back to below
389 break;
390 default:
391 KJ_FAIL_SYSCALL("fallocate(FALLOC_FL_PUNCH_HOLE)", error) { return; }
392 } else {
393 return;
394 }
395#endif
396
397 static const byte ZEROS[4096] = { 0 };
398
399#if __APPLE__ || __CYGWIN__
400 // Mac & Cygwin doesn't have pwritev().
401 while (size > sizeof(ZEROS)) {
402 write(offset, ZEROS);
403 size -= sizeof(ZEROS);
404 offset += sizeof(ZEROS);
405 }
406 write(offset, kj::arrayPtr(ZEROS, size));
407#else
408 // Use a 4k buffer of zeros amplified by iov to write zeros with as few syscalls as possible.
409 size_t count = (size + sizeof(ZEROS) - 1) / sizeof(ZEROS);
410 const size_t iovmax = miniposix::iovMax(count);
411 KJ_STACK_ARRAY(struct iovec, iov, kj::min(iovmax, count), 16, 256);
412
413 for (auto& item: iov) {
414 item.iov_base = const_cast<byte*>(ZEROS);
415 item.iov_len = sizeof(ZEROS);
416 }
417
418 while (size > 0) {
419 size_t iovCount;
420 if (size >= iov.size() * sizeof(ZEROS)) {
421 iovCount = iov.size();
422 } else {
423 iovCount = size / sizeof(ZEROS);
424 size_t rem = size % sizeof(ZEROS);
425 if (rem > 0) {
426 iov[iovCount++].iov_len = rem;
427 }
428 }
429
430 ssize_t n;
431 KJ_SYSCALL(n = pwritev(fd, iov.begin(), count, offset));
432 KJ_ASSERT(n > 0, "pwrite() returned zero?");
433
434 offset += n;
435 size -= n;
436 }
437#endif
438 }
439
440 void truncate(uint64_t size) const {
441 KJ_SYSCALL(ftruncate(fd, size));
442 }
443
444 class WritableFileMappingImpl final: public WritableFileMapping {
445 public:
446 WritableFileMappingImpl(Array<byte> bytes): bytes(kj::mv(bytes)) {}
447
448 ArrayPtr<byte> get() const override {
449 // const_cast OK because WritableFileMapping does indeed provide a writable view despite
450 // being const itself.
451 return arrayPtr(const_cast<byte*>(bytes.begin()), bytes.size());
452 }
453
454 void changed(ArrayPtr<byte> slice) const override {
455 KJ_REQUIRE(slice.begin() >= bytes.begin() && slice.end() <= bytes.end(),
456 "byte range is not part of this mapping");
457
458 // msync() requires page-alignment, apparently, so use getMmapRange() to accomplish that.
459 auto range = getMmapRange(reinterpret_cast<uintptr_t>(slice.begin()), slice.size());
460 KJ_SYSCALL(msync(reinterpret_cast<void*>(range.offset), range.size, MS_ASYNC));
461 }
462
463 void sync(ArrayPtr<byte> slice) const override {
464 KJ_REQUIRE(slice.begin() >= bytes.begin() && slice.end() <= bytes.end(),
465 "byte range is not part of this mapping");
466
467 // msync() requires page-alignment, apparently, so use getMmapRange() to accomplish that.
468 auto range = getMmapRange(reinterpret_cast<uintptr_t>(slice.begin()), slice.size());
469 KJ_SYSCALL(msync(reinterpret_cast<void*>(range.offset), range.size, MS_SYNC));
470 }
471
472 private:
473 Array<byte> bytes;
474 };
475
476 Own<const WritableFileMapping> mmapWritable(uint64_t offset, uint64_t size) const {
477 auto range = getMmapRange(offset, size);
478 void* mapping = ::mmap(NULL, range.size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, range.offset);
479 if (mapping == MAP_FAILED) {
480 KJ_FAIL_SYSCALL("mmap", errno);
481 }
482 auto array = Array<byte>(reinterpret_cast<byte*>(mapping) + (offset - range.offset),
483 size, mmapDisposer);
484 return heap<WritableFileMappingImpl>(kj::mv(array));
485 }
486
487 size_t copyChunk(uint64_t offset, int fromFd, uint64_t fromOffset, uint64_t size) const {
488 // Copies a range of bytes from `fromFd` to this file in the most efficient way possible for
489 // the OS. Only returns less than `size` if EOF. Does not account for holes.
490
491#if __linux__
492 {
493 KJ_SYSCALL(lseek(fd, offset, SEEK_SET));
494 off_t fromPos = fromOffset;
495 off_t end = fromOffset + size;
496 while (fromPos < end) {
497 ssize_t n;
498 KJ_SYSCALL_HANDLE_ERRORS(n = sendfile(fd, fromFd, &fromPos, end - fromPos)) {
499 case EINVAL:
500 case ENOSYS:
501 goto sendfileNotAvailable;
502 default:
503 KJ_FAIL_SYSCALL("sendfile", error) { return fromPos - fromOffset; }
504 }
505 }
506 return fromPos - fromOffset;
507 }
508
509 sendfileNotAvailable:
510#endif
511 uint64_t total = 0;
512 while (size > 0) {
513 byte buffer[4096];
514 ssize_t n;
515 KJ_SYSCALL(n = pread(fromFd, buffer, kj::min(sizeof(buffer), size), fromOffset));
516 if (n == 0) break;
517 write(offset, arrayPtr(buffer, n));
518 fromOffset += n;
519 offset += n;
520 total += n;
521 size -= n;
522 }
523 return total;
524 }
525
526 kj::Maybe<size_t> copy(uint64_t offset, const ReadableFile& from,
527 uint64_t fromOffset, uint64_t size) const {
528 KJ_IF_MAYBE(otherFd, from.getFd()) {
529#ifdef FICLONE
530 if (offset == 0 && fromOffset == 0 && size == kj::maxValue && stat().size == 0) {
531 if (ioctl(fd, FICLONE, *otherFd) >= 0) {
532 return stat().size;
533 }
534 } else if (size > 0) { // src_length = 0 has special meaning for the syscall, so avoid.
535 struct file_clone_range range;
536 memset(&range, 0, sizeof(range));
537 range.src_fd = *otherFd;
538 range.dest_offset = offset;
539 range.src_offset = fromOffset;
540 range.src_length = size == kj::maxValue ? 0 : size;
541 if (ioctl(fd, FICLONERANGE, &range) >= 0) {
542 // TODO(someday): What does FICLONERANGE actually do if the range goes past EOF? The docs
543 // don't say. Maybe it only copies the parts that exist. Maybe it punches holes for the
544 // rest. Where does the destination file's EOF marker end up? Who knows?
545 return kj::min(from.stat().size - fromOffset, size);
546 }
547 } else {
548 // size == 0
549 return size_t(0);
550 }
551
552 // ioctl failed. Almost all failures documented for these are of the form "the operation is
553 // not supported for the filesystem(s) specified", so fall back to other approaches.
554#endif
555
556 off_t toPos = offset;
557 off_t fromPos = fromOffset;
558 off_t end = size == kj::maxValue ? off_t(kj::maxValue) : off_t(fromOffset + size);
559
560 for (;;) {
561 // Handle data.
562 {
563 // Find out how much data there is before the next hole.
564 off_t nextHole;
565#ifdef SEEK_HOLE
566 KJ_SYSCALL_HANDLE_ERRORS(nextHole = lseek(*otherFd, fromPos, SEEK_HOLE)) {
567 case EINVAL:
568 // SEEK_HOLE probably not supported. Assume no holes.
569 nextHole = end;
570 break;
571 case ENXIO:
572 // Past EOF. Stop here.
573 return fromPos - fromOffset;
574 default:
575 KJ_FAIL_SYSCALL("lseek(fd, pos, SEEK_HOLE)", error) { return fromPos - fromOffset; }
576 }
577#else
578 // SEEK_HOLE not supported. Assume no holes.
579 nextHole = end;
580#endif
581
582 // Copy the next chunk of data.
583 off_t copyTo = kj::min(end, nextHole);
584 size_t amount = copyTo - fromPos;
585 if (amount > 0) {
586 size_t n = copyChunk(toPos, *otherFd, fromPos, amount);
587 fromPos += n;
588 toPos += n;
589
590 if (n < amount) {
591 return fromPos - fromOffset;
592 }
593 }
594
595 if (fromPos == end) {
596 return fromPos - fromOffset;
597 }
598 }
599
600#ifdef SEEK_HOLE
601 // Handle hole.
602 {
603 // Find out how much hole there is before the next data.
604 off_t nextData;
605 KJ_SYSCALL_HANDLE_ERRORS(nextData = lseek(*otherFd, fromPos, SEEK_DATA)) {
606 case EINVAL:
607 // SEEK_DATA probably not supported. But we should only have gotten here if we
608 // were expecting a hole.
609 KJ_FAIL_ASSERT("can't determine hole size; SEEK_DATA not supported");
610 break;
611 case ENXIO:
612 // No more data. Set to EOF.
613 KJ_SYSCALL(nextData = lseek(*otherFd, 0, SEEK_END));
614 if (nextData > end) {
615 end = nextData;
616 }
617 break;
618 default:
619 KJ_FAIL_SYSCALL("lseek(fd, pos, SEEK_HOLE)", error) { return fromPos - fromOffset; }
620 }
621
622 // Write zeros.
623 off_t zeroTo = kj::min(end, nextData);
624 off_t amount = zeroTo - fromPos;
625 if (amount > 0) {
626 zero(toPos, amount);
627 toPos += amount;
628 fromPos = zeroTo;
629 }
630
631 if (fromPos == end) {
632 return fromPos - fromOffset;
633 }
634 }
635#endif
636 }
637 }
638
639 // Indicates caller should call File::copy() default implementation.
640 return nullptr;
641 }
642
643 // ReadableDirectory ---------------------------------------------------------
644
645 template <typename Func>
646 auto list(bool needTypes, Func&& func) const
647 -> Array<Decay<decltype(func(instance<StringPtr>(), instance<FsNode::Type>()))>> {
648 // Seek to start of directory.
649 KJ_SYSCALL(lseek(fd, 0, SEEK_SET));
650
651 // Unfortunately, fdopendir() takes ownership of the file descriptor. Therefore we need to
652 // make a duplicate.
653 int duped;
654 KJ_SYSCALL(duped = dup(fd));
655 DIR* dir = fdopendir(duped);
656 if (dir == nullptr) {
657 close(duped);
658 KJ_FAIL_SYSCALL("fdopendir", errno);
659 }
660
661 KJ_DEFER(closedir(dir));
662 typedef Decay<decltype(func(instance<StringPtr>(), instance<FsNode::Type>()))> Entry;
663 kj::Vector<Entry> entries;
664
665 for (;;) {
666 errno = 0;
667 struct dirent* entry = readdir(dir);
668 if (entry == nullptr) {
669 int error = errno;
670 if (error == 0) {
671 break;
672 } else {
673 KJ_FAIL_SYSCALL("readdir", error);
674 }
675 }
676
677 kj::StringPtr name = entry->d_name;
678 if (name != "." && name != ".." && !name.startsWith(HIDDEN_PREFIX)) {
679#ifdef DT_UNKNOWN // d_type is not available on all platforms.
680 if (entry->d_type != DT_UNKNOWN) {
681 entries.add(func(name, modeToType(DTTOIF(entry->d_type))));
682 } else {
683#endif
684 if (needTypes) {
685 // Unknown type. Fall back to stat.
686 struct stat stats;
687 KJ_SYSCALL(fstatat(fd, name.cStr(), &stats, AT_SYMLINK_NOFOLLOW));
688 entries.add(func(name, modeToType(stats.st_mode)));
689 } else {
690 entries.add(func(name, FsNode::Type::OTHER));
691 }
692#ifdef DT_UNKNOWN
693 }
694#endif
695 }
696 }
697
698 auto result = entries.releaseAsArray();
699 std::sort(result.begin(), result.end());
700 return result;
701 }
702
703 Array<String> listNames() const {
704 return list(false, [](StringPtr name, FsNode::Type type) { return heapString(name); });
705 }
706
707 Array<ReadableDirectory::Entry> listEntries() const {
708 return list(true, [](StringPtr name, FsNode::Type type) {
709 return ReadableDirectory::Entry { type, heapString(name), };
710 });
711 }
712
713 bool exists(PathPtr path) const {
714 KJ_SYSCALL_HANDLE_ERRORS(faccessat(fd, path.toString().cStr(), F_OK, 0)) {
715 case ENOENT:
716 case ENOTDIR:
717 return false;
718 default:
719 KJ_FAIL_SYSCALL("faccessat(fd, path)", error, path) { return false; }
720 }
721 return true;
722 }
723
724 Maybe<FsNode::Metadata> tryLstat(PathPtr path) const {
725 struct stat stats;
726 KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd, path.toString().cStr(), &stats, AT_SYMLINK_NOFOLLOW)) {
727 case ENOENT:
728 case ENOTDIR:
729 return nullptr;
730 default:
731 KJ_FAIL_SYSCALL("faccessat(fd, path)", error, path) { return nullptr; }
732 }
733 return statToMetadata(stats);
734 }
735
736 Maybe<Own<const ReadableFile>> tryOpenFile(PathPtr path) const {
737 int newFd;
738 KJ_SYSCALL_HANDLE_ERRORS(newFd = openat(
739 fd, path.toString().cStr(), O_RDONLY | MAYBE_O_CLOEXEC)) {
740 case ENOENT:
741 case ENOTDIR:
742 return nullptr;
743 default:
744 KJ_FAIL_SYSCALL("openat(fd, path, O_RDONLY)", error, path) { return nullptr; }
745 }
746
747 kj::AutoCloseFd result(newFd);
748#ifndef O_CLOEXEC
749 setCloexec(result);
750#endif
751
752 return newDiskReadableFile(kj::mv(result));
753 }
754
755 Maybe<AutoCloseFd> tryOpenSubdirInternal(PathPtr path) const {
756 int newFd;
757 KJ_SYSCALL_HANDLE_ERRORS(newFd = openat(
758 fd, path.toString().cStr(), O_RDONLY | MAYBE_O_CLOEXEC | MAYBE_O_DIRECTORY)) {
759 case ENOENT:
760 return nullptr;
761 case ENOTDIR:
762 // Could mean that a parent is not a directory, which we treat as "doesn't exist".
763 // Could also mean that the specified file is not a directory, which should throw.
764 // Check using exists().
765 if (!exists(path)) {
766 return nullptr;
767 }
768 // fallthrough
769 default:
770 KJ_FAIL_SYSCALL("openat(fd, path, O_DIRECTORY)", error, path) { return nullptr; }
771 }
772
773 kj::AutoCloseFd result(newFd);
774#ifndef O_CLOEXEC
775 setCloexec(result);
776#endif
777
778 return kj::mv(result);
779 }
780
781 Maybe<Own<const ReadableDirectory>> tryOpenSubdir(PathPtr path) const {
782 return tryOpenSubdirInternal(path).map(newDiskReadableDirectory);
783 }
784
785 Maybe<String> tryReadlink(PathPtr path) const {
786 size_t trySize = 256;
787 for (;;) {
788 KJ_STACK_ARRAY(char, buf, trySize, 256, 4096);
789 ssize_t n = readlinkat(fd, path.toString().cStr(), buf.begin(), buf.size());
790 if (n < 0) {
791 int error = errno;
792 switch (error) {
793 case EINTR:
794 continue;
795 case ENOENT:
796 case ENOTDIR:
797 case EINVAL: // not a link
798 return nullptr;
799 default:
800 KJ_FAIL_SYSCALL("readlinkat(fd, path)", error, path) { return nullptr; }
801 }
802 }
803
804 if (n >= buf.size()) {
805 // Didn't give it enough space. Better retry with a bigger buffer.
806 trySize *= 2;
807 continue;
808 }
809
810 return heapString(buf.begin(), n);
811 }
812 }
813
814 // Directory -----------------------------------------------------------------
815
816 bool tryMkdir(PathPtr path, WriteMode mode, bool noThrow) const {
817 // Internal function to make a directory.
818
819 auto filename = path.toString();
820 mode_t acl = has(mode, WriteMode::PRIVATE) ? 0700 : 0777;
821
822 KJ_SYSCALL_HANDLE_ERRORS(mkdirat(fd, filename.cStr(), acl)) {
823 case EEXIST: {
824 // Apparently this path exists.
825 if (!has(mode, WriteMode::MODIFY)) {
826 // Require exclusive create.
827 return false;
828 }
829
830 // MODIFY is allowed, so we just need to check whether the existing entry is a directory.
831 struct stat stats;
832 KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd, filename.cStr(), &stats, 0)) {
833 default:
834 // mkdir() says EEXIST but we can't stat it. Maybe it's a dangling link, or maybe
835 // we can't access it for some reason. Assume failure.
836 //
837 // TODO(someday): Maybe we should be creating the directory at the target of the
838 // link?
839 goto failed;
840 }
841 return (stats.st_mode & S_IFMT) == S_IFDIR;
842 }
843 case ENOENT:
844 if (has(mode, WriteMode::CREATE_PARENT) && path.size() > 0 &&
845 tryMkdir(path.parent(), WriteMode::CREATE | WriteMode::MODIFY |
846 WriteMode::CREATE_PARENT, true)) {
847 // Retry, but make sure we don't try to create the parent again.
848 return tryMkdir(path, mode - WriteMode::CREATE_PARENT, noThrow);
849 } else {
850 goto failed;
851 }
852 default:
853 failed:
854 if (noThrow) {
855 // Caller requested no throwing.
856 return false;
857 } else {
858 KJ_FAIL_SYSCALL("mkdirat(fd, path)", error, path);
859 }
860 }
861
862 return true;
863 }
864
865 kj::Maybe<String> createNamedTemporary(
866 PathPtr finalName, WriteMode mode, Function<int(StringPtr)> tryCreate) const {
867 // Create a temporary file which will eventually replace `finalName`.
868 //
869 // Calls `tryCreate` to actually create the temporary, passing in the desired path. tryCreate()
870 // is expected to behave like a syscall, returning a negative value and setting `errno` on
871 // error. tryCreate() MUST fail with EEXIST if the path exists -- this is not checked in
872 // advance, since it needs to be checked atomically. In the case of EEXIST, tryCreate() will
873 // be called again with a new path.
874 //
875 // Returns the temporary path that succeeded. Only returns nullptr if there was an exception
876 // but we're compiled with -fno-exceptions.
877
878 if (finalName.size() == 0) {
879 KJ_FAIL_REQUIRE("can't replace self") { break; }
880 return nullptr;
881 }
882
883 static uint counter = 0;
884 static const pid_t pid = getpid();
885 String pathPrefix;
886 if (finalName.size() > 1) {
887 pathPrefix = kj::str(finalName.parent(), '/');
888 }
889 auto path = kj::str(pathPrefix, HIDDEN_PREFIX, pid, '.', counter++, '.',
890 finalName.basename()[0], ".partial");
891
892 KJ_SYSCALL_HANDLE_ERRORS(tryCreate(path)) {
893 case EEXIST:
894 return createNamedTemporary(finalName, mode, kj::mv(tryCreate));
895 case ENOENT:
896 if (has(mode, WriteMode::CREATE_PARENT) && finalName.size() > 1 &&
897 tryMkdir(finalName.parent(), WriteMode::CREATE | WriteMode::MODIFY |
898 WriteMode::CREATE_PARENT, true)) {
899 // Retry, but make sure we don't try to create the parent again.
900 mode = mode - WriteMode::CREATE_PARENT;
901 return createNamedTemporary(finalName, mode, kj::mv(tryCreate));
902 }
903 // fallthrough
904 default:
905 KJ_FAIL_SYSCALL("create(path)", error, path) { break; }
906 return nullptr;
907 }
908
909 return kj::mv(path);
910 }
911
912 bool tryReplaceNode(PathPtr path, WriteMode mode, Function<int(StringPtr)> tryCreate) const {
913 // Replaces the given path with an object created by calling tryCreate().
914 //
915 // tryCreate() must behave like a syscall which creates the node at the path passed to it,
916 // returning a negative value on error. If the path passed to tryCreate already exists, it
917 // MUST fail with EEXIST.
918 //
919 // When `mode` includes MODIFY, replaceNode() reacts to EEXIST by creating the node in a
920 // temporary location and then rename()ing it into place.
921
922 if (path.size() == 0) {
923 KJ_FAIL_REQUIRE("can't replace self") { return false; }
924 }
925
926 auto filename = path.toString();
927
928 if (has(mode, WriteMode::CREATE)) {
929 // First try just cerating the node in-place.
930 KJ_SYSCALL_HANDLE_ERRORS(tryCreate(filename)) {
931 case EEXIST:
932 // Target exists.
933 if (has(mode, WriteMode::MODIFY)) {
934 // Fall back to MODIFY path, below.
935 break;
936 } else {
937 return false;
938 }
939 case ENOENT:
940 if (has(mode, WriteMode::CREATE_PARENT) && path.size() > 0 &&
941 tryMkdir(path.parent(), WriteMode::CREATE | WriteMode::MODIFY |
942 WriteMode::CREATE_PARENT, true)) {
943 // Retry, but make sure we don't try to create the parent again.
944 return tryReplaceNode(path, mode - WriteMode::CREATE_PARENT, kj::mv(tryCreate));
945 }
946 // fallthrough
947 default:
948 KJ_FAIL_SYSCALL("create(path)", error, path) { return false; }
949 } else {
950 // Success.
951 return true;
952 }
953 }
954
955 // Either we don't have CREATE mode or the target already exists. We need to perform a
956 // replacement instead.
957
958 KJ_IF_MAYBE(tempPath, createNamedTemporary(path, mode, kj::mv(tryCreate))) {
959 if (tryCommitReplacement(filename, fd, *tempPath, mode)) {
960 return true;
961 } else {
962 KJ_SYSCALL_HANDLE_ERRORS(unlinkat(fd, tempPath->cStr(), 0)) {
963 case ENOENT:
964 // meh
965 break;
966 default:
967 KJ_FAIL_SYSCALL("unlinkat(fd, tempPath, 0)", error, *tempPath);
968 }
969 return false;
970 }
971 } else {
972 // threw, but exceptions are disabled
973 return false;
974 }
975 }
976
977 Maybe<AutoCloseFd> tryOpenFileInternal(PathPtr path, WriteMode mode, bool append) const {
978 uint flags = O_RDWR | MAYBE_O_CLOEXEC;
979 mode_t acl = 0666;
980 if (has(mode, WriteMode::CREATE)) {
981 flags |= O_CREAT;
982 }
983 if (!has(mode, WriteMode::MODIFY)) {
984 if (!has(mode, WriteMode::CREATE)) {
985 // Neither CREATE nor MODIFY -- impossible to satisfy preconditions.
986 return nullptr;
987 }
988 flags |= O_EXCL;
989 }
990 if (append) {
991 flags |= O_APPEND;
992 }
993 if (has(mode, WriteMode::EXECUTABLE)) {
994 acl = 0777;
995 }
996 if (has(mode, WriteMode::PRIVATE)) {
997 acl &= 0700;
998 }
999
1000 auto filename = path.toString();
1001
1002 int newFd;
1003 KJ_SYSCALL_HANDLE_ERRORS(newFd = openat(fd, filename.cStr(), flags, acl)) {
1004 case ENOENT:
1005 if (has(mode, WriteMode::CREATE)) {
1006 // Either:
1007 // - The file is a broken symlink.
1008 // - A parent directory didn't exist.
1009 if (has(mode, WriteMode::CREATE_PARENT) && path.size() > 0 &&
1010 tryMkdir(path.parent(), WriteMode::CREATE | WriteMode::MODIFY |
1011 WriteMode::CREATE_PARENT, true)) {
1012 // Retry, but make sure we don't try to create the parent again.
1013 return tryOpenFileInternal(path, mode - WriteMode::CREATE_PARENT, append);
1014 }
1015
1016 // Check for broken link.
1017 if (!has(mode, WriteMode::MODIFY) &&
1018 faccessat(fd, filename.cStr(), F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
1019 // Yep. We treat this as already-exists, which means in CREATE-only mode this is a
1020 // simple failure.
1021 return nullptr;
1022 }
1023
1024 KJ_FAIL_REQUIRE("parent is not a directory", path) { return nullptr; }
1025 } else {
1026 // MODIFY-only mode. ENOENT = doesn't exist = return null.
1027 return nullptr;
1028 }
1029 case ENOTDIR:
1030 if (!has(mode, WriteMode::CREATE)) {
1031 // MODIFY-only mode. ENOTDIR = parent not a directory = doesn't exist = return null.
1032 return nullptr;
1033 }
1034 goto failed;
1035 case EEXIST:
1036 if (!has(mode, WriteMode::MODIFY)) {
1037 // CREATE-only mode. EEXIST = already exists = return null.
1038 return nullptr;
1039 }
1040 goto failed;
1041 default:
1042 failed:
1043 KJ_FAIL_SYSCALL("openat(fd, path, O_RDWR | ...)", error, path) { return nullptr; }
1044 }
1045
1046 kj::AutoCloseFd result(newFd);
1047#ifndef O_CLOEXEC
1048 setCloexec(result);
1049#endif
1050
1051 return kj::mv(result);
1052 }
1053
1054 bool tryCommitReplacement(StringPtr toPath, int fromDirFd, StringPtr fromPath, WriteMode mode,
1055 int* errorReason = nullptr) const {
1056 if (has(mode, WriteMode::CREATE) && has(mode, WriteMode::MODIFY)) {
1057 // Always clobber. Try it.
1058 KJ_SYSCALL_HANDLE_ERRORS(renameat(fromDirFd, fromPath.cStr(), fd.get(), toPath.cStr())) {
1059 case EISDIR:
1060 case ENOTDIR:
1061 case ENOTEMPTY:
1062 case EEXIST:
1063 // Failed because target exists and due to the various weird quirks of rename(), it
1064 // can't remove it for us. On Linux we can try an exchange instead. On others we have
1065 // to move the target out of the way.
1066 break;
1067 default:
1068 if (errorReason == nullptr) {
1069 KJ_FAIL_SYSCALL("rename(fromPath, toPath)", error, fromPath, toPath) { return false; }
1070 } else {
1071 *errorReason = error;
1072 return false;
1073 }
1074 } else {
1075 return true;
1076 }
1077 }
1078
1079#if __linux__ && defined(RENAME_EXCHANGE)
1080 // Try to use Linux's renameat2() to atomically check preconditions and apply.
1081
1082 if (has(mode, WriteMode::MODIFY)) {
1083 // Use an exchange to implement modification.
1084 //
1085 // We reach this branch when performing a MODIFY-only, or when performing a CREATE | MODIFY
1086 // in which we determined above that there's a node of a different type blocking the
1087 // exchange.
1088
1089 KJ_SYSCALL_HANDLE_ERRORS(syscall(SYS_renameat2,
1090 fromDirFd, fromPath.cStr(), fd.get(), toPath.cStr(), RENAME_EXCHANGE)) {
1091 case ENOSYS:
1092 break; // fall back to traditional means
1093 case ENOENT:
1094 // Presumably because the target path doesn't exist.
1095 if (has(mode, WriteMode::CREATE)) {
1096 KJ_FAIL_ASSERT("rename(tmp, path) claimed path exists but "
1097 "renameat2(fromPath, toPath, EXCAHNGE) said it doest; concurrent modification?",
1098 fromPath, toPath) { return false; }
1099 } else {
1100 // Assume target doesn't exist.
1101 return false;
1102 }
1103 default:
1104 if (errorReason == nullptr) {
1105 KJ_FAIL_SYSCALL("renameat2(fromPath, toPath, EXCHANGE)", error, fromPath, toPath) {
1106 return false;
1107 }
1108 } else {
1109 *errorReason = error;
1110 return false;
1111 }
1112 } else {
1113 // Successful swap! Delete swapped-out content.
1114 rmrf(fromDirFd, fromPath);
1115 return true;
1116 }
1117 } else if (has(mode, WriteMode::CREATE)) {
1118 KJ_SYSCALL_HANDLE_ERRORS(syscall(SYS_renameat2,
1119 fromDirFd, fromPath.cStr(), fd.get(), toPath.cStr(), RENAME_NOREPLACE)) {
1120 case ENOSYS:
1121 break; // fall back to traditional means
1122 case EEXIST:
1123 return false;
1124 default:
1125 if (errorReason == nullptr) {
1126 KJ_FAIL_SYSCALL("renameat2(fromPath, toPath, NOREPLACE)", error, fromPath, toPath) {
1127 return false;
1128 }
1129 } else {
1130 *errorReason = error;
1131 return false;
1132 }
1133 } else {
1134 return true;
1135 }
1136 }
1137#endif
1138
1139 // We're unable to do what we wanted atomically. :(
1140
1141 if (has(mode, WriteMode::CREATE) && has(mode, WriteMode::MODIFY)) {
1142 // We failed to atomically delete the target previously. So now we need to do two calls in
1143 // rapid succession to move the old file away then move the new one into place.
1144
1145 // Find out what kind of file exists at the target path.
1146 struct stat stats;
1147 KJ_SYSCALL(fstatat(fd, toPath.cStr(), &stats, AT_SYMLINK_NOFOLLOW)) { return false; }
1148
1149 // Create a temporary location to move the existing object to. Note that rename() allows a
1150 // non-directory to replace a non-directory, and allows a directory to replace an empty
1151 // directory. So we have to create the right type.
1152 Path toPathParsed = Path::parse(toPath);
1153 String away;
1154 KJ_IF_MAYBE(awayPath, createNamedTemporary(toPathParsed, WriteMode::CREATE,
1155 [&](StringPtr candidatePath) {
1156 if (S_ISDIR(stats.st_mode)) {
1157 return mkdirat(fd, candidatePath.cStr(), 0700);
1158 } else {
1159#if __APPLE__
1160 // No mknodat() on OSX, gotta open() a file, ugh.
1161 int newFd = openat(fd, candidatePath.cStr(),
1162 O_RDWR | O_CREAT | O_EXCL | MAYBE_O_CLOEXEC, 0700);
1163 if (newFd >= 0) close(newFd);
1164 return newFd;
1165#else
1166 return mknodat(fd, candidatePath.cStr(), S_IFREG | 0600, dev_t());
1167#endif
1168 }
1169 })) {
1170 away = kj::mv(*awayPath);
1171 } else {
1172 // Already threw.
1173 return false;
1174 }
1175
1176 // OK, now move the target object to replace the thing we just created.
1177 KJ_SYSCALL(renameat(fd, toPath.cStr(), fd, away.cStr())) {
1178 // Something went wrong. Remove the thing we just created.
1179 unlinkat(fd, away.cStr(), S_ISDIR(stats.st_mode) ? AT_REMOVEDIR : 0);
1180 return false;
1181 }
1182
1183 // Now move the source object to the target location.
1184 KJ_SYSCALL_HANDLE_ERRORS(renameat(fromDirFd, fromPath.cStr(), fd, toPath.cStr())) {
1185 default:
1186 // Try to put things back where they were. If this fails, though, then we have little
1187 // choice but to leave things broken.
1188 KJ_SYSCALL_HANDLE_ERRORS(renameat(fd, away.cStr(), fd, toPath.cStr())) {
1189 default: break;
1190 }
1191
1192 if (errorReason == nullptr) {
1193 KJ_FAIL_SYSCALL("rename(fromPath, toPath)", error, fromPath, toPath) {
1194 return false;
1195 }
1196 } else {
1197 *errorReason = error;
1198 return false;
1199 }
1200 }
1201
1202 // OK, success. Delete the old content.
1203 rmrf(fd, away);
1204 return true;
1205 } else {
1206 // Only one of CREATE or MODIFY is specified, so we need to verify non-atomically that the
1207 // corresponding precondition (must-not-exist or must-exist, respectively) is held.
1208 if (has(mode, WriteMode::CREATE)) {
1209 struct stat stats;
1210 KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd.get(), toPath.cStr(), &stats, AT_SYMLINK_NOFOLLOW)) {
1211 case ENOENT:
1212 case ENOTDIR:
1213 break; // doesn't exist; continue
1214 default:
1215 KJ_FAIL_SYSCALL("fstatat(fd, toPath)", error, toPath) { return false; }
1216 } else {
1217 return false; // already exists; fail
1218 }
1219 } else if (has(mode, WriteMode::MODIFY)) {
1220 struct stat stats;
1221 KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd.get(), toPath.cStr(), &stats, AT_SYMLINK_NOFOLLOW)) {
1222 case ENOENT:
1223 case ENOTDIR:
1224 return false; // doesn't exist; fail
1225 default:
1226 KJ_FAIL_SYSCALL("fstatat(fd, toPath)", error, toPath) { return false; }
1227 } else {
1228 // already exists; continue
1229 }
1230 } else {
1231 // Neither CREATE nor MODIFY.
1232 return false;
1233 }
1234
1235 // Start over in create-and-modify mode.
1236 return tryCommitReplacement(toPath, fromDirFd, fromPath,
1237 WriteMode::CREATE | WriteMode::MODIFY,
1238 errorReason);
1239 }
1240 }
1241
1242 template <typename T>
1243 class ReplacerImpl final: public Directory::Replacer<T> {
1244 public:
1245 ReplacerImpl(Own<const T>&& object, const DiskHandle& handle,
1246 String&& tempPath, String&& path, WriteMode mode)
1247 : Directory::Replacer<T>(mode),
1248 object(kj::mv(object)), handle(handle),
1249 tempPath(kj::mv(tempPath)), path(kj::mv(path)) {}
1250
1251 ~ReplacerImpl() noexcept(false) {
1252 if (!committed) {
1253 rmrf(handle.fd, tempPath);
1254 }
1255 }
1256
1257 const T& get() override {
1258 return *object;
1259 }
1260
1261 bool tryCommit() override {
1262 KJ_ASSERT(!committed, "already committed") { return false; }
1263 return committed = handle.tryCommitReplacement(path, handle.fd, tempPath,
1264 Directory::Replacer<T>::mode);
1265 }
1266
1267 private:
1268 Own<const T> object;
1269 const DiskHandle& handle;
1270 String tempPath;
1271 String path;
1272 bool committed = false; // true if *successfully* committed (in which case tempPath is gone)
1273 };
1274
1275 template <typename T>
1276 class BrokenReplacer final: public Directory::Replacer<T> {
1277 // For recovery path when exceptions are disabled.
1278
1279 public:
1280 BrokenReplacer(Own<const T> inner)
1281 : Directory::Replacer<T>(WriteMode::CREATE | WriteMode::MODIFY),
1282 inner(kj::mv(inner)) {}
1283
1284 const T& get() override { return *inner; }
1285 bool tryCommit() override { return false; }
1286
1287 private:
1288 Own<const T> inner;
1289 };
1290
1291 Maybe<Own<const File>> tryOpenFile(PathPtr path, WriteMode mode) const {
1292 return tryOpenFileInternal(path, mode, false).map(newDiskFile);
1293 }
1294
1295 Own<Directory::Replacer<File>> replaceFile(PathPtr path, WriteMode mode) const {
1296 mode_t acl = 0666;
1297 if (has(mode, WriteMode::EXECUTABLE)) {
1298 acl = 0777;
1299 }
1300 if (has(mode, WriteMode::PRIVATE)) {
1301 acl &= 0700;
1302 }
1303
1304 int newFd_;
1305 KJ_IF_MAYBE(temp, createNamedTemporary(path, mode,
1306 [&](StringPtr candidatePath) {
1307 return newFd_ = openat(fd, candidatePath.cStr(),
1308 O_RDWR | O_CREAT | O_EXCL | MAYBE_O_CLOEXEC, acl);
1309 })) {
1310 AutoCloseFd newFd(newFd_);
1311#ifndef O_CLOEXEC
1312 setCloexec(newFd);
1313#endif
1314 return heap<ReplacerImpl<File>>(newDiskFile(kj::mv(newFd)), *this, kj::mv(*temp),
1315 path.toString(), mode);
1316 } else {
1317 // threw, but exceptions are disabled
1318 return heap<BrokenReplacer<File>>(newInMemoryFile(nullClock()));
1319 }
1320 }
1321
1322 Own<const File> createTemporary() const {
1323 int newFd_;
1324
1325#if __linux__ && defined(O_TMPFILE)
1326 // Use syscall() to work around glibc bug with O_TMPFILE:
1327 // https://sourceware.org/bugzilla/show_bug.cgi?id=17523
1328 KJ_SYSCALL_HANDLE_ERRORS(newFd_ = syscall(
1329 SYS_openat, fd.get(), ".", O_RDWR | O_TMPFILE, 0700)) {
1330 case EOPNOTSUPP:
1331 case EINVAL:
1332 case EISDIR:
1333 // Maybe not supported by this kernel / filesystem. Fall back to below.
1334 break;
1335 default:
1336 KJ_FAIL_SYSCALL("open(O_TMPFILE)", error) { break; }
1337 break;
1338 } else {
1339 AutoCloseFd newFd(newFd_);
1340#ifndef O_CLOEXEC
1341 setCloexec(newFd);
1342#endif
1343 return newDiskFile(kj::mv(newFd));
1344 }
1345#endif
1346
1347 KJ_IF_MAYBE(temp, createNamedTemporary(Path("unnamed"), WriteMode::CREATE,
1348 [&](StringPtr path) {
1349 return newFd_ = openat(fd, path.cStr(), O_RDWR | O_CREAT | O_EXCL | MAYBE_O_CLOEXEC, 0600);
1350 })) {
1351 AutoCloseFd newFd(newFd_);
1352#ifndef O_CLOEXEC
1353 setCloexec(newFd);
1354#endif
1355 auto result = newDiskFile(kj::mv(newFd));
1356 KJ_SYSCALL(unlinkat(fd, temp->cStr(), 0)) { break; }
1357 return kj::mv(result);
1358 } else {
1359 // threw, but exceptions are disabled
1360 return newInMemoryFile(nullClock());
1361 }
1362 }
1363
1364 Maybe<Own<AppendableFile>> tryAppendFile(PathPtr path, WriteMode mode) const {
1365 return tryOpenFileInternal(path, mode, true).map(newDiskAppendableFile);
1366 }
1367
1368 Maybe<Own<const Directory>> tryOpenSubdir(PathPtr path, WriteMode mode) const {
1369 // Must create before open.
1370 if (has(mode, WriteMode::CREATE)) {
1371 if (!tryMkdir(path, mode, false)) return nullptr;
1372 }
1373
1374 return tryOpenSubdirInternal(path).map(newDiskDirectory);
1375 }
1376
1377 Own<Directory::Replacer<Directory>> replaceSubdir(PathPtr path, WriteMode mode) const {
1378 mode_t acl = has(mode, WriteMode::PRIVATE) ? 0700 : 0777;
1379
1380 KJ_IF_MAYBE(temp, createNamedTemporary(path, mode,
1381 [&](StringPtr candidatePath) {
1382 return mkdirat(fd, candidatePath.cStr(), acl);
1383 })) {
1384 int subdirFd_;
1385 KJ_SYSCALL_HANDLE_ERRORS(subdirFd_ = openat(
1386 fd, temp->cStr(), O_RDONLY | MAYBE_O_CLOEXEC | MAYBE_O_DIRECTORY)) {
1387 default:
1388 KJ_FAIL_SYSCALL("open(just-created-temporary)", error);
1389 return heap<BrokenReplacer<Directory>>(newInMemoryDirectory(nullClock()));
1390 }
1391
1392 AutoCloseFd subdirFd(subdirFd_);
1393#ifndef O_CLOEXEC
1394 setCloexec(subdirFd);
1395#endif
1396 return heap<ReplacerImpl<Directory>>(
1397 newDiskDirectory(kj::mv(subdirFd)), *this, kj::mv(*temp), path.toString(), mode);
1398 } else {
1399 // threw, but exceptions are disabled
1400 return heap<BrokenReplacer<Directory>>(newInMemoryDirectory(nullClock()));
1401 }
1402 }
1403
1404 bool trySymlink(PathPtr linkpath, StringPtr content, WriteMode mode) const {
1405 return tryReplaceNode(linkpath, mode, [&](StringPtr candidatePath) {
1406 return symlinkat(content.cStr(), fd, candidatePath.cStr());
1407 });
1408 }
1409
1410 bool tryTransfer(PathPtr toPath, WriteMode toMode,
1411 const Directory& fromDirectory, PathPtr fromPath,
1412 TransferMode mode, const Directory& self) const {
1413 KJ_REQUIRE(toPath.size() > 0, "can't replace self") { return false; }
1414
1415 if (mode == TransferMode::LINK) {
1416 KJ_IF_MAYBE(fromFd, fromDirectory.getFd()) {
1417 // Other is a disk directory, so we can hopefully do an efficient move/link.
1418 return tryReplaceNode(toPath, toMode, [&](StringPtr candidatePath) {
1419 return linkat(*fromFd, fromPath.toString().cStr(), fd, candidatePath.cStr(), 0);
1420 });
1421 };
1422 } else if (mode == TransferMode::MOVE) {
1423 KJ_IF_MAYBE(fromFd, fromDirectory.getFd()) {
1424 KJ_ASSERT(mode == TransferMode::MOVE);
1425
1426 int error = 0;
1427 if (tryCommitReplacement(toPath.toString(), *fromFd, fromPath.toString(), toMode,
1428 &error)) {
1429 return true;
1430 } else switch (error) {
1431 case 0:
1432 // Plain old WriteMode precondition failure.
1433 return false;
1434 case EXDEV:
1435 // Can't move between devices. Fall back to default implementation, which does
1436 // copy/delete.
1437 break;
1438 case ENOENT:
1439 // Either the destination directory doesn't exist or the source path doesn't exist.
1440 // Unfortunately we don't really know. If CREATE_PARENT was provided, try creating
1441 // the parent directory. Otherwise, we don't actually need to distinguish between
1442 // these two errors; just return false.
1443 if (has(toMode, WriteMode::CREATE) && has(toMode, WriteMode::CREATE_PARENT) &&
1444 toPath.size() > 0 && tryMkdir(toPath.parent(),
1445 WriteMode::CREATE | WriteMode::MODIFY | WriteMode::CREATE_PARENT, true)) {
1446 // Retry, but make sure we don't try to create the parent again.
1447 return tryTransfer(toPath, toMode - WriteMode::CREATE_PARENT,
1448 fromDirectory, fromPath, mode, self);
1449 }
1450 return false;
1451 default:
1452 KJ_FAIL_SYSCALL("rename(fromPath, toPath)", error, fromPath, toPath) {
1453 return false;
1454 }
1455 }
1456 }
1457 }
1458
1459 // OK, we can't do anything efficient using the OS. Fall back to default implementation.
1460 return self.Directory::tryTransfer(toPath, toMode, fromDirectory, fromPath, mode);
1461 }
1462
1463 bool tryRemove(PathPtr path) const {
1464 return rmrf(fd, path.toString());
1465 }
1466
1467protected:
1468 AutoCloseFd fd;
1469};
1470
#define FSNODE_METHODS(classname)                                   \
  Maybe<int> getFd() const override { return DiskHandle::getFd(); } \
                                                                    \
  Metadata stat() const override { return DiskHandle::stat(); }     \
  void sync() const override { DiskHandle::sync(); }                \
  void datasync() const override { DiskHandle::datasync(); }        \
                                                                    \
  Own<const FsNode> cloneFsNode() const override {                  \
    return heap<classname>(DiskHandle::clone());                    \
  }
// Expands to the overrides shared by every disk-backed node class: getFd(), stat(), sync() and
// datasync() forward straight to DiskHandle, while cloneFsNode() heap-allocates a new
// `classname` constructed from DiskHandle::clone().
1481
1482class DiskReadableFile final: public ReadableFile, public DiskHandle {
1483public:
1484 DiskReadableFile(AutoCloseFd&& fd): DiskHandle(kj::mv(fd)) {}
1485
1486 FSNODE_METHODS(DiskReadableFile);
1487
1488 size_t read(uint64_t offset, ArrayPtr<byte> buffer) const override {
1489 return DiskHandle::read(offset, buffer);
1490 }
1491 Array<const byte> mmap(uint64_t offset, uint64_t size) const override {
1492 return DiskHandle::mmap(offset, size);
1493 }
1494 Array<byte> mmapPrivate(uint64_t offset, uint64_t size) const override {
1495 return DiskHandle::mmapPrivate(offset, size);
1496 }
1497};
1498
1499class DiskAppendableFile final: public AppendableFile, public DiskHandle, public FdOutputStream {
1500public:
1501 DiskAppendableFile(AutoCloseFd&& fd)
1502 : DiskHandle(kj::mv(fd)),
1503 FdOutputStream(DiskHandle::fd.get()) {}
1504
1505 FSNODE_METHODS(DiskAppendableFile);
1506
1507 void write(const void* buffer, size_t size) override {
1508 FdOutputStream::write(buffer, size);
1509 }
1510 void write(ArrayPtr<const ArrayPtr<const byte>> pieces) override {
1511 FdOutputStream::write(pieces);
1512 }
1513};
1514
1515class DiskFile final: public File, public DiskHandle {
1516public:
1517 DiskFile(AutoCloseFd&& fd): DiskHandle(kj::mv(fd)) {}
1518
1519 FSNODE_METHODS(DiskFile);
1520
1521 size_t read(uint64_t offset, ArrayPtr<byte> buffer) const override {
1522 return DiskHandle::read(offset, buffer);
1523 }
1524 Array<const byte> mmap(uint64_t offset, uint64_t size) const override {
1525 return DiskHandle::mmap(offset, size);
1526 }
1527 Array<byte> mmapPrivate(uint64_t offset, uint64_t size) const override {
1528 return DiskHandle::mmapPrivate(offset, size);
1529 }
1530
1531 void write(uint64_t offset, ArrayPtr<const byte> data) const override {
1532 DiskHandle::write(offset, data);
1533 }
1534 void zero(uint64_t offset, uint64_t size) const override {
1535 DiskHandle::zero(offset, size);
1536 }
1537 void truncate(uint64_t size) const override {
1538 DiskHandle::truncate(size);
1539 }
1540 Own<const WritableFileMapping> mmapWritable(uint64_t offset, uint64_t size) const override {
1541 return DiskHandle::mmapWritable(offset, size);
1542 }
1543 size_t copy(uint64_t offset, const ReadableFile& from,
1544 uint64_t fromOffset, uint64_t size) const override {
1545 KJ_IF_MAYBE(result, DiskHandle::copy(offset, from, fromOffset, size)) {
1546 return *result;
1547 } else {
1548 return File::copy(offset, from, fromOffset, size);
1549 }
1550 }
1551};
1552
1553class DiskReadableDirectory final: public ReadableDirectory, public DiskHandle {
1554public:
1555 DiskReadableDirectory(AutoCloseFd&& fd): DiskHandle(kj::mv(fd)) {}
1556
1557 FSNODE_METHODS(DiskReadableDirectory);
1558
1559 Array<String> listNames() const override { return DiskHandle::listNames(); }
1560 Array<Entry> listEntries() const override { return DiskHandle::listEntries(); }
1561 bool exists(PathPtr path) const override { return DiskHandle::exists(path); }
1562 Maybe<FsNode::Metadata> tryLstat(PathPtr path) const override {
1563 return DiskHandle::tryLstat(path);
1564 }
1565 Maybe<Own<const ReadableFile>> tryOpenFile(PathPtr path) const override {
1566 return DiskHandle::tryOpenFile(path);
1567 }
1568 Maybe<Own<const ReadableDirectory>> tryOpenSubdir(PathPtr path) const override {
1569 return DiskHandle::tryOpenSubdir(path);
1570 }
1571 Maybe<String> tryReadlink(PathPtr path) const override { return DiskHandle::tryReadlink(path); }
1572};
1573
1574class DiskDirectory final: public Directory, public DiskHandle {
1575public:
1576 DiskDirectory(AutoCloseFd&& fd): DiskHandle(kj::mv(fd)) {}
1577
1578 FSNODE_METHODS(DiskDirectory);
1579
1580 Array<String> listNames() const override { return DiskHandle::listNames(); }
1581 Array<Entry> listEntries() const override { return DiskHandle::listEntries(); }
1582 bool exists(PathPtr path) const override { return DiskHandle::exists(path); }
1583 Maybe<FsNode::Metadata> tryLstat(PathPtr path) const override {
1584 return DiskHandle::tryLstat(path);
1585 }
1586 Maybe<Own<const ReadableFile>> tryOpenFile(PathPtr path) const override {
1587 return DiskHandle::tryOpenFile(path);
1588 }
1589 Maybe<Own<const ReadableDirectory>> tryOpenSubdir(PathPtr path) const override {
1590 return DiskHandle::tryOpenSubdir(path);
1591 }
1592 Maybe<String> tryReadlink(PathPtr path) const override { return DiskHandle::tryReadlink(path); }
1593
1594 Maybe<Own<const File>> tryOpenFile(PathPtr path, WriteMode mode) const override {
1595 return DiskHandle::tryOpenFile(path, mode);
1596 }
1597 Own<Replacer<File>> replaceFile(PathPtr path, WriteMode mode) const override {
1598 return DiskHandle::replaceFile(path, mode);
1599 }
1600 Own<const File> createTemporary() const override {
1601 return DiskHandle::createTemporary();
1602 }
1603 Maybe<Own<AppendableFile>> tryAppendFile(PathPtr path, WriteMode mode) const override {
1604 return DiskHandle::tryAppendFile(path, mode);
1605 }
1606 Maybe<Own<const Directory>> tryOpenSubdir(PathPtr path, WriteMode mode) const override {
1607 return DiskHandle::tryOpenSubdir(path, mode);
1608 }
1609 Own<Replacer<Directory>> replaceSubdir(PathPtr path, WriteMode mode) const override {
1610 return DiskHandle::replaceSubdir(path, mode);
1611 }
1612 bool trySymlink(PathPtr linkpath, StringPtr content, WriteMode mode) const override {
1613 return DiskHandle::trySymlink(linkpath, content, mode);
1614 }
1615 bool tryTransfer(PathPtr toPath, WriteMode toMode,
1616 const Directory& fromDirectory, PathPtr fromPath,
1617 TransferMode mode) const override {
1618 return DiskHandle::tryTransfer(toPath, toMode, fromDirectory, fromPath, mode, *this);
1619 }
1620 // tryTransferTo() not implemented because we have nothing special we can do.
1621 bool tryRemove(PathPtr path) const override {
1622 return DiskHandle::tryRemove(path);
1623 }
1624};
1625
1626class DiskFilesystem final: public Filesystem {
1627public:
1628 DiskFilesystem()
1629 : root(openDir("/")),
1630 current(openDir(".")),
1631 currentPath(computeCurrentPath()) {}
1632
1633 const Directory& getRoot() const override {
1634 return root;
1635 }
1636
1637 const Directory& getCurrent() const override {
1638 return current;
1639 }
1640
1641 PathPtr getCurrentPath() const override {
1642 return currentPath;
1643 }
1644
1645private:
1646 DiskDirectory root;
1647 DiskDirectory current;
1648 Path currentPath;
1649
1650 static AutoCloseFd openDir(const char* dir) {
1651 int newFd;
1652 KJ_SYSCALL(newFd = open(dir, O_RDONLY | MAYBE_O_CLOEXEC | MAYBE_O_DIRECTORY));
1653 AutoCloseFd result(newFd);
1654#ifndef O_CLOEXEC
1655 setCloexec(result);
1656#endif
1657 return result;
1658 }
1659
1660 static Path computeCurrentPath() {
1661 // If env var PWD is set and points to the current directory, use it. This captures the current
1662 // path according to the user's shell, which may differ from the kernel's idea in the presence
1663 // of symlinks.
1664 const char* pwd = getenv("PWD");
1665 if (pwd != nullptr) {
1666 Path result = nullptr;
1667 struct stat pwdStat, dotStat;
1668 KJ_IF_MAYBE(e, kj::runCatchingExceptions([&]() {
1669 KJ_ASSERT(pwd[0] == '/') { return; }
1670 result = Path::parse(pwd + 1);
1671 KJ_SYSCALL(lstat(result.toString(true).cStr(), &pwdStat), result) { return; }
1672 KJ_SYSCALL(lstat(".", &dotStat)) { return; }
1673 })) {
1674 // failed, give up on PWD
1675 KJ_LOG(WARNING, "PWD environment variable seems invalid", pwd, *e);
1676 } else {
1677 if (pwdStat.st_ino == dotStat.st_ino &&
1678 pwdStat.st_dev == dotStat.st_dev) {
1679 return kj::mv(result);
1680 } else {
1681 KJ_LOG(WARNING, "PWD environment variable doesn't match current directory", pwd);
1682 }
1683 }
1684 }
1685
1686 size_t size = 256;
1687 retry:
1688 KJ_STACK_ARRAY(char, buf, size, 256, 4096);
1689 if (getcwd(buf.begin(), size) == nullptr) {
1690 int error = errno;
1691 if (error == ENAMETOOLONG) {
1692 size *= 2;
1693 goto retry;
1694 } else {
1695 KJ_FAIL_SYSCALL("getcwd()", error);
1696 }
1697 }
1698
1699 StringPtr path = buf.begin();
1700
1701 // On Linux, the path will start with "(unreachable)" if the working directory is not a subdir
1702 // of the root directory, which is possible via chroot() or mount namespaces.
1703 KJ_ASSERT(!path.startsWith("(unreachable)"),
1704 "working directory is not reachable from root", path);
1705 KJ_ASSERT(path.startsWith("/"), "current directory is not absolute", path);
1706
1707 return Path::parse(path.slice(1));
1708 }
1709};
1710
1711} // namespace
1712
1713Own<ReadableFile> newDiskReadableFile(kj::AutoCloseFd fd) {
1714 return heap<DiskReadableFile>(kj::mv(fd));
1715}
1716Own<AppendableFile> newDiskAppendableFile(kj::AutoCloseFd fd) {
1717 return heap<DiskAppendableFile>(kj::mv(fd));
1718}
1719Own<File> newDiskFile(kj::AutoCloseFd fd) {
1720 return heap<DiskFile>(kj::mv(fd));
1721}
1722Own<ReadableDirectory> newDiskReadableDirectory(kj::AutoCloseFd fd) {
1723 return heap<DiskReadableDirectory>(kj::mv(fd));
1724}
1725Own<Directory> newDiskDirectory(kj::AutoCloseFd fd) {
1726 return heap<DiskDirectory>(kj::mv(fd));
1727}
1728
1729Own<Filesystem> newDiskFilesystem() {
1730 return heap<DiskFilesystem>();
1731}
1732
1733} // namespace kj
1734
1735#endif // !_WIN32
1736