| 1 | /**************************************************************************** |
| 2 | ** |
| 3 | ** Copyright (C) 2020 Intel Corporation. |
| 4 | ** |
| 5 | ** Permission is hereby granted, free of charge, to any person obtaining a copy |
| 6 | ** of this software and associated documentation files (the "Software"), to deal |
| 7 | ** in the Software without restriction, including without limitation the rights |
| 8 | ** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 9 | ** copies of the Software, and to permit persons to whom the Software is |
| 10 | ** furnished to do so, subject to the following conditions: |
| 11 | ** |
| 12 | ** The above copyright notice and this permission notice shall be included in |
| 13 | ** all copies or substantial portions of the Software. |
| 14 | ** |
| 15 | ** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 16 | ** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 17 | ** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 18 | ** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 19 | ** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 20 | ** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| 21 | ** THE SOFTWARE. |
| 22 | ** |
| 23 | ****************************************************************************/ |
| 24 | |
| 25 | #ifndef _GNU_SOURCE |
| 26 | # define _GNU_SOURCE |
| 27 | #endif |
| 28 | |
| 29 | #include "forkfd.h" |
| 30 | |
| 31 | #include <errno.h> |
| 32 | #include <fcntl.h> |
| 33 | #include <limits.h> |
| 34 | #include <sched.h> |
| 35 | #include <signal.h> |
| 36 | #include <stdio.h> |
| 37 | #include <stdlib.h> |
| 38 | #include <string.h> |
| 39 | #include <sys/resource.h> |
| 40 | #include <sys/syscall.h> |
| 41 | #include <sys/types.h> |
| 42 | #include <sys/wait.h> |
| 43 | #include <unistd.h> |
| 44 | |
| 45 | #include "forkfd_atomic.h" |
| 46 | |
/*
 * Fallback values for the pidfd-related constants, used only when the system
 * headers included above did not already provide them.
 * CLONE_PIDFD is passed as a clone flag and P_PIDFD as the waitid id type.
 */
#if !defined(CLONE_PIDFD)
#  define CLONE_PIDFD   0x00001000
#endif
#if !defined(P_PIDFD)
#  define P_PIDFD       3
#endif
| 53 | |
| 54 | // in forkfd.c |
| 55 | static int convertForkfdWaitFlagsToWaitFlags(int ffdoptions); |
| 56 | static void convertStatusToForkfdInfo(int status, struct forkfd_info *info); |
| 57 | |
| 58 | static ffd_atomic_int system_forkfd_state = FFD_ATOMIC_INIT(0); |
| 59 | |
| 60 | static int sys_waitid(int which, int pid_or_pidfd, siginfo_t *infop, int options, |
| 61 | struct rusage *ru) |
| 62 | { |
| 63 | /* use the waitid raw system call, which has an extra parameter that glibc |
| 64 | * doesn't offer to us */ |
| 65 | return syscall(__NR_waitid, which, pid_or_pidfd, infop, options, ru); |
| 66 | } |
| 67 | |
/*
 * Invoke the raw clone system call with the given flags, asking the kernel to
 * store its parent-side output in *ptid.
 *
 * The child stack, child TID pointer and TLS value are always passed as
 * zero/NULL; only the position of the arguments differs per architecture.
 * On architectures not listed below the call is not attempted: errno is set
 * to ENOSYS and -1 is returned.
 *
 * Returns the result of syscall(), narrowed to int.
 */
static int sys_clone(unsigned long cloneflags, int *ptid)
{
    void *stack = NULL;
    int *child_tid = NULL;
    unsigned long tls = 0;
    long rc;

#if defined(__NR_clone2)
    size_t stack_len = 0;
    rc = syscall(__NR_clone2, cloneflags, stack, stack_len, ptid, child_tid, tls);
#elif defined(__cris__) || defined(__s390__)
    /* CONFIG_CLONE_BACKWARDS2 architectures: the stack comes before the flags */
    rc = syscall(__NR_clone, stack, cloneflags, ptid, tls, child_tid);
#elif defined(__microblaze__)
    /* CONFIG_CLONE_BACKWARDS3 architectures: an extra stack size argument */
    size_t stack_len = 0;
    rc = syscall(__NR_clone, cloneflags, stack, stack_len, ptid, tls, child_tid);
#elif defined(__arc__) || defined(__arm__) || defined(__aarch64__) || defined(__mips__) || \
    defined(__nds32__) || defined(__hppa__) || defined(__powerpc__) || defined(__i386__) || \
    defined(__x86_64__) || defined(__xtensa__) || defined(__alpha__) || defined(__riscv)
    /* CONFIG_CLONE_BACKWARDS architectures swap the last two arguments, but
     * both are zero here, so a single argument order serves all of them. */
    rc = syscall(__NR_clone, cloneflags, stack, ptid, child_tid, tls);
#else
    /* Unknown clone argument layout: refuse rather than guess. */
    (void) stack;
    (void) child_tid;
    (void) tls;
    errno = ENOSYS;
    rc = -1;
#endif

    return (int) rc;
}
| 97 | |
| 98 | static int detect_clone_pidfd_support() |
| 99 | { |
| 100 | /* |
| 101 | * Detect support for CLONE_PIDFD and P_PIDFD. Support was added in steps: |
| 102 | * - Linux 5.2 added CLONE_PIDFD support in clone(2) system call |
| 103 | * - Linux 5.2 added pidfd_send_signal(2) |
| 104 | * - Linux 5.3 added support for poll(2) on pidfds |
| 105 | * - Linux 5.3 added clone3(2) |
| 106 | * - Linux 5.4 added P_PIDFD support in waitid(2) |
| 107 | * |
| 108 | * We need CLONE_PIDFD and the poll(2) support. We could emulate the |
| 109 | * P_PIDFD support by reading the PID from /proc/self/fdinfo/n, which works |
| 110 | * in Linux 5.2, but without poll(2), we can't guarantee the functionality |
| 111 | * anyway. |
| 112 | * |
| 113 | * So we detect by trying to waitid(2) on a positive file descriptor that |
| 114 | * is definitely closed (INT_MAX). If P_PIDFD is supported, waitid(2) will |
| 115 | * return EBADF. If it isn't supported, it returns EINVAL (as it would for |
| 116 | * a negative file descriptor). This will succeed on Linux 5.4. |
| 117 | * |
| 118 | * We could have instead detected by the existence of the clone3(2) system |
| 119 | * call, but for that we would have needed to wait for __NR_clone3 to show |
| 120 | * up on the libcs. We choose to go via the waitid(2) route, which requires |
| 121 | * platform-independent constants only. It would have simplified the |
| 122 | * sys_clone() mess above... |
| 123 | */ |
| 124 | |
| 125 | sys_waitid(P_PIDFD, INT_MAX, NULL, WEXITED|WNOHANG, NULL); |
| 126 | return errno == EBADF ? 1 : -1; |
| 127 | } |
| 128 | |
| 129 | int system_has_forkfd() |
| 130 | { |
| 131 | return ffd_atomic_load(&system_forkfd_state, FFD_ATOMIC_RELAXED) > 0; |
| 132 | } |
| 133 | |
| 134 | int system_forkfd(int flags, pid_t *ppid, int *system) |
| 135 | { |
| 136 | pid_t pid; |
| 137 | int pidfd; |
| 138 | |
| 139 | int state = ffd_atomic_load(&system_forkfd_state, FFD_ATOMIC_RELAXED); |
| 140 | if (state == 0) { |
| 141 | state = detect_clone_pidfd_support(); |
| 142 | ffd_atomic_store(&system_forkfd_state, state, FFD_ATOMIC_RELAXED); |
| 143 | } |
| 144 | if (state < 0) { |
| 145 | *system = 0; |
| 146 | return state; |
| 147 | } |
| 148 | |
| 149 | *system = 1; |
| 150 | unsigned long cloneflags = CLONE_PIDFD | SIGCHLD; |
| 151 | pid = sys_clone(cloneflags, &pidfd); |
| 152 | if (pid < 0) |
| 153 | return pid; |
| 154 | if (ppid) |
| 155 | *ppid = pid; |
| 156 | |
| 157 | if (pid == 0) { |
| 158 | /* Child process */ |
| 159 | return FFD_CHILD_PROCESS; |
| 160 | } |
| 161 | |
| 162 | /* parent process */ |
| 163 | if ((flags & FFD_CLOEXEC) == 0) { |
| 164 | /* pidfd defaults to O_CLOEXEC */ |
| 165 | fcntl(pidfd, F_SETFD, 0); |
| 166 | } |
| 167 | if (flags & FFD_NONBLOCK) |
| 168 | fcntl(pidfd, F_SETFL, fcntl(pidfd, F_GETFL) | O_NONBLOCK); |
| 169 | return pidfd; |
| 170 | } |
| 171 | |
| 172 | int system_forkfd_wait(int ffd, struct forkfd_info *info, int ffdoptions, struct rusage *rusage) |
| 173 | { |
| 174 | siginfo_t si; |
| 175 | int ret; |
| 176 | int options = convertForkfdWaitFlagsToWaitFlags(ffdoptions); |
| 177 | |
| 178 | if ((options & WNOHANG) == 0) { |
| 179 | /* check if the file descriptor is non-blocking */ |
| 180 | ret = fcntl(ffd, F_GETFL); |
| 181 | if (ret == -1) |
| 182 | return ret; |
| 183 | if (ret & O_NONBLOCK) |
| 184 | options |= WNOHANG; |
| 185 | } |
| 186 | |
| 187 | ret = sys_waitid(P_PIDFD, ffd, &si, options, rusage); |
| 188 | if (ret == -1 && errno == ECHILD) { |
| 189 | errno = EWOULDBLOCK; |
| 190 | } else if (ret == 0 && info) { |
| 191 | info->code = si.si_code; |
| 192 | info->status = si.si_status; |
| 193 | } |
| 194 | return ret; |
| 195 | } |
| 196 | |