| |
| #define _GNU_SOURCE |
| #include <endian.h> |
| #include <errno.h> |
| #include <fcntl.h> |
| #include <grp.h> |
| #include <sched.h> |
| #include <setjmp.h> |
| #include <signal.h> |
| #include <stdarg.h> |
| #include <stdbool.h> |
| #include <stdint.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <stdbool.h> |
| #include <string.h> |
| #include <unistd.h> |
| |
| #include <sys/ioctl.h> |
| #include <sys/prctl.h> |
| #include <sys/socket.h> |
| #include <sys/types.h> |
| #include <sys/wait.h> |
| |
| #include <linux/limits.h> |
| #include <linux/netlink.h> |
| #include <linux/types.h> |
| |
| #include "getenv.h" |
| #include "log.h" |
| /* Get all of the CLONE_NEW* flags. */ |
| #include "namespace.h" |
| |
| /* |
| * Synchronisation values. Each value is written and read as a whole |
| * `enum sync_t` over the sync socketpairs created in nsexec(). The *_PLS |
| * values are requests sent by stage 1 to stage 0, and the matching *_ACK |
| * value is stage 0's reply. SYNC_GRANDCHILD is sent by stage 0 to stage 2, |
| * and SYNC_CHILD_FINISH is sent by stage 1 and by stage 2 to stage 0. |
| */ |
| enum sync_t { |
| SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ |
| SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */ |
| SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */ |
| SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ |
| SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */ |
| SYNC_CHILD_FINISH = 0x45, /* The child or grandchild has finished. */ |
| SYNC_TIMEOFFSETS_PLS = 0x46, /* Request parent to write timens offsets. */ |
| SYNC_TIMEOFFSETS_ACK = 0x47, /* Timens offsets were written. */ |
| }; |
| |
| #define STAGE_SETUP -1 /* Initial value of current_stage, before nsexec() reaches its setjmp(). */ |
| /* |
| * longjmp() arguments. These select the matching case of the setjmp() switch |
| * in nsexec(): STAGE_PARENT (0) is what the direct setjmp() call returns, |
| * while STAGE_CHILD and STAGE_INIT are passed to clone_parent(). |
| */ |
| #define STAGE_PARENT 0 |
| #define STAGE_CHILD 1 |
| #define STAGE_INIT 2 |
| |
| /* Stores the current stage of nsexec. Set at the top of each stage in nsexec(). */ |
| int current_stage = STAGE_SETUP; |
| |
| /* |
| * Argument block used by clone_parent() and child_func(). |
| * |
| * Assume the stack grows down, so arguments should be above it: the address |
| * of `stack_ptr` (i.e. the end of `stack`) is what clone_parent() passes to |
| * clone() as the child's stack, and `env`/`jmpval` are laid out after it. |
| * Do not reorder the members. |
| */ |
| struct clone_t { |
| /* |
| * Reserve some space for clone() to locate arguments |
| * and retcode in this place |
| */ |
| char stack[4096] __attribute__((aligned(16))); |
| char stack_ptr[0]; /* Zero-length marker for the end of `stack`. */ |
| |
| /* |
| * There's two children. This is used to execute the different code: |
| * child_func() calls longjmp(*env, jmpval). |
| */ |
| jmp_buf *env; |
| int jmpval; |
| }; |
| /* |
| * Bootstrap configuration decoded by nl_parse(). |
| * |
| * `data` is the malloc()ed netlink payload and is released by nl_free(). |
| * Every other pointer member is set by nl_parse() to point *into* that |
| * buffer, so those pointers must not be freed on their own and are no |
| * longer valid once nl_free() has been called. Members whose attribute was |
| * not present keep whatever value the caller initialised them with |
| * (nsexec() zero-initialises the struct). |
| */ |
| struct nlconfig_t { |
| char *data; |
| |
| /* Process settings. */ |
| uint32_t cloneflags; |
| char *oom_score_adj; |
| size_t oom_score_adj_len; |
| |
| /* User namespace settings. */ |
| char *uidmap; |
| size_t uidmap_len; |
| char *gidmap; |
| size_t gidmap_len; |
| char *namespaces; |
| size_t namespaces_len; |
| uint8_t is_setgroup; /* tested as a boolean in nsexec() */ |
| |
| /* Rootless container settings. */ |
| uint8_t is_rootless_euid; /* boolean */ |
| char *uidmappath; |
| size_t uidmappath_len; |
| char *gidmappath; |
| size_t gidmappath_len; |
| |
| /* Time NS offsets. */ |
| char *timensoffset; |
| size_t timensoffset_len; |
| }; |
| |
| /* |
| * List of netlink message types sent to us as part of bootstrapping the init. |
| * These constants are defined in libcontainer/message_linux.go. |
| * |
| * INIT_MSG is the only nlmsg_type that nl_parse() accepts. Each *_ATTR value |
| * is an nla_type that nl_parse() stores into the matching member of |
| * struct nlconfig_t; any other nla_type makes nl_parse() bail. |
| */ |
| #define INIT_MSG 62000 |
| #define CLONE_FLAGS_ATTR 27281 |
| #define NS_PATHS_ATTR 27282 |
| #define UIDMAP_ATTR 27283 |
| #define GIDMAP_ATTR 27284 |
| #define SETGROUP_ATTR 27285 |
| #define OOM_SCORE_ADJ_ATTR 27286 |
| #define ROOTLESS_EUID_ATTR 27287 |
| #define UIDMAPPATH_ATTR 27288 |
| #define GIDMAPPATH_ATTR 27289 |
| #define TIMENSOFFSET_ATTR 27290 |
| |
| /* |
| * Provide setns() through the raw syscall for versions of glibc which don't |
| * include a function for it. The guard below selects glibc 2.x with x < 14 |
| * (for example glibc 2.12). If neither SYS_setns nor __NR_setns is available |
| * the build is stopped with #error. |
| */ |
| #if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14 |
| # define _GNU_SOURCE |
| # include "syscall.h" |
| # if !defined(SYS_setns) && defined(__NR_setns) |
| # define SYS_setns __NR_setns |
| # endif |
| |
| # ifndef SYS_setns |
| # error "setns(2) syscall not supported by glibc version" |
| # endif |
| |
| int setns(int fd, int nstype) /* Returns whatever syscall(SYS_setns, ...) returns. */ |
| { |
| return syscall(SYS_setns, fd, nstype); |
| } |
| #endif /* glibc 2.x, x < 14 */ |
| |
| /* |
| * XXX: This is ugly. File-scope copy of the sync socket used by the current |
| * stage; nsexec() assigns it at the start of every stage (and again in stage |
| * 0 when it switches from the stage-1 socket to the stage-2 socket). |
| */ |
| static int syncfd = -1; |
| |
/*
 * Format a path from @pathfmt (printf-style) and write exactly @data_len bytes
 * of @data to that file with a single write(2).
 *
 * The file is opened with O_RDWR and is never created, so it must already
 * exist.
 *
 * Returns 0 on success and -1 on failure. On failure errno describes the
 * step that failed, because callers in this file inspect it (EPERM, ENOENT):
 *   - ENAMETOOLONG if the formatted path does not fit in PATH_MAX,
 *   - the errno of open(2) or write(2) if one of those failed,
 *   - EIO if write(2) succeeded but wrote fewer than @data_len bytes.
 * The errno value is preserved across the final close(2).
 */
static int write_file(char *data, size_t data_len, char *pathfmt, ...)
{
	char path[PATH_MAX];
	ssize_t written;
	int fd, len, saved_errno, ret = 0;
	va_list ap;

	va_start(ap, pathfmt);
	len = vsnprintf(path, sizeof(path), pathfmt, ap);
	va_end(ap);
	if (len < 0)
		return -1;
	if ((size_t)len >= sizeof(path)) {
		/* Never open a silently truncated path. */
		errno = ENAMETOOLONG;
		return -1;
	}

	fd = open(path, O_RDWR);
	if (fd < 0)
		return -1;

	written = write(fd, data, data_len);
	if (written < 0) {
		ret = -1;
	} else if ((size_t)written != data_len) {
		/* A short write does not set errno; do not leave a stale value. */
		errno = EIO;
		ret = -1;
	}

	/* close(2) may overwrite errno; callers need the original error. */
	saved_errno = errno;
	close(fd);
	if (ret < 0)
		errno = saved_errno;
	return ret;
}
| /* Policy selector for /proc/<pid>/setgroups, consumed by update_setgroups(). */ |
| enum policy_t { |
| SETGROUPS_DEFAULT = 0, /* update_setgroups() writes nothing. */ |
| SETGROUPS_ALLOW, /* update_setgroups() writes "allow". */ |
| SETGROUPS_DENY, /* update_setgroups() writes "deny". */ |
| }; |
| |
| /* This *must* be called before we touch gid_map. */ |
| static void update_setgroups(int pid, enum policy_t setgroup) |
| { |
| char *policy; |
| |
| switch (setgroup) { |
| case SETGROUPS_ALLOW: |
| policy = "allow"; |
| break; |
| case SETGROUPS_DENY: |
| policy = "deny"; |
| break; |
| case SETGROUPS_DEFAULT: |
| default: |
| /* Nothing to do. */ |
| return; |
| } |
| |
| if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) { |
| /* |
| * If the kernel is too old to support /proc/pid/setgroups, |
| * open(2) or write(2) will return ENOENT. This is fine. |
| */ |
| if (errno != ENOENT) |
| bail("failed to write '%s' to /proc/%d/setgroups", policy, pid); |
| } |
| } |
| |
/*
 * Run the external mapping helper @app (a newuidmap/newgidmap style tool) as
 * `app <pid> <map tokens...>` in a forked child with an empty environment,
 * then wait for it to terminate.
 *
 * @map is split on spaces and newlines. The split happens in the forked
 * child, so the caller's copy of @map is left untouched. @map_len is unused.
 *
 * Returns the helper's exit status (0 means success). Returns -1 if the
 * helper was terminated by a signal, so a killed helper is never mistaken
 * for a successful one. bail()s if @app is NULL, if fork/waitpid/execve
 * fail, or if @map has more tokens than fit in the argument vector.
 */
static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len)
{
	int child;

	/*
	 * If @app is NULL, execve will segfault. Just check it here and bail (if
	 * we're in this path, the caller is already getting desperate and there
	 * isn't a backup to this failing). This usually would be a configuration
	 * or programming issue.
	 */
	if (!app)
		bail("mapping tool not present");

	child = fork();
	if (child < 0)
		bail("failed to fork");

	if (!child) {
#define MAX_ARGV 20
		char *argv[MAX_ARGV];
		char *envp[] = { NULL };
		char pid_fmt[16];
		int argc = 0;
		char *next;

		snprintf(pid_fmt, sizeof(pid_fmt), "%d", pid);

		argv[argc++] = (char *)app;
		argv[argc++] = pid_fmt;

		/*
		 * Convert the map string into a list of arguments that
		 * newuidmap/newgidmap can understand. The last slot of argv is
		 * reserved for the terminating NULL that execve(2) requires.
		 */
		while (*map != '\0') {
			if (argc >= MAX_ARGV - 1)
				bail("too many mapping arguments for %s", app);
			argv[argc++] = map;
			next = strpbrk(map, "\n ");
			if (next == NULL)
				break;
			*next++ = '\0';
			map = next + strspn(next, "\n ");
		}
		argv[argc] = NULL;

		execve(app, argv, envp);
		bail("failed to execv");
	} else {
		int status;

		while (true) {
			if (waitpid(child, &status, 0) < 0) {
				if (errno == EINTR)
					continue;
				bail("failed to waitpid");
			}
			if (WIFEXITED(status))
				return WEXITSTATUS(status);
			/* WEXITSTATUS is meaningless (and may be 0) for a signalled child. */
			if (WIFSIGNALED(status))
				return -1;
		}
	}

	return -1;
}
| |
| static void update_uidmap(const char *path, int pid, char *map, size_t map_len) |
| { |
| if (map == NULL || map_len == 0) |
| return; |
| |
| write_log(DEBUG, "update /proc/%d/uid_map to '%s'", pid, map); |
| if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) { |
| if (errno != EPERM) |
| bail("failed to update /proc/%d/uid_map", pid); |
| write_log(DEBUG, "update /proc/%d/uid_map got -EPERM (trying %s)", pid, path); |
| if (try_mapping_tool(path, pid, map, map_len)) |
| bail("failed to use newuid map on %d", pid); |
| } |
| } |
| |
| static void update_gidmap(const char *path, int pid, char *map, size_t map_len) |
| { |
| if (map == NULL || map_len == 0) |
| return; |
| |
| write_log(DEBUG, "update /proc/%d/gid_map to '%s'", pid, map); |
| if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) { |
| if (errno != EPERM) |
| bail("failed to update /proc/%d/gid_map", pid); |
| write_log(DEBUG, "update /proc/%d/gid_map got -EPERM (trying %s)", pid, path); |
| if (try_mapping_tool(path, pid, map, map_len)) |
| bail("failed to use newgid map on %d", pid); |
| } |
| } |
| |
| static void update_oom_score_adj(char *data, size_t len) |
| { |
| if (data == NULL || len == 0) |
| return; |
| |
| write_log(DEBUG, "update /proc/self/oom_score_adj to '%s'", data); |
| if (write_file(data, len, "/proc/self/oom_score_adj") < 0) |
| bail("failed to update /proc/self/oom_score_adj"); |
| } |
| |
| /* |
| * A dummy function that just jumps to the given jumpval. |
| * |
| * This is the entry point handed to clone() by clone_parent(). @arg is the |
| * struct clone_t built there. The function never returns: it longjmp()s to |
| * the jmp_buf stored in the struct, so the matching setjmp() returns jmpval. |
| */ |
| static int child_func(void *arg) __attribute__((noinline)); |
| static int child_func(void *arg) |
| { |
| struct clone_t *ca = (struct clone_t *)arg; |
| longjmp(*ca->env, ca->jmpval); |
| } |
| /* |
| * Spawn a new process with clone(CLONE_PARENT | SIGCHLD) whose first action |
| * (via child_func()) is longjmp(*env, jmpval), i.e. it resumes at the |
| * setjmp() that filled @env with setjmp() returning @jmpval. |
| * |
| * The child runs on the `stack` member of the local struct clone_t; the end |
| * of that buffer (`stack_ptr`) is passed as the stack pointer. |
| * |
| * Returns clone()'s return value unchanged (callers treat < 0 as failure). |
| */ |
| static int clone_parent(jmp_buf *env, int jmpval) __attribute__((noinline)); |
| static int clone_parent(jmp_buf *env, int jmpval) |
| { |
| struct clone_t ca = { |
| .env = env, |
| .jmpval = jmpval, |
| }; |
| |
| return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca); |
| } |
| |
/*
 * Returns the clone(2) flag for a namespace, given the name of a namespace
 * ("cgroup", "ipc", "mnt", "net", "pid", "user", "uts" or "time"). The match
 * is exact and case-sensitive.
 */
static int nsflag(char *name)
{
	static const struct {
		const char *name;
		int flag;
	} known[] = {
		{ "cgroup", CLONE_NEWCGROUP },
		{ "ipc", CLONE_NEWIPC },
		{ "mnt", CLONE_NEWNS },
		{ "net", CLONE_NEWNET },
		{ "pid", CLONE_NEWPID },
		{ "user", CLONE_NEWUSER },
		{ "uts", CLONE_NEWUTS },
		{ "time", CLONE_NEWTIME },
	};
	size_t i;

	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
		if (strcmp(name, known[i].name) == 0)
			return known[i].flag;
	}

	/* If we don't recognise a name, fallback to 0. */
	return 0;
}
| |
/*
 * Read a host-endian uint32_t from the first four bytes at @buf.
 *
 * The bytes are copied with memcpy() rather than read through a cast
 * pointer, so @buf does not need to be 4-byte aligned and no strict-aliasing
 * rule is violated.
 */
static uint32_t readint32(char *buf)
{
	uint32_t value;

	memcpy(&value, buf, sizeof(value));
	return value;
}
| |
/* Return the first byte at @buf, interpreted as an unsigned 8-bit value. */
static uint8_t readint8(char *buf)
{
	return (uint8_t)buf[0];
}
| |
| static void nl_parse(int fd, struct nlconfig_t *config) |
| { |
| size_t len, size; |
| struct nlmsghdr hdr; |
| char *data, *current; |
| |
| /* Retrieve the netlink header. */ |
| len = read(fd, &hdr, NLMSG_HDRLEN); |
| if (len != NLMSG_HDRLEN) |
| bail("invalid netlink header length %zu", len); |
| |
| if (hdr.nlmsg_type == NLMSG_ERROR) |
| bail("failed to read netlink message"); |
| |
| if (hdr.nlmsg_type != INIT_MSG) |
| bail("unexpected msg type %d", hdr.nlmsg_type); |
| |
| /* Retrieve data. */ |
| size = NLMSG_PAYLOAD(&hdr, 0); |
| current = data = malloc(size); |
| if (!data) |
| bail("failed to allocate %zu bytes of memory for nl_payload", size); |
| |
| len = read(fd, data, size); |
| if (len != size) |
| bail("failed to read netlink payload, %zu != %zu", len, size); |
| |
| /* Parse the netlink payload. */ |
| config->data = data; |
| while (current < data + size) { |
| struct nlattr *nlattr = (struct nlattr *)current; |
| size_t payload_len = nlattr->nla_len - NLA_HDRLEN; |
| |
| /* Advance to payload. */ |
| current += NLA_HDRLEN; |
| |
| /* Handle payload. */ |
| switch (nlattr->nla_type) { |
| case CLONE_FLAGS_ATTR: |
| config->cloneflags = readint32(current); |
| break; |
| case ROOTLESS_EUID_ATTR: |
| config->is_rootless_euid = readint8(current); /* boolean */ |
| break; |
| case OOM_SCORE_ADJ_ATTR: |
| config->oom_score_adj = current; |
| config->oom_score_adj_len = payload_len; |
| break; |
| case NS_PATHS_ATTR: |
| config->namespaces = current; |
| config->namespaces_len = payload_len; |
| break; |
| case UIDMAP_ATTR: |
| config->uidmap = current; |
| config->uidmap_len = payload_len; |
| break; |
| case GIDMAP_ATTR: |
| config->gidmap = current; |
| config->gidmap_len = payload_len; |
| break; |
| case UIDMAPPATH_ATTR: |
| config->uidmappath = current; |
| config->uidmappath_len = payload_len; |
| break; |
| case GIDMAPPATH_ATTR: |
| config->gidmappath = current; |
| config->gidmappath_len = payload_len; |
| break; |
| case SETGROUP_ATTR: |
| config->is_setgroup = readint8(current); |
| break; |
| case TIMENSOFFSET_ATTR: |
| config->timensoffset = current; |
| config->timensoffset_len = payload_len; |
| break; |
| default: |
| bail("unknown netlink message type %d", nlattr->nla_type); |
| } |
| |
| current += NLA_ALIGN(payload_len); |
| } |
| } |
| |
| void nl_free(struct nlconfig_t *config) |
| { |
| free(config->data); |
| } |
| |
| void join_namespaces(char *nslist) |
| { |
| int num = 0, i; |
| char *saveptr = NULL; |
| char *namespace = strtok_r(nslist, ",", &saveptr); |
| struct namespace_t { |
| int fd; |
| char type[PATH_MAX]; |
| char path[PATH_MAX]; |
| } *namespaces = NULL; |
| |
| if (!namespace || !strlen(namespace) || !strlen(nslist)) |
| bail("ns paths are empty"); |
| |
| /* |
| * We have to open the file descriptors first, since after |
| * we join the mnt namespace we might no longer be able to |
| * access the paths. |
| */ |
| do { |
| int fd; |
| char *path; |
| struct namespace_t *ns; |
| |
| /* Resize the namespace array. */ |
| namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t)); |
| if (!namespaces) |
| bail("failed to reallocate namespace array"); |
| ns = &namespaces[num - 1]; |
| |
| /* Split 'ns:path'. */ |
| path = strstr(namespace, ":"); |
| if (!path) |
| bail("failed to parse %s", namespace); |
| *path++ = '\0'; |
| |
| fd = open(path, O_RDONLY); |
| if (fd < 0) |
| bail("failed to open %s", path); |
| |
| ns->fd = fd; |
| strncpy(ns->type, namespace, PATH_MAX - 1); |
| strncpy(ns->path, path, PATH_MAX - 1); |
| ns->path[PATH_MAX - 1] = '\0'; |
| } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); |
| |
| /* |
| * The ordering in which we join namespaces is important. We should |
| * always join the user namespace *first*. This is all guaranteed |
| * from the container_linux.go side of this, so we're just going to |
| * follow the order given to us. |
| */ |
| |
| for (i = 0; i < num; i++) { |
| struct namespace_t *ns = &namespaces[i]; |
| int flag = nsflag(ns->type); |
| |
| write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path); |
| if (setns(ns->fd, flag) < 0) |
| bail("failed to setns into %s namespace", ns->type); |
| |
| /* |
| * If we change user namespaces, make sure we switch to root in the |
| * namespace (this matches the logic for unshare(CLONE_NEWUSER)), lots |
| * of things can break if we aren't the right user. See |
| * <https://github.com/opencontainers/runc/issues/4466> for one example. |
| */ |
| if (flag == CLONE_NEWUSER) { |
| if (setresuid(0, 0, 0) < 0) |
| bail("failed to become root in user namespace"); |
| } |
| |
| close(ns->fd); |
| } |
| |
| free(namespaces); |
| } |
| |
/*
 * kill(2) wrapper that only signals a real, positive pid. For pid <= 0
 * (callers in this file use -1 for "not spawned yet") nothing is sent and 0
 * is returned; otherwise the result of kill() is returned.
 */
static inline int sane_kill(pid_t pid, int signum)
{
	return (pid > 0) ? kill(pid, signum) : 0;
}
| |
| void try_unshare(int flags, const char *msg) |
| { |
| write_log(DEBUG, "unshare %s", msg); |
| /* |
| * Kernels prior to v4.3 may return EINVAL on unshare when another process |
| * reads runc's /proc/$PID/status or /proc/$PID/maps. To work around this, |
| * retry on EINVAL a few times. |
| */ |
| int retries = 5; |
| for (; retries > 0; retries--) { |
| if (unshare(flags) == 0) { |
| return; |
| } |
| if (errno != EINVAL) |
| break; |
| } |
| bail("failed to unshare %s", msg); |
| } |
| |
| static void update_timens_offsets(pid_t pid, char *map, size_t map_len) |
| { |
| if (map == NULL || map_len == 0) |
| return; |
| write_log(DEBUG, "update /proc/%d/timens_offsets to '%s'", pid, map); |
| if (write_file(map, map_len, "/proc/%d/timens_offsets", pid) < 0) |
| bail("failed to update /proc/%d/timens_offsets", pid); |
| } |
| /* |
| * Bootstrap entry point. |
| * |
| * If getenv_int("_LIBCONTAINER_INITPIPE") yields a negative value this |
| * returns immediately. Otherwise the netlink bootstrap config is read from |
| * that fd and the work is split over three processes which all resume from |
| * the same setjmp() and are told apart by its return value: |
| * |
| *  - stage 0 (STAGE_PARENT): spawns stage 1, serves its requests (user |
| *    mappings, timens offsets, pid forwarding), writes both child pids to |
| *    the init pipe as JSON, tells stage 2 to run, then exit(0)s. |
| *  - stage 1 (STAGE_CHILD): joins and unshares namespaces, spawns stage 2, |
| *    reports its pid to stage 0, then exit(0)s. |
| *  - stage 2 (STAGE_INIT): waits for SYNC_GRANDCHILD, calls setsid(), |
| *    setuid(0), setgid(0) (and setgroups() when configured), frees the |
| *    config and is the only stage that returns to the caller. |
| * |
| * Every failure is reported through bail(). |
| */ |
| void nsexec(void) |
| { |
| int pipenum; |
| jmp_buf env; |
| int sync_child_pipe[2], sync_grandchild_pipe[2]; |
| struct nlconfig_t config = { 0 }; |
| |
| /* |
| * Setup a pipe to send logs to the parent. This should happen |
| * first, because bail will use that pipe. |
| */ |
| setup_logpipe(); |
| |
| /* |
| * Get the init pipe fd from the environment. The init pipe is used to |
| * read the bootstrap data and tell the parent what the new pids are |
| * after the setup is done. |
| */ |
| pipenum = getenv_int("_LIBCONTAINER_INITPIPE"); |
| if (pipenum < 0) { |
| /* We are not a runc init. Just return to go runtime. */ |
| return; |
| } |
| |
| write_log(DEBUG, "=> nsexec container setup"); |
| |
| /* Parse all of the netlink configuration. */ |
| nl_parse(pipenum, &config); |
| |
| /* Set oom_score_adj. This has to be done before !dumpable because |
| * /proc/self/oom_score_adj is not writeable unless you're a privileged |
| * user (if !dumpable is set). All children inherit their parent's |
| * oom_score_adj value on fork(2) so this will always be propagated |
| * properly. |
| */ |
| update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len); |
| |
| /* |
| * Make the process non-dumpable, to avoid various race conditions that |
| * could cause processes in namespaces we're joining to access host |
| * resources (or potentially execute code). |
| * |
| * However, if the number of namespaces we are joining is 0, we are not |
| * going to be switching to a different security context. Thus setting |
| * ourselves to be non-dumpable only breaks things (like rootless |
| * containers), which is the recommendation from the kernel folks. |
| */ |
| if (config.namespaces) { |
| write_log(DEBUG, "set process as non-dumpable"); |
| if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) |
| bail("failed to set process as non-dumpable"); |
| } |
| |
| /* Pipe so we can tell the child when we've finished setting up. */ |
| if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0) |
| bail("failed to setup sync pipe between parent and child"); |
| |
| /* |
| * We need a new socketpair to sync with grandchild so we don't have |
| * race condition with child. |
| */ |
| if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0) |
| bail("failed to setup sync pipe between parent and grandchild"); |
| |
| /* TODO: Currently we aren't dealing with child deaths properly. */ |
| |
| /* |
| * Okay, so this is quite annoying. |
| * |
| * In order for this unsharing code to be more extensible we need to split |
| * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case |
| * would be if we did clone(CLONE_NEWUSER) and the other namespaces |
| * separately, but because of SELinux issues we cannot really do that. But |
| * we cannot just dump the namespace flags into clone(...) because several |
| * usecases (such as rootless containers) require more granularity around |
| * the namespace setup. In addition, some older kernels had issues where |
| * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot |
| * handle this while also dealing with SELinux so we choose SELinux support |
| * over broken kernel support). |
| * |
| * However, if we unshare(2) the user namespace *before* we clone(2), then |
| * all hell breaks loose. |
| * |
| * The parent no longer has permissions to do many things (unshare(2) drops |
| * all capabilities in your old namespace), and the container cannot be set |
| * up to have more than one {uid,gid} mapping. This is obviously less than |
| * ideal. In order to fix this, we have to first clone(2) and then unshare. |
| * |
| * Unfortunately, it's not as simple as that. We have to fork to enter the |
| * PID namespace (the PID namespace only applies to children). Since we'll |
| * have to double-fork, this clone_parent() call won't be able to get the |
| * PID of the _actual_ init process (without doing more synchronisation than |
| * I can deal with at the moment). So we'll just get the parent to send it |
| * for us, the only job of this process is to update |
| * /proc/pid/{setgroups,uid_map,gid_map}. |
| * |
| * And as a result of the above, we also need to setns(2) in the first child |
| * because if we join a PID namespace in the topmost parent then our child |
| * will be in that namespace (and it will not be able to give us a PID value |
| * that makes sense without resorting to sending things with cmsg). |
| * |
| * This also deals with an older issue caused by dumping cloneflags into |
| * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so |
| * we have to unshare(2) before clone(2) in order to do this. This was fixed |
| * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was |
| * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're |
| * aware, the last mainline kernel which had this bug was Linux 3.12. |
| * However, we cannot comment on which kernels the broken patch was |
| * backported to. |
| * |
| * -- Aleksa "what has my life come to?" Sarai |
| */ |
| |
| switch (setjmp(env)) { |
| /* |
| * Stage 0: We're in the parent. Our job is just to create a new child |
| * (stage 1: STAGE_CHILD) process and write its uid_map and |
| * gid_map. That process will go on to create a new process, then |
| * it will send us its PID which we will send to the bootstrap |
| * process. |
| */ |
| case STAGE_PARENT:{ |
| int len; |
| pid_t stage1_pid = -1, stage2_pid = -1; |
| bool stage1_complete, stage2_complete; |
| |
| /* For debugging. */ |
| current_stage = STAGE_PARENT; |
| prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0); |
| write_log(DEBUG, "~> nsexec stage-0"); |
| |
| /* Start the process of getting a container. */ |
| write_log(DEBUG, "spawn stage-1"); |
| stage1_pid = clone_parent(&env, STAGE_CHILD); |
| if (stage1_pid < 0) |
| bail("unable to spawn stage-1"); |
| |
| syncfd = sync_child_pipe[1]; |
| if (close(sync_child_pipe[0]) < 0) |
| bail("failed to close sync_child_pipe[0] fd"); |
| |
| /* |
| * State machine for synchronisation with the children. We only |
| * return once both the child and grandchild are ready. |
| */ |
| write_log(DEBUG, "-> stage-1 synchronisation loop"); |
| stage1_complete = false; |
| while (!stage1_complete) { |
| enum sync_t s; |
| |
| if (read(syncfd, &s, sizeof(s)) != sizeof(s)) |
| bail("failed to sync with stage-1: next state"); |
| |
| switch (s) { |
| case SYNC_USERMAP_PLS: |
| write_log(DEBUG, "stage-1 requested userns mappings"); |
| |
| /* |
| * Enable setgroups(2) if we've been asked to. But we also |
| * have to explicitly disable setgroups(2) if we're |
| * creating a rootless container for single-entry mapping. |
| * i.e. config.is_setgroup == false. |
| * (this is required since Linux 3.19). |
| * |
| * For rootless multi-entry mapping, config.is_setgroup shall be true and |
| * newuidmap/newgidmap shall be used. |
| */ |
| if (config.is_rootless_euid && !config.is_setgroup) |
| update_setgroups(stage1_pid, SETGROUPS_DENY); |
| |
| /* Set up mappings. */ |
| update_uidmap(config.uidmappath, stage1_pid, config.uidmap, config.uidmap_len); |
| update_gidmap(config.gidmappath, stage1_pid, config.gidmap, config.gidmap_len); |
| |
| s = SYNC_USERMAP_ACK; |
| if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { |
| sane_kill(stage1_pid, SIGKILL); |
| sane_kill(stage2_pid, SIGKILL); |
| bail("failed to sync with stage-1: write(SYNC_USERMAP_ACK)"); |
| } |
| break; |
| case SYNC_RECVPID_PLS: |
| write_log(DEBUG, "stage-1 requested pid to be forwarded"); |
| |
| /* Get the stage-2 pid. */ |
| if (read(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) { |
| sane_kill(stage1_pid, SIGKILL); |
| bail("failed to sync with stage-1: read(stage2_pid)"); |
| } |
| |
| /* Send ACK. */ |
| s = SYNC_RECVPID_ACK; |
| if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { |
| sane_kill(stage1_pid, SIGKILL); |
| sane_kill(stage2_pid, SIGKILL); |
| bail("failed to sync with stage-1: write(SYNC_RECVPID_ACK)"); |
| } |
| |
| /* |
| * Send both the stage-1 and stage-2 pids back to runc. |
| * runc needs the stage-2 to continue process management, |
| * but because stage-1 was spawned with CLONE_PARENT we |
| * cannot reap it within stage-0 and thus we need to ask |
| * runc to reap the zombie for us. |
| */ |
| write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc", |
| stage1_pid, stage2_pid); |
| len = |
| dprintf(pipenum, "{\"stage1_pid\":%d,\"stage2_pid\":%d}\n", stage1_pid, |
| stage2_pid); |
| if (len < 0) { |
| sane_kill(stage1_pid, SIGKILL); |
| sane_kill(stage2_pid, SIGKILL); |
| bail("failed to sync with runc: write(pid-JSON)"); |
| } |
| break; |
| case SYNC_TIMEOFFSETS_PLS: |
| write_log(DEBUG, "stage-1 requested timens offsets to be configured"); |
| update_timens_offsets(stage1_pid, config.timensoffset, config.timensoffset_len); |
| s = SYNC_TIMEOFFSETS_ACK; |
| if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { |
| sane_kill(stage1_pid, SIGKILL); |
| bail("failed to sync with child: write(SYNC_TIMEOFFSETS_ACK)"); |
| } |
| break; |
| case SYNC_CHILD_FINISH: |
| write_log(DEBUG, "stage-1 complete"); |
| stage1_complete = true; |
| break; |
| default: |
| bail("unexpected sync value: %u", s); |
| } |
| } |
| write_log(DEBUG, "<- stage-1 synchronisation loop"); |
| |
| /* Now sync with grandchild. */ |
| syncfd = sync_grandchild_pipe[1]; |
| if (close(sync_grandchild_pipe[0]) < 0) |
| bail("failed to close sync_grandchild_pipe[0] fd"); |
| |
| write_log(DEBUG, "-> stage-2 synchronisation loop"); |
| stage2_complete = false; |
| while (!stage2_complete) { |
| enum sync_t s; |
| |
| write_log(DEBUG, "signalling stage-2 to run"); |
| s = SYNC_GRANDCHILD; |
| if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { |
| sane_kill(stage2_pid, SIGKILL); |
| bail("failed to sync with child: write(SYNC_GRANDCHILD)"); |
| } |
| |
| if (read(syncfd, &s, sizeof(s)) != sizeof(s)) |
| bail("failed to sync with child: next state"); |
| |
| switch (s) { |
| case SYNC_CHILD_FINISH: |
| write_log(DEBUG, "stage-2 complete"); |
| stage2_complete = true; |
| break; |
| default: |
| bail("unexpected sync value: %u", s); |
| } |
| } |
| write_log(DEBUG, "<- stage-2 synchronisation loop"); |
| write_log(DEBUG, "<~ nsexec stage-0"); |
| exit(0); |
| } |
| break; |
| |
| /* |
| * Stage 1: We're in the first child process. Our job is to join any |
| * provided namespaces in the netlink payload and unshare all of |
| * the requested namespaces. If we've been asked to CLONE_NEWUSER, |
| * we will ask our parent (stage 0) to set up our user mappings |
| * for us. Then, we create a new child (stage 2: STAGE_INIT) for |
| * PID namespace. We then send the child's PID to our parent |
| * (stage 0). |
| */ |
| case STAGE_CHILD:{ |
| pid_t stage2_pid = -1; |
| enum sync_t s; |
| |
| /* For debugging. */ |
| current_stage = STAGE_CHILD; |
| |
| /* We're in a child and thus need to tell the parent if we die. */ |
| syncfd = sync_child_pipe[0]; |
| if (close(sync_child_pipe[1]) < 0) |
| bail("failed to close sync_child_pipe[1] fd"); |
| |
| /* For debugging. */ |
| prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0); |
| write_log(DEBUG, "~> nsexec stage-1"); |
| |
| /* |
| * We need to setns first. We cannot do this earlier (in stage 0) |
| * because of the fact that we forked to get here (the PID of |
| * [stage 2: STAGE_INIT]) would be meaningless). We could send it |
| * using cmsg(3) but that's just annoying. |
| */ |
| if (config.namespaces) |
| join_namespaces(config.namespaces); |
| |
| /* |
| * Deal with user namespaces first. They are quite special, as they |
| * affect our ability to unshare other namespaces and are used as |
| * context for privilege checks. |
| * |
| * We don't unshare all namespaces in one go. The reason for this |
| * is that, while the kernel documentation may claim otherwise, |
| * there are certain cases where unsharing all namespaces at once |
| * will result in namespace objects being owned incorrectly. |
| * Ideally we should just fix these kernel bugs, but it's better to |
| * be safe than sorry, and fix them separately. |
| * |
| * A specific case of this is that the SELinux label of the |
| * internal kern-mount that mqueue uses will be incorrect if the |
| * UTS namespace is cloned before the USER namespace is mapped. |
| * I've also heard of similar problems with the network namespace |
| * in some scenarios. This also mirrors how LXC deals with this |
| * problem. |
| */ |
| if (config.cloneflags & CLONE_NEWUSER) { |
| try_unshare(CLONE_NEWUSER, "user namespace"); |
| config.cloneflags &= ~CLONE_NEWUSER; |
| |
| /* |
| * We need to set ourselves as dumpable temporarily so that the |
| * parent process can write to our procfs files. |
| */ |
| if (config.namespaces) { |
| write_log(DEBUG, "temporarily set process as dumpable"); |
| if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0) |
| bail("failed to temporarily set process as dumpable"); |
| } |
| |
| /* |
| * We don't have the privileges to do any mapping here (see the |
| * clone_parent rant). So signal stage-0 to do the mapping for |
| * us. |
| */ |
| write_log(DEBUG, "request stage-0 to map user namespace"); |
| s = SYNC_USERMAP_PLS; |
| if (write(syncfd, &s, sizeof(s)) != sizeof(s)) |
| bail("failed to sync with parent: write(SYNC_USERMAP_PLS)"); |
| |
| /* ... wait for mapping ... */ |
| write_log(DEBUG, "waiting stage-0 to complete the mapping of user namespace"); |
| if (read(syncfd, &s, sizeof(s)) != sizeof(s)) |
| bail("failed to sync with parent: read(SYNC_USERMAP_ACK)"); |
| if (s != SYNC_USERMAP_ACK) |
| bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s); |
| |
| /* Revert temporary re-dumpable setting. */ |
| if (config.namespaces) { |
| write_log(DEBUG, "re-set process as non-dumpable"); |
| if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) |
| bail("failed to re-set process as non-dumpable"); |
| } |
| |
| /* Become root in the namespace proper. */ |
| if (setresuid(0, 0, 0) < 0) |
| bail("failed to become root in user namespace"); |
| } |
| |
| /* |
| * Unshare all of the namespaces. Now, it should be noted that this |
| * ordering might break in the future (especially with rootless |
| * containers). But for now, it's not possible to split this into |
| * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues. |
| * |
| * Note that we don't merge this with clone() because there were |
| * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) |
| * was broken, so we'll just do it the long way anyway. |
| */ |
| try_unshare(config.cloneflags, "remaining namespaces"); |
| |
| if (config.timensoffset) { |
| write_log(DEBUG, "request stage-0 to write timens offsets"); |
| |
| s = SYNC_TIMEOFFSETS_PLS; |
| if (write(syncfd, &s, sizeof(s)) != sizeof(s)) |
| bail("failed to sync with parent: write(SYNC_TIMEOFFSETS_PLS)"); |
| |
| if (read(syncfd, &s, sizeof(s)) != sizeof(s)) |
| bail("failed to sync with parent: read(SYNC_TIMEOFFSETS_ACK)"); |
| if (s != SYNC_TIMEOFFSETS_ACK) |
| bail("failed to sync with parent: SYNC_TIMEOFFSETS_ACK: got %u", s); |
| } |
| |
| /* |
| * TODO: What about non-namespace clone flags that we're dropping here? |
| * |
| * We fork again because of PID namespace, setns(2) or unshare(2) don't |
| * change the PID namespace of the calling process, because doing so |
| * would change the caller's idea of its own PID (as reported by getpid()), |
| * which would break many applications and libraries, so we must fork |
| * to actually enter the new PID namespace. |
| */ |
| write_log(DEBUG, "spawn stage-2"); |
| stage2_pid = clone_parent(&env, STAGE_INIT); |
| if (stage2_pid < 0) |
| bail("unable to spawn stage-2"); |
| |
| /* Send the child to our parent, which knows what it's doing. */ |
| write_log(DEBUG, "request stage-0 to forward stage-2 pid (%d)", stage2_pid); |
| s = SYNC_RECVPID_PLS; |
| if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { |
| sane_kill(stage2_pid, SIGKILL); |
| bail("failed to sync with parent: write(SYNC_RECVPID_PLS)"); |
| } |
| if (write(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) { |
| sane_kill(stage2_pid, SIGKILL); |
| bail("failed to sync with parent: write(stage2_pid)"); |
| } |
| |
| /* ... wait for parent to get the pid ... */ |
| if (read(syncfd, &s, sizeof(s)) != sizeof(s)) { |
| sane_kill(stage2_pid, SIGKILL); |
| bail("failed to sync with parent: read(SYNC_RECVPID_ACK)"); |
| } |
| if (s != SYNC_RECVPID_ACK) { |
| sane_kill(stage2_pid, SIGKILL); |
| bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s); |
| } |
| |
| write_log(DEBUG, "signal completion to stage-0"); |
| s = SYNC_CHILD_FINISH; |
| if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { |
| sane_kill(stage2_pid, SIGKILL); |
| bail("failed to sync with parent: write(SYNC_CHILD_FINISH)"); |
| } |
| |
| /* Our work is done. [Stage 2: STAGE_INIT] is doing the rest of the work. */ |
| write_log(DEBUG, "<~ nsexec stage-1"); |
| exit(0); |
| } |
| break; |
| |
| /* |
| * Stage 2: We're the final child process, and the only process that will |
| * actually return to the Go runtime. Our job is to just do the |
| * final cleanup steps and then return to the Go runtime to allow |
| * init_linux.go to run. |
| */ |
| case STAGE_INIT:{ |
| /* |
| * We're inside the grandchild now, having longjmp()ed here from |
| * child_func() after stage 1 called clone_parent(&env, STAGE_INIT). |
| */ |
| enum sync_t s; |
| |
| /* For debugging. */ |
| current_stage = STAGE_INIT; |
| |
| /* We're in a child and thus need to tell the parent if we die. */ |
| syncfd = sync_grandchild_pipe[0]; |
| if (close(sync_grandchild_pipe[1]) < 0) |
| bail("failed to close sync_grandchild_pipe[1] fd"); |
| |
| if (close(sync_child_pipe[0]) < 0) |
| bail("failed to close sync_child_pipe[0] fd"); |
| |
| /* For debugging. */ |
| prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0); |
| write_log(DEBUG, "~> nsexec stage-2"); |
| |
| if (read(syncfd, &s, sizeof(s)) != sizeof(s)) |
| bail("failed to sync with parent: read(SYNC_GRANDCHILD)"); |
| if (s != SYNC_GRANDCHILD) |
| bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s); |
| |
| if (setsid() < 0) |
| bail("setsid failed"); |
| |
| if (setuid(0) < 0) |
| bail("setuid failed"); |
| |
| if (setgid(0) < 0) |
| bail("setgid failed"); |
| |
| if (!config.is_rootless_euid && config.is_setgroup) { |
| if (setgroups(0, NULL) < 0) |
| bail("setgroups failed"); |
| } |
| |
| write_log(DEBUG, "signal completion to stage-0"); |
| s = SYNC_CHILD_FINISH; |
| if (write(syncfd, &s, sizeof(s)) != sizeof(s)) |
| bail("failed to sync with parent: write(SYNC_CHILD_FINISH)"); |
| |
| /* Close sync pipes. */ |
| if (close(sync_grandchild_pipe[0]) < 0) |
| bail("failed to close sync_grandchild_pipe[0] fd"); |
| |
| /* Free netlink data. */ |
| nl_free(&config); |
| |
| /* Finish executing, let the Go runtime take over. */ |
| write_log(DEBUG, "<= nsexec container setup"); |
| write_log(DEBUG, "booting up go runtime ..."); |
| return; |
| } |
| break; |
| default: |
| bail("unexpected jump value"); |
| } |
| |
| /* Should never be reached. */ |
| bail("should never be reached"); |
| } |