/* libcontainer/nsenter/nsexec.c - third_party/runc - Git at Google */


 #define _GNU_SOURCE
 #include <endian.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <grp.h>
 #include <sched.h>
 #include <setjmp.h>
 #include <signal.h>
 #include <stdarg.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdbool.h>
 #include <string.h>
 #include <unistd.h>

 #include <sys/ioctl.h>
 #include <sys/prctl.h>
 #include <sys/socket.h>
 #include <sys/types.h>
 #include <sys/wait.h>

 #include <linux/limits.h>
 #include <linux/netlink.h>
 #include <linux/types.h>

 #include "getenv.h"
 #include "log.h"
 /* Get all of the CLONE_NEW* flags. */
 #include "namespace.h"

 /*
  * Synchronisation values exchanged between the nsexec stages over the sync
  * socketpairs. The numeric values are spelled out explicitly because the
  * enum is written to and read from the sockets as-is.
  */
 enum sync_t {
 	/* Request parent to map our users. */
 	SYNC_USERMAP_PLS = 0x40,
 	/* Mapping finished by the parent. */
 	SYNC_USERMAP_ACK = 0x41,
 	/* Tell parent we're sending the PID. */
 	SYNC_RECVPID_PLS = 0x42,
 	/* PID was correctly received by parent. */
 	SYNC_RECVPID_ACK = 0x43,
 	/* The grandchild is ready to run. */
 	SYNC_GRANDCHILD = 0x44,
 	/* The child or grandchild has finished. */
 	SYNC_CHILD_FINISH = 0x45,
 	/* Request parent to write timens offsets. */
 	SYNC_TIMEOFFSETS_PLS = 0x46,
 	/* Timens offsets were written. */
 	SYNC_TIMEOFFSETS_ACK = 0x47,
 };

 /*
  * Stage identifiers.
  *
  * STAGE_SETUP is the value of current_stage before any stage has been
  * entered. It is parenthesised so the negative constant cannot combine with
  * neighbouring operators when the macro is expanded inside an expression.
  */
 #define STAGE_SETUP  (-1)
 /*
  * setjmp()/longjmp() values. STAGE_PARENT (0) is what the initial, direct
  * setjmp() call yields; STAGE_CHILD and STAGE_INIT are passed to longjmp()
  * by the cloned children.
  */
 #define STAGE_PARENT  0
 #define STAGE_CHILD   1
 #define STAGE_INIT    2

 /* Stores the current stage of nsexec. */
 int current_stage = STAGE_SETUP;

 /*
  * Argument block handed to clone() by clone_parent(): it bundles the child's
  * stack together with the data child_func() needs to longjmp() back into
  * nsexec().
  *
  * Assume the stack grows down, so arguments should be above it.
  */
 struct clone_t {
 	/*
 	 * Reserve some space for clone() to locate arguments
 	 * and retcode in this place
 	 */
 	char stack[4096] __attribute__((aligned(16)));
 	/*
 	 * Zero-length marker placed directly after stack[]; its address is what
 	 * clone_parent() passes to clone() as the child stack pointer.
 	 */
 	char stack_ptr[0];

 	/* There's two children. This is used to execute the different code. */
 	/* Jump buffer child_func() longjmp()s to. */
 	jmp_buf *env;
 	/* Value handed to longjmp(), i.e. which stage the child becomes. */
 	int jmpval;
 };

 /*
  * Bootstrap configuration decoded by nl_parse(). All `char *` members other
  * than `data` point into the `data` buffer, so they become invalid once
  * nl_free() has released it.
  */
 struct nlconfig_t {
 	/* malloc()ed copy of the netlink payload; released by nl_free(). */
 	char *data;

 	/* Process settings. */
 	uint32_t cloneflags;		/* CLONE_FLAGS_ATTR payload */
 	char *oom_score_adj;		/* OOM_SCORE_ADJ_ATTR payload */
 	size_t oom_score_adj_len;

 	/* User namespace settings. */
 	char *uidmap;			/* UIDMAP_ATTR payload */
 	size_t uidmap_len;
 	char *gidmap;			/* GIDMAP_ATTR payload */
 	size_t gidmap_len;
 	char *namespaces;		/* NS_PATHS_ATTR payload, parsed by join_namespaces() */
 	size_t namespaces_len;
 	uint8_t is_setgroup;		/* SETGROUP_ATTR payload, used as a boolean */

 	/* Rootless container settings. */
 	uint8_t is_rootless_euid;	/* boolean */
 	char *uidmappath;		/* UIDMAPPATH_ATTR payload: mapping tool for uid_map */
 	size_t uidmappath_len;
 	char *gidmappath;		/* GIDMAPPATH_ATTR payload: mapping tool for gid_map */
 	size_t gidmappath_len;

 	/* Time NS offsets. */
 	char *timensoffset;		/* TIMENSOFFSET_ATTR payload */
 	size_t timensoffset_len;
 };

 /*
  * List of netlink message types sent to us as part of bootstrapping the init.
  * These constants are defined in libcontainer/message_linux.go.
  *
  * INIT_MSG is the only nlmsg_type nl_parse() accepts; the *_ATTR values are
  * the nla_type of the attributes inside that message, each of which fills
  * the corresponding member(s) of struct nlconfig_t.
  */
 #define INIT_MSG		62000
 #define CLONE_FLAGS_ATTR	27281
 #define NS_PATHS_ATTR		27282
 #define UIDMAP_ATTR		27283
 #define GIDMAP_ATTR		27284
 #define SETGROUP_ATTR		27285
 #define OOM_SCORE_ADJ_ATTR	27286
 #define ROOTLESS_EUID_ATTR	27287
 #define UIDMAPPATH_ATTR		27288
 #define GIDMAPPATH_ATTR		27289
 #define TIMENSOFFSET_ATTR	27290

 /*
  * Use the raw syscall for versions of glibc which don't include a function for
  * it, namely (glibc 2.12).
  *
  * This whole section is only compiled when building against glibc 2.x with a
  * minor version below 14.
  */
 #if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
 #  define _GNU_SOURCE
 #  include "syscall.h"
 /* Derive SYS_setns from the kernel's __NR_setns when only the latter exists. */
 #  if !defined(SYS_setns) && defined(__NR_setns)
 #    define SYS_setns __NR_setns
 #  endif

 /* Without a syscall number there is no way to provide setns(); fail the build. */
 #  ifndef SYS_setns
 #    error "setns(2) syscall not supported by glibc version"
 #  endif

 /*
  * Fallback setns() wrapper: forwards @fd and @nstype to the raw setns syscall
  * and returns whatever syscall() returns.
  */
 int setns(int fd, int nstype)
 {
 	return syscall(SYS_setns, fd, nstype);
 }
 #endif

 /*
  * File descriptor of the sync socket the current stage talks over. It starts
  * out as -1 and is assigned by each stage in nsexec().
  *
  * XXX: This is ugly.
  */
 static int syncfd = -1;

 /*
  * Write @data_len bytes from @data into an existing file. The path is built
  * from the printf-style @pathfmt and the arguments that follow it; the file
  * is opened O_RDWR and is never created.
  *
  * Returns 0 on success and -1 on failure. Callers inspect errno after a
  * failure, so it is kept meaningful:
  *   - ENAMETOOLONG if the formatted path does not fit into PATH_MAX,
  *   - EIO if write(2) accepted fewer bytes than requested,
  *   - otherwise whatever open(2) or write(2) reported.
  */
 static int write_file(char *data, size_t data_len, char *pathfmt, ...)
 {
 	int fd, len, saved_errno, ret = 0;
 	ssize_t written;
 	char path[PATH_MAX];
 	va_list ap;

 	va_start(ap, pathfmt);
 	len = vsnprintf(path, PATH_MAX, pathfmt, ap);
 	va_end(ap);
 	if (len < 0)
 		return -1;
 	/* vsnprintf() returns the untruncated length; refuse a clipped path. */
 	if (len >= PATH_MAX) {
 		errno = ENAMETOOLONG;
 		return -1;
 	}

 	fd = open(path, O_RDWR);
 	if (fd < 0)
 		return -1;

 	written = write(fd, data, data_len);
 	if (written < 0) {
 		ret = -1;
 	} else if ((size_t)written != data_len) {
 		/* A short write sets no errno of its own; report one explicitly. */
 		errno = EIO;
 		ret = -1;
 	}

 	/* close() must not overwrite the errno the caller is going to look at. */
 	saved_errno = errno;
 	close(fd);
 	errno = saved_errno;
 	return ret;
 }

 /* Value update_setgroups() should write to /proc/<pid>/setgroups, if any. */
 enum policy_t {
 	/* Leave the setgroups file untouched. */
 	SETGROUPS_DEFAULT = 0,
 	/* Write "allow". */
 	SETGROUPS_ALLOW = 1,
 	/* Write "deny". */
 	SETGROUPS_DENY = 2,
 };

 /*
  * Write the requested setgroups policy to /proc/@pid/setgroups.
  * SETGROUPS_DEFAULT (or any unrecognised value) is a no-op.
  *
  * This *must* be called before we touch gid_map.
  */
 static void update_setgroups(int pid, enum policy_t setgroup)
 {
 	char *policy = NULL;

 	if (setgroup == SETGROUPS_ALLOW)
 		policy = "allow";
 	else if (setgroup == SETGROUPS_DENY)
 		policy = "deny";

 	/* Nothing to do. */
 	if (policy == NULL)
 		return;

 	if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) >= 0)
 		return;

 	/*
 	 * If the kernel is too old to support /proc/pid/setgroups,
 	 * open(2) or write(2) will return ENOENT. This is fine.
 	 */
 	if (errno != ENOENT)
 		bail("failed to write '%s' to /proc/%d/setgroups", policy, pid);
 }

 /*
  * Run the external mapping tool @app as `<app> <pid> <map tokens...>` in a
  * forked child and wait for it to finish.
  *
  * @map is split in place on spaces and newlines (inside the child only) to
  * form the argument list. @map_len is currently unused.
  *
  * Returns the tool's exit status (0 on success), or -1 if the tool was
  * terminated by a signal. Bails out if @app is NULL, if fork/waitpid fail,
  * if the map has more tokens than fit in the argument vector, or if execve
  * fails in the child.
  */
 static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len)
 {
 	int child;

 	(void)map_len;

 	/*
 	 * If @app is NULL, execve will segfault. Just check it here and bail (if
 	 * we're in this path, the caller is already getting desperate and there
 	 * isn't a backup to this failing). This usually would be a configuration
 	 * or programming issue.
 	 */
 	if (!app)
 		bail("mapping tool not present");

 	child = fork();
 	if (child < 0)
 		bail("failed to fork");

 	if (!child) {
 #define MAX_ARGV 20
 		char *argv[MAX_ARGV];
 		char *envp[] = { NULL };
 		char pid_fmt[16];
 		int argc = 0;
 		char *next;

 		snprintf(pid_fmt, 16, "%d", pid);

 		argv[argc++] = (char *)app;
 		argv[argc++] = pid_fmt;
 		/*
 		 * Convert the map string into a list of argument that
 		 * newuidmap/newgidmap can understand.
 		 *
 		 * The last slot of argv is reserved for the terminating NULL that
 		 * execve requires, so the list is always terminated no matter how
 		 * the loop ends (including a map without a trailing separator).
 		 */
 		while (*map != '\0') {
 			if (argc >= MAX_ARGV - 1)
 				bail("too many mapping arguments for %s", app);
 			argv[argc++] = map;
 			next = strpbrk(map, "\n ");
 			if (next == NULL)
 				break;
 			*next++ = '\0';
 			map = next + strspn(next, "\n ");
 		}
 		argv[argc] = NULL;

 		execve(app, argv, envp);
 		bail("failed to execv");
 	} else {
 		int status;

 		while (true) {
 			if (waitpid(child, &status, 0) < 0) {
 				if (errno == EINTR)
 					continue;
 				bail("failed to waitpid");
 			}
 			if (WIFEXITED(status))
 				return WEXITSTATUS(status);
 			/*
 			 * WEXITSTATUS() is only meaningful for a normal exit; a tool
 			 * killed by a signal must not be mistaken for success.
 			 */
 			if (WIFSIGNALED(status))
 				return -1;
 		}
 	}

 	return -1;
 }

 /*
  * Install the uid mapping @map for process @pid. The mapping is first written
  * straight to /proc/@pid/uid_map; if that fails with EPERM the external tool
  * at @path is tried instead. Any other failure is fatal. A NULL or empty map
  * is a no-op.
  */
 static void update_uidmap(const char *path, int pid, char *map, size_t map_len)
 {
 	bool have_map = (map != NULL) && (map_len != 0);

 	if (!have_map)
 		return;

 	write_log(DEBUG, "update /proc/%d/uid_map to '%s'", pid, map);
 	if (write_file(map, map_len, "/proc/%d/uid_map", pid) >= 0)
 		return;

 	/* Only a permission failure is worth retrying through the helper. */
 	if (errno != EPERM)
 		bail("failed to update /proc/%d/uid_map", pid);

 	write_log(DEBUG, "update /proc/%d/uid_map got -EPERM (trying %s)", pid, path);
 	if (try_mapping_tool(path, pid, map, map_len) != 0)
 		bail("failed to use newuid map on %d", pid);
 }

 /*
  * Install the gid mapping @map for process @pid. The mapping is first written
  * straight to /proc/@pid/gid_map; if that fails with EPERM the external tool
  * at @path is tried instead. Any other failure is fatal. A NULL or empty map
  * is a no-op.
  */
 static void update_gidmap(const char *path, int pid, char *map, size_t map_len)
 {
 	bool have_map = (map != NULL) && (map_len != 0);

 	if (!have_map)
 		return;

 	write_log(DEBUG, "update /proc/%d/gid_map to '%s'", pid, map);
 	if (write_file(map, map_len, "/proc/%d/gid_map", pid) >= 0)
 		return;

 	/* Only a permission failure is worth retrying through the helper. */
 	if (errno != EPERM)
 		bail("failed to update /proc/%d/gid_map", pid);

 	write_log(DEBUG, "update /proc/%d/gid_map got -EPERM (trying %s)", pid, path);
 	if (try_mapping_tool(path, pid, map, map_len) != 0)
 		bail("failed to use newgid map on %d", pid);
 }

 /*
  * Write the @len bytes at @data to /proc/self/oom_score_adj, bailing out on
  * failure. A NULL or empty value leaves the current setting alone.
  */
 static void update_oom_score_adj(char *data, size_t len)
 {
 	int rc;

 	if (!data || !len)
 		return;

 	write_log(DEBUG, "update /proc/self/oom_score_adj to '%s'", data);
 	rc = write_file(data, len, "/proc/self/oom_score_adj");
 	if (rc < 0)
 		bail("failed to update /proc/self/oom_score_adj");
 }

 /*
  * Entry point of a process created by clone_parent(). @arg is the
  * struct clone_t prepared by the caller; all this does is longjmp() to the
  * saved environment with the requested value, so it never returns.
  */
 static int child_func(void *arg) __attribute__((noinline));
 static int child_func(void *arg)
 {
 	struct clone_t *args = arg;
 	jmp_buf *target = args->env;
 	int value = args->jmpval;

 	longjmp(*target, value);
 }

 /*
  * Create a new process with clone() that resumes at the setjmp() point saved
  * in @env, where setjmp() will appear to return @jmpval.
  *
  * The new process starts in child_func() on the stack embedded in the local
  * `ca`, and child_func() immediately longjmp()s away from it. The process is
  * created with CLONE_PARENT and with SIGCHLD as its termination signal.
  *
  * Returns whatever clone() returns to the caller (callers treat a negative
  * value as failure).
  */
 static int clone_parent(jmp_buf *env, int jmpval) __attribute__((noinline));
 static int clone_parent(jmp_buf *env, int jmpval)
 {
 	/* Stack plus longjmp() arguments for the child; see struct clone_t. */
 	struct clone_t ca = {
 		.env = env,
 		.jmpval = jmpval,
 	};

 	/* ca.stack_ptr is the address just past ca.stack, i.e. the stack top. */
 	return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
 }

 /*
  * Map a namespace name ("cgroup", "ipc", "mnt", "net", "pid", "user", "uts",
  * "time") to its CLONE_NEW* flag. Unknown names yield 0.
  */
 static int nsflag(char *name)
 {
 	static const struct {
 		const char *name;
 		int flag;
 	} known[] = {
 		{ "cgroup", CLONE_NEWCGROUP },
 		{ "ipc", CLONE_NEWIPC },
 		{ "mnt", CLONE_NEWNS },
 		{ "net", CLONE_NEWNET },
 		{ "pid", CLONE_NEWPID },
 		{ "user", CLONE_NEWUSER },
 		{ "uts", CLONE_NEWUTS },
 		{ "time", CLONE_NEWTIME },
 	};
 	size_t idx;

 	for (idx = 0; idx < sizeof(known) / sizeof(known[0]); idx++) {
 		if (strcmp(name, known[idx].name) == 0)
 			return known[idx].flag;
 	}

 	/* If we don't recognise a name, fallback to 0. */
 	return 0;
 }

 /*
  * Read a host-endian 32-bit unsigned integer from the first four bytes of
  * @buf. The bytes are copied with memcpy() so that @buf needs no particular
  * alignment and no incompatible-pointer access is performed.
  */
 static uint32_t readint32(char *buf)
 {
 	uint32_t value;

 	memcpy(&value, buf, sizeof(value));
 	return value;
 }

 /* Read the first byte of @buf as an unsigned 8-bit integer. */
 static uint8_t readint8(char *buf)
 {
 	const uint8_t *bytes = (const uint8_t *)buf;

 	return bytes[0];
 }

 /*
  * Read one INIT_MSG netlink message from @fd and decode its attributes into
  * @config.
  *
  * The payload is read into a freshly malloc()ed buffer that is stored in
  * config->data; every pointer member filled in here points into that buffer,
  * which the caller releases with nl_free().
  *
  * Any read error, unexpected message type, unknown attribute type, or
  * attribute whose length does not fit inside the payload is fatal (bail).
  */
 static void nl_parse(int fd, struct nlconfig_t *config)
 {
 	ssize_t len;
 	size_t size;
 	struct nlmsghdr hdr;
 	char *data, *current, *end;

 	/* Retrieve the netlink header. */
 	len = read(fd, &hdr, NLMSG_HDRLEN);
 	if (len != NLMSG_HDRLEN)
 		bail("invalid netlink header length %zd", len);

 	if (hdr.nlmsg_type == NLMSG_ERROR)
 		bail("failed to read netlink message");

 	if (hdr.nlmsg_type != INIT_MSG)
 		bail("unexpected msg type %d", hdr.nlmsg_type);

 	/* A length below the header size would make the payload size wrap. */
 	if (hdr.nlmsg_len < (size_t)NLMSG_HDRLEN)
 		bail("invalid netlink message length %u", hdr.nlmsg_len);

 	/* Retrieve data. */
 	size = NLMSG_PAYLOAD(&hdr, 0);
 	current = data = malloc(size);
 	if (!data)
 		bail("failed to allocate %zu bytes of memory for nl_payload", size);

 	len = read(fd, data, size);
 	if (len < 0 || (size_t)len != size)
 		bail("failed to read netlink payload, %zd != %zu", len, size);

 	/* Parse the netlink payload. */
 	config->data = data;
 	end = data + size;
 	while (current < end) {
 		struct nlattr *nlattr = (struct nlattr *)current;
 		size_t remaining = (size_t)(end - current);
 		size_t payload_len, advance;

 		/* The attribute header and its payload must lie inside the buffer. */
 		if (remaining < (size_t)NLA_HDRLEN)
 			bail("truncated netlink attribute header");
 		if (nlattr->nla_len < (size_t)NLA_HDRLEN || nlattr->nla_len > remaining)
 			bail("invalid netlink attribute length %u", nlattr->nla_len);
 		payload_len = nlattr->nla_len - NLA_HDRLEN;

 		/* Advance to payload. */
 		current += NLA_HDRLEN;
 		remaining -= NLA_HDRLEN;

 		/* Handle payload. */
 		switch (nlattr->nla_type) {
 		case CLONE_FLAGS_ATTR:
 			if (payload_len < sizeof(uint32_t))
 				bail("netlink attribute %d is too short", nlattr->nla_type);
 			config->cloneflags = readint32(current);
 			break;
 		case ROOTLESS_EUID_ATTR:
 			if (payload_len < sizeof(uint8_t))
 				bail("netlink attribute %d is too short", nlattr->nla_type);
 			config->is_rootless_euid = readint8(current);	/* boolean */
 			break;
 		case OOM_SCORE_ADJ_ATTR:
 			config->oom_score_adj = current;
 			config->oom_score_adj_len = payload_len;
 			break;
 		case NS_PATHS_ATTR:
 			config->namespaces = current;
 			config->namespaces_len = payload_len;
 			break;
 		case UIDMAP_ATTR:
 			config->uidmap = current;
 			config->uidmap_len = payload_len;
 			break;
 		case GIDMAP_ATTR:
 			config->gidmap = current;
 			config->gidmap_len = payload_len;
 			break;
 		case UIDMAPPATH_ATTR:
 			config->uidmappath = current;
 			config->uidmappath_len = payload_len;
 			break;
 		case GIDMAPPATH_ATTR:
 			config->gidmappath = current;
 			config->gidmappath_len = payload_len;
 			break;
 		case SETGROUP_ATTR:
 			if (payload_len < sizeof(uint8_t))
 				bail("netlink attribute %d is too short", nlattr->nla_type);
 			config->is_setgroup = readint8(current);
 			break;
 		case TIMENSOFFSET_ATTR:
 			config->timensoffset = current;
 			config->timensoffset_len = payload_len;
 			break;
 		default:
 			bail("unknown netlink message type %d", nlattr->nla_type);
 		}

 		/*
 		 * Skip the payload plus its alignment padding, but never step
 		 * beyond the end of the buffer if the final padding is absent.
 		 */
 		advance = NLA_ALIGN(payload_len);
 		if (advance > remaining)
 			advance = remaining;
 		current += advance;
 	}
 }

 /*
  * Release the payload buffer allocated by nl_parse(). The other pointer
  * members of @config point into that buffer and must not be used afterwards.
  * config->data is reset to NULL so that calling this again is harmless.
  */
 void nl_free(struct nlconfig_t *config)
 {
 	free(config->data);
 	config->data = NULL;
 }

 /*
  * Join every namespace listed in @nslist, a comma-separated list of
  * "type:path" entries. @nslist is modified in place while it is tokenised.
  *
  * All paths are opened first and the namespaces are then entered with
  * setns() in the order given. After entering a user namespace the process
  * switches to uid 0 inside it. Any parse, open, setns or setresuid failure
  * is fatal (bail).
  */
 void join_namespaces(char *nslist)
 {
 	int num = 0, i;
 	char *saveptr = NULL;
 	char *namespace = strtok_r(nslist, ",", &saveptr);
 	struct namespace_t {
 		int fd;
 		char type[PATH_MAX];
 		char path[PATH_MAX];
 	} *namespaces = NULL;

 	if (!namespace || !strlen(namespace) || !strlen(nslist))
 		bail("ns paths are empty");

 	/*
 	 * We have to open the file descriptors first, since after
 	 * we join the mnt namespace we might no longer be able to
 	 * access the paths.
 	 */
 	do {
 		int fd;
 		char *path;
 		struct namespace_t *ns;

 		/* Resize the namespace array. */
 		namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t));
 		if (!namespaces)
 			bail("failed to reallocate namespace array");
 		ns = &namespaces[num - 1];

 		/* Split 'ns:path'. */
 		path = strstr(namespace, ":");
 		if (!path)
 			bail("failed to parse %s", namespace);
 		*path++ = '\0';

 		fd = open(path, O_RDONLY);
 		if (fd < 0)
 			bail("failed to open %s", path);

 		ns->fd = fd;
 		/*
 		 * The array comes from realloc() and is not zeroed, and strncpy()
 		 * does not terminate a source that fills the buffer, so both
 		 * strings are terminated explicitly.
 		 */
 		strncpy(ns->type, namespace, PATH_MAX - 1);
 		ns->type[PATH_MAX - 1] = '\0';
 		strncpy(ns->path, path, PATH_MAX - 1);
 		ns->path[PATH_MAX - 1] = '\0';
 	} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);

 	/*
 	 * The ordering in which we join namespaces is important. We should
 	 * always join the user namespace *first*. This is all guaranteed
 	 * from the container_linux.go side of this, so we're just going to
 	 * follow the order given to us.
 	 */

 	for (i = 0; i < num; i++) {
 		struct namespace_t *ns = &namespaces[i];
 		int flag = nsflag(ns->type);

 		write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path);
 		if (setns(ns->fd, flag) < 0)
 			bail("failed to setns into %s namespace", ns->type);

 		/*
 		 * If we change user namespaces, make sure we switch to root in the
 		 * namespace (this matches the logic for unshare(CLONE_NEWUSER)), lots
 		 * of things can break if we aren't the right user. See
 		 * <https://github.com/opencontainers/runc/issues/4466> for one example.
 		 */
 		if (flag == CLONE_NEWUSER) {
 			if (setresuid(0, 0, 0) < 0)
 				bail("failed to become root in user namespace");
 		}

 		close(ns->fd);
 	}

 	free(namespaces);
 }

 /*
  * kill() wrapper that only signals a real process: for @pid <= 0 nothing is
  * sent and 0 is returned, otherwise the result of kill(@pid, @signum).
  */
 static inline int sane_kill(pid_t pid, int signum)
 {
 	return (pid > 0) ? kill(pid, signum) : 0;
 }

 /*
  * unshare(@flags), retrying on EINVAL. @msg names what is being unshared and
  * is only used for log and error messages. Bails out if every attempt fails
  * or if unshare() fails with an error other than EINVAL.
  */
 void try_unshare(int flags, const char *msg)
 {
 	/*
 	 * Kernels prior to v4.3 may return EINVAL on unshare when another process
 	 * reads runc's /proc/$PID/status or /proc/$PID/maps. To work around this,
 	 * retry on EINVAL a few times.
 	 */
 	int attempts_left = 5;

 	write_log(DEBUG, "unshare %s", msg);
 	while (attempts_left-- > 0) {
 		if (!unshare(flags))
 			return;
 		if (errno != EINVAL)
 			break;
 	}
 	bail("failed to unshare %s", msg);
 }

 /*
  * Write the @map_len bytes at @map to /proc/@pid/timens_offsets, bailing out
  * on failure. A NULL or empty @map is a no-op.
  */
 static void update_timens_offsets(pid_t pid, char *map, size_t map_len)
 {
 	int rc;

 	if (!map || !map_len)
 		return;

 	write_log(DEBUG, "update /proc/%d/timens_offsets to '%s'", pid, map);
 	rc = write_file(map, map_len, "/proc/%d/timens_offsets", pid);
 	if (rc < 0)
 		bail("failed to update /proc/%d/timens_offsets", pid);
 }

 /*
  * Container bootstrap entry point.
  *
  * If the _LIBCONTAINER_INITPIPE environment variable does not yield a
  * non-negative fd, this returns immediately. Otherwise it reads the netlink
  * bootstrap configuration from that fd, applies oom_score_adj, optionally
  * marks the process non-dumpable, creates two sync socketpairs and then
  * dispatches on setjmp() into one of three stages:
  *
  *   - STAGE_PARENT (stage 0): spawns stage 1, services its requests (user
  *     mappings, timens offsets, pid forwarding), then releases stage 2 and
  *     exits.
  *   - STAGE_CHILD (stage 1): joins/unshares namespaces, spawns stage 2,
  *     reports its pid to stage 0 and exits.
  *   - STAGE_INIT (stage 2): waits for stage 0, becomes session leader and
  *     uid/gid 0, frees the netlink data and returns to the caller.
  *
  * Stage 2 is the only path that returns after the configuration was parsed;
  * every failure is reported through bail().
  */
 void nsexec(void)
 {
 	int pipenum;
 	jmp_buf env;
 	int sync_child_pipe[2], sync_grandchild_pipe[2];
 	struct nlconfig_t config = { 0 };

 	/*
 	 * Setup a pipe to send logs to the parent. This should happen
 	 * first, because bail will use that pipe.
 	 */
 	setup_logpipe();

 	/*
 	 * Get the init pipe fd from the environment. The init pipe is used to
 	 * read the bootstrap data and tell the parent what the new pids are
 	 * after the setup is done.
 	 */
 	pipenum = getenv_int("_LIBCONTAINER_INITPIPE");
 	if (pipenum < 0) {
 		/* We are not a runc init. Just return to go runtime. */
 		return;
 	}

 	write_log(DEBUG, "=> nsexec container setup");

 	/* Parse all of the netlink configuration. */
 	nl_parse(pipenum, &config);

 	/* Set oom_score_adj. This has to be done before !dumpable because
 	 * /proc/self/oom_score_adj is not writeable unless you're a privileged
 	 * user (if !dumpable is set). All children inherit their parent's
 	 * oom_score_adj value on fork(2) so this will always be propagated
 	 * properly.
 	 */
 	update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len);

 	/*
 	 * Make the process non-dumpable, to avoid various race conditions that
 	 * could cause processes in namespaces we're joining to access host
 	 * resources (or potentially execute code).
 	 *
 	 * However, if the number of namespaces we are joining is 0, we are not
 	 * going to be switching to a different security context. Thus setting
 	 * ourselves to be non-dumpable only breaks things (like rootless
 	 * containers), which is the recommendation from the kernel folks.
 	 */
 	if (config.namespaces) {
 		write_log(DEBUG, "set process as non-dumpable");
 		if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
 			bail("failed to set process as non-dumpable");
 	}

 	/* Pipe so we can tell the child when we've finished setting up. */
 	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0)
 		bail("failed to setup sync pipe between parent and child");

 	/*
 	 * We need a new socketpair to sync with grandchild so we don't have
 	 * race condition with child.
 	 */
 	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0)
 		bail("failed to setup sync pipe between parent and grandchild");

 	/* TODO: Currently we aren't dealing with child deaths properly. */

 	/*
 	 * Okay, so this is quite annoying.
 	 *
 	 * In order for this unsharing code to be more extensible we need to split
 	 * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case
 	 * would be if we did clone(CLONE_NEWUSER) and the other namespaces
 	 * separately, but because of SELinux issues we cannot really do that. But
 	 * we cannot just dump the namespace flags into clone(...) because several
 	 * usecases (such as rootless containers) require more granularity around
 	 * the namespace setup. In addition, some older kernels had issues where
 	 * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot
 	 * handle this while also dealing with SELinux so we choose SELinux support
 	 * over broken kernel support).
 	 *
 	 * However, if we unshare(2) the user namespace *before* we clone(2), then
 	 * all hell breaks loose.
 	 *
 	 * The parent no longer has permissions to do many things (unshare(2) drops
 	 * all capabilities in your old namespace), and the container cannot be set
 	 * up to have more than one {uid,gid} mapping. This is obviously less than
 	 * ideal. In order to fix this, we have to first clone(2) and then unshare.
 	 *
 	 * Unfortunately, it's not as simple as that. We have to fork to enter the
 	 * PID namespace (the PID namespace only applies to children). Since we'll
 	 * have to double-fork, this clone_parent() call won't be able to get the
 	 * PID of the _actual_ init process (without doing more synchronisation than
 	 * I can deal with at the moment). So we'll just get the parent to send it
 	 * for us, the only job of this process is to update
 	 * /proc/pid/{setgroups,uid_map,gid_map}.
 	 *
 	 * And as a result of the above, we also need to setns(2) in the first child
 	 * because if we join a PID namespace in the topmost parent then our child
 	 * will be in that namespace (and it will not be able to give us a PID value
 	 * that makes sense without resorting to sending things with cmsg).
 	 *
 	 * This also deals with an older issue caused by dumping cloneflags into
 	 * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
 	 * we have to unshare(2) before clone(2) in order to do this. This was fixed
 	 * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
 	 * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
 	 * aware, the last mainline kernel which had this bug was Linux 3.12.
 	 * However, we cannot comment on which kernels the broken patch was
 	 * backported to.
 	 *
 	 * -- Aleksa "what has my life come to?" Sarai
 	 */

 	switch (setjmp(env)) {
 		/*
 		 * Stage 0: We're in the parent. Our job is just to create a new child
 		 *          (stage 1: STAGE_CHILD) process and write its uid_map and
 		 *          gid_map. That process will go on to create a new process, then
 		 *          it will send us its PID which we will send to the bootstrap
 		 *          process.
 		 */
 	case STAGE_PARENT:{
 			int len;
 			pid_t stage1_pid = -1, stage2_pid = -1;
 			bool stage1_complete, stage2_complete;

 			/* For debugging. */
 			current_stage = STAGE_PARENT;
 			prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
 			write_log(DEBUG, "~> nsexec stage-0");

 			/* Start the process of getting a container. */
 			write_log(DEBUG, "spawn stage-1");
 			stage1_pid = clone_parent(&env, STAGE_CHILD);
 			if (stage1_pid < 0)
 				bail("unable to spawn stage-1");

 			syncfd = sync_child_pipe[1];
 			if (close(sync_child_pipe[0]) < 0)
 				bail("failed to close sync_child_pipe[0] fd");

 			/*
 			 * State machine for synchronisation with the children. We only
 			 * return once both the child and grandchild are ready.
 			 */
 			write_log(DEBUG, "-> stage-1 synchronisation loop");
 			stage1_complete = false;
 			while (!stage1_complete) {
 				enum sync_t s;

 				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
 					bail("failed to sync with stage-1: next state");

 				switch (s) {
 				case SYNC_USERMAP_PLS:
 					write_log(DEBUG, "stage-1 requested userns mappings");

 					/*
 					 * Enable setgroups(2) if we've been asked to. But we also
 					 * have to explicitly disable setgroups(2) if we're
 					 * creating a rootless container for single-entry mapping.
 					 * i.e. config.is_setgroup == false.
 					 * (this is required since Linux 3.19).
 					 *
 					 * For rootless multi-entry mapping, config.is_setgroup shall be true and
 					 * newuidmap/newgidmap shall be used.
 					 */
 					if (config.is_rootless_euid && !config.is_setgroup)
 						update_setgroups(stage1_pid, SETGROUPS_DENY);

 					/* Set up mappings. */
 					update_uidmap(config.uidmappath, stage1_pid, config.uidmap, config.uidmap_len);
 					update_gidmap(config.gidmappath, stage1_pid, config.gidmap, config.gidmap_len);

 					s = SYNC_USERMAP_ACK;
 					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
 						sane_kill(stage1_pid, SIGKILL);
 						sane_kill(stage2_pid, SIGKILL);
 						bail("failed to sync with stage-1: write(SYNC_USERMAP_ACK)");
 					}
 					break;
 				case SYNC_RECVPID_PLS:
 					write_log(DEBUG, "stage-1 requested pid to be forwarded");

 					/* Get the stage-2 pid. */
 					if (read(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
 						sane_kill(stage1_pid, SIGKILL);
 						bail("failed to sync with stage-1: read(stage2_pid)");
 					}

 					/* Send ACK. */
 					s = SYNC_RECVPID_ACK;
 					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
 						sane_kill(stage1_pid, SIGKILL);
 						sane_kill(stage2_pid, SIGKILL);
 						bail("failed to sync with stage-1: write(SYNC_RECVPID_ACK)");
 					}

 					/*
 					 * Send both the stage-1 and stage-2 pids back to runc.
 					 * runc needs the stage-2 to continue process management,
 					 * but because stage-1 was spawned with CLONE_PARENT we
 					 * cannot reap it within stage-0 and thus we need to ask
 					 * runc to reap the zombie for us.
 					 */
 					write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc",
 						  stage1_pid, stage2_pid);
 					len =
 					    dprintf(pipenum, "{\"stage1_pid\":%d,\"stage2_pid\":%d}\n", stage1_pid,
 						    stage2_pid);
 					if (len < 0) {
 						sane_kill(stage1_pid, SIGKILL);
 						sane_kill(stage2_pid, SIGKILL);
 						bail("failed to sync with runc: write(pid-JSON)");
 					}
 					break;
 				case SYNC_TIMEOFFSETS_PLS:
 					write_log(DEBUG, "stage-1 requested timens offsets to be configured");
 					update_timens_offsets(stage1_pid, config.timensoffset, config.timensoffset_len);
 					s = SYNC_TIMEOFFSETS_ACK;
 					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
 						sane_kill(stage1_pid, SIGKILL);
 						bail("failed to sync with child: write(SYNC_TIMEOFFSETS_ACK)");
 					}
 					break;
 				case SYNC_CHILD_FINISH:
 					write_log(DEBUG, "stage-1 complete");
 					stage1_complete = true;
 					break;
 				default:
 					bail("unexpected sync value: %u", s);
 				}
 			}
 			write_log(DEBUG, "<- stage-1 synchronisation loop");

 			/* Now sync with grandchild. */
 			syncfd = sync_grandchild_pipe[1];
 			if (close(sync_grandchild_pipe[0]) < 0)
 				bail("failed to close sync_grandchild_pipe[0] fd");

 			write_log(DEBUG, "-> stage-2 synchronisation loop");
 			stage2_complete = false;
 			while (!stage2_complete) {
 				enum sync_t s;

 				write_log(DEBUG, "signalling stage-2 to run");
 				s = SYNC_GRANDCHILD;
 				if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
 					sane_kill(stage2_pid, SIGKILL);
 					bail("failed to sync with child: write(SYNC_GRANDCHILD)");
 				}

 				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
 					bail("failed to sync with child: next state");

 				switch (s) {
 				case SYNC_CHILD_FINISH:
 					write_log(DEBUG, "stage-2 complete");
 					stage2_complete = true;
 					break;
 				default:
 					bail("unexpected sync value: %u", s);
 				}
 			}
 			write_log(DEBUG, "<- stage-2 synchronisation loop");
 			write_log(DEBUG, "<~ nsexec stage-0");
 			exit(0);
 		}
 		break;

 		/*
 		 * Stage 1: We're in the first child process. Our job is to join any
 		 *          provided namespaces in the netlink payload and unshare all of
 		 *          the requested namespaces. If we've been asked to CLONE_NEWUSER,
 		 *          we will ask our parent (stage 0) to set up our user mappings
 		 *          for us. Then, we create a new child (stage 2: STAGE_INIT) for
 		 *          PID namespace. We then send the child's PID to our parent
 		 *          (stage 0).
 		 */
 	case STAGE_CHILD:{
 			pid_t stage2_pid = -1;
 			enum sync_t s;

 			/* For debugging. */
 			current_stage = STAGE_CHILD;

 			/* We're in a child and thus need to tell the parent if we die. */
 			syncfd = sync_child_pipe[0];
 			if (close(sync_child_pipe[1]) < 0)
 				bail("failed to close sync_child_pipe[1] fd");

 			/* For debugging. */
 			prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
 			write_log(DEBUG, "~> nsexec stage-1");

 			/*
 			 * We need to setns first. We cannot do this earlier (in stage 0)
 			 * because of the fact that we forked to get here (the PID of
 			 * [stage 2: STAGE_INIT]) would be meaningless). We could send it
 			 * using cmsg(3) but that's just annoying.
 			 */
 			if (config.namespaces)
 				join_namespaces(config.namespaces);

 			/*
 			 * Deal with user namespaces first. They are quite special, as they
 			 * affect our ability to unshare other namespaces and are used as
 			 * context for privilege checks.
 			 *
 			 * We don't unshare all namespaces in one go. The reason for this
 			 * is that, while the kernel documentation may claim otherwise,
 			 * there are certain cases where unsharing all namespaces at once
 			 * will result in namespace objects being owned incorrectly.
 			 * Ideally we should just fix these kernel bugs, but it's better to
 			 * be safe than sorry, and fix them separately.
 			 *
 			 * A specific case of this is that the SELinux label of the
 			 * internal kern-mount that mqueue uses will be incorrect if the
 			 * UTS namespace is cloned before the USER namespace is mapped.
 			 * I've also heard of similar problems with the network namespace
 			 * in some scenarios. This also mirrors how LXC deals with this
 			 * problem.
 			 */
 			if (config.cloneflags & CLONE_NEWUSER) {
 				try_unshare(CLONE_NEWUSER, "user namespace");
 				config.cloneflags &= ~CLONE_NEWUSER;

 				/*
 				 * We need to set ourselves as dumpable temporarily so that the
 				 * parent process can write to our procfs files.
 				 */
 				if (config.namespaces) {
 					write_log(DEBUG, "temporarily set process as dumpable");
 					if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
 						bail("failed to temporarily set process as dumpable");
 				}

 				/*
 				 * We don't have the privileges to do any mapping here (see the
 				 * clone_parent rant). So signal stage-0 to do the mapping for
 				 * us.
 				 */
 				write_log(DEBUG, "request stage-0 to map user namespace");
 				s = SYNC_USERMAP_PLS;
 				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
 					bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");

 				/* ... wait for mapping ... */
 				write_log(DEBUG, "waiting stage-0 to complete the mapping of user namespace");
 				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
 					bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
 				if (s != SYNC_USERMAP_ACK)
 					bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);

 				/* Revert temporary re-dumpable setting. */
 				if (config.namespaces) {
 					write_log(DEBUG, "re-set process as non-dumpable");
 					if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
 						bail("failed to re-set process as non-dumpable");
 				}

 				/* Become root in the namespace proper. */
 				if (setresuid(0, 0, 0) < 0)
 					bail("failed to become root in user namespace");
 			}

 			/*
 			 * Unshare all of the namespaces. Now, it should be noted that this
 			 * ordering might break in the future (especially with rootless
 			 * containers). But for now, it's not possible to split this into
 			 * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
 			 *
 			 * Note that we don't merge this with clone() because there were
 			 * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
 			 * was broken, so we'll just do it the long way anyway.
 			 */
 			try_unshare(config.cloneflags, "remaining namespaces");

 			if (config.timensoffset) {
 				write_log(DEBUG, "request stage-0 to write timens offsets");

 				s = SYNC_TIMEOFFSETS_PLS;
 				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
 					bail("failed to sync with parent: write(SYNC_TIMEOFFSETS_PLS)");

 				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
 					bail("failed to sync with parent: read(SYNC_TIMEOFFSETS_ACK)");
 				if (s != SYNC_TIMEOFFSETS_ACK)
 					bail("failed to sync with parent: SYNC_TIMEOFFSETS_ACK: got %u", s);
 			}

 			/*
 			 * TODO: What about non-namespace clone flags that we're dropping here?
 			 *
 			 * We fork again because of PID namespace, setns(2) or unshare(2) don't
 			 * change the PID namespace of the calling process, because doing so
 			 * would change the caller's idea of its own PID (as reported by getpid()),
 			 * which would break many applications and libraries, so we must fork
 			 * to actually enter the new PID namespace.
 			 */
 			write_log(DEBUG, "spawn stage-2");
 			stage2_pid = clone_parent(&env, STAGE_INIT);
 			if (stage2_pid < 0)
 				bail("unable to spawn stage-2");

 			/* Send the child to our parent, which knows what it's doing. */
 			write_log(DEBUG, "request stage-0 to forward stage-2 pid (%d)", stage2_pid);
 			s = SYNC_RECVPID_PLS;
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
 				sane_kill(stage2_pid, SIGKILL);
 				bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
 			}
 			if (write(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
 				sane_kill(stage2_pid, SIGKILL);
 				bail("failed to sync with parent: write(stage2_pid)");
 			}

 			/* ... wait for parent to get the pid ... */
 			if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
 				sane_kill(stage2_pid, SIGKILL);
 				bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
 			}
 			if (s != SYNC_RECVPID_ACK) {
 				sane_kill(stage2_pid, SIGKILL);
 				bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
 			}

 			write_log(DEBUG, "signal completion to stage-0");
 			s = SYNC_CHILD_FINISH;
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
 				sane_kill(stage2_pid, SIGKILL);
 				bail("failed to sync with parent: write(SYNC_CHILD_FINISH)");
 			}

 			/* Our work is done. [Stage 2: STAGE_INIT] is doing the rest of the work. */
 			write_log(DEBUG, "<~ nsexec stage-1");
 			exit(0);
 		}
 		break;

 		/*
 		 * Stage 2: We're the final child process, and the only process that will
 		 *          actually return to the Go runtime. Our job is to just do the
 		 *          final cleanup steps and then return to the Go runtime to allow
 		 *          init_linux.go to run.
 		 */
 	case STAGE_INIT:{
 			/*
 			 * We're inside the child now, having jumped here from
 			 * child_func() after stage 1 called clone_parent().
 			 */
 			enum sync_t s;

 			/* For debugging. */
 			current_stage = STAGE_INIT;

 			/* We're in a child and thus need to tell the parent if we die. */
 			syncfd = sync_grandchild_pipe[0];
 			if (close(sync_grandchild_pipe[1]) < 0)
 				bail("failed to close sync_grandchild_pipe[1] fd");

 			if (close(sync_child_pipe[0]) < 0)
 				bail("failed to close sync_child_pipe[0] fd");

 			/* For debugging. */
 			prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
 			write_log(DEBUG, "~> nsexec stage-2");

 			/* Block until stage 0 tells us to proceed. */
 			if (read(syncfd, &s, sizeof(s)) != sizeof(s))
 				bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
 			if (s != SYNC_GRANDCHILD)
 				bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s);

 			if (setsid() < 0)
 				bail("setsid failed");

 			if (setuid(0) < 0)
 				bail("setuid failed");

 			if (setgid(0) < 0)
 				bail("setgid failed");

 			/* Drop all supplementary groups, unless rootless or setgroups is off. */
 			if (!config.is_rootless_euid && config.is_setgroup) {
 				if (setgroups(0, NULL) < 0)
 					bail("setgroups failed");
 			}

 			write_log(DEBUG, "signal completion to stage-0");
 			s = SYNC_CHILD_FINISH;
 			if (write(syncfd, &s, sizeof(s)) != sizeof(s))
 				bail("failed to sync with parent: write(SYNC_CHILD_FINISH)");

 			/* Close sync pipes. */
 			if (close(sync_grandchild_pipe[0]) < 0)
 				bail("failed to close sync_grandchild_pipe[0] fd");

 			/* Free netlink data. */
 			nl_free(&config);

 			/* Finish executing, let the Go runtime take over. */
 			write_log(DEBUG, "<= nsexec container setup");
 			write_log(DEBUG, "booting up go runtime ...");
 			return;
 		}
 		break;
 	default:
 		bail("unexpected jump value");
 	}

 	/* Should never be reached. */
 	bail("should never be reached");
 }