blob: c08757b9e9d0245715667f6eb0fe019788fd830d [file] [log] [blame]
/* Copyright 2016 The Chromium OS Authors. All rights reserved.
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#define _GNU_SOURCE /* For asprintf */
#include <errno.h>
#include <fcntl.h>
#if USE_device_mapper
#include <libdevmapper.h>
#endif
#include <limits.h>
#include <malloc.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <linux/loop.h>
#include "container_cgroup.h"
#include "libcontainer.h"
#include "libminijail.h"
/*
 * Frees |ptr| and resets it to NULL so that a later free() or NULL check on
 * the same lvalue is harmless. |ptr| is evaluated twice, so it must be an
 * lvalue expression without side effects.
 */
#define FREE_AND_NULL(ptr) \
	do { \
		free(ptr); \
		(ptr) = NULL; \
	} while (0)
/* Capacity of the argv array built for the setfiles command (including the terminating NULL). */
#define MAX_NUM_SETFILES_ARGS 128
/* Control node queried with LOOP_CTL_GET_FREE to find an unused loop device. */
static const char loopdev_ctl[] = "/dev/loop-control";
#if USE_device_mapper
/* Prefix prepended to a device-mapper name to build the path handed to mount. */
static const char dm_dev_prefix[] = "/dev/mapper/";
#endif
/* Forward declaration; container_start() calls this on its error path before the definition appears. */
static int container_teardown(struct container *c);
/*
 * Replaces the string owned by |*dest| with a newly allocated copy of |src|.
 *
 * The copy is made before the previous value of |*dest| (which may be NULL)
 * is released, so |*dest| is left untouched on failure and |src| may alias
 * |*dest|.
 *
 * Returns 0 on success, -EINVAL if |src| is NULL, or -ENOMEM if the copy
 * could not be allocated.
 */
static int strdup_and_free(char **dest, const char *src)
{
	char *copy;

	/* strdup(NULL) is undefined behavior; report bad input instead. */
	if (!src)
		return -EINVAL;

	copy = strdup(src);
	if (!copy)
		return -ENOMEM;

	/* free(NULL) is a no-op, so no guard is needed. */
	free(*dest);
	*dest = copy;
	return 0;
}
struct container_mount {
char *name;
char *source;
char *destination;
char *type;
char *data;
char *verity;
int flags;
int uid;
int gid;
int mode;
int mount_in_ns; /* True if mount should happen in new vfs ns */
int create; /* True if target should be created if it doesn't exist */
int loopback; /* True if target should be mounted via loopback */
};
/*
 * One device node to create inside the container's root. Entries are appended
 * by container_config_add_device().
 */
struct container_device {
char type; /* 'c' or 'b' for char or block */
char *path; /* Node path; appended to the container's runfsroot when created */
int fs_permissions; /* Permission bits applied to the node */
int major; /* Device major number */
int minor; /* Device minor number; must be -1 when |copy_minor| is set */
int copy_minor; /* Copy the minor from existing node, ignores |minor| */
int uid; /* Node owner, as a uid inside the user namespace */
int gid; /* Node group, as a gid inside the user namespace */
};
/*
 * One device-cgroup rule. The fields are forwarded unchanged to the cgroup
 * backend's add_device operation when the container starts.
 */
struct container_cgroup_device {
int allow; /* Forwarded as the rule's allow flag */
char type; /* Device type character forwarded to the backend (e.g. 'c' or 'b') */
int major; /* -1 means all */
int minor; /* -1 means all */
int read; /* Non-zero to include read access in the rule */
int write; /* Non-zero to include write access in the rule */
int modify; /* Non-zero to include modify access in the rule */
};
/*
 * CPU cgroup parameters. A zero |shares|, |period| or |rt_period| means the
 * corresponding setting is not applied when the container starts.
 */
struct container_cpu_cgroup {
int shares; /* CPU shares; the setter requires a value of 2 or more */
int quota; /* CFS quota; -1 means unrestricted */
int period; /* CFS period; the setter requires a positive value */
int rt_runtime; /* Realtime runtime; 0 disables realtime CPU use */
int rt_period; /* Realtime period; must exceed |rt_runtime| */
};
/*
 * Structure that configures how the container is run.
 *
 * config_root - Path to the root of the container itself; relative loopback
 *               mount sources are resolved against it.
 * rootfs - Path to the root of the container's filesystem.
 * rootfs_mount_flags - Flags that will be passed to mount() for the rootfs.
 * premounted_runfs - Path of an already prepared root; when set it is used
 *                    directly instead of bind-mounting rootfs.
 * pid_file_path - Path to the file where the pid should be written.
 * program_argv - The program to run and args, e.g. "/sbin/init".
 * num_args - Number of args in program_argv.
 * uid - The uid the container will run as.
 * uid_map - Mapping of UIDs in the container, e.g. "0 100000 1024"
 * gid - The gid the container will run as.
 * gid_map - Mapping of GIDs in the container, e.g. "0 100000 1024"
 * alt_syscall_table - Syscall table to use or NULL if none.
 * mounts - Filesystems to mount in the new namespace.
 * num_mounts - Number of above.
 * devices - Device nodes to create.
 * num_devices - Number of above.
 * cgroup_devices - Device node cgroup permissions.
 * num_cgroup_devices - Number of above.
 * run_setfiles - Path of the setfiles command to run on mounts to enable
 *                selinux, or NULL to skip it.
 * cpu_cgparams - CPU cgroup params.
 * cgroup_parent - Parent dir for cgroup creation
 * cgroup_owner - uid to own the created cgroups
 * cgroup_group - gid to own the created cgroups
 * share_host_netns - Enable sharing of the host network namespace.
 * keep_fds_open - Allow the child process to keep open FDs (for stdin/out/err).
 */
struct container_config {
char *config_root;
char *rootfs;
unsigned long rootfs_mount_flags;
char *premounted_runfs;
char *pid_file_path;
char **program_argv;
size_t num_args;
uid_t uid;
char *uid_map;
gid_t gid;
char *gid_map;
char *alt_syscall_table;
struct container_mount *mounts;
size_t num_mounts;
struct container_device *devices;
size_t num_devices;
struct container_cgroup_device *cgroup_devices;
size_t num_cgroup_devices;
char *run_setfiles;
struct container_cpu_cgroup cpu_cgparams;
char *cgroup_parent;
uid_t cgroup_owner;
gid_t cgroup_group;
int share_host_netns;
int keep_fds_open;
};
/*
 * Allocates a zero-initialized configuration object.
 * Returns NULL if the allocation fails.
 */
struct container_config *container_config_create()
{
	struct container_config *config = calloc(1, sizeof(*config));

	return config;
}
/*
 * Releases the argument vector owned by |c| and resets the argument count.
 *
 * Clearing |num_args| along with |program_argv| keeps the two fields
 * consistent, so that index checks against |num_args| can never lead to a
 * dereference of the freed (NULL) vector.
 */
static void container_free_program_args(struct container_config *c)
{
	size_t i;

	if (c->program_argv) {
		for (i = 0; i < c->num_args; ++i)
			FREE_AND_NULL(c->program_argv[i]);
		FREE_AND_NULL(c->program_argv);
	}
	c->num_args = 0;
}
/*
 * Releases every string owned by |mount| and resets the pointers to NULL.
 * The mount structure itself is not freed.
 */
static void container_config_free_mount(struct container_mount *mount)
{
	FREE_AND_NULL(mount->name);
	FREE_AND_NULL(mount->source);
	FREE_AND_NULL(mount->destination);
	FREE_AND_NULL(mount->type);
	FREE_AND_NULL(mount->data);
	/* |verity| is duplicated by container_config_add_mount() as well. */
	FREE_AND_NULL(mount->verity);
}
/*
 * Releases the path string owned by |device| and clears the pointer.
 * The device structure itself is not freed.
 */
static void container_config_free_device(struct container_device *device)
{
	free(device->path);
	device->path = NULL;
}
/*
 * Releases |c| and everything it owns: all duplicated strings, the argument
 * vector, and the mount, device and cgroup-device arrays. Passing NULL is
 * allowed and does nothing.
 */
void container_config_destroy(struct container_config *c)
{
	size_t i;

	if (c == NULL)
		return;

	/* |config_root| is duplicated by container_config_config_root(). */
	FREE_AND_NULL(c->config_root);
	FREE_AND_NULL(c->rootfs);
	container_free_program_args(c);
	FREE_AND_NULL(c->premounted_runfs);
	FREE_AND_NULL(c->pid_file_path);
	FREE_AND_NULL(c->uid_map);
	FREE_AND_NULL(c->gid_map);
	FREE_AND_NULL(c->alt_syscall_table);

	for (i = 0; i < c->num_mounts; ++i)
		container_config_free_mount(&c->mounts[i]);
	FREE_AND_NULL(c->mounts);

	for (i = 0; i < c->num_devices; ++i)
		container_config_free_device(&c->devices[i]);
	FREE_AND_NULL(c->devices);

	FREE_AND_NULL(c->cgroup_devices);
	FREE_AND_NULL(c->run_setfiles);
	FREE_AND_NULL(c->cgroup_parent);
	FREE_AND_NULL(c);
}
/*
 * Stores a private copy of |config_root| in |c|, replacing any previous value.
 * Returns the result of strdup_and_free(): 0 on success, negative on failure.
 */
int container_config_config_root(struct container_config *c,
				 const char *config_root)
{
	int rc = strdup_and_free(&c->config_root, config_root);

	return rc;
}
/* Returns the config root stored in |c|; the string remains owned by |c|. */
const char *container_config_get_config_root(const struct container_config *c)
{
	const char *config_root = c->config_root;

	return config_root;
}
/*
 * Stores a private copy of |rootfs| in |c|, replacing any previous value.
 * Returns the result of strdup_and_free(): 0 on success, negative on failure.
 */
int container_config_rootfs(struct container_config *c, const char *rootfs)
{
	int rc = strdup_and_free(&c->rootfs, rootfs);

	return rc;
}
/* Returns the rootfs path stored in |c|; the string remains owned by |c|. */
const char *container_config_get_rootfs(const struct container_config *c)
{
	const char *rootfs = c->rootfs;

	return rootfs;
}
/*
 * Records the flags used to remount the rootfs.
 *
 * MS_REMOUNT is always OR-ed in, so a later "are there flags?" check can be a
 * plain comparison against zero. MS_BIND is OR-ed in as well because the
 * rootfs is always bind-mounted and the original filesystem must not be
 * remounted.
 */
void container_config_rootfs_mount_flags(struct container_config *c,
					 unsigned long rootfs_mount_flags)
{
	unsigned long effective_flags = rootfs_mount_flags;

	effective_flags |= MS_REMOUNT | MS_BIND;
	c->rootfs_mount_flags = effective_flags;
}
/* Returns the rootfs mount flags stored in |c| (zero if never configured). */
unsigned long container_config_get_rootfs_mount_flags(
		const struct container_config *c)
{
	unsigned long stored_flags = c->rootfs_mount_flags;

	return stored_flags;
}
/*
 * Stores a private copy of |runfs| as the premounted root of |c|, replacing
 * any previous value.
 * Returns the result of strdup_and_free(): 0 on success, negative on failure.
 */
int container_config_premounted_runfs(struct container_config *c, const char *runfs)
{
	int rc = strdup_and_free(&c->premounted_runfs, runfs);

	return rc;
}
/* Returns the premounted runfs path stored in |c|, or NULL if none was set. */
const char *container_config_get_premounted_runfs(const struct container_config *c)
{
	const char *runfs = c->premounted_runfs;

	return runfs;
}
/*
 * Stores a private copy of |path| as the pid file location of |c|, replacing
 * any previous value.
 * Returns the result of strdup_and_free(): 0 on success, negative on failure.
 */
int container_config_pid_file(struct container_config *c, const char *path)
{
	int rc = strdup_and_free(&c->pid_file_path, path);

	return rc;
}
/* Returns the pid file path stored in |c|, or NULL if none was set. */
const char *container_config_get_pid_file(const struct container_config *c)
{
	const char *pid_file = c->pid_file_path;

	return pid_file;
}
/*
 * Replaces the program argument vector of |c| with copies of the first
 * |num_args| strings in |argv|. The stored vector is NULL-terminated.
 *
 * |num_args| is only published once the vector has been allocated, and it is
 * reset to zero on every failure path, so the stored count never refers to a
 * missing vector.
 *
 * Returns 0 on success, -EINVAL if |argv| (with a non-zero |num_args|) or one
 * of its entries is NULL, or -ENOMEM if an allocation fails. On failure |c|
 * is left without any program arguments.
 */
int container_config_program_argv(struct container_config *c,
				  const char **argv, size_t num_args)
{
	size_t i;
	int rc;

	if (!argv && num_args > 0)
		return -EINVAL;

	container_free_program_args(c);
	c->num_args = 0;

	/* One extra slot for the terminating NULL; calloc zeroes it. */
	c->program_argv = calloc(num_args + 1, sizeof(char *));
	if (!c->program_argv)
		return -ENOMEM;
	c->num_args = num_args;

	for (i = 0; i < num_args; ++i) {
		if (!argv[i]) {
			rc = -EINVAL;
			goto error_free_return;
		}
		if (strdup_and_free(&c->program_argv[i], argv[i])) {
			rc = -ENOMEM;
			goto error_free_return;
		}
	}
	return 0;

error_free_return:
	container_free_program_args(c);
	c->num_args = 0;
	return rc;
}
/* Returns the number of program arguments stored in |c|. */
size_t container_config_get_num_program_args(const struct container_config *c)
{
	size_t count = c->num_args;

	return count;
}
/*
 * Returns the program argument at |index|, or NULL when |index| is not below
 * the stored argument count. The string remains owned by |c|.
 */
const char *container_config_get_program_arg(const struct container_config *c,
					     size_t index)
{
	if (index < c->num_args)
		return c->program_argv[index];

	return NULL;
}
/* Records |uid| as the user id the container will run as. */
void container_config_uid(struct container_config *c, uid_t uid)
{
	uid_t *stored_uid = &c->uid;

	*stored_uid = uid;
}
/* Returns the user id stored in |c|. */
uid_t container_config_get_uid(const struct container_config *c)
{
	uid_t stored_uid = c->uid;

	return stored_uid;
}
/*
 * Stores a private copy of the uid mapping string |uid_map| in |c|, replacing
 * any previous value.
 * Returns the result of strdup_and_free(): 0 on success, negative on failure.
 */
int container_config_uid_map(struct container_config *c, const char *uid_map)
{
	int rc = strdup_and_free(&c->uid_map, uid_map);

	return rc;
}
/* Records |gid| as the group id the container will run as. */
void container_config_gid(struct container_config *c, gid_t gid)
{
	gid_t *stored_gid = &c->gid;

	*stored_gid = gid;
}
/* Returns the group id stored in |c|. */
gid_t container_config_get_gid(const struct container_config *c)
{
	gid_t stored_gid = c->gid;

	return stored_gid;
}
/*
 * Stores a private copy of the gid mapping string |gid_map| in |c|, replacing
 * any previous value.
 * Returns the result of strdup_and_free(): 0 on success, negative on failure.
 */
int container_config_gid_map(struct container_config *c, const char *gid_map)
{
	int rc = strdup_and_free(&c->gid_map, gid_map);

	return rc;
}
/*
 * Stores a private copy of the alternate syscall table name in |c|, replacing
 * any previous value.
 * Returns the result of strdup_and_free(): 0 on success, negative on failure.
 */
int container_config_alt_syscall_table(struct container_config *c,
				       const char *alt_syscall_table)
{
	int rc = strdup_and_free(&c->alt_syscall_table, alt_syscall_table);

	return rc;
}
/*
 * Appends a mount description to |c|. Every string argument is copied.
 *
 * |name|, |source|, |destination| and |type| are required; |data| and
 * |verity| may be NULL. The mount count is only incremented once the new
 * entry is fully populated.
 *
 * Returns 0 on success, -EINVAL if a required string is NULL, or -ENOMEM if
 * growing the array or copying a string fails.
 */
int container_config_add_mount(struct container_config *c,
			       const char *name,
			       const char *source,
			       const char *destination,
			       const char *type,
			       const char *data,
			       const char *verity,
			       int flags,
			       int uid,
			       int gid,
			       int mode,
			       int mount_in_ns,
			       int create,
			       int loopback)
{
	struct container_mount *grown;
	struct container_mount *slot;
	size_t idx;

	if (!name || !source || !destination || !type)
		return -EINVAL;

	grown = realloc(c->mounts, sizeof(c->mounts[0]) * (c->num_mounts + 1));
	if (grown == NULL)
		return -ENOMEM;
	c->mounts = grown;

	slot = grown + c->num_mounts;
	memset(slot, 0, sizeof(*slot));

	/* Copy the strings in declaration order; optional ones may be NULL. */
	const struct {
		char **dst;
		const char *src;
	} strings[] = {
		{ &slot->name, name },
		{ &slot->source, source },
		{ &slot->destination, destination },
		{ &slot->type, type },
		{ &slot->data, data },
		{ &slot->verity, verity },
	};
	for (idx = 0; idx < sizeof(strings) / sizeof(strings[0]); ++idx) {
		if (strings[idx].src == NULL)
			continue;
		if (strdup_and_free(strings[idx].dst, strings[idx].src)) {
			container_config_free_mount(slot);
			return -ENOMEM;
		}
	}

	slot->flags = flags;
	slot->uid = uid;
	slot->gid = gid;
	slot->mode = mode;
	slot->mount_in_ns = mount_in_ns;
	slot->create = create;
	slot->loopback = loopback;

	c->num_mounts += 1;
	return 0;
}
/*
 * Appends a device cgroup rule to |c|.
 *
 * Returns 0 on success or -ENOMEM if the rule array cannot be grown, in which
 * case the existing rules are left untouched.
 */
int container_config_add_cgroup_device(struct container_config *c,
				       int allow,
				       char type,
				       int major,
				       int minor,
				       int read,
				       int write,
				       int modify)
{
	struct container_cgroup_device *grown;
	struct container_cgroup_device *rule;

	grown = realloc(c->cgroup_devices,
			(c->num_cgroup_devices + 1) *
			sizeof(c->cgroup_devices[0]));
	if (grown == NULL)
		return -ENOMEM;
	c->cgroup_devices = grown;

	rule = grown + c->num_cgroup_devices;
	memset(rule, 0, sizeof(*rule));
	rule->allow = allow;
	rule->type = type;
	rule->major = major;
	rule->minor = minor;
	rule->read = read;
	rule->write = write;
	rule->modify = modify;

	c->num_cgroup_devices += 1;
	return 0;
}
/*
 * Appends a device node description to |c| and, when any of the access flags
 * is set, a matching "allow" rule to the device cgroup list.
 *
 * Returns 0 on success, -EINVAL if |path| is NULL or if |copy_minor| is set
 * while |minor| is not -1, or -ENOMEM if an allocation fails.
 */
int container_config_add_device(struct container_config *c,
				char type,
				const char *path,
				int fs_permissions,
				int major,
				int minor,
				int copy_minor,
				int uid,
				int gid,
				int read_allowed,
				int write_allowed,
				int modify_allowed)
{
	struct container_device *grown;
	struct container_device *slot;
	const int wants_cgroup_rule =
		read_allowed || write_allowed || modify_allowed;

	if (!path)
		return -EINVAL;
	/* A minor copied from an existing node requires |minor| to be -1. */
	if (copy_minor && minor != -1)
		return -EINVAL;

	grown = realloc(c->devices,
			(c->num_devices + 1) * sizeof(c->devices[0]));
	if (grown == NULL)
		return -ENOMEM;
	c->devices = grown;

	slot = grown + c->num_devices;
	memset(slot, 0, sizeof(*slot));
	slot->type = type;
	if (strdup_and_free(&slot->path, path))
		goto fail;
	slot->fs_permissions = fs_permissions;
	slot->major = major;
	slot->minor = minor;
	slot->copy_minor = copy_minor;
	slot->uid = uid;
	slot->gid = gid;

	if (wants_cgroup_rule &&
	    container_config_add_cgroup_device(c, 1, type, major, minor,
					       read_allowed, write_allowed,
					       modify_allowed))
		goto fail;

	c->num_devices += 1;
	return 0;

fail:
	container_config_free_device(slot);
	return -ENOMEM;
}
/*
 * Stores a private copy of the setfiles command path in |c|, replacing any
 * previous value.
 * Returns the result of strdup_and_free(): 0 on success, negative on failure.
 */
int container_config_run_setfiles(struct container_config *c,
				  const char *setfiles_cmd)
{
	int rc = strdup_and_free(&c->run_setfiles, setfiles_cmd);

	return rc;
}
/* Returns the setfiles command path stored in |c|, or NULL if none was set. */
const char *container_config_get_run_setfiles(const struct container_config *c)
{
	const char *setfiles_cmd = c->run_setfiles;

	return setfiles_cmd;
}
/*
 * Sets the CPU shares of the container.
 * Returns 0 on success or -EINVAL when |shares| is below the minimum of 2.
 */
int container_config_set_cpu_shares(struct container_config *c, int shares)
{
	if (shares >= 2) {
		c->cpu_cgparams.shares = shares;
		return 0;
	}

	return -EINVAL;
}
/*
 * Sets the CFS quota and period of the container.
 *
 * |quota| may exceed |period| to use more than one CPU, and may be -1 to
 * lift any CPU time restriction; otherwise it must be positive. |period|
 * must be positive.
 *
 * Returns 0 on success or -EINVAL if either value is out of range.
 */
int container_config_set_cpu_cfs_params(struct container_config *c,
					int quota,
					int period)
{
	const int quota_ok = (quota > 0) || (quota == -1);
	const int period_ok = period > 0;

	if (!quota_ok || !period_ok)
		return -EINVAL;

	c->cpu_cgparams.quota = quota;
	c->cpu_cgparams.period = period;
	return 0;
}
/*
 * Sets the realtime runtime and period of the container.
 *
 * |rt_runtime| may be 0 to keep the cgroup from using realtime CPU; it must
 * not be negative and must be strictly smaller than |rt_period|.
 *
 * Returns 0 on success or -EINVAL if the values are out of range.
 */
int container_config_set_cpu_rt_params(struct container_config *c,
				       int rt_runtime,
				       int rt_period)
{
	if (rt_runtime >= 0 && rt_runtime < rt_period) {
		c->cpu_cgparams.rt_runtime = rt_runtime;
		c->cpu_cgparams.rt_period = rt_period;
		return 0;
	}

	return -EINVAL;
}
int container_config_get_cpu_shares(struct container_config *c)
{
return c->cpu_cgparams.shares;
}
int container_config_get_cpu_quota(struct container_config *c)
{
return c->cpu_cgparams.quota;
}
int container_config_get_cpu_period(struct container_config *c)
{
return c->cpu_cgparams.period;
}
int container_config_get_cpu_rt_runtime(struct container_config *c)
{
return c->cpu_cgparams.rt_runtime;
}
int container_config_get_cpu_rt_period(struct container_config *c)
{
return c->cpu_cgparams.rt_period;
}
/*
 * Records the cgroup owner and group and stores a private copy of the parent
 * cgroup name. The owner and group are updated even if copying |parent|
 * fails.
 * Returns the result of strdup_and_free(): 0 on success, negative on failure.
 */
int container_config_set_cgroup_parent(struct container_config *c,
				       const char *parent,
				       uid_t cgroup_owner, gid_t cgroup_group)
{
	int rc;

	c->cgroup_owner = cgroup_owner;
	c->cgroup_group = cgroup_group;
	rc = strdup_and_free(&c->cgroup_parent, parent);

	return rc;
}
/* Returns the parent cgroup name stored in |c|, or NULL if none was set. */
const char *container_config_get_cgroup_parent(struct container_config *c)
{
	const char *parent = c->cgroup_parent;

	return parent;
}
/* Marks |c| so the container shares the host's network namespace. */
void container_config_share_host_netns(struct container_config *c)
{
	int *share_flag = &c->share_host_netns;

	*share_flag = 1;
}
/* Returns non-zero if |c| is configured to share the host network namespace. */
int get_container_config_share_host_netns(struct container_config *c)
{
	int share_flag = c->share_host_netns;

	return share_flag;
}
/* Marks |c| so the container's child process keeps its open file descriptors. */
void container_config_keep_fds_open(struct container_config *c)
{
	int *keep_flag = &c->keep_fds_open;

	*keep_flag = 1;
}
/*
 * Container manipulation
 */
/*
 * Runtime state of one container instance. Created by container_new(),
 * populated by container_start() and released by container_destroy().
 */
struct container {
struct container_cgroup *cgroup; /* Cgroup handle created in container_start() */
struct minijail *jail; /* Minijail used to launch the container */
pid_t init_pid; /* Pid reported by minijail for the launched program */
char *config_root; /* Copy of the configuration's config_root, or NULL */
char *runfs; /* Temporary directory created under |rundir|; NULL with a premounted runfs */
char *rundir; /* Directory in which |runfs| is created */
char *runfsroot; /* Root directory the container pivots into */
char *pid_file_path; /* Pid file path, or NULL if none is written */
char **ext_mounts; /* Mounts made outside of the minijail */
size_t num_ext_mounts; /* Number of valid entries in |ext_mounts| */
char **loopdevs; /* Loop device paths attached for loopback mounts */
size_t num_loopdevs; /* Number of valid entries in |loopdevs| */
char **device_mappers; /* Device-mapper names created for verity mounts */
size_t num_device_mappers; /* Number of valid entries in |device_mappers| */
char *name; /* Container name given to container_new() */
};
/*
 * Allocates a container object holding private copies of |name| and
 * |rundir|.
 * Returns NULL if any allocation fails; nothing is leaked in that case.
 */
struct container *container_new(const char *name,
				const char *rundir)
{
	struct container *container = calloc(1, sizeof(*container));

	if (container == NULL)
		return NULL;

	container->rundir = strdup(rundir);
	container->name = strdup(name);
	if (container->rundir && container->name)
		return container;

	container_destroy(container);
	return NULL;
}
/*
 * Releases |c| together with its cgroup handle, its jail and every string it
 * still owns. Passing NULL is allowed and does nothing, matching
 * container_config_destroy().
 *
 * container_teardown() resets the run-time paths to NULL when it frees them,
 * so freeing them again here is harmless. It also covers the cases where
 * teardown skipped them (a premounted runfs leaves |runfsroot| allocated) or
 * never ran.
 */
void container_destroy(struct container *c)
{
	if (c == NULL)
		return;

	if (c->cgroup)
		container_cgroup_destroy(c->cgroup);
	if (c->jail)
		minijail_destroy(c->jail);
	FREE_AND_NULL(c->config_root);
	FREE_AND_NULL(c->runfsroot);
	FREE_AND_NULL(c->runfs);
	FREE_AND_NULL(c->pid_file_path);
	FREE_AND_NULL(c->name);
	FREE_AND_NULL(c->rundir);
	FREE_AND_NULL(c);
}
/*
 * Given a uid/gid map of "inside1 outside1 length1, ...", and an id
 * inside of the user namespace, return the equivalent outside id, or
 * return < 0 on error.
 *
 * Each mapping covers the half-open range [inside, inside + length), so an
 * id equal to inside + length is NOT part of the mapping. The map is parsed
 * in place without copying it. A NULL map, a negative id, a mapping with a
 * missing, negative or out-of-range field, unexpected text after a mapping,
 * or an id that no mapping covers all yield -EINVAL. errno is set to EINVAL
 * on failure as well, because some callers derive their error code from it.
 */
static int get_userns_outside_id(const char *map, int id)
{
	const char *cursor = map;

	if (!map || id < 0)
		goto invalid;

	while (*cursor) {
		long fields[3]; /* inside, outside, length */
		long offset;
		size_t i;

		for (i = 0; i < 3; ++i) {
			char *end;

			errno = 0;
			fields[i] = strtol(cursor, &end, 10);
			/* No digits, overflow, negative or too large for int. */
			if (end == cursor || errno || fields[i] < 0 ||
			    fields[i] > INT_MAX)
				goto invalid;
			cursor = end;
		}

		offset = (long)id - fields[0];
		if (offset >= 0 && offset < fields[2]) {
			/* Guard the addition against exceeding INT_MAX. */
			if (fields[1] > INT_MAX - offset)
				goto invalid;
			return (int)(fields[1] + offset);
		}

		/* Advance to the next comma-separated mapping, if any. */
		while (*cursor == ' ')
			++cursor;
		if (*cursor != ',' && *cursor != '\0')
			goto invalid;
		while (*cursor == ',')
			++cursor;
	}

invalid:
	errno = EINVAL;
	return -EINVAL;
}
/*
 * Creates the directory |path|, then applies |mode| and the owner
 * |uid|:|gid| to it. The explicit chmod() makes the resulting mode
 * independent of the process umask.
 * Returns 0 on success or -errno from the first step that fails.
 */
static int make_dir(const char *path, int uid, int gid, int mode)
{
	if (mkdir(path, mode) || chmod(path, mode) || chown(path, uid, gid))
		return -errno;

	return 0;
}
/*
 * Makes sure the file |path| exists (creating it with |mode| if needed) and
 * sets its owner to |uid|:|gid|.
 *
 * Returns 0 on success or -errno if opening the file or changing its owner
 * fails. The fchown() error is captured before close() runs, since close()
 * may overwrite errno.
 */
static int touch_file(const char *path, int uid, int gid, int mode)
{
	int saved_errno = 0;
	int fd = open(path, O_RDWR | O_CREAT, mode);

	if (fd < 0)
		return -errno;

	if (fchown(fd, uid, gid))
		saved_errno = errno;
	close(fd);

	return -saved_errno;
}
/*
 * Ensures the mount target |dest| exists in the new rootfs, creating it when
 * it is missing.
 *
 * A directory is created when |source| cannot be stat'ed or is a directory
 * or block device; otherwise an empty file is created. The owner is taken
 * from |mnt|'s uid/gid translated through the configured user namespace maps.
 *
 * Returns 0 if |dest| already exists or was created, negative on failure.
 */
static int setup_mount_destination(const struct container_config *config,
				   const struct container_mount *mnt,
				   const char *source,
				   const char *dest)
{
	struct stat info;
	int outside_uid;
	int outside_gid;

	/* Nothing to do when the destination is already there. */
	if (stat(dest, &info) == 0)
		return 0;

	outside_uid = get_userns_outside_id(config->uid_map, mnt->uid);
	if (outside_uid < 0)
		return outside_uid;
	outside_gid = get_userns_outside_id(config->gid_map, mnt->gid);
	if (outside_gid < 0)
		return outside_gid;

	if (stat(source, &info) == 0 && !S_ISDIR(info.st_mode) &&
	    !S_ISBLK(info.st_mode))
		return touch_file(dest, outside_uid, outside_gid, mnt->mode);

	return make_dir(dest, outside_uid, outside_gid, mnt->mode);
}
/*
 * Fork and exec the setfiles command to configure the selinux policy.
 *
 * The child runs "<setfiles> -r <runfsroot> <runfsroot>/file_contexts
 * <destinations...>" with an empty environment. Nothing is run when no
 * setfiles command is configured.
 *
 * Returns 0 when nothing had to be run, the raw wait status of the child
 * otherwise, -ENOMEM if the context path cannot be built, or -errno if
 * fork() or waitpid() fails. The fork() error is captured before free()
 * runs, so that free() cannot disturb the reported code.
 */
static int run_setfiles_command(const struct container *c,
				const struct container_config *config,
				char *const *destinations, size_t num_destinations)
{
	int rc;
	int status;
	int pid;
	int fork_errno;
	char *context_path;

	if (!config->run_setfiles)
		return 0;

	/* asprintf() is not guaranteed to set errno; report ENOMEM directly. */
	if (asprintf(&context_path, "%s/file_contexts",
		     c->runfsroot) < 0)
		return -ENOMEM;

	pid = fork();
	if (pid == 0) {
		size_t i;
		size_t arg_index = 0;
		const char *argv[MAX_NUM_SETFILES_ARGS];
		const char *env[] = {
			NULL,
		};

		argv[arg_index++] = config->run_setfiles;
		argv[arg_index++] = "-r";
		argv[arg_index++] = c->runfsroot;
		argv[arg_index++] = context_path;
		/* Leave room for the terminating NULL entry. */
		if (arg_index + num_destinations >= MAX_NUM_SETFILES_ARGS)
			_exit(-E2BIG);
		for (i = 0; i < num_destinations; ++i)
			argv[arg_index++] = destinations[i];
		argv[arg_index] = NULL;

		execve(argv[0], (char *const*)argv, (char *const*)env);
		/* Command failed to exec if execve returns. */
		_exit(-errno);
	}
	fork_errno = errno;
	free(context_path);
	if (pid < 0)
		return -fork_errno;

	do {
		rc = waitpid(pid, &status, 0);
	} while (rc == -1 && errno == EINTR);
	if (rc < 0)
		return -errno;

	return status;
}
/*
 * Find a free loop device and attach it.
 *
 * Asks the loop control node for a free device and attaches |source| to it.
 * If another process grabs the device first (EBUSY), the search is retried
 * with a fresh device.
 *
 * On success returns 0 and stores the newly allocated device path in
 * |*loopdev_ret|; the caller owns it. On failure returns -errno and leaves
 * |*loopdev_ret| untouched.
 */
static int loopdev_setup(char **loopdev_ret, const char *source)
{
	int ret = 0;
	int source_fd = -1;
	int control_fd = -1;
	int loop_fd = -1;
	char *loopdev = NULL;

	source_fd = open(source, O_RDONLY|O_CLOEXEC);
	if (source_fd < 0)
		goto error;
	control_fd = open(loopdev_ctl, O_RDWR|O_NOFOLLOW|O_CLOEXEC);
	if (control_fd < 0)
		goto error;

	while (1) {
		int num = ioctl(control_fd, LOOP_CTL_GET_FREE);
		if (num < 0)
			goto error;
		if (asprintf(&loopdev, "/dev/loop%i", num) < 0) {
			/* The pointer is indeterminate after a failure. */
			loopdev = NULL;
			errno = ENOMEM;
			goto error;
		}
		loop_fd = open(loopdev, O_RDONLY|O_NOFOLLOW|O_CLOEXEC);
		if (loop_fd < 0)
			goto error;
		if (ioctl(loop_fd, LOOP_SET_FD, source_fd) == 0)
			break;
		if (errno != EBUSY)
			goto error;

		/*
		 * Clean up resources for the next pass. Both handles are
		 * reset so that a failure in the next pass does not free or
		 * close them a second time.
		 */
		free(loopdev);
		loopdev = NULL;
		close(loop_fd);
		loop_fd = -1;
	}

	*loopdev_ret = loopdev;
	goto exit;

error:
	ret = -errno;
	free(loopdev);
exit:
	if (source_fd != -1)
		close(source_fd);
	if (control_fd != -1)
		close(control_fd);
	if (loop_fd != -1)
		close(loop_fd);
	return ret;
}
/*
 * Detach the specified loop device.
 * Returns 0 on success or -errno if the device cannot be opened or cleared.
 */
static int loopdev_detach(const char *loopdev)
{
	int status = 0;
	int loop_fd = open(loopdev, O_RDONLY|O_NOFOLLOW|O_CLOEXEC);

	if (loop_fd < 0)
		return -errno;

	if (ioctl(loop_fd, LOOP_CLR_FD) < 0)
		status = -errno;
	close(loop_fd);

	return status;
}
/*
 * Create a new device mapper target for the source.
 *
 * The target name is derived from |source| (with '/' replaced by '_'), and
 * every "@DEV@" placeholder in |verity_cmdline| is replaced by |source|. The
 * first three fields of the resulting line (start, size, target type) are
 * parsed out and the remainder is handed to device mapper as the target
 * parameters.
 *
 * On success returns 0 and stores newly allocated strings in |*dm_path_ret|
 * (device node path) and |*dm_name_ret| (device mapper name); the caller owns
 * both. On failure returns a negative errno-style value and leaves the
 * outputs untouched. The return value is never 0 on failure, including when
 * device mapper support is not compiled in (-ENOTSUP).
 */
static int dm_setup(char **dm_path_ret, char **dm_name_ret, const char *source,
		    const char *verity_cmdline)
{
#if USE_device_mapper
	static const char placeholder[] = "@DEV@";
	const size_t placeholder_len = sizeof(placeholder) - 1;
	int ret = 0;
	char *p;
	const char *scan;
	char *dm_path = NULL;
	char *dm_name = NULL;
	char *verity = NULL;
	struct dm_task *dmt = NULL;
	uint32_t cookie = 0;
	size_t source_len = strlen(source);
	size_t num_placeholders = 0;
	char ttype[20];
	unsigned long long start, size;
	int n = 0;

	errno = 0;

	/* Normalize the name into something unique-esque. */
	if (asprintf(&dm_name, "cros-containers-%s", source) < 0) {
		dm_name = NULL;
		errno = ENOMEM;
		goto error;
	}
	p = dm_name;
	while ((p = strchr(p, '/')) != NULL)
		*p++ = '_';

	/* Get the /dev path for the higher levels to mount. */
	if (asprintf(&dm_path, "%s%s", dm_dev_prefix, dm_name) < 0) {
		dm_path = NULL;
		errno = ENOMEM;
		goto error;
	}

	/*
	 * Insert the source path in the verity command line. The buffer is
	 * sized for the actual number of placeholders, and the scan resumes
	 * after each inserted copy so that a source containing "@DEV@"
	 * cannot cause an endless loop.
	 */
	for (scan = verity_cmdline;
	     (scan = strstr(scan, placeholder)) != NULL;
	     scan += placeholder_len)
		++num_placeholders;
	verity = malloc(strlen(verity_cmdline) +
			num_placeholders * source_len + 1);
	if (!verity) {
		errno = ENOMEM;
		goto error;
	}
	strcpy(verity, verity_cmdline);
	p = verity;
	while ((p = strstr(p, placeholder)) != NULL) {
		memmove(p + source_len, p + placeholder_len,
			strlen(p + placeholder_len) + 1);
		memcpy(p, source, source_len);
		p += source_len;
	}

	/* Extract the first three parameters for dm-verity settings. */
	if (sscanf(verity, "%llu %llu %10s %n", &start, &size, ttype, &n) != 3) {
		errno = EINVAL;
		goto error;
	}

	/* Finally create the device mapper. */
	dmt = dm_task_create(DM_DEVICE_CREATE);
	if (dmt == NULL)
		goto error;
	if (!dm_task_set_name(dmt, dm_name))
		goto error;
	if (!dm_task_set_ro(dmt))
		goto error;
	if (!dm_task_add_target(dmt, start, size, ttype, verity + n))
		goto error;
	if (!dm_task_set_cookie(dmt, &cookie, 0))
		goto error;
	if (!dm_task_run(dmt))
		goto error;

	/* Make sure the node exists before we continue. */
	dm_udev_wait(cookie);

	*dm_path_ret = dm_path;
	*dm_name_ret = dm_name;
	goto exit;

error:
	/* The dm library calls may fail without setting errno. */
	ret = errno ? -errno : -EIO;
	free(dm_name);
	free(dm_path);
exit:
	free(verity);
	if (dmt)
		dm_task_destroy(dmt);
	return ret;
#else
	(void)dm_path_ret;
	(void)dm_name_ret;
	(void)source;
	(void)verity_cmdline;
	/* The outputs cannot be produced without device mapper support. */
	return -ENOTSUP;
#endif
}
/*
 * Tear down the device mapper target.
 *
 * Returns 0 on success, or when device mapper support is not compiled in.
 * On failure returns a negative errno-style value that is never 0, even when
 * the dm library did not set errno.
 */
static int dm_detach(const char *dm_name)
{
	int ret = 0;
#if USE_device_mapper
	struct dm_task *dmt;

	errno = 0;
	dmt = dm_task_create(DM_DEVICE_REMOVE);
	if (dmt == NULL)
		goto error;
	if (!dm_task_set_name(dmt, dm_name))
		goto error;
	if (!dm_task_run(dmt))
		goto error;
	goto exit;

error:
	ret = errno ? -errno : -EIO;
exit:
	/* Only destroy a task that was actually created, as dm_setup() does. */
	if (dmt)
		dm_task_destroy(dmt);
#else
	(void)dm_name;
#endif
	return ret;
}
/*
 * Unmounts anything we mounted in this mount namespace in the opposite order
 * that they were mounted, then detaches the loop devices and removes the
 * device mapper targets recorded for the container.
 *
 * All tracking arrays are freed and their counters end at zero, so the
 * function can safely be called again. Cleanup continues past failures; the
 * return value is 0 if every step succeeded, otherwise the negative error
 * code of the last step that failed.
 */
static int unmount_external_mounts(struct container *c)
{
	int ret = 0;
	int rc;

	while (c->num_ext_mounts) {
		c->num_ext_mounts--;
		if (!c->ext_mounts[c->num_ext_mounts])
			continue;
		if (umount(c->ext_mounts[c->num_ext_mounts]))
			ret = -errno;
		FREE_AND_NULL(c->ext_mounts[c->num_ext_mounts]);
	}
	FREE_AND_NULL(c->ext_mounts);

	while (c->num_loopdevs) {
		c->num_loopdevs--;
		/* Use the helper's own code; errno may be stale by now. */
		rc = loopdev_detach(c->loopdevs[c->num_loopdevs]);
		if (rc)
			ret = rc;
		FREE_AND_NULL(c->loopdevs[c->num_loopdevs]);
	}
	FREE_AND_NULL(c->loopdevs);

	while (c->num_device_mappers) {
		c->num_device_mappers--;
		rc = dm_detach(c->device_mappers[c->num_device_mappers]);
		if (rc)
			ret = rc;
		FREE_AND_NULL(c->device_mappers[c->num_device_mappers]);
	}
	FREE_AND_NULL(c->device_mappers);

	return ret;
}
/*
 * Match mount_one in minijail, mount one mountpoint with
 * consideration for combination of MS_BIND/MS_RDONLY flag.
 *
 * 'bind' and 'ro' cannot both take effect in the initial bind mount, so a
 * read-only bind mount is performed in two steps: a plain bind mount
 * followed by a read-only remount.
 *
 * Returns 0 on success or -1 (with errno set by mount()) on failure.
 */
static int mount_external(const char *src, const char *dest, const char *type,
			  unsigned long flags, const void *data)
{
	const int bind_readonly = (flags & MS_BIND) && (flags & MS_RDONLY);
	unsigned long first_pass_flags = flags;

	if (bind_readonly)
		first_pass_flags &= ~MS_RDONLY;

	if (mount(src, dest, type, first_pass_flags, data) == -1)
		return -1;
	if (!bind_readonly)
		return 0;

	/* Second pass: remount with the original (read-only) flags. */
	if (mount(src, dest, NULL, flags | MS_REMOUNT, data) == -1)
		return -1;

	return 0;
}
/*
 * Performs a single configured mount for the container.
 *
 * Resolves the source path, optionally creates the destination, optionally
 * routes the source through a loop device and/or a dm-verity target, and then
 * either registers the mount with the jail (mount_in_ns) or mounts it right
 * away and records it for later unmounting.
 *
 * Returns 0 on success and a non-zero error code on failure (negative
 * errno-style in the common case). Every temporary string is released on all
 * paths.
 */
static int do_container_mount(struct container *c,
const struct container_config *config,
const struct container_mount *mnt)
{
char *dm_source = NULL;
char *loop_source = NULL;
char *source = NULL;
char *dest = NULL;
int rc = 0;
/* Absolute path of the mount point as seen from outside the container. */
if (asprintf(&dest, "%s%s", c->runfsroot, mnt->destination) < 0)
return -errno;
/*
 * If it's a bind mount relative to rootfs, append source to
 * rootfs path, otherwise source path is absolute.
 */
if ((mnt->flags & MS_BIND) && mnt->source[0] != '/') {
if (asprintf(&source, "%s/%s", c->runfsroot, mnt->source) < 0)
goto error_free_return;
} else if (mnt->loopback && mnt->source[0] != '/' && c->config_root) {
/* Relative loopback sources are looked up under the config root. */
if (asprintf(&source, "%s/%s", c->config_root, mnt->source) < 0)
goto error_free_return;
} else {
if (asprintf(&source, "%s", mnt->source) < 0)
goto error_free_return;
}
if (mnt->create) {
rc = setup_mount_destination(config, mnt, source, dest);
if (rc)
goto error_free_return;
}
if (mnt->loopback) {
/* Record this loopback file for cleanup later. */
loop_source = source;
source = NULL;
/* From here on |source| names the loop device, not the backing file. */
rc = loopdev_setup(&source, loop_source);
if (rc)
goto error_free_return;
/* Save this to cleanup when shutting down. */
rc = strdup_and_free(&c->loopdevs[c->num_loopdevs], source);
if (rc)
goto error_free_return;
c->num_loopdevs++;
}
if (mnt->verity) {
/* Set this device up via dm-verity. */
char *dm_name;
/* The previous source becomes the device backing the verity target. */
dm_source = source;
source = NULL;
rc = dm_setup(&source, &dm_name, dm_source, mnt->verity);
if (rc)
goto error_free_return;
/* Save this to cleanup when shutting down. */
rc = strdup_and_free(&c->device_mappers[c->num_device_mappers],
dm_name);
free(dm_name);
if (rc)
goto error_free_return;
c->num_device_mappers++;
}
if (mnt->mount_in_ns) {
/* We can mount this with minijail. */
rc = minijail_mount_with_data(c->jail, source, mnt->destination,
mnt->type, mnt->flags, mnt->data);
if (rc)
goto error_free_return;
} else {
/* Mount this externally and unmount it on exit. */
if (mount_external(source, dest, mnt->type, mnt->flags,
mnt->data))
goto error_free_return;
/* Save this to unmount when shutting down. */
rc = strdup_and_free(&c->ext_mounts[c->num_ext_mounts], dest);
if (rc)
goto error_free_return;
c->num_ext_mounts++;
}
goto exit;
error_free_return:
/* Failures that did not set |rc| themselves fall back to errno. */
if (!rc)
rc = -errno;
exit:
free(dm_source);
free(loop_source);
free(source);
free(dest);
return rc;
}
/*
 * Performs every mount listed in |config|.
 *
 * Anything recorded by an earlier run is unmounted first. The three tracking
 * arrays are then sized for the total number of mounts, which over-allocates
 * but guarantees room for every mount. If a mount fails, everything recorded
 * so far is undone again.
 *
 * Returns 0 on success, -errno if a tracking array cannot be allocated, or
 * the error of the first mount that failed.
 */
static int do_container_mounts(struct container *c,
			       const struct container_config *config)
{
	unsigned int idx = 0;
	int status;

	unmount_external_mounts(c);

	c->ext_mounts = calloc(config->num_mounts, sizeof(*c->ext_mounts));
	if (c->ext_mounts == NULL)
		return -errno;
	c->loopdevs = calloc(config->num_mounts, sizeof(*c->loopdevs));
	if (c->loopdevs == NULL)
		return -errno;
	c->device_mappers = calloc(config->num_mounts, sizeof(*c->device_mappers));
	if (c->device_mappers == NULL)
		return -errno;

	while (idx < config->num_mounts) {
		status = do_container_mount(c, config, &config->mounts[idx]);
		if (status) {
			unmount_external_mounts(c);
			return status;
		}
		++idx;
	}

	return 0;
}
/*
 * Creates the device node described by |dev| under the container's root,
 * using |minor| as the minor number, and applies the configured owner
 * (translated through the user namespace maps) and permissions.
 *
 * A node that already exists is accepted; its owner and mode are still
 * updated.
 *
 * Returns 0 on success, -EINVAL for an unknown device type, the error from
 * the id translation, or -errno from the first failing system call.
 */
static int container_create_device(const struct container *c,
				   const struct container_config *config,
				   const struct container_device *dev,
				   int minor)
{
	char *node_path = NULL;
	int status = 0;
	int node_mode;
	int outside_uid;
	int outside_gid;

	if (dev->type == 'b')
		node_mode = S_IFBLK;
	else if (dev->type == 'c')
		node_mode = S_IFCHR;
	else
		return -EINVAL;
	node_mode |= dev->fs_permissions;

	outside_uid = get_userns_outside_id(config->uid_map, dev->uid);
	if (outside_uid < 0)
		return outside_uid;
	outside_gid = get_userns_outside_id(config->gid_map, dev->gid);
	if (outside_gid < 0)
		return outside_gid;

	if (asprintf(&node_path, "%s%s", c->runfsroot, dev->path) < 0 ||
	    (mknod(node_path, node_mode, makedev(dev->major, minor)) &&
	     errno != EEXIST) ||
	    chown(node_path, outside_uid, outside_gid) ||
	    chmod(node_path, dev->fs_permissions))
		status = -errno;

	free(node_path);
	return status;
}
/*
 * Creates the container's run directory and bind-mounts the rootfs into it.
 *
 * A unique directory "<rundir>/<name>_XXXXXX" is created and handed to the
 * container's uid/gid (translated through the user namespace maps). The
 * rootfs is then bind-mounted on "<runfs>/root" and, when rootfs mount flags
 * are configured, remounted with them.
 *
 * Returns 0 on success or a negative errno-style value. On failure the paths
 * created so far stay recorded in |c| so that the caller's teardown can
 * remove them. |c->runfsroot| is reset to NULL when it could not be built,
 * so teardown never sees an indeterminate pointer.
 */
static int mount_runfs(struct container *c, const struct container_config *config)
{
	static const mode_t root_dir_mode = 0660;
	const char *rootfs = config->rootfs;
	char *runfs_template = NULL;
	int uid_userns, gid_userns;

	if (asprintf(&runfs_template, "%s/%s_XXXXXX", c->rundir, c->name) < 0)
		return -ENOMEM;

	/* On success mkdtemp() returns |runfs_template|, now owned by |c|. */
	c->runfs = mkdtemp(runfs_template);
	if (!c->runfs) {
		/* Capture errno before free() has a chance to change it. */
		int saved_errno = errno;

		free(runfs_template);
		return -saved_errno;
	}

	uid_userns = get_userns_outside_id(config->uid_map, config->uid);
	if (uid_userns < 0)
		return uid_userns;
	gid_userns = get_userns_outside_id(config->gid_map, config->gid);
	if (gid_userns < 0)
		return gid_userns;

	/* Make sure the container uid can access the rootfs. */
	if (chmod(c->runfs, 0700))
		return -errno;
	if (chown(c->runfs, uid_userns, gid_userns))
		return -errno;

	if (asprintf(&c->runfsroot, "%s/root", c->runfs) < 0) {
		c->runfsroot = NULL;
		return -ENOMEM;
	}
	if (mkdir(c->runfsroot, root_dir_mode))
		return -errno;
	if (chmod(c->runfsroot, root_dir_mode))
		return -errno;

	if (mount(rootfs, c->runfsroot, "", MS_BIND, NULL))
		return -errno;

	/* MS_BIND ignores any flags passed to it (except MS_REC). We need a
	 * second call to mount() to actually set them.
	 */
	if (config->rootfs_mount_flags &&
	    mount(rootfs, c->runfsroot, "",
		  config->rootfs_mount_flags, NULL)) {
		return -errno;
	}

	return 0;
}
/*
 * Applies the device configuration to a container.
 *
 * In order: denies all devices in the device cgroup, installs the configured
 * cgroup rules, creates the configured device nodes, and finally adds a
 * read-only allow rule for every loop device attached for the container.
 *
 * A device whose minor is copied from an existing node is skipped when that
 * node cannot be stat'ed; a device with a negative minor is skipped too.
 *
 * Returns 0 on success or the first error encountered.
 */
static int device_setup(struct container *c,
			const struct container_config *config)
{
	int status;
	size_t idx;

	c->cgroup->ops->deny_all_devices(c->cgroup);

	for (idx = 0; idx < config->num_cgroup_devices; idx++) {
		const struct container_cgroup_device *rule =
			config->cgroup_devices + idx;

		status = c->cgroup->ops->add_device(c->cgroup,
						    rule->allow,
						    rule->major,
						    rule->minor,
						    rule->read,
						    rule->write,
						    rule->modify,
						    rule->type);
		if (status)
			return status;
	}

	for (idx = 0; idx < config->num_devices; idx++) {
		const struct container_device *node = config->devices + idx;
		int node_minor = node->minor;

		if (node->copy_minor) {
			struct stat host_node;

			if (stat(node->path, &host_node) < 0)
				continue;
			node_minor = minor(host_node.st_rdev);
		}
		if (node_minor < 0)
			continue;

		status = container_create_device(c, config, node, node_minor);
		if (status)
			return status;
	}

	for (idx = 0; idx < c->num_loopdevs; ++idx) {
		struct stat loop_node;

		if (stat(c->loopdevs[idx], &loop_node) < 0)
			return -errno;
		status = c->cgroup->ops->add_device(c->cgroup, 1,
						    major(loop_node.st_rdev),
						    minor(loop_node.st_rdev),
						    1, 0, 0, 'b');
		if (status)
			return status;
	}

	return 0;
}
/*
 * Starts the container |c| as described by |config|.
 *
 * Steps, in order: prepare the root (premounted or freshly bind-mounted),
 * perform all configured mounts, create the cgroup, set up devices (only
 * when running as root), optionally run setfiles on the externally mounted
 * writable destinations, apply the CPU cgroup parameters, configure the jail
 * (pid file, namespaces, id maps, uid/gid, pivot root, cgroups, alternate
 * syscall table) and finally launch the program.
 *
 * Returns 0 on success. On failure everything set up so far is torn down and
 * a non-zero error code is returned (negative errno-style, or the wait
 * status of a failed setfiles run). A failure never results in a return
 * value of 0.
 */
int container_start(struct container *c, const struct container_config *config)
{
	int rc = 0;
	unsigned int i;
	int cgroup_uid, cgroup_gid;
	int outside_id;
	char **destinations;
	size_t num_destinations;

	if (!c)
		return -EINVAL;
	if (!config)
		return -EINVAL;
	if (!config->program_argv || !config->program_argv[0])
		return -EINVAL;
	/* Both maps are consulted unconditionally below. */
	if (!config->uid_map || !config->gid_map)
		return -EINVAL;

	if (config->config_root) {
		c->config_root = strdup(config->config_root);
		if (!c->config_root) {
			rc = -ENOMEM;
			goto error_rmdir;
		}
	}
	if (config->premounted_runfs) {
		c->runfs = NULL;
		c->runfsroot = strdup(config->premounted_runfs);
		if (!c->runfsroot) {
			rc = -ENOMEM;
			goto error_rmdir;
		}
	} else {
		rc = mount_runfs(c, config);
		if (rc)
			goto error_rmdir;
	}

	c->jail = minijail_new();
	if (!c->jail) {
		rc = -ENOMEM;
		goto error_rmdir;
	}

	rc = do_container_mounts(c, config);
	if (rc)
		goto error_rmdir;

	cgroup_uid = get_userns_outside_id(config->uid_map,
					   config->cgroup_owner);
	if (cgroup_uid < 0) {
		rc = cgroup_uid;
		goto error_rmdir;
	}
	cgroup_gid = get_userns_outside_id(config->gid_map,
					   config->cgroup_group);
	if (cgroup_gid < 0) {
		rc = cgroup_gid;
		goto error_rmdir;
	}
	c->cgroup = container_cgroup_new(c->name,
					 "/sys/fs/cgroup",
					 config->cgroup_parent,
					 cgroup_uid,
					 cgroup_gid);
	if (!c->cgroup)
		goto error_rmdir;

	/* Must be root to modify device cgroup or mknod */
	if (getuid() == 0) {
		rc = device_setup(c, config);
		if (rc)
			goto error_rmdir;
	}

	/* Potentially run setfiles on mounts configured outside of the jail */
	destinations = calloc(config->num_mounts, sizeof(char *));
	if (!destinations && config->num_mounts) {
		rc = -ENOMEM;
		goto error_rmdir;
	}
	num_destinations = 0;
	for (i = 0; i < config->num_mounts; i++) {
		const struct container_mount *mnt = &config->mounts[i];
		char* dest = mnt->destination;

		if (mnt->mount_in_ns)
			continue;
		if (mnt->flags & MS_RDONLY)
			continue;
		/* A hack to avoid setfiles on /data and /cache. */
		if (!strcmp(dest, "/data") || !strcmp(dest, "/cache"))
			continue;
		if (asprintf(&dest, "%s%s", c->runfsroot, mnt->destination) < 0) {
			size_t j;

			for (j = 0; j < num_destinations; ++j) {
				free(destinations[j]);
			}
			free(destinations);
			rc = -ENOMEM;
			goto error_rmdir;
		}
		destinations[num_destinations++] = dest;
	}
	if (num_destinations) {
		size_t j;

		rc = run_setfiles_command(c, config, destinations, num_destinations);
		for (j = 0; j < num_destinations; ++j) {
			free(destinations[j]);
		}
	}
	free(destinations);
	if (rc)
		goto error_rmdir;

	/* Setup CPU cgroup params. */
	if (config->cpu_cgparams.shares) {
		rc = c->cgroup->ops->set_cpu_shares(
				c->cgroup, config->cpu_cgparams.shares);
		if (rc)
			goto error_rmdir;
	}
	if (config->cpu_cgparams.period) {
		rc = c->cgroup->ops->set_cpu_quota(
				c->cgroup, config->cpu_cgparams.quota);
		if (rc)
			goto error_rmdir;
		rc = c->cgroup->ops->set_cpu_period(
				c->cgroup, config->cpu_cgparams.period);
		if (rc)
			goto error_rmdir;
	}
	if (config->cpu_cgparams.rt_period) {
		rc = c->cgroup->ops->set_cpu_rt_runtime(
				c->cgroup, config->cpu_cgparams.rt_runtime);
		if (rc)
			goto error_rmdir;
		rc = c->cgroup->ops->set_cpu_rt_period(
				c->cgroup, config->cpu_cgparams.rt_period);
		if (rc)
			goto error_rmdir;
	}

	/* Setup and start the container with libminijail. */
	if (config->pid_file_path) {
		c->pid_file_path = strdup(config->pid_file_path);
		if (!c->pid_file_path) {
			rc = -ENOMEM;
			goto error_rmdir;
		}
	} else if (c->runfs) {
		if (asprintf(&c->pid_file_path, "%s/container.pid", c->runfs) < 0) {
			/* The pointer is indeterminate after a failure. */
			c->pid_file_path = NULL;
			rc = -ENOMEM;
			goto error_rmdir;
		}
	}
	if (c->pid_file_path)
		minijail_write_pid_file(c->jail, c->pid_file_path);
	minijail_reset_signal_mask(c->jail);

	/* Setup container namespaces. */
	minijail_namespace_ipc(c->jail);
	minijail_namespace_vfs(c->jail);
	if (!config->share_host_netns)
		minijail_namespace_net(c->jail);
	minijail_namespace_pids(c->jail);
	minijail_namespace_user(c->jail);
	if (getuid() != 0)
		minijail_namespace_user_disable_setgroups(c->jail);
	minijail_namespace_cgroups(c->jail);
	rc = minijail_uidmap(c->jail, config->uid_map);
	if (rc)
		goto error_rmdir;
	rc = minijail_gidmap(c->jail, config->gid_map);
	if (rc)
		goto error_rmdir;

	/* Set the UID/GID inside the container if not 0. */
	outside_id = get_userns_outside_id(config->uid_map, config->uid);
	if (outside_id < 0) {
		rc = outside_id;
		goto error_rmdir;
	} else if (config->uid > 0) {
		minijail_change_uid(c->jail, config->uid);
	}
	outside_id = get_userns_outside_id(config->gid_map, config->gid);
	if (outside_id < 0) {
		rc = outside_id;
		goto error_rmdir;
	} else if (config->gid > 0) {
		minijail_change_gid(c->jail, config->gid);
	}

	rc = minijail_enter_pivot_root(c->jail, c->runfsroot);
	if (rc)
		goto error_rmdir;

	/* Add the cgroups configured above. */
	for (i = 0; i < NUM_CGROUP_TYPES; i++) {
		if (c->cgroup->cgroup_tasks_paths[i]) {
			rc = minijail_add_to_cgroup(c->jail,
					c->cgroup->cgroup_tasks_paths[i]);
			if (rc)
				goto error_rmdir;
		}
	}

	if (config->alt_syscall_table)
		minijail_use_alt_syscall(c->jail, config->alt_syscall_table);
	minijail_run_as_init(c->jail);
	/* TODO(dgreid) - remove this once shared mounts are cleaned up. */
	minijail_skip_remount_private(c->jail);
	if (!config->keep_fds_open)
		minijail_close_open_fds(c->jail);

	rc = minijail_run_pid_pipes_no_preload(c->jail,
					       config->program_argv[0],
					       config->program_argv,
					       &c->init_pid, NULL, NULL,
					       NULL);
	if (rc)
		goto error_rmdir;
	return 0;

error_rmdir:
	/* Fall back to errno, and never report success from this path. */
	if (!rc)
		rc = -errno;
	if (!rc)
		rc = -EINVAL;
	container_teardown(c);
	return rc;
}
/*
 * Returns the run directory of |c|; the string remains owned by |c|.
 * The value is NULL when the container has no run directory of its own.
 */
const char *container_root(struct container *c)
{
	const char *run_dir = c->runfs;

	return run_dir;
}
/* Returns the pid recorded for the container's init process. */
int container_pid(struct container *c)
{
	int init_pid = c->init_pid;

	return init_pid;
}
/*
 * Undoes the filesystem side effects of container_start().
 *
 * Unmounts external mounts, unmounts and removes the bind-mounted root (only
 * when the container created its own run directory), removes the pid file
 * and finally the run directory itself. Cleanup continues past failures.
 *
 * Returns 0 if every step after the external unmounts succeeded, otherwise
 * -errno of the last step that failed.
 */
static int container_teardown(struct container *c)
{
	int status = 0;

	unmount_external_mounts(c);

	if (c->runfsroot != NULL && c->runfs != NULL) {
		if (umount(c->runfsroot) != 0)
			status = -errno;
		if (rmdir(c->runfsroot) != 0)
			status = -errno;
		free(c->runfsroot);
		c->runfsroot = NULL;
	}

	if (c->pid_file_path != NULL) {
		if (unlink(c->pid_file_path) != 0)
			status = -errno;
		free(c->pid_file_path);
		c->pid_file_path = NULL;
	}

	if (c->runfs != NULL) {
		if (rmdir(c->runfs) != 0)
			status = -errno;
		free(c->runfs);
		c->runfs = NULL;
	}

	return status;
}
/*
 * Waits for the container's jail to exit, retrying when interrupted.
 *
 * Teardown runs once the wait succeeds, and also when the process had
 * already been reaped (-ECHILD); in both cases the teardown result is
 * returned. Any other wait error is returned unchanged without teardown.
 */
int container_wait(struct container *c)
{
	int wait_status = minijail_wait(c->jail);

	while (wait_status == -EINTR)
		wait_status = minijail_wait(c->jail);

	if (wait_status >= 0 || wait_status == -ECHILD)
		return container_teardown(c);

	return wait_status;
}
/*
 * Sends SIGKILL to the container's init process and waits for it.
 *
 * A process that no longer exists (ESRCH) is not treated as an error.
 * Returns -errno if the signal cannot be delivered for another reason,
 * otherwise the result of container_wait().
 */
int container_kill(struct container *c)
{
	const int kill_failed = kill(c->init_pid, SIGKILL) != 0;

	if (kill_failed && errno != ESRCH)
		return -errno;

	return container_wait(c);
}