| /* Copyright 2016 The Chromium OS Authors. All rights reserved. |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE file. |
| */ |
| |
| #define _GNU_SOURCE /* For asprintf */ |
| |
| #include <errno.h> |
| #include <fcntl.h> |
| #include <malloc.h> |
| #include <signal.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <sys/mount.h> |
| #include <sys/stat.h> |
| #include <sys/types.h> |
| #include <sys/wait.h> |
| #include <unistd.h> |
| |
| #include "container_cgroup.h" |
| #include "libcontainer.h" |
| #include "libminijail.h" |
| |
/*
 * Describes a single filesystem to mount for the container.
 *
 * The string members are heap-allocated copies that belong to the
 * container_config holding this entry.
 */
struct container_mount {
	char *name;		/* Caller-supplied label for this entry */
	char *source;		/* Mount source handed to mount() */
	char *destination;	/* Target path, relative to the container root */
	char *type;		/* Filesystem type handed to mount() */
	char *data;		/* Filesystem-specific options, may be NULL */
	int flags;		/* MS_* mount flags */
	int uid;		/* Owner applied when the target is created */
	int gid;		/* Group applied when the target is created */
	int mode;		/* Mode applied when the target is created */
	int mount_in_ns;	/* True if mount should happen in new vfs ns */
	int create;		/* True if target should be created if it doesn't exist */
};
| |
/*
 * Describes one device node to expose inside the container and the access
 * the container's device cgroup should grant to it.
 */
struct container_device {
	char type;		/* 'c' or 'b' for char or block */
	char *path;		/* Node path, relative to the container root */
	int fs_permissions;	/* Permission bits given to the node */
	int major;		/* Device major number */
	int minor;		/* Device minor number; negative skips node creation */
	int uid;		/* Owner of the node */
	int gid;		/* Group of the node */
	int read_allowed;	/* Passed to the cgroup add_device hook */
	int write_allowed;	/* Passed to the cgroup add_device hook */
	int modify_allowed;	/* Passed to the cgroup add_device hook */
};
| |
/*
 * Everything needed to describe how a container is run.
 *
 * rootfs            - Path to the root of the container's filesystem.
 * program_argv      - NULL-terminated program and arguments,
 *                     e.g. "/sbin/init".
 * num_args          - Number of entries in program_argv (terminator excluded).
 * uid_map           - Mapping of UIDs in the container, e.g. "0 100000 1024".
 * gid_map           - Mapping of GIDs in the container, e.g. "0 100000 1024".
 * alt_syscall_table - Name of the syscall table to use, or NULL for none.
 * mounts            - Filesystems to mount for the container.
 * num_mounts        - Number of entries in mounts.
 * devices           - Device nodes to create.
 * num_devices       - Number of entries in devices.
 * run_setfiles      - Path of the setfiles command to run on external mounts
 *                     to apply selinux labels, or NULL to skip it.  Borrowed
 *                     from the caller; it is not freed with the config.
 */
struct container_config {
	char *rootfs;
	char **program_argv;
	size_t num_args;
	char *uid_map;
	char *gid_map;
	char *alt_syscall_table;
	struct container_mount *mounts;
	size_t num_mounts;
	struct container_device *devices;
	size_t num_devices;
	const char *run_setfiles;
};
| |
| struct container_config *container_config_create() |
| { |
| return calloc(1, sizeof(struct container_config)); |
| } |
| |
| void container_config_destroy(struct container_config *c) |
| { |
| size_t i; |
| |
| if (c == NULL) |
| return; |
| free(c->rootfs); |
| for (i = 0; i < c->num_args; ++i) |
| free(c->program_argv[i]); |
| free(c->program_argv); |
| free(c->uid_map); |
| free(c->gid_map); |
| free(c->alt_syscall_table); |
| for (i = 0; i < c->num_mounts; ++i) { |
| free(c->mounts[i].name); |
| free(c->mounts[i].source); |
| free(c->mounts[i].destination); |
| free(c->mounts[i].type); |
| free(c->mounts[i].data); |
| } |
| free(c->mounts); |
| for (i = 0; i < c->num_devices; ++i) { |
| free(c->devices[i].path); |
| } |
| free(c->devices); |
| free(c); |
| } |
| |
| int container_config_rootfs(struct container_config *c, const char *rootfs) |
| { |
| c->rootfs = strdup(rootfs); |
| if (!c->rootfs) |
| return -ENOMEM; |
| return 0; |
| } |
| |
| int container_config_program_argv(struct container_config *c, |
| char **argv, size_t num_args) |
| { |
| size_t i; |
| |
| c->num_args = num_args; |
| c->program_argv = calloc(num_args + 1, sizeof(char *)); |
| if (!c->program_argv) |
| return -ENOMEM; |
| for (i = 0; i < num_args; ++i) { |
| c->program_argv[i] = strdup(argv[i]); |
| if (!c->program_argv[i]) |
| return -ENOMEM; |
| } |
| c->program_argv[num_args] = NULL; |
| return 0; |
| } |
| |
| int container_config_uid_map(struct container_config *c, const char *uid_map) |
| { |
| c->uid_map = strdup(uid_map); |
| if (!c->uid_map) |
| return -ENOMEM; |
| return 0; |
| } |
| |
| int container_config_gid_map(struct container_config *c, const char *gid_map) |
| { |
| c->gid_map = strdup(gid_map); |
| if (!c->gid_map) |
| return -ENOMEM; |
| return 0; |
| } |
| |
| int container_config_alt_syscall_table(struct container_config *c, |
| const char *alt_syscall_table) |
| { |
| c->alt_syscall_table = strdup(alt_syscall_table); |
| if (!c->alt_syscall_table) |
| return -ENOMEM; |
| return 0; |
| } |
| |
| int container_config_add_mount(struct container_config *c, |
| const char *name, |
| const char *source, |
| const char *destination, |
| const char *type, |
| const char *data, |
| int flags, |
| int uid, |
| int gid, |
| int mode, |
| int mount_in_ns, |
| int create) |
| { |
| struct container_mount *mount_ptr; |
| |
| if (name == NULL || source == NULL || |
| destination == NULL || type == NULL) |
| return -EINVAL; |
| |
| mount_ptr = realloc(c->mounts, |
| sizeof(c->mounts[0]) * (c->num_mounts + 1)); |
| if (!mount_ptr) |
| return -ENOMEM; |
| c->mounts = mount_ptr; |
| c->mounts[c->num_mounts].name = strdup(name); |
| if (!c->mounts[c->num_mounts].name) |
| return -ENOMEM; |
| c->mounts[c->num_mounts].source = strdup(source); |
| if (!c->mounts[c->num_mounts].source) |
| return -ENOMEM; |
| c->mounts[c->num_mounts].destination = strdup(destination); |
| if (!c->mounts[c->num_mounts].destination) |
| return -ENOMEM; |
| c->mounts[c->num_mounts].type = strdup(type); |
| if (!c->mounts[c->num_mounts].type) |
| return -ENOMEM; |
| if (data) { |
| c->mounts[c->num_mounts].data = strdup(data); |
| if (!c->mounts[c->num_mounts].data) |
| return -ENOMEM; |
| } else { |
| c->mounts[c->num_mounts].data = NULL; |
| } |
| c->mounts[c->num_mounts].flags = flags; |
| c->mounts[c->num_mounts].uid = uid; |
| c->mounts[c->num_mounts].gid = gid; |
| c->mounts[c->num_mounts].mode = mode; |
| c->mounts[c->num_mounts].mount_in_ns = mount_in_ns; |
| c->mounts[c->num_mounts].create = create; |
| ++c->num_mounts; |
| return 0; |
| } |
| |
| int container_config_add_device(struct container_config *c, |
| char type, |
| const char *path, |
| int fs_permissions, |
| int major, |
| int minor, |
| int uid, |
| int gid, |
| int read_allowed, |
| int write_allowed, |
| int modify_allowed) |
| { |
| struct container_device *dev_ptr; |
| |
| if (path == NULL) |
| return -EINVAL; |
| dev_ptr = realloc(c->devices, |
| sizeof(c->devices[0]) * (c->num_devices + 1)); |
| if (!dev_ptr) |
| return -ENOMEM; |
| c->devices = dev_ptr; |
| c->devices[c->num_devices].type = type; |
| c->devices[c->num_devices].path = strdup(path); |
| if (!c->devices[c->num_devices].path) |
| return -ENOMEM; |
| c->devices[c->num_devices].fs_permissions = fs_permissions; |
| c->devices[c->num_devices].major = major; |
| c->devices[c->num_devices].minor = minor; |
| c->devices[c->num_devices].uid = uid; |
| c->devices[c->num_devices].gid = gid; |
| c->devices[c->num_devices].read_allowed = read_allowed; |
| c->devices[c->num_devices].write_allowed = write_allowed; |
| c->devices[c->num_devices].modify_allowed = modify_allowed; |
| ++c->num_devices; |
| return 0; |
| } |
| |
| void container_config_run_setfiles(struct container_config *c, |
| const char *setfiles_cmd) |
| { |
| c->run_setfiles = setfiles_cmd; |
| } |
| |
/*
 * Container manipulation
 *
 * Runtime state of a single container instance.
 */
struct container {
	struct container_config *config;	/* Freed by container_destroy() */
	struct container_cgroup *cgroup;	/* Cgroups backing the container */
	struct minijail *jail;			/* Jail created by container_start() */
	pid_t init_pid;				/* PID of the container's init process */
	char *runfs;				/* Temp dir created under rundir */
	char *rundir;				/* Copy of the caller's run directory */
	char *runfsroot;			/* "<runfs>/root", the bind-mounted rootfs */
	char *pid_file_path;			/* "<runfs>/container.pid" */
	const char *name;			/* Borrowed from the caller, not copied */
};
| |
| struct container *container_new(const char *name, |
| const char *rundir, |
| struct container_config *config) |
| { |
| struct container *c; |
| |
| if (!config) |
| return NULL; |
| if (!config->program_argv || !config->program_argv[0]) |
| return NULL; |
| |
| c = calloc(1, sizeof(*c)); |
| if (!c) |
| return NULL; |
| c->name = name; |
| c->config = config; |
| c->cgroup = container_cgroup_new(name, "/sys/fs/cgroup"); |
| c->rundir = strdup(rundir); |
| if (!c->rundir) { |
| free(c); |
| return NULL; |
| } |
| return c; |
| } |
| |
| void container_destroy(struct container *c) |
| { |
| container_config_destroy(c->config); |
| container_cgroup_destroy(c->cgroup); |
| free(c->rundir); |
| free(c); |
| } |
| |
/*
 * Creates the directory |path|, then applies |mode| and the given ownership.
 *
 * The explicit chmod makes the final mode independent of the umask.
 *
 * Returns 0 on success or -errno from the first step that fails.
 */
static int make_dir(const char *path, int uid, int gid, int mode)
{
	/* Short-circuit evaluation keeps the create/chmod/chown order. */
	if (mkdir(path, mode) != 0 ||
	    chmod(path, mode) != 0 ||
	    chown(path, uid, gid) != 0)
		return -errno;

	return 0;
}
| |
/*
 * Creates the file |path| if it does not exist and sets its ownership.
 *
 * path     - File to create or open.
 * uid, gid - Ownership applied to the file.
 * mode     - Mode used if the file has to be created.
 *
 * Returns 0 on success or -errno if the open or the ownership change fails.
 */
static int touch_file(const char *path, int uid, int gid, int mode)
{
	int rc = 0;
	int fd = open(path, O_RDWR | O_CREAT, mode);

	if (fd < 0)
		return -errno;
	/* Capture errno before close(), which may overwrite it. */
	if (fchown(fd, uid, gid))
		rc = -errno;
	close(fd);
	return rc;
}
| |
| /* Make sure the mount target exists in the new rootfs. Create if needed and |
| * possible. |
| */ |
| static int setup_mount_destination(const struct container_mount *mnt, |
| const char *dest) |
| { |
| int rc; |
| struct stat st_buf; |
| |
| rc = stat(dest, &st_buf); |
| if (rc == 0) /* destination exists */ |
| return 0; |
| |
| /* Try to create the destination. Either make directory or touch a file |
| * depending on the source type. |
| */ |
| rc = stat(mnt->source, &st_buf); |
| if (rc || S_ISDIR(st_buf.st_mode) || S_ISBLK(st_buf.st_mode)) |
| return make_dir(dest, mnt->uid, mnt->gid, mnt->mode); |
| |
| return touch_file(dest, mnt->uid, mnt->gid, mnt->mode); |
| } |
| |
| /* Fork and exec the setfiles command to configure the selinux policy. */ |
| static int run_setfiles_command(const struct container *c, const char *dest) |
| { |
| int rc; |
| int status; |
| int pid; |
| char *context_path; |
| |
| if (!c->config->run_setfiles) |
| return 0; |
| |
| if (asprintf(&context_path, "%s/file_contexts", |
| c->runfsroot) < 0) |
| return -errno; |
| |
| pid = fork(); |
| if (pid == 0) { |
| const char *argv[] = { |
| c->config->run_setfiles, |
| "-r", |
| c->runfsroot, |
| context_path, |
| dest, |
| NULL, |
| }; |
| const char *env[] = { |
| NULL, |
| }; |
| |
| execve(argv[0], (char *const*)argv, (char *const*)env); |
| |
| /* Command failed to exec if execve returns. */ |
| _exit(-errno); |
| } |
| free(context_path); |
| if (pid < 0) |
| return -errno; |
| do { |
| rc = waitpid(pid, &status, 0); |
| } while (rc == -1 && errno == EINTR); |
| if (rc < 0) |
| return -errno; |
| return status; |
| } |
| |
| int container_start(struct container *c) |
| { |
| int rc; |
| unsigned int i; |
| const char *rootfs = c->config->rootfs; |
| char *runfs_template; |
| |
| if (asprintf(&runfs_template, "%s/%s_XXXXXX", c->rundir, c->name) < 0) |
| return -errno; |
| |
| c->runfs = mkdtemp(runfs_template); |
| if (!c->runfs) { |
| free(runfs_template); |
| return -errno; |
| } |
| if (asprintf(&c->runfsroot, "%s/root", c->runfs) < 0) { |
| free(runfs_template); |
| return -errno; |
| } |
| |
| rc = mkdir(c->runfsroot, 0660); |
| if (rc) |
| goto error_rmdir; |
| |
| rc = mount(rootfs, c->runfsroot, "", MS_BIND | MS_RDONLY | MS_NOEXEC, |
| NULL); |
| if (rc) |
| goto error_rmdir; |
| |
| c->jail = minijail_new(); |
| |
| for (i = 0; i < c->config->num_mounts; ++i) { |
| const struct container_mount *mnt = &c->config->mounts[i]; |
| char *dest; |
| |
| if (asprintf(&dest, "%s%s", c->runfsroot, mnt->destination) < 0) |
| goto error_rmdir; |
| |
| if (mnt->create) { |
| rc = setup_mount_destination(mnt, dest); |
| if (rc) { |
| free(dest); |
| goto error_rmdir; |
| } |
| } |
| if (mnt->mount_in_ns) { |
| /* |
| * We can mount this with minijail. |
| * If relative to rootfs, append source to rootfs. |
| */ |
| char *tmpsrc = NULL; |
| if ((mnt->flags & MS_BIND) && mnt->source[0] != '/') { |
| if (asprintf(&tmpsrc, "%s/%s", c->runfsroot, |
| mnt->source) < 0) { |
| free(dest); |
| goto error_rmdir; |
| } |
| } |
| rc = minijail_mount(c->jail, |
| tmpsrc ? tmpsrc : mnt->source, |
| mnt->destination, mnt->type, |
| mnt->flags); |
| free(tmpsrc); |
| if (rc) { |
| free(dest); |
| goto error_rmdir; |
| } |
| } else { |
| /* |
| * Mount this externally and unmount it on exit. Don't |
| * allow execution from external mounts. |
| */ |
| rc = mount(mnt->source, dest, mnt->type, |
| mnt->flags | MS_NOEXEC, mnt->data); |
| if (rc) { |
| free(dest); |
| goto error_rmdir; |
| } |
| |
| rc = run_setfiles_command(c, dest); |
| if (rc) { |
| free(dest); |
| goto error_rmdir; |
| } |
| |
| } |
| free(dest); |
| } |
| |
| c->cgroup->ops->deny_all_devices(c->cgroup); |
| |
| for (i = 0; i < c->config->num_devices; i++) { |
| const struct container_device *dev = &c->config->devices[i]; |
| int mode; |
| char *path; |
| |
| switch (dev->type) { |
| case 'b': |
| mode = S_IFBLK; |
| break; |
| case 'c': |
| mode = S_IFCHR; |
| break; |
| default: |
| goto error_rmdir; |
| } |
| mode |= dev->fs_permissions; |
| |
| if (asprintf(&path, "%s%s", c->runfsroot, dev->path) < 0) |
| goto error_rmdir; |
| if (dev->minor >= 0) { |
| rc = mknod(path, mode, makedev(dev->major, dev->minor)); |
| if (rc && errno != EEXIST) { |
| free(path); |
| goto error_rmdir; |
| } |
| rc = chown(path, dev->uid, dev->gid); |
| if (rc) { |
| free(path); |
| goto error_rmdir; |
| } |
| rc = chmod(path, dev->fs_permissions); |
| free(path); |
| if (rc) |
| goto error_rmdir; |
| } |
| |
| rc = c->cgroup->ops->add_device(c->cgroup, dev->major, |
| dev->minor, dev->read_allowed, |
| dev->write_allowed, |
| dev->modify_allowed, dev->type); |
| if (rc) |
| goto error_rmdir; |
| } |
| |
| /* Setup and start the container with libminijail. */ |
| if (asprintf(&c->pid_file_path, "%s/container.pid", c->runfs) < 0) |
| goto error_rmdir; |
| minijail_write_pid_file(c->jail, c->pid_file_path); |
| minijail_reset_signal_mask(c->jail); |
| |
| /* Setup container namespaces. */ |
| minijail_namespace_ipc(c->jail); |
| minijail_namespace_vfs(c->jail); |
| minijail_namespace_net(c->jail); |
| minijail_namespace_pids(c->jail); |
| /* TODO(dgreid) - Enable user namespaces |
| minijail_namespace_user(c->jail); |
| rc = minijail_uidmap(c->jail, c->config->uid_map); |
| if (rc) |
| goto error_rmdir; |
| rc = minijail_gidmap(c->jail, c->config->gid_map); |
| if (rc) |
| goto error_rmdir; |
| */ |
| |
| rc = minijail_enter_pivot_root(c->jail, c->runfsroot); |
| if (rc) |
| goto error_rmdir; |
| |
| /* Add the cgroups configured above. */ |
| rc = minijail_add_to_cgroup(c->jail, cgroup_cpu_tasks_path(c->cgroup)); |
| if (rc) |
| goto error_rmdir; |
| rc = minijail_add_to_cgroup(c->jail, |
| cgroup_cpuacct_tasks_path(c->cgroup)); |
| if (rc) |
| goto error_rmdir; |
| rc = minijail_add_to_cgroup(c->jail, |
| cgroup_devices_tasks_path(c->cgroup)); |
| if (rc) |
| goto error_rmdir; |
| rc = minijail_add_to_cgroup(c->jail, |
| cgroup_freezer_tasks_path(c->cgroup)); |
| if (rc) |
| goto error_rmdir; |
| |
| if (c->config->alt_syscall_table) |
| minijail_use_alt_syscall(c->jail, c->config->alt_syscall_table); |
| |
| minijail_run_as_init(c->jail); |
| |
| /* Last mount is to make '/' executable in the container. */ |
| rc = minijail_mount(c->jail, rootfs, "/", "", |
| MS_REMOUNT | MS_RDONLY); |
| if (rc) |
| goto error_rmdir; |
| |
| rc = minijail_run_pid_pipes_no_preload(c->jail, |
| c->config->program_argv[0], |
| c->config->program_argv, |
| &c->init_pid, NULL, NULL, |
| NULL); |
| if (rc) |
| goto error_rmdir; |
| return 0; |
| |
| error_rmdir: |
| umount(c->runfsroot); |
| rmdir(c->runfsroot); |
| unlink(c->pid_file_path); |
| free(c->pid_file_path); |
| rmdir(c->runfs); |
| free(c->runfsroot); |
| free(c->runfs); |
| return rc; |
| } |
| |
| const char *container_root(struct container *c) |
| { |
| return c->runfs; |
| } |
| |
| int container_pid(struct container *c) |
| { |
| return c->init_pid; |
| } |
| |
| static int container_teardown(struct container *c) |
| { |
| int i; |
| int ret = 0; |
| |
| /* |
| * Unmount anything we mounted in this mount namespace in the opposite |
| * order that they were mounted. |
| */ |
| for (i = (int)c->config->num_mounts - 1; i >= 0; --i) { |
| const struct container_mount *mnt = &c->config->mounts[i]; |
| char *dest; |
| |
| if (mnt->mount_in_ns) |
| continue; |
| if (asprintf(&dest, "%s%s", c->runfsroot, mnt->destination) < 0) |
| continue; |
| if (umount(dest)) |
| ret = -errno; |
| free(dest); |
| } |
| if (umount(c->runfsroot)) |
| ret = -errno; |
| if (rmdir(c->runfsroot)) |
| ret = -errno; |
| if (unlink(c->pid_file_path)) |
| ret = -errno; |
| if (rmdir(c->runfs)) |
| ret = -errno; |
| free(c->pid_file_path); |
| free(c->runfsroot); |
| free(c->runfs); |
| return ret; |
| } |
| |
| int container_wait(struct container *c) |
| { |
| int rc; |
| |
| do { |
| rc = minijail_wait(c->jail); |
| } while (rc == -1 && errno == EINTR); |
| |
| if (rc == 0) |
| rc = container_teardown(c); |
| return rc; |
| } |
| |
| int container_kill(struct container *c) |
| { |
| int rc; |
| |
| rc = kill(c->init_pid, SIGKILL); |
| if (rc) |
| return -errno; |
| return container_wait(c); |
| } |