blob: fab073b8ee17cd0ea4aa498860345174d3577f13 [file] [log] [blame]
// Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "libcontainer/libcontainer_util.h"
#include <errno.h>
#include <fcntl.h>
#if USE_device_mapper
#include <libdevmapper.h>
#endif
#include <linux/loop.h>
#include <sched.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <unistd.h>
#include <memory>
#include <utility>
#include <vector>
#include <base/bind.h>
#include <base/bind_helpers.h>
#include <base/callback_helpers.h>
#include <base/files/file_util.h>
#include <base/files/scoped_file.h>
#include <base/logging.h>
#include <base/macros.h>
#include <base/posix/eintr_wrapper.h>
#include <base/strings/string_number_conversions.h>
#include <base/strings/string_split.h>
#include <base/strings/string_util.h>
#include <base/strings/stringprintf.h>
// New cgroup namespace might not be in linux-headers yet.
#ifndef CLONE_NEWCGROUP
#define CLONE_NEWCGROUP 0x02000000
#endif
namespace libcontainer {
namespace {
constexpr char kLoopdevCtlPath[] = "/dev/loop-control";
#if USE_device_mapper
constexpr char kDevMapperPath[] = "/dev/mapper/";
#endif
// Maps a CLONE_NEW* flag to the name of the matching entry under
// /proc/<pid>/ns/. Returns an empty string when |nstype| is not one of the
// known namespace flags.
std::string GetNamespaceNameForType(int nstype) {
  struct NamespaceEntry {
    int type;
    const char* name;
  };
  static constexpr NamespaceEntry kKnownNamespaces[] = {
      {CLONE_NEWCGROUP, "cgroup"}, {CLONE_NEWIPC, "ipc"},
      {CLONE_NEWNET, "net"},       {CLONE_NEWNS, "mnt"},
      {CLONE_NEWPID, "pid"},       {CLONE_NEWUSER, "user"},
      {CLONE_NEWUTS, "uts"},
  };

  for (const auto& entry : kKnownNamespaces) {
    if (entry.type == nstype)
      return entry.name;
  }
  return std::string();
}
// Helper function that runs |callback| in all the namespaces identified by
// |nstypes|.
//
// A child process is forked; it joins every requested namespace of
// |container_pid| (in the order given by |nstypes|) and then runs |callback|.
// The parent waits for the child and returns true only if the child exited
// normally with status 0, i.e. |callback| returned true.
bool RunInNamespacesHelper(HookCallback callback,
                           std::vector<int> nstypes,
                           pid_t container_pid) {
  pid_t child = fork();
  if (child < 0) {
    PLOG(ERROR) << "Failed to fork()";
    return false;
  }

  if (child == 0) {
    struct NamespaceHandle {
      int nstype;
      std::string name;
      base::ScopedFD fd;
    };

    // Open every namespace file *before* entering any namespace. Once the
    // mount (or PID) namespace has been joined, /proc/<container_pid>/ns/ may
    // no longer resolve to the container's entries from this process' point
    // of view, so the paths must be resolved while still in the original
    // namespaces.
    std::vector<NamespaceHandle> handles;
    handles.reserve(nstypes.size());
    for (const int nstype : nstypes) {
      std::string nstype_name = GetNamespaceNameForType(nstype);
      if (nstype_name.empty()) {
        LOG(ERROR) << "Invalid namespace type " << nstype;
        _exit(-1);
      }
      base::FilePath ns_path = base::FilePath(base::StringPrintf(
          "/proc/%d/ns/%s", container_pid, nstype_name.c_str()));
      base::ScopedFD ns_fd(open(ns_path.value().c_str(), O_RDONLY));
      if (!ns_fd.is_valid()) {
        PLOG(ERROR) << "Failed to open " << ns_path.value();
        _exit(-1);
      }
      handles.push_back(
          NamespaceHandle{nstype, std::move(nstype_name), std::move(ns_fd)});
    }

    for (const auto& handle : handles) {
      if (setns(handle.fd.get(), handle.nstype)) {
        PLOG(ERROR) << "Failed to enter PID " << container_pid << "'s "
                    << handle.name << " namespace";
        _exit(-1);
      }
    }
    // Close the namespace fds so that they are not visible to |callback|.
    handles.clear();

    // Preserve normal POSIX semantics of calling exit(2) with 0 for success and
    // non-zero for failure.
    _exit(callback.Run(container_pid) ? 0 : 1);
  }

  int status;
  if (HANDLE_EINTR(waitpid(child, &status, 0)) < 0) {
    PLOG(ERROR) << "Failed to wait for callback";
    return false;
  }
  if (!WIFEXITED(status)) {
    LOG(ERROR) << "Callback terminated abnormally: " << std::hex << status;
    return false;
  }
  return static_cast<int8_t>(WEXITSTATUS(status)) == 0;
}
// Helper function that runs a program execve(2)-style.
//
// Forks a child that optionally redirects stdin/stdout/stderr to the provided
// file descriptors (only those that are valid) and then executes |filename|
// with |args| as its argv and the current environment. Any argument that is
// exactly "$PID" is replaced with the decimal value of |container_pid|.
//
// Returns true only if the child exited normally with status 0.
bool ExecveCallbackHelper(base::FilePath filename,
                          std::vector<std::string> args,
                          base::ScopedFD stdin_fd,
                          base::ScopedFD stdout_fd,
                          base::ScopedFD stderr_fd,
                          pid_t container_pid) {
  pid_t child = fork();
  if (child < 0) {
    PLOG(ERROR) << "Failed to fork()";
    return false;
  }

  if (child == 0) {
    if (stdin_fd.is_valid()) {
      if (dup2(stdin_fd.get(), STDIN_FILENO) == -1) {
        PLOG(ERROR) << "Failed to dup2() stdin fd";
        _exit(-1);
      }
    }
    if (stdout_fd.is_valid()) {
      if (dup2(stdout_fd.get(), STDOUT_FILENO) == -1) {
        PLOG(ERROR) << "Failed to dup2() stdout fd";
        _exit(-1);
      }
    }
    if (stderr_fd.is_valid()) {
      if (dup2(stderr_fd.get(), STDERR_FILENO) == -1) {
        PLOG(ERROR) << "Failed to dup2() stderr fd";
        _exit(-1);
      }
    }

    // |pid_str| must outlive |argv|, which only stores borrowed pointers.
    std::string pid_str = base::NumberToString(container_pid);
    std::vector<const char*> argv;
    argv.reserve(args.size() + 1);
    for (const auto& arg : args) {
      if (arg == "$PID") {
        argv.emplace_back(pid_str.c_str());
        continue;
      }
      argv.emplace_back(arg.c_str());
    }
    // execve(2) requires a null-terminated argument vector.
    argv.emplace_back(nullptr);

    execve(filename.value().c_str(), const_cast<char**>(argv.data()), environ);

    // Only happens when execve(2) fails. Report why instead of exiting
    // silently, so that a missing or non-executable hook can be diagnosed.
    PLOG(ERROR) << "Failed to execve() " << filename.value();
    _exit(-1);
  }

  int status;
  if (HANDLE_EINTR(waitpid(child, &status, 0)) < 0) {
    PLOG(ERROR) << "Failed to wait for hook";
    return false;
  }
  if (!WIFEXITED(status)) {
    LOG(ERROR) << "Hook terminated abnormally: " << std::hex << status;
    return false;
  }
  return static_cast<int8_t>(WEXITSTATUS(status)) == 0;
}
// Asks the loop-control device behind |control_fd| to remove loop device
// number |device|. A failure is only logged; nothing is reported to the
// caller.
void RemoveLoopDevice(int control_fd, int32_t device) {
  const int rc = ioctl(control_fd, LOOP_CTL_REMOVE, device);
  if (rc >= 0)
    return;
  PLOG(ERROR) << "Failed to free /dev/loop" << device;
}
} // namespace
// Creates the underlying close-on-exec pipe. Failing to create it is logged
// at FATAL severity.
WaitablePipe::WaitablePipe() {
  const int rc = pipe2(pipe_fds, O_CLOEXEC);
  if (rc < 0) {
    PLOG(FATAL) << "Failed to create pipe";
  }
}
// Closes whichever ends of the pipe are still open (-1 marks a closed end).
WaitablePipe::~WaitablePipe() {
  for (const int fd : pipe_fds) {
    if (fd == -1)
      continue;
    close(fd);
  }
}
// Takes over both descriptors of |other|, leaving |other| with both ends
// marked as closed (-1).
WaitablePipe::WaitablePipe(WaitablePipe&& other) {
  for (size_t end = 0; end < 2; ++end) {
    pipe_fds[end] = other.pipe_fds[end];
    other.pipe_fds[end] = -1;
  }
}
// Blocks until the read end of the pipe returns from read(2) (data, EOF or a
// non-EINTR error). Both ends are closed and marked as such afterwards.
void WaitablePipe::Wait() {
  // Drop our own write end first; otherwise the read below could never see
  // EOF.
  close(pipe_fds[1]);

  char unused_byte;
  HANDLE_EINTR(read(pipe_fds[0], &unused_byte, sizeof(unused_byte)));

  close(pipe_fds[0]);
  pipe_fds[0] = -1;
  pipe_fds[1] = -1;
}
// Closes both ends of the pipe held by this object and marks them as closed.
void WaitablePipe::Signal() {
  for (int& fd : pipe_fds)
    close(fd);
  for (int& fd : pipe_fds)
    fd = -1;
}
// HookState's default constructor, destructor and move constructor all use the
// compiler-generated implementations; they are defined out-of-line here.
HookState::HookState() = default;
HookState::~HookState() = default;
HookState::HookState(HookState&& state) = default;
bool HookState::InstallHook(struct minijail* j, minijail_hook_event_t event) {
if (installed_) {
LOG(ERROR) << "Failed to install hook: already installed";
return false;
}
// All these fds will be closed in WaitHook in the child process.
for (size_t i = 0; i < 2; ++i) {
if (minijail_preserve_fd(j, reached_pipe_.pipe_fds[i],
reached_pipe_.pipe_fds[i]) != 0) {
LOG(ERROR) << "Failed to preserve reached pipe FDs to install hook";
return false;
}
if (minijail_preserve_fd(j, ready_pipe_.pipe_fds[i],
ready_pipe_.pipe_fds[i]) != 0) {
LOG(ERROR) << "Failed to preserve ready pipe FDs to install hook";
return false;
}
}
if (minijail_add_hook(j, &HookState::WaitHook, this, event) != 0) {
LOG(ERROR) << "Failed to add hook";
return false;
}
installed_ = true;
return true;
}
// Waits on the "reached" pipe, then runs every entry of |callbacks| in order
// with |container_pid|. Stops and returns false at the first callback that
// fails (the "ready" pipe is not signalled in that case). Once all callbacks
// succeed, the "ready" pipe is signalled and true is returned.
bool HookState::WaitForHookAndRun(const std::vector<HookCallback>& callbacks,
                                  pid_t container_pid) {
  if (!installed_) {
    LOG(ERROR) << "Failed to wait for hook: not installed";
    return false;
  }

  reached_pipe_.Wait();

  for (const auto& hook : callbacks) {
    if (!hook.Run(container_pid))
      return false;
  }

  ready_pipe_.Signal();
  return true;
}
// static
// Hook entry point registered with minijail. |payload| is the HookState that
// installed the hook: signal its "reached" pipe, then wait on its "ready"
// pipe. Always returns 0.
int HookState::WaitHook(void* payload) {
  auto* const state = static_cast<HookState*>(payload);
  state->reached_pipe_.Signal();
  state->ready_pipe_.Wait();
  return 0;
}
// Translates |id|, as seen inside a user namespace, to the corresponding id
// outside of it.
//
// |map| is a comma-separated list of mappings, each of the form
// "<inside> <outside> <length>", meaning that the |length| ids starting at
// |inside| map to the ids starting at |outside|. An empty |map| is treated as
// the identity mapping.
//
// On success returns true and, if |id_out| is non-null, stores the translated
// id in it. Returns false if |map| is malformed or does not cover |id|.
bool GetUsernsOutsideId(const std::string& map, int id, int* id_out) {
  if (map.empty()) {
    if (id_out)
      *id_out = id;
    return true;
  }

  // Ids are compared as unsigned 32-bit values, like the parsed map entries.
  const uint32_t wanted = static_cast<uint32_t>(id);

  // The pieces reference |map| directly; it outlives the loop.
  base::StringPiece map_piece(map);
  for (const auto& mapping : base::SplitStringPiece(
           map_piece, ",", base::KEEP_WHITESPACE, base::SPLIT_WANT_ALL)) {
    std::vector<base::StringPiece> tokens = base::SplitStringPiece(
        mapping, " ", base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
    if (tokens.size() != 3) {
      LOG(ERROR) << "Malformed ugid mapping: '" << mapping << "'";
      return false;
    }

    uint32_t inside, outside, length;
    if (!base::StringToUint(tokens[0], &inside) ||
        !base::StringToUint(tokens[1], &outside) ||
        !base::StringToUint(tokens[2], &length)) {
      LOG(ERROR) << "Malformed ugid mapping: '" << mapping << "'";
      return false;
    }

    // The mapping covers the half-open range [inside, inside + length).
    // Comparing the offset against |length| avoids both the off-by-one of an
    // inclusive upper bound and overflow of |inside + length|.
    if (wanted >= inside && wanted - inside < length) {
      if (id_out)
        *id_out = static_cast<int>((wanted - inside) + outside);
      return true;
    }
  }

  VLOG(1) << "ugid " << id << " not found in mapping";
  return false;
}
// Creates the directory |path|, then explicitly sets its permission bits to
// |mode| and its owner to |uid|:|gid|. Logs and returns false at the first
// step that fails.
bool MakeDir(const base::FilePath& path, int uid, int gid, int mode) {
  const char* const dir = path.value().c_str();

  if (mkdir(dir, mode) != 0) {
    PLOG(ERROR) << "Failed to mkdir " << path.value();
    return false;
  }
  if (chmod(dir, mode) != 0) {
    PLOG(ERROR) << "Failed to chmod " << path.value();
    return false;
  }
  if (chown(dir, uid, gid) != 0) {
    PLOG(ERROR) << "Failed to chown " << path.value();
    return false;
  }
  return true;
}
// Opens |path| read-write, creating it with |mode| if it does not exist, and
// changes its owner to |uid|:|gid|. Logs and returns false on failure.
bool TouchFile(const base::FilePath& path, int uid, int gid, int mode) {
  const int open_flags = O_RDWR | O_CREAT;
  base::ScopedFD file(open(path.value().c_str(), open_flags, mode));
  if (!file.is_valid()) {
    PLOG(ERROR) << "Failed to create " << path.value();
    return false;
  }

  const int chown_result = fchown(file.get(), uid, gid);
  if (chown_result != 0) {
    PLOG(ERROR) << "Failed to chown " << path.value();
    return false;
  }
  return true;
}
// Attaches the file |source| to a free loop device and marks the device as
// autoclear.
//
// On success returns true and fills |loopdev_out| with the device path, an
// open file descriptor to the device, and its loop_info64 status. On failure
// logs the reason and returns false.
bool LoopdevSetup(const base::FilePath& source, Loopdev* loopdev_out) {
  base::ScopedFD source_fd(open(source.value().c_str(), O_RDONLY | O_CLOEXEC));
  if (!source_fd.is_valid()) {
    PLOG(ERROR) << "Failed to open " << source.value();
    return false;
  }

  base::ScopedFD control_fd(
      open(kLoopdevCtlPath, O_RDWR | O_NOFOLLOW | O_CLOEXEC));
  if (!control_fd.is_valid()) {
    PLOG(ERROR) << "Failed to open " << kLoopdevCtlPath;
    return false;
  }

  while (true) {
    int num = ioctl(control_fd.get(), LOOP_CTL_GET_FREE);
    if (num < 0) {
      PLOG(ERROR) << "Failed to ioctl(LOOP_CTL_GET_FREE) on "
                  << kLoopdevCtlPath;
      return false;
    }

    // Cleanup in case the setup fails. This frees |num| altogether.
    base::ScopedClosureRunner loop_device_cleanup(
        base::Bind(&RemoveLoopDevice, control_fd.get(), num));

    base::FilePath loopdev_path(base::StringPrintf("/dev/loop%i", num));
    base::ScopedFD loop_fd(
        open(loopdev_path.value().c_str(), O_RDONLY | O_NOFOLLOW | O_CLOEXEC));
    if (!loop_fd.is_valid()) {
      PLOG(ERROR) << "Failed to open " << loopdev_path.value();
      return false;
    }

    if (ioctl(loop_fd.get(), LOOP_SET_FD, source_fd.get()) < 0) {
      if (errno != EBUSY) {
        PLOG(ERROR) << "Failed to ioctl(LOOP_SET_FD) " << loopdev_path.value();
        return false;
      }
      // The device was busy when we tried to bind it; ask for another free
      // device.
      continue;
    }

    // Set the autoclear flag on the loop device, which will release it when
    // there are no more references to it.
    struct loop_info64 loop_info = {};
    if (ioctl(loop_fd.get(), LOOP_GET_STATUS64, &loop_info) < 0) {
      PLOG(ERROR) << "Failed to ioctl(LOOP_GET_STATUS64) "
                  << loopdev_path.value();
      return false;
    }
    loop_info.lo_flags |= LO_FLAGS_AUTOCLEAR;
    if (ioctl(loop_fd.get(), LOOP_SET_STATUS64, &loop_info) < 0) {
      PLOG(ERROR) << "Failed to ioctl(LOOP_SET_STATUS64, LO_FLAGS_AUTOCLEAR) "
                  << loopdev_path.value();
      return false;
    }

    // Setup succeeded: cancel the pending removal and hand the device over.
    ignore_result(loop_device_cleanup.Release());
    loopdev_out->path = loopdev_path;
    loopdev_out->fd = std::move(loop_fd);
    loopdev_out->info = loop_info;
    break;
  }

  return true;
}
// Issues LOOP_CLR_FD on the loop device held by |loopdev|. Logs and returns
// false if the ioctl fails.
bool LoopdevDetach(Loopdev* loopdev) {
  const int rc = ioctl(loopdev->fd.get(), LOOP_CLR_FD);
  if (rc >= 0)
    return true;

  PLOG(ERROR) << "Failed to ioctl(LOOP_CLR_FD) for " << loopdev->path.value();
  return false;
}
// Creates a read-only device-mapper device on top of |source| using the
// dm table line in |verity_cmdline|, in which every "@DEV@" is replaced with
// |source|.
//
// On success stores the /dev/mapper path in |dm_path_out| and the device name
// in |dm_name_out| and returns true. When built without device-mapper support
// this is a no-op that returns true and leaves the outputs untouched.
bool DeviceMapperSetup(const base::FilePath& source,
                       const std::string& verity_cmdline,
                       base::FilePath* dm_path_out,
                       std::string* dm_name_out) {
#if USE_device_mapper
  // Normalize the name into something unique-esque.
  std::string dm_name =
      base::StringPrintf("cros-containers-%s", source.value().c_str());
  base::ReplaceChars(dm_name, "/", "_", &dm_name);

  // Get the /dev path for the higher levels to mount.
  base::FilePath dm_path = base::FilePath(kDevMapperPath).Append(dm_name);

  // Insert the source path in the verity command line.
  std::string verity = verity_cmdline;
  base::ReplaceSubstringsAfterOffset(&verity, 0, "@DEV@", source.value());

  // Extract the first three parameters for dm-verity settings. |n| receives
  // the offset at which the target-specific parameters start.
  char ttype[20];
  unsigned long long start, size;
  int n = 0;
  if (sscanf(verity.c_str(), "%llu %llu %10s %n", &start, &size, ttype, &n) !=
      3) {
    // A parse failure does not set errno, so plain LOG is used here.
    LOG(ERROR) << "Malformed verity string " << verity;
    return false;
  }

  // Finally create the device mapper.
  //
  // NOTE: the libdevmapper dm_task_*() calls below return 1 on success and 0
  // on failure, so a zero result is the error case.
  std::unique_ptr<struct dm_task, decltype(&dm_task_destroy)> dmt(
      dm_task_create(DM_DEVICE_CREATE), &dm_task_destroy);
  if (dmt == nullptr) {
    PLOG(ERROR) << "Failed to dm_task_create() for " << source.value();
    return false;
  }

  if (!dm_task_set_name(dmt.get(), dm_name.c_str())) {
    PLOG(ERROR) << "Failed to dm_task_set_name() for " << source.value();
    return false;
  }

  if (!dm_task_set_ro(dmt.get())) {
    PLOG(ERROR) << "Failed to dm_task_set_ro() for " << source.value();
    return false;
  }

  if (!dm_task_add_target(dmt.get(), start, size, ttype, verity.c_str() + n)) {
    PLOG(ERROR) << "Failed to dm_task_add_target() for " << source.value();
    return false;
  }

  uint32_t cookie = 0;
  if (!dm_task_set_cookie(dmt.get(), &cookie, 0)) {
    PLOG(ERROR) << "Failed to dm_task_set_cookie() for " << source.value();
    return false;
  }

  if (!dm_task_run(dmt.get())) {
    PLOG(ERROR) << "Failed to dm_task_run() for " << source.value();
    return false;
  }

  // Make sure the node exists before we continue.
  dm_udev_wait(cookie);

  *dm_path_out = dm_path;
  *dm_name_out = dm_name;
#endif
  return true;
}
// Tear down the device mapper target.
//
// Removes the device-mapper device called |dm_name|. Returns false if any
// libdevmapper call fails. When built without device-mapper support this is a
// no-op that returns true.
bool DeviceMapperDetach(const std::string& dm_name) {
#if USE_device_mapper
  // NOTE: the libdevmapper dm_task_*() calls below return 1 on success and 0
  // on failure, so a zero result is the error case.
  std::unique_ptr<struct dm_task, decltype(&dm_task_destroy)> dmt(
      dm_task_create(DM_DEVICE_REMOVE), &dm_task_destroy);
  if (dmt == nullptr) {
    PLOG(ERROR) << "Failed to dm_task_create() for " << dm_name;
    return false;
  }

  if (!dm_task_set_name(dmt.get(), dm_name.c_str())) {
    PLOG(ERROR) << "Failed to dm_task_set_name() for " << dm_name;
    return false;
  }

  if (!dm_task_run(dmt.get())) {
    PLOG(ERROR) << "Failed to dm_task_run() for " << dm_name;
    return false;
  }
#endif
  return true;
}
// Mounts |src| on |dest| with filesystem |type|, mount |flags| and optional
// |data| (an empty |data| is passed to mount(2) as a null pointer).
//
// R/O bind mounts have to be remounted since 'bind' and 'ro' can't both be
// specified in the original bind mount: when both MS_BIND and MS_RDONLY are
// requested, the initial mount is done without MS_RDONLY and is followed by a
// read-only remount.
bool MountExternal(const std::string& src,
                   const std::string& dest,
                   const std::string& type,
                   unsigned long flags,
                   const std::string& data) {
  const bool needs_ro_remount = (flags & MS_BIND) && (flags & MS_RDONLY);
  const unsigned long initial_flags =
      needs_ro_remount ? (flags & ~MS_RDONLY) : flags;
  const char* const mount_data = data.empty() ? nullptr : data.c_str();

  if (mount(src.c_str(), dest.c_str(), type.c_str(), initial_flags,
            mount_data) != 0) {
    PLOG(ERROR) << "Failed to mount " << src << " to " << dest;
    return false;
  }

  if (!needs_ro_remount)
    return true;

  const unsigned long remount_flags = initial_flags | MS_RDONLY | MS_REMOUNT;
  if (mount(src.c_str(), dest.c_str(), nullptr, remount_flags, mount_data) !=
      0) {
    PLOG(ERROR) << "Failed to remount " << src << " to " << dest;
    return false;
  }
  return true;
}
// Wrapper around pipe2(2) that hands the two ends back as ScopedFDs. On
// failure returns false and leaves |read_pipe| and |write_pipe| untouched.
bool Pipe2(base::ScopedFD* read_pipe, base::ScopedFD* write_pipe, int flags) {
  int raw_fds[2];
  const bool created = pipe2(raw_fds, flags) == 0;
  if (created) {
    read_pipe->reset(raw_fds[0]);
    write_pipe->reset(raw_fds[1]);
  }
  return created;
}
// Builds a HookCallback that, when run with a container PID, invokes
// ExecveCallbackHelper() with |filename|, |args| and the three given file
// descriptors. Ownership of the descriptors is passed to the callback.
HookCallback CreateExecveCallback(base::FilePath filename,
                                  std::vector<std::string> args,
                                  base::ScopedFD stdin_fd,
                                  base::ScopedFD stdout_fd,
                                  base::ScopedFD stderr_fd) {
  return base::Bind(&ExecveCallbackHelper, filename, args,
                    base::Passed(&stdin_fd), base::Passed(&stdout_fd),
                    base::Passed(&stderr_fd));
}
// Wraps |callback| in a new HookCallback that runs it through
// RunInNamespacesHelper() with the namespace types listed in |nstypes|.
HookCallback AdaptCallbackToRunInNamespaces(HookCallback callback,
                                            std::vector<int> nstypes) {
  return base::Bind(&RunInNamespacesHelper, base::Passed(&callback),
                    base::Passed(&nstypes));
}
// Creates |full_path| along with any missing parent directories. Every
// directory created here gets |mode| passed to mkdir(2) and is chown'ed to
// |uid|:|gid|. Returns true if |full_path| already exists as a directory or
// if everything was created; returns false at the first mkdir/chown failure.
bool CreateDirectoryOwnedBy(const base::FilePath& full_path,
                            mode_t mode,
                            uid_t uid,
                            gid_t gid) {
  if (base::DirectoryExists(full_path))
    return true;

  // Walk up from |full_path| gathering every ancestor that does not exist
  // yet, deepest first. DirName() returning its own input means the top of
  // the path has been reached.
  std::vector<base::FilePath> to_create;
  to_create.push_back(full_path);
  base::FilePath child = full_path;
  base::FilePath parent = child.DirName();
  while (parent != child && !base::DirectoryExists(parent)) {
    to_create.push_back(parent);
    child = parent;
    parent = parent.DirName();
  }

  // Create them from the shallowest to the deepest.
  for (auto it = to_create.rbegin(); it != to_create.rend(); ++it) {
    const char* const dir = it->value().c_str();
    if (mkdir(dir, mode) != 0)
      return false;
    if (chown(dir, uid, gid) != 0)
      return false;
  }
  return true;
}
} // namespace libcontainer