blob: 0f9ed5dca6a2abd463bea0ee1b820ada7e496859 [file] [log] [blame]
// Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "vm_tools/maitred/init.h"
#include <errno.h>
#include <fcntl.h>
#include <mntent.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/signalfd.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/wait.h>
#include <unistd.h>
// These usually need to come after the sys/ includes.
#include <linux/dm-ioctl.h>
#include <linux/loop.h>
#include <linux/vm_sockets.h>
#include <algorithm>
#include <limits>
#include <list>
#include <set>
#include <utility>
#include <vector>
#include <base/bind.h>
#include <base/bind_helpers.h>
#include <base/files/file_descriptor_watcher_posix.h>
#include <base/files/file_enumerator.h>
#include <base/files/file_path.h>
#include <base/files/file_util.h>
#include <base/files/scoped_file.h>
#include <base/location.h>
#include <base/logging.h>
#include <base/memory/ptr_util.h>
#include <base/message_loop/message_pump_type.h>
#include <base/posix/eintr_wrapper.h>
#include <base/strings/string_number_conversions.h>
#include <base/strings/string_piece.h>
#include <base/strings/string_split.h>
#include <base/strings/string_util.h>
#include <base/strings/stringprintf.h>
#include <base/time/time.h>
#include <chromeos/constants/vm_tools.h>
#include <grpcpp/grpcpp.h>
#include <vm_protos/proto_bindings/vm_crash.grpc.pb.h>
#include "vm_tools/common/spawn_util.h"
using std::string;
namespace vm_tools {
namespace maitred {
namespace {
// Path to the root directory for cgroups.
constexpr char kCgroupRootDir[] = "/sys/fs/cgroup";
// Name of the directory in every cgroup subsystem for dealing with containers.
constexpr char kCgroupContainerSuffix[] = "chronos_containers";
// Default value of the PATH environment variable.
constexpr char kDefaultPath[] = "/usr/bin:/usr/sbin:/bin:/sbin";
// Uid and Gid for the chronos user and group, respectively.
constexpr uid_t kChronosUid = 1000;
constexpr gid_t kChronosGid = 1000;
// Retry threshold and duration for processes that respawn. If a process has
// already been spawned kMaxRespawnCount or more times within the last
// kRespawnWindowSeconds, then it will stop being respawned.
constexpr size_t kMaxRespawnCount = 10;
constexpr base::TimeDelta kRespawnWindowSeconds =
base::TimeDelta::FromSeconds(30);
// Amount of time that we should wait before force-killing processes for
// shutdown.
constexpr base::TimeDelta kShutdownTimeout = base::TimeDelta::FromSeconds(10);
// Amount of time that we should wait for tremplin to attempt to gracefully
// shut down containers.
constexpr base::TimeDelta kTremplinShutdownTimeout =
base::TimeDelta::FromSeconds(2);
// Maximum number of bytes to capture from a single spawned process.
constexpr size_t kMaxOutputCaptureSize = 65536;
// Mounts that must be created on boot. Entries are processed in array order:
// the target directory is created first (mode 0755, an existing directory is
// fine) and then the filesystem is mounted on it.
constexpr struct {
// Passed to mount(2) as the source argument.
const char* source;
// Directory to create and mount on.
const char* target;
// Filesystem type passed to mount(2).
const char* fstype;
// MS_* mount flags.
unsigned long flags; // NOLINT(runtime/int)
// Filesystem-specific option string, or nullptr for none.
const void* data;
bool failure_is_fatal; // Abort if this mount fails.
} mounts[] = {
{
.source = "proc",
.target = "/proc",
.fstype = "proc",
.flags = MS_NOSUID | MS_NODEV | MS_NOEXEC,
.data = nullptr,
.failure_is_fatal = true,
},
{
.source = "sys",
.target = "/sys",
.fstype = "sysfs",
.flags = MS_NOSUID | MS_NODEV | MS_NOEXEC,
.data = nullptr,
.failure_is_fatal = true,
},
{
// On borealis builds /tmp is mounted without MS_NOEXEC because some apps
// require executing files from it.
.source = "tmp",
.target = "/tmp",
.fstype = "tmpfs",
.flags = MS_NOSUID |
#if !USE_VM_BOREALIS
MS_NOEXEC |
#endif
MS_NODEV,
.data = nullptr,
.failure_is_fatal = true,
},
{
.source = "tmpfs",
.target = "/mnt/external",
.fstype = "tmpfs",
.flags = MS_NOSUID | MS_NODEV | MS_NOEXEC,
.data = "mode=0755",
.failure_is_fatal = true,
},
{
.source = "run",
.target = "/run",
.fstype = "tmpfs",
.flags = MS_NOSUID | MS_NODEV | MS_NOEXEC,
.data = "mode=0755",
.failure_is_fatal = true,
},
{
.source = "shmfs",
.target = "/dev/shm",
.fstype = "tmpfs",
.flags = MS_NOSUID | MS_NODEV | MS_NOEXEC,
.data = nullptr,
.failure_is_fatal = true,
},
{
.source = "devpts",
.target = "/dev/pts",
.fstype = "devpts",
.flags = MS_NOSUID | MS_NOEXEC,
.data = "gid=5,mode=0620,ptmxmode=666",
.failure_is_fatal = true,
},
#if !USE_VM_BOREALIS
// A tmpfs /var is only set up on non-borealis builds.
{
.source = "var",
.target = "/var",
.fstype = "tmpfs",
.flags = MS_NOSUID | MS_NODEV | MS_NOEXEC,
.data = "mode=0755",
.failure_is_fatal = true,
},
#endif
// tmpfs that hosts the per-controller cgroup mount points below.
{
.source = "none",
.target = kCgroupRootDir,
.fstype = "tmpfs",
.flags = MS_NOSUID | MS_NODEV | MS_NOEXEC,
.data = "mode=0755",
.failure_is_fatal = true,
},
// One cgroup mount per controller (or controller pair). Only some of them
// are marked fatal on failure.
{
.source = "cgroup",
.target = "/sys/fs/cgroup/blkio",
.fstype = "cgroup",
.flags = MS_NOSUID | MS_NODEV | MS_NOEXEC,
.data = "blkio",
.failure_is_fatal = false,
},
{
.source = "cgroup",
.target = "/sys/fs/cgroup/cpu,cpuacct",
.fstype = "cgroup",
.flags = MS_NOSUID | MS_NODEV | MS_NOEXEC,
.data = "cpu,cpuacct",
.failure_is_fatal = true,
},
{
.source = "cgroup",
.target = "/sys/fs/cgroup/cpuset",
.fstype = "cgroup",
.flags = MS_NOSUID | MS_NODEV | MS_NOEXEC,
.data = "cpuset",
.failure_is_fatal = true,
},
{
.source = "cgroup",
.target = "/sys/fs/cgroup/devices",
.fstype = "cgroup",
.flags = MS_NOSUID | MS_NODEV | MS_NOEXEC,
.data = "devices",
.failure_is_fatal = true,
},
{
.source = "cgroup",
.target = "/sys/fs/cgroup/freezer",
.fstype = "cgroup",
.flags = MS_NOSUID | MS_NODEV | MS_NOEXEC,
.data = "freezer",
.failure_is_fatal = true,
},
{
.source = "cgroup",
.target = "/sys/fs/cgroup/hugetlb",
.fstype = "cgroup",
.flags = MS_NOSUID | MS_NODEV | MS_NOEXEC,
.data = "hugetlb",
.failure_is_fatal = false,
},
{
.source = "cgroup",
.target = "/sys/fs/cgroup/memory",
.fstype = "cgroup",
.flags = MS_NOSUID | MS_NODEV | MS_NOEXEC,
.data = "memory",
.failure_is_fatal = false,
},
{
.source = "cgroup",
.target = "/sys/fs/cgroup/net_cls,net_prio",
.fstype = "cgroup",
.flags = MS_NOSUID | MS_NODEV | MS_NOEXEC,
.data = "net_cls,net_prio",
.failure_is_fatal = false,
},
{
.source = "cgroup",
.target = "/sys/fs/cgroup/perf_event",
.fstype = "cgroup",
.flags = MS_NOSUID | MS_NODEV | MS_NOEXEC,
.data = "perf_event",
.failure_is_fatal = false,
},
{
.source = "cgroup",
.target = "/sys/fs/cgroup/pids",
.fstype = "cgroup",
.flags = MS_NOSUID | MS_NODEV | MS_NOEXEC,
.data = "pids",
.failure_is_fatal = false,
},
{
.source = "cgroup",
.target = "/sys/fs/cgroup/systemd",
.fstype = "cgroup",
.flags = MS_NOSUID | MS_NODEV | MS_NOEXEC,
.data = "none,name=systemd",
.failure_is_fatal = false,
},
};
// Symlinks to be created on boot. It's done after all mounts have completed.
// Each entry is handed to symlink(2) as symlink(source, target), so |source|
// is the existing path the link refers to and |target| is the path of the
// link that gets created.
constexpr struct {
// Path the new link points at.
const char* source;
// Path at which the link itself is created.
const char* target;
} symlinks[] = {
// Single-controller aliases for the combined "cpu,cpuacct" mount.
{
.source = "/sys/fs/cgroup/cpu,cpuacct",
.target = "/sys/fs/cgroup/cpu",
},
{
.source = "/sys/fs/cgroup/cpu,cpuacct",
.target = "/sys/fs/cgroup/cpuacct",
},
// Single-controller aliases for the combined "net_cls,net_prio" mount.
{
.source = "/sys/fs/cgroup/net_cls,net_prio",
.target = "/sys/fs/cgroup/net_cls",
},
{
.source = "/sys/fs/cgroup/net_cls,net_prio",
.target = "/sys/fs/cgroup/net_prio",
},
};
// Directories to be created on boot. These are created only after all the
// mounts have completed. Entries are created in array order with mkdir(2), so
// a parent directory must be listed before its children.
constexpr struct {
// Absolute path of the directory to create.
const char* path;
// Permission bits passed to mkdir(2).
mode_t mode;
} boot_dirs[] = {
{
.path = "/run/lock",
.mode = 01777,
},
{
.path = "/run/sshd",
.mode = 01777,
},
{
.path = "/run/tokens",
.mode = 01777,
},
#if !USE_VM_BOREALIS
// The /var hierarchy is only populated on non-borealis builds.
{
.path = "/var/cache",
.mode = 0755,
},
{
.path = "/var/db",
.mode = 0755,
},
{
.path = "/var/empty",
.mode = 0755,
},
{
.path = "/var/log",
.mode = 0755,
},
{
.path = "/var/spool",
.mode = 0755,
},
{
.path = "/var/lib",
.mode = 0755,
},
{
.path = "/var/lib/lxc",
.mode = 0755,
},
{
.path = "/var/lib/lxc/rootfs",
.mode = 0755,
},
{
.path = "/var/lib/lxcfs",
.mode = 0755,
},
{
.path = "/var/lib/misc",
.mode = 0755,
},
#endif
};
// Process resource limits applied with setrlimit(2) during setup.
// These limits are based on suggestions from lxd doc/production-setup.md.
constexpr struct {
// RLIMIT_* constant identifying the resource. NOTE(review): this is a
// uint8_t, so streaming it to a log prints a character rather than a number.
uint8_t resource_type;
// Soft (rlim_cur) and hard (rlim_max) values to install.
rlimit limit;
} resource_limits[] = {
{
.resource_type = RLIMIT_NOFILE,
.limit = {.rlim_cur = 1048576, .rlim_max = 1048576},
},
{
.resource_type = RLIMIT_MEMLOCK,
.limit = {.rlim_cur = RLIM_INFINITY, .rlim_max = RLIM_INFINITY},
},
};
// Kernel tunables applied during setup: |value| is written verbatim to the
// file at |path|.
constexpr struct {
// Absolute path of the sysctl node under /proc/sys.
const char* path;
// Text written to the node.
const char* value;
} sysctl_limits[] = {
{
.path = "/proc/sys/fs/inotify/max_queued_events",
.value = "1048576",
},
{
.path = "/proc/sys/fs/inotify/max_user_instances",
.value = "1048576",
},
{
.path = "/proc/sys/fs/inotify/max_user_watches",
.value = "1048576",
},
{
.path = "/proc/sys/vm/max_map_count",
.value = "262144",
},
};
// Hands ownership of |path| and of everything found beneath it to
// |uid|:|gid|. Descendants are processed first and |path| itself last.
// Returns false as soon as one chown() call fails (the failure is logged);
// returns true once every entry has been updated.
bool ChangeOwnerAndGroup(base::FilePath path, uid_t uid, gid_t gid) {
  // Changes ownership of a single entry and logs when that fails.
  const auto chown_entry = [uid, gid](const base::FilePath& entry) {
    if (chown(entry.value().c_str(), uid, gid) == 0) {
      return true;
    }
    PLOG(ERROR) << "Failed to change owner and group for " << entry.value()
                << " to " << uid << ":" << gid;
    return false;
  };

  base::FileEnumerator walker(
      path, true /*recursive*/,
      base::FileEnumerator::FILES | base::FileEnumerator::DIRECTORIES);
  base::FilePath entry = walker.Next();
  while (!entry.empty()) {
    if (!chown_entry(entry)) {
      return false;
    }
    entry = walker.Next();
  }

  // The enumerator only yields descendants, so the root is handled here.
  return chown_entry(path);
}
// Waits for all the processes in |pids| to exit. Returns when all processes
// have exited or when |deadline| is reached, whichever happens first. Returns
// immediately if |pids| is empty.
//
// As a side effect this reaps every exited child of the calling process, not
// only the ones listed in |pids|.
// NOTE(review): sigtimedwait() only reports SIGCHLD reliably when the signal
// is blocked in the calling thread; this function does not block it itself.
void WaitForChildren(std::set<pid_t> pids, base::Time deadline) {
  // Nothing to wait for. Without this check an empty set would skip the loop
  // below entirely and fall through to NOTREACHED().
  if (pids.empty()) {
    return;
  }

  sigset_t mask;
  sigemptyset(&mask);
  sigaddset(&mask, SIGCHLD);

  while (!pids.empty()) {
    // First reap any child processes that have already exited.
    while (true) {
      pid_t child = waitpid(-1, nullptr, WNOHANG);
      if (child < 0 && errno != ECHILD) {
        PLOG(ERROR) << "Failed to wait for child processes";
        return;
      }

      if (child <= 0) {
        // Either there are no more children or they have not exited yet.
        break;
      }

      pids.erase(child);
    }

    // We will not find out about all child processes. For example some
    // processes might set up custom SIGTERM handlers and then try to handle
    // the termination of their own children, in which case we would not find
    // out about those processes here.
    for (auto iter = pids.begin(); iter != pids.end();) {
      // If the process still exists then leave it in the set. kill() with a
      // signal value of 0 is explicitly documented as a way to check for the
      // existence of a given process.
      if (kill(*iter, 0) == 0) {
        ++iter;
        continue;
      }

      // If the process has already exited, then remove it from the set.
      DCHECK_EQ(errno, ESRCH);
      iter = pids.erase(iter);
    }

    // If there are no processes left then exit early. Otherwise we will block
    // for the full timeout duration in the sigtimedwait below.
    if (pids.empty()) {
      return;
    }

    // Check the deadline.
    base::Time now = base::Time::Now();
    if (now >= deadline) {
      return;
    }

    // Wait for more processes to exit.
    struct timespec ts = (deadline - now).ToTimeSpec();
    int ret = sigtimedwait(&mask, nullptr, &ts);
    if (ret == SIGCHLD) {
      // One or more child processes have exited.
      continue;
    }

    if (ret < 0 && errno == EAGAIN) {
      // Deadline expired.
      return;
    }

    // Anything else is unexpected: log it and go around again. The deadline
    // check above bounds how long we keep retrying.
    if (ret < 0) {
      PLOG(WARNING) << "Unable to wait for processes to exit";
    } else {
      LOG(WARNING) << "Unexpected return value from sigtimedwait(): "
                   << strsignal(ret);
    }
  }

  // Every path that empties |pids| or passes the deadline returns from inside
  // the loop, so control should never reach here.
  NOTREACHED();
}
// Cached pid of this process. Starting from version 2.24, glibc stopped
// caching the pid of the current process since the cache interacts in weird
// ways with certain clone() and unshare() flags. This value is only checked
// and set in ShouldKillProcess().
static pid_t cached_pid = 0;

// Returns true if it is safe to kill |process| either with a SIGTERM or a
// SIGKILL. |path| must be the path to the process directory in /proc.
//
// Never selects pid 1 or the calling process. Any process whose real uid is
// non-zero may be killed; a uid-0 process may be killed unless its "exe" link
// is missing, which marks it as a kernel thread. Whenever the uid cannot be
// determined the answer is false, so that nothing is signalled on a guess.
bool ShouldKillProcess(pid_t process, const base::FilePath& path) {
  if (cached_pid == 0) {
    cached_pid = getpid();
  }

  if (process == 1 || process == cached_pid) {
    // Probably not a good idea to kill ourselves.
    return false;
  }

  string status;
  if (!base::ReadFileToString(path.Append("status"), &status)) {
    PLOG(WARNING) << "Failed to read status for process " << process;

    // Don't send a signal to this process just to be on the safe side.
    return false;
  }

  // Get the process's UID from the "Uid:" line; the first value after the
  // label is the one used here.
  uid_t uid = 0;
  bool found_uid = false;
  for (const auto& line : base::SplitStringPiece(
           status, "\n", base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY)) {
    if (!base::StartsWith(line, "Uid:")) {
      continue;
    }

    std::vector<base::StringPiece> tokens = base::SplitStringPiece(
        line, base::kWhitespaceASCII, base::TRIM_WHITESPACE,
        base::SPLIT_WANT_NONEMPTY);
    // Checked at runtime rather than with a DCHECK so that a short line can
    // never cause an out-of-range access in release builds.
    if (tokens.size() < 2) {
      LOG(WARNING) << "Malformed Uid line (" << line << ") for process "
                   << process;
      return false;
    }

    if (!base::StringToUint(tokens[1], &uid)) {
      LOG(WARNING) << "Failed to parse uid (" << tokens[1] << ") for process "
                   << process;
      return false;
    }

    found_uid = true;
    break;
  }

  if (!found_uid) {
    // Without a uid we cannot tell whether this is a root or kernel process,
    // so err on the side of not signalling it.
    LOG(WARNING) << "No Uid line in status for process " << process;
    return false;
  }

  if (uid != 0) {
    // All non-root processes can be killed.
    return true;
  }

  // Check if this is a kernel process. Only the error code matters, so a
  // one-byte buffer is enough for readlink().
  char buf;
  if (readlink(path.Append("exe").value().c_str(), &buf, sizeof(buf)) < 0 &&
      errno == ENOENT) {
    // Kernel processes have no executable.
    return false;
  }

  return true;
}
// Broadcast the signal |signo| to all processes. |signo| must be either
// SIGTERM or SIGKILL. If |pids| is not nullptr, then it is filled with the
// pids of the processes to which |signo| was successfully sent.
void BroadcastSignal(int signo, std::set<pid_t>* pids) {
DCHECK(signo == SIGTERM || signo == SIGKILL);
// We are about to walk the process tree. Pause all processes so that new
// processes don't appear or disappear while we're walking the tree.
// Additionally, pausing all the processes here means that we don't end up
// with unnecessary thrashing in the system. For example, consider a
// pipeline of programs:
//
// cmd1 | cmd2 | cmd3 | cmd4
//
// If cmd2 gets killed first, cmd3 might wake up from its read because its
// pipe is now closed and might end up doing some extra work even though we
// are going to be killing it very soon as well. Pausing all processes
// avoids this problem and ensures that the signal is delivered atomically to
// all processes.
if (kill(-1, SIGSTOP) < 0 && errno != ESRCH) {
PLOG(WARNING) << "Unable to send SIGSTOP to all processes. System "
<< "thrashing may occur";
}
base::FileEnumerator enumerator(base::FilePath("/proc"),
false /* recursive */,
base::FileEnumerator::DIRECTORIES);
for (base::FilePath path = enumerator.Next(); !path.empty();
path = enumerator.Next()) {
pid_t process;
if (!base::StringToInt(path.BaseName().value(), &process)) {
// Ignore anything that doesn't look like a pid.
continue;
}
if (!ShouldKillProcess(process, path)) {
continue;
}
if (kill(process, signo) < 0) {
PLOG(ERROR) << "Failed to send " << strsignal(signo) << " to process "
<< process;
continue;
}
// Now that we've sent the signal to the process wake it up. This way we
// avoid a thundering herd problem if all the processes wake up at the same
// time later.
if (kill(process, SIGCONT) < 0 && errno != ESRCH) {
// It's possible the process is already gone (for example if signo was
// SIGKILL). Only log an error if it's not that case.
PLOG(WARNING) << "Failed to wake up process " << process;
}
if (pids) {
pids->insert(process);
}
}
// Now restart any programs that may still be hanging around. There shouldn't
// actually be any but just in case one of the attempts to send SIGCONT
// earlier failed we can try one more time here.
if (kill(-1, SIGCONT) < 0 && errno != ESRCH) {
PLOG(WARNING) << "Unable to send SIGCONT to all processes. Some "
<< "processes may still be frozen";
}
}
// Detaches all loopback devices.
void DetachLoopback() {
LOG(INFO) << "Detaching loopback devices";
const base::FilePath kDev("/dev");
base::FileEnumerator enumerator(
base::FilePath("/sys/block"), false /*recursive*/,
base::FileEnumerator::FILES | base::FileEnumerator::SHOW_SYM_LINKS,
"loop*" /*pattern*/);
for (base::FilePath path = enumerator.Next(); !path.empty();
path = enumerator.Next()) {
const base::FilePath backing_file =
path.Append("loop").Append("backing_file");
if (!base::PathExists(backing_file)) {
continue;
}
const base::FilePath dev_path = kDev.Append(path.BaseName());
LOG(INFO) << "Detaching " << dev_path.value();
base::ScopedFD loopdev(open(dev_path.value().c_str(), O_RDWR | O_CLOEXEC));
if (!loopdev.is_valid()) {
PLOG(ERROR) << "Unable to open " << dev_path.value();
continue;
}
if (ioctl(loopdev.get(), LOOP_CLR_FD, 0) != 0) {
PLOG(ERROR) << "Failed to remove backing file for /dev/"
<< path.BaseName().value();
}
}
}
// Removes all device mapper devices.
void RemoveDevMapper() {
LOG(INFO) << "Removing device mapper devices";
const base::FilePath kDMControl("/dev/mapper/control");
base::ScopedFD dm_control(
open(kDMControl.value().c_str(), O_RDWR | O_CLOEXEC));
if (!dm_control.is_valid()) {
PLOG(ERROR) << "Failed to open " << kDMControl.value();
return;
}
struct dm_ioctl param = {
// clang-format off
.version = {
DM_VERSION_MAJOR,
DM_VERSION_MINOR,
DM_VERSION_PATCHLEVEL,
},
// clang-format on
.data_size = sizeof(struct dm_ioctl),
.data_start = sizeof(struct dm_ioctl),
.flags = DM_DEFERRED_REMOVE,
};
if (ioctl(dm_control.get(), DM_REMOVE_ALL, &param) != 0) {
PLOG(ERROR) << "Failed to remove device mapper devices";
}
}
// Returns true if |mount_point| should not be unmounted even during the
// shutdown sequence.
bool IsProtectedMount(const string& mount_point) {
const char* const kProtectedMounts[] = {
"/dev",
"/proc",
"/sys",
};
if (mount_point == "/") {
return true;
}
for (const char* mount : kProtectedMounts) {
if (mount == mount_point ||
base::FilePath(mount).IsParent(base::FilePath(mount_point))) {
return true;
}
}
return false;
}
// Unmounts all non-essential filesystems.
void UnmountFilesystems() {
LOG(INFO) << "Unmounting filesystems";
base::ScopedFILE mountinfo(fopen("/proc/self/mounts", "r"));
if (!mountinfo) {
PLOG(ERROR) << "Failed to open /proc/self/mounts";
return;
}
// Parse all the mounts into a vector since we need to unmount them in
// reverse order.
std::vector<string> mount_points;
char buf[1024 + 4];
struct mntent entry;
while (getmntent_r(mountinfo.get(), &entry, buf, sizeof(buf)) != nullptr) {
mount_points.emplace_back(entry.mnt_dir);
}
for (auto iter = mount_points.rbegin(), end = mount_points.rend();
iter != end; ++iter) {
if (IsProtectedMount(*iter)) {
continue;
}
LOG(INFO) << "Unmounting " << *iter;
if (umount(iter->c_str()) != 0) {
PLOG(ERROR) << "Failed to unmount " << *iter;
}
}
}
} // namespace
// Does the actual process management for Init: spawning children, reaping
// them when SIGCHLD arrives, respawning the ones that ask for it, and running
// the shutdown sequence. Init posts work to it through the task runner of its
// worker thread.
class Init::Worker {
public:
// Relevant information about processes launched by this process.
struct ChildInfo {
// Program and arguments; argv[0] is also the name used in log messages and
// by FindProcessByName().
std::vector<string> argv;
// Environment variables handed to the spawn helper.
std::map<string, string> env;
// Whether the process should be started again after it exits.
bool respawn;
// Whether stdin/stdout/stderr should be attached to /dev/console.
bool use_console;
// Whether Spawn() should block until the process has exited.
bool wait_for_exit;
// Times at which this process was (re)started; used to rate-limit respawns.
std::list<base::Time> spawn_times;
// Optional callback run with the exit status and code once the process has
// been reaped.
base::Optional<base::Callback<void(ProcessStatus, int)>> exit_cb;
};
// Creates the stub used to report failures of respawning processes; it talks
// over a vsock channel (insecure credentials) to the crash listener port on
// the host.
Worker()
: crash_listener_(grpc::CreateChannel(
base::StringPrintf(
"vsock:%u:%u", VMADDR_CID_HOST, vm_tools::kCrashListenerPort),
grpc::InsecureChannelCredentials())) {}
Worker(const Worker&) = delete;
Worker& operator=(const Worker&) = delete;
~Worker() = default;
// Start the worker. This will set up a signalfd for receiving SIGCHLD
// events and open /dev/console.
void Start();
// Actually spawns a child process. Fills in |launch_info| with information
// about the process and, once that is done, signals |semfd| (an eventfd, or
// -1 when nobody is waiting). Additionally if |info.wait_for_exit| is true,
// then waits until the child process exits or is killed before returning.
void Spawn(struct ChildInfo info, int semfd, ProcessLaunchInfo* launch_info);
// Shuts down the system. First broadcasts SIGTERM to all processes and
// waits for those processes to exit up to a deadline. Then kills any
// remaining processes with SIGKILL. |notify_fd| must be an eventfd, which
// is notified after all processes are killed.
void Shutdown(int notify_fd);
// Finds the pid of a tracked child process whose argv[0] equals |name|.
// Returns 0 if such a process doesn't exist.
pid_t FindProcessByName(const string& name);
private:
// Called when |signal_fd_| becomes readable.
void OnSignalReadable();
// File descriptor on which we will receive SIGCHLD events.
base::ScopedFD signal_fd_;
// Keeps the readability watch on |signal_fd_| alive.
std::unique_ptr<base::FileDescriptorWatcher::Controller> watcher_;
// Stub used to report failed persistent processes to the host.
vm_tools::cicerone::CrashListener::Stub crash_listener_;
// Information about processes launched by this process.
std::map<pid_t, ChildInfo> children_;
// File descriptor for "/dev/console".
// This is used for spawned processes when |use_console| is true.
base::ScopedFD console_fd_;
};
// Prepares the worker for use: blocks SIGCHLD, routes it through a signalfd
// that is watched for readability, and opens /dev/console for children that
// want it. Failing to create the signalfd, to watch it, or to open the
// console is fatal.
void Init::Worker::Start() {
  sigset_t chld_only;
  sigemptyset(&chld_only);
  sigaddset(&chld_only, SIGCHLD);

  // SIGCHLD has to be blocked for it to be delivered through the signalfd.
  const bool blocked = sigprocmask(SIG_BLOCK, &chld_only, nullptr) == 0;
  if (!blocked) {
    PLOG(ERROR) << "Failed to block SIGCHLD";
  }

  signal_fd_.reset(signalfd(-1, &chld_only, SFD_CLOEXEC | SFD_NONBLOCK));
  PCHECK(signal_fd_.is_valid()) << "Unable to create signal fd";

  auto on_readable = base::BindRepeating(&Init::Worker::OnSignalReadable,
                                         base::Unretained(this));
  watcher_ =
      base::FileDescriptorWatcher::WatchReadable(signal_fd_.get(), on_readable);
  CHECK(watcher_) << "Failed to watch SIGCHLD file descriptor";

  console_fd_.reset(open("/dev/console", O_RDWR | O_NOCTTY));
  PCHECK(console_fd_.is_valid()) << "Failed to open /dev/console";
}
// Wakes up whoever is blocked on |semfd| by writing the value 1 to it. Does
// nothing when |semfd| is -1, which means nobody is waiting.
static void SignalSpawnComplete(int semfd) {
  if (semfd == -1) {
    return;
  }

  const uint64_t increment = 1;
  const ssize_t written = write(semfd, &increment, sizeof(increment));
  DCHECK_EQ(written, sizeof(increment));
}
// Read up to |max_size| bytes from |fd| into |contents|.
//
// Returns true only if end-of-file was reached before the |max_size| limit,
// i.e. |contents| holds everything that was written to |fd|. Returns false if
// read() failed or if the limit was hit without seeing end-of-file (the data
// may have been truncated). In every case |contents| receives the bytes that
// were read up to that point. A null |contents| yields false.
static bool ReadFDToStringWithMaxSize(int fd,
                                      std::string* contents,
                                      size_t max_size) {
  if (contents == nullptr) {
    return false;
  }

  std::string buf(max_size, '\0');
  size_t buf_used = 0;
  bool reached_eof = false;

  // Keep reading output until read() returns EOF or an error
  // or we run out of space in the buffer.
  while (buf_used < max_size) {
    const ssize_t num_bytes = read(fd, &buf[buf_used], max_size - buf_used);
    if (num_bytes < 0) {
      if (errno == EINTR) {
        // Interrupted before any data was transferred; just try again.
        continue;
      }
      break;
    }
    if (num_bytes == 0) {
      // The writer closed its end: everything has been captured.
      reached_eof = true;
      break;
    }
    buf_used += static_cast<size_t>(num_bytes);
  }

  buf.resize(buf_used);
  contents->swap(buf);
  return reached_eof;
}
// Launches the process described by |info| and reports the outcome through
// |launch_info|. When |info.wait_for_exit| is set this blocks until the child
// has exited and records how it ended; if the console is not used either,
// the child's stdout is captured (up to kMaxOutputCaptureSize bytes) into
// |launch_info->output|. Otherwise the child is added to |children_| so that
// it can be reaped, and possibly respawned, later. |semfd| is signalled once
// |launch_info| has been filled in.
void Init::Worker::Spawn(struct ChildInfo info,
                         int semfd,
                         ProcessLaunchInfo* launch_info) {
  DCHECK_GT(info.argv.size(), 0);
  DCHECK(launch_info);

  // Output is only captured for processes that we wait on and that are not
  // already writing to the console.
  const bool capture_output = info.wait_for_exit && !info.use_console;

  int pipe_fds[2] = {-1, -1};
  if (capture_output && pipe(pipe_fds) != 0) {
    PLOG(ERROR) << "Failed to create pipe";
    launch_info->status = ProcessStatus::FAILED;
    SignalSpawnComplete(semfd);
    return;
  }
  base::ScopedFD output_read_fd(pipe_fds[0]);
  base::ScopedFD output_write_fd(pipe_fds[1]);

  // Keep every signal blocked across the fork so that none is delivered in
  // the child. The previous mask is restored at the end of this function.
  sigset_t all_signals, saved_mask;
  sigfillset(&all_signals);
  sigprocmask(SIG_BLOCK, &all_signals, &saved_mask);

  // -1 leaves a stream unassigned; the console, when requested, backs all
  // three streams.
  const int default_fd = info.use_console ? console_fd_.get() : -1;
  int stdio_fds[3] = {default_fd, default_fd, default_fd};
  if (capture_output) {
    stdio_fds[STDOUT_FILENO] = output_write_fd.get();
  }

  pid_t pid = -1;
  const bool spawned = vm_tools::Spawn(info.argv, info.env,
                                       "" /* working_dir */, stdio_fds, &pid);

  if (capture_output) {
    // The parent must not keep the writable end of the pipe open.
    output_write_fd.reset();
  }

  if (!spawned) {
    LOG(ERROR) << "Failed to spawn child process";
    launch_info->status = ProcessStatus::FAILED;
  } else if (!info.wait_for_exit) {
    // Fire and forget: remember the child so that its exit can be handled
    // when SIGCHLD arrives.
    info.spawn_times.emplace_back(base::Time::Now());
    // emplace() returns a pair<iterator, bool>.
    auto inserted = children_.emplace(pid, std::move(info));
    DCHECK(inserted.second);
    launch_info->status = ProcessStatus::LAUNCHED;
  } else {
    if (capture_output) {
      const bool read_ok = ReadFDToStringWithMaxSize(
          output_read_fd.get(), &launch_info->output, kMaxOutputCaptureSize);
      launch_info->output_truncated = !read_ok;
    }

    int wait_status = 0;
    pid_t child = waitpid(pid, &wait_status, 0);
    DCHECK_EQ(child, pid);

    if (WIFEXITED(wait_status)) {
      launch_info->status = ProcessStatus::EXITED;
      launch_info->code = WEXITSTATUS(wait_status);
    } else if (WIFSIGNALED(wait_status)) {
      launch_info->status = ProcessStatus::SIGNALED;
      launch_info->code = WTERMSIG(wait_status);
    } else {
      launch_info->status = ProcessStatus::UNKNOWN;
    }
  }

  SignalSpawnComplete(semfd);

  // Put the original signal mask back.
  sigprocmask(SIG_SETMASK, &saved_mask, nullptr);
}
void Init::Worker::Shutdown(int notify_fd) {
DCHECK_NE(notify_fd, -1);
// Stop watching for SIGCHLD. We will do it manually here.
watcher_.reset();
signal_fd_.reset();
// First send SIGPWR to tremplin, if it is running. This runs "poweroff"
// in every container, which is necessary to work around the version
// of systemd in stretch that hangs after receiving SIGRTMIN + 3.
pid_t tremplin_pid = FindProcessByName("tremplin");
if (tremplin_pid != 0 && kill(tremplin_pid, SIGPWR) == 0) {
WaitForChildren({tremplin_pid},
base::Time::Now() + kTremplinShutdownTimeout);
}
// Second, send SIGPWR to lxd, if it is running. This will cause lxd to shut
// down all running containers in parallel.
pid_t lxd_pid = FindProcessByName("lxd");
if (lxd_pid != 0 && kill(lxd_pid, SIGPWR) == 0) {
WaitForChildren({lxd_pid}, base::Time::Now() + kShutdownTimeout);
}
// Now send SIGTERM to all remaining processes.
std::set<pid_t> pids;
BroadcastSignal(SIGTERM, &pids);
// Wait for those processes to terminate.
WaitForChildren(std::move(pids), base::Time::Now() + kShutdownTimeout);
// Kill anything left with SIGKILL.
BroadcastSignal(SIGKILL, nullptr);
// Detach loopback devices.
DetachLoopback();
// Remove any device-mapper devices.
RemoveDevMapper();
// Unmount all non-essential file systems.
UnmountFilesystems();
// Final sync to flush anything left.
sync();
// Signal the waiter.
uint64_t done = 1;
if (write(notify_fd, &done, sizeof(done)) != sizeof(done)) {
PLOG(ERROR) << "Failed to wake up shutdown waiter";
}
}
void Init::Worker::OnSignalReadable() {
// Pull information about the signal sender out of the fd to ack the signal.
struct signalfd_siginfo siginfo;
if (HANDLE_EINTR(read(signal_fd_.get(), &siginfo, sizeof(siginfo))) !=
sizeof(siginfo)) {
PLOG(ERROR) << "Failed to read from signalfd";
return;
}
DCHECK_EQ(siginfo.ssi_signo, SIGCHLD);
// We can't just rely on the information in the siginfo structure because
// more than one child may have exited but only one SIGCHLD will be
// generated.
while (true) {
int status;
pid_t pid = waitpid(-1, &status, WNOHANG);
if (pid <= 0) {
if (pid == -1) {
PLOG(ERROR) << "Unable to reap child processes";
}
break;
}
// See if this is a process we launched.
struct ChildInfo info = {};
auto iter = children_.find(pid);
if (iter != children_.end()) {
info = std::move(iter->second);
children_.erase(iter);
}
ProcessStatus proc_status = ProcessStatus::UNKNOWN;
int code = -1;
if (WIFEXITED(status)) {
LOG(INFO) << (info.argv.size() == 0 ? "<unknown process>"
: info.argv[0].c_str())
<< " (" << pid << ") exited with status "
<< WEXITSTATUS(status);
proc_status = ProcessStatus::EXITED;
code = WEXITSTATUS(status);
} else if (WIFSIGNALED(status)) {
LOG(INFO) << (info.argv.size() == 0 ? "<unknown process>"
: info.argv[0].c_str())
<< " (" << pid << ") killed by signal " << WTERMSIG(status)
<< (WCOREDUMP(status) ? " (core dumped)" : "");
proc_status = ProcessStatus::SIGNALED;
code = WTERMSIG(status);
} else {
LOG(WARNING) << "Unknown exit status " << status << " for process "
<< pid;
}
if (info.exit_cb) {
info.exit_cb.value().Run(proc_status, code);
}
if (!info.respawn) {
continue;
}
// Notify the host that a persistent process has failed.
{
grpc::ClientContext ctx;
vm_tools::EmptyMessage empty;
vm_tools::cicerone::FailureReport failure_report;
// Cicerone expects bare service names (no path).
failure_report.set_failed_process(
base::FilePath(info.argv.front()).BaseName().value());
grpc::Status status =
crash_listener_.SendFailureReport(&ctx, failure_report, &empty);
if (!status.ok()) {
LOG(ERROR) << "Failed to report failure of service \""
<< failure_report.failed_process()
<< "\": " << status.error_message() << ", error code "
<< status.error_code();
}
}
// The process needs to be respawned. First remove any spawn times older
// than the respawn counter window.
base::Time now = base::Time::Now();
while (info.spawn_times.size() > 0 &&
now - info.spawn_times.front() > kRespawnWindowSeconds) {
info.spawn_times.pop_front();
}
// Check if the process has respawned too often.
if (info.spawn_times.size() >= kMaxRespawnCount) {
LOG(WARNING) << info.argv[0] << " respawning too frequently; stopped";
continue;
}
// Respawn the process.
LOG(INFO) << "Restarting " << info.argv[0];
string app(info.argv[0]);
Init::ProcessLaunchInfo launch_info;
Spawn(std::move(info), -1, &launch_info);
switch (launch_info.status) {
case ProcessStatus::UNKNOWN:
LOG(WARNING) << app << " has unknown status";
break;
case ProcessStatus::EXITED:
LOG(WARNING) << app << " unexpectedly exited with status "
<< launch_info.code << "; stopped";
break;
case ProcessStatus::SIGNALED:
LOG(WARNING) << app << " unexpectedly killed by signal "
<< launch_info.code << "; stopped";
break;
case ProcessStatus::LAUNCHED:
LOG(INFO) << app << " restarted";
break;
case ProcessStatus::FAILED:
LOG(ERROR) << "Failed to start " << app;
break;
}
}
}
pid_t Init::Worker::FindProcessByName(const string& name) {
for (const auto& pair : children_) {
const ChildInfo& info = pair.second;
if (info.argv[0] == name) {
return pair.first;
}
}
return 0;
}
std::unique_ptr<Init> Init::Create() {
auto init = base::WrapUnique<Init>(new Init());
if (!init->Setup()) {
init.reset();
}
return init;
}
// Hands the worker, if there is one, to the worker thread's task runner for
// deletion there.
Init::~Init() {
  if (!worker_) {
    return;
  }

  // worker_ is created after worker_thread_ is started so we don't need to
  // check if it is running.
  Worker* doomed = worker_.release();
  worker_thread_.task_runner()->DeleteSoon(FROM_HERE, doomed);
}
bool Init::Spawn(
std::vector<string> argv,
std::map<string, string> env,
bool respawn,
bool use_console,
bool wait_for_exit,
ProcessLaunchInfo* launch_info,
base::Optional<base::Callback<void(ProcessStatus, int)>> exit_cb) {
CHECK(!argv.empty());
CHECK(!(respawn && wait_for_exit));
CHECK(launch_info);
if (!worker_) {
// If there's no worker then we are currently in the process of shutting
// down.
return false;
}
struct Worker::ChildInfo info = {.argv = std::move(argv),
.env = std::move(env),
.respawn = respawn,
.use_console = use_console,
.wait_for_exit = wait_for_exit,
.exit_cb = std::move(exit_cb)};
// Create a semaphore that we will use to wait for the worker thread to launch
// the process and fill in the the ProcessLaunchInfo struct with the result.
base::ScopedFD sem(eventfd(0 /*initval*/, EFD_CLOEXEC | EFD_SEMAPHORE));
if (!sem.is_valid()) {
PLOG(ERROR) << "Failed to create semaphore eventfd";
return false;
}
bool ret = worker_thread_.task_runner()->PostTask(
FROM_HERE,
base::Bind(&Worker::Spawn, base::Unretained(worker_.get()),
base::Passed(std::move(info)), sem.get(), launch_info));
if (!ret) {
return false;
}
uint64_t done = 0;
ssize_t count = HANDLE_EINTR(read(sem.get(), &done, sizeof(done)));
DCHECK_EQ(count, sizeof(done));
DCHECK_EQ(done, 1);
return true;
}
void Init::Shutdown() {
base::ScopedFD notify_fd(eventfd(0 /*initval*/, EFD_CLOEXEC | EFD_SEMAPHORE));
if (!notify_fd.is_valid()) {
PLOG(ERROR) << "Failed to create eventfd";
return;
}
bool ret = worker_thread_.task_runner()->PostTask(
FROM_HERE, base::Bind(&Worker::Shutdown, base::Unretained(worker_.get()),
notify_fd.get()));
if (!ret) {
LOG(ERROR) << "Failed to post task to worker thread";
return;
}
uint64_t done = 0;
if (read(notify_fd.get(), &done, sizeof(done)) != sizeof(done)) {
PLOG(ERROR) << "Failed to read from eventfd";
return;
}
DCHECK_EQ(done, 1);
}
// Applies every entry of |resource_limits| with setrlimit() and then writes
// every entry of |sysctl_limits| to its sysctl node. Returns false, after
// logging, at the first entry that cannot be applied; returns true when all
// of them were applied.
bool Init::SetupResourceLimit() {
  // Setup rlimit.
  for (const auto& entry : resource_limits) {
    if (setrlimit(entry.resource_type, &entry.limit) != 0) {
      // |resource_type| is a uint8_t; convert it so that the log shows the
      // numeric resource id instead of an unprintable character.
      PLOG(ERROR) << "Failed to set limit for resouce type: "
                  << static_cast<int>(entry.resource_type);
      return false;
    }
  }

  // Setup sysctl limits.
  for (const auto& syslimit : sysctl_limits) {
    base::ScopedFD sysctl_node(open(syslimit.path, O_RDWR | O_CLOEXEC));
    if (!sysctl_node.is_valid()) {
      PLOG(ERROR) << "Unable to open sysctl node: " << syslimit.path;
      return false;
    }

    // The whole value has to go out in a single write.
    const size_t value_len = strlen(syslimit.value);
    const ssize_t count = write(sysctl_node.get(), syslimit.value, value_len);
    if (count < 0 || static_cast<size_t>(count) != value_len) {
      PLOG(ERROR) << "Faile to write sysctl node: " << syslimit.path;
      return false;
    }
  }

  return true;
}
bool Init::Setup() {
// Set the umask properly or the directory modes will not work.
umask(0000);
// Do all the mounts.
for (const auto& mt : mounts) {
if (mkdir(mt.target, 0755) != 0 && errno != EEXIST) {
PLOG(ERROR) << "Failed to create " << mt.target;
if (mt.failure_is_fatal)
return false;
}
if (mount(mt.source, mt.target, mt.fstype, mt.flags, mt.data) != 0) {
rmdir(mt.target);
PLOG(ERROR) << "Failed to mount " << mt.target;
if (mt.failure_is_fatal)
return false;
}
}
// Setup the resource limits.
if (!SetupResourceLimit()) {
return false;
}
// Create all the symlinks.
for (const auto& sl : symlinks) {
if (symlink(sl.source, sl.target) != 0) {
PLOG(ERROR) << "Failed to create symlink: source " << sl.source
<< ", target " << sl.target;
return false;
}
}
// Create all the directories.
for (const auto& dir : boot_dirs) {
if (mkdir(dir.path, dir.mode) != 0 && errno != EEXIST) {
PLOG(ERROR) << "Failed to create " << dir.path;
return false;
}
}
// Change the ownership of the kCgroupContainerSuffix directory in each cgroup
// subsystem to "chronos".
base::FileEnumerator enumerator(base::FilePath(kCgroupRootDir),
false /*recursive*/,
base::FileEnumerator::DIRECTORIES);
for (base::FilePath current = enumerator.Next(); !current.empty();
current = enumerator.Next()) {
base::FilePath target_cgroup = current.Append(kCgroupContainerSuffix);
if (mkdir(target_cgroup.value().c_str(), 0755) != 0 && errno != EEXIST) {
PLOG(ERROR) << "Failed to create cgroup " << target_cgroup.value();
return false;
}
if (!ChangeOwnerAndGroup(target_cgroup, kChronosUid, kChronosGid)) {
return false;
}
}
// Create and setup the container cpusets with the default settings (all cpus,
// all mems).
const char* sets[] = {"cpuset.cpus", "cpuset.mems"};
base::FilePath root_dir = base::FilePath(kCgroupRootDir).Append("cpuset");
base::FilePath chronos_dir = root_dir.Append(kCgroupContainerSuffix);
for (const char* set : sets) {
string contents;
if (!base::ReadFileToString(root_dir.Append(set), &contents)) {
PLOG(ERROR) << "Failed to read contents from "
<< root_dir.Append(set).value();
return false;
}
if (base::WriteFile(chronos_dir.Append(set), contents.c_str(),
contents.length()) != contents.length()) {
PLOG(ERROR) << "Failed to write cpuset contents to "
<< chronos_dir.Append(set).value();
return false;
}
}
// Become the session leader.
if (setsid() == -1) {
PLOG(ERROR) << "Failed to become session leader";
return false;
}
// Set the controlling terminal.
if (ioctl(STDIN_FILENO, TIOCSCTTY, 1) != 0) {
PLOG(ERROR) << "Failed to set controlling terminal";
return false;
}
// Setup up PATH.
if (clearenv() != 0) {
PLOG(ERROR) << "Failed to clear environment";
return false;
}
if (setenv("PATH", kDefaultPath, 1 /*overwrite*/) != 0) {
PLOG(ERROR) << "Failed to set PATH";
return false;
}
// Block SIGCHLD here because we want to handle it in the worker thread.
sigset_t mask;
sigemptyset(&mask);
sigaddset(&mask, SIGCHLD);
if (sigprocmask(SIG_BLOCK, &mask, nullptr) != 0) {
PLOG(ERROR) << "Failed to block SIGCHLD";
return false;
}
// Start the worker.
base::Thread::Options opts(base::MessagePumpType::IO, 0 /*stack_size*/);
if (!worker_thread_.StartWithOptions(opts)) {
LOG(ERROR) << "Failed to start worker thread";
return false;
}
worker_ = std::make_unique<Worker>();
bool ret = worker_thread_.task_runner()->PostTask(
FROM_HERE, base::Bind(&Worker::Start, base::Unretained(worker_.get())));
if (!ret) {
LOG(ERROR) << "Failed to post task to worker thread";
return false;
}
// Applications that should be started for every VM.
struct {
const char* doc;
std::vector<string> argv;
std::map<string, string> env;
bool respawn;
bool use_console;
bool wait_for_exit;
} startup_applications[] = {
{
.doc = "system log collector",
.argv = {"vm_syslog"},
.env = {},
.respawn = true,
.use_console = false,
.wait_for_exit = false,
},
{
.doc = "vsock remote shell daemon",
.argv = {"vshd"},
.env = {},
.respawn = true,
.use_console = false,
.wait_for_exit = false,
},
};
// Spawn all the startup applications.
for (auto& app : startup_applications) {
CHECK(!app.argv.empty());
LOG(INFO) << "Starting " << app.doc;
ProcessLaunchInfo info;
if (!Spawn(std::move(app.argv), std::move(app.env), app.respawn,
app.use_console, app.wait_for_exit, &info)) {
LOG(ERROR) << "Unable to launch " << app.doc;
continue;
}
switch (info.status) {
case ProcessStatus::UNKNOWN:
LOG(WARNING) << app.doc << " has unknown status";
break;
case ProcessStatus::EXITED:
LOG(INFO) << app.doc << " exited with status " << info.code;
break;
case ProcessStatus::SIGNALED:
LOG(INFO) << app.doc << " killed by signal " << info.code;
break;
case ProcessStatus::LAUNCHED:
LOG(INFO) << app.doc << " started";
break;
case ProcessStatus::FAILED:
LOG(ERROR) << "Failed to start " << app.doc;
break;
}
}
return true;
}
} // namespace maitred
} // namespace vm_tools