libcontainer: add support for core scheduling policy
Allow tagging the root container processes as untrusted, forcing all
threads to run isolated without sharing sibling cores.
BUG=b:160160322
TEST=Add crrev.com/c/2285838 and check process status in /proc
Change-Id: I05af4ac0173de094f31402b215db68c963766042
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform2/+/2284199
Reviewed-by: Hidehiko Abe <hidehiko@chromium.org>
Reviewed-by: Eric Caruso <ejcaruso@chromium.org>
Tested-by: Ereth McKnight-MacNeil <ereth@chromium.org>
Commit-Queue: Ereth McKnight-MacNeil <ereth@chromium.org>
diff --git a/libcontainer/libcontainer.cc b/libcontainer/libcontainer.cc
index 38dce6e..1bb290b 100644
--- a/libcontainer/libcontainer.cc
+++ b/libcontainer/libcontainer.cc
@@ -9,6 +9,7 @@
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
+#include <sys/prctl.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
@@ -47,6 +48,11 @@
#define QUOTE(s) ('"' + std::string(s) + '"')
+// Not available in sys/prctl.h yet, but supported on some kernels.
+#ifndef PR_SET_CORE_SCHED
+#define PR_SET_CORE_SCHED 0x200
+#endif
+
namespace {
using libcontainer::DeviceMapperDetach;
@@ -205,6 +211,9 @@
// The mask of securebits to skip when restricting caps.
uint64_t securebits_skip_mask = 0x0;
+ // Whether core scheduling is enabled for the container's processes.
+ bool core_sched = false;
+
// Whether the container needs an extra process to be run as init.
bool do_init = false;
@@ -371,7 +380,9 @@
<< "uid_map: " << QUOTE(c->uid_map) << std::endl
<< "gid: " << c->gid << std::endl
<< "gid_map: " << QUOTE(c->gid_map) << std::endl
- << "alt_syscall_table: " << QUOTE(c->alt_syscall_table) << std::endl;
+ << "alt_syscall_table: " << QUOTE(c->alt_syscall_table) << std::endl
+ << "core_sched:" << (c->core_sched ? "enable" : "disable")
+ << std::endl;
auto mount_sorted = c->mounts;
if (sort_vectors) {
@@ -744,6 +755,16 @@
return true;
}
+int SetCoreSched(void* payload) {  // minijail hook; |payload| is not used.
+ int ret = prctl(PR_SET_CORE_SCHED, 1);  // Ask the kernel for core scheduling.
+ if (ret != 0 && errno != EINVAL) {
+ // Bubble up any error other than EINVAL; minijail will abort the child.
+ return -errno;
+ }
+ // Success, or EINVAL (treated as unsupported on this kernel): continue.
+ return 0;
+}
+
int Setexeccon(void* payload) {
char* init_domain = reinterpret_cast<char*>(payload);
pid_t tid = syscall(SYS_gettid);
@@ -1048,6 +1069,11 @@
return 0;
}
+int container_config_set_core_sched(struct container_config* c, int enable) {
+ c->core_sched = enable;  // Stored as a bool: any non-zero |enable| is true.
+ return 0;  // Unconditionally reports success.
+}
+
int container_config_set_cpu_shares(struct container_config* c, int shares) {
/* CPU shares must be 2 or higher. */
if (shares < 2) {
@@ -1440,6 +1466,13 @@
if (!config->alt_syscall_table.empty())
minijail_use_alt_syscall(c->jail.get(), config->alt_syscall_table.c_str());
+ if (config->core_sched) {
+ if (minijail_add_hook(c->jail.get(), &SetCoreSched, nullptr,
+ MINIJAIL_HOOK_EVENT_PRE_DROP_CAPS) != 0) {
+ return -1;
+ }
+ }
+
for (int i = 0; i < config->num_rlimits; i++) {
const Rlimit& lim = config->rlimits[i];
if (minijail_rlimit(c->jail.get(), lim.type, lim.cur, lim.max) != 0)
diff --git a/libcontainer/libcontainer.h b/libcontainer/libcontainer.h
index 6eaa8ea..8e2032d 100644
--- a/libcontainer/libcontainer.h
+++ b/libcontainer/libcontainer.h
@@ -220,6 +220,10 @@
BRILLO_EXPORT int container_config_get_cpu_rt_period(
struct container_config* c);
+/* Non-zero |enable| turns on core scheduling (no sibling core sharing). */
+BRILLO_EXPORT int container_config_set_core_sched(struct container_config* c,
+ int enable);
+
/*
* Configure the owner of cgroups created for the container.
*