libcontainer: add support for core scheduling policy

Allow tagging the root container processes as untrusted, forcing all
threads to run isolated without sharing sibling cores.

BUG=b:160160322
TEST=Add crrev.com/c/2285838 and check process status in /proc

Change-Id: I05af4ac0173de094f31402b215db68c963766042
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform2/+/2284199
Reviewed-by: Hidehiko Abe <hidehiko@chromium.org>
Reviewed-by: Eric Caruso <ejcaruso@chromium.org>
Tested-by: Ereth McKnight-MacNeil <ereth@chromium.org>
Commit-Queue: Ereth McKnight-MacNeil <ereth@chromium.org>
diff --git a/libcontainer/libcontainer.cc b/libcontainer/libcontainer.cc
index 38dce6e..1bb290b 100644
--- a/libcontainer/libcontainer.cc
+++ b/libcontainer/libcontainer.cc
@@ -9,6 +9,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <sys/mount.h>
+#include <sys/prctl.h>
 #include <sys/stat.h>
 #include <sys/sysmacros.h>
 #include <sys/types.h>
@@ -47,6 +48,11 @@
 
 #define QUOTE(s) ('"' + std::string(s) + '"')
 
+// Fallback: not available in sys/prctl.h yet, but supported on some kernels.
+#ifndef PR_SET_CORE_SCHED
+#define PR_SET_CORE_SCHED 0x200
+#endif
+
 namespace {
 
 using libcontainer::DeviceMapperDetach;
@@ -205,6 +211,9 @@
   // The mask of securebits to skip when restricting caps.
   uint64_t securebits_skip_mask = 0x0;
 
+  // Core Scheduling policy
+  bool core_sched = false;
+
   // Whether the container needs an extra process to be run as init.
   bool do_init = false;
 
@@ -371,7 +380,9 @@
           << "uid_map: " << QUOTE(c->uid_map) << std::endl
           << "gid: " << c->gid << std::endl
           << "gid_map: " << QUOTE(c->gid_map) << std::endl
-          << "alt_syscall_table: " << QUOTE(c->alt_syscall_table) << std::endl;
+          << "alt_syscall_table: " << QUOTE(c->alt_syscall_table) << std::endl
+          << "core_sched:" << (c->core_sched ? "enable" : "disable")
+          << std::endl;
 
   auto mount_sorted = c->mounts;
   if (sort_vectors) {
@@ -744,6 +755,16 @@
   return true;
 }
 
+int SetCoreSched(void* payload) {
+  // Minijail hook: enables core scheduling for the calling process; |payload|
+  // is unused. prctl() is variadic, so every argument is passed explicitly as
+  // unsigned long, with the unused trailing ones zeroed.
+  if (prctl(PR_SET_CORE_SCHED, 1UL, 0UL, 0UL, 0UL) != 0 && errno != EINVAL)
+    return -errno;  // Bubble error, minijail will abort child process.
+  // Success, or EINVAL (taken to mean unsupported on this kernel): continue.
+  return 0;
+}
+
 int Setexeccon(void* payload) {
   char* init_domain = reinterpret_cast<char*>(payload);
   pid_t tid = syscall(SYS_gettid);
@@ -1048,6 +1069,11 @@
   return 0;
 }
 
+int container_config_set_core_sched(struct container_config* c, int enable) {
+  c->core_sched = (enable != 0);  // Any non-zero |enable| turns it on.
+  return 0;
+}
+
 int container_config_set_cpu_shares(struct container_config* c, int shares) {
   /* CPU shares must be 2 or higher. */
   if (shares < 2) {
@@ -1440,6 +1466,13 @@
   if (!config->alt_syscall_table.empty())
     minijail_use_alt_syscall(c->jail.get(), config->alt_syscall_table.c_str());
 
+  if (config->core_sched) {
+    if (minijail_add_hook(c->jail.get(), &SetCoreSched, nullptr,
+                          MINIJAIL_HOOK_EVENT_PRE_DROP_CAPS) != 0) {
+      return -1;
+    }
+  }
+
   for (int i = 0; i < config->num_rlimits; i++) {
     const Rlimit& lim = config->rlimits[i];
     if (minijail_rlimit(c->jail.get(), lim.type, lim.cur, lim.max) != 0)
diff --git a/libcontainer/libcontainer.h b/libcontainer/libcontainer.h
index 6eaa8ea..8e2032d 100644
--- a/libcontainer/libcontainer.h
+++ b/libcontainer/libcontainer.h
@@ -220,6 +220,10 @@
 BRILLO_EXPORT int container_config_get_cpu_rt_period(
     struct container_config* c);
 
+/* Set core scheduling policy (no sibling core sharing); non-zero enables. */
+BRILLO_EXPORT int container_config_set_core_sched(struct container_config* c,
+                                                  int enable);
+
 /*
  * Configure the owner of cgroups created for the container.
  *