Run cgroups cleanup code in WatchDog.

After CL:45672, the watchdog process doesn't run cgroups cleanup code. This
means that orphaned child processes are left around when you press the 'stop'
button on a builder.

BUG=chromium:223044
TEST=Run test case where child process switched the sid and verify that this
     child gets killed.

Change-Id: I367425df849a7ef9ce38cafb1998e72b34b19eb7
Previous-Reviewed-on: https://gerrit.chromium.org/gerrit/47188
(cherry picked from commit 2852ba2c23eb6809f0b97f7abedecfb2c63561a2)
Reviewed-on: https://gerrit.chromium.org/gerrit/49737
Tested-by: David James <davidjames@chromium.org>
Reviewed-by: Ryan Cui <rcui@chromium.org>
diff --git a/lib/cgroups.py b/lib/cgroups.py
index bee2097..be1fc13 100644
--- a/lib/cgroups.py
+++ b/lib/cgroups.py
@@ -546,55 +546,6 @@
     finally:
       self.TransferCurrentProcess()
 
-  @contextlib.contextmanager
-  def ContainChildren(self, pool_name=None, sigterm_timeout=10):
-    """Context manager for containing children processes.
-
-    This manager creates a job pool derived from this instance, transfers
-    the current process into it upon __enter__.
-
-    Any children processes created at that point will inherit our cgroup;
-    they can only escape the group if they're running as root and move
-    themselves out of this hierarchy.
-
-    Upon __exit__, transfer the current process back to this group, then
-    sigterm (progressing to sigkill) any immediate children in the pool,
-    finally removing the pool if possible.
-
-    If pool_name is given, that name is used rather than os.getpid() for
-    the job pool created.
-
-    Finally, note that during cleanup this will suppress SIGINT and SIGTERM
-    to ensure that it cleanses any children before returning.
-    """
-
-    pid = os.getpid()
-    if pool_name is None:
-      pool_name = str(pid)
-
-    run_kill = False
-    try:
-      # Note; we use lazy init here so that we cannot trigger a
-      # _GroupWasRemoved; we want that contained.
-      node = self.AddGroup(pool_name, autoclean=True, lazy_init=True)
-      try:
-        node.TransferCurrentProcess()
-      except _GroupWasRemoved:
-        raise SystemExit(
-            "Group %s was removed under our feet; pool shutdown is underway"
-            % node.namespace)
-      run_kill = True
-      yield
-    finally:
-      if os.getpid() == pid:
-        with signals.DeferSignals():
-          self.TransferCurrentProcess()
-          if run_kill:
-            node.KillProcesses(remove=True, sigterm_timeout=sigterm_timeout)
-          else:
-            # Non strict since the group may have failed to be created.
-            node.RemoveThisGroup(strict=False)
-
   def KillProcesses(self, poll_interval=0.05, remove=False, sigterm_timeout=10):
     """Kill all processes in this namespace."""
 
@@ -753,6 +704,63 @@
     return _cros_node.AddGroup(target, autoclean=False)
 
 
+class ContainChildren(cros_build_lib.MasterPidContextManager):
+  """Context manager for containing children processes.
+
+  This manager creates a job pool derived from the specified Cgroup |node|
+  and transfers the current process into it upon __enter__.
+
+  Any children processes created at that point will inherit our cgroup;
+  they can only escape the group if they're running as root and move
+  themselves out of this hierarchy.
+
+  Upon __exit__, transfer the current process back to this group, then
+  SIGTERM (progressing to SIGKILL) any immediate children in the pool,
+  finally removing the pool if possible. After sending SIGTERM, we wait
+  |sigterm_timeout| seconds before sending SIGKILL.
+
+  If |pool_name| is given, that name is used rather than os.getpid() for
+  the job pool created.
+
+  Finally, note that during cleanup this will suppress all signals
+  to ensure that it cleanses any children before returning.
+  """
+
+  def __init__(self, node, pool_name=None, sigterm_timeout=10):
+    super(ContainChildren, self).__init__()
+    self.node = node
+    self.child = None
+    self.pid = None
+    self.pool_name = pool_name
+    self.sigterm_timeout = sigterm_timeout
+    self.run_kill = False
+
+  def _enter(self):
+    self.pid = os.getpid()
+
+    # Note: We use lazy init here so that we cannot trigger a
+    # _GroupWasRemoved -- we want that to be contained.
+    pool_name = str(self.pid) if self.pool_name is None else self.pool_name
+    self.child = self.node.AddGroup(pool_name, autoclean=True, lazy_init=True)
+    try:
+      self.child.TransferCurrentProcess()
+    except _GroupWasRemoved:
+      raise SystemExit(
+          "Group %s was removed under our feet; pool shutdown is underway"
+          % self.child.namespace)
+    self.run_kill = True
+
+  def _exit(self, *_args, **_kwargs):
+    with signals.DeferSignals():
+      self.node.TransferCurrentProcess()
+      if self.run_kill:
+        self.child.KillProcesses(remove=True,
+                                 sigterm_timeout=self.sigterm_timeout)
+      else:
+        # Non-strict since the group may have failed to be created.
+        self.child.RemoveThisGroup(strict=False)
+
+
 def SimpleContainChildren(process_name, nesting=True, **kwds):
   """Convenience context manager to create a cgroup for children containment
 
@@ -763,7 +771,7 @@
   if node is None:
     return cros_build_lib.NoOpContextManager()
   name = '%s:%i' % (process_name, os.getpid())
-  return node.ContainChildren(name, **kwds)
+  return ContainChildren(node, name, **kwds)
 
 # This is a generic group, not associated with any specific process id, so
 # we shouldn't autoclean it on exit; doing so would delete the group from
diff --git a/lib/cleanup.py b/lib/cleanup.py
index 056b099..af962e4 100644
--- a/lib/cleanup.py
+++ b/lib/cleanup.py
@@ -71,8 +71,8 @@
       # fails- failure means a code bug.  Specifically, we don't know if
       # cleanup code was run, thus just flat out bail.
       os._exit(1)
-    # Check if the parent exited cleanly; if so, we don't need to do anything.
 
+    # Check if the parent exited cleanly; if so, we don't need to do anything.
     if os.read(self._lock.fd, 1):
       for handle in (sys.__stdin__, sys.__stdout__, sys.__stderr__):
         try:
@@ -83,7 +83,7 @@
 
     # Allow masterpid context managers to run in this case, since we're
     # explicitly designed for this cleanup.
-    cros_build_lib.MasterPidContextManager._ALTERNATE_MASTER_PID = os.getpid()
+    cros_build_lib.MasterPidContextManager.ALTERNATE_MASTER_PID = os.getpid()
 
     raise RuntimeError("Parent exited uncleanly; forcing cleanup code to run.")
 
diff --git a/lib/cros_build_lib.py b/lib/cros_build_lib.py
index 2f978fe..1fba1fb 100644
--- a/lib/cros_build_lib.py
+++ b/lib/cros_build_lib.py
@@ -936,8 +936,7 @@
   # In certain cases we actually want this ran outside
   # of the main pid- specifically in backup processes
   # doing cleanup.
-
-  _ALTERNATE_MASTER_PID = None
+  ALTERNATE_MASTER_PID = None
 
   def __init__(self):
     self._invoking_pid = None
@@ -948,7 +947,7 @@
 
   def __exit__(self, exc_type, exc, traceback):
     curpid = os.getpid()
-    if curpid == self._ALTERNATE_MASTER_PID:
+    if curpid == self.ALTERNATE_MASTER_PID:
       self._invoking_pid = curpid
     if curpid == self._invoking_pid:
       return self._exit(exc_type, exc, traceback)