pid-sandbox: execute pid-ns-init as pid 1 (bug 675312)

Execute pid-ns-init as the first fork after unshare, as
required for it to have pid 1 and become the default reaper
of orphaned descendant processes. In _exec, exec a separate
pid-ns-init process to behave as a supervisor which will
forward signals to init and forward exit status to the parent
process.

Fixes: a75d5546e3a4 ("Introduce a tiny init replacement for inside pid namespace")
Bug: https://bugs.gentoo.org/675312
Reviewed-by: Brian Dolbec <dolsen@gentoo.org>
Signed-off-by: Zac Medico <zmedico@gentoo.org>
diff --git a/bin/pid-ns-init b/bin/pid-ns-init
index 843257b..182d00a 100644
--- a/bin/pid-ns-init
+++ b/bin/pid-ns-init
@@ -1,23 +1,75 @@
 #!/usr/bin/env python
-# Copyright 2018 Gentoo Authors
+# Copyright 2018-2019 Gentoo Authors
 # Distributed under the terms of the GNU General Public License v2
 
+import errno
+import functools
 import os
+import signal
 import sys
 
 
+# Signals that get forwarded to the main child process.
+KILL_SIGNALS = (
+	signal.SIGINT,
+	signal.SIGTERM,
+	signal.SIGHUP,
+)
+
+def forward_kill_signal(main_child_pid, signum, frame):
+	"""Signal handler which relays signum to the main child process."""
+	os.kill(main_child_pid, signum)
+
+
 def main(argv):
+	"""
+	Supervise the main child process: forward KILL_SIGNALS to it and
+	propagate its exit status. With a single <main-child-pid> argument
+	the child already exists, otherwise fork and exec the given binary.
+	"""
 	if len(argv) < 2:
-		return 'Usage: {} <main-child-pid>'.format(argv[0])
-	main_child_pid = int(argv[1])
+		return 'Usage: {} <main-child-pid> or <binary> <argv0> [arg]..'.format(argv[0])
+
+	if len(argv) == 2:
+		# The child process is init (pid 1) in a child pid namespace, and
+		# the current process supervises from within the global pid namespace
+		# (forwarding signals to init and forwarding exit status to the parent
+		# process).
+		main_child_pid = int(argv[1])
+	else:
+		# The current process is init (pid 1) in a child pid namespace.
+		binary = argv[1]
+		args = argv[2:]
+
+		main_child_pid = os.fork()
+		if main_child_pid == 0:
+			os.execv(binary, args)
+
+	sig_handler = functools.partial(forward_kill_signal, main_child_pid)
+	for signum in KILL_SIGNALS:
+		signal.signal(signum, sig_handler)
 
 	# wait for child processes
 	while True:
-		pid, status = os.wait()
+		try:
+			pid, status = os.wait()
+		except OSError as e:
+			# A signal handler interrupted wait(), so just retry.
+			if e.errno == errno.EINTR:
+				continue
+			raise
 		if pid == main_child_pid:
 			if os.WIFEXITED(status):
 				return os.WEXITSTATUS(status)
 			elif os.WIFSIGNALED(status):
+				termsig = os.WTERMSIG(status)
+				# Restore the default disposition first, since a handler
+				# installed above would forward the signal to the (dead)
+				# child instead of terminating the current process.
+				# SIGKILL is skipped because its disposition can not be
+				# changed (signal.signal would raise for it).
+				if termsig != signal.SIGKILL:
+					signal.signal(termsig, signal.SIG_DFL)
 				os.kill(os.getpid(), os.WTERMSIG(status))
 			# go to the unreachable place
 			break
diff --git a/lib/portage/process.py b/lib/portage/process.py
index 7103b6b..6af3ac3 100644
--- a/lib/portage/process.py
+++ b/lib/portage/process.py
@@ -564,15 +564,28 @@
 							noiselevel=-1)
 					else:
 						if unshare_pid:
-							# pid namespace requires us to become init
-							fork_ret = os.fork()
-							if fork_ret != 0:
-								os.execv(portage._python_interpreter, [
+							main_child_pid = os.fork()
+							if main_child_pid == 0:
+								# This child is pid 1 (init) of the new pid namespace: make it run pid-ns-init with the real command as arguments
+								binary, myargs = portage._python_interpreter, [
 									portage._python_interpreter,
 									os.path.join(portage._bin_path,
 										'pid-ns-init'),
-									'%s' % fork_ret,
-									])
+									binary] + myargs
+							else:
+								# Execute a supervisor process which will forward
+								# signals to init and forward exit status to the
+								# parent process. The supervisor process runs in
+								# the global pid namespace, so skip /proc remount
+								# and other setup that's intended only for the
+								# init process.
+								binary, myargs = portage._python_interpreter, [
+									portage._python_interpreter,
+									os.path.join(portage._bin_path,
+									'pid-ns-init'), str(main_child_pid)]
+
+								os.execve(binary, myargs, env)
+
 						if unshare_mount:
 							# mark the whole filesystem as slave to avoid
 							# mounts escaping the namespace