# Source blob: 9acc89aeb8f1798f9b7ac0d059fd0140efc6cd2b
# Copyright 2013 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Support for Linux namespaces"""
import ctypes
import ctypes.util
import errno
import logging
import os
import signal
# Note: We avoid cros_build_lib here as that's a "large" module and we want
# to keep this "light" and standalone. The subprocess usage in here is also
# simple by design -- if it gets more complicated, we should look at using
# the cros_build_lib.run helper.
import subprocess
import sys
from typing import List, Optional
from chromite.lib import commandline
from chromite.lib import locking
from chromite.lib import osutils
from chromite.lib import process_util
from chromite.utils import proctitle_util
# Flag bits passed to the Unshare()/SetNS() helpers below. They are named after
# the kernel's CLONE_* flags; see unshare(2) and setns(2) for their meaning.
#
# CLONE_FILES and CLONE_FS are not referenced in the part of the file visible
# here -- presumably kept for callers elsewhere; verify before removing.
CLONE_FILES = 0x00000400
CLONE_FS = 0x00000200
# Used by SimpleUnshare() to create a cgroup namespace.
CLONE_NEWCGROUP = 0x02000000
# Used by SimpleUnshare() to create an IPC namespace.
CLONE_NEWIPC = 0x08000000
# Used by CreateNetNs() to create a net namespace.
CLONE_NEWNET = 0x40000000
# Used by SimpleUnshare() to create a mount namespace.
CLONE_NEWNS = 0x00020000
# Used by CreatePidNs() to create a pid namespace.
CLONE_NEWPID = 0x20000000
# Used by CreateUserNs() to create a user namespace.
CLONE_NEWUSER = 0x10000000
# Used by SimpleUnshare() to create a UTS namespace.
CLONE_NEWUTS = 0x04000000
def SetNS(fd, nstype):
    """Binding to the Linux setns system call. See setns(2) for details.

    Args:
        fd: An open file descriptor, or a path to a namespace handle to open
            (str, bytes, or os.PathLike).
        nstype: Namespace to enter; one of CLONE_*.

    Raises:
        OSError: if setns failed, or if |fd| is a path that cannot be opened
            for reading.
    """
    fp = None
    try:
        if isinstance(fd, (str, bytes, os.PathLike)):
            # Open read-only: setns only needs a descriptor referring to the
            # namespace. Opening for writing would create a new file when the
            # path is wrong, or truncate an existing regular file.
            fp = open(fd, "rb")  # pylint: disable=consider-using-with
            fd = fp.fileno()
        libc = ctypes.CDLL(ctypes.util.find_library("c"), use_errno=True)
        if libc.setns(ctypes.c_int(fd), ctypes.c_int(nstype)) != 0:
            e = ctypes.get_errno()
            raise OSError(e, os.strerror(e))
    finally:
        # Only close what we opened ourselves; a caller-supplied descriptor
        # stays owned by the caller.
        if fp is not None:
            fp.close()
def Unshare(flags):
    """Thin wrapper around the Linux unshare(2) system call.

    Args:
        flags: Bitwise OR of the CLONE_* flags to disassociate.

    Raises:
        OSError: if the system call reports failure; carries the C errno and
            its message.
    """
    libc_path = ctypes.util.find_library("c")
    # use_errno=True lets us fetch the C errno after the call.
    libc = ctypes.CDLL(libc_path, use_errno=True)
    result = libc.unshare(ctypes.c_int(flags))
    if result == 0:
        return
    err = ctypes.get_errno()
    raise OSError(err, os.strerror(err))
def _ReapChildren(pid: int, uid: Optional[int], gid: Optional[int]) -> None:
    """Reap all children that get reparented to us until we see |pid| exit.

    Once |pid| is reaped, its wait status is handed to
    process_util.ExitAsStatus.

    Args:
        pid: The main child to watch for.
        uid: The user to switch to first (skipped when None).
        gid: The group to switch to first (skipped when None).

    Raises:
        ValueError: if there are no children left to wait for and |pid| was
            never seen, i.e. |pid| is not a child of this process.
        OSError: for any os.wait() failure other than ECHILD/EINTR, or if
            switching the group/user fails.
    """
    # Set the group before the user: the original order, kept deliberately.
    if gid is not None:
        os.setgid(gid)
    if uid is not None:
        os.setuid(uid)

    while True:
        try:
            (wpid, status) = os.wait()
            if pid == wpid:
                process_util.ExitAsStatus(status)
        except OSError as e:
            if e.errno == errno.ECHILD:
                # Both placeholders refer to the same pid; pass it twice so
                # the message formats instead of raising TypeError.
                raise ValueError(
                    "All children of the current processes have been reaped, "
                    "but %u was not one of them. This means that %u is not a "
                    "child of the current processes." % (pid, pid)
                ) from e
            elif e.errno != errno.EINTR:
                raise
def _SafeTcSetPgrp(fd, pgrp):
    """Hand the tty behind |fd| over to process group |pgrp| when safe.

    Does nothing if |fd| is not a terminal, or if our own process group is not
    the terminal's current foreground group.

    Args:
        fd: File descriptor that may refer to a terminal.
        pgrp: Process group that should take over the terminal.

    Raises:
        OSError: if querying the terminal fails for any reason other than
            ENOTTY, or if the handover itself fails.
    """
    try:
        foreground_pgrp = os.tcgetpgrp(fd)
    except OSError as err:
        # ENOTTY just means the fd is not connected to a terminal.
        if err.errno != errno.ENOTTY:
            raise
        return

    # Only the current owner may pass the terminal on. Trying otherwise makes
    # the kernel stop us with SIGTTOU, which would hit the whole group.
    if foreground_pgrp != os.getpgrp():
        return
    os.tcsetpgrp(fd, pgrp)
def _ForwardToChildPid(pid, signal_to_forward):
    """Install a handler that relays |signal_to_forward| on to |pid|.

    Args:
        pid: Process that should receive the relayed signal.
        signal_to_forward: Signal number to intercept in this process.
    """

    def _Relay(received_signal, _unused_frame):
        """Pass the received signal along, tolerating a vanished target."""
        try:
            os.kill(pid, received_signal)
        except ProcessLookupError:
            # The target may already have exited; there is nothing left to
            # deliver the signal to, so just note it.
            logging.debug(
                "Can't forward signal %u to pid %u as it doesn't exist",
                received_signal,
                pid,
            )

    signal.signal(signal_to_forward, _Relay)
def CreatePidNs(uid: Optional[int] = None, gid: Optional[int] = None) -> int:
    """Start a new pid namespace.

    This will launch all the right manager processes. The child that returns
    will be isolated in a new pid namespace.

    If functionality is not available (unshare reports EINVAL), then it will
    return w/out doing anything.

    A note about the processes generated as a result of calling this function:
    You call CreatePidNs() in pid X
    - X launches Pid Y,
    - Pid X will now do nothing but wait for Pid Y to finish and then
      sys.exit() with that return code
    - Y launches Pid Z
    - Pid Y will now do nothing but wait for Pid Z to finish and then
      sys.exit() with that return code
    - **Pid Z returns from CreatePidNs**. So, the caller of this function
      continues in a different process than the one that made the call.
    - All SIGTERM/SIGINT signals are forwarded down from pid X to pid Z
      to handle.
    - SIGKILL will only kill pid X, and leak Pid Y and Z.

    Args:
        uid: The user to run the init processes as.
        gid: The group to run the init processes as.

    Returns:
        The last pid outside of the namespace. (i.e., pid X)

    Raises:
        OSError: if unshare fails with anything other than EINVAL, or if
            making /proc private fails with anything other than EINVAL.
            Other calls made here (fork, mount, setpgrp, ...) are not
            guarded, so their errors propagate too.
    """
    # Remember who we are before any forking; this is what gets returned.
    first_pid = os.getpid()
    try:
        # First create the namespace.
        Unshare(CLONE_NEWPID)
    except OSError as e:
        if e.errno == errno.EINVAL:
            # For older kernels, or the functionality is disabled in the config,
            # return silently. We don't want to hard require this stuff.
            return first_pid
        else:
            # For all other errors, abort. They shouldn't happen.
            raise

    # Used to make sure process groups are in the right state before we try to
    # forward the controlling terminal.
    lock = locking.PipeLock()

    # Now that we're in the new pid namespace, fork. The parent is the master
    # of it in the original namespace, so it only monitors the child inside it.
    # It is only allowed to fork once too.
    pid = os.fork()
    if pid:
        # Parent (pid X in the docstring).
        proctitle_util.settitle("pid ns", "external init")

        # We forward termination signals to the child and trust the child to
        # respond sanely. Later, ExitAsStatus propagates the exit status back
        # up.
        _ForwardToChildPid(pid, signal.SIGINT)
        _ForwardToChildPid(pid, signal.SIGTERM)

        # Forward the control of the terminal to the child so it can manage
        # input.
        _SafeTcSetPgrp(sys.stdin.fileno(), pid)

        # Signal our child it can move forward.
        lock.Post()
        del lock

        # Reap the children as the parent of the new namespace.
        # NOTE(review): this is expected not to return normally (it exits via
        # ExitAsStatus or raises) -- confirm against process_util.
        _ReapChildren(pid, uid=uid, gid=gid)
    else:
        # Child (pid Y in the docstring).
        # Make sure to unshare the existing mount point if needed. Some distros
        # create shared mount points everywhere by default.
        try:
            osutils.Mount(
                "none", "/proc", 0, osutils.MS_PRIVATE | osutils.MS_REC
            )
        except OSError as e:
            if e.errno != errno.EINVAL:
                raise

        # The child needs its own proc mount as it'll be different.
        osutils.Mount(
            "proc",
            "/proc",
            "proc",
            osutils.MS_NOSUID
            | osutils.MS_NODEV
            | osutils.MS_NOEXEC
            | osutils.MS_RELATIME,
        )

        # Wait for our parent to finish initialization.
        lock.Wait()
        del lock

        # Resetup the locks for the next phase.
        lock = locking.PipeLock()

        pid = os.fork()
        if pid:
            # Pid Y continues here as the init of the new namespace.
            proctitle_util.settitle("pid ns", "init")

            # We forward termination signals to the child and trust the child to
            # respond sanely. Later, ExitAsStatus propagates the exit status
            # back up.
            _ForwardToChildPid(pid, signal.SIGINT)
            _ForwardToChildPid(pid, signal.SIGTERM)

            # Now that we're in a new pid namespace, start a new process group
            # so that children have something valid to use. Otherwise
            # getpgrp/etc... will get back 0 which tends to confuse -- you can't
            # setpgrp(0) for example.
            os.setpgrp()

            # Forward the control of the terminal to the child so it can manage
            # input.
            _SafeTcSetPgrp(sys.stdin.fileno(), pid)

            # Signal our child it can move forward.
            lock.Post()
            del lock

            # Watch all the children. We need to act as the master inside the
            # namespace and reap old processes.
            _ReapChildren(pid, uid=uid, gid=gid)

        # Grandchild (pid Z in the docstring) continues here.
        # Wait for our parent to finish initialization.
        lock.Wait()
        del lock

        # Create a process group for the grandchild so it can manage things
        # independent of the init process.
        os.setpgrp()

    # The grandchild will return and take over the rest of the sdk steps.
    return first_pid
def CreateNetNs():
    """Move the current process into a fresh net namespace.

    Only the loopback interface is brought up inside it. If the kernel reports
    that net namespaces are unavailable (EINVAL), this returns without doing
    anything.

    Raises:
        OSError: if unsharing fails for any reason other than EINVAL, or if
            launching `ip` fails for any reason other than the tool being
            missing.
    """
    # The net namespace was added in 2.6.24 and may be disabled in the kernel.
    try:
        Unshare(CLONE_NEWNET)
    except OSError as err:
        # Anything but "unsupported" is unexpected, so let it propagate.
        if err.errno != errno.EINVAL:
            raise
        return

    # A new net namespace starts with loopback down. The kernel assigns the
    # addresses itself, so bringing the link up is all that is needed.
    try:
        subprocess.call(["ip", "link", "set", "up", "lo"])
    except OSError as err:
        if err.errno != errno.ENOENT:
            raise
        # A missing `ip` binary only earns a warning.
        print(
            "warning: could not bring up loopback for network; "
            "install the iproute2 package",
            file=sys.stderr,
        )
def CreateUserNs() -> None:
    """Enter a newly created user namespace.

    The current process is moved into a new user namespace; this fails if the
    process is multi-threaded. Inside the new namespace the process:
    - runs as UID=GID=0, mapped back to the UID/GID it had in the original
      user namespace
    - holds all capabilities (within the namespace)

    Useful for entering other namespaces (e.g. a mount namespace) without
    root privileges.

    Raises:
        OSError: if unsharing fails or any of the /proc/self files cannot be
            written.
    """
    # Capture the IDs before unsharing; they are the outside end of the maps.
    outer_uid, outer_gid = os.getuid(), os.getgid()
    Unshare(CLONE_NEWUSER)

    # Map the original UID/GID to 0 inside the namespace. The write order
    # matters, so keep it: setgroups, then uid_map, then gid_map.
    # See user_namespaces(7) for details.
    proc_writes = (
        ("/proc/self/setgroups", "deny"),
        ("/proc/self/uid_map", "0 %d 1\n" % outer_uid),
        ("/proc/self/gid_map", "0 %d 1\n" % outer_gid),
    )
    for proc_path, payload in proc_writes:
        with open(proc_path, "w", encoding="utf-8") as proc_file:
            proc_file.write(payload)
def SimpleUnshare(
    mount: bool = True,
    uts: bool = True,
    ipc: bool = True,
    net: bool = False,
    pid: bool = False,
    cgroup: bool = False,
    pid_uid: Optional[int] = None,
    pid_gid: Optional[int] = None,
) -> None:
    """Simpler helper for setting up namespaces quickly.

    If support for any namespace type is not available (the kernel reports
    EINVAL), we'll silently skip it. The mount namespace is the exception: it
    is always expected to work, so any failure there is raised.

    Args:
        mount: Create a mount namespace.
        uts: Create a UTS namespace.
        ipc: Create an IPC namespace.
        net: Create a net namespace.
        pid: Create a pid namespace.
        cgroup: Create a cgroup namespace.
        pid_uid: The UID to switch the init to when creating a pid namespace.
        pid_gid: The GID to switch the init to when creating a pid namespace.

    Raises:
        OSError: if creating a requested namespace fails for a reason other
            than the namespace type being unsupported.
    """
    # The mount namespace is the only one really guaranteed to exist --
    # it's been supported forever and it cannot be turned off.
    if mount:
        Unshare(CLONE_NEWNS)

    # The UTS namespace was added 2.6.19 and may be disabled in the kernel.
    if uts:
        _UnshareIfSupported(CLONE_NEWUTS)

    # The IPC namespace was added 2.6.19 and may be disabled in the kernel.
    if ipc:
        _UnshareIfSupported(CLONE_NEWIPC)

    if net:
        CreateNetNs()

    if pid:
        CreatePidNs(uid=pid_uid, gid=pid_gid)

    # The cgroup namespace was added in 4.6 and may be disabled in the kernel.
    if cgroup:
        _UnshareIfSupported(CLONE_NEWCGROUP)

    # We considered unsharing the time namespace as well. Unfortunately,
    # the usefulness of time namespaces is limited:
    # - they only isolate the CLOCK_BOOTTIME and CLOCK_MONOTONIC clocks
    # - there's no way to set these clocks apart from updating the offset in the
    #   /proc/self/timens_offset file, which cannot be edited after a process
    #   has been created in the new time namespace
    # - CLOCK_REALTIME is not isolated
    # Hence we've left them out.


def _UnshareIfSupported(flag: int) -> None:
    """Unshare |flag|, ignoring only the "unsupported by this kernel" error."""
    try:
        Unshare(flag)
    except OSError as e:
        # EINVAL means the namespace type is unknown or disabled; skip it.
        # Anything else is a real failure and must not be swallowed, which
        # matches what CreateNetNs and CreatePidNs do.
        if e.errno != errno.EINVAL:
            raise
def ReExecuteWithNamespace(
    argv: List[str],
    preserve_env: bool = False,
    network: bool = False,
    clear_saved_id: bool = False,
) -> None:
    """Re-run as root, unshare namespaces, then drop back to the sudo user.

    Args:
        argv: Command line arguments to run as root user.
        preserve_env: If True, preserve existing environment variables when
            running as root user.
        network: If False, disable access to the network.
        clear_saved_id: Whether to clear the saved-uid & saved-gid. Retaining
            will allow code to switch back to root via e.g. os.setuid() calls.
    """
    # Namespaces can only be created as root, so re-run the command that way.
    # Ideally this would switch to root without re-executing the command.
    commandline.RunAsRootUser(argv, preserve_env=preserve_env)
    SimpleUnshare(net=not network, pid=True)

    # The namespaces exist now; work out who invoked sudo. The SUDO_*
    # variables are removed from the environment as they are read.
    env = os.environ
    sudo_gid = int(env.pop("SUDO_GID"))
    sudo_uid = int(env.pop("SUDO_UID"))
    sudo_user = env.pop("SUDO_USER")

    # -1 leaves the saved id untouched, which keeps a route back to root.
    if clear_saved_id:
        saved_gid, saved_uid = sudo_gid, sudo_uid
    else:
        saved_gid = saved_uid = -1

    # Drop privileges: supplementary groups, then gid, then uid.
    os.initgroups(sudo_user, sudo_gid)
    os.setresgid(sudo_gid, sudo_gid, saved_gid)
    os.setresuid(sudo_uid, sudo_uid, saved_uid)
    env["USER"] = sudo_user