blob: d4531f964aa26d0daac499488a83e82a4e4a8447 [file] [log] [blame]
From 61be716cdcf8cd8b880a0af73cce26eb87788e52 Mon Sep 17 00:00:00 2001
From: Wei Fu <fuweid89@gmail.com>
Date: Wed, 26 Jan 2022 22:26:35 +0800
Subject: [PATCH] oci: use readonly mount to read user/group info
In linux kernel, the umount writable-mountpoint will try to do sync-fs
to make sure that the dirty pages to the underlying filesystems. The many
number of umount actions in the same time maybe introduce performance
issue in IOPS limited disk.
When CRI-plugin creates container, it will temp-mount rootfs to read
that UID/GID info for entrypoint. Basically, the rootfs is writable
snapshotter and then after read, umount will invoke sync-fs action.
For example, using overlayfs on ext4 and use bcc-tools to monitor
ext4_sync_fs call.
```
// uname -a
Linux chaofan 5.13.0-27-generic #29~20.04.1-Ubuntu SMP Fri Jan 14 00:32:30 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux
// open terminal 1
kubectl run --image=nginx --image-pull-policy=IfNotPresent nginx-pod
// open terminal 2
/usr/share/bcc/tools/stackcount ext4_sync_fs -i 1 -v -P
ext4_sync_fs
sync_filesystem
ovl_sync_fs
__sync_filesystem
sync_filesystem
generic_shutdown_super
kill_anon_super
deactivate_locked_super
deactivate_super
cleanup_mnt
__cleanup_mnt
task_work_run
exit_to_user_mode_prepare
syscall_exit_to_user_mode
do_syscall_64
entry_SYSCALL_64_after_hwframe
syscall.Syscall.abi0
github.com/containerd/containerd/mount.unmount
github.com/containerd/containerd/mount.UnmountAll
github.com/containerd/containerd/mount.WithTempMount.func2
github.com/containerd/containerd/mount.WithTempMount
github.com/containerd/containerd/oci.WithUserID.func1
github.com/containerd/containerd/oci.WithUser.func1
github.com/containerd/containerd/oci.ApplyOpts
github.com/containerd/containerd.WithSpec.func1
github.com/containerd/containerd.(*Client).NewContainer
github.com/containerd/containerd/pkg/cri/server.(*criService).CreateContainer
github.com/containerd/containerd/pkg/cri/server.(*instrumentedService).CreateContainer
k8s.io/cri-api/pkg/apis/runtime/v1._RuntimeService_CreateContainer_Handler.func1
github.com/containerd/containerd/services/server.unaryNamespaceInterceptor
github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1.1
github.com/grpc-ecosystem/go-grpc-prometheus.(*ServerMetrics).UnaryServerInterceptor.func1
github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1.1
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc.UnaryServerInterceptor.func1
github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1.1
github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1
k8s.io/cri-api/pkg/apis/runtime/v1._RuntimeService_CreateContainer_Handler
google.golang.org/grpc.(*Server).processUnaryRPC
google.golang.org/grpc.(*Server).handleStream
google.golang.org/grpc.(*Server).serveStreams.func1.2
runtime.goexit.abi0
containerd [34771]
1
```
If there are comming several create requestes, umount actions might
bring high IO pressure on the /var/lib/containerd's underlying disk.
After checkout the kernel code[1], the kernel will not call
__sync_filesystem if the mount is readonly. Based on this, containerd
should use readonly mount to get UID/GID information.
Reference:
* [1] https://elixir.bootlin.com/linux/v5.13/source/fs/sync.c#L61
Closes: #4604
Signed-off-by: Wei Fu <fuweid89@gmail.com>
(cherry picked from commit 813a061fe13998dfaaceec6173448d0c12434e7b)
Signed-off-by: Qiutong Song <qiutongs@google.com>
---
oci/spec_opts.go | 26 ++++++++++++++++++++++++++
1 file changed, 26 insertions(+)
diff --git a/oci/spec_opts.go b/oci/spec_opts.go
index 5a952f616..4199a85d9 100644
--- a/oci/spec_opts.go
+++ b/oci/spec_opts.go
@@ -590,6 +590,8 @@ func WithUser(userstr string) SpecOpts {
if err != nil {
return err
}
+
+ mounts = tryReadonlyMounts(mounts)
return mount.WithTempMount(ctx, mounts, f)
default:
return fmt.Errorf("invalid USER value %s", userstr)
@@ -643,6 +645,8 @@ func WithUserID(uid uint32) SpecOpts {
if err != nil {
return err
}
+
+ mounts = tryReadonlyMounts(mounts)
return mount.WithTempMount(ctx, mounts, func(root string) error {
user, err := UserFromPath(root, func(u user.User) bool {
return u.Uid == int(uid)
@@ -692,6 +696,8 @@ func WithUsername(username string) SpecOpts {
if err != nil {
return err
}
+
+ mounts = tryReadonlyMounts(mounts)
return mount.WithTempMount(ctx, mounts, func(root string) error {
user, err := UserFromPath(root, func(u user.User) bool {
return u.Name == username
@@ -776,6 +782,8 @@ func WithAdditionalGIDs(userstr string) SpecOpts {
if err != nil {
return err
}
+
+ mounts = tryReadonlyMounts(mounts)
return mount.WithTempMount(ctx, mounts, setAdditionalGids)
}
}
@@ -1264,3 +1272,21 @@ func WithDevShmSize(kb int64) SpecOpts {
return ErrNoShmMount
}
}
+
+// tryReadonlyMounts is used by the options which are trying to get user/group
+// information from container's rootfs. Since the option does read operation
+// only, this helper will append ReadOnly mount option to prevent linux kernel
+// from syncing whole filesystem in umount syscall.
+//
+// TODO(fuweid):
+//
+// Currently, it only works for overlayfs. I think we can apply it to other
+// kinds of filesystem. Maybe we can return `ro` option by `snapshotter.Mount`
+// API, when the caller passes that experimental annotation
+// `containerd.io/snapshot/readonly.mount` something like that.
+func tryReadonlyMounts(mounts []mount.Mount) []mount.Mount {
+ if len(mounts) == 1 && mounts[0].Type == "overlay" {
+ mounts[0].Options = append(mounts[0].Options, "ro")
+ }
+ return mounts
+}
--
2.35.1.265.g69c8d7142f-goog