| From 61be716cdcf8cd8b880a0af73cce26eb87788e52 Mon Sep 17 00:00:00 2001 |
| From: Wei Fu <fuweid89@gmail.com> |
| Date: Wed, 26 Jan 2022 22:26:35 +0800 |
| Subject: [PATCH] oci: use readonly mount to read user/group info |
| |
| In linux kernel, the umount writable-mountpoint will try to do sync-fs |
| to make sure that the dirty pages to the underlying filesystems. The many |
| number of umount actions in the same time maybe introduce performance |
| issue in IOPS limited disk. |
| |
| When CRI-plugin creates container, it will temp-mount rootfs to read |
| that UID/GID info for entrypoint. Basically, the rootfs is writable |
| snapshotter and then after read, umount will invoke sync-fs action. |
| |
| For example, using overlayfs on ext4 and use bcc-tools to monitor |
| ext4_sync_fs call. |
| |
| ``` |
| // uname -a |
| Linux chaofan 5.13.0-27-generic #29~20.04.1-Ubuntu SMP Fri Jan 14 00:32:30 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux |
| |
| // open terminal 1 |
| kubectl run --image=nginx --image-pull-policy=IfNotPresent nginx-pod |
| |
| // open terminal 2 |
| /usr/share/bcc/tools/stackcount ext4_sync_fs -i 1 -v -P |
| |
| ext4_sync_fs |
| sync_filesystem |
| ovl_sync_fs |
| __sync_filesystem |
| sync_filesystem |
| generic_shutdown_super |
| kill_anon_super |
| deactivate_locked_super |
| deactivate_super |
| cleanup_mnt |
| __cleanup_mnt |
| task_work_run |
| exit_to_user_mode_prepare |
| syscall_exit_to_user_mode |
| do_syscall_64 |
| entry_SYSCALL_64_after_hwframe |
| syscall.Syscall.abi0 |
| github.com/containerd/containerd/mount.unmount |
| github.com/containerd/containerd/mount.UnmountAll |
| github.com/containerd/containerd/mount.WithTempMount.func2 |
| github.com/containerd/containerd/mount.WithTempMount |
| github.com/containerd/containerd/oci.WithUserID.func1 |
| github.com/containerd/containerd/oci.WithUser.func1 |
| github.com/containerd/containerd/oci.ApplyOpts |
| github.com/containerd/containerd.WithSpec.func1 |
| github.com/containerd/containerd.(*Client).NewContainer |
| github.com/containerd/containerd/pkg/cri/server.(*criService).CreateContainer |
| github.com/containerd/containerd/pkg/cri/server.(*instrumentedService).CreateContainer |
| k8s.io/cri-api/pkg/apis/runtime/v1._RuntimeService_CreateContainer_Handler.func1 |
| github.com/containerd/containerd/services/server.unaryNamespaceInterceptor |
| github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1.1 |
| github.com/grpc-ecosystem/go-grpc-prometheus.(*ServerMetrics).UnaryServerInterceptor.func1 |
| github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1.1 |
| go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc.UnaryServerInterceptor.func1 |
| github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1.1.1 |
| github.com/grpc-ecosystem/go-grpc-middleware.ChainUnaryServer.func1 |
| k8s.io/cri-api/pkg/apis/runtime/v1._RuntimeService_CreateContainer_Handler |
| google.golang.org/grpc.(*Server).processUnaryRPC |
| google.golang.org/grpc.(*Server).handleStream |
| google.golang.org/grpc.(*Server).serveStreams.func1.2 |
| runtime.goexit.abi0 |
| containerd [34771] |
| 1 |
| ``` |
| |
| If there are comming several create requestes, umount actions might |
| bring high IO pressure on the /var/lib/containerd's underlying disk. |
| |
| After checkout the kernel code[1], the kernel will not call |
| __sync_filesystem if the mount is readonly. Based on this, containerd |
| should use readonly mount to get UID/GID information. |
| |
| Reference: |
| |
| * [1] https://elixir.bootlin.com/linux/v5.13/source/fs/sync.c#L61 |
| |
| Closes: #4604 |
| |
| Signed-off-by: Wei Fu <fuweid89@gmail.com> |
| (cherry picked from commit 813a061fe13998dfaaceec6173448d0c12434e7b) |
| Signed-off-by: Qiutong Song <qiutongs@google.com> |
| --- |
| oci/spec_opts.go | 26 ++++++++++++++++++++++++++ |
| 1 file changed, 26 insertions(+) |
| |
| diff --git a/oci/spec_opts.go b/oci/spec_opts.go |
| index 5a952f616..4199a85d9 100644 |
| --- a/oci/spec_opts.go |
| +++ b/oci/spec_opts.go |
| @@ -590,6 +590,8 @@ func WithUser(userstr string) SpecOpts { |
| if err != nil { |
| return err |
| } |
| + |
| + mounts = tryReadonlyMounts(mounts) |
| return mount.WithTempMount(ctx, mounts, f) |
| default: |
| return fmt.Errorf("invalid USER value %s", userstr) |
| @@ -643,6 +645,8 @@ func WithUserID(uid uint32) SpecOpts { |
| if err != nil { |
| return err |
| } |
| + |
| + mounts = tryReadonlyMounts(mounts) |
| return mount.WithTempMount(ctx, mounts, func(root string) error { |
| user, err := UserFromPath(root, func(u user.User) bool { |
| return u.Uid == int(uid) |
| @@ -692,6 +696,8 @@ func WithUsername(username string) SpecOpts { |
| if err != nil { |
| return err |
| } |
| + |
| + mounts = tryReadonlyMounts(mounts) |
| return mount.WithTempMount(ctx, mounts, func(root string) error { |
| user, err := UserFromPath(root, func(u user.User) bool { |
| return u.Name == username |
| @@ -776,6 +782,8 @@ func WithAdditionalGIDs(userstr string) SpecOpts { |
| if err != nil { |
| return err |
| } |
| + |
| + mounts = tryReadonlyMounts(mounts) |
| return mount.WithTempMount(ctx, mounts, setAdditionalGids) |
| } |
| } |
| @@ -1264,3 +1272,21 @@ func WithDevShmSize(kb int64) SpecOpts { |
| return ErrNoShmMount |
| } |
| } |
| + |
| +// tryReadonlyMounts is used by the options which are trying to get user/group |
| +// information from container's rootfs. Since the option does read operation |
| +// only, this helper will append ReadOnly mount option to prevent linux kernel |
| +// from syncing whole filesystem in umount syscall. |
| +// |
| +// TODO(fuweid): |
| +// |
| +// Currently, it only works for overlayfs. I think we can apply it to other |
| +// kinds of filesystem. Maybe we can return `ro` option by `snapshotter.Mount` |
| +// API, when the caller passes that experimental annotation |
| +// `containerd.io/snapshot/readonly.mount` something like that. |
| +func tryReadonlyMounts(mounts []mount.Mount) []mount.Mount { |
| + if len(mounts) == 1 && mounts[0].Type == "overlay" { |
| + mounts[0].Options = append(mounts[0].Options, "ro") |
| + } |
| + return mounts |
| +} |
| -- |
| 2.35.1.265.g69c8d7142f-goog |
| |