[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Bug#988802: unblock: runc/1.0.0~rc93+ds1-4



Package: release.debian.org
Severity: normal
User: release.debian.org@packages.debian.org
Usertags: unblock
X-Debbugs-Cc: zhsj@debian.org

Please unblock package runc

[ Reason ]
Fix CVE-2021-30465
https://github.com/opencontainers/runc/security/advisories/GHSA-c3xm-pvg7-gh7r

[ Impact ]
The package can migrate itself(have autopkgtest and not key package),
but I'd like to reduce the age.

[ Tests ]
I have done some basic tests. But I'm not sure how to trigger the security
issue that I can't verify if it's really fixed.

[ Risks ]
The patch provided by upstream can't be applied clearly to the version we have
in sid. So I look the changes and backport another two PR, which makes the diff
a bit large.

[ Checklist ]
  [x] all changes are documented in the d/changelog
  [x] I reviewed all changes and I approve them
  [*] attach debdiff against the package in testing
      Since only patches are added in debian/patches dir, I just attached the 3
      new patches.

[ Other info ]

unblock runc/1.0.0~rc93+ds1-4


debian/changelog:

 runc (1.0.0~rc93+ds1-4) unstable; urgency=high
 .
   * Team upload.
   * Backport patches for CVE-2021-30465 (Closes: #988768)
     To apply CVE-2021-30465 patch clearly, following PR are backported as
     well:
     + https://github.com/opencontainers/runc/pull/2798
     + https://github.com/opencontainers/runc/pull/2818

$ cat debian/patches/00{11,12,13}*|filterdiff -x '*.bats' -x '*_test.go' |diffstat

 b/libcontainer/container_linux.go  |   10 +
 b/libcontainer/init_linux.go       |    1
 b/libcontainer/rootfs_linux.go     |   64 ++++++---
 b/libcontainer/specconv/example.go |   18 +-
 b/libcontainer/utils/utils.go      |   54 +++++++
 libcontainer/container_linux.go    |   52 ++++++-
 libcontainer/rootfs_linux.go       |  251 ++++++++++++++++++-------------------
 7 files changed, 283 insertions(+), 167 deletions(-)

$ cat debian/patches/00{11,12,13}*|filterdiff -x '*.bats' -x '*_test.go'

From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Tue, 23 Feb 2021 17:58:07 -0800
Subject: PR2818 Fix cgroup2 mount for rootless case

Backport this PR so we can apply the patch for CVE-2021-30465

1. libct/newInitConfig: nit
2. libct/rootfs: introduce and use mountConfig
3 .libct/rootfs/mountCgroupV2: minor refactor
4. Fix cgroup2 mount for rootless case
5. tests/int: use bfq test with rootless
6. tests/int: add a case for cgroupv2 mount

Origin: backport, https://github.com/opencontainers/runc/pull/2818
---
 libcontainer/container_linux.go  | 10 +++++--
 libcontainer/init_linux.go       |  1 +
 libcontainer/rootfs_linux.go     | 64 ++++++++++++++++++++++++++++------------
 libcontainer/specconv/example.go | 18 +++++------
 tests/integration/cgroups.bats   | 34 +++++++++++++++++++++
 5 files changed, 96 insertions(+), 31 deletions(-)

diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index 3dca29e..1cbc734 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -594,6 +594,9 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
 		AppArmorProfile:  c.config.AppArmorProfile,
 		ProcessLabel:     c.config.ProcessLabel,
 		Rlimits:          c.config.Rlimits,
+		CreateConsole:    process.ConsoleSocket != nil,
+		ConsoleWidth:     process.ConsoleWidth,
+		ConsoleHeight:    process.ConsoleHeight,
 	}
 	if process.NoNewPrivileges != nil {
 		cfg.NoNewPrivileges = *process.NoNewPrivileges
@@ -607,9 +610,10 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
 	if len(process.Rlimits) > 0 {
 		cfg.Rlimits = process.Rlimits
 	}
-	cfg.CreateConsole = process.ConsoleSocket != nil
-	cfg.ConsoleWidth = process.ConsoleWidth
-	cfg.ConsoleHeight = process.ConsoleHeight
+	if cgroups.IsCgroup2UnifiedMode() {
+		cfg.Cgroup2Path = c.cgroupManager.Path("")
+	}
+
 	return cfg
 }
 
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
index c57af0e..6817970 100644
--- a/libcontainer/init_linux.go
+++ b/libcontainer/init_linux.go
@@ -70,6 +70,7 @@ type initConfig struct {
 	RootlessEUID     bool                  `json:"rootless_euid,omitempty"`
 	RootlessCgroups  bool                  `json:"rootless_cgroups,omitempty"`
 	SpecState        *specs.State          `json:"spec_state,omitempty"`
+	Cgroup2Path      string                `json:"cgroup2_path,omitempty"`
 }
 
 type initer interface {
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
index 411496a..5d2d74c 100644
--- a/libcontainer/rootfs_linux.go
+++ b/libcontainer/rootfs_linux.go
@@ -17,6 +17,7 @@ import (
 	"github.com/moby/sys/mountinfo"
 	"github.com/mrunalp/fileutils"
 	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/devices"
 	"github.com/opencontainers/runc/libcontainer/system"
@@ -29,6 +30,14 @@ import (
 
 const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
 
+type mountConfig struct {
+	root            string
+	label           string
+	cgroup2Path     string
+	rootlessCgroups bool
+	cgroupns        bool
+}
+
 // needsSetupDev returns true if /dev needs to be set up.
 func needsSetupDev(config *configs.Config) bool {
 	for _, m := range config.Mounts {
@@ -48,7 +57,13 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
 		return newSystemErrorWithCause(err, "preparing rootfs")
 	}
 
-	hasCgroupns := config.Namespaces.Contains(configs.NEWCGROUP)
+	mountConfig := &mountConfig{
+		root:            config.Rootfs,
+		label:           config.MountLabel,
+		cgroup2Path:     iConfig.Cgroup2Path,
+		rootlessCgroups: iConfig.RootlessCgroups,
+		cgroupns:        config.Namespaces.Contains(configs.NEWCGROUP),
+	}
 	setupDev := needsSetupDev(config)
 	for _, m := range config.Mounts {
 		for _, precmd := range m.PremountCmds {
@@ -56,7 +71,7 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
 				return newSystemErrorWithCause(err, "running premount command")
 			}
 		}
-		if err := mountToRootfs(m, config.Rootfs, config.MountLabel, hasCgroupns); err != nil {
+		if err := mountToRootfs(m, mountConfig); err != nil {
 			return newSystemErrorWithCausef(err, "mounting %q to rootfs at %q", m.Source, m.Destination)
 		}
 
@@ -222,7 +237,7 @@ func prepareBindMount(m *configs.Mount, rootfs string) error {
 	return nil
 }
 
-func mountCgroupV1(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error {
+func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
 	binds, err := getCgroupMounts(m)
 	if err != nil {
 		return err
@@ -242,12 +257,12 @@ func mountCgroupV1(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b
 		Data:             "mode=755",
 		PropagationFlags: m.PropagationFlags,
 	}
-	if err := mountToRootfs(tmpfs, rootfs, mountLabel, enableCgroupns); err != nil {
+	if err := mountToRootfs(tmpfs, c); err != nil {
 		return err
 	}
 	for _, b := range binds {
-		if enableCgroupns {
-			subsystemPath := filepath.Join(rootfs, b.Destination)
+		if c.cgroupns {
+			subsystemPath := filepath.Join(c.root, b.Destination)
 			if err := os.MkdirAll(subsystemPath, 0755); err != nil {
 				return err
 			}
@@ -266,7 +281,7 @@ func mountCgroupV1(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b
 				return err
 			}
 		} else {
-			if err := mountToRootfs(b, rootfs, mountLabel, enableCgroupns); err != nil {
+			if err := mountToRootfs(b, c); err != nil {
 				return err
 			}
 		}
@@ -276,7 +291,7 @@ func mountCgroupV1(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b
 			// symlink(2) is very dumb, it will just shove the path into
 			// the link and doesn't do any checks or relative path
 			// conversion. Also, don't error out if the cgroup already exists.
-			if err := os.Symlink(mc, filepath.Join(rootfs, m.Destination, ss)); err != nil && !os.IsExist(err) {
+			if err := os.Symlink(mc, filepath.Join(c.root, m.Destination, ss)); err != nil && !os.IsExist(err) {
 				return err
 			}
 		}
@@ -284,28 +299,39 @@ func mountCgroupV1(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b
 	return nil
 }
 
-func mountCgroupV2(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error {
-	cgroupPath, err := securejoin.SecureJoin(rootfs, m.Destination)
+func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
+	dest, err := securejoin.SecureJoin(c.root, m.Destination)
 	if err != nil {
 		return err
 	}
-	if err := os.MkdirAll(cgroupPath, 0755); err != nil {
+	if err := os.MkdirAll(dest, 0755); err != nil {
 		return err
 	}
-	if err := unix.Mount(m.Source, cgroupPath, "cgroup2", uintptr(m.Flags), m.Data); err != nil {
+	if err := unix.Mount(m.Source, dest, "cgroup2", uintptr(m.Flags), m.Data); err != nil {
 		// when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158)
 		if err == unix.EPERM || err == unix.EBUSY {
-			return unix.Mount("/sys/fs/cgroup", cgroupPath, "", uintptr(m.Flags)|unix.MS_BIND, "")
+			src := fs2.UnifiedMountpoint
+			if c.cgroupns && c.cgroup2Path != "" {
+				// Emulate cgroupns by bind-mounting
+				// the container cgroup path rather than
+				// the whole /sys/fs/cgroup.
+				src = c.cgroup2Path
+			}
+			err = unix.Mount(src, dest, "", uintptr(m.Flags)|unix.MS_BIND, "")
+			if err == unix.ENOENT && c.rootlessCgroups {
+				err = nil
+			}
+			return err
 		}
 		return err
 	}
 	return nil
 }
 
-func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error {
-	var (
-		dest = m.Destination
-	)
+func mountToRootfs(m *configs.Mount, c *mountConfig) error {
+	rootfs := c.root
+	mountLabel := c.label
+	dest := m.Destination
 	if !strings.HasPrefix(dest, rootfs) {
 		dest = filepath.Join(rootfs, dest)
 	}
@@ -424,9 +450,9 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b
 		}
 	case "cgroup":
 		if cgroups.IsCgroup2UnifiedMode() {
-			return mountCgroupV2(m, rootfs, mountLabel, enableCgroupns)
+			return mountCgroupV2(m, c)
 		}
-		return mountCgroupV1(m, rootfs, mountLabel, enableCgroupns)
+		return mountCgroupV1(m, c)
 	default:
 		// ensure that the destination of the mount is resolved of symlinks at mount time because
 		// any previous mounts can invalidate the next mount's destination.
diff --git a/libcontainer/specconv/example.go b/libcontainer/specconv/example.go
index 8a201bc..56bab3b 100644
--- a/libcontainer/specconv/example.go
+++ b/libcontainer/specconv/example.go
@@ -2,6 +2,7 @@ package specconv
 
 import (
 	"os"
+	"path/filepath"
 	"strings"
 
 	"github.com/opencontainers/runc/libcontainer/cgroups"
@@ -200,8 +201,14 @@ func ToRootless(spec *specs.Spec) {
 	// Fix up mounts.
 	var mounts []specs.Mount
 	for _, mount := range spec.Mounts {
-		// Ignore all mounts that are under /sys.
-		if strings.HasPrefix(mount.Destination, "/sys") {
+		// Replace the /sys mount with an rbind.
+		if filepath.Clean(mount.Destination) == "/sys" {
+			mounts = append(mounts, specs.Mount{
+				Source:      "/sys",
+				Destination: "/sys",
+				Type:        "none",
+				Options:     []string{"rbind", "nosuid", "noexec", "nodev", "ro"},
+			})
 			continue
 		}
 
@@ -216,13 +223,6 @@ func ToRootless(spec *specs.Spec) {
 		mount.Options = options
 		mounts = append(mounts, mount)
 	}
-	// Add the sysfs mount as an rbind.
-	mounts = append(mounts, specs.Mount{
-		Source:      "/sys",
-		Destination: "/sys",
-		Type:        "none",
-		Options:     []string{"rbind", "nosuid", "noexec", "nodev", "ro"},
-	})
 	spec.Mounts = mounts
 
 	// Remove cgroup settings.
From: Adrian Reber <areber@redhat.com>
Date: Mon, 8 Feb 2021 13:05:54 +0000
Subject: PR2798 Correctly restore containers with nested bind mounts

Backport this PR so we can apply the patch for CVE-2021-30465

1. Re-create mountpoints during restore
2. tests: test nested bind mount restore

Origin: backport, https://github.com/opencontainers/runc/pull/2798
---
 libcontainer/container_linux.go   | 27 +++++++++++++++++++++++++
 tests/integration/checkpoint.bats | 42 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)

diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index 1cbc734..76a69af 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -1239,11 +1239,38 @@ func (c *linuxContainer) prepareCriuRestoreMounts(mounts []*configs.Mount) error
 	// Now go through all mounts and create the mountpoints
 	// if the mountpoints are not on a tmpfs, as CRIU will
 	// restore the complete tmpfs content from its checkpoint.
+	umounts := []string{}
+	defer func() {
+		for _, u := range umounts {
+			if e := unix.Unmount(u, unix.MNT_DETACH); e != nil {
+				if e != unix.EINVAL {
+					// Ignore EINVAL as it means 'target is not a mount point.'
+					// It probably has already been unmounted.
+					logrus.Warnf("Error during cleanup unmounting of %q (%v)", u, e)
+				}
+			}
+		}
+	}()
 	for _, m := range mounts {
 		if !isPathInPrefixList(m.Destination, tmpfs) {
 			if err := c.makeCriuRestoreMountpoints(m); err != nil {
 				return err
 			}
+			// If the mount point is a bind mount, we need to mount
+			// it now so that runc can create the necessary mount
+			// points for mounts in bind mounts.
+			// This also happens during initial container creation.
+			// Without this CRIU restore will fail
+			// See: https://github.com/opencontainers/runc/issues/2748
+			// It is also not necessary to order the mount points
+			// because during initial container creation mounts are
+			// set up in the order they are configured.
+			if m.Device == "bind" {
+				if err := unix.Mount(m.Source, m.Destination, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
+					return errorsf.Wrapf(err, "unable to bind mount %q to %q", m.Source, m.Destination)
+				}
+				umounts = append(umounts, m.Destination)
+			}
 		}
 	}
 	return nil
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Thu, 1 Apr 2021 12:00:31 -0700
Subject: CVE-2021-30465

Origin: backport, https://github.com/opencontainers/runc/commit/0ca91f44f1664da834bc61115a849b56d22f595f
---
 libcontainer/container_linux.go  |  25 ++--
 libcontainer/rootfs_linux.go     | 251 +++++++++++++++++++--------------------
 libcontainer/utils/utils.go      |  54 +++++++++
 libcontainer/utils/utils_test.go |  35 ++++++
 4 files changed, 229 insertions(+), 136 deletions(-)

diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index 76a69af..309b02a 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -1202,7 +1202,6 @@ func (c *linuxContainer) makeCriuRestoreMountpoints(m *configs.Mount) error {
 		if err := checkProcMount(c.config.Rootfs, dest, ""); err != nil {
 			return err
 		}
-		m.Destination = dest
 		if err := os.MkdirAll(dest, 0755); err != nil {
 			return err
 		}
@@ -1242,13 +1241,16 @@ func (c *linuxContainer) prepareCriuRestoreMounts(mounts []*configs.Mount) error
 	umounts := []string{}
 	defer func() {
 		for _, u := range umounts {
-			if e := unix.Unmount(u, unix.MNT_DETACH); e != nil {
-				if e != unix.EINVAL {
-					// Ignore EINVAL as it means 'target is not a mount point.'
-					// It probably has already been unmounted.
-					logrus.Warnf("Error during cleanup unmounting of %q (%v)", u, e)
+			_ = utils.WithProcfd(c.config.Rootfs, u, func(procfd string) error {
+				if e := unix.Unmount(procfd, unix.MNT_DETACH); e != nil {
+					if e != unix.EINVAL {
+						// Ignore EINVAL as it means 'target is not a mount point.'
+						// It probably has already been unmounted.
+						logrus.Warnf("Error during cleanup unmounting of %s (%s): %v", procfd, u, e)
+					}
 				}
-			}
+				return nil
+			})
 		}
 	}()
 	for _, m := range mounts {
@@ -1266,8 +1268,13 @@ func (c *linuxContainer) prepareCriuRestoreMounts(mounts []*configs.Mount) error
 			// because during initial container creation mounts are
 			// set up in the order they are configured.
 			if m.Device == "bind" {
-				if err := unix.Mount(m.Source, m.Destination, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
-					return errorsf.Wrapf(err, "unable to bind mount %q to %q", m.Source, m.Destination)
+				if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(procfd string) error {
+					if err := unix.Mount(m.Source, procfd, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
+						return errorsf.Wrapf(err, "unable to bind mount %q to %q (through %q)", m.Source, m.Destination, procfd)
+					}
+					return nil
+				}); err != nil {
+					return err
 				}
 				umounts = append(umounts, m.Destination)
 			}
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
index 5d2d74c..96be669 100644
--- a/libcontainer/rootfs_linux.go
+++ b/libcontainer/rootfs_linux.go
@@ -25,6 +25,7 @@ import (
 	libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
 	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/opencontainers/selinux/go-selinux/label"
+	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )
 
@@ -228,8 +229,6 @@ func prepareBindMount(m *configs.Mount, rootfs string) error {
 	if err := checkProcMount(rootfs, dest, m.Source); err != nil {
 		return err
 	}
-	// update the mount with the correct dest after symlinks are resolved.
-	m.Destination = dest
 	if err := createIfNotExists(dest, stat.IsDir()); err != nil {
 		return err
 	}
@@ -266,18 +265,21 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
 			if err := os.MkdirAll(subsystemPath, 0755); err != nil {
 				return err
 			}
-			flags := defaultMountFlags
-			if m.Flags&unix.MS_RDONLY != 0 {
-				flags = flags | unix.MS_RDONLY
-			}
-			cgroupmount := &configs.Mount{
-				Source:      "cgroup",
-				Device:      "cgroup", // this is actually fstype
-				Destination: subsystemPath,
-				Flags:       flags,
-				Data:        filepath.Base(subsystemPath),
-			}
-			if err := mountNewCgroup(cgroupmount); err != nil {
+			if err := utils.WithProcfd(c.root, b.Destination, func(procfd string) error {
+				flags := defaultMountFlags
+				if m.Flags&unix.MS_RDONLY != 0 {
+					flags = flags | unix.MS_RDONLY
+				}
+				var (
+					source = "cgroup"
+					data   = filepath.Base(subsystemPath)
+				)
+				if data == "systemd" {
+					data = cgroups.CgroupNamePrefix + data
+					source = "systemd"
+				}
+				return unix.Mount(source, procfd, "cgroup", uintptr(flags), data)
+			}); err != nil {
 				return err
 			}
 		} else {
@@ -307,33 +309,79 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
 	if err := os.MkdirAll(dest, 0755); err != nil {
 		return err
 	}
-	if err := unix.Mount(m.Source, dest, "cgroup2", uintptr(m.Flags), m.Data); err != nil {
-		// when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158)
-		if err == unix.EPERM || err == unix.EBUSY {
-			src := fs2.UnifiedMountpoint
-			if c.cgroupns && c.cgroup2Path != "" {
-				// Emulate cgroupns by bind-mounting
-				// the container cgroup path rather than
-				// the whole /sys/fs/cgroup.
-				src = c.cgroup2Path
-			}
-			err = unix.Mount(src, dest, "", uintptr(m.Flags)|unix.MS_BIND, "")
-			if err == unix.ENOENT && c.rootlessCgroups {
-				err = nil
+	return utils.WithProcfd(c.root, m.Destination, func(procfd string) error {
+		if err := unix.Mount(m.Source, procfd, "cgroup2", uintptr(m.Flags), m.Data); err != nil {
+			// when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158)
+			if err == unix.EPERM || err == unix.EBUSY {
+				src := fs2.UnifiedMountpoint
+				if c.cgroupns && c.cgroup2Path != "" {
+					// Emulate cgroupns by bind-mounting
+					// the container cgroup path rather than
+					// the whole /sys/fs/cgroup.
+					src = c.cgroup2Path
+				}
+				err = unix.Mount(src, procfd, "", uintptr(m.Flags)|unix.MS_BIND, "")
+				if err == unix.ENOENT && c.rootlessCgroups {
+					err = nil
+				}
 			}
 			return err
 		}
+		return nil
+	})
+}
+
+func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) {
+	// Set up a scratch dir for the tmpfs on the host.
+	tmpdir, err := prepareTmp("/tmp")
+	if err != nil {
+		return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir")
+	}
+	defer cleanupTmp(tmpdir)
+	tmpDir, err := ioutil.TempDir(tmpdir, "runctmpdir")
+	if err != nil {
+		return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
+	}
+	defer os.RemoveAll(tmpDir)
+
+	// Configure the *host* tmpdir as if it's the container mount. We change
+	// m.Destination since we are going to mount *on the host*.
+	oldDest := m.Destination
+	m.Destination = tmpDir
+	err = mountPropagate(m, "/", mountLabel)
+	m.Destination = oldDest
+	if err != nil {
 		return err
 	}
-	return nil
+	defer func() {
+		if Err != nil {
+			if err := unix.Unmount(tmpDir, unix.MNT_DETACH); err != nil {
+				logrus.Warnf("tmpcopyup: failed to unmount tmpdir on error: %v", err)
+			}
+		}
+	}()
+
+	return utils.WithProcfd(rootfs, m.Destination, func(procfd string) (Err error) {
+		// Copy the container data to the host tmpdir. We append "/" to force
+		// CopyDirectory to resolve the symlink rather than trying to copy the
+		// symlink itself.
+		if err := fileutils.CopyDirectory(procfd+"/", tmpDir); err != nil {
+			return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, procfd, tmpDir, err)
+		}
+		// Now move the mount into the container.
+		if err := unix.Mount(tmpDir, procfd, "", unix.MS_MOVE, ""); err != nil {
+			return fmt.Errorf("tmpcopyup: failed to move mount %s to %s (%s): %w", tmpDir, procfd, m.Destination, err)
+		}
+		return nil
+	})
 }
 
 func mountToRootfs(m *configs.Mount, c *mountConfig) error {
 	rootfs := c.root
 	mountLabel := c.label
-	dest := m.Destination
-	if !strings.HasPrefix(dest, rootfs) {
-		dest = filepath.Join(rootfs, dest)
+	dest, err := securejoin.SecureJoin(rootfs, m.Destination)
+	if err != nil {
+		return err
 	}
 
 	switch m.Device {
@@ -364,53 +412,21 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
 		}
 		return label.SetFileLabel(dest, mountLabel)
 	case "tmpfs":
-		copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
-		tmpDir := ""
-		// dest might be an absolute symlink, so it needs
-		// to be resolved under rootfs.
-		dest, err := securejoin.SecureJoin(rootfs, m.Destination)
-		if err != nil {
-			return err
-		}
-		m.Destination = dest
 		stat, err := os.Stat(dest)
 		if err != nil {
 			if err := os.MkdirAll(dest, 0755); err != nil {
 				return err
 			}
 		}
-		if copyUp {
-			tmpdir, err := prepareTmp("/tmp")
-			if err != nil {
-				return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir")
-			}
-			defer cleanupTmp(tmpdir)
-			tmpDir, err = ioutil.TempDir(tmpdir, "runctmpdir")
-			if err != nil {
-				return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
-			}
-			defer os.RemoveAll(tmpDir)
-			m.Destination = tmpDir
+
+		if m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP {
+			err = doTmpfsCopyUp(m, rootfs, mountLabel)
+		} else {
+			err = mountPropagate(m, rootfs, mountLabel)
 		}
-		if err := mountPropagate(m, rootfs, mountLabel); err != nil {
+		if err != nil {
 			return err
 		}
-		if copyUp {
-			if err := fileutils.CopyDirectory(dest, tmpDir); err != nil {
-				errMsg := fmt.Errorf("tmpcopyup: failed to copy %s to %s: %v", dest, tmpDir, err)
-				if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil {
-					return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
-				}
-				return errMsg
-			}
-			if err := unix.Mount(tmpDir, dest, "", unix.MS_MOVE, ""); err != nil {
-				errMsg := fmt.Errorf("tmpcopyup: failed to move mount %s to %s: %v", tmpDir, dest, err)
-				if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil {
-					return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
-				}
-				return errMsg
-			}
-		}
 		if stat != nil {
 			if err = os.Chmod(dest, stat.Mode()); err != nil {
 				return err
@@ -454,19 +470,9 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
 		}
 		return mountCgroupV1(m, c)
 	default:
-		// ensure that the destination of the mount is resolved of symlinks at mount time because
-		// any previous mounts can invalidate the next mount's destination.
-		// this can happen when a user specifies mounts within other mounts to cause breakouts or other
-		// evil stuff to try to escape the container's rootfs.
-		var err error
-		if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil {
-			return err
-		}
 		if err := checkProcMount(rootfs, dest, m.Source); err != nil {
 			return err
 		}
-		// update the mount with the correct dest after symlinks are resolved.
-		m.Destination = dest
 		if err := os.MkdirAll(dest, 0755); err != nil {
 			return err
 		}
@@ -649,7 +655,7 @@ func createDevices(config *configs.Config) error {
 	return nil
 }
 
-func bindMountDeviceNode(dest string, node *devices.Device) error {
+func bindMountDeviceNode(rootfs, dest string, node *devices.Device) error {
 	f, err := os.Create(dest)
 	if err != nil && !os.IsExist(err) {
 		return err
@@ -657,7 +663,9 @@ func bindMountDeviceNode(dest string, node *devices.Device) error {
 	if f != nil {
 		f.Close()
 	}
-	return unix.Mount(node.Path, dest, "bind", unix.MS_BIND, "")
+	return utils.WithProcfd(rootfs, dest, func(procfd string) error {
+		return unix.Mount(node.Path, procfd, "bind", unix.MS_BIND, "")
+	})
 }
 
 // Creates the device node in the rootfs of the container.
@@ -666,18 +674,21 @@ func createDeviceNode(rootfs string, node *devices.Device, bind bool) error {
 		// The node only exists for cgroup reasons, ignore it here.
 		return nil
 	}
-	dest := filepath.Join(rootfs, node.Path)
+	dest, err := securejoin.SecureJoin(rootfs, node.Path)
+	if err != nil {
+		return err
+	}
 	if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil {
 		return err
 	}
 	if bind {
-		return bindMountDeviceNode(dest, node)
+		return bindMountDeviceNode(rootfs, dest, node)
 	}
 	if err := mknodDevice(dest, node); err != nil {
 		if os.IsExist(err) {
 			return nil
 		} else if os.IsPermission(err) {
-			return bindMountDeviceNode(dest, node)
+			return bindMountDeviceNode(rootfs, dest, node)
 		}
 		return err
 	}
@@ -1013,61 +1024,47 @@ func writeSystemProperty(key, value string) error {
 }
 
 func remount(m *configs.Mount, rootfs string) error {
-	var (
-		dest = m.Destination
-	)
-	if !strings.HasPrefix(dest, rootfs) {
-		dest = filepath.Join(rootfs, dest)
-	}
-	return unix.Mount(m.Source, dest, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), "")
+	return utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
+		return unix.Mount(m.Source, procfd, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), "")
+	})
 }
 
 // Do the mount operation followed by additional mounts required to take care
-// of propagation flags.
+// of propagation flags. This will always be scoped inside the container rootfs.
 func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
 	var (
-		dest  = m.Destination
 		data  = label.FormatMountLabel(m.Data, mountLabel)
 		flags = m.Flags
 	)
-	if libcontainerUtils.CleanPath(dest) == "/dev" {
-		flags &= ^unix.MS_RDONLY
-	}
-
-	// Mount it rw to allow chmod operation. A remount will be performed
-	// later to make it ro if set.
-	if m.Device == "tmpfs" {
+	// Delay mounting the filesystem read-only if we need to do further
+	// operations on it. We need to set up files in "/dev" and tmpfs mounts may
+	// need to be chmod-ed after mounting. The mount will be remounted ro later
+	// in finalizeRootfs() if necessary.
+	if libcontainerUtils.CleanPath(m.Destination) == "/dev" || m.Device == "tmpfs" {
 		flags &= ^unix.MS_RDONLY
 	}
 
-	copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
-	if !(copyUp || strings.HasPrefix(dest, rootfs)) {
-		dest = filepath.Join(rootfs, dest)
-	}
-
-	if err := unix.Mount(m.Source, dest, m.Device, uintptr(flags), data); err != nil {
-		return err
-	}
-
-	for _, pflag := range m.PropagationFlags {
-		if err := unix.Mount("", dest, "", uintptr(pflag), ""); err != nil {
-			return err
+	// Because the destination is inside a container path which might be
+	// mutating underneath us, we verify that we are actually going to mount
+	// inside the container with WithProcfd() -- mounting through a procfd
+	// mounts on the target.
+	if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
+		return unix.Mount(m.Source, procfd, m.Device, uintptr(flags), data)
+	}); err != nil {
+		return fmt.Errorf("mount through procfd: %w", err)
+	}
+	// We have to apply mount propagation flags in a separate WithProcfd() call
+	// because the previous call invalidates the passed procfd -- the mount
+	// target needs to be re-opened.
+	if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
+		for _, pflag := range m.PropagationFlags {
+			if err := unix.Mount("", procfd, "", uintptr(pflag), ""); err != nil {
+				return err
+			}
 		}
-	}
-	return nil
-}
-
-func mountNewCgroup(m *configs.Mount) error {
-	var (
-		data   = m.Data
-		source = m.Source
-	)
-	if data == "systemd" {
-		data = cgroups.CgroupNamePrefix + data
-		source = "systemd"
-	}
-	if err := unix.Mount(source, m.Destination, m.Device, uintptr(m.Flags), data); err != nil {
-		return err
+		return nil
+	}); err != nil {
+		return fmt.Errorf("change mount propagation through procfd: %w", err)
 	}
 	return nil
 }
diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go
index 1b72b7a..cd78f23 100644
--- a/libcontainer/utils/utils.go
+++ b/libcontainer/utils/utils.go
@@ -3,12 +3,15 @@ package utils
 import (
 	"encoding/binary"
 	"encoding/json"
+	"fmt"
 	"io"
 	"os"
 	"path/filepath"
+	"strconv"
 	"strings"
 	"unsafe"
 
+	"github.com/cyphar/filepath-securejoin"
 	"golang.org/x/sys/unix"
 )
 
@@ -88,6 +91,57 @@ func CleanPath(path string) string {
 	return filepath.Clean(path)
 }
 
+// stripRoot returns the passed path, stripping the root path if it was
+// (lexicially) inside it. Note that both passed paths will always be treated
+// as absolute, and the returned path will also always be absolute. In
+// addition, the paths are cleaned before stripping the root.
+func stripRoot(root, path string) string {
+	// Make the paths clean and absolute.
+	root, path = CleanPath("/"+root), CleanPath("/"+path)
+	switch {
+	case path == root:
+		path = "/"
+	case root == "/":
+		// do nothing
+	case strings.HasPrefix(path, root+"/"):
+		path = strings.TrimPrefix(path, root+"/")
+	}
+	return CleanPath("/" + path)
+}
+
+// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
+// corresponding to the unsafePath resolved within the root. Before passing the
+// fd, this path is verified to have been inside the root -- so operating on it
+// through the passed fdpath should be safe. Do not access this path through
+// the original path strings, and do not attempt to use the pathname outside of
+// the passed closure (the file handle will be freed once the closure returns).
+func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
+	// Remove the root then forcefully resolve inside the root.
+	unsafePath = stripRoot(root, unsafePath)
+	path, err := securejoin.SecureJoin(root, unsafePath)
+	if err != nil {
+		return fmt.Errorf("resolving path inside rootfs failed: %v", err)
+	}
+
+	// Open the target path.
+	fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return fmt.Errorf("open o_path procfd: %w", err)
+	}
+	defer fh.Close()
+
+	// Double-check the path is the one we expected.
+	procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd()))
+	if realpath, err := os.Readlink(procfd); err != nil {
+		return fmt.Errorf("procfd verification failed: %w", err)
+	} else if realpath != path {
+		return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
+	}
+
+	// Run the closure.
+	return fn(procfd)
+}
+
 // SearchLabels searches a list of key-value pairs for the provided key and
 // returns the corresponding value. The pairs must be separated with '='.
 func SearchLabels(labels []string, query string) string {


Reply to: