[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Bug#908192: Acknowledgement (linux-image-4.18.0-1-amd64: After upgrade kernel to linux-image-4.18.0-1-amd64, LXC can not start)



On Sat, Oct 13, 2018 at 10:26:57PM +0200, Salvatore Bonaccorso wrote:
> control: 908192 blocked by 908223
> Control: tag 908192 + wontfix
> 
> Hi Christian,
> 
> On Tue, Oct 09, 2018 at 02:06:09AM +0200, Christian Brauner wrote:
> > On Sat, 22 Sep 2018 21:20:19 +0200 Salvatore Bonaccorso
> > <carnil@debian.org> wrote:
> > > Hi,
> > >
> > > On Thu, Sep 20, 2018 at 06:09:07PM +0800, johnw wrote:
> > > >
> > > > Is it also reverted/back port to Debian?
> > > > Because I still have this problem.
> > >
> > > it has not yet been reverted upstream, the discussion got stuck here:
> > 
> > Hey, I'm one of the ACKers of the original patch and the author
> > of the revert. The kernel patch that broke userspace won't be reverted upstream.
> > However, I'm also a LXC maintainer and I have fixed LXC upstream to handle
> > 4.18 kernels. The relevant commits are:
> > 
> > master:
> > https://github.com/lxc/lxc/commit/3e04a6083eefe0b837db6d1b826721fd985ce052
> > https://github.com/lxc/lxc/commit/5067e4dd8594c3b1f8ee78f0e86edb480f84a156
> > 
> > stable-3.0:
> > https://github.com/lxc/lxc/commit/c221e3780a59ac08cd7e8758f7d71422f0c4decf
> > 
> > I haven't done a backport of this to stable-2.0 yet. So far there weren't
> > any complains for that one.
> 
> Thanks for your followup, much appreciated. The repsective bug report
> for LXC itself is #908223 where it then should be fixed.

Thanks. :)

> 
> I'm going to mark #908192 as wontfix for the kernel side. Is it
> planned to backport a fix for stable-2.0?

Just took care of that. I'm appending the commit here but it is
available upstream in the stable-2.0 branch too:

https://github.com/lxc/lxc/commit/db4219603946649474b5cb7915dbd6c17ec728f0

Christian
From db4219603946649474b5cb7915dbd6c17ec728f0 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Sun, 14 Oct 2018 11:42:29 +0200
Subject: [PATCH] autodev: adapt to changes in Linux 4.18

Starting with commit
55956b59df33 ("vfs: Allow userns root to call mknod on owned filesystems.")
Linux will allow mknod() in user namespaces for userns root if CAP_MKNOD is
available.
However, these device nodes are useless since

static struct super_block *alloc_super(struct file_system_type *type, int flags,
                                       struct user_namespace *user_ns)
{
    /* <snip> */

    if (s->s_user_ns != &init_user_ns)
            s->s_iflags |= SB_I_NODEV;

    /* <snip> */
}

will set the SB_I_NODEV flag on the filesystem. When a device node created in
non-init userns is open()ed the call chain will hit:

bool may_open_dev(const struct path *path)
{
    return !(path->mnt->mnt_flags & MNT_NODEV) &&
            !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
}

which will cause an EPERM because the device node is located on an fs
owned by non-init-userns and thus doesn't grant access to device nodes due to
SB_I_NODEV.

This commit enables LXC to deal with such kernels.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 src/lxc/conf.c | 116 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 78 insertions(+), 38 deletions(-)

diff --git a/src/lxc/conf.c b/src/lxc/conf.c
index 09acc34a..e9bed5cb 100644
--- a/src/lxc/conf.c
+++ b/src/lxc/conf.c
@@ -989,6 +989,7 @@ static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
 	int ret;
 	size_t clen;
 	char *path;
+	mode_t cur_mask;
 
 	INFO("Preparing \"/dev\"");
 
@@ -1000,37 +1001,45 @@ static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
 	if (ret < 0 || (size_t)ret >= clen)
 		return -1;
 
-	if (!dir_exists(path)) {
-		WARN("\"/dev\" directory does not exist. Proceeding without "
-		     "autodev being set up");
-		return 0;
+	cur_mask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
+	ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
+	if (ret < 0 && errno != EEXIST) {
+		SYSERROR("Failed to create \"/dev\" directory");
+		ret = -errno;
+		goto reset_umask;
 	}
 
 	ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
 			 rootfs->path ? rootfs->mount : NULL);
 	if (ret < 0) {
 		SYSERROR("Failed to mount tmpfs on \"%s\"", path);
-		return -1;
+		goto reset_umask;
 	}
-	INFO("Mounted tmpfs on \"%s\"", path);
+	TRACE("Mounted tmpfs on \"%s\"", path);
 
 	ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
-	if (ret < 0 || (size_t)ret >= clen)
-		return -1;
+	if (ret < 0 || (size_t)ret >= clen) {
+		ret = -1;
+		goto reset_umask;
+	}
 
 	/* If we are running on a devtmpfs mapping, dev/pts may already exist.
 	 * If not, then create it and exit if that fails...
 	 */
-	if (!dir_exists(path)) {
-		ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
-		if (ret < 0) {
-			SYSERROR("Failed to create directory \"%s\"", path);
-			return -1;
-		}
+	ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
+	if (ret < 0 && errno != EEXIST) {
+		SYSERROR("Failed to create directory \"%s\"", path);
+		ret = -errno;
+		goto reset_umask;
 	}
 
+	ret = 0;
+
+reset_umask:
+	(void)umask(cur_mask);
+
 	INFO("Prepared \"/dev\"");
-	return 0;
+	return ret;
 }
 
 struct lxc_device_node {
@@ -1049,16 +1058,23 @@ static const struct lxc_device_node lxc_devices[] = {
 	{ "zero",    S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
 };
 
+enum {
+	LXC_DEVNODE_BIND,
+	LXC_DEVNODE_MKNOD,
+	LXC_DEVNODE_PARTIAL,
+	LXC_DEVNODE_OPEN,
+};
+
 static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
 {
 	int i, ret;
-	char path[MAXPATHLEN];
+	char path[PATH_MAX];
 	mode_t cmask;
-	bool can_mknod = true;
+	int use_mknod = LXC_DEVNODE_MKNOD;
 
-	ret = snprintf(path, MAXPATHLEN, "%s/dev",
+	ret = snprintf(path, PATH_MAX, "%s/dev",
 		       rootfs->path ? rootfs->mount : "");
-	if (ret < 0 || ret >= MAXPATHLEN)
+	if (ret < 0 || ret >= PATH_MAX)
 		return -1;
 
 	/* ignore, just don't try to fill in */
@@ -1069,41 +1085,65 @@ static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
 
 	cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
 	for (i = 0; i < sizeof(lxc_devices) / sizeof(lxc_devices[0]); i++) {
-		char hostpath[MAXPATHLEN];
+		char hostpath[PATH_MAX];
 		const struct lxc_device_node *device = &lxc_devices[i];
 
-		ret = snprintf(path, MAXPATHLEN, "%s/dev/%s",
+		ret = snprintf(path, PATH_MAX, "%s/dev/%s",
 			       rootfs->path ? rootfs->mount : "", device->name);
-		if (ret < 0 || ret >= MAXPATHLEN)
+		if (ret < 0 || ret >= PATH_MAX)
 			return -1;
 
-		if (can_mknod) {
+		if (use_mknod >= LXC_DEVNODE_MKNOD) {
 			ret = mknod(path, device->mode, makedev(device->maj, device->min));
 			if (ret == 0 || (ret < 0 && errno == EEXIST)) {
 				DEBUG("Created device node \"%s\"", path);
-				continue;
-			}
+			} else if (ret < 0) {
+				if (errno != EPERM) {
+					SYSERROR("Failed to create device node \"%s\"", path);
+					return -1;
+				}
 
-			if (errno != EPERM) {
-				SYSERROR("Failed to create device node \"%s\"", path);
-				return -1;
+				use_mknod = LXC_DEVNODE_BIND;
 			}
 
-			/* This can e.g. happen when the container is
-			 * unprivileged or CAP_MKNOD has been dropped.
-			 */
-			can_mknod = false;
+			/* Device nodes are fully useable. */
+			if (use_mknod == LXC_DEVNODE_OPEN)
+				continue;
+
+			if (use_mknod == LXC_DEVNODE_MKNOD) {
+				/* See
+				 * - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=55956b59df336f6738da916dbb520b6e37df9fbd
+				 * - https://lists.linuxfoundation.org/pipermail/containers/2018-June/039176.html
+				 */
+				ret = open(path, O_RDONLY | O_CLOEXEC);
+				if (ret >= 0) {
+					close(ret);
+					/* Device nodes are fully useable. */
+					use_mknod = LXC_DEVNODE_OPEN;
+					continue;
+				}
+
+				TRACE("Failed to open \"%s\" device", path);
+				/* Device nodes are only partially useable. */
+				use_mknod = LXC_DEVNODE_PARTIAL;
+			}
 		}
 
-		ret = mknod(path, S_IFREG, 0);
-		if (ret < 0 && errno != EEXIST) {
-			SYSERROR("Failed to create file \"%s\"", path);
-			return -1;
+		if (use_mknod != LXC_DEVNODE_PARTIAL) {
+			/* If we are dealing with partially functional device
+			 * nodes the prio mknod() call will have created the
+			 * device node so we can use it as a bind-mount target.
+			 */
+			ret = mknod(path, S_IFREG | 0000, 0);
+			if (ret < 0 && errno != EEXIST) {
+				SYSERROR("Failed to create file \"%s\"", path);
+				return -1;
+			}
 		}
 
 		/* Fallback to bind-mounting the device from the host. */
-		ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", device->name);
-		if (ret < 0 || ret >= MAXPATHLEN)
+		ret = snprintf(hostpath, PATH_MAX, "/dev/%s", device->name);
+		if (ret < 0 || ret >= PATH_MAX)
 			return -1;
 
 		ret = safe_mount(hostpath, path, 0, MS_BIND, NULL,
-- 
2.17.1

Attachment: signature.asc
Description: PGP signature


Reply to: