On Sat, Oct 13, 2018 at 10:26:57PM +0200, Salvatore Bonaccorso wrote: > control: 908192 blocked by 908223 > Control: tag 908192 + wontfix > > Hi Christian, > > On Tue, Oct 09, 2018 at 02:06:09AM +0200, Christian Brauner wrote: > > On Sat, 22 Sep 2018 21:20:19 +0200 Salvatore Bonaccorso > > <carnil@debian.org> wrote: > > > Hi, > > > > > > On Thu, Sep 20, 2018 at 06:09:07PM +0800, johnw wrote: > > > > > > > > Is it also reverted/back port to Debian? > > > > Because I still have this problem. > > > > > > it has not yet been reverted upstream, the discussion got stuck here: > > > > Hey, I'm one of the ACKers of the original patch and the author > > of the revert. The kernel patch that broke userspace won't be reverted upstream. > > However, I'm also a LXC maintainer and I have fixed LXC upstream to handle > > 4.18 kernels. The relevant commits are: > > > > master: > > https://github.com/lxc/lxc/commit/3e04a6083eefe0b837db6d1b826721fd985ce052 > > https://github.com/lxc/lxc/commit/5067e4dd8594c3b1f8ee78f0e86edb480f84a156 > > > > stable-3.0: > > https://github.com/lxc/lxc/commit/c221e3780a59ac08cd7e8758f7d71422f0c4decf > > > > I haven't done a backport of this to stable-2.0 yet. So far there weren't > > any complaints for that one. > > Thanks for your followup, much appreciated. The respective bug report > for LXC itself is #908223 where it then should be fixed. Thanks. :) > > I'm going to mark #908192 as wontfix for the kernel side. Is it > planned to backport a fix for stable-2.0? Just took care of that. I'm appending the commit here but it is available upstream in the stable-2.0 branch too: https://github.com/lxc/lxc/commit/db4219603946649474b5cb7915dbd6c17ec728f0 Christian
From db4219603946649474b5cb7915dbd6c17ec728f0 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Sun, 14 Oct 2018 11:42:29 +0200
Subject: [PATCH] autodev: adapt to changes in Linux 4.18
Starting with commit
55956b59df33 ("vfs: Allow userns root to call mknod on owned filesystems.")
Linux will allow mknod() in user namespaces for userns root if CAP_MKNOD is
available.
However, these device nodes are useless since
static struct super_block *alloc_super(struct file_system_type *type, int flags,
struct user_namespace *user_ns)
{
/* <snip> */
if (s->s_user_ns != &init_user_ns)
s->s_iflags |= SB_I_NODEV;
/* <snip> */
}
will set the SB_I_NODEV flag on the filesystem. When a device node created in
non-init userns is open()ed the call chain will hit:
bool may_open_dev(const struct path *path)
{
return !(path->mnt->mnt_flags & MNT_NODEV) &&
!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
}
which will cause an EPERM because the device node is located on an fs
owned by non-init-userns and thus doesn't grant access to device nodes due to
SB_I_NODEV.
This commit enables LXC to deal with such kernels.
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
src/lxc/conf.c | 116 +++++++++++++++++++++++++++++++++----------------
1 file changed, 78 insertions(+), 38 deletions(-)
diff --git a/src/lxc/conf.c b/src/lxc/conf.c
index 09acc34a..e9bed5cb 100644
--- a/src/lxc/conf.c
+++ b/src/lxc/conf.c
@@ -989,6 +989,7 @@ static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
int ret;
size_t clen;
char *path;
+ mode_t cur_mask;
INFO("Preparing \"/dev\"");
@@ -1000,37 +1001,45 @@ static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
if (ret < 0 || (size_t)ret >= clen)
return -1;
- if (!dir_exists(path)) {
- WARN("\"/dev\" directory does not exist. Proceeding without "
- "autodev being set up");
- return 0;
+ cur_mask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
+ ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
+ if (ret < 0 && errno != EEXIST) {
+ SYSERROR("Failed to create \"/dev\" directory");
+ ret = -errno;
+ goto reset_umask;
}
ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
rootfs->path ? rootfs->mount : NULL);
if (ret < 0) {
SYSERROR("Failed to mount tmpfs on \"%s\"", path);
- return -1;
+ goto reset_umask;
}
- INFO("Mounted tmpfs on \"%s\"", path);
+ TRACE("Mounted tmpfs on \"%s\"", path);
ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
- if (ret < 0 || (size_t)ret >= clen)
- return -1;
+ if (ret < 0 || (size_t)ret >= clen) {
+ ret = -1;
+ goto reset_umask;
+ }
/* If we are running on a devtmpfs mapping, dev/pts may already exist.
* If not, then create it and exit if that fails...
*/
- if (!dir_exists(path)) {
- ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
- if (ret < 0) {
- SYSERROR("Failed to create directory \"%s\"", path);
- return -1;
- }
+ ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
+ if (ret < 0 && errno != EEXIST) {
+ SYSERROR("Failed to create directory \"%s\"", path);
+ ret = -errno;
+ goto reset_umask;
}
+ ret = 0;
+
+reset_umask:
+ (void)umask(cur_mask);
+
INFO("Prepared \"/dev\"");
- return 0;
+ return ret;
}
struct lxc_device_node {
@@ -1049,16 +1058,23 @@ static const struct lxc_device_node lxc_devices[] = {
{ "zero", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
};
+enum {
+ LXC_DEVNODE_BIND,
+ LXC_DEVNODE_MKNOD,
+ LXC_DEVNODE_PARTIAL,
+ LXC_DEVNODE_OPEN,
+};
+
static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
{
int i, ret;
- char path[MAXPATHLEN];
+ char path[PATH_MAX];
mode_t cmask;
- bool can_mknod = true;
+ int use_mknod = LXC_DEVNODE_MKNOD;
- ret = snprintf(path, MAXPATHLEN, "%s/dev",
+ ret = snprintf(path, PATH_MAX, "%s/dev",
rootfs->path ? rootfs->mount : "");
- if (ret < 0 || ret >= MAXPATHLEN)
+ if (ret < 0 || ret >= PATH_MAX)
return -1;
/* ignore, just don't try to fill in */
@@ -1069,41 +1085,65 @@ static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
for (i = 0; i < sizeof(lxc_devices) / sizeof(lxc_devices[0]); i++) {
- char hostpath[MAXPATHLEN];
+ char hostpath[PATH_MAX];
const struct lxc_device_node *device = &lxc_devices[i];
- ret = snprintf(path, MAXPATHLEN, "%s/dev/%s",
+ ret = snprintf(path, PATH_MAX, "%s/dev/%s",
rootfs->path ? rootfs->mount : "", device->name);
- if (ret < 0 || ret >= MAXPATHLEN)
+ if (ret < 0 || ret >= PATH_MAX)
return -1;
- if (can_mknod) {
+ if (use_mknod >= LXC_DEVNODE_MKNOD) {
ret = mknod(path, device->mode, makedev(device->maj, device->min));
if (ret == 0 || (ret < 0 && errno == EEXIST)) {
DEBUG("Created device node \"%s\"", path);
- continue;
- }
+ } else if (ret < 0) {
+ if (errno != EPERM) {
+ SYSERROR("Failed to create device node \"%s\"", path);
+ return -1;
+ }
- if (errno != EPERM) {
- SYSERROR("Failed to create device node \"%s\"", path);
- return -1;
+ use_mknod = LXC_DEVNODE_BIND;
}
- /* This can e.g. happen when the container is
- * unprivileged or CAP_MKNOD has been dropped.
- */
- can_mknod = false;
+ /* Device nodes are fully useable. */
+ if (use_mknod == LXC_DEVNODE_OPEN)
+ continue;
+
+ if (use_mknod == LXC_DEVNODE_MKNOD) {
+ /* See
+ * - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=55956b59df336f6738da916dbb520b6e37df9fbd
+ * - https://lists.linuxfoundation.org/pipermail/containers/2018-June/039176.html
+ */
+ ret = open(path, O_RDONLY | O_CLOEXEC);
+ if (ret >= 0) {
+ close(ret);
+ /* Device nodes are fully useable. */
+ use_mknod = LXC_DEVNODE_OPEN;
+ continue;
+ }
+
+ TRACE("Failed to open \"%s\" device", path);
+ /* Device nodes are only partially useable. */
+ use_mknod = LXC_DEVNODE_PARTIAL;
+ }
}
- ret = mknod(path, S_IFREG, 0);
- if (ret < 0 && errno != EEXIST) {
- SYSERROR("Failed to create file \"%s\"", path);
- return -1;
+ if (use_mknod != LXC_DEVNODE_PARTIAL) {
+ /* If we are dealing with partially functional device
+ * nodes the prior mknod() call will have created the
+ * device node so we can use it as a bind-mount target.
+ */
+ ret = mknod(path, S_IFREG | 0000, 0);
+ if (ret < 0 && errno != EEXIST) {
+ SYSERROR("Failed to create file \"%s\"", path);
+ return -1;
+ }
}
/* Fallback to bind-mounting the device from the host. */
- ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", device->name);
- if (ret < 0 || ret >= MAXPATHLEN)
+ ret = snprintf(hostpath, PATH_MAX, "/dev/%s", device->name);
+ if (ret < 0 || ret >= PATH_MAX)
return -1;
ret = safe_mount(hostpath, path, 0, MS_BIND, NULL,
--
2.17.1
Attachment:
signature.asc
Description: PGP signature