Skip to content

Commit

Permalink
Merge patch series "fs: allow detached mounts in clone_private_mount()"
Browse files Browse the repository at this point in the history
Christian Brauner <brauner@kernel.org> says:

In container workloads idmapped mounts are often used as layers for
overlayfs. Recently I added the ability to specify layers in overlayfs
as file descriptors instead of path names. It should be possible to
simply use the detached mounts directly when specifying layers instead
of having to attach them beforehand. They are discarded after overlayfs
is mounted anyway, so attaching them first only costs userspace pointless
system calls and the kernel pointless locking.

This just recently came up again in [1]. So enable clone_private_mount()
to use detached mounts directly. Following conditions must be met:

- Provided path must be the root of a detached mount tree.
- Provided path may not create mount namespace loops.
- Provided path must be mounted.

It would be possible to be stricter and require that the caller must
have CAP_SYS_ADMIN in the owning user namespace of the anonymous mount
namespace but since this restriction isn't enforced for move_mount()
there's no point in enforcing it for clone_private_mount().

* patches from https://lore.kernel.org/r/20250123-avancieren-erfreuen-3d61f6588fdd@brauner:
  selftests: add tests for using detached mount with overlayfs
  fs: allow detached mounts in clone_private_mount()

Link: https://lore.kernel.org/r/20250123-avancieren-erfreuen-3d61f6588fdd@brauner
Signed-off-by: Christian Brauner <brauner@kernel.org>
  • Loading branch information
Christian Brauner committed Feb 12, 2025
2 parents 29349a3 + ccc829b commit 3129946
Show file tree
Hide file tree
Showing 3 changed files with 190 additions and 35 deletions.
78 changes: 43 additions & 35 deletions fs/namespace.c
Original file line number Diff line number Diff line change
Expand Up @@ -2369,6 +2369,28 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry)
return false;
}

/*
 * Walk every mount in @subtree and make sure none of them refers back to an
 * earlier or the same mount namespace. Such references can pin a mount
 * namespace without being seen by the mount-cycle checking code, which would
 * allow cycles to be created.
 *
 * The walk is done with the mount hash lock held.
 *
 * Returns true if the subtree is free of such references, false as soon as
 * mnt_ns_loop() flags the root of one of the mounts.
 */
static bool check_for_nsfs_mounts(struct mount *subtree)
{
	struct mount *cur;
	bool clean = true;

	lock_mount_hash();
	for (cur = subtree; cur; cur = next_mnt(cur, subtree)) {
		if (mnt_ns_loop(cur->mnt.mnt_root)) {
			clean = false;
			break;
		}
	}
	unlock_mount_hash();

	return clean;
}

/**
* clone_private_mount - create a private clone of a path
* @path: path to clone
Expand All @@ -2377,37 +2399,45 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry)
* will not be attached anywhere in the namespace and will be private (i.e.
* changes to the originating mount won't be propagated into this).
*
* This assumes caller has called or done the equivalent of may_mount().
*
* Release with mntput().
*/
struct vfsmount *clone_private_mount(const struct path *path)
{
	struct mount *old_mnt = real_mount(path->mnt);
	struct mount *new_mnt;

	/*
	 * Hold namespace_sem for reading until the function returns. A
	 * brace-less scoped_guard() only covers the single statement that
	 * follows it, which would leave the remaining checks and clone_mnt()
	 * running without the semaphore, so use a function-scope guard().
	 * Every return path below drops the semaphore automatically.
	 */
	guard(rwsem_read)(&namespace_sem);

	if (IS_MNT_UNBINDABLE(old_mnt))
		return ERR_PTR(-EINVAL);

	if (mnt_has_parent(old_mnt)) {
		/* Attached mount: it has to pass check_mnt(). */
		if (!check_mnt(old_mnt))
			return ERR_PTR(-EINVAL);
	} else {
		/* No parent: only accept the root of a detached mount tree. */
		if (!is_mounted(&old_mnt->mnt))
			return ERR_PTR(-EINVAL);

		/* Make sure this isn't something purely kernel internal. */
		if (!is_anon_ns(old_mnt->mnt_ns))
			return ERR_PTR(-EINVAL);

		/* Make sure we don't create mount namespace loops. */
		if (!check_for_nsfs_mounts(old_mnt))
			return ERR_PTR(-EINVAL);
	}

	if (has_locked_children(old_mnt, path->dentry))
		return ERR_PTR(-EINVAL);

	/* Any clone_mnt() failure is reported to the caller as -EINVAL. */
	new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
	if (IS_ERR(new_mnt))
		return ERR_PTR(-EINVAL);

	/* Longterm mount to be removed by kern_unmount*() */
	new_mnt->mnt_ns = MNT_NS_INTERNAL;

	return &new_mnt->mnt;
}
EXPORT_SYMBOL_GPL(clone_private_mount);

Expand Down Expand Up @@ -3206,28 +3236,6 @@ static inline int tree_contains_unbindable(struct mount *mnt)
return 0;
}

/*
 * Check that there aren't references to earlier/same mount namespaces in the
 * specified subtree. Such references can act as pins for mount namespaces
 * that aren't checked by the mount-cycle checking code, thereby allowing
 * cycles to be made.
 *
 * Every mount reachable from @subtree via next_mnt() is inspected while the
 * mount hash lock is held.
 *
 * Returns true when no mount in the subtree is flagged by mnt_ns_loop(),
 * false otherwise.
 */
static bool check_for_nsfs_mounts(struct mount *subtree)
{
struct mount *p;
bool ret = false;

lock_mount_hash();
for (p = subtree; p; p = next_mnt(p, subtree))
if (mnt_ns_loop(p->mnt.mnt_root))
goto out; /* offending mount found: leave ret as false */

/* Whole subtree walked without a hit. */
ret = true;
out:
unlock_mount_hash();
return ret;
}

static int do_set_group(struct path *from_path, struct path *to_path)
{
struct mount *from, *to;
Expand Down
130 changes: 130 additions & 0 deletions tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,16 @@ FIXTURE(set_layers_via_fds) {
/*
 * Create the two directories the tests use as mountpoints: one receives the
 * overlayfs mount, the other the tmpfs that holds the layer directories.
 * Setup fails if either directory cannot be created.
 */
FIXTURE_SETUP(set_layers_via_fds)
{
ASSERT_EQ(mkdir("/set_layers_via_fds", 0755), 0);
ASSERT_EQ(mkdir("/set_layers_via_fds_tmpfs", 0755), 0);
}

/*
 * Unmount and remove both mountpoint directories created by the setup.
 * The umount2() results are not checked - presumably because a test may not
 * have left anything mounted there; the rmdir() calls must succeed.
 */
FIXTURE_TEARDOWN(set_layers_via_fds)
{
umount2("/set_layers_via_fds", 0);
ASSERT_EQ(rmdir("/set_layers_via_fds"), 0);

umount2("/set_layers_via_fds_tmpfs", 0);
ASSERT_EQ(rmdir("/set_layers_via_fds_tmpfs"), 0);
}

TEST_F(set_layers_via_fds, set_layers_via_fds)
Expand Down Expand Up @@ -279,4 +283,130 @@ TEST_F(set_layers_via_fds, set_500_layers_via_opath_fds)
ASSERT_EQ(close(fd_overlay), 0);
}

/*
 * Build an overlayfs mount whose upper, work, lower and data layers are all
 * passed as file descriptors that refer to detached mounts (created with
 * open_tree(OPEN_TREE_CLONE)) instead of attached paths, then look for the
 * layers in /proc/self/mountinfo.
 */
TEST_F(set_layers_via_fds, set_layers_via_detached_mount_fds)
{
	int fd_context, fd_tmpfs, fd_overlay, fd_tmp;
	int layer_fds[] = { [0 ... 8] = -EBADF };
	bool layers_found[] = { [0 ... 8] = false };
	size_t len = 0;
	char *line = NULL;
	FILE *f_mountinfo;

	/* Work in a private mount namespace that does not propagate outwards. */
	ASSERT_EQ(unshare(CLONE_NEWNS), 0);
	ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0);

	/* Create a tmpfs instance that will hold every layer directory. */
	fd_context = sys_fsopen("tmpfs", 0);
	ASSERT_GE(fd_context, 0);

	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
	fd_tmpfs = sys_fsmount(fd_context, 0, 0);
	ASSERT_GE(fd_tmpfs, 0);
	ASSERT_EQ(close(fd_context), 0);

	ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0);
	ASSERT_EQ(mkdirat(fd_tmpfs, "u/upper", 0755), 0);
	ASSERT_EQ(mkdirat(fd_tmpfs, "u/work", 0755), 0);
	ASSERT_EQ(mkdirat(fd_tmpfs, "l1", 0755), 0);
	ASSERT_EQ(mkdirat(fd_tmpfs, "l2", 0755), 0);
	ASSERT_EQ(mkdirat(fd_tmpfs, "l3", 0755), 0);
	ASSERT_EQ(mkdirat(fd_tmpfs, "l4", 0755), 0);
	ASSERT_EQ(mkdirat(fd_tmpfs, "d1", 0755), 0);
	ASSERT_EQ(mkdirat(fd_tmpfs, "d2", 0755), 0);
	ASSERT_EQ(mkdirat(fd_tmpfs, "d3", 0755), 0);

	ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/set_layers_via_fds_tmpfs", MOVE_MOUNT_F_EMPTY_PATH), 0);

	/*
	 * Upper and work directories are opened below one shared detached
	 * clone of "u". Use the sys_open_tree() wrapper like every other
	 * syscall in this test so the build does not depend on the libc
	 * providing open_tree().
	 *
	 * fd_tmp stays open until the end of the test.
	 * NOTE(review): closing it earlier presumably tears down the detached
	 * mount that layer_fds[0] and layer_fds[1] live on - confirm.
	 */
	fd_tmp = sys_open_tree(fd_tmpfs, "u", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
	ASSERT_GE(fd_tmp, 0);

	layer_fds[0] = openat(fd_tmp, "upper", O_CLOEXEC | O_DIRECTORY | O_PATH);
	ASSERT_GE(layer_fds[0], 0);

	layer_fds[1] = openat(fd_tmp, "work", O_CLOEXEC | O_DIRECTORY | O_PATH);
	ASSERT_GE(layer_fds[1], 0);

	/* Each lower and data layer is the root of its own detached mount. */
	layer_fds[2] = sys_open_tree(fd_tmpfs, "l1", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
	ASSERT_GE(layer_fds[2], 0);

	layer_fds[3] = sys_open_tree(fd_tmpfs, "l2", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
	ASSERT_GE(layer_fds[3], 0);

	layer_fds[4] = sys_open_tree(fd_tmpfs, "l3", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
	ASSERT_GE(layer_fds[4], 0);

	layer_fds[5] = sys_open_tree(fd_tmpfs, "l4", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
	ASSERT_GE(layer_fds[5], 0);

	layer_fds[6] = sys_open_tree(fd_tmpfs, "d1", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
	ASSERT_GE(layer_fds[6], 0);

	layer_fds[7] = sys_open_tree(fd_tmpfs, "d2", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
	ASSERT_GE(layer_fds[7], 0);

	layer_fds[8] = sys_open_tree(fd_tmpfs, "d3", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
	ASSERT_GE(layer_fds[8], 0);

	ASSERT_EQ(close(fd_tmpfs), 0);

	fd_context = sys_fsopen("overlay", 0);
	ASSERT_GE(fd_context, 0);

	/* Plain "lowerdir" must reject a file descriptor; only "lowerdir+" takes one. */
	ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir", NULL, layer_fds[2]), 0);

	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir", NULL, layer_fds[0]), 0);
	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir", NULL, layer_fds[1]), 0);
	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[2]), 0);
	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[3]), 0);
	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[4]), 0);
	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[5]), 0);
	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+", NULL, layer_fds[6]), 0);
	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+", NULL, layer_fds[7]), 0);
	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+", NULL, layer_fds[8]), 0);

	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_STRING, "metacopy", "on", 0), 0);

	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);

	fd_overlay = sys_fsmount(fd_context, 0, 0);
	ASSERT_GE(fd_overlay, 0);

	ASSERT_EQ(sys_move_mount(fd_overlay, "", -EBADF, "/set_layers_via_fds", MOVE_MOUNT_F_EMPTY_PATH), 0);

	f_mountinfo = fopen("/proc/self/mountinfo", "r");
	ASSERT_NE(f_mountinfo, NULL);

	/*
	 * NOTE(review): the needles below name /tmp/... paths, while this test
	 * creates its layers on the tmpfs attached at
	 * /set_layers_via_fds_tmpfs and hands over detached clones of them.
	 * Confirm which paths overlayfs actually reports for such layers.
	 */
	while (getline(&line, &len, f_mountinfo) != -1) {
		char *haystack = line;

		if (strstr(haystack, "workdir=/tmp/w"))
			layers_found[0] = true;
		if (strstr(haystack, "upperdir=/tmp/u"))
			layers_found[1] = true;
		if (strstr(haystack, "lowerdir+=/tmp/l1"))
			layers_found[2] = true;
		if (strstr(haystack, "lowerdir+=/tmp/l2"))
			layers_found[3] = true;
		if (strstr(haystack, "lowerdir+=/tmp/l3"))
			layers_found[4] = true;
		if (strstr(haystack, "lowerdir+=/tmp/l4"))
			layers_found[5] = true;
		if (strstr(haystack, "datadir+=/tmp/d1"))
			layers_found[6] = true;
		if (strstr(haystack, "datadir+=/tmp/d2"))
			layers_found[7] = true;
		if (strstr(haystack, "datadir+=/tmp/d3"))
			layers_found[8] = true;
	}
	free(line);

	for (size_t i = 0; i < ARRAY_SIZE(layer_fds); i++) {
		ASSERT_EQ(layers_found[i], true);
		ASSERT_EQ(close(layer_fds[i]), 0);
	}

	/* The overlay has been created, so the detached "u" clone can go now. */
	ASSERT_EQ(close(fd_tmp), 0);
	ASSERT_EQ(close(fd_context), 0);
	ASSERT_EQ(close(fd_overlay), 0);
	ASSERT_EQ(fclose(f_mountinfo), 0);
}

TEST_HARNESS_MAIN
17 changes: 17 additions & 0 deletions tools/testing/selftests/filesystems/overlayfs/wrappers.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,21 @@ static inline int sys_move_mount(int from_dfd, const char *from_pathname,
to_pathname, flags);
}

/*
 * Fallback definitions for flag constants that the included headers may not
 * provide. Each one is only defined when it is not already available.
 */
#ifndef OPEN_TREE_CLONE
#define OPEN_TREE_CLONE 1
#endif

/* Defined in terms of O_CLOEXEC, so a header providing O_CLOEXEC must be included. */
#ifndef OPEN_TREE_CLOEXEC
#define OPEN_TREE_CLOEXEC O_CLOEXEC
#endif

#ifndef AT_RECURSIVE
#define AT_RECURSIVE 0x8000
#endif

/*
 * Thin wrapper that issues the open_tree system call directly through
 * syscall(), passing @dfd, @filename and @flags through unchanged.
 *
 * Returns whatever syscall() returns, converted to int.
 */
static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags)
{
	long ret = syscall(__NR_open_tree, dfd, filename, flags);

	return (int)ret;
}

#endif

0 comments on commit 3129946

Please sign in to comment.