Skip to content

Commit

Permalink
Merge patch series "Support foreign mount namespace with statmount/li…
Browse files Browse the repository at this point in the history
…stmount"

Josef Bacik <josef@toxicpanda.com> says:

Currently the only way to iterate over mount entries in mount namespaces that
aren't your own is to trawl through /proc in order to find /proc/$PID/mountinfo
for the mount namespace that you want.  This is hugely inefficient, so extend
both statmount() and listmount() to allow specifying a mount namespace id in
order to get to mounts in other mount namespaces.

There are a few components to this

1. Having a global index of the mount namespace based on the ->seq value in the
   mount namespace.  This gives us a unique identifier that isn't re-used.
2. Support looking up mount namespaces based on that unique identifier, and
   validating the user has permission to access the given mount namespace.
3. Provide a new ioctl() on nsfs in order to extract the unique identifier we
   can use for statmount() and listmount().

The code is relatively straightforward, and there is a selftest provided to
validate everything works properly.

This is based on vfs.all as of last week, so must be applied onto a tree that
has Christian's error handling rework in this area.  If you wish, you can pull
the tree directly here

https://github.com/josefbacik/linux/tree/listmount.combined

Christian and I collaborated on this series, which is why there are patches
from both of us in this series.

Christian Brauner (4):
  fs: relax permissions for listmount()
  fs: relax permissions for statmount()
  fs: Allow listmount() in foreign mount namespace
  fs: Allow statmount() in foreign mount namespace

Josef Bacik (4):
  fs: keep an index of current mount namespaces
  fs: export the mount ns id via statmount
  fs: add an ioctl to get the mnt ns id from nsfs
  selftests: add a test for the foreign mnt ns extensions

fs/mount.h                                    |   2 +
 fs/namespace.c                                | 240 ++++++++++--
 fs/nsfs.c                                     |  14 +
 include/uapi/linux/mount.h                    |   6 +-
 include/uapi/linux/nsfs.h                     |   2 +
 .../selftests/filesystems/statmount/Makefile  |   2 +-
 .../filesystems/statmount/statmount.h         |  46 +++
 .../filesystems/statmount/statmount_test.c    |  53 +--
 .../filesystems/statmount/statmount_test_ns.c | 360 ++++++++++++++++++
 9 files changed, 659 insertions(+), 66 deletions(-)
 create mode 100644 tools/testing/selftests/filesystems/statmount/statmount.h
 create mode 100644 tools/testing/selftests/filesystems/statmount/statmount_test_ns.c

Link: https://lore.kernel.org/r/cover.1719243756.git.josef@toxicpanda.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
  • Loading branch information
Christian Brauner committed Jun 28, 2024
2 parents d04bccd + d896f71 commit a7ebb0f
Show file tree
Hide file tree
Showing 9 changed files with 663 additions and 66 deletions.
2 changes: 2 additions & 0 deletions fs/mount.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ struct mnt_namespace {
u64 event;
unsigned int nr_mounts; /* # of mounts in the namespace */
unsigned int pending_mounts;
struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
refcount_t passive; /* number references not pinning @mounts */
} __randomize_layout;

struct mnt_pcp {
Expand Down
240 changes: 216 additions & 24 deletions fs/namespace.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
/*
 * Global index of mount namespaces. Entries are linked through
 * mnt_namespace::mnt_ns_tree_node and ordered by mnt_namespace::seq (see
 * mnt_ns_less()). Writers take mnt_ns_tree_lock for writing, lookups take
 * it for reading.
 */
static DEFINE_RWLOCK(mnt_ns_tree_lock);
static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */

struct mount_kattr {
unsigned int attr_set;
Expand All @@ -103,6 +105,109 @@ EXPORT_SYMBOL_GPL(fs_kobj);
*/
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);

/*
 * Three-way comparison of a sequence number against the one stored in @ns.
 *
 * Returns -1 if @seq is smaller than @ns->seq, 1 if it is larger, and 0 if
 * both are equal.
 */
static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
{
	const u64 other = ns->seq;

	if (seq == other)
		return 0;
	return seq < other ? -1 : 1;
}

/*
 * Convert a pointer to the embedded mnt_ns_tree_node back into the mount
 * namespace that contains it. A NULL @node is passed through as NULL so
 * callers can feed in the result of a tree walk directly.
 */
static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
{
	return node ? rb_entry(node, struct mnt_namespace, mnt_ns_tree_node) : NULL;
}

static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
{
struct mnt_namespace *ns_a = node_to_mnt_ns(a);
struct mnt_namespace *ns_b = node_to_mnt_ns(b);
u64 seq_a = ns_a->seq;

return mnt_ns_cmp(seq_a, ns_b) < 0;
}

static void mnt_ns_tree_add(struct mnt_namespace *ns)
{
guard(write_lock)(&mnt_ns_tree_lock);
rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
}

/*
 * Drop one passive reference on @ns. Once the last passive reference is
 * gone, the reference on the owning user namespace is put and the
 * structure itself is freed.
 *
 * Must not be called with mnt_ns_tree_lock held.
 */
static void mnt_ns_release(struct mnt_namespace *ns)
{
	lockdep_assert_not_held(&mnt_ns_tree_lock);

	/* Still referenced, e.g. by a concurrent {list,stat}mount(). */
	if (!refcount_dec_and_test(&ns->passive))
		return;

	put_user_ns(ns->user_ns);
	kfree(ns);
}
/* Lets callers write "__free(mnt_ns_release)"; a NULL pointer is ignored. */
DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))

/*
 * Unlink @ns from the global mount namespace tree (if it is linked) and
 * drop the passive reference that kept the structure alive.
 *
 * alloc_mnt_ns() initialises the tree node with RB_CLEAR_NODE() and the
 * node is only linked later by mnt_ns_tree_add(), whereas free_mnt_ns()
 * ends up here unconditionally. A non-anonymous namespace that is torn
 * down before it was ever added must therefore not be passed to
 * rb_erase(); RB_EMPTY_NODE() detects that case.
 */
static void mnt_ns_tree_remove(struct mnt_namespace *ns)
{
	/* remove from global mount namespace list */
	if (!is_anon_ns(ns) && !RB_EMPTY_NODE(&ns->mnt_ns_tree_node)) {
		guard(write_lock)(&mnt_ns_tree_lock);
		rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
	}

	/* The tree lock is released by now, as mnt_ns_release() requires. */
	mnt_ns_release(ns);
}

/*
 * Returns the mount namespace which either has the specified id or, if no
 * namespace has exactly that id, the one with the smallest id greater than
 * the specified one. Returns NULL if every namespace in the tree has a
 * smaller id.
 *
 * The caller must hold mnt_ns_tree_lock.
 */
static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
{
	struct mnt_namespace *best = NULL;
	struct rb_node *cur;

	lockdep_assert_held(&mnt_ns_tree_lock);

	cur = mnt_ns_tree.rb_node;
	while (cur) {
		struct mnt_namespace *ns = node_to_mnt_ns(cur);

		if (ns->seq < mnt_ns_id) {
			/* Too small: candidates can only be to the right. */
			cur = cur->rb_right;
			continue;
		}

		/* Candidate; a closer one may still exist to the left. */
		best = ns;
		if (ns->seq == mnt_ns_id)
			break;
		cur = cur->rb_left;
	}
	return best;
}

/*
 * Look up the mount namespace whose id is exactly @mnt_ns_id and take a
 * passive reference on it. Returns NULL if no such namespace is indexed.
 *
 * A passive reference keeps the structure itself alive but does not pin
 * its mounts: the namespace can still be emptied, e.g. when the last task
 * holding an active reference exits. To access the mounts of the namespace
 * @namespace_sem must be acquired first. If the namespace shut down before
 * @namespace_sem was acquired, {list,stat}mount() will see that the mount
 * rbtree of the namespace is empty.
 */
static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
{
	struct mnt_namespace *found;

	guard(read_lock)(&mnt_ns_tree_lock);

	found = mnt_ns_find_id_at(mnt_ns_id);
	if (found && found->seq == mnt_ns_id) {
		refcount_inc(&found->passive);
		return found;
	}

	/* Nothing at all, or only a namespace with a larger id. */
	return NULL;
}

static inline void lock_mount_hash(void)
{
write_seqlock(&mount_lock);
Expand Down Expand Up @@ -3733,8 +3838,7 @@ static void free_mnt_ns(struct mnt_namespace *ns)
if (!is_anon_ns(ns))
ns_free_inum(&ns->ns);
dec_mnt_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
kfree(ns);
mnt_ns_tree_remove(ns);
}

/*
Expand Down Expand Up @@ -3773,7 +3877,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
if (!anon)
new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
refcount_set(&new_ns->ns.count, 1);
refcount_set(&new_ns->passive, 1);
new_ns->mounts = RB_ROOT;
RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
init_waitqueue_head(&new_ns->poll);
new_ns->user_ns = get_user_ns(user_ns);
new_ns->ucounts = ucounts;
Expand Down Expand Up @@ -3850,6 +3956,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
while (p->mnt.mnt_root != q->mnt.mnt_root)
p = next_mnt(skip_mnt_tree(p), old);
}
mnt_ns_tree_add(new_ns);
namespace_unlock();

if (rootmnt)
Expand Down Expand Up @@ -4867,6 +4974,12 @@ static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq)
return 0;
}

/*
 * Report the id (sequence number) of mount namespace @ns in the statmount
 * result and flag the field as valid in the returned mask.
 */
static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns)
{
	s->sm.mnt_ns_id = ns->seq;
	s->sm.mask |= STATMOUNT_MNT_NS_ID;
}

static int statmount_string(struct kstatmount *s, u64 flag)
{
int ret;
Expand Down Expand Up @@ -4930,14 +5043,15 @@ static int copy_statmount_to_user(struct kstatmount *s)
static int do_statmount(struct kstatmount *s)
{
struct mount *m = real_mount(s->mnt);
struct mnt_namespace *ns = m->mnt_ns;
int err;

/*
* Don't trigger audit denials. We just want to determine what
* mounts to show users.
*/
if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) &&
!ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
!ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;

err = security_sb_statfs(s->mnt->mnt_root);
Expand All @@ -4962,6 +5076,9 @@ static int do_statmount(struct kstatmount *s)
if (!err && s->mask & STATMOUNT_MNT_POINT)
err = statmount_string(s, STATMOUNT_MNT_POINT);

if (!err && s->mask & STATMOUNT_MNT_NS_ID)
statmount_mnt_ns_id(s, ns);

if (err)
return err;

Expand Down Expand Up @@ -5003,7 +5120,7 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
int ret;
size_t usize;

BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER0);
BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1);

ret = get_user(usize, &req->size);
if (ret)
Expand All @@ -5021,10 +5138,63 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
return 0;
}

/*
 * Step from @curr to its neighbour in the rbtree it is linked into via
 * ->mnt_node: the previous node if @reverse is set, the next one otherwise.
 * The neighbouring node is converted with node_to_mount(); callers treat a
 * NULL result as "no further mount".
 */
static struct mount *listmnt_next(struct mount *curr, bool reverse)
{
	struct rb_node *pos = &curr->mnt_node;

	return node_to_mount(reverse ? rb_prev(pos) : rb_next(pos));
}

/*
 * Fill @root with a referenced path to use as the root for @ns.
 *
 * For the caller's own mount namespace this is simply the caller's fs
 * root. For any other namespace it is the root dentry of the mount that
 * follows ns->root in iteration order.
 *
 * Must be called with namespace_sem held. Returns 0 on success, or -ENOENT
 * if the namespace has no mounts or nothing follows ns->root.
 */
static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
{
	struct mount *first;

	rwsem_assert_held(&namespace_sem);

	if (ns != current->nsproxy->mnt_ns) {
		/* Foreign namespace: it may already have been emptied. */
		if (RB_EMPTY_ROOT(&ns->mounts))
			return -ENOENT;

		first = listmnt_next(ns->root, false);
		if (!first)
			return -ENOENT;

		root->mnt = mntget(&first->mnt);
		root->dentry = dget(root->mnt->mnt_root);
		return 0;
	}

	/* We're looking at our own ns, just use get_fs_root. */
	get_fs_root(current->fs, root);
	return 0;
}

/*
 * Resolve the mount namespace a {list,stat}mount() request refers to and
 * return it with a passive reference held.
 *
 * A non-zero @mnt_ns_id is looked up in the global index, which yields NULL
 * if no namespace has that id. An id of zero selects the caller's own mount
 * namespace.
 */
static struct mnt_namespace *grab_requested_mnt_ns(u64 mnt_ns_id)
{
	struct mnt_namespace *own;

	if (mnt_ns_id)
		return lookup_mnt_ns(mnt_ns_id);

	own = current->nsproxy->mnt_ns;
	refcount_inc(&own->passive);
	return own;
}

SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
struct statmount __user *, buf, size_t, bufsize,
unsigned int, flags)
{
struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
struct vfsmount *mnt;
struct mnt_id_req kreq;
struct kstatmount ks;
Expand All @@ -5039,21 +5209,41 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
if (ret)
return ret;

ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
if (!ns)
return -ENOENT;

if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
!ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
return -ENOENT;

retry:
ret = prepare_kstatmount(&ks, &kreq, buf, bufsize, seq_size);
if (ret)
return ret;

down_read(&namespace_sem);
mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns);
/* Has the namespace already been emptied? */
if (kreq.mnt_ns_id && RB_EMPTY_ROOT(&ns->mounts)) {
up_read(&namespace_sem);
kvfree(ks.seq.buf);
return -ENOENT;
}

mnt = lookup_mnt_in_ns(kreq.mnt_id, ns);
if (!mnt) {
up_read(&namespace_sem);
kvfree(ks.seq.buf);
return -ENOENT;
}

ks.mnt = mnt;
get_fs_root(current->fs, &ks.root);
ret = grab_requested_root(ns, &ks.root);
if (ret) {
up_read(&namespace_sem);
kvfree(ks.seq.buf);
return ret;
}
ret = do_statmount(&ks);
path_put(&ks.root);
up_read(&namespace_sem);
Expand All @@ -5066,30 +5256,21 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
return ret;
}

/*
 * Return the mount adjacent to @curr in the rbtree linked through
 * ->mnt_node: the predecessor when @reverse is true, the successor
 * otherwise. The neighbouring node is converted with node_to_mount().
 */
static struct mount *listmnt_next(struct mount *curr, bool reverse)
{
struct rb_node *node;

if (reverse)
node = rb_prev(&curr->mnt_node);
else
node = rb_next(&curr->mnt_node);

return node_to_mount(node);
}

static ssize_t do_listmount(u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids,
size_t nr_mnt_ids, bool reverse)
static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids,
bool reverse)
{
struct path root __free(path_put) = {};
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
struct path orig;
struct mount *r, *first;
ssize_t ret;

rwsem_assert_held(&namespace_sem);

get_fs_root(current->fs, &root);
ret = grab_requested_root(ns, &root);
if (ret)
return ret;

if (mnt_parent_id == LSMT_ROOT) {
orig = root;
} else {
Expand All @@ -5104,7 +5285,7 @@ static ssize_t do_listmount(u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids,
* mounts to show users.
*/
if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) &&
!ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
!ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;

ret = security_sb_statfs(orig.dentry);
Expand Down Expand Up @@ -5141,6 +5322,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
{
u64 *kmnt_ids __free(kvfree) = NULL;
const size_t maxcount = 1000000;
struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
struct mnt_id_req kreq;
ssize_t ret;

Expand All @@ -5167,8 +5349,16 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
if (!kmnt_ids)
return -ENOMEM;

ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
if (!ns)
return -ENOENT;

if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
!ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
return -ENOENT;

scoped_guard(rwsem_read, &namespace_sem)
ret = do_listmount(kreq.mnt_id, kreq.param, kmnt_ids,
ret = do_listmount(ns, kreq.mnt_id, kreq.param, kmnt_ids,
nr_mnt_ids, (flags & LISTMOUNT_REVERSE));

if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids)))
Expand Down Expand Up @@ -5204,6 +5394,8 @@ static void __init init_mount_tree(void)

set_fs_pwd(current->fs, &root);
set_fs_root(current->fs, &root);

mnt_ns_tree_add(ns);
}

void __init mnt_init(void)
Expand Down
Loading

0 comments on commit a7ebb0f

Please sign in to comment.