Skip to content

Commit

Permalink
proc: Dentry flushing without proc_mnt
Browse files Browse the repository at this point in the history
Cleanly handling proc mount options requires the internal mount of
proc to be removed (so mount options are not ignored), and quite
possibly multiple proc superblocks per pid namespace (so a
second mount of proc does not silently get the mount options of the
first mount of proc).  In either case being able to flush proc
dentries on process exit needs to be made to work without going
through proc_mnt.  After several discussions this is the set
of changes that work and no one objects to.

---

I have addressed all of the review comments as I understand them,
and fixed the small oversight the kernel test robot was able to
find. (I had failed to initialize the new field pid->inodes).

I did not hear any concerns from the 10,000 foot level last time
so I am assuming this set of changes (barring bugs) is good to go.

Unless some new issues appear my plan is to put this in my tree
and get this into linux-next, which will give Alexey something
to build his changes on.

I tested this set of changes by running:
 (while ls -1 -f /proc > /dev/null ; do :; done ) &
And monitoring the amount of free memory.

With the flushing disabled I saw the used memory in the system grow by
20M before the shrinker would bring it back down to where it started.
With the patch applied I saw the memory usage stay essentially fixed.

So flushing definitely keeps things working better.

Eric W. Biederman (6):
      proc: Rename in proc_inode rename sysctl_inodes sibling_inodes
      proc: Generalize proc_sys_prune_dcache into proc_prune_siblings_dcache
      proc: In proc_prune_siblings_dcache cache an aquired super block
      proc: Use d_invalidate in proc_prune_siblings_dcache
      proc: Clear the pieces of proc_inode that proc_evict_inode cares about
      proc: Use a list of inodes to flush from proc

 fs/proc/base.c          | 111 ++++++++++++++++--------------------------------
 fs/proc/inode.c         |  73 ++++++++++++++++++++++++++++---
 fs/proc/internal.h      |   4 +-
 fs/proc/proc_sysctl.c   |  45 +++-----------------
 include/linux/pid.h     |   1 +
 include/linux/proc_fs.h |   4 +-
 kernel/exit.c           |   4 +-
 kernel/pid.c            |   1 +
 8 files changed, 120 insertions(+), 123 deletions(-)

Link: https://lore.kernel.org/lkml/871rqk2brn.fsf_-_@x220.int.ebiederm.org/
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>

Merge branch 'proc-dentry-flushing-without-proc-mnt-v2' into HEAD
  • Loading branch information
Eric W. Biederman committed Feb 28, 2020
2 parents 11a48a5 + 7bc3e6e commit a13ae69
Show file tree
Hide file tree
Showing 8 changed files with 120 additions and 123 deletions.
111 changes: 36 additions & 75 deletions fs/proc/base.c
Original file line number Diff line number Diff line change
Expand Up @@ -1834,11 +1834,25 @@ void task_dump_owner(struct task_struct *task, umode_t mode,
*rgid = gid;
}

/*
 * proc_pid_evict_inode - detach a proc inode from the struct pid it tracks.
 * @ei: proc inode being evicted; ei->pid is dereferenced unconditionally,
 *      so the caller must only pass an inode whose ->pid is set.
 *
 * Directory inodes are unlinked from the pid's inode list under
 * pid->wait_pidfd.lock, then the pid reference is dropped with put_pid().
 */
void proc_pid_evict_inode(struct proc_inode *ei)
{
struct pid *pid = ei->pid;

/*
 * Only directory inodes are added to pid->inodes by
 * proc_pid_make_inode(), so only they need unlinking here.
 * NOTE(review): wait_pidfd.lock is the lock of the pidfd wait queue in
 * struct pid; confirm plain spin_lock() is safe in every context that
 * takes this lock.
 */
if (S_ISDIR(ei->vfs_inode.i_mode)) {
spin_lock(&pid->wait_pidfd.lock);
hlist_del_init_rcu(&ei->sibling_inodes);
spin_unlock(&pid->wait_pidfd.lock);
}

/* Release the pid reference that was stored in ei->pid. */
put_pid(pid);
}

struct inode *proc_pid_make_inode(struct super_block * sb,
struct task_struct *task, umode_t mode)
{
struct inode * inode;
struct proc_inode *ei;
struct pid *pid;

/* We need a new inode */

Expand All @@ -1856,10 +1870,18 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
/*
* grab the reference to task.
*/
ei->pid = get_task_pid(task, PIDTYPE_PID);
if (!ei->pid)
pid = get_task_pid(task, PIDTYPE_PID);
if (!pid)
goto out_unlock;

/* Let the pid remember us for quick removal */
ei->pid = pid;
if (S_ISDIR(mode)) {
spin_lock(&pid->wait_pidfd.lock);
hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
spin_unlock(&pid->wait_pidfd.lock);
}

task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
security_task_to_inode(task, inode);

Expand Down Expand Up @@ -3230,90 +3252,29 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
.permission = proc_pid_permission,
};

/*
 * proc_flush_task_mnt - invalidate the dentries of one task in one proc mount.
 * @mnt:  proc mount whose root (mnt->mnt_root) is searched.
 * @pid:  numeric id of the task; looked up as "<pid>" under the root.
 * @tgid: numeric id of the thread group leader.
 *
 * Invalidates "<pid>" under the mount root if it is in the dcache and,
 * when @pid differs from @tgid, also "<tgid>/task/<pid>".  Entries that
 * are not found are silently skipped.
 */
static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
{
struct dentry *dentry, *leader, *dir;
/* Room for ten decimal digits plus the terminating NUL. */
char buf[10 + 1];
struct qstr name;

/* First the top-level "<pid>" entry. */
name.name = buf;
name.len = snprintf(buf, sizeof(buf), "%u", pid);
/* no ->d_hash() rejects on procfs */
dentry = d_hash_and_lookup(mnt->mnt_root, &name);
if (dentry) {
d_invalidate(dentry);
dput(dentry);
}

/* When pid == tgid there is no separate per-thread entry to look up. */
if (pid == tgid)
return;

/* Walk "<tgid>" -> "task" -> "<pid>", stopping at the first miss. */
name.name = buf;
name.len = snprintf(buf, sizeof(buf), "%u", tgid);
leader = d_hash_and_lookup(mnt->mnt_root, &name);
if (!leader)
goto out;

name.name = "task";
name.len = strlen(name.name);
dir = d_hash_and_lookup(leader, &name);
if (!dir)
goto out_put_leader;

name.name = buf;
name.len = snprintf(buf, sizeof(buf), "%u", pid);
dentry = d_hash_and_lookup(dir, &name);
if (dentry) {
d_invalidate(dentry);
dput(dentry);
}

/* Drop the lookup references in reverse order of acquisition. */
dput(dir);
out_put_leader:
dput(leader);
out:
return;
}

/**
* proc_flush_task - Remove dcache entries for @task from the /proc dcache.
* @task: task that should be flushed.
* proc_flush_pid - Remove dcache entries for @pid from the /proc dcache.
* @pid: pid that should be flushed.
*
* When flushing dentries from proc, one needs to flush them from global
* proc (proc_mnt) and from all the namespaces' procs this task was seen
* in. This call is supposed to do all of this job.
*
* Looks in the dcache for
* /proc/@pid
* /proc/@tgid/task/@pid
* if either directory is present flushes it and all of it'ts children
* from the dcache.
* This function walks a list of inodes (that belong to any proc
* filesystem) that are attached to the pid and flushes them from
* the dentry cache.
*
* It is safe and reasonable to cache /proc entries for a task until
* that task exits. After that they just clog up the dcache with
* useless entries, possibly causing useful dcache entries to be
* flushed instead. This routine is proved to flush those useless
* dcache entries at process exit time.
* flushed instead. This routine is provided to flush those useless
* dcache entries when a process is reaped.
*
* NOTE: This routine is just an optimization so it does not guarantee
* that no dcache entries will exist at process exit time it
* just makes it very unlikely that any will persist.
* that no dcache entries will exist after a process is reaped
* it just makes it very unlikely that any will persist.
*/

void proc_flush_task(struct task_struct *task)
void proc_flush_pid(struct pid *pid)
{
int i;
struct pid *pid, *tgid;
struct upid *upid;

pid = task_pid(task);
tgid = task_tgid(task);

for (i = 0; i <= pid->level; i++) {
upid = &pid->numbers[i];
proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
tgid->numbers[i].nr);
}
proc_invalidate_siblings_dcache(&pid->inodes, &pid->wait_pidfd.lock);
put_pid(pid);
}

static struct dentry *proc_pid_instantiate(struct dentry * dentry,
Expand Down
73 changes: 68 additions & 5 deletions fs/proc/inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,21 +33,27 @@ static void proc_evict_inode(struct inode *inode)
{
struct proc_dir_entry *de;
struct ctl_table_header *head;
struct proc_inode *ei = PROC_I(inode);

truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);

/* Stop tracking associated processes */
put_pid(PROC_I(inode)->pid);
if (ei->pid) {
proc_pid_evict_inode(ei);
ei->pid = NULL;
}

/* Let go of any associated proc directory entry */
de = PDE(inode);
if (de)
de = ei->pde;
if (de) {
pde_put(de);
ei->pde = NULL;
}

head = PROC_I(inode)->sysctl;
head = ei->sysctl;
if (head) {
RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
RCU_INIT_POINTER(ei->sysctl, NULL);
proc_sys_evict_inode(inode, head);
}
}
Expand All @@ -68,6 +74,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
ei->pde = NULL;
ei->sysctl = NULL;
ei->sysctl_entry = NULL;
INIT_HLIST_NODE(&ei->sibling_inodes);
ei->ns_ops = NULL;
return &ei->vfs_inode;
}
Expand Down Expand Up @@ -102,6 +109,62 @@ void __init proc_init_kmemcache(void)
BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE);
}

/*
 * proc_invalidate_siblings_dcache - invalidate the dentries of every inode on a list.
 * @inodes: list head of proc_inode entries linked through ->sibling_inodes.
 * @lock:   spinlock taken around each removal from @inodes.
 *
 * Repeatedly takes the first entry of @inodes, unlinks it under @lock and
 * invalidates the dentries that alias its inode.  The loop ends when the
 * list is empty; every iteration removes the entry it looked at, so each
 * pass makes progress.
 *
 * An s_active reference on the super block is held while its inodes are
 * processed.  The reference is cached in old_sb so that consecutive inodes
 * on the same super block do not re-acquire it.
 */
void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
{
struct inode *inode;
struct proc_inode *ei;
struct hlist_node *node;
/* Super block on which an s_active reference is currently held, if any. */
struct super_block *old_sb = NULL;

rcu_read_lock();
for (;;) {
struct super_block *sb;
node = hlist_first_rcu(inodes);
if (!node)
break;
ei = hlist_entry(node, struct proc_inode, sibling_inodes);
/* Unlink the entry so the next pass sees a new list head. */
spin_lock(lock);
hlist_del_init_rcu(&ei->sibling_inodes);
spin_unlock(lock);

inode = &ei->vfs_inode;
sb = inode->i_sb;
/*
 * A new super block needs its own s_active reference; if the
 * count is already zero, skip this inode (still under RCU).
 */
if ((sb != old_sb) && !atomic_inc_not_zero(&sb->s_active))
continue;
/* Take the inode reference before leaving the RCU read section. */
inode = igrab(inode);
rcu_read_unlock();
/* Switch the cached reference: drop the previous super block. */
if (sb != old_sb) {
if (old_sb)
deactivate_super(old_sb);
old_sb = sb;
}
/* igrab() failed: nothing to invalidate, go on to the next entry. */
if (unlikely(!inode)) {
rcu_read_lock();
continue;
}

if (S_ISDIR(inode->i_mode)) {
/* Directories: invalidate the one alias that is found. */
struct dentry *dir = d_find_any_alias(inode);
if (dir) {
d_invalidate(dir);
dput(dir);
}
} else {
/* Non-directories: keep going until no alias is returned. */
struct dentry *dentry;
while ((dentry = d_find_alias(inode))) {
d_invalidate(dentry);
dput(dentry);
}
}
iput(inode);

/* Re-enter the RCU read section before touching the list again. */
rcu_read_lock();
}
rcu_read_unlock();
/* Release the super block reference still cached from the last pass. */
if (old_sb)
deactivate_super(old_sb);
}

static int proc_show_options(struct seq_file *seq, struct dentry *root)
{
struct super_block *sb = root->d_sb;
Expand Down
4 changes: 3 additions & 1 deletion fs/proc/internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ struct proc_inode {
struct proc_dir_entry *pde;
struct ctl_table_header *sysctl;
struct ctl_table *sysctl_entry;
struct hlist_node sysctl_inodes;
struct hlist_node sibling_inodes;
const struct proc_ns_operations *ns_ops;
struct inode vfs_inode;
} __randomize_layout;
Expand Down Expand Up @@ -158,6 +158,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
extern const struct dentry_operations pid_dentry_operations;
extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int);
extern int proc_setattr(struct dentry *, struct iattr *);
extern void proc_pid_evict_inode(struct proc_inode *);
extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
extern void pid_update_inode(struct task_struct *, struct inode *);
extern int pid_delete_dentry(const struct dentry *);
Expand Down Expand Up @@ -210,6 +211,7 @@ extern const struct inode_operations proc_pid_link_inode_operations;
extern const struct super_operations proc_sops;

void proc_init_kmemcache(void);
void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock);
void set_proc_pid_nlink(void);
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
extern void proc_entry_rundown(struct proc_dir_entry *);
Expand Down
45 changes: 6 additions & 39 deletions fs/proc/proc_sysctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -267,42 +267,9 @@ static void unuse_table(struct ctl_table_header *p)
complete(p->unregistering);
}

static void proc_sys_prune_dcache(struct ctl_table_header *head)
static void proc_sys_invalidate_dcache(struct ctl_table_header *head)
{
struct inode *inode;
struct proc_inode *ei;
struct hlist_node *node;
struct super_block *sb;

rcu_read_lock();
for (;;) {
node = hlist_first_rcu(&head->inodes);
if (!node)
break;
ei = hlist_entry(node, struct proc_inode, sysctl_inodes);
spin_lock(&sysctl_lock);
hlist_del_init_rcu(&ei->sysctl_inodes);
spin_unlock(&sysctl_lock);

inode = &ei->vfs_inode;
sb = inode->i_sb;
if (!atomic_inc_not_zero(&sb->s_active))
continue;
inode = igrab(inode);
rcu_read_unlock();
if (unlikely(!inode)) {
deactivate_super(sb);
rcu_read_lock();
continue;
}

d_prune_aliases(inode);
iput(inode);
deactivate_super(sb);

rcu_read_lock();
}
rcu_read_unlock();
proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock);
}

/* called under sysctl_lock, will reacquire if has to wait */
Expand All @@ -324,10 +291,10 @@ static void start_unregistering(struct ctl_table_header *p)
spin_unlock(&sysctl_lock);
}
/*
* Prune dentries for unregistered sysctls: namespaced sysctls
* Invalidate dentries for unregistered sysctls: namespaced sysctls
* can have duplicate names and contaminate dcache very badly.
*/
proc_sys_prune_dcache(p);
proc_sys_invalidate_dcache(p);
/*
* do not remove from the list until nobody holds it; walking the
* list in do_sysctl() relies on that.
Expand Down Expand Up @@ -483,7 +450,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
}
ei->sysctl = head;
ei->sysctl_entry = table;
hlist_add_head_rcu(&ei->sysctl_inodes, &head->inodes);
hlist_add_head_rcu(&ei->sibling_inodes, &head->inodes);
head->count++;
spin_unlock(&sysctl_lock);

Expand Down Expand Up @@ -514,7 +481,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
{
spin_lock(&sysctl_lock);
hlist_del_init_rcu(&PROC_I(inode)->sysctl_inodes);
hlist_del_init_rcu(&PROC_I(inode)->sibling_inodes);
if (!--head->count)
kfree_rcu(head, rcu);
spin_unlock(&sysctl_lock);
Expand Down
1 change: 1 addition & 0 deletions include/linux/pid.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ struct pid
unsigned int level;
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
struct hlist_head inodes;
/* wait queue for pidfd notifications */
wait_queue_head_t wait_pidfd;
struct rcu_head rcu;
Expand Down
4 changes: 2 additions & 2 deletions include/linux/proc_fs.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ struct proc_ops {
typedef int (*proc_write_t)(struct file *, char *, size_t);

extern void proc_root_init(void);
extern void proc_flush_task(struct task_struct *);
extern void proc_flush_pid(struct pid *);

extern struct proc_dir_entry *proc_symlink(const char *,
struct proc_dir_entry *, const char *);
Expand Down Expand Up @@ -105,7 +105,7 @@ static inline void proc_root_init(void)
{
}

static inline void proc_flush_task(struct task_struct *task)
static inline void proc_flush_pid(struct pid *pid)
{
}

Expand Down
Loading

0 comments on commit a13ae69

Please sign in to comment.