Skip to content

Commit

Permalink
pidns: Make the pidns proc mount/umount logic obvious.
Browse files Browse the repository at this point in the history
Track the number of pids in the proc hash table.  When the number of
pids goes to 0 schedule work to unmount the kernel mount of proc.

Move the mount of proc into alloc_pid when we allocate the pid for
init.

Remove the surprising calls of pid_ns_release proc in fork and
proc_flush_task.  Those code paths really shouldn't know about proc
namespace implementation details and people have demonstrated several
times that finding and understanding those code paths is difficult and
non-obvious.

Because of the call path detach pid is alwasy called with the
rtnl_lock held free_pid is not allowed to sleep, so the work to
unmounting proc is moved to a work queue.  This has the side benefit
of not blocking the entire world waiting for the unnecessary
rcu_barrier in deactivate_locked_super.

In the process of making the code clear and obvious this fixes a bug
reported by Gao feng <gaofeng@cn.fujitsu.com> where we would leak a
mount of proc during clone(CLONE_NEWPID|CLONE_NEWNET) if copy_pid_ns
succeeded and copy_net_ns failed.

Acked-by: "Serge E. Hallyn" <serge@hallyn.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
  • Loading branch information
Eric W. Biederman committed Nov 19, 2012
1 parent 17cf22c commit 0a01f2c
Show file tree
Hide file tree
Showing 6 changed files with 26 additions and 22 deletions.
4 changes: 0 additions & 4 deletions fs/proc/base.c
Original file line number Diff line number Diff line change
Expand Up @@ -2590,10 +2590,6 @@ void proc_flush_task(struct task_struct *task)
proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
tgid->numbers[i].nr);
}

upid = &pid->numbers[pid->level];
if (upid->nr == 1)
pid_ns_release_proc(upid->ns);
}

static struct dentry *proc_pid_instantiate(struct inode *dir,
Expand Down
5 changes: 0 additions & 5 deletions fs/proc/root.c
Original file line number Diff line number Diff line change
Expand Up @@ -155,11 +155,6 @@ void __init proc_root_init(void)
err = register_filesystem(&proc_fs_type);
if (err)
return;
err = pid_ns_prepare_proc(&init_pid_ns);
if (err) {
unregister_filesystem(&proc_fs_type);
return;
}

proc_self_init();
proc_symlink("mounts", NULL, "self/mounts");
Expand Down
2 changes: 2 additions & 0 deletions include/linux/pid_namespace.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ struct pid_namespace {
struct kref kref;
struct pidmap pidmap[PIDMAP_ENTRIES];
int last_pid;
int nr_hashed;
struct task_struct *child_reaper;
struct kmem_cache *pid_cachep;
unsigned int level;
Expand All @@ -32,6 +33,7 @@ struct pid_namespace {
struct bsd_acct_struct *bacct;
#endif
struct user_namespace *user_ns;
struct work_struct proc_work;
kgid_t pid_gid;
int hide_pid;
int reboot; /* group exit code if this pidns was rebooted */
Expand Down
2 changes: 0 additions & 2 deletions kernel/fork.c
Original file line number Diff line number Diff line change
Expand Up @@ -1476,8 +1476,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (p->io_context)
exit_io_context(p);
bad_fork_cleanup_namespaces:
if (unlikely(clone_flags & CLONE_NEWPID))
pid_ns_release_proc(p->nsproxy->pid_ns);
exit_task_namespaces(p);
bad_fork_cleanup_mm:
if (p->mm)
Expand Down
21 changes: 17 additions & 4 deletions kernel/pid.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
#include <linux/proc_fs.h>

#define pid_hashfn(nr, ns) \
hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
Expand Down Expand Up @@ -270,8 +271,12 @@ void free_pid(struct pid *pid)
unsigned long flags;

spin_lock_irqsave(&pidmap_lock, flags);
for (i = 0; i <= pid->level; i++)
hlist_del_rcu(&pid->numbers[i].pid_chain);
for (i = 0; i <= pid->level; i++) {
struct upid *upid = pid->numbers + i;
hlist_del_rcu(&upid->pid_chain);
if (--upid->ns->nr_hashed == 0)
schedule_work(&upid->ns->proc_work);
}
spin_unlock_irqrestore(&pidmap_lock, flags);

for (i = 0; i <= pid->level; i++)
Expand All @@ -293,6 +298,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
goto out;

tmp = ns;
pid->level = ns->level;
for (i = ns->level; i >= 0; i--) {
nr = alloc_pidmap(tmp);
if (nr < 0)
Expand All @@ -303,17 +309,23 @@ struct pid *alloc_pid(struct pid_namespace *ns)
tmp = tmp->parent;
}

if (unlikely(is_child_reaper(pid))) {
if (pid_ns_prepare_proc(ns))
goto out_free;
}

get_pid_ns(ns);
pid->level = ns->level;
atomic_set(&pid->count, 1);
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);

upid = pid->numbers + ns->level;
spin_lock_irq(&pidmap_lock);
for ( ; upid >= pid->numbers; --upid)
for ( ; upid >= pid->numbers; --upid) {
hlist_add_head_rcu(&upid->pid_chain,
&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
upid->ns->nr_hashed++;
}
spin_unlock_irq(&pidmap_lock);

out:
Expand Down Expand Up @@ -570,6 +582,7 @@ void __init pidmap_init(void)
/* Reserve PID 0. We never call free_pidmap(0) */
set_bit(0, init_pid_ns.pidmap[0].page);
atomic_dec(&init_pid_ns.pidmap[0].nr_free);
init_pid_ns.nr_hashed = 1;

init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC);
Expand Down
14 changes: 7 additions & 7 deletions kernel/pid_namespace.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,12 @@ static struct kmem_cache *create_pid_cachep(int nr_ids)
return NULL;
}

static void proc_cleanup_work(struct work_struct *work)
{
struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
pid_ns_release_proc(ns);
}

/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
#define MAX_PID_NS_LEVEL 32

Expand Down Expand Up @@ -105,22 +111,16 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
INIT_WORK(&ns->proc_work, proc_cleanup_work);

set_bit(0, ns->pidmap[0].page);
atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);

for (i = 1; i < PIDMAP_ENTRIES; i++)
atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);

err = pid_ns_prepare_proc(ns);
if (err)
goto out_put_parent_pid_ns;

return ns;

out_put_parent_pid_ns:
put_pid_ns(parent_pid_ns);
put_user_ns(user_ns);
out_free_map:
kfree(ns->pidmap[0].page);
out_free:
Expand Down

0 comments on commit 0a01f2c

Please sign in to comment.