RDMA/umem: Get rid of per_mm->notifier_count
This is intrinsically racy and the scheme is simply unnecessary. New MR
registration can wait for any ongoing invalidation to fully complete.

      CPU0                              CPU1
                                  if (atomic_read())
 if (atomic_dec_and_test() &&
     !list_empty())
  { /* not taken */ }
                                       list_add()

The old scheme puts the new UMEM into a kind of purgatory until another
invalidation rolls through.

Instead, hold the read side of the umem_rwsem across the paired start/end
and get rid of the racy 'deferred add' approach.

Since all umems in the rbtree are now always ready to go, also get rid of
the mn_counters_active machinery.
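
For illustration only (not part of the patch; the function names below are
hypothetical stand-ins), a minimal userspace model of the new scheme, with
umem_rwsem modeled as a pthread rwlock: the invalidation path holds the read
side across the paired start/end, so a registration taking the write side
simply waits for any in-flight invalidation to finish rather than deferring
its counters.

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t umem_rwsem = PTHREAD_RWLOCK_INITIALIZER;

/* Invalidation: the read side is taken in range_start ... */
static void invalidate_range_start(void)
{
	pthread_rwlock_rdlock(&umem_rwsem);
	printf("invalidate start: walk interval tree\n");
}

/* ... and only released in the paired range_end. */
static void invalidate_range_end(void)
{
	printf("invalidate end: walk interval tree\n");
	pthread_rwlock_unlock(&umem_rwsem);
}

/* Registration: the write side cannot be acquired while any start/end
 * pair is still in flight, so no 'deferred add' purgatory is needed. */
static void register_new_umem(void)
{
	pthread_rwlock_wrlock(&umem_rwsem);
	printf("insert umem into tree; counters valid immediately\n");
	pthread_rwlock_unlock(&umem_rwsem);
}

int main(void)
{
	invalidate_range_start();
	invalidate_range_end();
	register_new_umem();
	return 0;
}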

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
Jason Gunthorpe authored and Doug Ledford committed Sep 21, 2018
1 parent f27a0d5 commit ca748c3
Showing 2 changed files with 18 additions and 110 deletions.
113 changes: 18 additions & 95 deletions drivers/infiniband/core/umem_odp.c
@@ -80,83 +80,29 @@ INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp)
{
mutex_lock(&umem_odp->umem_mutex);

/* Only update private counters for this umem if it has them.
* Otherwise skip it. All page faults will be delayed for this umem. */
if (umem_odp->mn_counters_active) {
int notifiers_count = umem_odp->notifiers_count++;

if (notifiers_count == 0)
/* Initialize the completion object for waiting on
* notifiers. Since notifier_count is zero, no one
* should be waiting right now. */
reinit_completion(&umem_odp->notifier_completion);
}
if (umem_odp->notifiers_count++ == 0)
/*
* Initialize the completion object for waiting on
* notifiers. Since notifier_count is zero, no one should be
* waiting right now.
*/
reinit_completion(&umem_odp->notifier_completion);
mutex_unlock(&umem_odp->umem_mutex);
}

static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp)
{
mutex_lock(&umem_odp->umem_mutex);

/* Only update private counters for this umem if it has them.
* Otherwise skip it. All page faults will be delayed for this umem. */
if (umem_odp->mn_counters_active) {
/*
* This sequence increase will notify the QP page fault that
* the page that is going to be mapped in the spte could have
* been freed.
*/
++umem_odp->notifiers_seq;
if (--umem_odp->notifiers_count == 0)
complete_all(&umem_odp->notifier_completion);
}
/*
* This sequence increase will notify the QP page fault that the page
* that is going to be mapped in the spte could have been freed.
*/
++umem_odp->notifiers_seq;
if (--umem_odp->notifiers_count == 0)
complete_all(&umem_odp->notifier_completion);
mutex_unlock(&umem_odp->umem_mutex);
}

/* Account for a new mmu notifier in an ib_ucontext. */
static void
ib_ucontext_notifier_start_account(struct ib_ucontext_per_mm *per_mm)
{
atomic_inc(&per_mm->notifier_count);
}

/* Account for a terminating mmu notifier in an ib_ucontext.
*
* Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since
* the function takes the semaphore itself. */
static void ib_ucontext_notifier_end_account(struct ib_ucontext_per_mm *per_mm)
{
int zero_notifiers = atomic_dec_and_test(&per_mm->notifier_count);

if (zero_notifiers &&
!list_empty(&per_mm->no_private_counters)) {
/* No currently running mmu notifiers. Now is the chance to
* add private accounting to all previously added umems. */
struct ib_umem_odp *odp_data, *next;

/* Prevent concurrent mmu notifiers from working on the
* no_private_counters list. */
down_write(&per_mm->umem_rwsem);

/* Read the notifier_count again, with the umem_rwsem
* semaphore taken for write. */
if (!atomic_read(&per_mm->notifier_count)) {
list_for_each_entry_safe(odp_data, next,
&per_mm->no_private_counters,
no_private_counters) {
mutex_lock(&odp_data->umem_mutex);
odp_data->mn_counters_active = true;
list_del(&odp_data->no_private_counters);
complete_all(&odp_data->notifier_completion);
mutex_unlock(&odp_data->umem_mutex);
}
}

up_write(&per_mm->umem_rwsem);
}
}

static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp,
u64 start, u64 end, void *cookie)
{
@@ -186,7 +132,6 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn,
if (!per_mm->context->invalidate_range)
return;

ib_ucontext_notifier_start_account(per_mm);
down_read(&per_mm->umem_rwsem);
rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0,
ULLONG_MAX,
@@ -231,14 +176,9 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
else if (!down_read_trylock(&per_mm->umem_rwsem))
return -EAGAIN;

ib_ucontext_notifier_start_account(per_mm);
ret = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start,
end,
invalidate_range_start_trampoline,
blockable, NULL);
up_read(&per_mm->umem_rwsem);

return ret;
return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end,
invalidate_range_start_trampoline,
blockable, NULL);
}

static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start,
@@ -259,17 +199,10 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
if (!per_mm->context->invalidate_range)
return;

/*
* TODO: we currently bail out if there is any sleepable work to be done
* in ib_umem_notifier_invalidate_range_start so we shouldn't really block
* here. But this is ugly and fragile.
*/
down_read(&per_mm->umem_rwsem);
rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start,
end,
invalidate_range_end_trampoline, true, NULL);
up_read(&per_mm->umem_rwsem);
ib_ucontext_notifier_end_account(per_mm);
}

static const struct mmu_notifier_ops ib_umem_notifiers = {
@@ -287,12 +220,6 @@ static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp)
if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
rbt_ib_umem_insert(&umem_odp->interval_tree,
&per_mm->umem_tree);

if (likely(!atomic_read(&per_mm->notifier_count)))
umem_odp->mn_counters_active = true;
else
list_add(&umem_odp->no_private_counters,
&per_mm->no_private_counters);
up_write(&per_mm->umem_rwsem);
}

@@ -305,10 +232,7 @@ static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp)
if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
rbt_ib_umem_remove(&umem_odp->interval_tree,
&per_mm->umem_tree);
if (!umem_odp->mn_counters_active) {
list_del(&umem_odp->no_private_counters);
complete_all(&umem_odp->notifier_completion);
}
complete_all(&umem_odp->notifier_completion);

up_write(&per_mm->umem_rwsem);
}
@@ -327,7 +251,6 @@ static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx,
per_mm->mm = mm;
per_mm->umem_tree = RB_ROOT_CACHED;
init_rwsem(&per_mm->umem_rwsem);
INIT_LIST_HEAD(&per_mm->no_private_counters);

rcu_read_lock();
per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
15 changes: 0 additions & 15 deletions include/rdma/ib_umem_odp.h
@@ -67,15 +67,9 @@ struct ib_umem_odp {
struct mutex umem_mutex;
void *private; /* for the HW driver to use. */

/* When false, use the notifier counter in the ucontext struct. */
bool mn_counters_active;
int notifiers_seq;
int notifiers_count;

/* A linked list of umems that don't have private mmu notifier
* counters yet. */
struct list_head no_private_counters;

/* Tree tracking */
struct umem_odp_node interval_tree;

@@ -99,11 +93,8 @@ struct ib_ucontext_per_mm {
struct rb_root_cached umem_tree;
/* Protects umem_tree */
struct rw_semaphore umem_rwsem;
atomic_t notifier_count;

struct mmu_notifier mn;
/* A list of umems that don't have private mmu notifier counters yet. */
struct list_head no_private_counters;
unsigned int odp_mrs_count;

struct list_head ucontext_list;
@@ -162,12 +153,6 @@ static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp,
* and the ucontext umem_mutex semaphore locked for read).
*/

/* Do not allow page faults while the new ib_umem hasn't seen a state
* with zero notifiers yet, and doesn't have its own valid set of
* private counters. */
if (!umem_odp->mn_counters_active)
return 1;

if (unlikely(umem_odp->notifiers_count))
return 1;
if (umem_odp->notifiers_seq != mmu_seq)
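
Side note, for illustration only (not part of the patch): with
mn_counters_active gone, the retry check above reduces to the plain
sequence-count pattern. A fault handler samples notifiers_seq before
faulting pages and throws its work away if an invalidation is still running
(notifiers_count != 0) or has completed since the sample (notifiers_seq
changed). A self-contained userspace model of that check, with hypothetical
names:

#include <stdbool.h>
#include <stdio.h>

struct umem_model {
	int notifiers_seq;
	int notifiers_count;
};

/* Mirrors the logic of ib_umem_mmu_notifier_retry(): retry if an
 * invalidation is in flight or one completed since 'seq' was sampled.
 * (In the kernel these fields are read under umem_mutex.) */
static bool mmu_notifier_retry(const struct umem_model *u, int seq)
{
	if (u->notifiers_count)
		return true;
	if (u->notifiers_seq != seq)
		return true;
	return false;
}

int main(void)
{
	struct umem_model u = { 0, 0 };
	int seq = u.notifiers_seq;	/* sample before faulting pages */

	/* An invalidation runs in the meantime: start raises the count,
	 * end bumps the seq and drops the count. */
	u.notifiers_count++;
	u.notifiers_seq++;
	u.notifiers_count--;

	if (mmu_notifier_retry(&u, seq))
		printf("retry: pages may have been invalidated\n");
	else
		printf("safe to install the mapping\n");
	return 0;
}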
