Skip to content

Commit

Permalink
IPoIB: Use a private hash table for path lookup in xmit path
Browse files Browse the repository at this point in the history
Dave Miller <davem@davemloft.net> provided a detailed description of
why the way IPoIB is using neighbours for its own ipoib_neigh struct
is buggy:

    Any time an ipoib_neigh is changed, a sequence like the following is made:

    			spin_lock_irqsave(&priv->lock, flags);
    			/*
    			 * It's safe to call ipoib_put_ah() inside
    			 * priv->lock here, because we know that
    			 * path->ah will always hold one more reference,
    			 * so ipoib_put_ah() will never do more than
    			 * decrement the ref count.
    			 */
    			if (neigh->ah)
    				ipoib_put_ah(neigh->ah);
    			list_del(&neigh->list);
    			ipoib_neigh_free(dev, neigh);
    			spin_unlock_irqrestore(&priv->lock, flags);
    			ipoib_path_lookup(skb, n, dev);

    This doesn't work, because you're leaving a stale pointer to the freed up
    ipoib_neigh in the special neigh->ha pointer cookie.  Yes, it even fails
    with all the locking done to protect _changes_ to *ipoib_neigh(n), and
    with the code in ipoib_neigh_free() that NULLs out the pointer.

    The core issue is that read side calls to *to_ipoib_neigh(n) are not
    being synchronized at all, they are performed without any locking.  So
    whether we hold the lock or not when making changes to *ipoib_neigh(n)
    you still can have threads see references to freed up ipoib_neigh
    objects.

    	cpu 1			cpu 2
    	n = *ipoib_neigh()
    				*ipoib_neigh() = NULL
    				kfree(n)
    	n->foo == OOPS

    [..]

    Perhaps the ipoib code can have a private path database it manages
    entirely itself, which holds all the necessary information and is
    looked up by some generic key which is available easily at transmit
    time and does not involve generic neighbour entries.

See <http://marc.info/?l=linux-rdma&m=132812793105624&w=2> and
<http://marc.info/?l=linux-rdma&w=2&r=1&s=allows+references+to+freed+memory&q=b>
for the full discussion.

This patch aims to solve the race conditions found in the IPoIB driver.

The patch removes the connection between the core networking neighbour
structure and the ipoib_neigh structure.  In addition to avoiding the
race described above, it allows us to handle SKBs carrying IP packets
that don't have any associated neighbour.

We add an ipoib_neigh hash table with N buckets where the key is the
destination hardware address.  The ipoib_neigh is fetched from the
hash table and instead of the stashed location in the neighbour
structure. The hash table uses both RCU and reference counting to
guarantee that no ipoib_neigh instance is ever deleted while in use.

Fetching the ipoib_neigh structure instance from the hash also makes
the special code in ipoib_start_xmit that handles remote and local
bonding failover redundant.

Aged ipoib_neigh instances are deleted by a garbage collection task
that runs every M seconds and deletes every ipoib_neigh instance that
was idle for at least 2*M seconds. The deletion is safe since the
ipoib_neigh instances are protected using RCU and reference count
mechanisms.

The number of buckets (N) and frequency of running the GC thread (M),
are taken from the exported arb_tbl.

Signed-off-by: Shlomo Pongratz <shlomop@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
  • Loading branch information
Shlomo Pongratz authored and Roland Dreier committed Jul 30, 2012
1 parent 5dedb9f commit b63b70d
Show file tree
Hide file tree
Showing 4 changed files with 539 additions and 236 deletions.
56 changes: 36 additions & 20 deletions drivers/infiniband/ulp/ipoib/ipoib.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ enum {
IPOIB_STOP_REAPER = 7,
IPOIB_FLAG_ADMIN_CM = 9,
IPOIB_FLAG_UMCAST = 10,
IPOIB_STOP_NEIGH_GC = 11,
IPOIB_NEIGH_TBL_FLUSH = 12,

IPOIB_MAX_BACKOFF_SECONDS = 16,

Expand Down Expand Up @@ -260,6 +262,20 @@ struct ipoib_ethtool_st {
u16 max_coalesced_frames;
};

struct ipoib_neigh_hash {
struct ipoib_neigh __rcu **buckets;
struct rcu_head rcu;
u32 mask;
u32 size;
};

struct ipoib_neigh_table {
struct ipoib_neigh_hash __rcu *htbl;
rwlock_t rwlock;
atomic_t entries;
struct completion flushed;
};

/*
* Device private locking: network stack tx_lock protects members used
* in TX fast path, lock protects everything else. lock nests inside
Expand All @@ -279,6 +295,8 @@ struct ipoib_dev_priv {
struct rb_root path_tree;
struct list_head path_list;

struct ipoib_neigh_table ntbl;

struct ipoib_mcast *broadcast;
struct list_head multicast_list;
struct rb_root multicast_tree;
Expand All @@ -291,7 +309,7 @@ struct ipoib_dev_priv {
struct work_struct flush_heavy;
struct work_struct restart_task;
struct delayed_work ah_reap_task;

struct delayed_work neigh_reap_task;
struct ib_device *ca;
u8 port;
u16 pkey;
Expand Down Expand Up @@ -377,13 +395,16 @@ struct ipoib_neigh {
#ifdef CONFIG_INFINIBAND_IPOIB_CM
struct ipoib_cm_tx *cm;
#endif
union ib_gid dgid;
u8 daddr[INFINIBAND_ALEN];
struct sk_buff_head queue;

struct neighbour *neighbour;
struct net_device *dev;

struct list_head list;
struct ipoib_neigh __rcu *hnext;
struct rcu_head rcu;
atomic_t refcnt;
unsigned long alive;
};

#define IPOIB_UD_MTU(ib_mtu) (ib_mtu - IPOIB_ENCAP_LEN)
Expand All @@ -394,21 +415,17 @@ static inline int ipoib_ud_need_sg(unsigned int ib_mtu)
return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE;
}

/*
* We stash a pointer to our private neighbour information after our
* hardware address in neigh->ha. The ALIGN() expression here makes
* sure that this pointer is stored aligned so that an unaligned
* load is not needed to dereference it.
*/
static inline struct ipoib_neigh **to_ipoib_neigh(struct neighbour *neigh)
void ipoib_neigh_dtor(struct ipoib_neigh *neigh);
static inline void ipoib_neigh_put(struct ipoib_neigh *neigh)
{
return (void*) neigh + ALIGN(offsetof(struct neighbour, ha) +
INFINIBAND_ALEN, sizeof(void *));
if (atomic_dec_and_test(&neigh->refcnt))
ipoib_neigh_dtor(neigh);
}

struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neigh,
struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr);
struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
struct net_device *dev);
void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh);
void ipoib_neigh_free(struct ipoib_neigh *neigh);
void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid);

extern struct workqueue_struct *ipoib_workqueue;

Expand All @@ -425,7 +442,6 @@ static inline void ipoib_put_ah(struct ipoib_ah *ah)
{
kref_put(&ah->ref, ipoib_free_ah);
}

int ipoib_open(struct net_device *dev);
int ipoib_add_pkey_attr(struct net_device *dev);
int ipoib_add_umcast_attr(struct net_device *dev);
Expand Down Expand Up @@ -455,7 +471,7 @@ void ipoib_dev_cleanup(struct net_device *dev);

void ipoib_mcast_join_task(struct work_struct *work);
void ipoib_mcast_carrier_on_task(struct work_struct *work);
void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb);
void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb);

void ipoib_mcast_restart_task(struct work_struct *work);
int ipoib_mcast_start_thread(struct net_device *dev);
Expand Down Expand Up @@ -517,10 +533,10 @@ static inline int ipoib_cm_admin_enabled(struct net_device *dev)
test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
}

static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n)
static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
return IPOIB_CM_SUPPORTED(n->ha) &&
return IPOIB_CM_SUPPORTED(hwaddr) &&
test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
}

Expand Down Expand Up @@ -575,7 +591,7 @@ static inline int ipoib_cm_admin_enabled(struct net_device *dev)
{
return 0;
}
static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n)
static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr)

{
return 0;
Expand Down
16 changes: 5 additions & 11 deletions drivers/infiniband/ulp/ipoib/ipoib_cm.c
Original file line number Diff line number Diff line change
Expand Up @@ -811,9 +811,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
if (neigh) {
neigh->cm = NULL;
list_del(&neigh->list);
if (neigh->ah)
ipoib_put_ah(neigh->ah);
ipoib_neigh_free(dev, neigh);
ipoib_neigh_free(neigh);

tx->neigh = NULL;
}
Expand Down Expand Up @@ -1230,9 +1228,7 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
if (neigh) {
neigh->cm = NULL;
list_del(&neigh->list);
if (neigh->ah)
ipoib_put_ah(neigh->ah);
ipoib_neigh_free(dev, neigh);
ipoib_neigh_free(neigh);

tx->neigh = NULL;
}
Expand Down Expand Up @@ -1279,7 +1275,7 @@ void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
list_move(&tx->list, &priv->cm.reap_list);
queue_work(ipoib_workqueue, &priv->cm.reap_task);
ipoib_dbg(priv, "Reap connection for gid %pI6\n",
tx->neigh->dgid.raw);
tx->neigh->daddr + 4);
tx->neigh = NULL;
}
}
Expand All @@ -1304,7 +1300,7 @@ static void ipoib_cm_tx_start(struct work_struct *work)
p = list_entry(priv->cm.start_list.next, typeof(*p), list);
list_del_init(&p->list);
neigh = p->neigh;
qpn = IPOIB_QPN(neigh->neighbour->ha);
qpn = IPOIB_QPN(neigh->daddr);
memcpy(&pathrec, &p->path->pathrec, sizeof pathrec);

spin_unlock_irqrestore(&priv->lock, flags);
Expand All @@ -1320,9 +1316,7 @@ static void ipoib_cm_tx_start(struct work_struct *work)
if (neigh) {
neigh->cm = NULL;
list_del(&neigh->list);
if (neigh->ah)
ipoib_put_ah(neigh->ah);
ipoib_neigh_free(dev, neigh);
ipoib_neigh_free(neigh);
}
list_del(&p->list);
kfree(p);
Expand Down
Loading

0 comments on commit b63b70d

Please sign in to comment.