Merge branch 'mlx4-next'
Or Gerlitz says:

====================
Add HA and LAG support to mlx4 RoCE and SRIOV services

This series takes advantage of bonding mlx4 Ethernet devices to support
a model of High Availability and Link Aggregation in more environments.

The mlx4 driver reacts to netdev events that bonding generates on
slave state changes by programming a HW V2P (Virt-to-Phys)
port table. Bonding was extended to expose these state changes
through netdev events.

When an mlx4 interface such as the mlx4 IB/RoCE driver is subject to
this policy, QPs are created over virtual ports that are mapped
to one of the two physical ports. When a failure happens,
re-programming the V2P table allows traffic to keep flowing.

The mlx4 Ethernet driver interfaces are not subject to this
policy and operate as usual.

A second use case for this model is adding HA and Link Aggregation
support to single-ported mlx4 Ethernet VFs. In this case, the PF Ethernet
interfaces are bonded, all the VFs see single-port devices (which is
already supported today), and VF QPs are subject to V2P.
====================
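
As a rough sketch of the V2P model the cover letter describes (illustrative only, not code from the series): mlx4_core gains a port-map interface, struct mlx4_port_map and mlx4_port_map_set(), and bonding now reports slave state changes via NETDEV_BONDING_INFO netdev events. A handler reacting to such an event could collapse both virtual ports onto the surviving physical port along the following lines; the demo_remap_v2p() helper, its name, and its trigger are assumptions made for illustration.

    #include <linux/mlx4/driver.h>

    /* Illustrative sketch: re-program the Virt-to-Phys port table on a
     * bond slave state change. With both slaves up, virtual ports map
     * 1:1 onto physical ports; after a failure, both virtual ports
     * point at the surviving physical port.
     */
    static void demo_remap_v2p(struct mlx4_dev *dev, bool p1_up, bool p2_up)
    {
            struct mlx4_port_map v2p;       /* virt port -> phys port */

            if (p1_up == p2_up) {           /* both up (or both down): 1:1 map */
                    v2p.port1 = 1;
                    v2p.port2 = 2;
            } else if (p1_up) {             /* phys port 2 failed: steer all to 1 */
                    v2p.port1 = 1;
                    v2p.port2 = 1;
            } else {                        /* phys port 1 failed: steer all to 2 */
                    v2p.port1 = 2;
                    v2p.port2 = 2;
            }

            if (mlx4_is_bonded(dev))        /* the map only applies when bonded */
                    mlx4_port_map_set(dev, &v2p);
    }

Because QPs are created over the virtual ports, nothing above the V2P table needs to be torn down on failover; re-programming the mapping alone redirects traffic, which is the property the HA/LAG model relies on.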

Signed-off-by: David S. Miller <davem@davemloft.net>
David S. Miller committed Feb 5, 2015
2 parents 251c005 + c621574 commit ce388ff
Showing 24 changed files with 756 additions and 64 deletions.
1 change: 1 addition & 0 deletions drivers/infiniband/hw/mlx4/ah.c
@@ -36,6 +36,7 @@
#include <linux/slab.h>
#include <linux/inet.h>
#include <linux/string.h>
+#include <linux/mlx4/driver.h>

#include "mlx4_ib.h"

161 changes: 143 additions & 18 deletions drivers/infiniband/hw/mlx4/main.c
@@ -351,6 +351,7 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port,
enum ib_mtu tmp;
struct mlx4_cmd_mailbox *mailbox;
int err = 0;
+int is_bonded = mlx4_is_bonded(mdev->dev);

mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
if (IS_ERR(mailbox))
@@ -374,8 +375,12 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port,
props->state = IB_PORT_DOWN;
props->phys_state = state_to_phys_state(props->state);
props->active_mtu = IB_MTU_256;
+if (is_bonded)
+rtnl_lock(); /* required to get upper dev */
spin_lock_bh(&iboe->lock);
ndev = iboe->netdevs[port - 1];
+if (ndev && is_bonded)
+ndev = netdev_master_upper_dev_get(ndev);
if (!ndev)
goto out_unlock;

@@ -387,6 +392,8 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port,
props->phys_state = state_to_phys_state(props->state);
out_unlock:
spin_unlock_bh(&iboe->lock);
+if (is_bonded)
+rtnl_unlock();
out:
mlx4_free_cmd_mailbox(mdev->dev, mailbox);
return err;
@@ -844,7 +851,7 @@ int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,

struct mlx4_ib_steering {
struct list_head list;
-u64 reg_id;
+struct mlx4_flow_reg_id reg_id;
union ib_gid gid;
};

@@ -1135,9 +1142,11 @@ static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp,
struct ib_flow_attr *flow_attr,
int domain)
{
-int err = 0, i = 0;
+int err = 0, i = 0, j = 0;
struct mlx4_ib_flow *mflow;
enum mlx4_net_trans_promisc_mode type[2];
+struct mlx4_dev *dev = (to_mdev(qp->device))->dev;
+int is_bonded = mlx4_is_bonded(dev);

memset(type, 0, sizeof(type));

@@ -1172,26 +1181,55 @@

while (i < ARRAY_SIZE(type) && type[i]) {
err = __mlx4_ib_create_flow(qp, flow_attr, domain, type[i],
-&mflow->reg_id[i]);
+&mflow->reg_id[i].id);
if (err)
goto err_create_flow;
i++;
+if (is_bonded) {
+flow_attr->port = 2;
+err = __mlx4_ib_create_flow(qp, flow_attr,
+domain, type[j],
+&mflow->reg_id[j].mirror);
+flow_attr->port = 1;
+if (err)
+goto err_create_flow;
+j++;
+}
+
}

if (i < ARRAY_SIZE(type) && flow_attr->type == IB_FLOW_ATTR_NORMAL) {
-err = mlx4_ib_tunnel_steer_add(qp, flow_attr, &mflow->reg_id[i]);
+err = mlx4_ib_tunnel_steer_add(qp, flow_attr,
+&mflow->reg_id[i].id);
if (err)
goto err_create_flow;
i++;
+if (is_bonded) {
+flow_attr->port = 2;
+err = mlx4_ib_tunnel_steer_add(qp, flow_attr,
+&mflow->reg_id[j].mirror);
+flow_attr->port = 1;
+if (err)
+goto err_create_flow;
+j++;
+}
+/* function to create mirror rule */
}

return &mflow->ibflow;

err_create_flow:
while (i) {
-(void)__mlx4_ib_destroy_flow(to_mdev(qp->device)->dev, mflow->reg_id[i]);
+(void)__mlx4_ib_destroy_flow(to_mdev(qp->device)->dev,
+mflow->reg_id[i].id);
i--;
}
+
+while (j) {
+(void)__mlx4_ib_destroy_flow(to_mdev(qp->device)->dev,
+mflow->reg_id[j].mirror);
+j--;
+}
err_free:
kfree(mflow);
return ERR_PTR(err);
@@ -1204,10 +1242,16 @@ static int mlx4_ib_destroy_flow(struct ib_flow *flow_id)
struct mlx4_ib_dev *mdev = to_mdev(flow_id->qp->device);
struct mlx4_ib_flow *mflow = to_mflow(flow_id);

-while (i < ARRAY_SIZE(mflow->reg_id) && mflow->reg_id[i]) {
-err = __mlx4_ib_destroy_flow(mdev->dev, mflow->reg_id[i]);
+while (i < ARRAY_SIZE(mflow->reg_id) && mflow->reg_id[i].id) {
+err = __mlx4_ib_destroy_flow(mdev->dev, mflow->reg_id[i].id);
if (err)
ret = err;
+if (mflow->reg_id[i].mirror) {
+err = __mlx4_ib_destroy_flow(mdev->dev,
+mflow->reg_id[i].mirror);
+if (err)
+ret = err;
+}
i++;
}

@@ -1219,11 +1263,12 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
{
int err;
struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+struct mlx4_dev *dev = mdev->dev;
struct mlx4_ib_qp *mqp = to_mqp(ibqp);
-u64 reg_id;
struct mlx4_ib_steering *ib_steering = NULL;
enum mlx4_protocol prot = (gid->raw[1] == 0x0e) ?
MLX4_PROT_IB_IPV4 : MLX4_PROT_IB_IPV6;
+struct mlx4_flow_reg_id reg_id;

if (mdev->dev->caps.steering_mode ==
MLX4_STEERING_MODE_DEVICE_MANAGED) {
@@ -1235,10 +1280,20 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, mqp->port,
!!(mqp->flags &
MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK),
-prot, &reg_id);
+prot, &reg_id.id);
if (err)
goto err_malloc;

+reg_id.mirror = 0;
+if (mlx4_is_bonded(dev)) {
+err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, 2,
+!!(mqp->flags &
+MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK),
+prot, &reg_id.mirror);
+if (err)
+goto err_add;
+}
+
err = add_gid_entry(ibqp, gid);
if (err)
goto err_add;
@@ -1254,7 +1309,10 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)

err_add:
mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
-prot, reg_id);
+prot, reg_id.id);
+if (reg_id.mirror)
+mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
+prot, reg_id.mirror);
err_malloc:
kfree(ib_steering);

@@ -1281,10 +1339,12 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
{
int err;
struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+struct mlx4_dev *dev = mdev->dev;
struct mlx4_ib_qp *mqp = to_mqp(ibqp);
struct net_device *ndev;
struct mlx4_ib_gid_entry *ge;
-u64 reg_id = 0;
+struct mlx4_flow_reg_id reg_id = {0, 0};
+
enum mlx4_protocol prot = (gid->raw[1] == 0x0e) ?
MLX4_PROT_IB_IPV4 : MLX4_PROT_IB_IPV6;

@@ -1309,10 +1369,17 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
}

err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
-prot, reg_id);
+prot, reg_id.id);
if (err)
return err;

+if (mlx4_is_bonded(dev)) {
+err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
+prot, reg_id.mirror);
+if (err)
+return err;
+}
+
mutex_lock(&mqp->mutex);
ge = find_gid_entry(mqp, gid->raw);
if (ge) {
@@ -1440,6 +1507,7 @@ static void update_gids_task(struct work_struct *work)
union ib_gid *gids;
int err;
struct mlx4_dev *dev = gw->dev->dev;
+int is_bonded = mlx4_is_bonded(dev);

if (!gw->dev->ib_active)
return;
@@ -1459,7 +1527,10 @@ static void update_gids_task(struct work_struct *work)
if (err)
pr_warn("set port command failed\n");
else
-mlx4_ib_dispatch_event(gw->dev, gw->port, IB_EVENT_GID_CHANGE);
+if ((gw->port == 1) || !is_bonded)
+mlx4_ib_dispatch_event(gw->dev,
+is_bonded ? 1 : gw->port,
+IB_EVENT_GID_CHANGE);

mlx4_free_cmd_mailbox(dev, mailbox);
kfree(gw);
@@ -1875,7 +1946,8 @@ static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev,
* don't want the bond IP based gids in the table since
* flows that select port by gid may get the down port.
*/
-if (port_state == IB_PORT_DOWN) {
+if (port_state == IB_PORT_DOWN &&
+!mlx4_is_bonded(ibdev->dev)) {
reset_gid_table(ibdev, port);
mlx4_ib_set_default_gid(ibdev,
curr_netdev,
@@ -2047,6 +2119,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
int err;
struct mlx4_ib_iboe *iboe;
int ib_num_ports = 0;
+int num_req_counters;

pr_info_once("%s", mlx4_ib_version);

@@ -2080,13 +2153,15 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock);

ibdev->dev = dev;
+ibdev->bond_next_port = 0;

strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX);
ibdev->ib_dev.owner = THIS_MODULE;
ibdev->ib_dev.node_type = RDMA_NODE_IB_CA;
ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey;
ibdev->num_ports = num_ports;
-ibdev->ib_dev.phys_port_cnt = ibdev->num_ports;
+ibdev->ib_dev.phys_port_cnt = mlx4_is_bonded(dev) ?
+1 : ibdev->num_ports;
ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors;
ibdev->ib_dev.dma_device = &dev->persist->pdev->dev;

@@ -2207,7 +2282,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
if (init_node_data(ibdev))
goto err_map;

-for (i = 0; i < ibdev->num_ports; ++i) {
+num_req_counters = mlx4_is_bonded(dev) ? 1 : ibdev->num_ports;
+for (i = 0; i < num_req_counters; ++i) {
mutex_init(&ibdev->qp1_proxy_lock[i]);
if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) ==
IB_LINK_LAYER_ETHERNET) {
@@ -2218,6 +2294,10 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
ibdev->counters[i] = -1;
}
}
+if (mlx4_is_bonded(dev))
+for (i = 1; i < ibdev->num_ports ; ++i)
+ibdev->counters[i] = ibdev->counters[0];
+

mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
ib_num_ports++;
@@ -2538,6 +2618,38 @@ static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init)
return;
}

+static void handle_bonded_port_state_event(struct work_struct *work)
+{
+struct ib_event_work *ew =
+container_of(work, struct ib_event_work, work);
+struct mlx4_ib_dev *ibdev = ew->ib_dev;
+enum ib_port_state bonded_port_state = IB_PORT_NOP;
+int i;
+struct ib_event ibev;
+
+kfree(ew);
+spin_lock_bh(&ibdev->iboe.lock);
+for (i = 0; i < MLX4_MAX_PORTS; ++i) {
+struct net_device *curr_netdev = ibdev->iboe.netdevs[i];
+
+enum ib_port_state curr_port_state =
+(netif_running(curr_netdev) &&
+netif_carrier_ok(curr_netdev)) ?
+IB_PORT_ACTIVE : IB_PORT_DOWN;
+
+bonded_port_state = (bonded_port_state != IB_PORT_ACTIVE) ?
+curr_port_state : IB_PORT_ACTIVE;
+}
+spin_unlock_bh(&ibdev->iboe.lock);
+
+ibev.device = &ibdev->ib_dev;
+ibev.element.port_num = 1;
+ibev.event = (bonded_port_state == IB_PORT_ACTIVE) ?
+IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
+
+ib_dispatch_event(&ibev);
+}
+
static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr,
enum mlx4_dev_event event, unsigned long param)
{
@@ -2547,6 +2659,18 @@ static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr,
struct ib_event_work *ew;
int p = 0;

+if (mlx4_is_bonded(dev) &&
+((event == MLX4_DEV_EVENT_PORT_UP) ||
+(event == MLX4_DEV_EVENT_PORT_DOWN))) {
+ew = kmalloc(sizeof(*ew), GFP_ATOMIC);
+if (!ew)
+return;
+INIT_WORK(&ew->work, handle_bonded_port_state_event);
+ew->ib_dev = ibdev;
+queue_work(wq, &ew->work);
+return;
+}
+
if (event == MLX4_DEV_EVENT_PORT_MGMT_CHANGE)
eqe = (struct mlx4_eqe *)param;
else
@@ -2607,7 +2731,7 @@ static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr,
}

ibev.device = ibdev_ptr;
-ibev.element.port_num = (u8) p;
+ibev.element.port_num = mlx4_is_bonded(ibdev->dev) ? 1 : (u8)p;

ib_dispatch_event(&ibev);
}
@@ -2616,7 +2740,8 @@ static struct mlx4_interface mlx4_ib_interface = {
.add = mlx4_ib_add,
.remove = mlx4_ib_remove,
.event = mlx4_ib_event,
-.protocol = MLX4_PROT_IB_IPV6
+.protocol = MLX4_PROT_IB_IPV6,
+.flags = MLX4_INTFF_BONDING
};

static int __init mlx4_ib_init(void)