From 503eebc265dcf5c512454fd5a6b6673ea4f1d7f2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 5 Jul 2016 11:27:37 +0200 Subject: [PATCH 01/16] net: add dev arg to ndo_neigh_construct/destroy As the following patch will allow upper devices to follow the call down lower devices, we need to add dev here and not rely on n->dev. Signed-off-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/rocker/rocker_main.c | 3 ++- include/linux/netdevice.h | 6 ++++-- net/atm/clip.c | 2 +- net/core/neighbour.c | 4 ++-- net/ieee802154/6lowpan/core.c | 2 +- 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index 28b775e5a9ad4..f0b09b05ed3f1 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ -1996,7 +1996,8 @@ static int rocker_port_change_proto_down(struct net_device *dev, return 0; } -static void rocker_port_neigh_destroy(struct neighbour *n) +static void rocker_port_neigh_destroy(struct net_device *dev, + struct neighbour *n) { struct rocker_port *rocker_port = netdev_priv(n->dev); int err; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0c6ee2c5099f2..91af73c9dd510 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1209,8 +1209,10 @@ struct net_device_ops { netdev_features_t features); int (*ndo_set_features)(struct net_device *dev, netdev_features_t features); - int (*ndo_neigh_construct)(struct neighbour *n); - void (*ndo_neigh_destroy)(struct neighbour *n); + int (*ndo_neigh_construct)(struct net_device *dev, + struct neighbour *n); + void (*ndo_neigh_destroy)(struct net_device *dev, + struct neighbour *n); int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[], diff --git a/net/atm/clip.c b/net/atm/clip.c index e07f551a863c8..53b4ac09e7b7d 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -286,7 +286,7 @@ static const struct neigh_ops clip_neigh_ops = { .connected_output = neigh_direct_output, }; -static int clip_constructor(struct neighbour *neigh) +static int clip_constructor(struct net_device *dev, struct neighbour *neigh) { struct atmarp_entry *entry = neighbour_priv(neigh); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 510cd62fcb996..952aabb5aa565 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -473,7 +473,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, } if (dev->netdev_ops->ndo_neigh_construct) { - error = dev->netdev_ops->ndo_neigh_construct(n); + error = dev->netdev_ops->ndo_neigh_construct(dev, n); if (error < 0) { rc = ERR_PTR(error); goto out_neigh_release; @@ -701,7 +701,7 @@ void neigh_destroy(struct neighbour *neigh) neigh->arp_queue_len_bytes = 0; if (dev->netdev_ops->ndo_neigh_destroy) - dev->netdev_ops->ndo_neigh_destroy(neigh); + dev->netdev_ops->ndo_neigh_destroy(dev, neigh); dev_put(dev); neigh_parms_put(neigh->parms); diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 8c004a0c8d641..935ab932e8412 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -81,7 +81,7 @@ static int lowpan_stop(struct net_device *dev) return 0; } -static int lowpan_neigh_construct(struct neighbour *n) +static int lowpan_neigh_construct(struct net_device *dev, struct neighbour *n) { struct lowpan_802154_neigh *neigh = lowpan_802154_neigh(neighbour_priv(n)); From 18bfb924f0005a728caadd90ba755b2a660bf441 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 5 Jul 2016 11:27:38 +0200 Subject: [PATCH 02/16] net: introduce default neigh_construct/destroy ndo calls for L2 upper devices L2 upper device needs to propagate neigh_construct/destroy calls down to lower devices. Do this by defining default ndo functions and use them in team, bond, bridge and vlan. Signed-off-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 2 ++ drivers/net/team/team.c | 2 ++ include/linux/netdevice.h | 4 +++ net/8021q/vlan_dev.c | 2 ++ net/bridge/br_device.c | 2 ++ net/core/dev.c | 44 +++++++++++++++++++++++++++++++++ 6 files changed, 56 insertions(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 90157e20357e6..480d73ac7d1b3 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4137,6 +4137,8 @@ static const struct net_device_ops bond_netdev_ops = { .ndo_add_slave = bond_enslave, .ndo_del_slave = bond_release, .ndo_fix_features = bond_fix_features, + .ndo_neigh_construct = netdev_default_l2upper_neigh_construct, + .ndo_neigh_destroy = netdev_default_l2upper_neigh_destroy, .ndo_bridge_setlink = switchdev_port_bridge_setlink, .ndo_bridge_getlink = switchdev_port_bridge_getlink, .ndo_bridge_dellink = switchdev_port_bridge_dellink, diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index f9eebea83516b..a380649bf6b5c 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -2002,6 +2002,8 @@ static const struct net_device_ops team_netdev_ops = { .ndo_add_slave = team_add_slave, .ndo_del_slave = team_del_slave, .ndo_fix_features = team_fix_features, + .ndo_neigh_construct = netdev_default_l2upper_neigh_construct, + .ndo_neigh_destroy = netdev_default_l2upper_neigh_destroy, .ndo_change_carrier = team_change_carrier, .ndo_bridge_setlink = switchdev_port_bridge_setlink, .ndo_bridge_getlink = switchdev_port_bridge_getlink, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 91af73c9dd510..49736a31acaa3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3845,6 +3845,10 @@ void *netdev_lower_dev_get_private(struct net_device *dev, struct net_device *lower_dev); void netdev_lower_state_changed(struct net_device *lower_dev, void *lower_state_info); +int netdev_default_l2upper_neigh_construct(struct net_device *dev, + struct neighbour *n); +void netdev_default_l2upper_neigh_destroy(struct net_device *dev, + struct neighbour *n); /* RSS keys are 40 or 52 bytes long */ #define NETDEV_RSS_KEY_LEN 52 diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 86ae75b773909..c8f422c90856f 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -790,6 +790,8 @@ static const struct net_device_ops vlan_netdev_ops = { .ndo_netpoll_cleanup = vlan_dev_netpoll_cleanup, #endif .ndo_fix_features = vlan_dev_fix_features, + .ndo_neigh_construct = netdev_default_l2upper_neigh_construct, + .ndo_neigh_destroy = netdev_default_l2upper_neigh_destroy, .ndo_fdb_add = switchdev_port_fdb_add, .ndo_fdb_del = switchdev_port_fdb_del, .ndo_fdb_dump = switchdev_port_fdb_dump, diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 0c39e0f6da09d..8eecd0ec22f2c 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -349,6 +349,8 @@ static const struct net_device_ops br_netdev_ops = { .ndo_add_slave = br_add_slave, .ndo_del_slave = br_del_slave, .ndo_fix_features = br_fix_features, + .ndo_neigh_construct = netdev_default_l2upper_neigh_construct, + .ndo_neigh_destroy = netdev_default_l2upper_neigh_destroy, .ndo_fdb_add = br_fdb_add, .ndo_fdb_del = br_fdb_delete, .ndo_fdb_dump = br_fdb_dump, diff --git a/net/core/dev.c b/net/core/dev.c index a4f3b0a9aeaf6..b92d63bfde7ae 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6087,6 +6087,50 @@ void netdev_lower_state_changed(struct net_device *lower_dev, } EXPORT_SYMBOL(netdev_lower_state_changed); +int netdev_default_l2upper_neigh_construct(struct net_device *dev, + struct neighbour *n) +{ + struct net_device *lower_dev, *stop_dev; + struct list_head *iter; + int err; + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + if (!lower_dev->netdev_ops->ndo_neigh_construct) + continue; + err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n); + if (err) { + stop_dev = lower_dev; + goto rollback; + } + } + return 0; + +rollback: + netdev_for_each_lower_dev(dev, lower_dev, iter) { + if (lower_dev == stop_dev) + break; + if (!lower_dev->netdev_ops->ndo_neigh_destroy) + continue; + lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n); + } + return err; +} +EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct); + +void netdev_default_l2upper_neigh_destroy(struct net_device *dev, + struct neighbour *n) +{ + struct net_device *lower_dev; + struct list_head *iter; + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + if (!lower_dev->netdev_ops->ndo_neigh_destroy) + continue; + lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n); + } +} +EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy); + static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; From 6cf3c971dc84cb36579515ddb488919b9e9fb6de Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 5 Jul 2016 11:27:39 +0200 Subject: [PATCH 03/16] mlxsw: spectrum_router: Add private neigh table We need to hold some private data for every neigh entry. It would be possible to do it using neigh_priv_len/ndo_neigh_construct/ ndo_neigh_destroy however only for the port device itself. That would not work for stacked devices like bridge/team/bond. So introduce a private neigh table. Hook onto ndos neigh_construct/destroy and add/remove table entry according to that. Signed-off-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum.c | 2 + .../net/ethernet/mellanox/mlxsw/spectrum.h | 6 + .../ethernet/mellanox/mlxsw/spectrum_router.c | 146 +++++++++++++++++- 3 files changed, 153 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 7b2b741b2a233..9bebb7af4e542 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -803,6 +803,8 @@ static const struct net_device_ops mlxsw_sp_port_netdev_ops = { .ndo_get_stats64 = mlxsw_sp_port_get_stats64, .ndo_vlan_rx_add_vid = mlxsw_sp_port_add_vid, .ndo_vlan_rx_kill_vid = mlxsw_sp_port_kill_vid, + .ndo_neigh_construct = mlxsw_sp_router_neigh_construct, + .ndo_neigh_destroy = mlxsw_sp_router_neigh_destroy, .ndo_fdb_add = switchdev_port_fdb_add, .ndo_fdb_del = switchdev_port_fdb_del, .ndo_fdb_dump = switchdev_port_fdb_dump, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 958e821ce845d..734c5baffaf19 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -39,6 +39,7 @@ #include #include +#include #include #include #include @@ -212,6 +213,7 @@ struct mlxsw_sp_vr { struct mlxsw_sp_router { struct mlxsw_sp_lpm_tree lpm_trees[MLXSW_SP_LPM_TREE_COUNT]; struct mlxsw_sp_vr vrs[MLXSW_SP_VIRTUAL_ROUTER_MAX]; + struct rhashtable neigh_ht; }; struct mlxsw_sp { @@ -524,5 +526,9 @@ int mlxsw_sp_router_fib4_add(struct mlxsw_sp_port *mlxsw_sp_port, struct switchdev_trans *trans); int mlxsw_sp_router_fib4_del(struct mlxsw_sp_port *mlxsw_sp_port, const struct switchdev_obj_ipv4_fib *fib4); +int mlxsw_sp_router_neigh_construct(struct net_device *dev, + struct neighbour *n); +void mlxsw_sp_router_neigh_destroy(struct net_device *dev, + struct neighbour *n); #endif diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 7e3992a681b30..90d382a5d6a7e 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -38,6 +38,8 @@ #include #include #include +#include +#include #include "spectrum.h" #include "core.h" @@ -544,6 +546,147 @@ static void mlxsw_sp_vrs_init(struct mlxsw_sp *mlxsw_sp) } } +struct mlxsw_sp_neigh_key { + unsigned char addr[sizeof(struct in6_addr)]; + struct net_device *dev; +}; + +struct mlxsw_sp_neigh_entry { + struct rhash_head ht_node; + struct mlxsw_sp_neigh_key key; + u16 rif; + struct neighbour *n; +}; + +static const struct rhashtable_params mlxsw_sp_neigh_ht_params = { + .key_offset = offsetof(struct mlxsw_sp_neigh_entry, key), + .head_offset = offsetof(struct mlxsw_sp_neigh_entry, ht_node), + .key_len = sizeof(struct mlxsw_sp_neigh_key), +}; + +static int +mlxsw_sp_neigh_entry_insert(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_neigh_entry *neigh_entry) +{ + return rhashtable_insert_fast(&mlxsw_sp->router.neigh_ht, + &neigh_entry->ht_node, + mlxsw_sp_neigh_ht_params); +} + +static void +mlxsw_sp_neigh_entry_remove(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_neigh_entry *neigh_entry) +{ + rhashtable_remove_fast(&mlxsw_sp->router.neigh_ht, + &neigh_entry->ht_node, + mlxsw_sp_neigh_ht_params); +} + +static struct mlxsw_sp_neigh_entry * +mlxsw_sp_neigh_entry_create(const void *addr, size_t addr_len, + struct net_device *dev, u16 rif, + struct neighbour *n) +{ + struct mlxsw_sp_neigh_entry *neigh_entry; + + neigh_entry = kzalloc(sizeof(*neigh_entry), GFP_ATOMIC); + if (!neigh_entry) + return NULL; + memcpy(neigh_entry->key.addr, addr, addr_len); + neigh_entry->key.dev = dev; + neigh_entry->rif = rif; + neigh_entry->n = n; + return neigh_entry; +} + +static void +mlxsw_sp_neigh_entry_destroy(struct mlxsw_sp_neigh_entry *neigh_entry) +{ + kfree(neigh_entry); +} + +static struct mlxsw_sp_neigh_entry * +mlxsw_sp_neigh_entry_lookup(struct mlxsw_sp *mlxsw_sp, const void *addr, + size_t addr_len, struct net_device *dev) +{ + struct mlxsw_sp_neigh_key key = {{ 0 } }; + + memcpy(key.addr, addr, addr_len); + key.dev = dev; + return rhashtable_lookup_fast(&mlxsw_sp->router.neigh_ht, + &key, mlxsw_sp_neigh_ht_params); +} + +int mlxsw_sp_router_neigh_construct(struct net_device *dev, + struct neighbour *n) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_neigh_entry *neigh_entry; + struct mlxsw_sp_rif *r; + u32 dip; + int err; + + if (n->tbl != &arp_tbl) + return 0; + + dip = ntohl(*((__be32 *) n->primary_key)); + neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, &dip, sizeof(dip), + n->dev); + if (neigh_entry) { + WARN_ON(neigh_entry->n != n); + return 0; + } + + r = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev); + if (WARN_ON(!r)) + return -EINVAL; + + neigh_entry = mlxsw_sp_neigh_entry_create(&dip, sizeof(dip), n->dev, + r->rif, n); + if (!neigh_entry) + return -ENOMEM; + err = mlxsw_sp_neigh_entry_insert(mlxsw_sp, neigh_entry); + if (err) + goto err_neigh_entry_insert; + return 0; + +err_neigh_entry_insert: + mlxsw_sp_neigh_entry_destroy(neigh_entry); + return err; +} + +void mlxsw_sp_router_neigh_destroy(struct net_device *dev, + struct neighbour *n) +{ + struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_neigh_entry *neigh_entry; + u32 dip; + + if (n->tbl != &arp_tbl) + return; + + dip = ntohl(*((__be32 *) n->primary_key)); + neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, &dip, sizeof(dip), + n->dev); + if (!neigh_entry) + return; + mlxsw_sp_neigh_entry_remove(mlxsw_sp, neigh_entry); + mlxsw_sp_neigh_entry_destroy(neigh_entry); +} + +static int mlxsw_sp_neigh_init(struct mlxsw_sp *mlxsw_sp) +{ + return rhashtable_init(&mlxsw_sp->router.neigh_ht, + &mlxsw_sp_neigh_ht_params); +} + +static void mlxsw_sp_neigh_fini(struct mlxsw_sp *mlxsw_sp) +{ + rhashtable_destroy(&mlxsw_sp->router.neigh_ht); +} + static int __mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp) { char rgcr_pl[MLXSW_REG_RGCR_LEN]; @@ -570,11 +713,12 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp) return err; mlxsw_sp_lpm_init(mlxsw_sp); mlxsw_sp_vrs_init(mlxsw_sp); - return 0; + return mlxsw_sp_neigh_init(mlxsw_sp); } void mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp) { + mlxsw_sp_neigh_fini(mlxsw_sp); __mlxsw_sp_router_fini(mlxsw_sp); } From 4457b3df3fb933aae3f4e75eca669f1be5a4dd68 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 5 Jul 2016 11:27:40 +0200 Subject: [PATCH 04/16] mlxsw: reg: Add Router Algorithmic LPM Unicast Host Table register The RAUHT register is used to configure and query the Unicast Host Table in devices that implement the Algorithmic LPM. In other words, it is used to configure neighbour entries in the device. Signed-off-by: Yotam Gigi Signed-off-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/reg.h | 141 ++++++++++++++++++++++ 1 file changed, 141 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index 9280d96bb2914..cc6a0b34ad75c 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -4,6 +4,7 @@ * Copyright (c) 2015-2016 Ido Schimmel * Copyright (c) 2015 Elad Raz * Copyright (c) 2015-2016 Jiri Pirko + * Copyright (c) 2016 Yotam Gigi * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -3884,6 +3885,144 @@ mlxsw_reg_ralue_act_ip2me_pack(char *payload) MLXSW_REG_RALUE_ACTION_TYPE_IP2ME); } +/* RAUHT - Router Algorithmic LPM Unicast Host Table Register + * ---------------------------------------------------------- + * The RAUHT register is used to configure and query the Unicast Host table in + * devices that implement the Algorithmic LPM. + */ +#define MLXSW_REG_RAUHT_ID 0x8014 +#define MLXSW_REG_RAUHT_LEN 0x74 + +static const struct mlxsw_reg_info mlxsw_reg_rauht = { + .id = MLXSW_REG_RAUHT_ID, + .len = MLXSW_REG_RAUHT_LEN, +}; + +enum mlxsw_reg_rauht_type { + MLXSW_REG_RAUHT_TYPE_IPV4, + MLXSW_REG_RAUHT_TYPE_IPV6, +}; + +/* reg_rauht_type + * Access: Index + */ +MLXSW_ITEM32(reg, rauht, type, 0x00, 24, 2); + +enum mlxsw_reg_rauht_op { + MLXSW_REG_RAUHT_OP_QUERY_READ = 0, + /* Read operation */ + MLXSW_REG_RAUHT_OP_QUERY_CLEAR_ON_READ = 1, + /* Clear on read operation. Used to read entry and clear + * activity bit. + */ + MLXSW_REG_RAUHT_OP_WRITE_ADD = 0, + /* Add. Used to write a new entry to the table. All R/W fields are + * relevant for new entry. Activity bit is set for new entries. + */ + MLXSW_REG_RAUHT_OP_WRITE_UPDATE = 1, + /* Update action. Used to update an existing route entry and + * only update the following fields: + * trap_action, trap_id, mac, counter_set_type, counter_index + */ + MLXSW_REG_RAUHT_OP_WRITE_CLEAR_ACTIVITY = 2, + /* Clear activity. A bit is cleared for the entry. */ + MLXSW_REG_RAUHT_OP_WRITE_DELETE = 3, + /* Delete entry */ + MLXSW_REG_RAUHT_OP_WRITE_DELETE_ALL = 4, + /* Delete all host entries on a RIF. In this command, dip + * field is reserved. + */ +}; + +/* reg_rauht_op + * Access: OP + */ +MLXSW_ITEM32(reg, rauht, op, 0x00, 20, 3); + +/* reg_rauht_a + * Activity. Set for new entries. Set if a packet lookup has hit on + * the specific entry. + * To clear the a bit, use "clear activity" op. + * Enabled by activity_dis in RGCR + * Access: RO + */ +MLXSW_ITEM32(reg, rauht, a, 0x00, 16, 1); + +/* reg_rauht_rif + * Router Interface + * Access: Index + */ +MLXSW_ITEM32(reg, rauht, rif, 0x00, 0, 16); + +/* reg_rauht_dip* + * Destination address. + * Access: Index + */ +MLXSW_ITEM32(reg, rauht, dip4, 0x1C, 0x0, 32); + +enum mlxsw_reg_rauht_trap_action { + MLXSW_REG_RAUHT_TRAP_ACTION_NOP, + MLXSW_REG_RAUHT_TRAP_ACTION_TRAP, + MLXSW_REG_RAUHT_TRAP_ACTION_MIRROR_TO_CPU, + MLXSW_REG_RAUHT_TRAP_ACTION_MIRROR, + MLXSW_REG_RAUHT_TRAP_ACTION_DISCARD_ERRORS, +}; + +/* reg_rauht_trap_action + * Access: RW + */ +MLXSW_ITEM32(reg, rauht, trap_action, 0x60, 28, 4); + +enum mlxsw_reg_rauht_trap_id { + MLXSW_REG_RAUHT_TRAP_ID_RTR_EGRESS0, + MLXSW_REG_RAUHT_TRAP_ID_RTR_EGRESS1, +}; + +/* reg_rauht_trap_id + * Trap ID to be reported to CPU. + * Trap-ID is RTR_EGRESS0 or RTR_EGRESS1. + * For trap_action of NOP, MIRROR and DISCARD_ERROR, + * trap_id is reserved. + * Access: RW + */ +MLXSW_ITEM32(reg, rauht, trap_id, 0x60, 0, 9); + +/* reg_rauht_counter_set_type + * Counter set type for flow counters + * Access: RW + */ +MLXSW_ITEM32(reg, rauht, counter_set_type, 0x68, 24, 8); + +/* reg_rauht_counter_index + * Counter index for flow counters + * Access: RW + */ +MLXSW_ITEM32(reg, rauht, counter_index, 0x68, 0, 24); + +/* reg_rauht_mac + * MAC address. + * Access: RW + */ +MLXSW_ITEM_BUF(reg, rauht, mac, 0x6E, 6); + +static inline void mlxsw_reg_rauht_pack(char *payload, + enum mlxsw_reg_rauht_op op, u16 rif, + const char *mac) +{ + MLXSW_REG_ZERO(rauht, payload); + mlxsw_reg_rauht_op_set(payload, op); + mlxsw_reg_rauht_rif_set(payload, rif); + mlxsw_reg_rauht_mac_memcpy_to(payload, mac); +} + +static inline void mlxsw_reg_rauht_pack4(char *payload, + enum mlxsw_reg_rauht_op op, u16 rif, + const char *mac, u32 dip) +{ + mlxsw_reg_rauht_pack(payload, op, rif, mac); + mlxsw_reg_rauht_dip4_set(payload, dip); +} + /* MFCR - Management Fan Control Register * -------------------------------------- * This register controls the settings of the Fan Speed PWM mechanism. @@ -4634,6 +4773,8 @@ static inline const char *mlxsw_reg_id_str(u16 reg_id) return "RALTB"; case MLXSW_REG_RALUE_ID: return "RALUE"; + case MLXSW_REG_RAUHT_ID: + return "RAUHT"; case MLXSW_REG_MFCR_ID: return "MFCR"; case MLXSW_REG_MFSC_ID: From 7cf2c205d7035a7ae525e3cfb5b7141fb7152372 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 5 Jul 2016 11:27:41 +0200 Subject: [PATCH 05/16] mlxsw: reg: Add Router Algorithmic LPM Unicast Host Table Dump register The RAUHTD register allows dumping entries from the Router Unicast Host Table. Signed-off-by: Yotam Gigi Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/reg.h | 147 ++++++++++++++++++++++ 1 file changed, 147 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index cc6a0b34ad75c..fcf379b5984c3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -4023,6 +4023,151 @@ static inline void mlxsw_reg_rauht_pack4(char *payload, mlxsw_reg_rauht_dip4_set(payload, dip); } +/* RAUHTD - Router Algorithmic LPM Unicast Host Table Dump Register + * ---------------------------------------------------------------- + * The RAUHTD register allows dumping entries from the Router Unicast Host + * Table. For a given session an entry is dumped no more than one time. The + * first RAUHTD access after reset is a new session. A session ends when the + * num_rec response is smaller than num_rec request or for IPv4 when the + * num_entries is smaller than 4. The clear activity affect the current session + * or the last session if a new session has not started. + */ +#define MLXSW_REG_RAUHTD_ID 0x8018 +#define MLXSW_REG_RAUHTD_BASE_LEN 0x20 +#define MLXSW_REG_RAUHTD_REC_LEN 0x20 +#define MLXSW_REG_RAUHTD_REC_MAX_NUM 32 +#define MLXSW_REG_RAUHTD_LEN (MLXSW_REG_RAUHTD_BASE_LEN + \ + MLXSW_REG_RAUHTD_REC_MAX_NUM * MLXSW_REG_RAUHTD_REC_LEN) +#define MLXSW_REG_RAUHTD_IPV4_ENT_PER_REC 4 + +static const struct mlxsw_reg_info mlxsw_reg_rauhtd = { + .id = MLXSW_REG_RAUHTD_ID, + .len = MLXSW_REG_RAUHTD_LEN, +}; + +#define MLXSW_REG_RAUHTD_FILTER_A BIT(0) +#define MLXSW_REG_RAUHTD_FILTER_RIF BIT(3) + +/* reg_rauhtd_filter_fields + * if a bit is '0' then the relevant field is ignored and dump is done + * regardless of the field value + * Bit0 - filter by activity: entry_a + * Bit3 - filter by entry rip: entry_rif + * Access: Index + */ +MLXSW_ITEM32(reg, rauhtd, filter_fields, 0x00, 0, 8); + +enum mlxsw_reg_rauhtd_op { + MLXSW_REG_RAUHTD_OP_DUMP, + MLXSW_REG_RAUHTD_OP_DUMP_AND_CLEAR, +}; + +/* reg_rauhtd_op + * Access: OP + */ +MLXSW_ITEM32(reg, rauhtd, op, 0x04, 24, 2); + +/* reg_rauhtd_num_rec + * At request: number of records requested + * At response: number of records dumped + * For IPv4, each record has 4 entries at request and up to 4 entries + * at response + * Range is 0..MLXSW_REG_RAUHTD_REC_MAX_NUM + * Access: Index + */ +MLXSW_ITEM32(reg, rauhtd, num_rec, 0x04, 0, 8); + +/* reg_rauhtd_entry_a + * Dump only if activity has value of entry_a + * Reserved if filter_fields bit0 is '0' + * Access: Index + */ +MLXSW_ITEM32(reg, rauhtd, entry_a, 0x08, 16, 1); + +enum mlxsw_reg_rauhtd_type { + MLXSW_REG_RAUHTD_TYPE_IPV4, + MLXSW_REG_RAUHTD_TYPE_IPV6, +}; + +/* reg_rauhtd_type + * Dump only if record type is: + * 0 - IPv4 + * 1 - IPv6 + * Access: Index + */ +MLXSW_ITEM32(reg, rauhtd, type, 0x08, 0, 4); + +/* reg_rauhtd_entry_rif + * Dump only if RIF has value of entry_rif + * Reserved if filter_fields bit3 is '0' + * Access: Index + */ +MLXSW_ITEM32(reg, rauhtd, entry_rif, 0x0C, 0, 16); + +static inline void mlxsw_reg_rauhtd_pack(char *payload, + enum mlxsw_reg_rauhtd_type type) +{ + MLXSW_REG_ZERO(rauhtd, payload); + mlxsw_reg_rauhtd_filter_fields_set(payload, MLXSW_REG_RAUHTD_FILTER_A); + mlxsw_reg_rauhtd_op_set(payload, MLXSW_REG_RAUHTD_OP_DUMP_AND_CLEAR); + mlxsw_reg_rauhtd_num_rec_set(payload, MLXSW_REG_RAUHTD_REC_MAX_NUM); + mlxsw_reg_rauhtd_entry_a_set(payload, 1); + mlxsw_reg_rauhtd_type_set(payload, type); +} + +/* reg_rauhtd_ipv4_rec_num_entries + * Number of valid entries in this record: + * 0 - 1 valid entry + * 1 - 2 valid entries + * 2 - 3 valid entries + * 3 - 4 valid entries + * Access: RO + */ +MLXSW_ITEM32_INDEXED(reg, rauhtd, ipv4_rec_num_entries, + MLXSW_REG_RAUHTD_BASE_LEN, 28, 2, + MLXSW_REG_RAUHTD_REC_LEN, 0x00, false); + +/* reg_rauhtd_rec_type + * Record type. + * 0 - IPv4 + * 1 - IPv6 + * Access: RO + */ +MLXSW_ITEM32_INDEXED(reg, rauhtd, rec_type, MLXSW_REG_RAUHTD_BASE_LEN, 24, 2, + MLXSW_REG_RAUHTD_REC_LEN, 0x00, false); + +#define MLXSW_REG_RAUHTD_IPV4_ENT_LEN 0x8 + +/* reg_rauhtd_ipv4_ent_a + * Activity. Set for new entries. Set if a packet lookup has hit on the + * specific entry. + * Access: RO + */ +MLXSW_ITEM32_INDEXED(reg, rauhtd, ipv4_ent_a, MLXSW_REG_RAUHTD_BASE_LEN, 16, 1, + MLXSW_REG_RAUHTD_IPV4_ENT_LEN, 0x00, false); + +/* reg_rauhtd_ipv4_ent_rif + * Router interface. + * Access: RO + */ +MLXSW_ITEM32_INDEXED(reg, rauhtd, ipv4_ent_rif, MLXSW_REG_RAUHTD_BASE_LEN, 0, + 16, MLXSW_REG_RAUHTD_IPV4_ENT_LEN, 0x00, false); + +/* reg_rauhtd_ipv4_ent_dip + * Destination IPv4 address. + * Access: RO + */ +MLXSW_ITEM32_INDEXED(reg, rauhtd, ipv4_ent_dip, MLXSW_REG_RAUHTD_BASE_LEN, 0, + 32, MLXSW_REG_RAUHTD_IPV4_ENT_LEN, 0x04, false); + +static inline void mlxsw_reg_rauhtd_ent_ipv4_unpack(char *payload, + int ent_index, u16 *p_rif, + u32 *p_dip) +{ + *p_rif = mlxsw_reg_rauhtd_ipv4_ent_rif_get(payload, ent_index); + *p_dip = mlxsw_reg_rauhtd_ipv4_ent_dip_get(payload, ent_index); +} + /* MFCR - Management Fan Control Register * -------------------------------------- * This register controls the settings of the Fan Speed PWM mechanism. @@ -4775,6 +4920,8 @@ static inline const char *mlxsw_reg_id_str(u16 reg_id) return "RALUE"; case MLXSW_REG_RAUHT_ID: return "RAUHT"; + case MLXSW_REG_RAUHTD_ID: + return "RAUHTD"; case MLXSW_REG_MFCR_ID: return "MFCR"; case MLXSW_REG_MFSC_ID: From 2a4501ae18b52fcdf553404286e6cefabd1d17ec Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 5 Jul 2016 11:27:42 +0200 Subject: [PATCH 06/16] neigh: Send a notification when DELAY_PROBE_TIME changes When the data plane is offloaded the traffic doesn't go through the networking stack. Therefore, after first resolving a neighbour the NUD state machine will transition it from REACHABLE to STALE until it's finally deleted by the garbage collector. To prevent such situations the offloading driver should notify the NUD state machine on any neighbours that were recently used. The driver's polling interval should be set so that the NUD state machine can function as if the traffic wasn't offloaded. Currently, there are no in-tree drivers that can report confirmation for a neighbour, but only 'used' indication. Therefore, the polling interval should be set according to DELAY_FIRST_PROBE_TIME, as a neighbour will transition from REACHABLE state to DELAY (instead of STALE) if "a packet was sent within the last DELAY_FIRST_PROBE_TIME seconds" (RFC 4861). Send a netevent whenever the DELAY_FIRST_PROBE_TIME changes - either via netlink or sysctl - so that offloading drivers can correctly set their polling interval. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/netevent.h | 1 + net/core/neighbour.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/include/net/netevent.h b/include/net/netevent.h index d8bbb38584b60..f440df172b560 100644 --- a/include/net/netevent.h +++ b/include/net/netevent.h @@ -24,6 +24,7 @@ struct netevent_redirect { enum netevent_notif_type { NETEVENT_NEIGH_UPDATE = 1, /* arg is struct neighbour ptr */ NETEVENT_REDIRECT, /* arg is struct netevent_redirect ptr */ + NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */ }; int register_netevent_notifier(struct notifier_block *nb); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 952aabb5aa565..5cdc62a8eb846 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2047,6 +2047,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) case NDTPA_DELAY_PROBE_TIME: NEIGH_VAR_SET(p, DELAY_PROBE_TIME, nla_get_msecs(tbp[i])); + call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); break; case NDTPA_RETRANS_TIME: NEIGH_VAR_SET(p, RETRANS_TIME, @@ -2930,6 +2931,7 @@ static void neigh_proc_update(struct ctl_table *ctl, int write) return; set_bit(index, p->data_state); + call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); if (!dev) /* NULL dev means this is default value */ neigh_copy_dflt_parms(net, p, index); } From c723c735fa6bacfdb01c4697c2cfeba142990d18 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 5 Jul 2016 11:27:43 +0200 Subject: [PATCH 07/16] mlxsw: spectrum_router: Periodically update the kernel's neigh table As previously explained, the driver should periodically poll the device for neighbours activity according to the configured DELAY_PROBE_TIME. This will prevent active neighbours from staying in STALE state for long periods of time. During init configure the polling interval according to the DELAY_PROBE_TIME used in the default table. In addition, register a netevent notification block, so that the interval is updated whenever DELAY_PROBE_TIME changes. Using the computed interval schedule a delayed work, which will update the kernel via neigh_event_send() on any active neighbour since the last delayed work. Signed-off-by: Yotam Gigi Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum.h | 4 + .../ethernet/mellanox/mlxsw/spectrum_router.c | 192 +++++++++++++++++- 2 files changed, 194 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 734c5baffaf19..9c2a60f015178 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -214,6 +214,10 @@ struct mlxsw_sp_router { struct mlxsw_sp_lpm_tree lpm_trees[MLXSW_SP_LPM_TREE_COUNT]; struct mlxsw_sp_vr vrs[MLXSW_SP_VIRTUAL_ROUTER_MAX]; struct rhashtable neigh_ht; + struct { + struct delayed_work dw; + unsigned long interval; /* ms */ + } neighs_update; }; struct mlxsw_sp { diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 90d382a5d6a7e..db1c2c42cd3b6 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -3,6 +3,7 @@ * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko * Copyright (c) 2016 Ido Schimmel + * Copyright (c) 2016 Yotam Gigi * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,6 +39,8 @@ #include #include #include +#include +#include #include #include @@ -676,14 +679,199 @@ void mlxsw_sp_router_neigh_destroy(struct net_device *dev, mlxsw_sp_neigh_entry_destroy(neigh_entry); } +static void +mlxsw_sp_router_neighs_update_interval_init(struct mlxsw_sp *mlxsw_sp) +{ + unsigned long interval = NEIGH_VAR(&arp_tbl.parms, DELAY_PROBE_TIME); + + mlxsw_sp->router.neighs_update.interval = jiffies_to_msecs(interval); +} + +static void mlxsw_sp_router_neigh_ent_ipv4_process(struct mlxsw_sp *mlxsw_sp, + char *rauhtd_pl, + int ent_index) +{ + struct net_device *dev; + struct neighbour *n; + __be32 dipn; + u32 dip; + u16 rif; + + mlxsw_reg_rauhtd_ent_ipv4_unpack(rauhtd_pl, ent_index, &rif, &dip); + + if (!mlxsw_sp->rifs[rif]) { + dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Incorrect RIF in neighbour entry\n"); + return; + } + + dipn = htonl(dip); + dev = mlxsw_sp->rifs[rif]->dev; + n = neigh_lookup(&arp_tbl, &dipn, dev); + if (!n) { + netdev_err(dev, "Failed to find matching neighbour for IP=%pI4h\n", + &dip); + return; + } + + netdev_dbg(dev, "Updating neighbour with IP=%pI4h\n", &dip); + neigh_event_send(n, NULL); + neigh_release(n); +} + +static void mlxsw_sp_router_neigh_rec_ipv4_process(struct mlxsw_sp *mlxsw_sp, + char *rauhtd_pl, + int rec_index) +{ + u8 num_entries; + int i; + + num_entries = mlxsw_reg_rauhtd_ipv4_rec_num_entries_get(rauhtd_pl, + rec_index); + /* Hardware starts counting at 0, so add 1. */ + num_entries++; + + /* Each record consists of several neighbour entries. */ + for (i = 0; i < num_entries; i++) { + int ent_index; + + ent_index = rec_index * MLXSW_REG_RAUHTD_IPV4_ENT_PER_REC + i; + mlxsw_sp_router_neigh_ent_ipv4_process(mlxsw_sp, rauhtd_pl, + ent_index); + } + +} + +static void mlxsw_sp_router_neigh_rec_process(struct mlxsw_sp *mlxsw_sp, + char *rauhtd_pl, int rec_index) +{ + switch (mlxsw_reg_rauhtd_rec_type_get(rauhtd_pl, rec_index)) { + case MLXSW_REG_RAUHTD_TYPE_IPV4: + mlxsw_sp_router_neigh_rec_ipv4_process(mlxsw_sp, rauhtd_pl, + rec_index); + break; + case MLXSW_REG_RAUHTD_TYPE_IPV6: + WARN_ON_ONCE(1); + break; + } +} + +static void +mlxsw_sp_router_neighs_update_work_schedule(struct mlxsw_sp *mlxsw_sp) +{ + unsigned long interval = mlxsw_sp->router.neighs_update.interval; + + mlxsw_core_schedule_dw(&mlxsw_sp->router.neighs_update.dw, + msecs_to_jiffies(interval)); +} + +static void mlxsw_sp_router_neighs_update_work(struct work_struct *work) +{ + struct mlxsw_sp *mlxsw_sp; + char *rauhtd_pl; + u8 num_rec; + int i, err; + + rauhtd_pl = kmalloc(MLXSW_REG_RAUHTD_LEN, GFP_KERNEL); + if (!rauhtd_pl) + return; + + mlxsw_sp = container_of(work, struct mlxsw_sp, + router.neighs_update.dw.work); + + /* Make sure the neighbour's netdev isn't removed in the + * process. + */ + rtnl_lock(); + do { + mlxsw_reg_rauhtd_pack(rauhtd_pl, MLXSW_REG_RAUHTD_TYPE_IPV4); + err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(rauhtd), + rauhtd_pl); + if (err) { + dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Failed to dump neighbour talbe\n"); + break; + } + num_rec = mlxsw_reg_rauhtd_num_rec_get(rauhtd_pl); + for (i = 0; i < num_rec; i++) + mlxsw_sp_router_neigh_rec_process(mlxsw_sp, rauhtd_pl, + i); + } while (num_rec); + rtnl_unlock(); + + kfree(rauhtd_pl); + mlxsw_sp_router_neighs_update_work_schedule(mlxsw_sp); +} + +static int mlxsw_sp_router_netevent_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct mlxsw_sp_port *mlxsw_sp_port; + struct mlxsw_sp *mlxsw_sp; + unsigned long interval; + struct neigh_parms *p; + + switch (event) { + case NETEVENT_DELAY_PROBE_TIME_UPDATE: + p = ptr; + + /* We don't care about changes in the default table. */ + if (!p->dev || p->tbl != &arp_tbl) + return NOTIFY_DONE; + + /* We are in atomic context and can't take RTNL mutex, + * so use RCU variant to walk the device chain. + */ + mlxsw_sp_port = mlxsw_sp_port_lower_dev_hold(p->dev); + if (!mlxsw_sp_port) + return NOTIFY_DONE; + + mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + interval = jiffies_to_msecs(NEIGH_VAR(p, DELAY_PROBE_TIME)); + mlxsw_sp->router.neighs_update.interval = interval; + + mlxsw_sp_port_dev_put(mlxsw_sp_port); + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block mlxsw_sp_router_netevent_nb __read_mostly = { + .notifier_call = mlxsw_sp_router_netevent_event, +}; + static int mlxsw_sp_neigh_init(struct mlxsw_sp *mlxsw_sp) { - return rhashtable_init(&mlxsw_sp->router.neigh_ht, - &mlxsw_sp_neigh_ht_params); + int err; + + err = rhashtable_init(&mlxsw_sp->router.neigh_ht, + &mlxsw_sp_neigh_ht_params); + if (err) + return err; + + /* Initialize the polling interval according to the default + * table. + */ + mlxsw_sp_router_neighs_update_interval_init(mlxsw_sp); + + err = register_netevent_notifier(&mlxsw_sp_router_netevent_nb); + if (err) + goto err_register_netevent_notifier; + + INIT_DELAYED_WORK(&mlxsw_sp->router.neighs_update.dw, + mlxsw_sp_router_neighs_update_work); + mlxsw_core_schedule_dw(&mlxsw_sp->router.neighs_update.dw, 0); + + return 0; + +err_register_netevent_notifier: + rhashtable_destroy(&mlxsw_sp->router.neigh_ht); + return err; } static void mlxsw_sp_neigh_fini(struct mlxsw_sp *mlxsw_sp) { + cancel_delayed_work_sync(&mlxsw_sp->router.neighs_update.dw); + unregister_netevent_notifier(&mlxsw_sp_router_netevent_nb); rhashtable_destroy(&mlxsw_sp->router.neigh_ht); } From a6bf9e933daf9609c3673e8348e7f95f3ef325c7 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 5 Jul 2016 11:27:44 +0200 Subject: [PATCH 08/16] mlxsw: spectrum_router: Offload neighbours based on NUD state change Listen to any NEIGH_UPDATE events sent and program the device accordingly. If NUD state is VALID and neighbour isn't yet offloaded, then program it into the device's table. Otherwise, just edit its parameters. If NUD state machine transitioned neighbour out of VALID state and it's present in the device's table, then remove it. Note that the device is programmed in delayed work, as the netevent notification chain is atomic and prevents us from going to sleep. Signed-off-by: Yotam Gigi Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../ethernet/mellanox/mlxsw/spectrum_router.c | 103 ++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index db1c2c42cd3b6..ed0e6c09dcc88 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -559,6 +559,10 @@ struct mlxsw_sp_neigh_entry { struct mlxsw_sp_neigh_key key; u16 rif; struct neighbour *n; + bool offloaded; + struct delayed_work dw; + struct mlxsw_sp_port *mlxsw_sp_port; + unsigned char ha[ETH_ALEN]; }; static const struct rhashtable_params mlxsw_sp_neigh_ht_params = { @@ -585,6 +589,8 @@ mlxsw_sp_neigh_entry_remove(struct mlxsw_sp *mlxsw_sp, mlxsw_sp_neigh_ht_params); } +static void mlxsw_sp_router_neigh_update_hw(struct work_struct *work); + static struct mlxsw_sp_neigh_entry * mlxsw_sp_neigh_entry_create(const void *addr, size_t addr_len, struct net_device *dev, u16 rif, @@ -599,6 +605,7 @@ mlxsw_sp_neigh_entry_create(const void *addr, size_t addr_len, neigh_entry->key.dev = dev; neigh_entry->rif = rif; neigh_entry->n = n; + INIT_DELAYED_WORK(&neigh_entry->dw, mlxsw_sp_router_neigh_update_hw); return neigh_entry; } @@ -801,13 +808,76 @@ static void mlxsw_sp_router_neighs_update_work(struct work_struct *work) mlxsw_sp_router_neighs_update_work_schedule(mlxsw_sp); } +static void mlxsw_sp_router_neigh_update_hw(struct work_struct *work) +{ + struct mlxsw_sp_neigh_entry *neigh_entry = + container_of(work, struct mlxsw_sp_neigh_entry, dw.work); + struct neighbour *n = neigh_entry->n; + struct mlxsw_sp_port *mlxsw_sp_port = neigh_entry->mlxsw_sp_port; + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char rauht_pl[MLXSW_REG_RAUHT_LEN]; + struct net_device *dev; + bool entry_connected; + u8 nud_state; + bool updating; + bool removing; + bool adding; + u32 dip; + int err; + + read_lock_bh(&n->lock); + dip = ntohl(*((__be32 *) n->primary_key)); + memcpy(neigh_entry->ha, n->ha, sizeof(neigh_entry->ha)); + nud_state = n->nud_state; + dev = n->dev; + read_unlock_bh(&n->lock); + + entry_connected = nud_state & NUD_VALID; + adding = (!neigh_entry->offloaded) && entry_connected; + updating = neigh_entry->offloaded && entry_connected; + removing = neigh_entry->offloaded && !entry_connected; + + if (adding || updating) { + mlxsw_reg_rauht_pack4(rauht_pl, MLXSW_REG_RAUHT_OP_WRITE_ADD, + neigh_entry->rif, + neigh_entry->ha, dip); + err = mlxsw_reg_write(mlxsw_sp->core, + MLXSW_REG(rauht), rauht_pl); + if (err) { + netdev_err(dev, "Could not add neigh %pI4h\n", &dip); + neigh_entry->offloaded = false; + } else { + neigh_entry->offloaded = true; + } + } else if (removing) { + mlxsw_reg_rauht_pack4(rauht_pl, MLXSW_REG_RAUHT_OP_WRITE_DELETE, + neigh_entry->rif, + neigh_entry->ha, dip); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rauht), + rauht_pl); + if (err) { + netdev_err(dev, "Could not delete neigh %pI4h\n", &dip); + neigh_entry->offloaded = true; + } else { + neigh_entry->offloaded = false; + } + } + + neigh_release(n); + mlxsw_sp_port_dev_put(mlxsw_sp_port); +} + static int mlxsw_sp_router_netevent_event(struct notifier_block *unused, unsigned long event, void *ptr) { + struct mlxsw_sp_neigh_entry *neigh_entry; struct mlxsw_sp_port *mlxsw_sp_port; struct mlxsw_sp *mlxsw_sp; unsigned long interval; + struct net_device *dev; struct neigh_parms *p; + struct neighbour *n; + u32 dip; switch (event) { case NETEVENT_DELAY_PROBE_TIME_UPDATE: @@ -830,6 +900,39 @@ static int mlxsw_sp_router_netevent_event(struct notifier_block *unused, mlxsw_sp_port_dev_put(mlxsw_sp_port); break; + case NETEVENT_NEIGH_UPDATE: + n = ptr; + dev = n->dev; + + if (n->tbl != &arp_tbl) + return NOTIFY_DONE; + + mlxsw_sp_port = mlxsw_sp_port_lower_dev_hold(dev); + if (!mlxsw_sp_port) + return NOTIFY_DONE; + + mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + dip = ntohl(*((__be32 *) n->primary_key)); + neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, + &dip, + sizeof(__be32), + dev); + if (WARN_ON(!neigh_entry) || WARN_ON(neigh_entry->n != n)) { + mlxsw_sp_port_dev_put(mlxsw_sp_port); + return NOTIFY_DONE; + } + neigh_entry->mlxsw_sp_port = mlxsw_sp_port; + + /* Take a reference to ensure the neighbour won't be + * destructed until we drop the reference in delayed + * work. + */ + neigh_clone(n); + if (!mlxsw_core_schedule_dw(&neigh_entry->dw, 0)) { + neigh_release(n); + mlxsw_sp_port_dev_put(mlxsw_sp_port); + } + break; } return NOTIFY_DONE; From 489107bda1d1ad20a56cb19753d705006b21bb90 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 5 Jul 2016 11:27:45 +0200 Subject: [PATCH 09/16] mlxsw: Add KVD sizes configuration into profile Up until now we only used hash-based tables in the device, but we are going to use the linear table for remote routes adjacency lists. Add the configuration fields that control the size of the linear table. Signed-off-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/cmd.h | 43 ++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlxsw/core.h | 6 ++- drivers/net/ethernet/mellanox/mlxsw/pci.c | 14 +++++++ 3 files changed, 62 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/cmd.h b/drivers/net/ethernet/mellanox/mlxsw/cmd.h index cd63b82636888..f9cd6e3f7709a 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/cmd.h +++ b/drivers/net/ethernet/mellanox/mlxsw/cmd.h @@ -607,6 +607,24 @@ MLXSW_ITEM32(cmd_mbox, config_profile, */ MLXSW_ITEM32(cmd_mbox, config_profile, set_ar_sec, 0x0C, 15, 1); +/* cmd_mbox_config_set_kvd_linear_size + * Capability bit. Setting a bit to 1 configures the profile + * according to the mailbox contents. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, set_kvd_linear_size, 0x0C, 24, 1); + +/* cmd_mbox_config_set_kvd_hash_single_size + * Capability bit. Setting a bit to 1 configures the profile + * according to the mailbox contents. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, set_kvd_hash_single_size, 0x0C, 25, 1); + +/* cmd_mbox_config_set_kvd_hash_double_size + * Capability bit. Setting a bit to 1 configures the profile + * according to the mailbox contents. + */ +MLXSW_ITEM32(cmd_mbox, config_profile, set_kvd_hash_double_size, 0x0C, 26, 1); + /* cmd_mbox_config_profile_max_vepa_channels * Maximum number of VEPA channels per port (0 through 16) * 0 - multi-channel VEPA is disabled @@ -733,6 +751,31 @@ MLXSW_ITEM32(cmd_mbox, config_profile, adaptive_routing_group_cap, 0x4C, 0, 16); */ MLXSW_ITEM32(cmd_mbox, config_profile, arn, 0x50, 31, 1); +/* cmd_mbox_config_kvd_linear_size + * KVD Linear Size + * Valid for Spectrum only + * Allowed values are 128*N where N=0 or higher + */ +MLXSW_ITEM32(cmd_mbox, config_profile, kvd_linear_size, 0x54, 0, 24); + +/* cmd_mbox_config_kvd_hash_single_size + * KVD Hash single-entries size + * Valid for Spectrum only + * Allowed values are 128*N where N=0 or higher + * Must be greater or equal to cap_min_kvd_hash_single_size + * Must be smaller or equal to cap_kvd_size - kvd_linear_size + */ +MLXSW_ITEM32(cmd_mbox, config_profile, kvd_hash_single_size, 0x58, 0, 24); + +/* cmd_mbox_config_kvd_hash_double_size + * KVD Hash double-entries size (units of single-size entries) + * Valid for Spectrum only + * Allowed values are 128*N where N=0 or higher + * Must be either 0 or greater or equal to cap_min_kvd_hash_double_size + * Must be smaller or equal to cap_kvd_size - kvd_linear_size + */ +MLXSW_ITEM32(cmd_mbox, config_profile, kvd_hash_double_size, 0x5C, 0, 24); + /* cmd_mbox_config_profile_swid_config_mask * Modify Switch Partition Configuration mask. When set, the configu- * ration value for the Switch Partition are taken from the mailbox. diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h index 436bc49df6ab5..2fe385cce203c 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.h +++ b/drivers/net/ethernet/mellanox/mlxsw/core.h @@ -190,7 +190,8 @@ struct mlxsw_config_profile { used_max_ib_mc:1, used_max_pkey:1, used_ar_sec:1, - used_adaptive_routing_group_cap:1; + used_adaptive_routing_group_cap:1, + used_kvd_sizes:1; u8 max_vepa_channels; u16 max_lag; u16 max_port_per_lag; @@ -211,6 +212,9 @@ struct mlxsw_config_profile { u8 ar_sec; u16 adaptive_routing_group_cap; u8 arn; + u32 kvd_linear_size; + u32 kvd_hash_single_size; + u32 kvd_hash_double_size; struct mlxsw_swid_config swid_config[MLXSW_CONFIG_PROFILE_SWID_COUNT]; }; diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c index 7f4173c8eda36..ddbc9f22278de 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci.c +++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c @@ -1255,6 +1255,20 @@ static int mlxsw_pci_config_profile(struct mlxsw_pci *mlxsw_pci, char *mbox, mlxsw_cmd_mbox_config_profile_adaptive_routing_group_cap_set( mbox, profile->adaptive_routing_group_cap); } + if (profile->used_kvd_sizes) { + mlxsw_cmd_mbox_config_profile_set_kvd_linear_size_set( + mbox, 1); + mlxsw_cmd_mbox_config_profile_kvd_linear_size_set( + mbox, profile->kvd_linear_size); + mlxsw_cmd_mbox_config_profile_set_kvd_hash_single_size_set( + mbox, 1); + mlxsw_cmd_mbox_config_profile_kvd_hash_single_size_set( + mbox, profile->kvd_hash_single_size); + mlxsw_cmd_mbox_config_profile_set_kvd_hash_double_size_set( + mbox, 1); + mlxsw_cmd_mbox_config_profile_kvd_hash_double_size_set( + mbox, profile->kvd_hash_double_size); + } for (i = 0; i < MLXSW_CONFIG_PROFILE_SWID_COUNT; i++) mlxsw_pci_config_profile_swid_config(mlxsw_pci, mbox, i, From c602242761dff9f698e16f3ab474e48da1b83f8f Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 5 Jul 2016 11:27:46 +0200 Subject: [PATCH 10/16] mlxsw: spectrum: Define sizes of KVD areas Override the defaults and define the area sizes ourselves. Signed-off-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 4 ++++ drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 9bebb7af4e542..c812513e079d6 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -2356,6 +2356,10 @@ static struct mlxsw_config_profile mlxsw_sp_config_profile = { .max_ib_mc = 0, .used_max_pkey = 1, .max_pkey = 0, + .used_kvd_sizes = 1, + .kvd_linear_size = MLXSW_SP_KVD_LINEAR_SIZE, + .kvd_hash_single_size = MLXSW_SP_KVD_HASH_SINGLE_SIZE, + .kvd_hash_double_size = MLXSW_SP_KVD_HASH_DOUBLE_SIZE, .swid_config = { { .used_type = 1, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 9c2a60f015178..f7d34d8ce7590 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -76,6 +76,10 @@ #define MLXSW_SP_BYTES_TO_CELLS(b) DIV_ROUND_UP(b, MLXSW_SP_BYTES_PER_CELL) #define MLXSW_SP_CELLS_TO_BYTES(c) (c * MLXSW_SP_BYTES_PER_CELL) +#define MLXSW_SP_KVD_LINEAR_SIZE 65536 /* entries */ +#define MLXSW_SP_KVD_HASH_SINGLE_SIZE 163840 /* entries */ +#define MLXSW_SP_KVD_HASH_DOUBLE_SIZE 32768 /* entries */ + /* Maximum delay buffer needed in case of PAUSE frames, in cells. * Assumes 100m cable and maximum MTU. */ From b090ef068645d6a9e69f04e571a3dcb32eb37d81 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 5 Jul 2016 11:27:47 +0200 Subject: [PATCH 11/16] mlxsw: Introduce simplistic KVD linear area manager This is a very simple manager for KVD linear area. Currently, the allocator will either allocate a single entry from pre-defined sub-area, or in case more than one entry is needed, it will allocate 32-entry chunk in other pre-defined sub-area. Signed-off-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/Makefile | 3 +- .../net/ethernet/mellanox/mlxsw/spectrum.h | 6 ++ .../ethernet/mellanox/mlxsw/spectrum_kvdl.c | 91 +++++++++++++++++++ 3 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c diff --git a/drivers/net/ethernet/mellanox/mlxsw/Makefile b/drivers/net/ethernet/mellanox/mlxsw/Makefile index ea05f8a10e8c9..d20ae1838a64d 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/Makefile +++ b/drivers/net/ethernet/mellanox/mlxsw/Makefile @@ -7,5 +7,6 @@ obj-$(CONFIG_MLXSW_SWITCHX2) += mlxsw_switchx2.o mlxsw_switchx2-objs := switchx2.o obj-$(CONFIG_MLXSW_SPECTRUM) += mlxsw_spectrum.o mlxsw_spectrum-objs := spectrum.o spectrum_buffers.o \ - spectrum_switchdev.o spectrum_router.o + spectrum_switchdev.o spectrum_router.o \ + spectrum_kvdl.o mlxsw_spectrum-$(CONFIG_MLXSW_SPECTRUM_DCB) += spectrum_dcb.o diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index f7d34d8ce7590..e78112897dcf3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -253,6 +253,9 @@ struct mlxsw_sp { u8 port_to_module[MLXSW_PORT_MAX_PORTS]; struct mlxsw_sp_sb sb; struct mlxsw_sp_router router; + struct { + DECLARE_BITMAP(usage, MLXSW_SP_KVD_LINEAR_SIZE); + } kvdl; }; static inline struct mlxsw_sp_upper * @@ -539,4 +542,7 @@ int mlxsw_sp_router_neigh_construct(struct net_device *dev, void mlxsw_sp_router_neigh_destroy(struct net_device *dev, struct neighbour *n); +int mlxsw_sp_kvdl_alloc(struct mlxsw_sp *mlxsw_sp, unsigned int entry_count); +void mlxsw_sp_kvdl_free(struct mlxsw_sp *mlxsw_sp, int entry_index); + #endif diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c new file mode 100644 index 0000000000000..ac321e8e5c1ac --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c @@ -0,0 +1,91 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Jiri Pirko + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include + +#include "spectrum.h" + +#define MLXSW_SP_KVDL_SINGLE_BASE 0 +#define MLXSW_SP_KVDL_SINGLE_SIZE 16384 +#define MLXSW_SP_KVDL_CHUNKS_BASE \ + (MLXSW_SP_KVDL_SINGLE_BASE + MLXSW_SP_KVDL_SINGLE_SIZE) +#define MLXSW_SP_KVDL_CHUNKS_SIZE \ + (MLXSW_SP_KVD_LINEAR_SIZE - MLXSW_SP_KVDL_CHUNKS_BASE) +#define MLXSW_SP_CHUNK_MAX 32 + +int mlxsw_sp_kvdl_alloc(struct mlxsw_sp *mlxsw_sp, unsigned int entry_count) +{ + int entry_index; + int size; + int type_base; + int type_size; + int type_entries; + + if (entry_count == 0 || entry_count > MLXSW_SP_CHUNK_MAX) { + return -EINVAL; + } else if (entry_count == 1) { + type_base = MLXSW_SP_KVDL_SINGLE_BASE; + type_size = MLXSW_SP_KVDL_SINGLE_SIZE; + type_entries = 1; + } else { + type_base = MLXSW_SP_KVDL_CHUNKS_BASE; + type_size = MLXSW_SP_KVDL_CHUNKS_SIZE; + type_entries = MLXSW_SP_CHUNK_MAX; + } + + entry_index = type_base; + size = type_base + type_size; + for_each_clear_bit_from(entry_index, mlxsw_sp->kvdl.usage, size) { + int i; + + for (i = 0; i < type_entries; i++) + set_bit(entry_index + i, mlxsw_sp->kvdl.usage); + return entry_index; + } + return -ENOBUFS; +} + +void mlxsw_sp_kvdl_free(struct mlxsw_sp *mlxsw_sp, int entry_index) +{ + int type_entries; + int i; + + if (entry_index < MLXSW_SP_KVDL_CHUNKS_BASE) + type_entries = 1; + else + type_entries = MLXSW_SP_CHUNK_MAX; + for (i = 0; i < type_entries; i++) + clear_bit(entry_index + i, mlxsw_sp->kvdl.usage); +} From 089f981683f87a299f8a3b81696629ae1c7b1abb Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 5 Jul 2016 11:27:48 +0200 Subject: [PATCH 12/16] mlxsw: reg: Add Router Adjacency Table register The RATR register is used to configure the Router Adjacency (next-hop) Table. Signed-off-by: Yotam Gigi Signed-off-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/reg.h | 133 ++++++++++++++++++++++ 1 file changed, 133 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index fcf379b5984c3..a2d7870ea60e9 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -3455,6 +3455,137 @@ static inline void mlxsw_reg_ritr_pack(char *payload, bool enable, mlxsw_reg_ritr_if_mac_memcpy_to(payload, mac); } +/* RATR - Router Adjacency Table Register + * -------------------------------------- + * The RATR register is used to configure the Router Adjacency (next-hop) + * Table. + */ +#define MLXSW_REG_RATR_ID 0x8008 +#define MLXSW_REG_RATR_LEN 0x2C + +static const struct mlxsw_reg_info mlxsw_reg_ratr = { + .id = MLXSW_REG_RATR_ID, + .len = MLXSW_REG_RATR_LEN, +}; + +enum mlxsw_reg_ratr_op { + /* Read */ + MLXSW_REG_RATR_OP_QUERY_READ = 0, + /* Read and clear activity */ + MLXSW_REG_RATR_OP_QUERY_READ_CLEAR = 2, + /* Write Adjacency entry */ + MLXSW_REG_RATR_OP_WRITE_WRITE_ENTRY = 1, + /* Write Adjacency entry only if the activity is cleared. + * The write may not succeed if the activity is set. There is not + * direct feedback if the write has succeeded or not, however + * the get will reveal the actual entry (SW can compare the get + * response to the set command). + */ + MLXSW_REG_RATR_OP_WRITE_WRITE_ENTRY_ON_ACTIVITY = 3, +}; + +/* reg_ratr_op + * Note that Write operation may also be used for updating + * counter_set_type and counter_index. In this case all other + * fields must not be updated. + * Access: OP + */ +MLXSW_ITEM32(reg, ratr, op, 0x00, 28, 4); + +/* reg_ratr_v + * Valid bit. Indicates if the adjacency entry is valid. + * Note: the device may need some time before reusing an invalidated + * entry. During this time the entry can not be reused. It is + * recommended to use another entry before reusing an invalidated + * entry (e.g. software can put it at the end of the list for + * reusing). Trying to access an invalidated entry not yet cleared + * by the device results with failure indicating "Try Again" status. + * When valid is '0' then egress_router_interface,trap_action, + * adjacency_parameters and counters are reserved + * Access: RW + */ +MLXSW_ITEM32(reg, ratr, v, 0x00, 24, 1); + +/* reg_ratr_a + * Activity. Set for new entries. Set if a packet lookup has hit on + * the specific entry. To clear the a bit, use "clear activity". + * Access: RO + */ +MLXSW_ITEM32(reg, ratr, a, 0x00, 16, 1); + +/* reg_ratr_adjacency_index_low + * Bits 15:0 of index into the adjacency table. + * For SwitchX and SwitchX-2, the adjacency table is linear and + * used for adjacency entries only. + * For Spectrum, the index is to the KVD linear. + * Access: Index + */ +MLXSW_ITEM32(reg, ratr, adjacency_index_low, 0x04, 0, 16); + +/* reg_ratr_egress_router_interface + * Range is 0 .. cap_max_router_interfaces - 1 + * Access: RW + */ +MLXSW_ITEM32(reg, ratr, egress_router_interface, 0x08, 0, 16); + +enum mlxsw_reg_ratr_trap_action { + MLXSW_REG_RATR_TRAP_ACTION_NOP, + MLXSW_REG_RATR_TRAP_ACTION_TRAP, + MLXSW_REG_RATR_TRAP_ACTION_MIRROR_TO_CPU, + MLXSW_REG_RATR_TRAP_ACTION_MIRROR, + MLXSW_REG_RATR_TRAP_ACTION_DISCARD_ERRORS, +}; + +/* reg_ratr_trap_action + * see mlxsw_reg_ratr_trap_action + * Access: RW + */ +MLXSW_ITEM32(reg, ratr, trap_action, 0x0C, 28, 4); + +enum mlxsw_reg_ratr_trap_id { + MLXSW_REG_RATR_TRAP_ID_RTR_EGRESS0 = 0, + MLXSW_REG_RATR_TRAP_ID_RTR_EGRESS1 = 1, +}; + +/* reg_ratr_adjacency_index_high + * Bits 23:16 of the adjacency_index. + * Access: Index + */ +MLXSW_ITEM32(reg, ratr, adjacency_index_high, 0x0C, 16, 8); + +/* reg_ratr_trap_id + * Trap ID to be reported to CPU. + * Trap-ID is RTR_EGRESS0 or RTR_EGRESS1. + * For trap_action of NOP, MIRROR and DISCARD_ERROR + * Access: RW + */ +MLXSW_ITEM32(reg, ratr, trap_id, 0x0C, 0, 8); + +/* reg_ratr_eth_destination_mac + * MAC address of the destination next-hop. + * Access: RW + */ +MLXSW_ITEM_BUF(reg, ratr, eth_destination_mac, 0x12, 6); + +static inline void +mlxsw_reg_ratr_pack(char *payload, + enum mlxsw_reg_ratr_op op, bool valid, + u32 adjacency_index, u16 egress_rif) +{ + MLXSW_REG_ZERO(ratr, payload); + mlxsw_reg_ratr_op_set(payload, op); + mlxsw_reg_ratr_v_set(payload, valid); + mlxsw_reg_ratr_adjacency_index_low_set(payload, adjacency_index); + mlxsw_reg_ratr_adjacency_index_high_set(payload, adjacency_index >> 16); + mlxsw_reg_ratr_egress_router_interface_set(payload, egress_rif); +} + +static inline void mlxsw_reg_ratr_eth_entry_pack(char *payload, + const char *dest_mac) +{ + mlxsw_reg_ratr_eth_destination_mac_memcpy_to(payload, dest_mac); +} + /* RALTA - Router Algorithmic LPM Tree Allocation Register * ------------------------------------------------------- * RALTA is used to allocate the LPM trees of the SHSPM method. @@ -4910,6 +5041,8 @@ static inline const char *mlxsw_reg_id_str(u16 reg_id) return "RGCR"; case MLXSW_REG_RITR_ID: return "RITR"; + case MLXSW_REG_RATR_ID: + return "RATR"; case MLXSW_REG_RALTA_ID: return "RALTA"; case MLXSW_REG_RALST_ID: From a59f0b312a197d695a6d99469114d2134b5befb2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 5 Jul 2016 11:27:49 +0200 Subject: [PATCH 13/16] mlxsw: reg: Add Router Algorithmic LPM ECMP Update Register The RALEU register is used to mass update remote action adjacency index and ecmp size. Signed-off-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/reg.h | 69 +++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index a2d7870ea60e9..0cc1485666778 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -4154,6 +4154,73 @@ static inline void mlxsw_reg_rauht_pack4(char *payload, mlxsw_reg_rauht_dip4_set(payload, dip); } +/* RALEU - Router Algorithmic LPM ECMP Update Register + * --------------------------------------------------- + * The register enables updating the ECMP section in the action for multiple + * LPM Unicast entries in a single operation. The update is executed to + * all entries of a {virtual router, protocol} tuple using the same ECMP group. + */ +#define MLXSW_REG_RALEU_ID 0x8015 +#define MLXSW_REG_RALEU_LEN 0x28 + +static const struct mlxsw_reg_info mlxsw_reg_raleu = { + .id = MLXSW_REG_RALEU_ID, + .len = MLXSW_REG_RALEU_LEN, +}; + +/* reg_raleu_protocol + * Protocol. + * Access: Index + */ +MLXSW_ITEM32(reg, raleu, protocol, 0x00, 24, 4); + +/* reg_raleu_virtual_router + * Virtual Router ID + * Range is 0..cap_max_virtual_routers-1 + * Access: Index + */ +MLXSW_ITEM32(reg, raleu, virtual_router, 0x00, 0, 16); + +/* reg_raleu_adjacency_index + * Adjacency Index used for matching on the existing entries. + * Access: Index + */ +MLXSW_ITEM32(reg, raleu, adjacency_index, 0x10, 0, 24); + +/* reg_raleu_ecmp_size + * ECMP Size used for matching on the existing entries. + * Access: Index + */ +MLXSW_ITEM32(reg, raleu, ecmp_size, 0x14, 0, 13); + +/* reg_raleu_new_adjacency_index + * New Adjacency Index. + * Access: WO + */ +MLXSW_ITEM32(reg, raleu, new_adjacency_index, 0x20, 0, 24); + +/* reg_raleu_new_ecmp_size + * New ECMP Size. + * Access: WO + */ +MLXSW_ITEM32(reg, raleu, new_ecmp_size, 0x24, 0, 13); + +static inline void mlxsw_reg_raleu_pack(char *payload, + enum mlxsw_reg_ralxx_protocol protocol, + u16 virtual_router, + u32 adjacency_index, u16 ecmp_size, + u32 new_adjacency_index, + u16 new_ecmp_size) +{ + MLXSW_REG_ZERO(raleu, payload); + mlxsw_reg_raleu_protocol_set(payload, protocol); + mlxsw_reg_raleu_virtual_router_set(payload, virtual_router); + mlxsw_reg_raleu_adjacency_index_set(payload, adjacency_index); + mlxsw_reg_raleu_ecmp_size_set(payload, ecmp_size); + mlxsw_reg_raleu_new_adjacency_index_set(payload, new_adjacency_index); + mlxsw_reg_raleu_new_ecmp_size_set(payload, new_ecmp_size); +} + /* RAUHTD - Router Algorithmic LPM Unicast Host Table Dump Register * ---------------------------------------------------------------- * The RAUHTD register allows dumping entries from the Router Unicast Host @@ -5053,6 +5120,8 @@ static inline const char *mlxsw_reg_id_str(u16 reg_id) return "RALUE"; case MLXSW_REG_RAUHT_ID: return "RAUHT"; + case MLXSW_REG_RALEU_ID: + return "RALEU"; case MLXSW_REG_RAUHTD_ID: return "RAUHTD"; case MLXSW_REG_MFCR_ID: From a7ff87acd995e3c024f3262bf90c8682c99b1f6b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 5 Jul 2016 11:27:50 +0200 Subject: [PATCH 14/16] mlxsw: spectrum_router: Implement next-hop routing Implement next-hop routing offload including ECMP. To make it possible, introduce next-hop group entity. This entity keeps track of resolved neighbours and updates HW adjacency table accordingly. Note that HW next-hops are stored in this adjacency table, in form of MAC. Signed-off-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum.h | 1 + .../ethernet/mellanox/mlxsw/spectrum_router.c | 493 +++++++++++++++++- 2 files changed, 492 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index e78112897dcf3..0fe6051ab1959 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -222,6 +222,7 @@ struct mlxsw_sp_router { struct delayed_work dw; unsigned long interval; /* ms */ } neighs_update; + struct list_head nexthop_group_list; }; struct mlxsw_sp { diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index ed0e6c09dcc88..dc13178b6f338 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -117,6 +117,8 @@ enum mlxsw_sp_fib_entry_type { MLXSW_SP_FIB_ENTRY_TYPE_TRAP, }; +struct mlxsw_sp_nexthop_group; + struct mlxsw_sp_fib_entry { struct rhash_head ht_node; struct mlxsw_sp_fib_key key; @@ -124,6 +126,8 @@ struct mlxsw_sp_fib_entry { u8 added:1; u16 rif; /* used for action local */ struct mlxsw_sp_vr *vr; + struct list_head nexthop_group_node; + struct mlxsw_sp_nexthop_group *nh_group; }; struct mlxsw_sp_fib { @@ -563,6 +567,9 @@ struct mlxsw_sp_neigh_entry { struct delayed_work dw; struct mlxsw_sp_port *mlxsw_sp_port; unsigned char ha[ETH_ALEN]; + struct list_head nexthop_list; /* list of nexthops using + * this neigh entry + */ }; static const struct rhashtable_params mlxsw_sp_neigh_ht_params = { @@ -606,6 +613,7 @@ mlxsw_sp_neigh_entry_create(const void *addr, size_t addr_len, neigh_entry->rif = rif; neigh_entry->n = n; INIT_DELAYED_WORK(&neigh_entry->dw, mlxsw_sp_router_neigh_update_hw); + INIT_LIST_HEAD(&neigh_entry->nexthop_list); return neigh_entry; } @@ -808,6 +816,11 @@ static void mlxsw_sp_router_neighs_update_work(struct work_struct *work) mlxsw_sp_router_neighs_update_work_schedule(mlxsw_sp); } +static void +mlxsw_sp_nexthop_neigh_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_neigh_entry *neigh_entry, + bool removing); + static void mlxsw_sp_router_neigh_update_hw(struct work_struct *work) { struct mlxsw_sp_neigh_entry *neigh_entry = @@ -849,6 +862,7 @@ static void mlxsw_sp_router_neigh_update_hw(struct work_struct *work) } else { neigh_entry->offloaded = true; } + mlxsw_sp_nexthop_neigh_update(mlxsw_sp, neigh_entry, false); } else if (removing) { mlxsw_reg_rauht_pack4(rauht_pl, MLXSW_REG_RAUHT_OP_WRITE_DELETE, neigh_entry->rif, @@ -861,6 +875,7 @@ static void mlxsw_sp_router_neigh_update_hw(struct work_struct *work) } else { neigh_entry->offloaded = false; } + mlxsw_sp_nexthop_neigh_update(mlxsw_sp, neigh_entry, true); } neigh_release(n); @@ -978,6 +993,434 @@ static void mlxsw_sp_neigh_fini(struct mlxsw_sp *mlxsw_sp) rhashtable_destroy(&mlxsw_sp->router.neigh_ht); } +struct mlxsw_sp_nexthop { + struct list_head neigh_list_node; /* member of neigh entry list */ + struct mlxsw_sp_nexthop_group *nh_grp; /* pointer back to the group + * this belongs to + */ + u8 should_offload:1, /* set indicates this neigh is connected and + * should be put to KVD linear area of this group. + */ + offloaded:1, /* set in case the neigh is actually put into + * KVD linear area of this group. + */ + update:1; /* set indicates that MAC of this neigh should be + * updated in HW + */ + struct mlxsw_sp_neigh_entry *neigh_entry; +}; + +struct mlxsw_sp_nexthop_group { + struct list_head list; /* node in mlxsw->router.nexthop_group_list */ + struct list_head fib_list; /* list of fib entries that use this group */ + u8 adj_index_valid:1; + u32 adj_index; + u16 ecmp_size; + u16 count; + struct mlxsw_sp_nexthop nexthops[0]; +}; + +static int mlxsw_sp_adj_index_mass_update_vr(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_vr *vr, + u32 adj_index, u16 ecmp_size, + u32 new_adj_index, + u16 new_ecmp_size) +{ + char raleu_pl[MLXSW_REG_RALEU_LEN]; + + mlxsw_reg_raleu_pack(raleu_pl, vr->proto, vr->id, + adj_index, ecmp_size, + new_adj_index, new_ecmp_size); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(raleu), raleu_pl); +} + +static int mlxsw_sp_adj_index_mass_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop_group *nh_grp, + u32 old_adj_index, u16 old_ecmp_size) +{ + struct mlxsw_sp_fib_entry *fib_entry; + struct mlxsw_sp_vr *vr = NULL; + int err; + + list_for_each_entry(fib_entry, &nh_grp->fib_list, nexthop_group_node) { + if (vr == fib_entry->vr) + continue; + vr = fib_entry->vr; + err = mlxsw_sp_adj_index_mass_update_vr(mlxsw_sp, vr, + old_adj_index, + old_ecmp_size, + nh_grp->adj_index, + nh_grp->ecmp_size); + if (err) + return err; + } + return 0; +} + +static int mlxsw_sp_nexthop_mac_update(struct mlxsw_sp *mlxsw_sp, u32 adj_index, + struct mlxsw_sp_nexthop *nh) +{ + struct mlxsw_sp_neigh_entry *neigh_entry = nh->neigh_entry; + char ratr_pl[MLXSW_REG_RATR_LEN]; + + mlxsw_reg_ratr_pack(ratr_pl, MLXSW_REG_RATR_OP_WRITE_WRITE_ENTRY, + true, adj_index, neigh_entry->rif); + mlxsw_reg_ratr_eth_entry_pack(ratr_pl, neigh_entry->ha); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ratr), ratr_pl); +} + +static int +mlxsw_sp_nexthop_group_mac_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop_group *nh_grp) +{ + u32 adj_index = nh_grp->adj_index; /* base */ + struct mlxsw_sp_nexthop *nh; + int i; + int err; + + for (i = 0; i < nh_grp->count; i++) { + nh = &nh_grp->nexthops[i]; + + if (!nh->should_offload) { + nh->offloaded = 0; + continue; + } + + if (nh->update) { + err = mlxsw_sp_nexthop_mac_update(mlxsw_sp, + adj_index, nh); + if (err) + return err; + nh->update = 0; + nh->offloaded = 1; + } + adj_index++; + } + return 0; +} + +static int mlxsw_sp_fib_entry_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry); + +static int +mlxsw_sp_nexthop_fib_entries_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop_group *nh_grp) +{ + struct mlxsw_sp_fib_entry *fib_entry; + int err; + + list_for_each_entry(fib_entry, &nh_grp->fib_list, nexthop_group_node) { + err = mlxsw_sp_fib_entry_update(mlxsw_sp, fib_entry); + if (err) + return err; + } + return 0; +} + +static void +mlxsw_sp_nexthop_group_refresh(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop_group *nh_grp) +{ + struct mlxsw_sp_nexthop *nh; + bool offload_change = false; + u32 adj_index; + u16 ecmp_size = 0; + bool old_adj_index_valid; + u32 old_adj_index; + u16 old_ecmp_size; + int ret; + int i; + int err; + + for (i = 0; i < nh_grp->count; i++) { + nh = &nh_grp->nexthops[i]; + + if (nh->should_offload ^ nh->offloaded) { + offload_change = true; + if (nh->should_offload) + nh->update = 1; + } + if (nh->should_offload) + ecmp_size++; + } + if (!offload_change) { + /* Nothing was added or removed, so no need to reallocate. Just + * update MAC on existing adjacency indexes. + */ + err = mlxsw_sp_nexthop_group_mac_update(mlxsw_sp, nh_grp); + if (err) { + dev_warn(mlxsw_sp->bus_info->dev, "Failed to update neigh MAC in adjacency table.\n"); + goto set_trap; + } + return; + } + if (!ecmp_size) + /* No neigh of this group is connected so we just set + * the trap and let everthing flow through kernel. + */ + goto set_trap; + + ret = mlxsw_sp_kvdl_alloc(mlxsw_sp, ecmp_size); + if (ret < 0) { + /* We ran out of KVD linear space, just set the + * trap and let everything flow through kernel. + */ + dev_warn(mlxsw_sp->bus_info->dev, "Failed to allocate KVD linear area for nexthop group.\n"); + goto set_trap; + } + adj_index = ret; + old_adj_index_valid = nh_grp->adj_index_valid; + old_adj_index = nh_grp->adj_index; + old_ecmp_size = nh_grp->ecmp_size; + nh_grp->adj_index_valid = 1; + nh_grp->adj_index = adj_index; + nh_grp->ecmp_size = ecmp_size; + err = mlxsw_sp_nexthop_group_mac_update(mlxsw_sp, nh_grp); + if (err) { + dev_warn(mlxsw_sp->bus_info->dev, "Failed to update neigh MAC in adjacency table.\n"); + goto set_trap; + } + + if (!old_adj_index_valid) { + /* The trap was set for fib entries, so we have to call + * fib entry update to unset it and use adjacency index. + */ + err = mlxsw_sp_nexthop_fib_entries_update(mlxsw_sp, nh_grp); + if (err) { + dev_warn(mlxsw_sp->bus_info->dev, "Failed to add adjacency index to fib entries.\n"); + goto set_trap; + } + return; + } + + err = mlxsw_sp_adj_index_mass_update(mlxsw_sp, nh_grp, + old_adj_index, old_ecmp_size); + mlxsw_sp_kvdl_free(mlxsw_sp, old_adj_index); + if (err) { + dev_warn(mlxsw_sp->bus_info->dev, "Failed to mass-update adjacency index for nexthop group.\n"); + goto set_trap; + } + return; + +set_trap: + old_adj_index_valid = nh_grp->adj_index_valid; + nh_grp->adj_index_valid = 0; + for (i = 0; i < nh_grp->count; i++) { + nh = &nh_grp->nexthops[i]; + nh->offloaded = 0; + } + err = mlxsw_sp_nexthop_fib_entries_update(mlxsw_sp, nh_grp); + if (err) + dev_warn(mlxsw_sp->bus_info->dev, "Failed to set traps for fib entries.\n"); + if (old_adj_index_valid) + mlxsw_sp_kvdl_free(mlxsw_sp, nh_grp->adj_index); +} + +static void __mlxsw_sp_nexthop_neigh_update(struct mlxsw_sp_nexthop *nh, + bool removing) +{ + if (!removing && !nh->should_offload) + nh->should_offload = 1; + else if (removing && nh->offloaded) + nh->should_offload = 0; + nh->update = 1; +} + +static void +mlxsw_sp_nexthop_neigh_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_neigh_entry *neigh_entry, + bool removing) +{ + struct mlxsw_sp_nexthop *nh; + + /* Take RTNL mutex here to prevent lists from changes */ + rtnl_lock(); + list_for_each_entry(nh, &neigh_entry->nexthop_list, + neigh_list_node) { + __mlxsw_sp_nexthop_neigh_update(nh, removing); + mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh->nh_grp); + } + rtnl_unlock(); +} + +static int mlxsw_sp_nexthop_init(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop_group *nh_grp, + struct mlxsw_sp_nexthop *nh, + struct fib_nh *fib_nh) +{ + struct mlxsw_sp_neigh_entry *neigh_entry; + u32 gwip = ntohl(fib_nh->nh_gw); + struct net_device *dev = fib_nh->nh_dev; + struct neighbour *n; + u8 nud_state; + + neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, &gwip, + sizeof(gwip), dev); + if (!neigh_entry) { + __be32 gwipn = htonl(gwip); + + n = neigh_create(&arp_tbl, &gwipn, dev); + if (IS_ERR(n)) + return PTR_ERR(n); + neigh_event_send(n, NULL); + neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, &gwip, + sizeof(gwip), dev); + if (!neigh_entry) { + neigh_release(n); + return -EINVAL; + } + } else { + /* Take a reference of neigh here ensuring that neigh would + * not be detructed before the nexthop entry is finished. + * The second branch takes the reference in neith_create() + */ + n = neigh_entry->n; + neigh_clone(n); + } + nh->nh_grp = nh_grp; + nh->neigh_entry = neigh_entry; + list_add_tail(&nh->neigh_list_node, &neigh_entry->nexthop_list); + read_lock_bh(&n->lock); + nud_state = n->nud_state; + read_unlock_bh(&n->lock); + __mlxsw_sp_nexthop_neigh_update(nh, !(nud_state & NUD_VALID)); + + return 0; +} + +static void mlxsw_sp_nexthop_fini(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh) +{ + struct mlxsw_sp_neigh_entry *neigh_entry = nh->neigh_entry; + + list_del(&nh->neigh_list_node); + neigh_release(neigh_entry->n); +} + +static struct mlxsw_sp_nexthop_group * +mlxsw_sp_nexthop_group_create(struct mlxsw_sp *mlxsw_sp, struct fib_info *fi) +{ + struct mlxsw_sp_nexthop_group *nh_grp; + struct mlxsw_sp_nexthop *nh; + struct fib_nh *fib_nh; + size_t alloc_size; + int i; + int err; + + alloc_size = sizeof(*nh_grp) + + fi->fib_nhs * sizeof(struct mlxsw_sp_nexthop); + nh_grp = kzalloc(alloc_size, GFP_KERNEL); + if (!nh_grp) + return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&nh_grp->fib_list); + nh_grp->count = fi->fib_nhs; + for (i = 0; i < nh_grp->count; i++) { + nh = &nh_grp->nexthops[i]; + fib_nh = &fi->fib_nh[i]; + err = mlxsw_sp_nexthop_init(mlxsw_sp, nh_grp, nh, fib_nh); + if (err) + goto err_nexthop_init; + } + list_add_tail(&nh_grp->list, &mlxsw_sp->router.nexthop_group_list); + mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh_grp); + return nh_grp; + +err_nexthop_init: + for (i--; i >= 0; i--) + mlxsw_sp_nexthop_fini(mlxsw_sp, nh); + kfree(nh_grp); + return ERR_PTR(err); +} + +static void +mlxsw_sp_nexthop_group_destroy(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop_group *nh_grp) +{ + struct mlxsw_sp_nexthop *nh; + int i; + + list_del(&nh_grp->list); + for (i = 0; i < nh_grp->count; i++) { + nh = &nh_grp->nexthops[i]; + mlxsw_sp_nexthop_fini(mlxsw_sp, nh); + } + kfree(nh_grp); +} + +static bool mlxsw_sp_nexthop_match(struct mlxsw_sp_nexthop *nh, + struct fib_info *fi) +{ + int i; + + for (i = 0; i < fi->fib_nhs; i++) { + struct fib_nh *fib_nh = &fi->fib_nh[i]; + u32 gwip = ntohl(fib_nh->nh_gw); + + if (memcmp(nh->neigh_entry->key.addr, + &gwip, sizeof(u32)) == 0 && + nh->neigh_entry->key.dev == fib_nh->nh_dev) + return true; + } + return false; +} + +static bool mlxsw_sp_nexthop_group_match(struct mlxsw_sp_nexthop_group *nh_grp, + struct fib_info *fi) +{ + int i; + + if (nh_grp->count != fi->fib_nhs) + return false; + for (i = 0; i < nh_grp->count; i++) { + struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i]; + + if (!mlxsw_sp_nexthop_match(nh, fi)) + return false; + } + return true; +} + +static struct mlxsw_sp_nexthop_group * +mlxsw_sp_nexthop_group_find(struct mlxsw_sp *mlxsw_sp, struct fib_info *fi) +{ + struct mlxsw_sp_nexthop_group *nh_grp; + + list_for_each_entry(nh_grp, &mlxsw_sp->router.nexthop_group_list, + list) { + if (mlxsw_sp_nexthop_group_match(nh_grp, fi)) + return nh_grp; + } + return NULL; +} + +static int mlxsw_sp_nexthop_group_get(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry, + struct fib_info *fi) +{ + struct mlxsw_sp_nexthop_group *nh_grp; + + nh_grp = mlxsw_sp_nexthop_group_find(mlxsw_sp, fi); + if (!nh_grp) { + nh_grp = mlxsw_sp_nexthop_group_create(mlxsw_sp, fi); + if (IS_ERR(nh_grp)) + return PTR_ERR(nh_grp); + } + list_add_tail(&fib_entry->nexthop_group_node, &nh_grp->fib_list); + fib_entry->nh_group = nh_grp; + return 0; +} + +static void mlxsw_sp_nexthop_group_put(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry) +{ + struct mlxsw_sp_nexthop_group *nh_grp = fib_entry->nh_group; + + list_del(&fib_entry->nexthop_group_node); + if (!list_empty(&nh_grp->fib_list)) + return; + mlxsw_sp_nexthop_group_destroy(mlxsw_sp, nh_grp); +} + static int __mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp) { char rgcr_pl[MLXSW_REG_RGCR_LEN]; @@ -999,6 +1442,7 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp) { int err; + INIT_LIST_HEAD(&mlxsw_sp->router.nexthop_group_list); err = __mlxsw_sp_router_init(mlxsw_sp); if (err) return err; @@ -1013,6 +1457,38 @@ void mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp) __mlxsw_sp_router_fini(mlxsw_sp); } +static int mlxsw_sp_fib_entry_op4_remote(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry, + enum mlxsw_reg_ralue_op op) +{ + char ralue_pl[MLXSW_REG_RALUE_LEN]; + u32 *p_dip = (u32 *) fib_entry->key.addr; + struct mlxsw_sp_vr *vr = fib_entry->vr; + enum mlxsw_reg_ralue_trap_action trap_action; + u16 trap_id = 0; + u32 adjacency_index = 0; + u16 ecmp_size = 0; + + /* In case the nexthop group adjacency index is valid, use it + * with provided ECMP size. Otherwise, setup trap and pass + * traffic to kernel. + */ + if (fib_entry->nh_group->adj_index_valid) { + trap_action = MLXSW_REG_RALUE_TRAP_ACTION_NOP; + adjacency_index = fib_entry->nh_group->adj_index; + ecmp_size = fib_entry->nh_group->ecmp_size; + } else { + trap_action = MLXSW_REG_RALUE_TRAP_ACTION_TRAP; + trap_id = MLXSW_TRAP_ID_RTR_INGRESS0; + } + + mlxsw_reg_ralue_pack4(ralue_pl, vr->proto, op, vr->id, + fib_entry->key.prefix_len, *p_dip); + mlxsw_reg_ralue_act_remote_pack(ralue_pl, trap_action, trap_id, + adjacency_index, ecmp_size); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralue), ralue_pl); +} + static int mlxsw_sp_fib_entry_op4_local(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_fib_entry *fib_entry, enum mlxsw_reg_ralue_op op) @@ -1049,7 +1525,7 @@ static int mlxsw_sp_fib_entry_op4(struct mlxsw_sp *mlxsw_sp, { switch (fib_entry->type) { case MLXSW_SP_FIB_ENTRY_TYPE_REMOTE: - return -EINVAL; + return mlxsw_sp_fib_entry_op4_remote(mlxsw_sp, fib_entry, op); case MLXSW_SP_FIB_ENTRY_TYPE_LOCAL: return mlxsw_sp_fib_entry_op4_local(mlxsw_sp, fib_entry, op); case MLXSW_SP_FIB_ENTRY_TYPE_TRAP: @@ -1129,7 +1605,17 @@ mlxsw_sp_router_fib4_entry_init(struct mlxsw_sp *mlxsw_sp, fib_entry->rif = r->rif; return 0; } - return -EINVAL; + fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_REMOTE; + return mlxsw_sp_nexthop_group_get(mlxsw_sp, fib_entry, fi); +} + +static void +mlxsw_sp_router_fib4_entry_fini(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_fib_entry *fib_entry) +{ + if (fib_entry->type != MLXSW_SP_FIB_ENTRY_TYPE_REMOTE) + return; + mlxsw_sp_nexthop_group_put(mlxsw_sp, fib_entry); } static int @@ -1173,6 +1659,7 @@ mlxsw_sp_router_fib4_add_prepare(struct mlxsw_sp_port *mlxsw_sp_port, return 0; err_alloc_info: + mlxsw_sp_router_fib4_entry_fini(mlxsw_sp, fib_entry); err_fib4_entry_init: mlxsw_sp_fib_entry_destroy(fib_entry); err_fib_entry_create: @@ -1207,6 +1694,7 @@ mlxsw_sp_router_fib4_add_commit(struct mlxsw_sp_port *mlxsw_sp_port, err_fib_entry_add: mlxsw_sp_fib_entry_remove(vr->fib, fib_entry); err_fib_entry_insert: + mlxsw_sp_router_fib4_entry_fini(mlxsw_sp, fib_entry); mlxsw_sp_fib_entry_destroy(fib_entry); mlxsw_sp_vr_put(mlxsw_sp, vr); return err; @@ -1243,6 +1731,7 @@ int mlxsw_sp_router_fib4_del(struct mlxsw_sp_port *mlxsw_sp_port, } mlxsw_sp_fib_entry_del(mlxsw_sp_port->mlxsw_sp, fib_entry); mlxsw_sp_fib_entry_remove(vr->fib, fib_entry); + mlxsw_sp_router_fib4_entry_fini(mlxsw_sp, fib_entry); mlxsw_sp_fib_entry_destroy(fib_entry); mlxsw_sp_vr_put(mlxsw_sp, vr); return 0; From b2157149b0b0f2cdaad9b82ed3bb1b84f82aa7f1 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 5 Jul 2016 11:27:51 +0200 Subject: [PATCH 15/16] mlxsw: spectrum_router: Add the nexthop neigh activity update For nexthop neighbours we need to make kernel to think there is a traffic flowing to them preventing it from going to stale state. Otherwise kernel would stale it and eventually the neigh would be removed from HW and nexthop as well. That would reduce ECMP group in HW. Signed-off-by: Yotam Gigi Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum.h | 1 + .../ethernet/mellanox/mlxsw/spectrum_router.c | 75 +++++++++++++++---- 2 files changed, 61 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 0fe6051ab1959..ff5b8593b21ad 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -223,6 +223,7 @@ struct mlxsw_sp_router { unsigned long interval; /* ms */ } neighs_update; struct list_head nexthop_group_list; + struct list_head nexthop_neighs_list; }; struct mlxsw_sp { diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index dc13178b6f338..2b20279d2d237 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -570,6 +570,7 @@ struct mlxsw_sp_neigh_entry { struct list_head nexthop_list; /* list of nexthops using * this neigh entry */ + struct list_head nexthop_neighs_list_node; }; static const struct rhashtable_params mlxsw_sp_neigh_ht_params = { @@ -770,28 +771,15 @@ static void mlxsw_sp_router_neigh_rec_process(struct mlxsw_sp *mlxsw_sp, } } -static void -mlxsw_sp_router_neighs_update_work_schedule(struct mlxsw_sp *mlxsw_sp) +static int mlxsw_sp_router_neighs_update_rauhtd(struct mlxsw_sp *mlxsw_sp) { - unsigned long interval = mlxsw_sp->router.neighs_update.interval; - - mlxsw_core_schedule_dw(&mlxsw_sp->router.neighs_update.dw, - msecs_to_jiffies(interval)); -} - -static void mlxsw_sp_router_neighs_update_work(struct work_struct *work) -{ - struct mlxsw_sp *mlxsw_sp; char *rauhtd_pl; u8 num_rec; int i, err; rauhtd_pl = kmalloc(MLXSW_REG_RAUHTD_LEN, GFP_KERNEL); if (!rauhtd_pl) - return; - - mlxsw_sp = container_of(work, struct mlxsw_sp, - router.neighs_update.dw.work); + return -ENOMEM; /* Make sure the neighbour's netdev isn't removed in the * process. @@ -813,6 +801,47 @@ static void mlxsw_sp_router_neighs_update_work(struct work_struct *work) rtnl_unlock(); kfree(rauhtd_pl); + return err; +} + +static void mlxsw_sp_router_neighs_update_nh(struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_neigh_entry *neigh_entry; + + /* Take RTNL mutex here to prevent lists from changes */ + rtnl_lock(); + list_for_each_entry(neigh_entry, &mlxsw_sp->router.nexthop_neighs_list, + nexthop_neighs_list_node) { + /* If this neigh have nexthops, make the kernel think this neigh + * is active regardless of the traffic. + */ + if (!list_empty(&neigh_entry->nexthop_list)) + neigh_event_send(neigh_entry->n, NULL); + } + rtnl_unlock(); +} + +static void +mlxsw_sp_router_neighs_update_work_schedule(struct mlxsw_sp *mlxsw_sp) +{ + unsigned long interval = mlxsw_sp->router.neighs_update.interval; + + mlxsw_core_schedule_dw(&mlxsw_sp->router.neighs_update.dw, + msecs_to_jiffies(interval)); +} + +static void mlxsw_sp_router_neighs_update_work(struct work_struct *work) +{ + struct mlxsw_sp *mlxsw_sp = container_of(work, struct mlxsw_sp, + router.neighs_update.dw.work); + int err; + + err = mlxsw_sp_router_neighs_update_rauhtd(mlxsw_sp); + if (err) + dev_err(mlxsw_sp->bus_info->dev, "Could not update kernel for neigh activity"); + + mlxsw_sp_router_neighs_update_nh(mlxsw_sp); + mlxsw_sp_router_neighs_update_work_schedule(mlxsw_sp); } @@ -1277,6 +1306,14 @@ static int mlxsw_sp_nexthop_init(struct mlxsw_sp *mlxsw_sp, n = neigh_entry->n; neigh_clone(n); } + + /* If that is the first nexthop connected to that neigh, add to + * nexthop_neighs_list + */ + if (list_empty(&neigh_entry->nexthop_list)) + list_add_tail(&neigh_entry->nexthop_neighs_list_node, + &mlxsw_sp->router.nexthop_neighs_list); + nh->nh_grp = nh_grp; nh->neigh_entry = neigh_entry; list_add_tail(&nh->neigh_list_node, &neigh_entry->nexthop_list); @@ -1294,6 +1331,13 @@ static void mlxsw_sp_nexthop_fini(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_neigh_entry *neigh_entry = nh->neigh_entry; list_del(&nh->neigh_list_node); + + /* If that is the last nexthop connected to that neigh, remove from + * nexthop_neighs_list + */ + if (list_empty(&nh->neigh_entry->nexthop_list)) + list_del(&nh->neigh_entry->nexthop_neighs_list_node); + neigh_release(neigh_entry->n); } @@ -1442,6 +1486,7 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp) { int err; + INIT_LIST_HEAD(&mlxsw_sp->router.nexthop_neighs_list); INIT_LIST_HEAD(&mlxsw_sp->router.nexthop_group_list); err = __mlxsw_sp_router_init(mlxsw_sp); if (err) From 0b2361d9d946558c90f0ae8b21725022bea9f2cb Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 5 Jul 2016 11:27:52 +0200 Subject: [PATCH 16/16] mlxsw: Add the unresolved next-hops probes Now, the driver sends arp probes for all unresolved neighbours that are currently a nexthop for some route on the system. The job is set periodically every 5 seconds. Signed-off-by: Yotam Gigi Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum.h | 2 ++ .../ethernet/mellanox/mlxsw/spectrum_router.c | 33 ++++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index ff5b8593b21ad..ef4ac8987a2aa 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -222,6 +222,8 @@ struct mlxsw_sp_router { struct delayed_work dw; unsigned long interval; /* ms */ } neighs_update; + struct delayed_work nexthop_probe_dw; +#define MLXSW_SP_UNRESOLVED_NH_PROBE_INTERVAL 5000 /* ms */ struct list_head nexthop_group_list; struct list_head nexthop_neighs_list; }; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 2b20279d2d237..e084ea5448ac5 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -845,6 +845,33 @@ static void mlxsw_sp_router_neighs_update_work(struct work_struct *work) mlxsw_sp_router_neighs_update_work_schedule(mlxsw_sp); } +static void mlxsw_sp_router_probe_unresolved_nexthops(struct work_struct *work) +{ + struct mlxsw_sp_neigh_entry *neigh_entry; + struct mlxsw_sp *mlxsw_sp = container_of(work, struct mlxsw_sp, + router.nexthop_probe_dw.work); + + /* Iterate over nexthop neighbours, find those who are unresolved and + * send arp on them. This solves the chicken-egg problem when + * the nexthop wouldn't get offloaded until the neighbor is resolved + * but it wouldn't get resolved ever in case traffic is flowing in HW + * using different nexthop. + * + * Take RTNL mutex here to prevent lists from changes. + */ + rtnl_lock(); + list_for_each_entry(neigh_entry, &mlxsw_sp->router.nexthop_neighs_list, + nexthop_neighs_list_node) { + if (!(neigh_entry->n->nud_state & NUD_VALID) && + !list_empty(&neigh_entry->nexthop_list)) + neigh_event_send(neigh_entry->n, NULL); + } + rtnl_unlock(); + + mlxsw_core_schedule_dw(&mlxsw_sp->router.nexthop_probe_dw, + MLXSW_SP_UNRESOLVED_NH_PROBE_INTERVAL); +} + static void mlxsw_sp_nexthop_neigh_update(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_neigh_entry *neigh_entry, @@ -1004,10 +1031,13 @@ static int mlxsw_sp_neigh_init(struct mlxsw_sp *mlxsw_sp) if (err) goto err_register_netevent_notifier; + /* Create the delayed works for the activity_update */ INIT_DELAYED_WORK(&mlxsw_sp->router.neighs_update.dw, mlxsw_sp_router_neighs_update_work); + INIT_DELAYED_WORK(&mlxsw_sp->router.nexthop_probe_dw, + mlxsw_sp_router_probe_unresolved_nexthops); mlxsw_core_schedule_dw(&mlxsw_sp->router.neighs_update.dw, 0); - + mlxsw_core_schedule_dw(&mlxsw_sp->router.nexthop_probe_dw, 0); return 0; err_register_netevent_notifier: @@ -1018,6 +1048,7 @@ static int mlxsw_sp_neigh_init(struct mlxsw_sp *mlxsw_sp) static void mlxsw_sp_neigh_fini(struct mlxsw_sp *mlxsw_sp) { cancel_delayed_work_sync(&mlxsw_sp->router.neighs_update.dw); + cancel_delayed_work_sync(&mlxsw_sp->router.nexthop_probe_dw); unregister_netevent_notifier(&mlxsw_sp_router_netevent_nb); rhashtable_destroy(&mlxsw_sp->router.neigh_ht); }