Skip to content

Commit

Permalink
Merge branch 'fib-notifier-event-replay'
Browse files Browse the repository at this point in the history
Jiri Pirko says:

====================
ipv4: fib: Replay events when registering FIB notifier

Ido says:

In kernel 4.9 the switchdev-specific FIB offload mechanism was replaced
by a new FIB notification chain to which modules could register in order
to be notified about the addition and deletion of FIB entries. The
motivation for this change was that switchdev drivers need to be able to
reflect the entire FIB table and not only FIBs configured on top of the
port netdevs themselves. This is useful in case of in-band management.

The fundamental problem with this approach is that upon registration
listeners lose all the information previously sent in the chain and
thus have an incomplete view of the FIB tables, which can result in
packet loss. This patchset fixes that by dumping the FIB tables and
replaying notifications previously sent in the chain for the registered
notification block.

The entire dump process is done under RCU and thus the FIB notification
chain is converted to be atomic. The listeners are modified accordingly.
This is done in the first eight patches.

The ninth patch adds a change sequence counter to ensure the integrity
of the FIB dump. The last patch adds the dump itself to the FIB chain
registration function and modifies existing listeners to pass a callback
to be executed in case the dump was inconsistent.

---
v3->v4:
- Register the notification block after the dump and protect it using
  the change sequence counter (Hannes Frederic Sowa).
- Since we now integrate the dump into the registration function, drop
  the sysctl to set maximum number of retries and instead set it to a
  fixed number. Let's see if it's really a problem before adding something
  we can never remove.
- For the same reason, dump FIB tables for all net namespaces.
- Add a comment regarding guarantees provided by mutex semantics.

v2->v3:
- Add sysctl to set the number of FIB dump retries (Hannes Frederic Sowa).
- Read the sequence counter under RTNL to ensure synchronization
  between the dump process and other processes changing the routing
  tables (Hannes Frederic Sowa).
- Pass a callback to the dump function to be executed prior to a retry.
- Limit the dump to a single net namespace.

v1->v2:
- Add a sequence counter to ensure the integrity of the FIB dump
  (David S. Miller, Hannes Frederic Sowa).
- Protect notifications from re-ordering in listeners by using an
  ordered workqueue (Hannes Frederic Sowa).
- Introduce fib_info_hold() (Jiri Pirko).
- Relieve rocker from the need to invoke the FIB dump by registering
  to the FIB notification chain prior to ports creation.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Dec 4, 2016
2 parents 548ed72 + c3852ef commit 6924871
Show file tree
Hide file tree
Showing 11 changed files with 342 additions and 29 deletions.
22 changes: 22 additions & 0 deletions drivers/net/ethernet/mellanox/mlxsw/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ static const char mlxsw_core_driver_name[] = "mlxsw_core";
static struct dentry *mlxsw_core_dbg_root;

static struct workqueue_struct *mlxsw_wq;
static struct workqueue_struct *mlxsw_owq;

struct mlxsw_core_pcpu_stats {
u64 trap_rx_packets[MLXSW_TRAP_ID_MAX];
Expand Down Expand Up @@ -1900,13 +1901,31 @@ int mlxsw_core_schedule_dw(struct delayed_work *dwork, unsigned long delay)
}
EXPORT_SYMBOL(mlxsw_core_schedule_dw);

/* Queue @dwork on mlxsw_owq, the driver's ordered workqueue.
 * @delay is passed through unchanged to queue_delayed_work(), whose
 * result is returned to the caller.
 */
int mlxsw_core_schedule_odw(struct delayed_work *dwork, unsigned long delay)
{
return queue_delayed_work(mlxsw_owq, dwork, delay);
}
EXPORT_SYMBOL(mlxsw_core_schedule_odw);

/* Flush mlxsw_owq, the ordered workqueue that mlxsw_core_schedule_odw()
 * queues work on.
 */
void mlxsw_core_flush_owq(void)
{
flush_workqueue(mlxsw_owq);
}
EXPORT_SYMBOL(mlxsw_core_flush_owq);

static int __init mlxsw_core_module_init(void)
{
int err;

mlxsw_wq = alloc_workqueue(mlxsw_core_driver_name, WQ_MEM_RECLAIM, 0);
if (!mlxsw_wq)
return -ENOMEM;
mlxsw_owq = alloc_ordered_workqueue("%s_ordered", WQ_MEM_RECLAIM,
mlxsw_core_driver_name);
if (!mlxsw_owq) {
err = -ENOMEM;
goto err_alloc_ordered_workqueue;
}
mlxsw_core_dbg_root = debugfs_create_dir(mlxsw_core_driver_name, NULL);
if (!mlxsw_core_dbg_root) {
err = -ENOMEM;
Expand All @@ -1915,13 +1934,16 @@ static int __init mlxsw_core_module_init(void)
return 0;

err_debugfs_create_dir:
destroy_workqueue(mlxsw_owq);
err_alloc_ordered_workqueue:
destroy_workqueue(mlxsw_wq);
return err;
}

/* Module exit: remove the debugfs directory, then destroy the ordered
 * workqueue and finally the regular workqueue, i.e. the reverse of the
 * order in which mlxsw_core_module_init() creates them.
 */
static void __exit mlxsw_core_module_exit(void)
{
debugfs_remove_recursive(mlxsw_core_dbg_root);
destroy_workqueue(mlxsw_owq);
destroy_workqueue(mlxsw_wq);
}

Expand Down
2 changes: 2 additions & 0 deletions drivers/net/ethernet/mellanox/mlxsw/core.h
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,8 @@ enum devlink_port_type mlxsw_core_port_type_get(struct mlxsw_core *mlxsw_core,
u8 local_port);

int mlxsw_core_schedule_dw(struct delayed_work *dwork, unsigned long delay);
int mlxsw_core_schedule_odw(struct delayed_work *dwork, unsigned long delay);
void mlxsw_core_flush_owq(void);

#define MLXSW_CONFIG_PROFILE_SWID_COUNT 8

Expand Down
92 changes: 81 additions & 11 deletions drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
Original file line number Diff line number Diff line change
Expand Up @@ -593,6 +593,14 @@ static void mlxsw_sp_router_fib_flush(struct mlxsw_sp *mlxsw_sp);

/* Drain the ordered workqueue, flush the device's FIB tables and then
 * free mlxsw_sp->router.vrs, in that order.
 */
static void mlxsw_sp_vrs_fini(struct mlxsw_sp *mlxsw_sp)
{
/* At this stage we're guaranteed not to have new incoming
 * FIB notifications and the work queue is free from FIBs
 * sitting on top of mlxsw netdevs. However, we can still
 * have other FIBs queued. Flush the queue before flushing
 * the device's tables. No need for locks, as we're the only
 * writer.
 */
mlxsw_core_flush_owq();
mlxsw_sp_router_fib_flush(mlxsw_sp);
kfree(mlxsw_sp->router.vrs);
}
Expand Down Expand Up @@ -1948,33 +1956,89 @@ static void __mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp)
kfree(mlxsw_sp->rifs);
}

static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
unsigned long event, void *ptr)
/* One deferred FIB notification: allocated and filled in by
 * mlxsw_sp_router_fib_event(), consumed and freed by
 * mlxsw_sp_router_fib_event_work().
 */
struct mlxsw_sp_fib_event_work {
/* Work item queued on the ordered workqueue */
struct delayed_work dw;
/* Copy of the notifier payload; only filled in for
 * FIB_EVENT_ENTRY_ADD / FIB_EVENT_ENTRY_DEL, zeroed otherwise
 */
struct fib_entry_notifier_info fen_info;
/* Device instance that owns the notifier block */
struct mlxsw_sp *mlxsw_sp;
/* Event value received by the notifier callback */
unsigned long event;
};

/* Process one deferred FIB notification.
 *
 * Runs from the ordered workqueue on a work item that was allocated and
 * queued by mlxsw_sp_router_fib_event(). For entry add/del events the
 * fib_info reference taken when the work was queued is dropped here. The
 * work item itself is always freed before returning.
 */
static void mlxsw_sp_router_fib_event_work(struct work_struct *work)
{
	struct mlxsw_sp_fib_event_work *fib_work =
		container_of(work, struct mlxsw_sp_fib_event_work, dw.work);
	struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp;
	int err;

	/* Protect internal structures from changes */
	rtnl_lock();
	switch (fib_work->event) {
	case FIB_EVENT_ENTRY_ADD:
		err = mlxsw_sp_router_fib4_add(mlxsw_sp, &fib_work->fen_info);
		if (err)
			mlxsw_sp_router_fib4_abort(mlxsw_sp);
		/* Drop the reference whether or not the add succeeded */
		fib_info_put(fib_work->fen_info.fi);
		break;
	case FIB_EVENT_ENTRY_DEL:
		mlxsw_sp_router_fib4_del(mlxsw_sp, &fib_work->fen_info);
		fib_info_put(fib_work->fen_info.fi);
		break;
	case FIB_EVENT_RULE_ADD: /* fall through */
	case FIB_EVENT_RULE_DEL:
		mlxsw_sp_router_fib4_abort(mlxsw_sp);
		break;
	}
	rtnl_unlock();
	kfree(fib_work);
}

/* FIB notifier callback. Called with rcu_read_lock().
 *
 * Notifications for namespaces other than init_net are ignored. Any other
 * notification is snapshotted into a freshly allocated work item and handed
 * to the ordered workqueue, where mlxsw_sp_router_fib_event_work() handles
 * it.
 *
 * Returns NOTIFY_DONE, or NOTIFY_BAD if the work item cannot be allocated.
 */
static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
				     unsigned long event, void *ptr)
{
	struct fib_notifier_info *info = ptr;
	struct mlxsw_sp_fib_event_work *fib_work;

	if (!net_eq(info->net, &init_net))
		return NOTIFY_DONE;

	fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
	if (WARN_ON(!fib_work))
		return NOTIFY_BAD;

	fib_work->event = event;
	fib_work->mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb);
	INIT_DELAYED_WORK(&fib_work->dw, mlxsw_sp_router_fib_event_work);

	if (event == FIB_EVENT_ENTRY_ADD || event == FIB_EVENT_ENTRY_DEL) {
		memcpy(&fib_work->fen_info, ptr, sizeof(fib_work->fen_info));
		/* Hold a reference on the fib_info so that it cannot be
		 * freed while the work is queued. The work handler
		 * releases it.
		 */
		fib_info_hold(fib_work->fen_info.fi);
	}

	mlxsw_core_schedule_odw(&fib_work->dw, 0);

	return NOTIFY_DONE;
}

/* Callback handed to register_fib_notifier(): drains queued FIB work and
 * then clears the device's FIB tables.
 */
static void mlxsw_sp_router_fib_dump_flush(struct notifier_block *nb)
{
	struct mlxsw_sp *mlxsw_sp;

	/* Pending FIB notifications must be flushed first, followed by the
	 * device's table, before another dump is requested. RTNL is not
	 * needed because the FIB notification block is unregistered.
	 */
	mlxsw_core_flush_owq();

	mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb);
	mlxsw_sp_router_fib_flush(mlxsw_sp);
}

int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
{
int err;
Expand All @@ -1995,9 +2059,15 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
goto err_neigh_init;

mlxsw_sp->fib_nb.notifier_call = mlxsw_sp_router_fib_event;
register_fib_notifier(&mlxsw_sp->fib_nb);
err = register_fib_notifier(&mlxsw_sp->fib_nb,
mlxsw_sp_router_fib_dump_flush);
if (err)
goto err_register_fib_notifier;

return 0;

err_register_fib_notifier:
mlxsw_sp_neigh_fini(mlxsw_sp);
err_neigh_init:
mlxsw_sp_vrs_fini(mlxsw_sp);
err_vrs_init:
Expand Down
1 change: 1 addition & 0 deletions drivers/net/ethernet/rocker/rocker.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ struct rocker {
struct rocker_dma_ring_info event_ring;
struct notifier_block fib_nb;
struct rocker_world_ops *wops;
struct workqueue_struct *rocker_owq;
void *wpriv;
};

Expand Down
84 changes: 72 additions & 12 deletions drivers/net/ethernet/rocker/rocker_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include <linux/if_bridge.h>
#include <linux/bitops.h>
#include <linux/ctype.h>
#include <linux/workqueue.h>
#include <net/switchdev.h>
#include <net/rtnetlink.h>
#include <net/netevent.h>
Expand Down Expand Up @@ -2165,28 +2166,70 @@ static const struct switchdev_ops rocker_port_switchdev_ops = {
.switchdev_port_obj_dump = rocker_port_obj_dump,
};

static int rocker_router_fib_event(struct notifier_block *nb,
unsigned long event, void *ptr)
/* One deferred FIB notification: allocated and filled in by
 * rocker_router_fib_event(), consumed and freed by
 * rocker_router_fib_event_work().
 */
struct rocker_fib_event_work {
/* Work item queued on rocker->rocker_owq */
struct work_struct work;
/* Copy of the notifier payload; only filled in for
 * FIB_EVENT_ENTRY_ADD / FIB_EVENT_ENTRY_DEL, zeroed otherwise
 */
struct fib_entry_notifier_info fen_info;
/* Device instance that owns the notifier block */
struct rocker *rocker;
/* Event value received by the notifier callback */
unsigned long event;
};

/* Process one deferred FIB notification.
 *
 * Runs from rocker->rocker_owq on a work item that was allocated and queued
 * by rocker_router_fib_event(). For entry add/del events the fib_info
 * reference taken when the work was queued is dropped here. The work item
 * itself is always freed before returning.
 */
static void rocker_router_fib_event_work(struct work_struct *work)
{
	struct rocker_fib_event_work *fib_work =
		container_of(work, struct rocker_fib_event_work, work);
	struct rocker *rocker = fib_work->rocker;
	int err;

	/* Protect internal structures from changes */
	rtnl_lock();
	switch (fib_work->event) {
	case FIB_EVENT_ENTRY_ADD:
		err = rocker_world_fib4_add(rocker, &fib_work->fen_info);
		if (err)
			rocker_world_fib4_abort(rocker);
		/* The notifier took this reference unconditionally, so it
		 * must be released on the failure path as well; otherwise
		 * the fib_info is leaked whenever the add fails.
		 */
		fib_info_put(fib_work->fen_info.fi);
		break;
	case FIB_EVENT_ENTRY_DEL:
		rocker_world_fib4_del(rocker, &fib_work->fen_info);
		fib_info_put(fib_work->fen_info.fi);
		break;
	case FIB_EVENT_RULE_ADD: /* fall through */
	case FIB_EVENT_RULE_DEL:
		rocker_world_fib4_abort(rocker);
		break;
	}
	rtnl_unlock();
	kfree(fib_work);
}

/* FIB notifier callback. Called with rcu_read_lock().
 *
 * The notification is snapshotted into a freshly allocated work item and
 * queued on rocker->rocker_owq, where rocker_router_fib_event_work()
 * handles it.
 *
 * Returns NOTIFY_DONE, or NOTIFY_BAD if the work item cannot be allocated.
 */
static int rocker_router_fib_event(struct notifier_block *nb,
				   unsigned long event, void *ptr)
{
	struct rocker_fib_event_work *fib_work;
	struct rocker *rocker;

	fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
	if (WARN_ON(!fib_work))
		return NOTIFY_BAD;

	rocker = container_of(nb, struct rocker, fib_nb);
	fib_work->event = event;
	fib_work->rocker = rocker;
	INIT_WORK(&fib_work->work, rocker_router_fib_event_work);

	if (event == FIB_EVENT_ENTRY_ADD || event == FIB_EVENT_ENTRY_DEL) {
		memcpy(&fib_work->fen_info, ptr, sizeof(fib_work->fen_info));
		/* Hold a reference on the fib_info so that it cannot be
		 * freed while the work is queued. The work handler
		 * releases it.
		 */
		fib_info_hold(fib_work->fen_info.fi);
	}

	queue_work(rocker->rocker_owq, &fib_work->work);

	return NOTIFY_DONE;
}

Expand Down Expand Up @@ -2754,6 +2797,21 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
goto err_request_event_irq;
}

rocker->rocker_owq = alloc_ordered_workqueue(rocker_driver_name,
WQ_MEM_RECLAIM);
if (!rocker->rocker_owq) {
err = -ENOMEM;
goto err_alloc_ordered_workqueue;
}

/* Only FIBs pointing to our own netdevs are programmed into
* the device, so no need to pass a callback.
*/
rocker->fib_nb.notifier_call = rocker_router_fib_event;
err = register_fib_notifier(&rocker->fib_nb, NULL);
if (err)
goto err_register_fib_notifier;

rocker->hw.id = rocker_read64(rocker, SWITCH_ID);

err = rocker_probe_ports(rocker);
Expand All @@ -2762,15 +2820,16 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
goto err_probe_ports;
}

rocker->fib_nb.notifier_call = rocker_router_fib_event;
register_fib_notifier(&rocker->fib_nb);

dev_info(&pdev->dev, "Rocker switch with id %*phN\n",
(int)sizeof(rocker->hw.id), &rocker->hw.id);

return 0;

err_probe_ports:
unregister_fib_notifier(&rocker->fib_nb);
err_register_fib_notifier:
destroy_workqueue(rocker->rocker_owq);
err_alloc_ordered_workqueue:
free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker);
err_request_event_irq:
free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_CMD), rocker);
Expand All @@ -2796,9 +2855,10 @@ static void rocker_remove(struct pci_dev *pdev)
{
struct rocker *rocker = pci_get_drvdata(pdev);

rocker_remove_ports(rocker);
unregister_fib_notifier(&rocker->fib_nb);
rocker_write32(rocker, CONTROL, ROCKER_CONTROL_RESET);
rocker_remove_ports(rocker);
destroy_workqueue(rocker->rocker_owq);
free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker);
free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_CMD), rocker);
rocker_dma_rings_fini(rocker);
Expand Down
1 change: 1 addition & 0 deletions drivers/net/ethernet/rocker/rocker_ofdpa.c
Original file line number Diff line number Diff line change
Expand Up @@ -2516,6 +2516,7 @@ static void ofdpa_fini(struct rocker *rocker)
int bkt;

del_timer_sync(&ofdpa->fdb_cleanup_timer);
flush_workqueue(rocker->rocker_owq);

spin_lock_irqsave(&ofdpa->flow_tbl_lock, flags);
hash_for_each_safe(ofdpa->flow_tbl, bkt, tmp, flow_entry, entry)
Expand Down
8 changes: 7 additions & 1 deletion include/net/ip_fib.h
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,8 @@ enum fib_event_type {
FIB_EVENT_RULE_DEL,
};

int register_fib_notifier(struct notifier_block *nb);
int register_fib_notifier(struct notifier_block *nb,
void (*cb)(struct notifier_block *nb));
int unregister_fib_notifier(struct notifier_block *nb);
int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
struct fib_notifier_info *info);
Expand Down Expand Up @@ -397,6 +398,11 @@ static inline void fib_combine_itag(u32 *itag, const struct fib_result *res)

void free_fib_info(struct fib_info *fi);

/* Take a reference on @fi by incrementing fi->fib_clntref.
 * The matching release is fib_info_put(), which decrements the same counter.
 */
static inline void fib_info_hold(struct fib_info *fi)
{
atomic_inc(&fi->fib_clntref);
}

static inline void fib_info_put(struct fib_info *fi)
{
if (atomic_dec_and_test(&fi->fib_clntref))
Expand Down
3 changes: 3 additions & 0 deletions include/net/netns/ipv4.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,9 @@ struct netns_ipv4 {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
int sysctl_fib_multipath_use_neigh;
#endif

unsigned int fib_seq; /* protected by rtnl_mutex */

atomic_t rt_genid;
};
#endif
2 changes: 2 additions & 0 deletions net/ipv4/fib_frontend.c
Original file line number Diff line number Diff line change
Expand Up @@ -1219,6 +1219,8 @@ static int __net_init ip_fib_net_init(struct net *net)
int err;
size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;

net->ipv4.fib_seq = 0;

/* Avoid false sharing : Use at least a full cache line */
size = max_t(size_t, size, L1_CACHE_BYTES);

Expand Down
Loading

0 comments on commit 6924871

Please sign in to comment.