From 0c4f691ff6791e55ac831666df0b49b1679c56e4 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sat, 18 Jul 2015 18:24:48 -0700 Subject: [PATCH 1/5] net: don't reforward packets already forwarded by offload device Just before queuing skb for xmit on port, check if skb has been marked by switchdev port driver as already fordwarded by device. If so, drop skb. A non-zero skb->offload_fwd_mark field is set by the switchdev port driver/device on ingress to indicate the skb has already been forwarded by the device to egress ports with matching dev->skb_mark. The switchdev port driver would assign a non-zero dev->offload_skb_mark for each device port netdev during registration, for example. Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Acked-by: Roopa Prabhu Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++++ include/linux/skbuff.h | 9 ++++++++- net/core/dev.c | 10 ++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 45cfd797eb77e..8364f29e08be2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1456,6 +1456,8 @@ enum netdev_priv_flags { * * @xps_maps: XXX: need comments on this one * + * @offload_fwd_mark: Offload device fwding mark + * * @trans_start: Time (in jiffies) of last Tx * @watchdog_timeo: Represents the timeout that is used by * the watchdog ( see dev_watchdog() ) @@ -1697,6 +1699,10 @@ struct net_device { struct xps_dev_maps __rcu *xps_maps; #endif +#ifdef CONFIG_NET_SWITCHDEV + u32 offload_fwd_mark; +#endif + /* These may be needed for future network-power-down code. */ /* diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d6cdd6e87d53b..af7a09650fa24 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -506,6 +506,7 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1, * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS * @napi_id: id of the NAPI struct this skb came from * @secmark: security marking + * @offload_fwd_mark: fwding offload mark * @mark: Generic packet mark * @vlan_proto: vlan encapsulation protocol * @vlan_tci: vlan tag control information @@ -650,9 +651,15 @@ struct sk_buff { unsigned int sender_cpu; }; #endif + union { #ifdef CONFIG_NETWORK_SECMARK - __u32 secmark; + __u32 secmark; +#endif +#ifdef CONFIG_NET_SWITCHDEV + __u32 offload_fwd_mark; #endif + }; + union { __u32 mark; __u32 reserved_tailroom; diff --git a/net/core/dev.c b/net/core/dev.c index 8810b6bbebfea..2ee15afb412d7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3061,6 +3061,16 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) else skb_dst_force(skb); +#ifdef CONFIG_NET_SWITCHDEV + /* Don't forward if offload device already forwarded */ + if (skb->offload_fwd_mark && + skb->offload_fwd_mark == dev->offload_fwd_mark) { + consume_skb(skb); + rc = NET_XMIT_SUCCESS; + goto out; + } +#endif + txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); From d754f98b502ad9a8c7570d494e1eaa0e6bc0350c Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sat, 18 Jul 2015 18:24:49 -0700 Subject: [PATCH 2/5] net: add phys ID compare helper to test if two IDs are the same Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++++ net/switchdev/switchdev.c | 8 ++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8364f29e08be2..607b5f41f46f9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -766,6 +766,13 @@ struct netdev_phys_item_id { unsigned char id_len; }; +static inline bool netdev_phys_item_id_same(struct netdev_phys_item_id *a, + struct netdev_phys_item_id *b) +{ + return a->id_len == b->id_len && + memcmp(a->id, b->id, a->id_len) == 0; +} + typedef u16 (*select_queue_fallback_t)(struct net_device *dev, struct sk_buff *skb); diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 9f2add3cba26e..4e5bba50ccffa 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -910,13 +910,9 @@ static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) if (switchdev_port_attr_get(dev, &attr)) return NULL; - if (nhsel > 0) { - if (prev_attr.u.ppid.id_len != attr.u.ppid.id_len) + if (nhsel > 0 && + !netdev_phys_item_id_same(&prev_attr.u.ppid, &attr.u.ppid)) return NULL; - if (memcmp(prev_attr.u.ppid.id, attr.u.ppid.id, - attr.u.ppid.id_len)) - return NULL; - } prev_attr = attr; } From 1a3b2ec93d4277b121979321b4024b438cb09504 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sat, 18 Jul 2015 18:24:50 -0700 Subject: [PATCH 3/5] switchdev: add offload_fwd_mark generator helper skb->offload_fwd_mark and dev->offload_fwd_mark are 32-bit and should be unique for device and may even be unique for a sub-set of ports within device, so add switchdev helper function to generate unique marks based on port's switch ID and group_ifindex. group_ifindex would typically be the container dev's ifindex, such as the bridge's ifindex. The generator uses a global hash table to store offload_fwd_marks hashed by {switch ID, group_ifindex} key. Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/switchdev.h | 9 ++++ net/switchdev/switchdev.c | 103 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+) diff --git a/include/net/switchdev.h b/include/net/switchdev.h index d5671f118bfc5..89da8934519bb 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -157,6 +157,9 @@ int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int idx); +void switchdev_port_fwd_mark_set(struct net_device *dev, + struct net_device *group_dev, + bool joining); #else @@ -271,6 +274,12 @@ static inline int switchdev_port_fdb_dump(struct sk_buff *skb, return -EOPNOTSUPP; } +static inline void switchdev_port_fwd_mark_set(struct net_device *dev, + struct net_device *group_dev, + bool joining) +{ +} + #endif #endif /* _LINUX_SWITCHDEV_H_ */ diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 4e5bba50ccffa..33bafa2e703e2 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -1039,3 +1039,106 @@ void switchdev_fib_ipv4_abort(struct fib_info *fi) fi->fib_net->ipv4.fib_offload_disabled = true; } EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort); + +static bool switchdev_port_same_parent_id(struct net_device *a, + struct net_device *b) +{ + struct switchdev_attr a_attr = { + .id = SWITCHDEV_ATTR_PORT_PARENT_ID, + .flags = SWITCHDEV_F_NO_RECURSE, + }; + struct switchdev_attr b_attr = { + .id = SWITCHDEV_ATTR_PORT_PARENT_ID, + .flags = SWITCHDEV_F_NO_RECURSE, + }; + + if (switchdev_port_attr_get(a, &a_attr) || + switchdev_port_attr_get(b, &b_attr)) + return false; + + return netdev_phys_item_id_same(&a_attr.u.ppid, &b_attr.u.ppid); +} + +static u32 switchdev_port_fwd_mark_get(struct net_device *dev, + struct net_device *group_dev) +{ + struct net_device *lower_dev; + struct list_head *iter; + + netdev_for_each_lower_dev(group_dev, lower_dev, iter) { + if (lower_dev == dev) + continue; + if (switchdev_port_same_parent_id(dev, lower_dev)) + return lower_dev->offload_fwd_mark; + return switchdev_port_fwd_mark_get(dev, lower_dev); + } + + return dev->ifindex; +} + +static void switchdev_port_fwd_mark_reset(struct net_device *group_dev, + u32 old_mark, u32 *reset_mark) +{ + struct net_device *lower_dev; + struct list_head *iter; + + netdev_for_each_lower_dev(group_dev, lower_dev, iter) { + if (lower_dev->offload_fwd_mark == old_mark) { + if (!*reset_mark) + *reset_mark = lower_dev->ifindex; + lower_dev->offload_fwd_mark = *reset_mark; + } + switchdev_port_fwd_mark_reset(lower_dev, old_mark, reset_mark); + } +} + +/** + * switchdev_port_fwd_mark_set - Set port offload forwarding mark + * + * @dev: port device + * @group_dev: containing device + * @joining: true if dev is joining group; false if leaving group + * + * An ungrouped port's offload mark is just its ifindex. A grouped + * port's (member of a bridge, for example) offload mark is the ifindex + * of one of the ports in the group with the same parent (switch) ID. + * Ports on the same device in the same group will have the same mark. + * + * Example: + * + * br0 ifindex=9 + * sw1p1 ifindex=2 mark=2 + * sw1p2 ifindex=3 mark=2 + * sw2p1 ifindex=4 mark=5 + * sw2p2 ifindex=5 mark=5 + * + * If sw2p2 leaves the bridge, we'll have: + * + * br0 ifindex=9 + * sw1p1 ifindex=2 mark=2 + * sw1p2 ifindex=3 mark=2 + * sw2p1 ifindex=4 mark=4 + * sw2p2 ifindex=5 mark=5 + */ +void switchdev_port_fwd_mark_set(struct net_device *dev, + struct net_device *group_dev, + bool joining) +{ + u32 mark = dev->ifindex; + u32 reset_mark = 0; + + if (group_dev && joining) { + mark = switchdev_port_fwd_mark_get(dev, group_dev); + } else if (group_dev && !joining) { + if (dev->offload_fwd_mark == mark) + /* Ohoh, this port was the mark reference port, + * but it's leaving the group, so reset the + * mark for the remaining ports in the group. + */ + switchdev_port_fwd_mark_reset(group_dev, mark, + &reset_mark); + } + + dev->offload_fwd_mark = mark; +} +EXPORT_SYMBOL_GPL(switchdev_port_fwd_mark_set); From 3f98a8e636757ce404f305d65dc93e9366112886 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sat, 18 Jul 2015 18:24:51 -0700 Subject: [PATCH 4/5] rocker: add offload_fwd_mark support If device flags ingress packet as "fwd offload", mark the skb->offlaod_fwd_mark using the ingress port's dev->offlaod_fwd_mark. This will be the hint to the kernel that this packet has already been forwarded by device to egress ports matching skb->offlaod_fwd_mark. For rocker, derive port dev->offlaod_fwd_mark based on device switch ID and port ifindex. If port is bridged, use the bridge ifindex rather than the port ifindex. Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/rocker/rocker.c | 11 +++++++++++ drivers/net/ethernet/rocker/rocker.h | 1 + 2 files changed, 12 insertions(+) diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index 4ccde93cd07a0..7b4c3474acfe8 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -4822,6 +4822,7 @@ static int rocker_port_rx_proc(const struct rocker *rocker, const struct rocker_tlv *attrs[ROCKER_TLV_RX_MAX + 1]; struct sk_buff *skb = rocker_desc_cookie_ptr_get(desc_info); size_t rx_len; + u16 rx_flags = 0; if (!skb) return -ENOENT; @@ -4829,6 +4830,8 @@ static int rocker_port_rx_proc(const struct rocker *rocker, rocker_tlv_parse_desc(attrs, ROCKER_TLV_RX_MAX, desc_info); if (!attrs[ROCKER_TLV_RX_FRAG_LEN]) return -EINVAL; + if (attrs[ROCKER_TLV_RX_FLAGS]) + rx_flags = rocker_tlv_get_u16(attrs[ROCKER_TLV_RX_FLAGS]); rocker_dma_rx_ring_skb_unmap(rocker, attrs); @@ -4836,6 +4839,9 @@ static int rocker_port_rx_proc(const struct rocker *rocker, skb_put(skb, rx_len); skb->protocol = eth_type_trans(skb, rocker_port->dev); + if (rx_flags & ROCKER_RX_FLAGS_FWD_OFFLOAD) + skb->offload_fwd_mark = rocker_port->dev->offload_fwd_mark; + rocker_port->dev->stats.rx_packets++; rocker_port->dev->stats.rx_bytes += skb->len; @@ -4973,6 +4979,8 @@ static int rocker_probe_port(struct rocker *rocker, unsigned int port_number) } rocker->ports[port_number] = rocker_port; + switchdev_port_fwd_mark_set(rocker_port->dev, NULL, false); + rocker_port_set_learning(rocker_port, SWITCHDEV_TRANS_NONE); err = rocker_port_ig_tbl(rocker_port, SWITCHDEV_TRANS_NONE, 0); @@ -5252,6 +5260,7 @@ static int rocker_port_bridge_join(struct rocker_port *rocker_port, rocker_port_internal_vlan_id_get(rocker_port, bridge->ifindex); rocker_port->bridge_dev = bridge; + switchdev_port_fwd_mark_set(rocker_port->dev, bridge, true); return rocker_port_vlan_add(rocker_port, SWITCHDEV_TRANS_NONE, untagged_vid, 0); @@ -5272,6 +5281,8 @@ static int rocker_port_bridge_leave(struct rocker_port *rocker_port) rocker_port_internal_vlan_id_get(rocker_port, rocker_port->dev->ifindex); + switchdev_port_fwd_mark_set(rocker_port->dev, rocker_port->bridge_dev, + false); rocker_port->bridge_dev = NULL; err = rocker_port_vlan_add(rocker_port, SWITCHDEV_TRANS_NONE, diff --git a/drivers/net/ethernet/rocker/rocker.h b/drivers/net/ethernet/rocker/rocker.h index 08b2c3d961887..12490b2f65040 100644 --- a/drivers/net/ethernet/rocker/rocker.h +++ b/drivers/net/ethernet/rocker/rocker.h @@ -246,6 +246,7 @@ enum { #define ROCKER_RX_FLAGS_TCP BIT(5) #define ROCKER_RX_FLAGS_UDP BIT(6) #define ROCKER_RX_FLAGS_TCP_UDP_CSUM_GOOD BIT(7) +#define ROCKER_RX_FLAGS_FWD_OFFLOAD BIT(8) enum { ROCKER_TLV_TX_UNSPEC, From a48037e7c6c25436912f78f48cdbb75a710b7aa9 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sat, 18 Jul 2015 18:24:52 -0700 Subject: [PATCH 5/5] switchdev: update documentation for offload_fwd_mark Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/switchdev.txt | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt index c5d7ade10ff21..9825f32a86349 100644 --- a/Documentation/networking/switchdev.txt +++ b/Documentation/networking/switchdev.txt @@ -279,8 +279,18 @@ and unknown unicast packets to all ports in domain, if allowed by port's current STP state. The switch driver, knowing which ports are within which vlan L2 domain, can program the switch device for flooding. The packet should also be sent to the port netdev for processing by the bridge driver. The -bridge should not reflood the packet to the same ports the device flooded. -XXX: the mechanism to avoid duplicate flood packets is being discuseed. +bridge should not reflood the packet to the same ports the device flooded, +otherwise there will be duplicate packets on the wire. + +To avoid duplicate packets, the device/driver should mark a packet as already +forwarded using skb->offload_fwd_mark. The same mark is set on the device +ports in the domain using dev->offload_fwd_mark. If the skb->offload_fwd_mark +is non-zero and matches the forwarding egress port's dev->skb_mark, the kernel +will drop the skb right before transmit on the egress port, with the +understanding that the device already forwarded the packet on same egress port. +The driver can use switchdev_port_fwd_mark_set() to set a globally unique mark +for port's dev->offload_fwd_mark, based on the port's parent ID (switch ID) and +a group ifindex. It is possible for the switch device to not handle flooding and push the packets up to the bridge driver for flooding. This is not ideal as the number