From 073ff3c8e6acdd6bae91a037e6f2d0edeed4165d Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Sun, 11 Dec 2016 12:15:08 +0200 Subject: [PATCH 01/12] net/mlx5: Use exact encap header size for the FW input buffer The current code is allocating the max encap size supported by the firmware and not the size requested by the caller, fix that. Also, spare a warning when the size of the encapsulation headers is bigger from what is supported by the firmware. Signed-off-by: Or Gerlitz Reviewed-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index c4478ecd8056e..b5253b59e8fe1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -473,10 +473,13 @@ int mlx5_encap_alloc(struct mlx5_core_dev *dev, int err; u32 *in; - if (size > MLX5_CAP_ESW(dev, max_encap_header_size)) + if (size > max_encap_size) { + mlx5_core_warn(dev, "encap size %zd too big, max supported is %d\n", + size, max_encap_size); return -EINVAL; + } - in = kzalloc(MLX5_ST_SZ_BYTES(alloc_encap_header_in) + max_encap_size, + in = kzalloc(MLX5_ST_SZ_BYTES(alloc_encap_header_in) + size, GFP_KERNEL); if (!in) return -ENOMEM; From 19f4440141af8cd9b2f280ec38476baa86dc87f9 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Sun, 11 Dec 2016 12:20:53 +0200 Subject: [PATCH 02/12] net/mlx5e: Add TC offloads matching on IPv6 encapsulation headers Enhance the parsing of offloaded TC rules to set HW matching on outer IPv6 encapsulation headers. This effectively adds support for TC tunnel key release action (decapsulation) of SRIOV offloads over IPv6 tunnels. Signed-off-by: Or Gerlitz Reviewed-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_tc.c | 30 +++++++++++++++++-- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index d4af5507679f4..e9f0854e9509a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -298,6 +298,32 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv, MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ethertype); MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, ETH_P_IP); + } else if (enc_control->addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + struct flow_dissector_key_ipv6_addrs *key = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, + f->key); + struct flow_dissector_key_ipv6_addrs *mask = + skb_flow_dissector_target(f->dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, + f->mask); + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &mask->src, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &key->src, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6)); + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &mask->dst, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &key->dst, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6)); + + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ethertype); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, ETH_P_IPV6); } /* Enforce DMAC when offloading incoming tunneled flows. @@ -358,12 +384,10 @@ static int __parse_cls_flower(struct mlx5e_priv *priv, f->key); switch (key->addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: if (parse_tunnel_attr(priv, spec, f)) return -EOPNOTSUPP; break; - case FLOW_DISSECTOR_KEY_IPV6_ADDRS: - netdev_warn(priv->netdev, - "IPv6 tunnel decap offload isn't supported\n"); default: return -EOPNOTSUPP; } From 75c33da827365afa6f3d708ad1f7abe18e0ba4a3 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Wed, 21 Dec 2016 17:31:18 +0200 Subject: [PATCH 03/12] net/mlx5e: TC ipv4 tunnel encap offload cosmetic changes Move around some settings of variables as pre-step to make things more robust and clear for the ipv6 case in down-stream patch. This patch doesn't change any functionality. Signed-off-by: Or Gerlitz Reviewed-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index e9f0854e9509a..a515444d24890 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -689,7 +689,6 @@ static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv, { struct rtable *rt; struct neighbour *n = NULL; - int ttl; #if IS_ENABLED(CONFIG_INET) int ret; @@ -708,7 +707,7 @@ static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv, return -EOPNOTSUPP; } - ttl = ip4_dst_hoplimit(&rt->dst); + *out_ttl = ip4_dst_hoplimit(&rt->dst); n = dst_neigh_lookup(&rt->dst, &fl4->daddr); ip_rt_put(rt); if (!n) @@ -716,7 +715,6 @@ static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv, *out_n = n; *saddr = fl4->saddr; - *out_ttl = ttl; *out_dev = rt->dst.dev; return 0; @@ -792,15 +790,15 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, if (err) goto out; - e->n = n; - e->out_dev = *out_dev; - if (!(n->nud_state & NUD_VALID)) { pr_warn("%s: can't offload, neighbour to %pI4 invalid\n", __func__, &fl4.daddr); err = -EOPNOTSUPP; goto out; } + e->n = n; + e->out_dev = *out_dev; + neigh_ha_snapshot(e->h_dest, n, *out_dev); switch (e->tunnel_type) { From 76f7444dd5a4349af40e4c67e4b995d4ae3c5c92 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 5 Jan 2017 16:43:29 +0200 Subject: [PATCH 04/12] net/mlx5e: Use the full tunnel key info for encapsulation offload house-keeping Currently we use subset of the input tunnel key fields (id, ip daddr, dst port) which are provided by upper layers to indentify flows that should go through the same encapsulation and maintain the HW encapsulation table. This is redundant and can get us wrong. Instead, keep a copy of the ip tunnel info provided by the user through TC and have the tunnel key part as the key to our internal hash. Signed-off-by: Or Gerlitz Reviewed-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_tc.c | 38 ++++++++----------- .../net/ethernet/mellanox/mlx5/core/eswitch.h | 9 +---- 2 files changed, 17 insertions(+), 30 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index a515444d24890..477b2796c12e0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -668,15 +668,15 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts, return 0; } -static inline int cmp_encap_info(struct mlx5_encap_info *a, - struct mlx5_encap_info *b) +static inline int cmp_encap_info(struct ip_tunnel_key *a, + struct ip_tunnel_key *b) { return memcmp(a, b, sizeof(*a)); } -static inline int hash_encap_info(struct mlx5_encap_info *info) +static inline int hash_encap_info(struct ip_tunnel_key *key) { - return jhash(info, sizeof(*info), 0); + return jhash(key, sizeof(*key), 0); } static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv, @@ -762,6 +762,7 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, struct net_device **out_dev) { int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size); + struct ip_tunnel_key *tun_key = &e->tun_info.key; struct neighbour *n = NULL; struct flowi4 fl4 = {}; char *encap_header; @@ -777,13 +778,13 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, switch (e->tunnel_type) { case MLX5_HEADER_TYPE_VXLAN: fl4.flowi4_proto = IPPROTO_UDP; - fl4.fl4_dport = e->tun_info.tp_dst; + fl4.fl4_dport = tun_key->tp_dst; break; default: err = -EOPNOTSUPP; goto out; } - fl4.daddr = e->tun_info.daddr; + fl4.daddr = tun_key->u.ipv4.dst; err = mlx5e_route_lookup_ipv4(priv, mirred_dev, out_dev, &fl4, &n, &saddr, &ttl); @@ -805,9 +806,9 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, case MLX5_HEADER_TYPE_VXLAN: encap_size = gen_vxlan_header_ipv4(*out_dev, encap_header, e->h_dest, ttl, - e->tun_info.daddr, - saddr, e->tun_info.tp_dst, - e->tun_info.tun_id); + tun_key->u.ipv4.dst, + saddr, tun_key->tp_dst, + tunnel_id_to_key32(tun_key->tun_id)); break; default: err = -EOPNOTSUPP; @@ -831,13 +832,11 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv, struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; unsigned short family = ip_tunnel_info_af(tun_info); struct ip_tunnel_key *key = &tun_info->key; - struct mlx5_encap_info info; struct mlx5_encap_entry *e; struct net_device *out_dev; + int tunnel_type, err; uintptr_t hash_key; bool found = false; - int tunnel_type; - int err; /* udp dst port must be set */ if (!memchr_inv(&key->tp_dst, 0, sizeof(key->tp_dst))) @@ -853,8 +852,6 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv, if (mlx5e_vxlan_lookup_port(priv, be16_to_cpu(key->tp_dst)) && MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) { - info.tp_dst = key->tp_dst; - info.tun_id = tunnel_id_to_key32(key->tun_id); tunnel_type = MLX5_HEADER_TYPE_VXLAN; } else { netdev_warn(priv->netdev, @@ -862,22 +859,17 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv, return -EOPNOTSUPP; } - switch (family) { - case AF_INET: - info.daddr = key->u.ipv4.dst; - break; - case AF_INET6: + if (family == AF_INET6) { netdev_warn(priv->netdev, "IPv6 tunnel encap offload isn't supported\n"); - default: return -EOPNOTSUPP; } - hash_key = hash_encap_info(&info); + hash_key = hash_encap_info(key); hash_for_each_possible_rcu(esw->offloads.encap_tbl, e, encap_hlist, hash_key) { - if (!cmp_encap_info(&e->tun_info, &info)) { + if (!cmp_encap_info(&e->tun_info.key, key)) { found = true; break; } @@ -892,7 +884,7 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv, if (!e) return -ENOMEM; - e->tun_info = info; + e->tun_info = *tun_info; e->tunnel_type = tunnel_type; INIT_LIST_HEAD(&e->flows); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 8661dd3f542c4..cea1660d59ac6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -36,6 +36,7 @@ #include #include #include +#include #include #define MLX5_MAX_UC_PER_VPORT(dev) \ @@ -274,18 +275,12 @@ enum { #define MLX5_FLOW_CONTEXT_ACTION_VLAN_POP 0x40 #define MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH 0x80 -struct mlx5_encap_info { - __be32 daddr; - __be32 tun_id; - __be16 tp_dst; -}; - struct mlx5_encap_entry { struct hlist_node encap_hlist; struct list_head flows; u32 encap_id; struct neighbour *n; - struct mlx5_encap_info tun_info; + struct ip_tunnel_info tun_info; unsigned char h_dest[ETH_ALEN]; /* destination eth addr */ struct net_device *out_dev; From 9a941117fb761dcfb4f698f1f67340484b781b90 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 3 Jan 2017 19:03:00 +0200 Subject: [PATCH 05/12] net/mlx5e: Maximize ip tunnel key usage on the TC offloading path Use more fields out of the tunnel key (e.g the tunnel source IP address) provided by upper layers for the route lookup done on the encap offload path. Signed-off-by: Or Gerlitz Reviewed-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 477b2796c12e0..3d0dbfb018ca5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -684,7 +684,6 @@ static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv, struct net_device **out_dev, struct flowi4 *fl4, struct neighbour **out_n, - __be32 *saddr, int *out_ttl) { struct rtable *rt; @@ -714,7 +713,6 @@ static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv, return -ENOMEM; *out_n = n; - *saddr = fl4->saddr; *out_dev = rt->dst.dev; return 0; @@ -763,13 +761,10 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, { int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size); struct ip_tunnel_key *tun_key = &e->tun_info.key; + int encap_size, ttl, err; struct neighbour *n = NULL; struct flowi4 fl4 = {}; char *encap_header; - int encap_size; - __be32 saddr; - int ttl; - int err; encap_header = kzalloc(max_encap_size, GFP_KERNEL); if (!encap_header) @@ -784,10 +779,12 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, err = -EOPNOTSUPP; goto out; } + fl4.flowi4_tos = tun_key->tos; fl4.daddr = tun_key->u.ipv4.dst; + fl4.saddr = tun_key->u.ipv4.src; err = mlx5e_route_lookup_ipv4(priv, mirred_dev, out_dev, - &fl4, &n, &saddr, &ttl); + &fl4, &n, &ttl); if (err) goto out; @@ -806,8 +803,8 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, case MLX5_HEADER_TYPE_VXLAN: encap_size = gen_vxlan_header_ipv4(*out_dev, encap_header, e->h_dest, ttl, - tun_key->u.ipv4.dst, - saddr, tun_key->tp_dst, + fl4.daddr, + fl4.saddr, tun_key->tp_dst, tunnel_id_to_key32(tun_key->tun_id)); break; default: From ce99f6b97fcdcb4e7f6f7e2fe5e5fe6c65585cab Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Sun, 11 Dec 2016 21:28:28 +0200 Subject: [PATCH 06/12] net/mlx5e: Support SRIOV TC encapsulation offloads for IPv6 tunnels Add the missing parts for offloading IPv6 tunnels. This includes route and neigh lookups and construnction of the IPv6 tunnel headers. Signed-off-by: Or Gerlitz Reviewed-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_tc.c | 159 +++++++++++++++++- 1 file changed, 151 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 3d0dbfb018ca5..640f10f2e994f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -718,6 +718,47 @@ static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv, return 0; } +static int mlx5e_route_lookup_ipv6(struct mlx5e_priv *priv, + struct net_device *mirred_dev, + struct net_device **out_dev, + struct flowi6 *fl6, + struct neighbour **out_n, + int *out_ttl) +{ + struct neighbour *n = NULL; + struct dst_entry *dst; + +#if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6) + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + int ret; + + dst = ip6_route_output(dev_net(mirred_dev), NULL, fl6); + if (dst->error) { + ret = dst->error; + dst_release(dst); + return ret; + } + + *out_ttl = ip6_dst_hoplimit(dst); + + /* if the egress device isn't on the same HW e-switch, we use the uplink */ + if (!switchdev_port_same_parent_id(priv->netdev, dst->dev)) + *out_dev = mlx5_eswitch_get_uplink_netdev(esw); + else + *out_dev = dst->dev; +#else + return -EOPNOTSUPP; +#endif + + n = dst_neigh_lookup(dst, &fl6->daddr); + dst_release(dst); + if (!n) + return -ENOMEM; + + *out_n = n; + return 0; +} + static int gen_vxlan_header_ipv4(struct net_device *out_dev, char buf[], unsigned char h_dest[ETH_ALEN], @@ -754,6 +795,41 @@ static int gen_vxlan_header_ipv4(struct net_device *out_dev, return encap_size; } +static int gen_vxlan_header_ipv6(struct net_device *out_dev, + char buf[], + unsigned char h_dest[ETH_ALEN], + int ttl, + struct in6_addr *daddr, + struct in6_addr *saddr, + __be16 udp_dst_port, + __be32 vx_vni) +{ + int encap_size = VXLAN_HLEN + sizeof(struct ipv6hdr) + ETH_HLEN; + struct ethhdr *eth = (struct ethhdr *)buf; + struct ipv6hdr *ip6h = (struct ipv6hdr *)((char *)eth + sizeof(struct ethhdr)); + struct udphdr *udp = (struct udphdr *)((char *)ip6h + sizeof(struct ipv6hdr)); + struct vxlanhdr *vxh = (struct vxlanhdr *)((char *)udp + sizeof(struct udphdr)); + + memset(buf, 0, encap_size); + + ether_addr_copy(eth->h_dest, h_dest); + ether_addr_copy(eth->h_source, out_dev->dev_addr); + eth->h_proto = htons(ETH_P_IPV6); + + ip6_flow_hdr(ip6h, 0, 0); + /* the HW fills up ipv6 payload len */ + ip6h->nexthdr = IPPROTO_UDP; + ip6h->hop_limit = ttl; + ip6h->daddr = *daddr; + ip6h->saddr = *saddr; + + udp->dest = udp_dst_port; + vxh->vx_flags = VXLAN_HF_VNI; + vxh->vx_vni = vxlan_vni_field(vx_vni); + + return encap_size; +} + static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, struct net_device *mirred_dev, struct mlx5_encap_entry *e, @@ -821,6 +897,75 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, return err; } +static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv, + struct net_device *mirred_dev, + struct mlx5_encap_entry *e, + struct net_device **out_dev) + +{ + int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size); + struct ip_tunnel_key *tun_key = &e->tun_info.key; + int encap_size, err, ttl = 0; + struct neighbour *n = NULL; + struct flowi6 fl6 = {}; + char *encap_header; + + encap_header = kzalloc(max_encap_size, GFP_KERNEL); + if (!encap_header) + return -ENOMEM; + + switch (e->tunnel_type) { + case MLX5_HEADER_TYPE_VXLAN: + fl6.flowi6_proto = IPPROTO_UDP; + fl6.fl6_dport = tun_key->tp_dst; + break; + default: + err = -EOPNOTSUPP; + goto out; + } + + fl6.flowlabel = ip6_make_flowinfo(RT_TOS(tun_key->tos), tun_key->label); + fl6.daddr = tun_key->u.ipv6.dst; + fl6.saddr = tun_key->u.ipv6.src; + + err = mlx5e_route_lookup_ipv6(priv, mirred_dev, out_dev, + &fl6, &n, &ttl); + if (err) + goto out; + + if (!(n->nud_state & NUD_VALID)) { + pr_warn("%s: can't offload, neighbour to %pI6 invalid\n", __func__, &fl6.daddr); + err = -EOPNOTSUPP; + goto out; + } + + e->n = n; + e->out_dev = *out_dev; + + neigh_ha_snapshot(e->h_dest, n, *out_dev); + + switch (e->tunnel_type) { + case MLX5_HEADER_TYPE_VXLAN: + encap_size = gen_vxlan_header_ipv6(*out_dev, encap_header, + e->h_dest, ttl, + &fl6.daddr, + &fl6.saddr, tun_key->tp_dst, + tunnel_id_to_key32(tun_key->tun_id)); + break; + default: + err = -EOPNOTSUPP; + goto out; + } + + err = mlx5_encap_alloc(priv->mdev, e->tunnel_type, + encap_size, encap_header, &e->encap_id); +out: + if (err && n) + neigh_release(n); + kfree(encap_header); + return err; +} + static int mlx5e_attach_encap(struct mlx5e_priv *priv, struct ip_tunnel_info *tun_info, struct net_device *mirred_dev, @@ -831,7 +976,7 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv, struct ip_tunnel_key *key = &tun_info->key; struct mlx5_encap_entry *e; struct net_device *out_dev; - int tunnel_type, err; + int tunnel_type, err = -EOPNOTSUPP; uintptr_t hash_key; bool found = false; @@ -856,12 +1001,6 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv, return -EOPNOTSUPP; } - if (family == AF_INET6) { - netdev_warn(priv->netdev, - "IPv6 tunnel encap offload isn't supported\n"); - return -EOPNOTSUPP; - } - hash_key = hash_encap_info(key); hash_for_each_possible_rcu(esw->offloads.encap_tbl, e, @@ -885,7 +1024,11 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv, e->tunnel_type = tunnel_type; INIT_LIST_HEAD(&e->flows); - err = mlx5e_create_encap_header_ipv4(priv, mirred_dev, e, &out_dev); + if (family == AF_INET) + err = mlx5e_create_encap_header_ipv4(priv, mirred_dev, e, &out_dev); + else if (family == AF_INET6) + err = mlx5e_create_encap_header_ipv6(priv, mirred_dev, e, &out_dev); + if (err) goto out_err; From 264d7bf3c1cfd3a128d621b367f57b81d038ba10 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 20 Dec 2016 12:38:05 +0200 Subject: [PATCH 07/12] net/mlx5: E-Switch, Enlarge the FDB size for the switchdev mode The E-Switch FDB size was hard coded to 8k. Change it to be min(max eswitch table size, max flow counters * num flow groups) where the max values are read from the firmware and the number of flow groups is hard-coded as before this change. We don't know upfront the division of flows to group. This setup allows each group to be of size up to the where we want to support (we mandate pairing of flows with counters for offloading). Thus, we don't expect multiple occurences for a group which in turn adds steering hops. Signed-off-by: Or Gerlitz Reviewed-by: Roi Dayan Tested-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 03293ed1cc22d..3481825992947 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -402,19 +402,18 @@ static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw) } #define MAX_PF_SQ 256 -#define ESW_OFFLOADS_NUM_ENTRIES (1 << 13) /* 8K */ #define ESW_OFFLOADS_NUM_GROUPS 4 static int esw_create_offloads_fdb_table(struct mlx5_eswitch *esw, int nvports) { int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + int table_size, ix, esw_size, err = 0; struct mlx5_core_dev *dev = esw->dev; struct mlx5_flow_namespace *root_ns; struct mlx5_flow_table *fdb = NULL; struct mlx5_flow_group *g; u32 *flow_group_in; void *match_criteria; - int table_size, ix, err = 0; u32 flags = 0; flow_group_in = mlx5_vzalloc(inlen); @@ -427,15 +426,19 @@ static int esw_create_offloads_fdb_table(struct mlx5_eswitch *esw, int nvports) goto ns_err; } - esw_debug(dev, "Create offloads FDB table, log_max_size(%d)\n", - MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size)); + esw_debug(dev, "Create offloads FDB table, min (max esw size(2^%d), max counters(%d)*groups(%d))\n", + MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size), + MLX5_CAP_GEN(dev, max_flow_counter), ESW_OFFLOADS_NUM_GROUPS); + + esw_size = min_t(int, MLX5_CAP_GEN(dev, max_flow_counter) * ESW_OFFLOADS_NUM_GROUPS, + 1 << MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size)); if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, encap) && MLX5_CAP_ESW_FLOWTABLE_FDB(dev, decap)) flags |= MLX5_FLOW_TABLE_TUNNEL_EN; fdb = mlx5_create_auto_grouped_flow_table(root_ns, FDB_FAST_PATH, - ESW_OFFLOADS_NUM_ENTRIES, + esw_size, ESW_OFFLOADS_NUM_GROUPS, 0, flags); if (IS_ERR(fdb)) { From c9497c98901c689bf6c357f812bf864ed8f50ace Mon Sep 17 00:00:00 2001 From: Mohamad Haj Yahia Date: Thu, 15 Dec 2016 14:02:53 +0200 Subject: [PATCH 08/12] net/mlx5: Add support for setting VF min rate Add support for SRIOV VF min rate guarantee by using the TSAR BW share weights mechanism. The TSAR BW share vport attribute represents the weight of that vport among the other vports weights which means that the actual vport BW percentage is the same vport weight percentage among the total vports weights sum. Signed-off-by: Mohamad Haj Yahia Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_main.c | 5 +- .../net/ethernet/mellanox/mlx5/core/eswitch.c | 97 +++++++++++++++++-- .../net/ethernet/mellanox/mlx5/core/eswitch.h | 11 ++- include/linux/mlx5/mlx5_ifc.h | 4 +- 4 files changed, 104 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 3a06c81ef85e1..c819d07fbdb38 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3021,11 +3021,8 @@ static int mlx5e_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, struct mlx5e_priv *priv = netdev_priv(dev); struct mlx5_core_dev *mdev = priv->mdev; - if (min_tx_rate) - return -EOPNOTSUPP; - return mlx5_eswitch_set_vport_rate(mdev->priv.eswitch, vf + 1, - max_tx_rate); + max_tx_rate, min_tx_rate); } static int mlx5_vport_link2ifla(u8 esw_link) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 4b3b60be319d0..efa1a7a76d8a8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1415,7 +1415,7 @@ static void esw_destroy_tsar(struct mlx5_eswitch *esw) } static int esw_vport_enable_qos(struct mlx5_eswitch *esw, int vport_num, - u32 initial_max_rate) + u32 initial_max_rate, u32 initial_bw_share) { u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {0}; struct mlx5_vport *vport = &esw->vports[vport_num]; @@ -1439,6 +1439,7 @@ static int esw_vport_enable_qos(struct mlx5_eswitch *esw, int vport_num, esw->qos.root_tsar_id); MLX5_SET(scheduling_context, &sched_ctx, max_average_bw, initial_max_rate); + MLX5_SET(scheduling_context, &sched_ctx, bw_share, initial_bw_share); err = mlx5_create_scheduling_element_cmd(dev, SCHEDULING_HIERARCHY_E_SWITCH, @@ -1473,7 +1474,7 @@ static void esw_vport_disable_qos(struct mlx5_eswitch *esw, int vport_num) } static int esw_vport_qos_config(struct mlx5_eswitch *esw, int vport_num, - u32 max_rate) + u32 max_rate, u32 bw_share) { u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {0}; struct mlx5_vport *vport = &esw->vports[vport_num]; @@ -1497,7 +1498,9 @@ static int esw_vport_qos_config(struct mlx5_eswitch *esw, int vport_num, esw->qos.root_tsar_id); MLX5_SET(scheduling_context, &sched_ctx, max_average_bw, max_rate); + MLX5_SET(scheduling_context, &sched_ctx, bw_share, bw_share); bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_MAX_AVERAGE_BW; + bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_BW_SHARE; err = mlx5_modify_scheduling_element_cmd(dev, SCHEDULING_HIERARCHY_E_SWITCH, @@ -1563,7 +1566,8 @@ static void esw_enable_vport(struct mlx5_eswitch *esw, int vport_num, esw_apply_vport_conf(esw, vport); /* Attach vport to the eswitch rate limiter */ - if (esw_vport_enable_qos(esw, vport_num, vport->info.max_rate)) + if (esw_vport_enable_qos(esw, vport_num, vport->info.max_rate, + vport->qos.bw_share)) esw_warn(esw->dev, "Failed to attach vport %d to eswitch rate limiter", vport_num); /* Sync with current vport context */ @@ -1952,6 +1956,7 @@ int mlx5_eswitch_get_vport_config(struct mlx5_eswitch *esw, ivi->qos = evport->info.qos; ivi->spoofchk = evport->info.spoofchk; ivi->trusted = evport->info.trusted; + ivi->min_tx_rate = evport->info.min_rate; ivi->max_tx_rate = evport->info.max_rate; mutex_unlock(&esw->state_lock); @@ -2046,23 +2051,103 @@ int mlx5_eswitch_set_vport_trust(struct mlx5_eswitch *esw, return 0; } -int mlx5_eswitch_set_vport_rate(struct mlx5_eswitch *esw, - int vport, u32 max_rate) +static u32 calculate_vports_min_rate_divider(struct mlx5_eswitch *esw) { + u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share); struct mlx5_vport *evport; + u32 max_guarantee = 0; + int i; + + for (i = 0; i <= esw->total_vports; i++) { + evport = &esw->vports[i]; + if (!evport->enabled || evport->info.min_rate < max_guarantee) + continue; + max_guarantee = evport->info.min_rate; + } + + return max_t(u32, max_guarantee / fw_max_bw_share, 1); +} + +static int normalize_vports_min_rate(struct mlx5_eswitch *esw, u32 divider) +{ + u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share); + struct mlx5_vport *evport; + u32 vport_max_rate; + u32 vport_min_rate; + u32 bw_share; + int err; + int i; + + for (i = 0; i <= esw->total_vports; i++) { + evport = &esw->vports[i]; + if (!evport->enabled) + continue; + vport_min_rate = evport->info.min_rate; + vport_max_rate = evport->info.max_rate; + bw_share = MLX5_MIN_BW_SHARE; + + if (vport_min_rate) + bw_share = MLX5_RATE_TO_BW_SHARE(vport_min_rate, + divider, + fw_max_bw_share); + + if (bw_share == evport->qos.bw_share) + continue; + + err = esw_vport_qos_config(esw, i, vport_max_rate, + bw_share); + if (!err) + evport->qos.bw_share = bw_share; + else + return err; + } + + return 0; +} + +int mlx5_eswitch_set_vport_rate(struct mlx5_eswitch *esw, int vport, + u32 max_rate, u32 min_rate) +{ + u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share); + bool min_rate_supported = MLX5_CAP_QOS(esw->dev, esw_bw_share) && + fw_max_bw_share >= MLX5_MIN_BW_SHARE; + bool max_rate_supported = MLX5_CAP_QOS(esw->dev, esw_rate_limit); + struct mlx5_vport *evport; + u32 previous_min_rate; + u32 divider; int err = 0; if (!ESW_ALLOWED(esw)) return -EPERM; if (!LEGAL_VPORT(esw, vport)) return -EINVAL; + if ((min_rate && !min_rate_supported) || (max_rate && !max_rate_supported)) + return -EOPNOTSUPP; mutex_lock(&esw->state_lock); evport = &esw->vports[vport]; - err = esw_vport_qos_config(esw, vport, max_rate); + + if (min_rate == evport->info.min_rate) + goto set_max_rate; + + previous_min_rate = evport->info.min_rate; + evport->info.min_rate = min_rate; + divider = calculate_vports_min_rate_divider(esw); + err = normalize_vports_min_rate(esw, divider); + if (err) { + evport->info.min_rate = previous_min_rate; + goto unlock; + } + +set_max_rate: + if (max_rate == evport->info.max_rate) + goto unlock; + + err = esw_vport_qos_config(esw, vport, max_rate, evport->qos.bw_share); if (!err) evport->info.max_rate = max_rate; +unlock: mutex_unlock(&esw->state_lock); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index cea1660d59ac6..5b78883d56541 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -50,6 +50,11 @@ #define FDB_UPLINK_VPORT 0xffff +#define MLX5_MIN_BW_SHARE 1 + +#define MLX5_RATE_TO_BW_SHARE(rate, divider, limit) \ + min_t(u32, max_t(u32, (rate) / (divider), MLX5_MIN_BW_SHARE), limit) + /* L2 -mac address based- hash helpers */ struct l2addr_node { struct hlist_node hlist; @@ -116,6 +121,7 @@ struct mlx5_vport_info { u8 qos; u64 node_guid; int link_state; + u32 min_rate; u32 max_rate; bool spoofchk; bool trusted; @@ -138,6 +144,7 @@ struct mlx5_vport { struct { bool enabled; u32 esw_tsar_ix; + u32 bw_share; } qos; bool enabled; @@ -249,8 +256,8 @@ int mlx5_eswitch_set_vport_spoofchk(struct mlx5_eswitch *esw, int vport, bool spoofchk); int mlx5_eswitch_set_vport_trust(struct mlx5_eswitch *esw, int vport_num, bool setting); -int mlx5_eswitch_set_vport_rate(struct mlx5_eswitch *esw, - int vport, u32 max_rate); +int mlx5_eswitch_set_vport_rate(struct mlx5_eswitch *esw, int vport, + u32 max_rate, u32 min_rate); int mlx5_eswitch_get_vport_config(struct mlx5_eswitch *esw, int vport, struct ifla_vf_info *ivi); int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index d96ebc319d63a..a919dfb920ae6 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -547,7 +547,9 @@ struct mlx5_ifc_e_switch_cap_bits { struct mlx5_ifc_qos_cap_bits { u8 packet_pacing[0x1]; u8 esw_scheduling[0x1]; - u8 reserved_at_2[0x1e]; + u8 esw_bw_share[0x1]; + u8 esw_rate_limit[0x1]; + u8 reserved_at_4[0x1c]; u8 reserved_at_20[0x20]; From 8c7245a60ef8f8c4a427349690c5a141cfed6217 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Wed, 30 Nov 2016 20:23:51 +0200 Subject: [PATCH 09/12] net/mlx5: Push min-inline mode resolution helper into the core So we can use that from the IB driver too in downstream patches. This patch doesn't change any functionality. Signed-off-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_main.c | 18 +----------------- .../net/ethernet/mellanox/mlx5/core/vport.c | 17 +++++++++++++++++ include/linux/mlx5/vport.h | 1 + 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index c819d07fbdb38..636372873e644 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3458,22 +3458,6 @@ void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode) MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE; } -static void mlx5e_query_min_inline(struct mlx5_core_dev *mdev, - u8 *min_inline_mode) -{ - switch (MLX5_CAP_ETH(mdev, wqe_inline_mode)) { - case MLX5_CAP_INLINE_MODE_L2: - *min_inline_mode = MLX5_INLINE_MODE_L2; - break; - case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT: - mlx5_query_nic_vport_min_inline(mdev, 0, min_inline_mode); - break; - case MLX5_CAP_INLINE_MODE_NOT_REQUIRED: - *min_inline_mode = MLX5_INLINE_MODE_NONE; - break; - } -} - u32 mlx5e_choose_lro_timeout(struct mlx5_core_dev *mdev, u32 wanted_timeout) { int i; @@ -3533,7 +3517,7 @@ static void mlx5e_build_nic_netdev_priv(struct mlx5_core_dev *mdev, priv->params.tx_cq_moderation.pkts = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS; priv->params.tx_max_inline = mlx5e_get_max_inline_cap(mdev); - mlx5e_query_min_inline(mdev, &priv->params.tx_min_inline_mode); + mlx5_query_min_inline(mdev, &priv->params.tx_min_inline_mode); priv->params.num_tc = 1; priv->params.rss_hfunc = ETH_RSS_HASH_XOR; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 269e4401c342d..b49cfc895c109 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -127,6 +127,23 @@ int mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev, } EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_min_inline); +void mlx5_query_min_inline(struct mlx5_core_dev *mdev, + u8 *min_inline_mode) +{ + switch (MLX5_CAP_ETH(mdev, wqe_inline_mode)) { + case MLX5_CAP_INLINE_MODE_L2: + *min_inline_mode = MLX5_INLINE_MODE_L2; + break; + case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT: + mlx5_query_nic_vport_min_inline(mdev, 0, min_inline_mode); + break; + case MLX5_CAP_INLINE_MODE_NOT_REQUIRED: + *min_inline_mode = MLX5_INLINE_MODE_NONE; + break; + } +} +EXPORT_SYMBOL_GPL(mlx5_query_min_inline); + int mlx5_modify_nic_vport_min_inline(struct mlx5_core_dev *mdev, u16 vport, u8 min_inline) { diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index ec35157ea7252..656c70b65dd27 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -51,6 +51,7 @@ int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u16 vport, u8 *addr); int mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev, u16 vport, u8 *min_inline); +void mlx5_query_min_inline(struct mlx5_core_dev *mdev, u8 *min_inline); int mlx5_modify_nic_vport_min_inline(struct mlx5_core_dev *mdev, u16 vport, u8 min_inline); int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *dev, From 7898489880f55a9c3a954cd5660a0fb4fd81b625 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Wed, 30 Nov 2016 20:33:33 +0200 Subject: [PATCH 10/12] IB/mlx5: Enable Eth VFs to query their min-inline value for user-space For some mlx5 HW models (CX4, CX4Lx), the VF driver needs to put part of the packet headers on the TX descriptor so the e-switch can do proper matching and steering. This is called "min-inline", it's advertized to the VF by the FW and also enforced on them by the HW, such that if they don't obey, their packets are dropped. SRIOV VF libmlx5 instances should take into account the min-inline value of their vports. For that end, we provide this value through the vendor response part of init_ucontext command. The min inline value is reported in a way which will let newer libmlx5 instances realize that they are running over an older kernel and act accordingly (e.g apply some educated guess). Signed-off-by: Or Gerlitz Reviewed-by: Matan Barak Signed-off-by: Saeed Mahameed --- drivers/infiniband/hw/mlx5/main.c | 9 +++++++++ include/uapi/rdma/mlx5-abi.h | 14 +++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 8727116a4cabf..9d8535385bb8b 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -53,6 +53,7 @@ #include #include #include +#include #include "mlx5_ib.h" #define DRIVER_NAME "mlx5_ib" @@ -1202,6 +1203,14 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, resp.response_length += sizeof(resp.cmds_supp_uhw); } + if (field_avail(typeof(resp), eth_min_inline, udata->outlen)) { + if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) { + mlx5_query_min_inline(dev->mdev, &resp.eth_min_inline); + resp.eth_min_inline++; + } + resp.response_length += sizeof(resp.eth_min_inline); + } + /* * We don't want to expose information from the PCI bar that is located * after 4096 bytes, so if the arch only supports larger pages, let's diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 85dc966ea70be..da7cd62bace74 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -90,6 +90,17 @@ enum mlx5_user_cmds_supp_uhw { MLX5_USER_CMDS_SUPP_UHW_CREATE_AH = 1 << 1, }; +/* The eth_min_inline response value is set to off-by-one vs the FW + * returned value to allow user-space to deal with older kernels. + */ +enum mlx5_user_inline_mode { + MLX5_USER_INLINE_MODE_NA, + MLX5_USER_INLINE_MODE_NONE, + MLX5_USER_INLINE_MODE_L2, + MLX5_USER_INLINE_MODE_IP, + MLX5_USER_INLINE_MODE_TCP_UDP, +}; + struct mlx5_ib_alloc_ucontext_resp { __u32 qp_tab_size; __u32 bf_reg_size; @@ -106,7 +117,8 @@ struct mlx5_ib_alloc_ucontext_resp { __u32 response_length; __u8 cqe_version; __u8 cmds_supp_uhw; - __u16 reserved2; + __u8 eth_min_inline; + __u8 reserved2; __u64 hca_core_clock_offset; __u32 log_uar_size; __u32 num_uars_per_page; From b4e029da29ce8244879b607fbf0514d1087dc3f5 Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Tue, 22 Nov 2016 11:03:32 +0200 Subject: [PATCH 11/12] net/mlx5e: Reduce memory consumption on kdump kernel Reduce memory consumption on kdump kernel by decreasing the number of channels to 1 and the size of RQs and SQs to the minimal values. Signed-off-by: Kamal Heib Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 7 +------ .../ethernet/mellanox/mlx5/core/en_ethtool.c | 4 ++-- .../net/ethernet/mellanox/mlx5/core/en_main.c | 21 ++++++++++++++++--- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index d603a30ad6396..8d660a018bf82 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -101,6 +101,7 @@ #define MLX5E_LOG_INDIR_RQT_SIZE 0x7 #define MLX5E_INDIR_RQT_SIZE BIT(MLX5E_LOG_INDIR_RQT_SIZE) +#define MLX5E_MIN_NUM_CHANNELS 0x1 #define MLX5E_MAX_NUM_CHANNELS (MLX5E_INDIR_RQT_SIZE >> 1) #define MLX5E_MAX_NUM_SQS (MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC) #define MLX5E_TX_CQ_POLL_BUDGET 128 @@ -847,12 +848,6 @@ static inline u32 mlx5e_get_wqe_mtt_offset(struct mlx5e_rq *rq, u16 wqe_ix) return wqe_ix * ALIGN(MLX5_MPWRQ_PAGES_PER_WQE, 8); } -static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev) -{ - return min_t(int, mdev->priv.eq_table.num_comp_vectors, - MLX5E_MAX_NUM_CHANNELS); -} - extern const struct ethtool_ops mlx5e_ethtool_ops; #ifdef CONFIG_MLX5_CORE_EN_DCB extern const struct dcbnl_rtnl_ops mlx5e_dcbnl_ops; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 4021863e38402..9d520f8d6d1a5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -552,7 +552,7 @@ static void mlx5e_get_channels(struct net_device *dev, { struct mlx5e_priv *priv = netdev_priv(dev); - ch->max_combined = mlx5e_get_max_num_channels(priv->mdev); + ch->max_combined = priv->profile->max_nch(priv->mdev); ch->combined_count = priv->params.num_channels; } @@ -560,7 +560,7 @@ static int mlx5e_set_channels(struct net_device *dev, struct ethtool_channels *ch) { struct mlx5e_priv *priv = netdev_priv(dev); - int ncv = mlx5e_get_max_num_channels(priv->mdev); + int ncv = priv->profile->max_nch(priv->mdev); unsigned int count = ch->combined_count; bool arfs_enabled; bool was_opened; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 636372873e644..e829143efc148 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -31,6 +31,7 @@ */ #include +#include #include #include #include @@ -83,7 +84,9 @@ static void mlx5e_set_rq_type_params(struct mlx5e_priv *priv, u8 rq_type) priv->params.rq_wq_type = rq_type; switch (priv->params.rq_wq_type) { case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: - priv->params.log_rq_size = MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE_MPW; + priv->params.log_rq_size = is_kdump_kernel() ? + MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW : + MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE_MPW; priv->params.mpwqe_log_stride_sz = MLX5E_GET_PFLAG(priv, MLX5E_PFLAG_RX_CQE_COMPRESS) ? MLX5_MPWRQ_LOG_STRIDE_SIZE_CQE_COMPRESS : @@ -92,7 +95,9 @@ static void mlx5e_set_rq_type_params(struct mlx5e_priv *priv, u8 rq_type) priv->params.mpwqe_log_stride_sz; break; default: /* MLX5_WQ_TYPE_LINKED_LIST */ - priv->params.log_rq_size = MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE; + priv->params.log_rq_size = is_kdump_kernel() ? + MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE : + MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE; } priv->params.min_rx_wqes = mlx5_min_rx_wqes(priv->params.rq_wq_type, BIT(priv->params.log_rq_size)); @@ -1508,6 +1513,14 @@ static int mlx5e_set_tx_maxrate(struct net_device *dev, int index, u32 rate) return err; } +static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev) +{ + return is_kdump_kernel() ? + MLX5E_MIN_NUM_CHANNELS : + min_t(int, mdev->priv.eq_table.num_comp_vectors, + MLX5E_MAX_NUM_CHANNELS); +} + static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, struct mlx5e_channel_param *cparam, struct mlx5e_channel **cp) @@ -3491,7 +3504,9 @@ static void mlx5e_build_nic_netdev_priv(struct mlx5_core_dev *mdev, priv->params.lro_timeout = mlx5e_choose_lro_timeout(mdev, MLX5E_DEFAULT_LRO_TIMEOUT); - priv->params.log_sq_size = MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE; + priv->params.log_sq_size = is_kdump_kernel() ? + MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE : + MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE; /* set CQE compression */ priv->params.rx_cqe_compress_def = false; From 5eb0249b4352c813f0a3c31b967f6cb4b9869a50 Mon Sep 17 00:00:00 2001 From: Shaker Daibes Date: Sat, 10 Dec 2016 18:45:55 +0200 Subject: [PATCH 12/12] net/mlx5e: CQE compression control code reuse This patch is intended for code reuse of mlx5e_modify_rx_cqe_compression function. Signed-off-by: Shaker Daibes Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_clock.c | 7 +++++-- .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 13 ++----------- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 8 ++------ 4 files changed, 10 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 8d660a018bf82..46f728de9e768 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -787,7 +787,7 @@ void mlx5e_pps_event_handler(struct mlx5e_priv *priv, struct ptp_clock_event *event); int mlx5e_hwstamp_set(struct net_device *dev, struct ifreq *ifr); int mlx5e_hwstamp_get(struct net_device *dev, struct ifreq *ifr); -void mlx5e_modify_rx_cqe_compression(struct mlx5e_priv *priv, bool val); +void mlx5e_modify_rx_cqe_compression_locked(struct mlx5e_priv *priv, bool val); int mlx5e_vlan_rx_add_vid(struct net_device *dev, __always_unused __be16 proto, u16 vid); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c index 349dc72cd2b62..37e66eef6fb5e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c @@ -106,11 +106,12 @@ int mlx5e_hwstamp_set(struct net_device *dev, struct ifreq *ifr) return -ERANGE; } + mutex_lock(&priv->state_lock); /* RX HW timestamp */ switch (config.rx_filter) { case HWTSTAMP_FILTER_NONE: /* Reset CQE compression to Admin default */ - mlx5e_modify_rx_cqe_compression(priv, priv->params.rx_cqe_compress_def); + mlx5e_modify_rx_cqe_compression_locked(priv, priv->params.rx_cqe_compress_def); break; case HWTSTAMP_FILTER_ALL: case HWTSTAMP_FILTER_SOME: @@ -128,14 +129,16 @@ int mlx5e_hwstamp_set(struct net_device *dev, struct ifreq *ifr) case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: /* Disable CQE compression */ netdev_warn(dev, "Disabling cqe compression"); - mlx5e_modify_rx_cqe_compression(priv, false); + mlx5e_modify_rx_cqe_compression_locked(priv, false); config.rx_filter = HWTSTAMP_FILTER_ALL; break; default: + mutex_unlock(&priv->state_lock); return -ERANGE; } memcpy(&priv->tstamp.hwtstamp_config, &config, sizeof(config)); + mutex_unlock(&priv->state_lock); return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ? -EFAULT : 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 9d520f8d6d1a5..6c1a5cb43f8cf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -1476,8 +1476,6 @@ static int set_pflag_rx_cqe_compress(struct net_device *netdev, { struct mlx5e_priv *priv = netdev_priv(netdev); struct mlx5_core_dev *mdev = priv->mdev; - int err = 0; - bool reset; if (!MLX5_CAP_GEN(mdev, cqe_compression)) return -ENOTSUPP; @@ -1487,17 +1485,10 @@ static int set_pflag_rx_cqe_compress(struct net_device *netdev, return -EINVAL; } - reset = test_bit(MLX5E_STATE_OPENED, &priv->state); - - if (reset) - mlx5e_close_locked(netdev); - - MLX5E_SET_PFLAG(priv, MLX5E_PFLAG_RX_CQE_COMPRESS, enable); + mlx5e_modify_rx_cqe_compression_locked(priv, enable); priv->params.rx_cqe_compress_def = enable; - if (reset) - err = mlx5e_open_locked(netdev); - return err; + return 0; } static int mlx5e_handle_pflag(struct net_device *netdev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 20f116f8c4575..ba50583ea3ede 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -155,17 +155,15 @@ static inline u32 mlx5e_decompress_cqes_start(struct mlx5e_rq *rq, return mlx5e_decompress_cqes_cont(rq, cq, 1, budget_rem) - 1; } -void mlx5e_modify_rx_cqe_compression(struct mlx5e_priv *priv, bool val) +void mlx5e_modify_rx_cqe_compression_locked(struct mlx5e_priv *priv, bool val) { bool was_opened; if (!MLX5_CAP_GEN(priv->mdev, cqe_compression)) return; - mutex_lock(&priv->state_lock); - if (MLX5E_GET_PFLAG(priv, MLX5E_PFLAG_RX_CQE_COMPRESS) == val) - goto unlock; + return; was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state); if (was_opened) @@ -176,8 +174,6 @@ void mlx5e_modify_rx_cqe_compression(struct mlx5e_priv *priv, bool val) if (was_opened) mlx5e_open_locked(priv->netdev); -unlock: - mutex_unlock(&priv->state_lock); } #define RQ_PAGE_SIZE(rq) ((1 << rq->buff.page_order) << PAGE_SHIFT)