Merge branch 'icmp-reply-optimize'
Jesper Dangaard Brouer says:

====================
net: optimize ICMP-reply code path

This patchset optimizes the ICMP-reply code path for ICMP packets
that get rate limited. A remote party can easily trigger this code
path by sending packets to a port number with no listening service.

Generally, the patchset moves the sysctl_icmp_msgs_per_sec rate-limit
check earlier in the code path and removes an allocation.
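
In rough outline, the reordering looks like the sketch below. This is
an illustrative userspace condensation, not the kernel code: the *_stub
helpers are invented stand-ins for icmp_global_allow() and
icmp_xmit_lock()/icmp_xmit_unlock().

#include <stdbool.h>
#include <stdio.h>

static bool icmp_global_allow_stub(void) { return false; } /* over limit */
static bool icmp_xmit_lock_stub(void)    { return true; }
static void icmp_xmit_unlock_stub(void)  { }

/* Before: the per-CPU ICMP socket lock is taken (and, in icmp_send(),
 * an icmp_bxm is kmalloc'ed) before any rate-limit decision is made. */
static void icmp_send_before(void)
{
	if (!icmp_xmit_lock_stub())
		return;
	if (icmp_global_allow_stub())
		printf("before: reply pushed\n");
	icmp_xmit_unlock_stub();	/* work wasted when rate limited */
}

/* After: the global sysctl_icmp_msgs_per_sec check runs first, so a
 * rate-limited reply is dropped before the lock is touched; icmp_bxm
 * moves to the stack, removing the allocation entirely. */
static void icmp_send_after(void)
{
	if (!icmp_global_allow_stub())
		return;			/* cheap early drop */
	if (!icmp_xmit_lock_stub())
		return;
	printf("after: reply pushed\n");
	icmp_xmit_unlock_stub();
}

int main(void)
{
	icmp_send_before();
	icmp_send_after();
	return 0;
}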

Use-case: The specific case where I experienced this bottleneck is
sending UDP packets to a port with no listener, which obviously results
in the kernel replying with ICMP Destination Unreachable (type: 3),
Port Unreachable (code: 3).
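
As an illustration, a minimal sender that exercises this path could
look like the following; the destination address (from the TEST-NET-1
documentation range) and port 9 are arbitrary assumptions, and any
address/port pair with no UDP listener behaves the same.

/* Minimal sketch: send UDP datagrams to a closed port so the remote
 * kernel replies with ICMP Destination Unreachable / Port Unreachable.
 * The address 192.0.2.1 and port 9 below are placeholder choices. */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port   = htons(9),		/* assumed closed */
	};

	if (fd < 0)
		return 1;
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);

	/* Each datagram should elicit an ICMP type 3 / code 3 reply,
	 * subject to the receiver's ICMP rate limits. */
	for (int i = 0; i < 1000; i++)
		sendto(fd, "x", 1, 0, (struct sockaddr *)&dst, sizeof(dst));

	close(fd);
	return 0;
}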

After Eric and Paolo optimized the UDP socket code, the kernel's PPS
processing capability is lower for no-listen ports than for normal UDP
sockets. This is bad for capacity planning when restarting a service.

UDP no-listen benchmark 8xCPUs using pktgen_sample04_many_flows.sh:
 Baseline: 6.6 Mpps
 Patch:   14.7 Mpps
Driver mlx5 at 50Gbit/s.
====================
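
For reference, the global sysctl_icmp_msgs_per_sec limit that the
patchset checks early is enforced by icmp_global_allow() as a token
bucket with a small burst allowance (sysctl_icmp_msgs_burst). The
userspace sketch below shows the idea only, under that assumption; the
kernel implementation differs in detail (it works at jiffies
granularity), and the names here are illustrative.

#include <stdbool.h>
#include <time.h>

static const double msgs_per_sec = 1000; /* cf. sysctl_icmp_msgs_per_sec */
static const double msgs_burst   = 50;   /* cf. sysctl_icmp_msgs_burst */
static double credit;
static struct timespec last;

static bool global_allow(void)
{
	struct timespec now;
	double delta;

	clock_gettime(CLOCK_MONOTONIC, &now);
	delta = (now.tv_sec - last.tv_sec) +
		(now.tv_nsec - last.tv_nsec) / 1e9;
	last = now;

	/* Refill credit for the elapsed time, capped at the burst size. */
	credit += delta * msgs_per_sec;
	if (credit > msgs_burst)
		credit = msgs_burst;

	if (credit >= 1.0) {
		credit -= 1.0;	/* spend one token per ICMP reply */
		return true;
	}
	return false;
}

int main(void)
{
	int allowed = 0;

	for (int i = 0; i < 100; i++)
		allowed += global_allow();

	return allowed > 0 ? 0 : 1;
}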

Signed-off-by: David S. Miller <davem@davemloft.net>
David S. Miller committed Jan 9, 2017
2 parents aaa9c10 + 7ba91ec commit 9f2f27a
Showing 2 changed files with 123 additions and 70 deletions.
125 changes: 76 additions & 49 deletions net/ipv4/icmp.c
@@ -209,27 +209,25 @@ static struct sock *icmp_sk(struct net *net)
 	return *this_cpu_ptr(net->ipv4.icmp_sk);
 }
 
+/* Called with BH disabled */
 static inline struct sock *icmp_xmit_lock(struct net *net)
 {
 	struct sock *sk;
 
-	local_bh_disable();
-
 	sk = icmp_sk(net);
 
 	if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
 		/* This can happen if the output path signals a
 		 * dst_link_failure() for an outgoing ICMP packet.
 		 */
-		local_bh_enable();
 		return NULL;
 	}
 	return sk;
 }
 
 static inline void icmp_xmit_unlock(struct sock *sk)
 {
-	spin_unlock_bh(&sk->sk_lock.slock);
+	spin_unlock(&sk->sk_lock.slock);
 }
 
 int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
@@ -282,6 +280,33 @@ bool icmp_global_allow(void)
 }
 EXPORT_SYMBOL(icmp_global_allow);
 
+static bool icmpv4_mask_allow(struct net *net, int type, int code)
+{
+	if (type > NR_ICMP_TYPES)
+		return true;
+
+	/* Don't limit PMTU discovery. */
+	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+		return true;
+
+	/* Limit if icmp type is enabled in ratemask. */
+	if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
+		return true;
+
+	return false;
+}
+
+static bool icmpv4_global_allow(struct net *net, int type, int code)
+{
+	if (icmpv4_mask_allow(net, type, code))
+		return true;
+
+	if (icmp_global_allow())
+		return true;
+
+	return false;
+}
+
 /*
  *	Send an ICMP frame.
  */
@@ -290,34 +315,22 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
 			       struct flowi4 *fl4, int type, int code)
 {
 	struct dst_entry *dst = &rt->dst;
+	struct inet_peer *peer;
 	bool rc = true;
+	int vif;
 
-	if (type > NR_ICMP_TYPES)
-		goto out;
-
-	/* Don't limit PMTU discovery. */
-	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+	if (icmpv4_mask_allow(net, type, code))
 		goto out;
 
 	/* No rate limit on loopback */
 	if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
 		goto out;
 
-	/* Limit if icmp type is enabled in ratemask. */
-	if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
-		goto out;
-
-	rc = false;
-	if (icmp_global_allow()) {
-		int vif = l3mdev_master_ifindex(dst->dev);
-		struct inet_peer *peer;
-
-		peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
-		rc = inet_peer_xrlim_allow(peer,
-					   net->ipv4.sysctl_icmp_ratelimit);
-		if (peer)
-			inet_putpeer(peer);
-	}
+	vif = l3mdev_master_ifindex(dst->dev);
+	peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
+	rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit);
+	if (peer)
+		inet_putpeer(peer);
 out:
 	return rc;
 }
@@ -396,13 +409,22 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	struct inet_sock *inet;
 	__be32 daddr, saddr;
 	u32 mark = IP4_REPLY_MARK(net, skb->mark);
+	int type = icmp_param->data.icmph.type;
+	int code = icmp_param->data.icmph.code;
 
 	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
 		return;
 
+	/* Needed by both icmp_global_allow and icmp_xmit_lock */
+	local_bh_disable();
+
+	/* global icmp_msgs_per_sec */
+	if (!icmpv4_global_allow(net, type, code))
+		goto out_bh_enable;
+
 	sk = icmp_xmit_lock(net);
 	if (!sk)
-		return;
+		goto out_bh_enable;
 	inet = inet_sk(sk);
 
 	icmp_param->data.icmph.checksum = 0;
@@ -433,12 +455,13 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	rt = ip_route_output_key(net, &fl4);
 	if (IS_ERR(rt))
 		goto out_unlock;
-	if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type,
-			       icmp_param->data.icmph.code))
+	if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
 		icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
 	ip_rt_put(rt);
 out_unlock:
 	icmp_xmit_unlock(sk);
+out_bh_enable:
+	local_bh_enable();
 }
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -571,7 +594,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 {
 	struct iphdr *iph;
 	int room;
-	struct icmp_bxm *icmp_param;
+	struct icmp_bxm icmp_param;
 	struct rtable *rt = skb_rtable(skb_in);
 	struct ipcm_cookie ipc;
 	struct flowi4 fl4;
@@ -648,13 +671,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 		}
 	}
 
-	icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC);
-	if (!icmp_param)
-		return;
+	/* Needed by both icmp_global_allow and icmp_xmit_lock */
+	local_bh_disable();
+
+	/* Check global sysctl_icmp_msgs_per_sec ratelimit */
+	if (!icmpv4_global_allow(net, type, code))
+		goto out_bh_enable;
 
 	sk = icmp_xmit_lock(net);
 	if (!sk)
-		goto out_free;
+		goto out_bh_enable;
 
 	/*
 	 *	Construct source address and options.
@@ -681,33 +707,34 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 					  iph->tos;
 	mark = IP4_REPLY_MARK(net, skb_in->mark);
 
-	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))
+	if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in))
 		goto out_unlock;
 
 
 	/*
 	 *	Prepare data for ICMP header.
 	 */
 
-	icmp_param->data.icmph.type = type;
-	icmp_param->data.icmph.code = code;
-	icmp_param->data.icmph.un.gateway = info;
-	icmp_param->data.icmph.checksum = 0;
-	icmp_param->skb = skb_in;
-	icmp_param->offset = skb_network_offset(skb_in);
+	icmp_param.data.icmph.type = type;
+	icmp_param.data.icmph.code = code;
+	icmp_param.data.icmph.un.gateway = info;
+	icmp_param.data.icmph.checksum = 0;
+	icmp_param.skb = skb_in;
+	icmp_param.offset = skb_network_offset(skb_in);
 	inet_sk(sk)->tos = tos;
 	sk->sk_mark = mark;
 	ipc.addr = iph->saddr;
-	ipc.opt = &icmp_param->replyopts.opt;
+	ipc.opt = &icmp_param.replyopts.opt;
 	ipc.tx_flags = 0;
 	ipc.ttl = 0;
 	ipc.tos = -1;
 
 	rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
-			       type, code, icmp_param);
+			       type, code, &icmp_param);
 	if (IS_ERR(rt))
 		goto out_unlock;
 
+	/* peer icmp_ratelimit */
 	if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
 		goto ende;
 
@@ -716,21 +743,21 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	room = dst_mtu(&rt->dst);
 	if (room > 576)
 		room = 576;
-	room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen;
+	room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
 	room -= sizeof(struct icmphdr);
 
-	icmp_param->data_len = skb_in->len - icmp_param->offset;
-	if (icmp_param->data_len > room)
-		icmp_param->data_len = room;
-	icmp_param->head_len = sizeof(struct icmphdr);
+	icmp_param.data_len = skb_in->len - icmp_param.offset;
+	if (icmp_param.data_len > room)
+		icmp_param.data_len = room;
+	icmp_param.head_len = sizeof(struct icmphdr);
 
-	icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
+	icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
 ende:
 	ip_rt_put(rt);
 out_unlock:
 	icmp_xmit_unlock(sk);
-out_free:
-	kfree(icmp_param);
+out_bh_enable:
+	local_bh_enable();
 out:;
 }
 EXPORT_SYMBOL(icmp_send);