Skip to content

Commit

Permalink
Merge branch 'tcp-receive-path-optimizations'
Browse files Browse the repository at this point in the history
Eric Dumazet says:

====================
tcp: receive path optimizations

This series aims to reduce cache line misses in RX path.

I am still working on better cache locality in tcp_sock but
this will wait a few more weeks.
====================

Link: https://lore.kernel.org/r/20211025164825.259415-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
  • Loading branch information
Jakub Kicinski committed Oct 26, 2021
2 parents fd559a9 + 12c8691 commit e43b76a
Show file tree
Hide file tree
Showing 11 changed files with 79 additions and 38 deletions.
1 change: 0 additions & 1 deletion include/linux/ipv6.h
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,6 @@ struct ipv6_pinfo {
__be32 rcv_flowinfo;

__u32 dst_cookie;
__u32 rx_dst_cookie;

struct ipv6_mc_socklist __rcu *ipv6_mc_list;
struct ipv6_ac_socklist *ipv6_ac_list;
Expand Down
3 changes: 2 additions & 1 deletion include/net/busy_poll.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,8 @@ static inline void skb_mark_napi_id(struct sk_buff *skb,
static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
WRITE_ONCE(sk->sk_napi_id, skb->napi_id);
if (unlikely(READ_ONCE(sk->sk_napi_id) != skb->napi_id))
WRITE_ONCE(sk->sk_napi_id, skb->napi_id);
#endif
sk_rx_queue_set(sk, skb);
}
Expand Down
3 changes: 1 addition & 2 deletions include/net/inet_sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -207,11 +207,10 @@ struct inet_sock {
__be32 inet_saddr;
__s16 uc_ttl;
__u16 cmsg_flags;
struct ip_options_rcu __rcu *inet_opt;
__be16 inet_sport;
__u16 inet_id;

struct ip_options_rcu __rcu *inet_opt;
int rx_dst_ifindex;
__u8 tos;
__u8 min_ttl;
__u8 mc_ttl;
Expand Down
2 changes: 2 additions & 0 deletions include/net/ip.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/sockptr.h>
#include <linux/static_key.h>

#include <net/inet_sock.h>
#include <net/route.h>
Expand Down Expand Up @@ -750,6 +751,7 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb, int tlen, int offset);
int ip_cmsg_send(struct sock *sk, struct msghdr *msg,
struct ipcm_cookie *ipc, bool allow_ipv6);
DECLARE_STATIC_KEY_FALSE(ip4_min_ttl);
int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
unsigned int optlen);
int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
Expand Down
1 change: 1 addition & 0 deletions include/net/ipv6.h
Original file line number Diff line number Diff line change
Expand Up @@ -1092,6 +1092,7 @@ struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
/*
* socket options (ipv6_sockglue.c)
*/
DECLARE_STATIC_KEY_FALSE(ip6_min_hopcount);

int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
unsigned int optlen);
Expand Down
21 changes: 14 additions & 7 deletions include/net/sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,8 @@ struct bpf_local_storage;
* @sk_rcvbuf: size of receive buffer in bytes
* @sk_wq: sock wait queue and async head
* @sk_rx_dst: receive input route used by early demux
* @sk_rx_dst_ifindex: ifindex for @sk_rx_dst
* @sk_rx_dst_cookie: cookie for @sk_rx_dst
* @sk_dst_cache: destination cache
* @sk_dst_pending_confirm: need to confirm neighbour
* @sk_policy: flow policy
Expand Down Expand Up @@ -430,6 +432,9 @@ struct sock {
struct xfrm_policy __rcu *sk_policy[2];
#endif
struct dst_entry *sk_rx_dst;
int sk_rx_dst_ifindex;
u32 sk_rx_dst_cookie;

struct dst_entry __rcu *sk_dst_cache;
atomic_t sk_omem_alloc;
int sk_sndbuf;
Expand Down Expand Up @@ -1911,26 +1916,28 @@ static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
if (skb_rx_queue_recorded(skb)) {
u16 rx_queue = skb_get_rx_queue(skb);

if (WARN_ON_ONCE(rx_queue == NO_QUEUE_MAPPING))
return;

sk->sk_rx_queue_mapping = rx_queue;
if (unlikely(READ_ONCE(sk->sk_rx_queue_mapping) != rx_queue))
WRITE_ONCE(sk->sk_rx_queue_mapping, rx_queue);
}
#endif
}

static inline void sk_rx_queue_clear(struct sock *sk)
{
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
sk->sk_rx_queue_mapping = NO_QUEUE_MAPPING;
WRITE_ONCE(sk->sk_rx_queue_mapping, NO_QUEUE_MAPPING);
#endif
}

static inline int sk_rx_queue_get(const struct sock *sk)
{
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
if (sk && sk->sk_rx_queue_mapping != NO_QUEUE_MAPPING)
return sk->sk_rx_queue_mapping;
if (sk) {
int res = READ_ONCE(sk->sk_rx_queue_mapping);

if (res != NO_QUEUE_MAPPING)
return res;
}
#endif

return -1;
Expand Down
11 changes: 10 additions & 1 deletion net/ipv4/ip_sockglue.c
Original file line number Diff line number Diff line change
Expand Up @@ -886,6 +886,8 @@ static int compat_ip_mcast_join_leave(struct sock *sk, int optname,
return ip_mc_leave_group(sk, &mreq);
}

DEFINE_STATIC_KEY_FALSE(ip4_min_ttl);

static int do_ip_setsockopt(struct sock *sk, int level, int optname,
sockptr_t optval, unsigned int optlen)
{
Expand Down Expand Up @@ -1352,7 +1354,14 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname,
goto e_inval;
if (val < 0 || val > 255)
goto e_inval;
inet->min_ttl = val;

if (val)
static_branch_enable(&ip4_min_ttl);

/* tcp_v4_err() and tcp_v4_rcv() might read min_ttl
* while we are changing it.
*/
WRITE_ONCE(inet->min_ttl, val);
break;

default:
Expand Down
25 changes: 16 additions & 9 deletions net/ipv4/tcp_ipv4.c
Original file line number Diff line number Diff line change
Expand Up @@ -508,9 +508,12 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
if (sk->sk_state == TCP_CLOSE)
goto out;

if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto out;
if (static_branch_unlikely(&ip4_min_ttl)) {
/* min_ttl can be changed concurrently from do_ip_setsockopt() */
if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto out;
}
}

tp = tcp_sk(sk);
Expand Down Expand Up @@ -1703,7 +1706,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb);
if (dst) {
if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
!INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
dst, 0)) {
dst_release(dst);
Expand Down Expand Up @@ -1788,7 +1791,7 @@ int tcp_v4_early_demux(struct sk_buff *skb)
if (dst)
dst = dst_check(dst, 0);
if (dst &&
inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
sk->sk_rx_dst_ifindex == skb->skb_iif)
skb_dst_set_noref(skb, dst);
}
}
Expand Down Expand Up @@ -2068,9 +2071,13 @@ int tcp_v4_rcv(struct sk_buff *skb)
return 0;
}
}
if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse;

if (static_branch_unlikely(&ip4_min_ttl)) {
/* min_ttl can be changed concurrently from do_ip_setsockopt() */
if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse;
}
}

if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
Expand Down Expand Up @@ -2195,7 +2202,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)

if (dst && dst_hold_safe(dst)) {
sk->sk_rx_dst = dst;
inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
sk->sk_rx_dst_ifindex = skb->skb_iif;
}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);
Expand Down
11 changes: 10 additions & 1 deletion net/ipv6/ipv6_sockglue.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@
struct ip6_ra_chain *ip6_ra_chain;
DEFINE_RWLOCK(ip6_ra_lock);

DEFINE_STATIC_KEY_FALSE(ip6_min_hopcount);

int ip6_ra_control(struct sock *sk, int sel)
{
struct ip6_ra_chain *ra, *new_ra, **rap;
Expand Down Expand Up @@ -950,7 +952,14 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
goto e_inval;
if (val < 0 || val > 255)
goto e_inval;
np->min_hopcount = val;

if (val)
static_branch_enable(&ip6_min_hopcount);

/* tcp_v6_err() and tcp_v6_rcv() might read min_hopcount
* while we are changing it.
*/
WRITE_ONCE(np->min_hopcount, val);
retv = 0;
break;
case IPV6_DONTFRAG:
Expand Down
35 changes: 21 additions & 14 deletions net/ipv6/tcp_ipv6.c
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,8 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
const struct rt6_info *rt = (const struct rt6_info *)dst;

sk->sk_rx_dst = dst;
inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
tcp_inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);
sk->sk_rx_dst_ifindex = skb->skb_iif;
sk->sk_rx_dst_cookie = rt6_get_cookie(rt);
}
}

Expand Down Expand Up @@ -414,9 +414,12 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (sk->sk_state == TCP_CLOSE)
goto out;

if (ipv6_hdr(skb)->hop_limit < tcp_inet6_sk(sk)->min_hopcount) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto out;
if (static_branch_unlikely(&ip6_min_hopcount)) {
/* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */
if (ipv6_hdr(skb)->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto out;
}
}

tp = tcp_sk(sk);
Expand Down Expand Up @@ -569,7 +572,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
static void tcp_v6_reqsk_destructor(struct request_sock *req)
{
kfree(inet_rsk(req)->ipv6_opt);
kfree_skb(inet_rsk(req)->pktopts);
consume_skb(inet_rsk(req)->pktopts);
}

#ifdef CONFIG_TCP_MD5SIG
Expand Down Expand Up @@ -1509,9 +1512,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb);
if (dst) {
if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
INDIRECT_CALL_1(dst->ops->check, ip6_dst_check,
dst, np->rx_dst_cookie) == NULL) {
dst, sk->sk_rx_dst_cookie) == NULL) {
dst_release(dst);
sk->sk_rx_dst = NULL;
}
Expand Down Expand Up @@ -1591,7 +1594,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
}
}

kfree_skb(opt_skb);
consume_skb(opt_skb);
return 0;
}

Expand Down Expand Up @@ -1726,9 +1729,13 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
return 0;
}
}
if (hdr->hop_limit < tcp_inet6_sk(sk)->min_hopcount) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse;

if (static_branch_unlikely(&ip6_min_hopcount)) {
/* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */
if (hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) {
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse;
}
}

if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
Expand Down Expand Up @@ -1872,9 +1879,9 @@ INDIRECT_CALLABLE_SCOPE void tcp_v6_early_demux(struct sk_buff *skb)
struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

if (dst)
dst = dst_check(dst, tcp_inet6_sk(sk)->rx_dst_cookie);
dst = dst_check(dst, sk->sk_rx_dst_cookie);
if (dst &&
inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
sk->sk_rx_dst_ifindex == skb->skb_iif)
skb_dst_set_noref(skb, dst);
}
}
Expand Down
4 changes: 2 additions & 2 deletions net/ipv6/udp.c
Original file line number Diff line number Diff line change
Expand Up @@ -884,7 +884,7 @@ static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
if (udp_sk_rx_dst_set(sk, dst)) {
const struct rt6_info *rt = (const struct rt6_info *)dst;

inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);
sk->sk_rx_dst_cookie = rt6_get_cookie(rt);
}
}

Expand Down Expand Up @@ -1073,7 +1073,7 @@ INDIRECT_CALLABLE_SCOPE void udp_v6_early_demux(struct sk_buff *skb)
dst = READ_ONCE(sk->sk_rx_dst);

if (dst)
dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie);
dst = dst_check(dst, sk->sk_rx_dst_cookie);
if (dst) {
/* set noref for now.
* any place which wants to hold dst has to call
Expand Down

0 comments on commit e43b76a

Please sign in to comment.