From a4298e4522d687a79af8f8fbb7eca68399ab2d81 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 1 Apr 2016 08:52:12 -0700
Subject: [PATCH 01/11] net: add SOCK_RCU_FREE socket flag

We want a generic way to insert an RCU grace period before socket
freeing for cases where SLAB_DESTROY_BY_RCU is adding too much
overhead.

SLAB_DESTROY_BY_RCU strict rules force us to take a reference on the
socket sk_refcnt, and this is a performance problem for UDP
encapsulation and for TCP synflood behavior, as many CPUs might
attempt the atomic operations on a shared sk_refcnt.

UDP sockets and TCP listeners can set SOCK_RCU_FREE so that their
lookup can use traditional RCU rules, without refcount changes.
They can set the flag only once hashed and visible to other CPUs.

Signed-off-by: Eric Dumazet
Cc: Tom Herbert
Tested-by: Tom Herbert
Signed-off-by: David S. Miller
---
 include/net/sock.h |  2 ++
 net/core/sock.c    | 14 +++++++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index e91b87f54f99b..9e77353a92ae1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -438,6 +438,7 @@ struct sock {
 					struct sk_buff *skb);
 	void			(*sk_destruct)(struct sock *sk);
 	struct sock_reuseport __rcu	*sk_reuseport_cb;
+	struct rcu_head		sk_rcu;
 };
 
 #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))
@@ -720,6 +721,7 @@ enum sock_flags {
 		     */
 	SOCK_FILTER_LOCKED,	/* Filter cannot be changed anymore */
 	SOCK_SELECT_ERR_QUEUE,	/* Wake select on error queue */
+	SOCK_RCU_FREE,		/* wait rcu grace period in sk_destruct() */
 };
 
 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))

diff --git a/net/core/sock.c b/net/core/sock.c
index 315f5e57fffed..7a6a063b28b3c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1419,8 +1419,12 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 }
 EXPORT_SYMBOL(sk_alloc);
 
-void sk_destruct(struct sock *sk)
+/* Sockets having SOCK_RCU_FREE will call this function after one RCU
+ * grace period. This is the case for UDP sockets and TCP listeners.
+ */
+static void __sk_destruct(struct rcu_head *head)
 {
+	struct sock *sk = container_of(head, struct sock, sk_rcu);
 	struct sk_filter *filter;
 
 	if (sk->sk_destruct)
@@ -1449,6 +1453,14 @@ void sk_destruct(struct sock *sk)
 	sk_prot_free(sk->sk_prot_creator, sk);
 }
 
+void sk_destruct(struct sock *sk)
+{
+	if (sock_flag(sk, SOCK_RCU_FREE))
+		call_rcu(&sk->sk_rcu, __sk_destruct);
+	else
+		__sk_destruct(&sk->sk_rcu);
+}
+
 static void __sk_free(struct sock *sk)
 {
 	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))

From ca065d0cf80fa547724440a8bf37f1e674d917c0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 1 Apr 2016 08:52:13 -0700
Subject: [PATCH 02/11] udp: no longer use SLAB_DESTROY_BY_RCU

Tom Herbert would like to avoid touching the UDP socket refcnt for
encapsulated traffic. For this to happen, we need to use normal RCU
rules, with a grace period before freeing a socket.

UDP sockets are not short lived in the high usage case, so the added
cost of call_rcu() should not be a concern.

This actually removes a lot of complexity in the UDP stack.

Multicast receives no longer need to hold a bucket spinlock.

Note that ip early demux still needs to take a reference on the
socket. Same remark for functions used by the xt_socket and xt_TPROXY
netfilter modules, but this might be changed later.

Performance for a single UDP socket receiving flood traffic from
many RX queues/cpus.
Simple udp_rx using simple recvfrom() loop : 438 kpps instead of 374 kpps : 17 % increase of the peak rate. v2: Addressed Willem de Bruijn feedback in multicast handling - keep early demux break in __udp4_lib_demux_lookup() Signed-off-by: Eric Dumazet Cc: Tom Herbert Cc: Willem de Bruijn Tested-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/udp.h | 8 +- include/net/sock.h | 12 +- include/net/udp.h | 2 +- net/ipv4/udp.c | 293 +++++++++++++------------------------------- net/ipv4/udp_diag.c | 18 +-- net/ipv6/udp.c | 196 ++++++++++------------------- 6 files changed, 171 insertions(+), 358 deletions(-) diff --git a/include/linux/udp.h b/include/linux/udp.h index 87c094961bd58..32342754643a7 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -98,11 +98,11 @@ static inline bool udp_get_no_check6_rx(struct sock *sk) return udp_sk(sk)->no_check6_rx; } -#define udp_portaddr_for_each_entry(__sk, node, list) \ - hlist_nulls_for_each_entry(__sk, node, list, __sk_common.skc_portaddr_node) +#define udp_portaddr_for_each_entry(__sk, list) \ + hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node) -#define udp_portaddr_for_each_entry_rcu(__sk, node, list) \ - hlist_nulls_for_each_entry_rcu(__sk, node, list, __sk_common.skc_portaddr_node) +#define udp_portaddr_for_each_entry_rcu(__sk, list) \ + hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node) #define IS_UDPLITE(__sk) (udp_sk(__sk)->pcflag) diff --git a/include/net/sock.h b/include/net/sock.h index 9e77353a92ae1..7ad73db9dde2c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -178,7 +178,7 @@ struct sock_common { int skc_bound_dev_if; union { struct hlist_node skc_bind_node; - struct hlist_nulls_node skc_portaddr_node; + struct hlist_node skc_portaddr_node; }; struct proto *skc_prot; possible_net_t skc_net; @@ -670,18 +670,18 @@ static inline void sk_add_bind_node(struct sock *sk, hlist_for_each_entry(__sk, list, sk_bind_node) /** - * sk_nulls_for_each_entry_offset - iterate over a list at a given struct offset + * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @head: the head for your list. * @offset: offset of hlist_node within the struct. 
* */ -#define sk_nulls_for_each_entry_offset(tpos, pos, head, offset) \ - for (pos = (head)->first; \ - (!is_a_nulls(pos)) && \ +#define sk_for_each_entry_offset_rcu(tpos, pos, head, offset) \ + for (pos = rcu_dereference((head)->first); \ + pos != NULL && \ ({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;}); \ - pos = pos->next) + pos = rcu_dereference(pos->next)) static inline struct user_namespace *sk_user_ns(struct sock *sk) { diff --git a/include/net/udp.h b/include/net/udp.h index 92927f729ac86..d870ec1611c4e 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -59,7 +59,7 @@ struct udp_skb_cb { * @lock: spinlock protecting changes to head/count */ struct udp_hslot { - struct hlist_nulls_head head; + struct hlist_head head; int count; spinlock_t lock; } __attribute__((aligned(2 * sizeof(long)))); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 45ff590661f4b..355bdb221057c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -143,10 +143,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, unsigned int log) { struct sock *sk2; - struct hlist_nulls_node *node; kuid_t uid = sock_i_uid(sk); - sk_nulls_for_each(sk2, node, &hslot->head) { + sk_for_each(sk2, &hslot->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && (bitmap || udp_sk(sk2)->udp_port_hash == num) && @@ -177,12 +176,11 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, bool match_wildcard)) { struct sock *sk2; - struct hlist_nulls_node *node; kuid_t uid = sock_i_uid(sk); int res = 0; spin_lock(&hslot2->lock); - udp_portaddr_for_each_entry(sk2, node, &hslot2->head) { + udp_portaddr_for_each_entry(sk2, &hslot2->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && (udp_sk(sk2)->udp_port_hash == num) && @@ -207,11 +205,10 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot, bool match_wildcard)) { struct net *net = sock_net(sk); - struct hlist_nulls_node *node; kuid_t uid = sock_i_uid(sk); struct sock *sk2; - sk_nulls_for_each(sk2, node, &hslot->head) { + sk_for_each(sk2, &hslot->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && sk2->sk_family == sk->sk_family && @@ -333,17 +330,18 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, goto fail_unlock; } - sk_nulls_add_node_rcu(sk, &hslot->head); + sk_add_node_rcu(sk, &hslot->head); hslot->count++; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock(&hslot2->lock); - hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, &hslot2->head); hslot2->count++; spin_unlock(&hslot2->lock); } + sock_set_flag(sk, SOCK_RCU_FREE); error = 0; fail_unlock: spin_unlock_bh(&hslot->lock); @@ -497,37 +495,27 @@ static struct sock *udp4_lib_lookup2(struct net *net, struct sk_buff *skb) { struct sock *sk, *result; - struct hlist_nulls_node *node; int score, badness, matches = 0, reuseport = 0; - bool select_ok = true; u32 hash = 0; -begin: result = NULL; badness = 0; - udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { + udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score2(sk, net, saddr, sport, daddr, hnum, dif); if (score > badness) { - result = sk; - badness = score; reuseport = sk->sk_reuseport; if (reuseport) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); - if (select_ok) { - struct sock *sk2; - - sk2 = reuseport_select_sock(sk, hash, skb, + result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); - if (sk2) { - result = sk2; - select_ok = 
false; - goto found; - } - } + if (result) + return result; matches = 1; } + badness = score; + result = sk; } else if (score == badness && reuseport) { matches++; if (reciprocal_scale(hash, matches) == 0) @@ -535,23 +523,6 @@ static struct sock *udp4_lib_lookup2(struct net *net, hash = next_pseudo_random32(hash); } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != slot2) - goto begin; - if (result) { -found: - if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(compute_score2(result, net, saddr, sport, - daddr, hnum, dif) < badness)) { - sock_put(result); - goto begin; - } - } return result; } @@ -563,15 +534,12 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, int dif, struct udp_table *udptable, struct sk_buff *skb) { struct sock *sk, *result; - struct hlist_nulls_node *node; unsigned short hnum = ntohs(dport); unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; int score, badness, matches = 0, reuseport = 0; - bool select_ok = true; u32 hash = 0; - rcu_read_lock(); if (hslot->count > 10) { hash2 = udp4_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; @@ -593,35 +561,27 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, htonl(INADDR_ANY), hnum, dif, hslot2, slot2, skb); } - rcu_read_unlock(); return result; } begin: result = NULL; badness = 0; - sk_nulls_for_each_rcu(sk, node, &hslot->head) { + sk_for_each_rcu(sk, &hslot->head) { score = compute_score(sk, net, saddr, hnum, sport, daddr, dport, dif); if (score > badness) { - result = sk; - badness = score; reuseport = sk->sk_reuseport; if (reuseport) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); - if (select_ok) { - struct sock *sk2; - - sk2 = reuseport_select_sock(sk, hash, skb, + result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); - if (sk2) { - result = sk2; - select_ok = false; - goto found; - } - } + if (result) + return result; matches = 1; } + result = sk; + badness = score; } else if (score == badness && reuseport) { matches++; if (reciprocal_scale(hash, matches) == 0) @@ -629,25 +589,6 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, hash = next_pseudo_random32(hash); } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != slot) - goto begin; - - if (result) { -found: - if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(compute_score(result, net, saddr, hnum, sport, - daddr, dport, dif) < badness)) { - sock_put(result); - goto begin; - } - } - rcu_read_unlock(); return result; } EXPORT_SYMBOL_GPL(__udp4_lib_lookup); @@ -663,13 +604,24 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, udptable, skb); } +/* Must be called under rcu_read_lock(). + * Does increment socket refcount. 
+ */ +#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \ + IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif) { - return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, - &udp_table, NULL); + struct sock *sk; + + sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport, + dif, &udp_table, NULL); + if (sk && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; + return sk; } EXPORT_SYMBOL_GPL(udp4_lib_lookup); +#endif static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, __be16 loc_port, __be32 loc_addr, @@ -771,7 +723,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) sk->sk_err = err; sk->sk_error_report(sk); out: - sock_put(sk); + return; } void udp_err(struct sk_buff *skb, u32 info) @@ -1474,13 +1426,13 @@ void udp_lib_unhash(struct sock *sk) spin_lock_bh(&hslot->lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); - if (sk_nulls_del_node_init_rcu(sk)) { + if (sk_del_node_init_rcu(sk)) { hslot->count--; inet_sk(sk)->inet_num = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_lock(&hslot2->lock); - hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); + hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); hslot2->count--; spin_unlock(&hslot2->lock); } @@ -1513,12 +1465,12 @@ void udp_lib_rehash(struct sock *sk, u16 newhash) if (hslot2 != nhslot2) { spin_lock(&hslot2->lock); - hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); + hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); hslot2->count--; spin_unlock(&hslot2->lock); spin_lock(&nhslot2->lock); - hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, &nhslot2->head); nhslot2->count++; spin_unlock(&nhslot2->lock); @@ -1697,35 +1649,6 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return -1; } -static void flush_stack(struct sock **stack, unsigned int count, - struct sk_buff *skb, unsigned int final) -{ - unsigned int i; - struct sk_buff *skb1 = NULL; - struct sock *sk; - - for (i = 0; i < count; i++) { - sk = stack[i]; - if (likely(!skb1)) - skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC); - - if (!skb1) { - atomic_inc(&sk->sk_drops); - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, - IS_UDPLITE(sk)); - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, - IS_UDPLITE(sk)); - } - - if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0) - skb1 = NULL; - - sock_put(sk); - } - if (unlikely(skb1)) - kfree_skb(skb1); -} - /* For TCP sockets, sk_rx_dst is protected by socket lock * For UDP, we use xchg() to guard against concurrent changes. 
*/ @@ -1749,14 +1672,14 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udp_table *udptable, int proto) { - struct sock *sk, *stack[256 / sizeof(struct sock *)]; - struct hlist_nulls_node *node; + struct sock *sk, *first = NULL; unsigned short hnum = ntohs(uh->dest); struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); - int dif = skb->dev->ifindex; - unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); - bool inner_flushed = false; + unsigned int offset = offsetof(typeof(*sk), sk_node); + int dif = skb->dev->ifindex; + struct hlist_node *node; + struct sk_buff *nskb; if (use_hash2) { hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & @@ -1767,23 +1690,28 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } - spin_lock(&hslot->lock); - sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) { - if (__udp_is_mcast_sock(net, sk, - uh->dest, daddr, - uh->source, saddr, - dif, hnum)) { - if (unlikely(count == ARRAY_SIZE(stack))) { - flush_stack(stack, count, skb, ~0); - inner_flushed = true; - count = 0; - } - stack[count++] = sk; - sock_hold(sk); + sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) { + if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr, + uh->source, saddr, dif, hnum)) + continue; + + if (!first) { + first = sk; + continue; } - } + nskb = skb_clone(skb, GFP_ATOMIC); - spin_unlock(&hslot->lock); + if (unlikely(!nskb)) { + atomic_inc(&sk->sk_drops); + UDP_INC_STATS_BH(net, UDP_MIB_RCVBUFERRORS, + IS_UDPLITE(sk)); + UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, + IS_UDPLITE(sk)); + continue; + } + if (udp_queue_rcv_skb(sk, nskb) > 0) + consume_skb(nskb); + } /* Also lookup *:port if we are using hash2 and haven't done so yet. 
*/ if (use_hash2 && hash2 != hash2_any) { @@ -1791,16 +1719,13 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, goto start_lookup; } - /* - * do the slow work with no lock held - */ - if (count) { - flush_stack(stack, count, skb, count - 1); + if (first) { + if (udp_queue_rcv_skb(first, skb) > 0) + consume_skb(skb); } else { - if (!inner_flushed) - UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, - proto == IPPROTO_UDPLITE); - consume_skb(skb); + kfree_skb(skb); + UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, + proto == IPPROTO_UDPLITE); } return 0; } @@ -1897,7 +1822,6 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, inet_compute_pseudo); ret = udp_queue_rcv_skb(sk, skb); - sock_put(sk); /* a return value > 0 means to resubmit the input, but * it wants the return to be -protocol, or 0 @@ -1958,49 +1882,24 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, int dif) { struct sock *sk, *result; - struct hlist_nulls_node *node; unsigned short hnum = ntohs(loc_port); - unsigned int count, slot = udp_hashfn(net, hnum, udp_table.mask); + unsigned int slot = udp_hashfn(net, hnum, udp_table.mask); struct udp_hslot *hslot = &udp_table.hash[slot]; /* Do not bother scanning a too big list */ if (hslot->count > 10) return NULL; - rcu_read_lock(); -begin: - count = 0; result = NULL; - sk_nulls_for_each_rcu(sk, node, &hslot->head) { - if (__udp_is_mcast_sock(net, sk, - loc_port, loc_addr, - rmt_port, rmt_addr, - dif, hnum)) { + sk_for_each_rcu(sk, &hslot->head) { + if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr, + rmt_port, rmt_addr, dif, hnum)) { + if (result) + return NULL; result = sk; - ++count; - } - } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. 
- */ - if (get_nulls_value(node) != slot) - goto begin; - - if (result) { - if (count != 1 || - unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(!__udp_is_mcast_sock(net, result, - loc_port, loc_addr, - rmt_port, rmt_addr, - dif, hnum))) { - sock_put(result); - result = NULL; } } - rcu_read_unlock(); + return result; } @@ -2013,37 +1912,22 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, __be16 rmt_port, __be32 rmt_addr, int dif) { - struct sock *sk, *result; - struct hlist_nulls_node *node; unsigned short hnum = ntohs(loc_port); unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum); unsigned int slot2 = hash2 & udp_table.mask; struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); + struct sock *sk; - rcu_read_lock(); - result = NULL; - udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { - if (INET_MATCH(sk, net, acookie, - rmt_addr, loc_addr, ports, dif)) - result = sk; + udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { + if (INET_MATCH(sk, net, acookie, rmt_addr, + loc_addr, ports, dif)) + return sk; /* Only check first socket in chain */ break; } - - if (result) { - if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(!INET_MATCH(sk, net, acookie, - rmt_addr, loc_addr, - ports, dif))) { - sock_put(result); - result = NULL; - } - } - rcu_read_unlock(); - return result; + return NULL; } void udp_v4_early_demux(struct sk_buff *skb) @@ -2051,7 +1935,7 @@ void udp_v4_early_demux(struct sk_buff *skb) struct net *net = dev_net(skb->dev); const struct iphdr *iph; const struct udphdr *uh; - struct sock *sk; + struct sock *sk = NULL; struct dst_entry *dst; int dif = skb->dev->ifindex; int ours; @@ -2083,11 +1967,9 @@ void udp_v4_early_demux(struct sk_buff *skb) } else if (skb->pkt_type == PACKET_HOST) { sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, dif); - } else { - return; } - if (!sk) + if (!sk || !atomic_inc_not_zero_hint(&sk->sk_refcnt, 2)) return; skb->sk = sk; @@ -2387,14 +2269,13 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) for (state->bucket = start; state->bucket <= state->udp_table->mask; ++state->bucket) { - struct hlist_nulls_node *node; struct udp_hslot *hslot = &state->udp_table->hash[state->bucket]; - if (hlist_nulls_empty(&hslot->head)) + if (hlist_empty(&hslot->head)) continue; spin_lock_bh(&hslot->lock); - sk_nulls_for_each(sk, node, &hslot->head) { + sk_for_each(sk, &hslot->head) { if (!net_eq(sock_net(sk), net)) continue; if (sk->sk_family == state->family) @@ -2413,7 +2294,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) struct net *net = seq_file_net(seq); do { - sk = sk_nulls_next(sk); + sk = sk_next(sk); } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); if (!sk) { @@ -2622,12 +2503,12 @@ void __init udp_table_init(struct udp_table *table, const char *name) table->hash2 = table->hash + (table->mask + 1); for (i = 0; i <= table->mask; i++) { - INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i); + INIT_HLIST_HEAD(&table->hash[i].head); table->hash[i].count = 0; spin_lock_init(&table->hash[i].lock); } for (i = 0; i <= table->mask; i++) { - INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i); + INIT_HLIST_HEAD(&table->hash2[i].head); table->hash2[i].count = 0; spin_lock_init(&table->hash2[i].lock); } diff --git a/net/ipv4/udp_diag.c 
b/net/ipv4/udp_diag.c index df1966f3b6ecc..3d5ccf4b14120 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -36,10 +36,11 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { int err = -EINVAL; - struct sock *sk; + struct sock *sk = NULL; struct sk_buff *rep; struct net *net = sock_net(in_skb->sk); + rcu_read_lock(); if (req->sdiag_family == AF_INET) sk = __udp4_lib_lookup(net, req->id.idiag_src[0], req->id.idiag_sport, @@ -54,9 +55,9 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, req->id.idiag_dport, req->id.idiag_if, tbl, NULL); #endif - else - goto out_nosk; - + if (sk && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; + rcu_read_unlock(); err = -ENOENT; if (!sk) goto out_nosk; @@ -96,24 +97,23 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc) { - int num, s_num, slot, s_slot; struct net *net = sock_net(skb->sk); + int num, s_num, slot, s_slot; s_slot = cb->args[0]; num = s_num = cb->args[1]; for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) { - struct sock *sk; - struct hlist_nulls_node *node; struct udp_hslot *hslot = &table->hash[slot]; + struct sock *sk; num = 0; - if (hlist_nulls_empty(&hslot->head)) + if (hlist_empty(&hslot->head)) continue; spin_lock_bh(&hslot->lock); - sk_nulls_for_each(sk, node, &hslot->head) { + sk_for_each(sk, &hslot->head) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b772a7641fbdd..78a7dfd127072 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -213,37 +213,28 @@ static struct sock *udp6_lib_lookup2(struct net *net, struct sk_buff *skb) { struct sock *sk, *result; - struct hlist_nulls_node *node; int score, badness, matches = 0, reuseport = 0; - bool select_ok = true; u32 hash = 0; -begin: result = NULL; badness = -1; - udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { + udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score2(sk, net, saddr, sport, daddr, hnum, dif); if (score > badness) { - result = sk; - badness = score; reuseport = sk->sk_reuseport; if (reuseport) { hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); - if (select_ok) { - struct sock *sk2; - sk2 = reuseport_select_sock(sk, hash, skb, + result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); - if (sk2) { - result = sk2; - select_ok = false; - goto found; - } - } + if (result) + return result; matches = 1; } + result = sk; + badness = score; } else if (score == badness && reuseport) { matches++; if (reciprocal_scale(hash, matches) == 0) @@ -251,27 +242,10 @@ static struct sock *udp6_lib_lookup2(struct net *net, hash = next_pseudo_random32(hash); } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. 
- */ - if (get_nulls_value(node) != slot2) - goto begin; - - if (result) { -found: - if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(compute_score2(result, net, saddr, sport, - daddr, hnum, dif) < badness)) { - sock_put(result); - goto begin; - } - } return result; } +/* rcu_read_lock() must be held */ struct sock *__udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, @@ -279,15 +253,12 @@ struct sock *__udp6_lib_lookup(struct net *net, struct sk_buff *skb) { struct sock *sk, *result; - struct hlist_nulls_node *node; unsigned short hnum = ntohs(dport); unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; int score, badness, matches = 0, reuseport = 0; - bool select_ok = true; u32 hash = 0; - rcu_read_lock(); if (hslot->count > 10) { hash2 = udp6_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; @@ -309,34 +280,26 @@ struct sock *__udp6_lib_lookup(struct net *net, &in6addr_any, hnum, dif, hslot2, slot2, skb); } - rcu_read_unlock(); return result; } begin: result = NULL; badness = -1; - sk_nulls_for_each_rcu(sk, node, &hslot->head) { + sk_for_each_rcu(sk, &hslot->head) { score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif); if (score > badness) { - result = sk; - badness = score; reuseport = sk->sk_reuseport; if (reuseport) { hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); - if (select_ok) { - struct sock *sk2; - - sk2 = reuseport_select_sock(sk, hash, skb, + result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); - if (sk2) { - result = sk2; - select_ok = false; - goto found; - } - } + if (result) + return result; matches = 1; } + result = sk; + badness = score; } else if (score == badness && reuseport) { matches++; if (reciprocal_scale(hash, matches) == 0) @@ -344,25 +307,6 @@ struct sock *__udp6_lib_lookup(struct net *net, hash = next_pseudo_random32(hash); } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != slot) - goto begin; - - if (result) { -found: - if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(compute_score(result, net, hnum, saddr, sport, - daddr, dport, dif) < badness)) { - sock_put(result); - goto begin; - } - } - rcu_read_unlock(); return result; } EXPORT_SYMBOL_GPL(__udp6_lib_lookup); @@ -382,12 +326,24 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, udptable, skb); } +/* Must be called under rcu_read_lock(). + * Does increment socket refcount. 
+ */ +#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \ + IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif) { - return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table, NULL); + struct sock *sk; + + sk = __udp6_lib_lookup(net, saddr, sport, daddr, dport, + dif, &udp_table, NULL); + if (sk && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; + return sk; } EXPORT_SYMBOL_GPL(udp6_lib_lookup); +#endif /* * This should be easy, if there is something there we @@ -585,7 +541,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, sk->sk_err = err; sk->sk_error_report(sk); out: - sock_put(sk); + return; } static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) @@ -747,33 +703,6 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, return true; } -static void flush_stack(struct sock **stack, unsigned int count, - struct sk_buff *skb, unsigned int final) -{ - struct sk_buff *skb1 = NULL; - struct sock *sk; - unsigned int i; - - for (i = 0; i < count; i++) { - sk = stack[i]; - if (likely(!skb1)) - skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC); - if (!skb1) { - atomic_inc(&sk->sk_drops); - UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, - IS_UDPLITE(sk)); - UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, - IS_UDPLITE(sk)); - } - - if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0) - skb1 = NULL; - sock_put(sk); - } - if (unlikely(skb1)) - kfree_skb(skb1); -} - static void udp6_csum_zero_error(struct sk_buff *skb) { /* RFC 2460 section 8.1 says that we SHOULD log @@ -792,15 +721,15 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, struct udp_table *udptable, int proto) { - struct sock *sk, *stack[256 / sizeof(struct sock *)]; + struct sock *sk, *first = NULL; const struct udphdr *uh = udp_hdr(skb); - struct hlist_nulls_node *node; unsigned short hnum = ntohs(uh->dest); struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); - int dif = inet6_iif(skb); - unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node); + unsigned int offset = offsetof(typeof(*sk), sk_node); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); - bool inner_flushed = false; + int dif = inet6_iif(skb); + struct hlist_node *node; + struct sk_buff *nskb; if (use_hash2) { hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) & @@ -811,27 +740,32 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } - spin_lock(&hslot->lock); - sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) { - if (__udp_v6_is_mcast_sock(net, sk, - uh->dest, daddr, - uh->source, saddr, - dif, hnum) && - /* If zero checksum and no_check is not on for - * the socket then skip it. - */ - (uh->check || udp_sk(sk)->no_check6_rx)) { - if (unlikely(count == ARRAY_SIZE(stack))) { - flush_stack(stack, count, skb, ~0); - inner_flushed = true; - count = 0; - } - stack[count++] = sk; - sock_hold(sk); + sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) { + if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr, + uh->source, saddr, dif, hnum)) + continue; + /* If zero checksum and no_check is not on for + * the socket then skip it. 
+ */ + if (!uh->check && !udp_sk(sk)->no_check6_rx) + continue; + if (!first) { + first = sk; + continue; + } + nskb = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!nskb)) { + atomic_inc(&sk->sk_drops); + UDP6_INC_STATS_BH(net, UDP_MIB_RCVBUFERRORS, + IS_UDPLITE(sk)); + UDP6_INC_STATS_BH(net, UDP_MIB_INERRORS, + IS_UDPLITE(sk)); + continue; } - } - spin_unlock(&hslot->lock); + if (udpv6_queue_rcv_skb(sk, nskb) > 0) + consume_skb(nskb); + } /* Also lookup *:port if we are using hash2 and haven't done so yet. */ if (use_hash2 && hash2 != hash2_any) { @@ -839,13 +773,13 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, goto start_lookup; } - if (count) { - flush_stack(stack, count, skb, count - 1); + if (first) { + if (udpv6_queue_rcv_skb(first, skb) > 0) + consume_skb(skb); } else { - if (!inner_flushed) - UDP6_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, - proto == IPPROTO_UDPLITE); - consume_skb(skb); + kfree_skb(skb); + UDP6_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, + proto == IPPROTO_UDPLITE); } return 0; } @@ -853,10 +787,10 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { + const struct in6_addr *saddr, *daddr; struct net *net = dev_net(skb->dev); - struct sock *sk; struct udphdr *uh; - const struct in6_addr *saddr, *daddr; + struct sock *sk; u32 ulen = 0; if (!pskb_may_pull(skb, sizeof(struct udphdr))) @@ -910,7 +844,6 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int ret; if (!uh->check && !udp_sk(sk)->no_check6_rx) { - sock_put(sk); udp6_csum_zero_error(skb); goto csum_error; } @@ -920,7 +853,6 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, ip6_compute_pseudo); ret = udpv6_queue_rcv_skb(sk, skb); - sock_put(sk); /* a return value > 0 means to resubmit the input */ if (ret > 0) From ee3cf32a4a5e6cf5ccc0f0de9865fda3ebc46436 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 Apr 2016 08:52:14 -0700 Subject: [PATCH 03/11] tcp/dccp: remove BH disable/enable in lookup Since linux 2.6.29, lookups only use rcu locking. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 7 +------ net/ipv6/inet6_hashtables.c | 2 -- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 50f635c2c5365..a77acee93aaf4 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -280,11 +280,8 @@ static inline struct sock *inet_lookup_listener(struct net *net, net_eq(sock_net(__sk), (__net))) #endif /* 64-bit arch */ -/* - * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need +/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need * not check it for lookups anymore, thanks Alexey. -DaveM - * - * Local BH must be disabled here. 
*/ struct sock *__inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, @@ -326,10 +323,8 @@ static inline struct sock *inet_lookup(struct net *net, { struct sock *sk; - local_bh_disable(); sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, dport, dif); - local_bh_enable(); return sk; } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 70f2628be6fac..d253f32874c90 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -200,10 +200,8 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, { struct sock *sk; - local_bh_disable(); sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, ntohs(dport), dif); - local_bh_enable(); return sk; } From 2d331915a04144dad738e725769d8fac06ef6155 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 Apr 2016 08:52:15 -0700 Subject: [PATCH 04/11] tcp/dccp: use rcu locking in inet_diag_find_one_icsk() RX packet processing holds rcu_read_lock(), so we can remove pairs of rcu_read_lock()/rcu_read_unlock() in lookup functions if inet_diag also holds rcu before calling them. This is needed anyway as __inet_lookup_listener() and inet6_lookup_listener() will soon no longer increment refcount on the found listener. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 7 +++++-- net/ipv4/inet_hashtables.c | 4 ---- net/ipv6/inet6_hashtables.c | 4 ---- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 5fdb02f5598eb..ea8df527b2799 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -356,6 +356,7 @@ struct sock *inet_diag_find_one_icsk(struct net *net, { struct sock *sk; + rcu_read_lock(); if (req->sdiag_family == AF_INET) sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], @@ -376,9 +377,11 @@ struct sock *inet_diag_find_one_icsk(struct net *net, req->id.idiag_if); } #endif - else + else { + rcu_read_unlock(); return ERR_PTR(-EINVAL); - + } + rcu_read_unlock(); if (!sk) return ERR_PTR(-ENOENT); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index bc68eced01057..387338d71dcde 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -220,7 +220,6 @@ struct sock *__inet_lookup_listener(struct net *net, bool select_ok = true; u32 phash = 0; - rcu_read_lock(); begin: result = NULL; hiscore = 0; @@ -269,7 +268,6 @@ struct sock *__inet_lookup_listener(struct net *net, goto begin; } } - rcu_read_unlock(); return result; } EXPORT_SYMBOL_GPL(__inet_lookup_listener); @@ -312,7 +310,6 @@ struct sock *__inet_lookup_established(struct net *net, unsigned int slot = hash & hashinfo->ehash_mask; struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; - rcu_read_lock(); begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) @@ -339,7 +336,6 @@ struct sock *__inet_lookup_established(struct net *net, out: sk = NULL; found: - rcu_read_unlock(); return sk; } EXPORT_SYMBOL_GPL(__inet_lookup_established); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index d253f32874c90..e6ef6ce1ed74b 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -69,7 +69,6 @@ struct sock *__inet6_lookup_established(struct net *net, struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; - rcu_read_lock(); begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) @@ -90,7 +89,6 @@ struct sock 
*__inet6_lookup_established(struct net *net, out: sk = NULL; found: - rcu_read_unlock(); return sk; } EXPORT_SYMBOL(__inet6_lookup_established); @@ -138,7 +136,6 @@ struct sock *inet6_lookup_listener(struct net *net, unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; - rcu_read_lock(); begin: result = NULL; hiscore = 0; @@ -187,7 +184,6 @@ struct sock *inet6_lookup_listener(struct net *net, goto begin; } } - rcu_read_unlock(); return result; } EXPORT_SYMBOL_GPL(inet6_lookup_listener); From 3a5d1c0e7cb5ba91aabbd7e28626e3cc925f8093 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 Apr 2016 08:52:16 -0700 Subject: [PATCH 05/11] inet: reqsk_alloc() needs to take care of dead listeners We'll soon no longer take a refcount on listeners, so reqsk_alloc() can not assume a listener refcount is not zero. We need to use atomic_inc_not_zero() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/request_sock.h | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/include/net/request_sock.h b/include/net/request_sock.h index f49759decb287..6ebe13eb1c4cb 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -85,24 +85,23 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, struct request_sock *req; req = kmem_cache_alloc(ops->slab, GFP_ATOMIC | __GFP_NOWARN); - - if (req) { - req->rsk_ops = ops; - if (attach_listener) { - sock_hold(sk_listener); - req->rsk_listener = sk_listener; - } else { - req->rsk_listener = NULL; + if (!req) + return NULL; + req->rsk_listener = NULL; + if (attach_listener) { + if (unlikely(!atomic_inc_not_zero(&sk_listener->sk_refcnt))) { + kmem_cache_free(ops->slab, req); + return NULL; } - req_to_sk(req)->sk_prot = sk_listener->sk_prot; - sk_node_init(&req_to_sk(req)->sk_node); - sk_tx_queue_clear(req_to_sk(req)); - req->saved_syn = NULL; - /* Following is temporary. It is coupled with debugging - * helpers in reqsk_put() & reqsk_free() - */ - atomic_set(&req->rsk_refcnt, 0); + req->rsk_listener = sk_listener; } + req->rsk_ops = ops; + req_to_sk(req)->sk_prot = sk_listener->sk_prot; + sk_node_init(&req_to_sk(req)->sk_node); + sk_tx_queue_clear(req_to_sk(req)); + req->saved_syn = NULL; + atomic_set(&req->rsk_refcnt, 0); + return req; } From 3b24d854cb35383c30642116e5992fd619bdc9bc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 Apr 2016 08:52:17 -0700 Subject: [PATCH 06/11] tcp/dccp: do not touch listener sk_refcnt under synflood When a SYNFLOOD targets a non SO_REUSEPORT listener, multiple cpus contend on sk->sk_refcnt and sk->sk_wmem_alloc changes. By letting listeners use SOCK_RCU_FREE infrastructure, we can relax TCP_LISTEN lookup rules and avoid touching sk_refcnt Note that we still use SLAB_DESTROY_BY_RCU rules for other sockets, only listeners are impacted by this change. Peak performance under SYNFLOOD is increased by ~33% : On my test machine, I could process 3.2 Mpps instead of 2.4 Mpps Most consuming functions are now skb_set_owner_w() and sock_wfree() contending on sk->sk_wmem_alloc when cooking SYNACK and freeing them. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/inet6_hashtables.h | 12 ++++-- include/net/inet_hashtables.h | 40 +++++++++++-------- net/dccp/ipv4.c | 7 +++- net/dccp/ipv6.c | 7 +++- net/ipv4/inet_diag.c | 3 +- net/ipv4/inet_hashtables.c | 73 ++++++++++++---------------------- net/ipv4/tcp_ipv4.c | 66 +++++++++++++++--------------- net/ipv6/inet6_hashtables.c | 56 +++++++------------------- net/ipv6/tcp_ipv6.c | 27 +++++++------ net/netfilter/xt_socket.c | 6 +-- 10 files changed, 134 insertions(+), 163 deletions(-) diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 28332bdac3338..b87becacd9d34 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -66,13 +66,15 @@ static inline struct sock *__inet6_lookup(struct net *net, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, - const int dif) + const int dif, + bool *refcounted) { struct sock *sk = __inet6_lookup_established(net, hashinfo, saddr, sport, daddr, hnum, dif); + *refcounted = true; if (sk) return sk; - + *refcounted = false; return inet6_lookup_listener(net, hashinfo, skb, doff, saddr, sport, daddr, hnum, dif); } @@ -81,17 +83,19 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, - int iif) + int iif, + bool *refcounted) { struct sock *sk = skb_steal_sock(skb); + *refcounted = true; if (sk) return sk; return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb, doff, &ipv6_hdr(skb)->saddr, sport, &ipv6_hdr(skb)->daddr, ntohs(dport), - iif); + iif, refcounted); } struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index a77acee93aaf4..0574493e38993 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -100,14 +100,10 @@ struct inet_bind_hashbucket { /* * Sockets can be hashed in established or listening table - * We must use different 'nulls' end-of-chain value for listening - * hash table, or we might find a socket that was closed and - * reallocated/inserted into established hash table */ -#define LISTENING_NULLS_BASE (1U << 29) struct inet_listen_hashbucket { spinlock_t lock; - struct hlist_nulls_head head; + struct hlist_head head; }; /* This is for listening sockets, thus all sockets which possess wildcards. */ @@ -304,14 +300,20 @@ static inline struct sock *__inet_lookup(struct net *net, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, - const int dif) + const int dif, + bool *refcounted) { u16 hnum = ntohs(dport); - struct sock *sk = __inet_lookup_established(net, hashinfo, - saddr, sport, daddr, hnum, dif); + struct sock *sk; - return sk ? 
: __inet_lookup_listener(net, hashinfo, skb, doff, saddr, - sport, daddr, hnum, dif); + sk = __inet_lookup_established(net, hashinfo, saddr, sport, + daddr, hnum, dif); + *refcounted = true; + if (sk) + return sk; + *refcounted = false; + return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, + sport, daddr, hnum, dif); } static inline struct sock *inet_lookup(struct net *net, @@ -322,10 +324,13 @@ static inline struct sock *inet_lookup(struct net *net, const int dif) { struct sock *sk; + bool refcounted; sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, - dport, dif); + dport, dif, &refcounted); + if (sk && !refcounted && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; return sk; } @@ -333,17 +338,20 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be16 sport, - const __be16 dport) + const __be16 dport, + bool *refcounted) { struct sock *sk = skb_steal_sock(skb); const struct iphdr *iph = ip_hdr(skb); + *refcounted = true; if (sk) return sk; - else - return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb, - doff, iph->saddr, sport, - iph->daddr, dport, inet_iif(skb)); + + return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb, + doff, iph->saddr, sport, + iph->daddr, dport, inet_iif(skb), + refcounted); } u32 sk_ehashfn(const struct sock *sk); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 9c67a961ba538..6438c5a7efc41 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -764,6 +764,7 @@ static int dccp_v4_rcv(struct sk_buff *skb) { const struct dccp_hdr *dh; const struct iphdr *iph; + bool refcounted; struct sock *sk; int min_cov; @@ -801,7 +802,7 @@ static int dccp_v4_rcv(struct sk_buff *skb) lookup: sk = __inet_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), - dh->dccph_sport, dh->dccph_dport); + dh->dccph_sport, dh->dccph_dport, &refcounted); if (!sk) { dccp_pr_debug("failed to look up flow ID in table and " "get corresponding socket\n"); @@ -830,6 +831,7 @@ static int dccp_v4_rcv(struct sk_buff *skb) goto lookup; } sock_hold(sk); + refcounted = true; nsk = dccp_check_req(sk, skb, req); if (!nsk) { reqsk_put(req); @@ -886,7 +888,8 @@ static int dccp_v4_rcv(struct sk_buff *skb) return 0; discard_and_relse: - sock_put(sk); + if (refcounted) + sock_put(sk); goto discard_it; } diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 4663a01d50399..71bf1deba4c5e 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -642,6 +642,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) static int dccp_v6_rcv(struct sk_buff *skb) { const struct dccp_hdr *dh; + bool refcounted; struct sock *sk; int min_cov; @@ -670,7 +671,7 @@ static int dccp_v6_rcv(struct sk_buff *skb) lookup: sk = __inet6_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), dh->dccph_sport, dh->dccph_dport, - inet6_iif(skb)); + inet6_iif(skb), &refcounted); if (!sk) { dccp_pr_debug("failed to look up flow ID in table and " "get corresponding socket\n"); @@ -699,6 +700,7 @@ static int dccp_v6_rcv(struct sk_buff *skb) goto lookup; } sock_hold(sk); + refcounted = true; nsk = dccp_check_req(sk, skb, req); if (!nsk) { reqsk_put(req); @@ -752,7 +754,8 @@ static int dccp_v6_rcv(struct sk_buff *skb) return 0; discard_and_relse: - sock_put(sk); + if (refcounted) + sock_put(sk); goto discard_it; } diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index ea8df527b2799..bd591eb81ec9c 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -775,13 +775,12 @@ void inet_diag_dump_icsk(struct 
inet_hashinfo *hashinfo, struct sk_buff *skb, for (i = s_i; i < INET_LHTABLE_SIZE; i++) { struct inet_listen_hashbucket *ilb; - struct hlist_nulls_node *node; struct sock *sk; num = 0; ilb = &hashinfo->listening_hash[i]; spin_lock_bh(&ilb->lock); - sk_nulls_for_each(sk, node, &ilb->head) { + sk_for_each(sk, &ilb->head) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 387338d71dcde..98ba03b6f87d3 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -198,13 +198,13 @@ static inline int compute_score(struct sock *sk, struct net *net, } /* - * Don't inline this cruft. Here are some nice properties to exploit here. The - * BSD API does not allow a listening sock to specify the remote port nor the + * Here are some nice properties to exploit here. The BSD API + * does not allow a listening sock to specify the remote port nor the * remote address for the connection. So always assume those are both * wildcarded during the search since they can never be otherwise. */ - +/* called with rcu_read_lock() : No refcount taken on the socket */ struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -212,37 +212,27 @@ struct sock *__inet_lookup_listener(struct net *net, const __be32 daddr, const unsigned short hnum, const int dif) { - struct sock *sk, *result; - struct hlist_nulls_node *node; unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; - int score, hiscore, matches = 0, reuseport = 0; - bool select_ok = true; + int score, hiscore = 0, matches = 0, reuseport = 0; + struct sock *sk, *result = NULL; u32 phash = 0; -begin: - result = NULL; - hiscore = 0; - sk_nulls_for_each_rcu(sk, node, &ilb->head) { + sk_for_each_rcu(sk, &ilb->head) { score = compute_score(sk, net, hnum, daddr, dif); if (score > hiscore) { - result = sk; - hiscore = score; reuseport = sk->sk_reuseport; if (reuseport) { phash = inet_ehashfn(net, daddr, hnum, saddr, sport); - if (select_ok) { - struct sock *sk2; - sk2 = reuseport_select_sock(sk, phash, - skb, doff); - if (sk2) { - result = sk2; - goto found; - } - } + result = reuseport_select_sock(sk, phash, + skb, doff); + if (result) + return result; matches = 1; } + result = sk; + hiscore = score; } else if (score == hiscore && reuseport) { matches++; if (reciprocal_scale(phash, matches) == 0) @@ -250,24 +240,6 @@ struct sock *__inet_lookup_listener(struct net *net, phash = next_pseudo_random32(phash); } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. 
- */ - if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE) - goto begin; - if (result) { -found: - if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) - result = NULL; - else if (unlikely(compute_score(result, net, hnum, daddr, - dif) < hiscore)) { - sock_put(result); - select_ok = false; - goto begin; - } - } return result; } EXPORT_SYMBOL_GPL(__inet_lookup_listener); @@ -508,7 +480,8 @@ int __inet_hash(struct sock *sk, struct sock *osk, if (err) goto unlock; } - __sk_nulls_add_node_rcu(sk, &ilb->head); + hlist_add_head_rcu(&sk->sk_node, &ilb->head); + sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: spin_unlock(&ilb->lock); @@ -535,20 +508,25 @@ void inet_unhash(struct sock *sk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; spinlock_t *lock; + bool listener = false; int done; if (sk_unhashed(sk)) return; - if (sk->sk_state == TCP_LISTEN) + if (sk->sk_state == TCP_LISTEN) { lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock; - else + listener = true; + } else { lock = inet_ehash_lockp(hashinfo, sk->sk_hash); - + } spin_lock_bh(lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); - done = __sk_nulls_del_node_init_rcu(sk); + if (listener) + done = __sk_del_node_init(sk); + else + done = __sk_nulls_del_node_init_rcu(sk); if (done) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock_bh(lock); @@ -684,9 +662,8 @@ void inet_hashinfo_init(struct inet_hashinfo *h) for (i = 0; i < INET_LHTABLE_SIZE; i++) { spin_lock_init(&h->listening_hash[i].lock); - INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head, - i + LISTENING_NULLS_BASE); - } + INIT_HLIST_HEAD(&h->listening_hash[i].head); + } } EXPORT_SYMBOL_GPL(inet_hashinfo_init); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ad450509029bc..e5f924b299462 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -628,6 +628,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) net = sk ? 
sock_net(sk) : dev_net(skb_dst(skb)->dev); #ifdef CONFIG_TCP_MD5SIG + rcu_read_lock(); hash_location = tcp_parse_md5sig_option(th); if (sk && sk_fullsock(sk)) { key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *) @@ -646,16 +647,18 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) ntohs(th->source), inet_iif(skb)); /* don't send rst if it can't find key */ if (!sk1) - return; - rcu_read_lock(); + goto out; + key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *) &ip_hdr(skb)->saddr, AF_INET); if (!key) - goto release_sk1; + goto out; + genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) - goto release_sk1; + goto out; + } if (key) { @@ -698,11 +701,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); #ifdef CONFIG_TCP_MD5SIG -release_sk1: - if (sk1) { - rcu_read_unlock(); - sock_put(sk1); - } +out: + rcu_read_unlock(); #endif } @@ -1538,11 +1538,12 @@ EXPORT_SYMBOL(tcp_prequeue); int tcp_v4_rcv(struct sk_buff *skb) { + struct net *net = dev_net(skb->dev); const struct iphdr *iph; const struct tcphdr *th; + bool refcounted; struct sock *sk; int ret; - struct net *net = dev_net(skb->dev); if (skb->pkt_type != PACKET_HOST) goto discard_it; @@ -1588,7 +1589,7 @@ int tcp_v4_rcv(struct sk_buff *skb) lookup: sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, - th->dest); + th->dest, &refcounted); if (!sk) goto no_tcp_socket; @@ -1609,7 +1610,11 @@ int tcp_v4_rcv(struct sk_buff *skb) inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } + /* We own a reference on the listener, increase it again + * as we might lose it too soon. + */ sock_hold(sk); + refcounted = true; nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); @@ -1665,7 +1670,8 @@ int tcp_v4_rcv(struct sk_buff *skb) bh_unlock_sock(sk); put_and_return: - sock_put(sk); + if (refcounted) + sock_put(sk); return ret; @@ -1688,7 +1694,8 @@ int tcp_v4_rcv(struct sk_buff *skb) return 0; discard_and_relse: - sock_put(sk); + if (refcounted) + sock_put(sk); goto discard_it; do_time_wait: @@ -1712,6 +1719,7 @@ int tcp_v4_rcv(struct sk_buff *skb) if (sk2) { inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; + refcounted = false; goto process; } /* Fall through to ACK */ @@ -1845,17 +1853,17 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); */ static void *listening_get_next(struct seq_file *seq, void *cur) { - struct inet_connection_sock *icsk; - struct hlist_nulls_node *node; - struct sock *sk = cur; - struct inet_listen_hashbucket *ilb; struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); + struct inet_listen_hashbucket *ilb; + struct inet_connection_sock *icsk; + struct sock *sk = cur; if (!sk) { +get_head: ilb = &tcp_hashinfo.listening_hash[st->bucket]; spin_lock_bh(&ilb->lock); - sk = sk_nulls_head(&ilb->head); + sk = sk_head(&ilb->head); st->offset = 0; goto get_sk; } @@ -1863,28 +1871,20 @@ static void *listening_get_next(struct seq_file *seq, void *cur) ++st->num; ++st->offset; - sk = sk_nulls_next(sk); + sk = sk_next(sk); get_sk: - sk_nulls_for_each_from(sk, node) { + sk_for_each_from(sk) { if (!net_eq(sock_net(sk), net)) continue; - if (sk->sk_family == st->family) { - cur = sk; - goto out; - } + if (sk->sk_family == st->family) + return sk; icsk = inet_csk(sk); } spin_unlock_bh(&ilb->lock); st->offset = 0; - if (++st->bucket < INET_LHTABLE_SIZE) { - ilb = &tcp_hashinfo.listening_hash[st->bucket]; - spin_lock_bh(&ilb->lock); - sk = 
sk_nulls_head(&ilb->head); - goto get_sk; - } - cur = NULL; -out: - return cur; + if (++st->bucket < INET_LHTABLE_SIZE) + goto get_head; + return NULL; } static void *listening_get_idx(struct seq_file *seq, loff_t *pos) diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index e6ef6ce1ed74b..607da088344d6 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -120,6 +120,7 @@ static inline int compute_score(struct sock *sk, struct net *net, return score; } +/* called with rcu_read_lock() */ struct sock *inet6_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -127,38 +128,27 @@ struct sock *inet6_lookup_listener(struct net *net, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif) { - struct sock *sk; - const struct hlist_nulls_node *node; - struct sock *result; - int score, hiscore, matches = 0, reuseport = 0; - bool select_ok = true; - u32 phash = 0; unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; + int score, hiscore = 0, matches = 0, reuseport = 0; + struct sock *sk, *result = NULL; + u32 phash = 0; -begin: - result = NULL; - hiscore = 0; - sk_nulls_for_each(sk, node, &ilb->head) { + sk_for_each(sk, &ilb->head) { score = compute_score(sk, net, hnum, daddr, dif); if (score > hiscore) { hiscore = score; - result = sk; - reuseport = sk->sk_reuseport; if (reuseport) { phash = inet6_ehashfn(net, daddr, hnum, saddr, sport); - if (select_ok) { - struct sock *sk2; - sk2 = reuseport_select_sock(sk, phash, - skb, doff); - if (sk2) { - result = sk2; - goto found; - } - } + result = reuseport_select_sock(sk, phash, + skb, doff); + if (result) + return result; matches = 1; } + result = sk; + reuseport = sk->sk_reuseport; } else if (score == hiscore && reuseport) { matches++; if (reciprocal_scale(phash, matches) == 0) @@ -166,24 +156,6 @@ struct sock *inet6_lookup_listener(struct net *net, phash = next_pseudo_random32(phash); } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. 
- */ - if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE) - goto begin; - if (result) { -found: - if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) - result = NULL; - else if (unlikely(compute_score(result, net, hnum, daddr, - dif) < hiscore)) { - sock_put(result); - select_ok = false; - goto begin; - } - } return result; } EXPORT_SYMBOL_GPL(inet6_lookup_listener); @@ -195,10 +167,12 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, const int dif) { struct sock *sk; + bool refcounted; sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, - ntohs(dport), dif); - + ntohs(dport), dif, &refcounted); + if (sk && !refcounted && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; return sk; } EXPORT_SYMBOL_GPL(inet6_lookup); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 711d209f91247..f0422e7827313 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -858,6 +858,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) return; #ifdef CONFIG_TCP_MD5SIG + rcu_read_lock(); hash_location = tcp_parse_md5sig_option(th); if (sk && sk_fullsock(sk)) { key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr); @@ -875,16 +876,15 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) th->source, &ipv6h->daddr, ntohs(th->source), tcp_v6_iif(skb)); if (!sk1) - return; + goto out; - rcu_read_lock(); key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr); if (!key) - goto release_sk1; + goto out; genhash = tcp_v6_md5_hash_skb(newhash, key, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) - goto release_sk1; + goto out; } #endif @@ -898,11 +898,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0); #ifdef CONFIG_TCP_MD5SIG -release_sk1: - if (sk1) { - rcu_read_unlock(); - sock_put(sk1); - } +out: + rcu_read_unlock(); #endif } @@ -1351,6 +1348,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) { const struct tcphdr *th; const struct ipv6hdr *hdr; + bool refcounted; struct sock *sk; int ret; struct net *net = dev_net(skb->dev); @@ -1381,7 +1379,8 @@ static int tcp_v6_rcv(struct sk_buff *skb) lookup: sk = __inet6_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), - th->source, th->dest, inet6_iif(skb)); + th->source, th->dest, inet6_iif(skb), + &refcounted); if (!sk) goto no_tcp_socket; @@ -1404,6 +1403,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) goto lookup; } sock_hold(sk); + refcounted = true; nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); @@ -1460,7 +1460,8 @@ static int tcp_v6_rcv(struct sk_buff *skb) bh_unlock_sock(sk); put_and_return: - sock_put(sk); + if (refcounted) + sock_put(sk); return ret ? 
-1 : 0; no_tcp_socket: @@ -1483,7 +1484,8 @@ static int tcp_v6_rcv(struct sk_buff *skb) return 0; discard_and_relse: - sock_put(sk); + if (refcounted) + sock_put(sk); goto discard_it; do_time_wait: @@ -1514,6 +1516,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) inet_twsk_deschedule_put(tw); sk = sk2; tcp_v6_restore_cb(skb); + refcounted = false; goto process; } /* Fall through to ACK */ diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 49d14ecad4445..b10ade272b509 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -120,9 +120,9 @@ xt_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff, { switch (protocol) { case IPPROTO_TCP: - return __inet_lookup(net, &tcp_hashinfo, skb, doff, - saddr, sport, daddr, dport, - in->ifindex); + return inet_lookup(net, &tcp_hashinfo, skb, doff, + saddr, sport, daddr, dport, + in->ifindex); case IPPROTO_UDP: return udp4_lib_lookup(net, saddr, sport, daddr, dport, in->ifindex); From 15239302edd46b184e758048253541fb211e315e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 Apr 2016 08:52:18 -0700 Subject: [PATCH 07/11] sock_diag: add SK_MEMINFO_DROPS Reporting sk_drops to user space was available for UDP sockets using /proc interface. Add this to sock_diag, so that we can have the same information available to ss users, and we'll be able to add sk_drops indications for TCP sockets as well. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/sock_diag.h | 1 + net/core/sock_diag.c | 1 + 2 files changed, 2 insertions(+) diff --git a/include/uapi/linux/sock_diag.h b/include/uapi/linux/sock_diag.h index bae2d80034d4a..7ff505d8a47ba 100644 --- a/include/uapi/linux/sock_diag.h +++ b/include/uapi/linux/sock_diag.h @@ -20,6 +20,7 @@ enum { SK_MEMINFO_WMEM_QUEUED, SK_MEMINFO_OPTMEM, SK_MEMINFO_BACKLOG, + SK_MEMINFO_DROPS, SK_MEMINFO_VARS, }; diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index a996ce8c8fb25..ca9e35bbe13ca 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -67,6 +67,7 @@ int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype) mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; + mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); return nla_put(skb, attrtype, sizeof(mem), &mem); } From 532182cd610782db8c18230c2747626562032205 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 Apr 2016 08:52:19 -0700 Subject: [PATCH 08/11] tcp: increment sk_drops for dropped rx packets Now ss can report sk_drops, we can instruct TCP to increment this per socket counter when it drops an incoming frame, to refine monitoring and debugging. Following patch takes care of listeners drops. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/sock.h | 7 +++++++ net/ipv4/tcp_input.c | 33 ++++++++++++++++++++------------- net/ipv4/tcp_ipv4.c | 1 + net/ipv6/tcp_ipv6.c | 1 + 4 files changed, 29 insertions(+), 13 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 7ad73db9dde2c..310c4367ea83f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2012,6 +2012,13 @@ sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb) SOCK_SKB_CB(skb)->dropcount = atomic_read(&sk->sk_drops); } +static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) +{ + int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs); + + atomic_add(segs, &sk->sk_drops); +} + void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a26e2d2623587..0ffcd07e34096 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4307,6 +4307,12 @@ static bool tcp_try_coalesce(struct sock *sk, return true; } +static void tcp_drop(struct sock *sk, struct sk_buff *skb) +{ + sk_drops_add(sk, skb); + __kfree_skb(skb); +} + /* This one checks to see if we can put data from the * out_of_order queue into the receive_queue. */ @@ -4331,7 +4337,7 @@ static void tcp_ofo_queue(struct sock *sk) __skb_unlink(skb, &tp->out_of_order_queue); if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { SOCK_DEBUG(sk, "ofo packet was already received\n"); - __kfree_skb(skb); + tcp_drop(sk, skb); continue; } SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", @@ -4383,7 +4389,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); - __kfree_skb(skb); + tcp_drop(sk, skb); return; } @@ -4447,7 +4453,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { /* All the bits are present. Drop. 
*/ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - __kfree_skb(skb); + tcp_drop(sk, skb); skb = NULL; tcp_dsack_set(sk, seq, end_seq); goto add_sack; @@ -4486,7 +4492,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - __kfree_skb(skb1); + tcp_drop(sk, skb1); } add_sack: @@ -4569,12 +4575,13 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - int eaten = -1; bool fragstolen = false; + int eaten = -1; - if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) - goto drop; - + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { + __kfree_skb(skb); + return; + } skb_dst_drop(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); @@ -4656,7 +4663,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) tcp_enter_quickack_mode(sk); inet_csk_schedule_ack(sk); drop: - __kfree_skb(skb); + tcp_drop(sk, skb); return; } @@ -5233,7 +5240,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, return true; discard: - __kfree_skb(skb); + tcp_drop(sk, skb); return false; } @@ -5451,7 +5458,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); discard: - __kfree_skb(skb); + tcp_drop(sk, skb); } EXPORT_SYMBOL(tcp_rcv_established); @@ -5682,7 +5689,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, TCP_DELACK_MAX, TCP_RTO_MAX); discard: - __kfree_skb(skb); + tcp_drop(sk, skb); return 0; } else { tcp_send_ack(sk); @@ -6043,7 +6050,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (!queued) { discard: - __kfree_skb(skb); + tcp_drop(sk, skb); } return 0; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e5f924b299462..059a98f5e7e19 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1694,6 +1694,7 @@ int tcp_v4_rcv(struct sk_buff *skb) return 0; discard_and_relse: + sk_drops_add(sk, skb); if (refcounted) sock_put(sk); goto discard_it; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f0422e7827313..5fa8fea394c94 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1484,6 +1484,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) return 0; discard_and_relse: + sk_drops_add(sk, skb); if (refcounted) sock_put(sk); goto discard_it; From 9caad864151e525929d323de96cad382da49c3b2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 Apr 2016 08:52:20 -0700 Subject: [PATCH 09/11] tcp: increment sk_drops for listeners Goal: packets dropped by a listener are accounted for. This adds tcp_listendrop() helper, and clears sk_drops in sk_clone_lock() so that children do not inherit their parent drop count. Note that we no longer increment LINUX_MIB_LISTENDROPS counter when sending a SYNCOOKIE, since the SYN packet generated a SYNACK. We already have a separate LINUX_MIB_SYNCOOKIESSENT Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/tcp.h | 13 +++++++++++++ net/core/sock.c | 1 + net/ipv4/tcp_input.c | 8 +++++--- net/ipv4/tcp_ipv4.c | 6 +++--- net/ipv6/tcp_ipv6.c | 4 ++-- 5 files changed, 24 insertions(+), 8 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index a23282996ca9d..74d3ed5eb2190 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1836,4 +1836,17 @@ static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb) tp->data_segs_in += segs_in; } +/* + * TCP listen path runs lockless. + * We forced "struct sock" to be const qualified to make sure + * we don't modify one of its field by mistake. + * Here, we increment sk_drops which is an atomic_t, so we can safely + * make sock writable again. + */ +static inline void tcp_listendrop(const struct sock *sk) +{ + atomic_inc(&((struct sock *)sk)->sk_drops); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); +} + #endif /* _TCP_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 7a6a063b28b3c..2f517ea567861 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1525,6 +1525,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_dst_cache = NULL; newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; + atomic_set(&newsk->sk_drops, 0); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0ffcd07e34096..983f04c111778 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6339,8 +6339,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); af_ops->send_synack(sk, dst, &fl, req, &foc, !want_cookie); - if (want_cookie) - goto drop_and_free; + if (want_cookie) { + reqsk_free(req); + return 0; + } } reqsk_put(req); return 0; @@ -6350,7 +6352,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, drop_and_free: reqsk_free(req); drop: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + tcp_listendrop(sk); return 0; } EXPORT_SYMBOL(tcp_conn_request); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 059a98f5e7e19..f3ce0afe70aad 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -329,7 +329,7 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort) * errors returned from accept(). 
*/ inet_csk_reqsk_queue_drop(req->rsk_listener, req); - NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS); + tcp_listendrop(req->rsk_listener); } reqsk_put(req); } @@ -1246,7 +1246,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) &tcp_request_sock_ipv4_ops, sk, skb); drop: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + tcp_listendrop(sk); return 0; } EXPORT_SYMBOL(tcp_v4_conn_request); @@ -1348,7 +1348,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, exit_nonewsk: dst_release(dst); exit: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + tcp_listendrop(sk); return NULL; put_and_exit: inet_csk_prepare_forced_close(newsk); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5fa8fea394c94..7cde1b6fdda3f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -964,7 +964,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) &tcp_request_sock_ipv6_ops, sk, skb); drop: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + tcp_listendrop(sk); return 0; /* don't send reset */ } @@ -1169,7 +1169,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * out_nonewsk: dst_release(dst); out: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + tcp_listendrop(sk); return NULL; } From a9d6532b567489196dac4ce60c62343e43228759 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 Apr 2016 08:52:21 -0700 Subject: [PATCH 10/11] ipv4: tcp: set SOCK_USE_WRITE_QUEUE for ip_send_unicast_reply() TCP uses per cpu 'sockets' to send some packets : - RST packets ( tcp_v4_send_reset()) ) - ACK packets for SYN_RECV and TIMEWAIT sockets By setting SOCK_USE_WRITE_QUEUE flag, we tell sock_wfree() to not call sk_write_space() since these internal sockets do not care. This gives a small performance improvement, merely by allowing cpu to properly predict the sock_wfree() conditional branch, and avoiding one atomic operation. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f3ce0afe70aad..456ff3d6a1322 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2384,6 +2384,7 @@ static int __net_init tcp_sk_init(struct net *net) IPPROTO_TCP, net); if (res) goto fail; + sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; } From 4ce7e93cb3fe87db5b700050172dc41def9834b3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 Apr 2016 08:52:22 -0700 Subject: [PATCH 11/11] tcp: rate limit ACK sent by SYN_RECV request sockets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Attackers like to use SYNFLOOD targeting one 5-tuple, as they hit a single RX queue (and cpu) on the victim. If they use random sequence numbers in their SYN, we detect they do not match the expected window and send back an ACK. This patch adds a rate limitation, so that the effect of such attacks is limited to ingress only. We roughly double our ability to absorb such attacks. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Neal Cardwell Cc: Maciej Żenczykowski Acked-by: Neal Cardwell Signed-off-by: David S. 
Miller --- net/ipv4/tcp_minisocks.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index acb366dd61e68..4c53e7c865864 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -704,7 +704,10 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) { /* Out of window: send ACK and drop. */ - if (!(flg & TCP_FLAG_RST)) + if (!(flg & TCP_FLAG_RST) && + !tcp_oow_rate_limited(sock_net(sk), skb, + LINUX_MIB_TCPACKSKIPPEDSYNRECV, + &tcp_rsk(req)->last_oow_ack_time)) req->rsk_ops->send_ack(sk, skb, req); if (paws_reject) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
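
[Editor's illustration] The final hunk above suppresses the challenge ACK whenever tcp_oow_rate_limited() reports that this request socket already answered an out-of-window segment too recently, using the per-request tcp_rsk(req)->last_oow_ack_time timestamp that the patch passes in. The sketch below is only a hedged, user-space illustration of that "at most one ACK per interval" decision, not the kernel code: the helper name oow_ack_rate_limited(), the millisecond clock, and the main() driver are assumptions made for the example, whereas in the kernel the minimum gap comes from a rate-limit sysctl and the timestamp lives in the request socket.

```c
/*
 * Hedged sketch only -- not the kernel implementation.  It mimics the
 * decision tcp_oow_rate_limited() makes for SYN_RECV request sockets:
 * answer an out-of-window segment with an ACK at most once per
 * interval, and stay silent in between.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper: 'now_ms' is the current time, '*last_ack_ms'
 * plays the role of tcp_rsk(req)->last_oow_ack_time, and 'min_gap_ms'
 * plays the role of the invalid-packet rate-limit setting. */
static bool oow_ack_rate_limited(uint64_t now_ms, uint64_t *last_ack_ms,
				 uint64_t min_gap_ms)
{
	if (min_gap_ms == 0)
		return false;		/* rate limiting disabled */
	if (*last_ack_ms && now_ms - *last_ack_ms < min_gap_ms)
		return true;		/* ACKed too recently: stay silent */
	*last_ack_ms = now_ms;		/* remember when we last ACKed */
	return false;
}

int main(void)
{
	uint64_t last = 0;
	uint64_t times[] = { 1000, 1100, 1400, 1600, 1650, 2200 };

	for (unsigned i = 0; i < sizeof(times) / sizeof(times[0]); i++)
		printf("t=%4llums -> %s\n",
		       (unsigned long long)times[i],
		       oow_ack_rate_limited(times[i], &last, 500) ?
		       "suppress ACK" : "send ACK");
	return 0;
}
```

With a 500 ms gap the driver sends an ACK at t=1000, suppresses the replies at t=1100, t=1400 and t=1650, and ACKs again at t=1600 and t=2200. That is the effect the commit message describes: a SYN flood aimed at one 5-tuple can no longer be turned into an equal-rate stream of ACKs toward the (possibly spoofed) source, so the damage stays on the ingress side.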