Skip to content

Commit

Permalink
tcp: switch orphan_count to bare per-cpu counters
Browse files Browse the repository at this point in the history
Use of percpu_counter structure to track count of orphaned
sockets is causing problems on modern hosts with 256 cpus
or more.

Stefan Bach reported a serious spinlock contention in real workloads,
that I was able to reproduce with a netfilter rule dropping
incoming FIN packets.

    53.56%  server  [kernel.kallsyms]      [k] queued_spin_lock_slowpath
            |
            ---queued_spin_lock_slowpath
               |
                --53.51%--_raw_spin_lock_irqsave
                          |
                           --53.51%--__percpu_counter_sum
                                     tcp_check_oom
                                     |
                                     |--39.03%--__tcp_close
                                     |          tcp_close
                                     |          inet_release
                                     |          inet6_release
                                     |          sock_close
                                     |          __fput
                                     |          ____fput
                                     |          task_work_run
                                     |          exit_to_usermode_loop
                                     |          do_syscall_64
                                     |          entry_SYSCALL_64_after_hwframe
                                     |          __GI___libc_close
                                     |
                                      --14.48%--tcp_out_of_resources
                                                tcp_write_timeout
                                                tcp_retransmit_timer
                                                tcp_write_timer_handler
                                                tcp_write_timer
                                                call_timer_fn
                                                expire_timers
                                                __run_timers
                                                run_timer_softirq
                                                __softirqentry_text_start

As explained in commit cf86a08 ("net/dst: use a smaller percpu_counter
batch for dst entries accounting"), default batch size is too big
for the default value of tcp_max_orphans (262144).

But even if we reduce batch sizes, there would still be cases
where the estimated count of orphans is beyond the limit,
and where tcp_too_many_orphans() has to call the expensive
percpu_counter_sum_positive().

One solution is to use plain per-cpu counters, and have
a timer to periodically refresh this cache.

Updating this cache every 100ms seems about right, tcp pressure
state is not radically changing over shorter periods.

percpu_counter was nice 15 years ago while hosts had less
than 16 cpus, not anymore by current standards.

v2: Fix the build issue for CONFIG_CRYPTO_DEV_CHELSIO_TLS=m,
    reported by kernel test robot <lkp@intel.com>
    Remove unused socket argument from tcp_too_many_orphans()

Fixes: dd24c00 ("net: Use a percpu_counter for orphan_count")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Stefan Bach <sfb@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
Eric Dumazet authored and David S. Miller committed Oct 15, 2021
1 parent 0b93aed commit 19757ce
Show file tree
Hide file tree
Showing 11 changed files with 49 additions and 38 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -870,7 +870,7 @@ static void do_abort_syn_rcv(struct sock *child, struct sock *parent)
* created only after 3 way handshake is done.
*/
sock_orphan(child);
percpu_counter_inc((child)->sk_prot->orphan_count);
INC_ORPHAN_COUNT(child);
chtls_release_resources(child);
chtls_conn_done(child);
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ struct deferred_skb_cb {
#define WSCALE_OK(tp) ((tp)->rx_opt.wscale_ok)
#define TSTAMP_OK(tp) ((tp)->rx_opt.tstamp_ok)
#define SACK_OK(tp) ((tp)->rx_opt.sack_ok)
#define INC_ORPHAN_COUNT(sk) percpu_counter_inc((sk)->sk_prot->orphan_count)
#define INC_ORPHAN_COUNT(sk) this_cpu_inc(*(sk)->sk_prot->orphan_count)

/* TLS SKB */
#define skb_ulp_tls_inline(skb) (ULP_SKB_CB(skb)->ulp.tls.ofld)
Expand Down
2 changes: 1 addition & 1 deletion include/net/inet_connection_sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk)
{
/* The below has to be done to allow calling inet_csk_destroy_sock */
sock_set_flag(sk, SOCK_DEAD);
percpu_counter_inc(sk->sk_prot->orphan_count);
this_cpu_inc(*sk->sk_prot->orphan_count);
}

void inet_csk_destroy_sock(struct sock *sk);
Expand Down
2 changes: 1 addition & 1 deletion include/net/sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -1235,7 +1235,7 @@ struct proto {
unsigned int useroffset; /* Usercopy region offset */
unsigned int usersize; /* Usercopy region size */

struct percpu_counter *orphan_count;
unsigned int __percpu *orphan_count;

struct request_sock_ops *rsk_prot;
struct timewait_sock_ops *twsk_prot;
Expand Down
17 changes: 3 additions & 14 deletions include/net/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@

extern struct inet_hashinfo tcp_hashinfo;

extern struct percpu_counter tcp_orphan_count;
DECLARE_PER_CPU(unsigned int, tcp_orphan_count);
int tcp_orphan_count_sum(void);

void tcp_time_wait(struct sock *sk, int state, int timeo);

#define MAX_TCP_HEADER L1_CACHE_ALIGN(128 + MAX_HEADER)
Expand Down Expand Up @@ -290,19 +292,6 @@ static inline bool tcp_out_of_memory(struct sock *sk)

void sk_forced_mem_schedule(struct sock *sk, int size);

static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
{
struct percpu_counter *ocp = sk->sk_prot->orphan_count;
int orphans = percpu_counter_read_positive(ocp);

if (orphans << shift > sysctl_tcp_max_orphans) {
orphans = percpu_counter_sum_positive(ocp);
if (orphans << shift > sysctl_tcp_max_orphans)
return true;
}
return false;
}

bool tcp_check_oom(struct sock *sk, int shift);


Expand Down
2 changes: 1 addition & 1 deletion net/dccp/dccp.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ extern bool dccp_debug;

extern struct inet_hashinfo dccp_hashinfo;

extern struct percpu_counter dccp_orphan_count;
DECLARE_PER_CPU(unsigned int, dccp_orphan_count);

void dccp_time_wait(struct sock *sk, int state, int timeo);

Expand Down
14 changes: 4 additions & 10 deletions net/dccp/proto.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;

EXPORT_SYMBOL_GPL(dccp_statistics);

struct percpu_counter dccp_orphan_count;
EXPORT_SYMBOL_GPL(dccp_orphan_count);
DEFINE_PER_CPU(unsigned int, dccp_orphan_count);
EXPORT_PER_CPU_SYMBOL_GPL(dccp_orphan_count);

struct inet_hashinfo dccp_hashinfo;
EXPORT_SYMBOL_GPL(dccp_hashinfo);
Expand Down Expand Up @@ -1055,7 +1055,7 @@ void dccp_close(struct sock *sk, long timeout)
bh_lock_sock(sk);
WARN_ON(sock_owned_by_user(sk));

percpu_counter_inc(sk->sk_prot->orphan_count);
this_cpu_inc(dccp_orphan_count);

/* Have we already been destroyed by a softirq or backlog? */
if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED)
Expand Down Expand Up @@ -1115,13 +1115,10 @@ static int __init dccp_init(void)

BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
sizeof_field(struct sk_buff, cb));
rc = percpu_counter_init(&dccp_orphan_count, 0, GFP_KERNEL);
if (rc)
goto out_fail;
inet_hashinfo_init(&dccp_hashinfo);
rc = inet_hashinfo2_init_mod(&dccp_hashinfo);
if (rc)
goto out_free_percpu;
goto out_fail;
rc = -ENOBUFS;
dccp_hashinfo.bind_bucket_cachep =
kmem_cache_create("dccp_bind_bucket",
Expand Down Expand Up @@ -1226,8 +1223,6 @@ static int __init dccp_init(void)
kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
out_free_hashinfo2:
inet_hashinfo2_free_mod(&dccp_hashinfo);
out_free_percpu:
percpu_counter_destroy(&dccp_orphan_count);
out_fail:
dccp_hashinfo.bhash = NULL;
dccp_hashinfo.ehash = NULL;
Expand All @@ -1250,7 +1245,6 @@ static void __exit dccp_fini(void)
dccp_ackvec_exit();
dccp_sysctl_exit();
inet_hashinfo2_free_mod(&dccp_hashinfo);
percpu_counter_destroy(&dccp_orphan_count);
}

module_init(dccp_init);
Expand Down
4 changes: 2 additions & 2 deletions net/ipv4/inet_connection_sock.c
Original file line number Diff line number Diff line change
Expand Up @@ -1015,7 +1015,7 @@ void inet_csk_destroy_sock(struct sock *sk)

sk_refcnt_debug_release(sk);

percpu_counter_dec(sk->sk_prot->orphan_count);
this_cpu_dec(*sk->sk_prot->orphan_count);

sock_put(sk);
}
Expand Down Expand Up @@ -1074,7 +1074,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req,

sock_orphan(child);

percpu_counter_inc(sk->sk_prot->orphan_count);
this_cpu_inc(*sk->sk_prot->orphan_count);

if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req);
Expand Down
2 changes: 1 addition & 1 deletion net/ipv4/inet_hashtables.c
Original file line number Diff line number Diff line change
Expand Up @@ -598,7 +598,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
if (ok) {
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
} else {
percpu_counter_inc(sk->sk_prot->orphan_count);
this_cpu_inc(*sk->sk_prot->orphan_count);
inet_sk_set_state(sk, TCP_CLOSE);
sock_set_flag(sk, SOCK_DEAD);
inet_csk_destroy_sock(sk);
Expand Down
2 changes: 1 addition & 1 deletion net/ipv4/proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
struct net *net = seq->private;
int orphans, sockets;

orphans = percpu_counter_sum_positive(&tcp_orphan_count);
orphans = tcp_orphan_count_sum();
sockets = proto_sockets_allocated_sum_positive(&tcp_prot);

socket_seq_show(seq);
Expand Down
38 changes: 33 additions & 5 deletions net/ipv4/tcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -287,8 +287,8 @@ enum {
TCP_CMSG_TS = 2
};

struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);
DEFINE_PER_CPU(unsigned int, tcp_orphan_count);
EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);

long sysctl_tcp_mem[3] __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_mem);
Expand Down Expand Up @@ -2673,11 +2673,36 @@ void tcp_shutdown(struct sock *sk, int how)
}
EXPORT_SYMBOL(tcp_shutdown);

int tcp_orphan_count_sum(void)
{
int i, total = 0;

for_each_possible_cpu(i)
total += per_cpu(tcp_orphan_count, i);

return max(total, 0);
}

static int tcp_orphan_cache;
static struct timer_list tcp_orphan_timer;
#define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100)

static void tcp_orphan_update(struct timer_list *unused)
{
WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum());
mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
}

static bool tcp_too_many_orphans(int shift)
{
return READ_ONCE(tcp_orphan_cache) << shift > sysctl_tcp_max_orphans;
}

bool tcp_check_oom(struct sock *sk, int shift)
{
bool too_many_orphans, out_of_socket_memory;

too_many_orphans = tcp_too_many_orphans(sk, shift);
too_many_orphans = tcp_too_many_orphans(shift);
out_of_socket_memory = tcp_out_of_memory(sk);

if (too_many_orphans)
Expand Down Expand Up @@ -2786,7 +2811,7 @@ void __tcp_close(struct sock *sk, long timeout)
/* remove backlog if any, without releasing ownership. */
__release_sock(sk);

percpu_counter_inc(sk->sk_prot->orphan_count);
this_cpu_inc(tcp_orphan_count);

/* Have we already been destroyed by a softirq or backlog? */
if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
Expand Down Expand Up @@ -4479,7 +4504,10 @@ void __init tcp_init(void)
sizeof_field(struct sk_buff, cb));

percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);

timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);

inet_hashinfo_init(&tcp_hashinfo);
inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
thash_entries, 21, /* one slot per 2 MB*/
Expand Down

0 comments on commit 19757ce

Please sign in to comment.