Merge branch 'tcp-scale-connect-under-pressure'
Eric Dumazet says:

====================
tcp: scale connect() under pressure

Adoption of bhash2 in linux-6.1 made some operations almost twice
as expensive, because of the additional locks.

This series adds RCU in __inet_hash_connect() to help the
case where many attempts need to be made before finding
an available 4-tuple.

This brings a ~200 % improvement in this experiment:

Server:
ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog

Client:
ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog -c -H server

Before series:

  utime_start=0.288582
  utime_end=1.548707
  stime_start=20.637138
  stime_end=2002.489845
  num_transactions=484453
  latency_min=0.156279245
  latency_max=20.922042756
  latency_mean=1.546521274
  latency_stddev=3.936005194
  num_samples=312537
  throughput=47426.00

perf top on the client:

 49.54%  [kernel]       [k] _raw_spin_lock
 25.87%  [kernel]       [k] _raw_spin_lock_bh
  5.97%  [kernel]       [k] queued_spin_lock_slowpath
  5.67%  [kernel]       [k] __inet_hash_connect
  3.53%  [kernel]       [k] __inet6_check_established
  3.48%  [kernel]       [k] inet6_ehashfn
  0.64%  [kernel]       [k] rcu_all_qs

After this series:

  utime_start=0.271607
  utime_end=3.847111
  stime_start=18.407684
  stime_end=1997.485557
  num_transactions=1350742
  latency_min=0.014131929
  latency_max=17.895073144
  latency_mean=0.505675853   # Nice reduction of latency metrics
  latency_stddev=2.125164772
  num_samples=307884
  throughput=139866.80       # 194 % increase

perf top on the client:

 56.86%  [kernel]       [k] __inet6_check_established
 17.96%  [kernel]       [k] __inet_hash_connect
 13.88%  [kernel]       [k] inet6_ehashfn
  2.52%  [kernel]       [k] rcu_all_qs
  2.01%  [kernel]       [k] __cond_resched
  0.41%  [kernel]       [k] _raw_spin_lock
====================

Link: https://patch.msgid.link/20250302124237.3913746-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Jakub Kicinski committed Mar 5, 2025
Merge commit 85f66df (parents 7ff1c88 + 86c2bc2)
Showing 5 changed files with 75 additions and 30 deletions.
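
Before the per-file diffs, a minimal kernel-style sketch of the pattern the cover letter describes may help: an RCU read-side walk of a hash chain acts as a cheap pre-check, and the spinlock is taken only when the port still looks usable. The names here (port_bucket, port_chain, port_looks_busy, try_claim_port) are illustrative stand-ins, not the kernel's inet_bind_bucket/bhash code; see the diffs below for the real thing.

/*
 * Sketch only: simplified stand-in for the bhash/inet_bind_bucket code.
 * A lockless RCU walk filters out ports that cannot work; the spinlock
 * is taken only for the authoritative re-check and insertion.
 */
#include <linux/errno.h>
#include <linux/rculist.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct port_bucket {
	unsigned short		port;
	struct hlist_node	node;	/* linked into port_chain */
	struct rcu_head		rcu;	/* allows kfree_rcu() on removal */
};

static HLIST_HEAD(port_chain);
static DEFINE_SPINLOCK(port_lock);

/* Cheap pre-check under rcu_read_lock(): no per-chain lock taken. */
static bool port_looks_busy(unsigned short port)
{
	struct port_bucket *b;
	bool busy = false;

	rcu_read_lock();
	hlist_for_each_entry_rcu(b, &port_chain, node) {
		if (b->port == port) {
			busy = true;
			break;
		}
	}
	rcu_read_unlock();
	return busy;
}

/* Authoritative path: lock, re-check (we may have raced), then insert. */
static int try_claim_port(unsigned short port)
{
	struct port_bucket *b;

	if (port_looks_busy(port))
		return -EADDRINUSE;

	spin_lock_bh(&port_lock);
	hlist_for_each_entry(b, &port_chain, node) {
		if (b->port == port) {
			spin_unlock_bh(&port_lock);
			return -EADDRINUSE;
		}
	}
	b = kmalloc(sizeof(*b), GFP_ATOMIC);
	if (!b) {
		spin_unlock_bh(&port_lock);
		return -ENOMEM;
	}
	b->port = port;
	hlist_add_head_rcu(&b->node, &port_chain);
	spin_unlock_bh(&port_lock);
	return 0;
}

In __inet_hash_connect() the same two-phase idea lets the hot loop reject ports whose bind bucket is unusable, or whose 4-tuple already has an established peer, without ever touching the per-bucket spinlock; that is what removes most of the _raw_spin_lock time from the "after" perf profile above.
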
include/net/inet_hashtables.h (7 changes: 4 additions & 3 deletions)
@@ -89,6 +89,7 @@ struct inet_bind_bucket {
 	bool			fast_ipv6_only;
 	struct hlist_node	node;
 	struct hlist_head	bhash2;
+	struct rcu_head		rcu;
 };

 struct inet_bind2_bucket {
@@ -226,8 +227,7 @@ struct inet_bind_bucket *
 inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
			struct inet_bind_hashbucket *head,
			const unsigned short snum, int l3mdev);
-void inet_bind_bucket_destroy(struct kmem_cache *cachep,
-			      struct inet_bind_bucket *tb);
+void inet_bind_bucket_destroy(struct inet_bind_bucket *tb);

 bool inet_bind_bucket_match(const struct inet_bind_bucket *tb,
			    const struct net *net, unsigned short port,
@@ -529,7 +529,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
			struct sock *sk, u64 port_offset,
			int (*check_established)(struct inet_timewait_death_row *,
						 struct sock *, __u16,
-						 struct inet_timewait_sock **));
+						 struct inet_timewait_sock **,
+						 bool rcu_lookup));

 int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk);
net/ipv4/inet_connection_sock.c (8 changes: 3 additions & 5 deletions)
@@ -157,12 +157,10 @@ static bool inet_use_bhash2_on_bind(const struct sock *sk)
 {
 #if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6) {
-		int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
-
-		if (addr_type == IPV6_ADDR_ANY)
+		if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
			return false;

-		if (addr_type != IPV6_ADDR_MAPPED)
+		if (!ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
			return true;
	}
 #endif
@@ -600,7 +598,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
		if (bhash2_created)
			inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, tb2);
		if (bhash_created)
-			inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb);
+			inet_bind_bucket_destroy(tb);
	}
	if (head2_lock_acquired)
		spin_unlock(&head2->lock);
net/ipv4/inet_hashtables.c (65 changes: 48 additions & 17 deletions)
@@ -76,19 +76,19 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
		tb->fastreuse = 0;
		tb->fastreuseport = 0;
		INIT_HLIST_HEAD(&tb->bhash2);
-		hlist_add_head(&tb->node, &head->chain);
+		hlist_add_head_rcu(&tb->node, &head->chain);
	}
	return tb;
 }

 /*
  * Caller must hold hashbucket lock for this tb with local BH disabled
  */
-void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
+void inet_bind_bucket_destroy(struct inet_bind_bucket *tb)
 {
	if (hlist_empty(&tb->bhash2)) {
-		__hlist_del(&tb->node);
-		kmem_cache_free(cachep, tb);
+		hlist_del_rcu(&tb->node);
+		kfree_rcu(tb, rcu);
	}
 }

@@ -201,7 +201,7 @@ static void __inet_put_port(struct sock *sk)
	}
	spin_unlock(&head2->lock);

-	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+	inet_bind_bucket_destroy(tb);
	spin_unlock(&head->lock);
 }

@@ -285,7 +285,7 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)

 error:
	if (created_inet_bind_bucket)
-		inet_bind_bucket_destroy(table->bind_bucket_cachep, tb);
+		inet_bind_bucket_destroy(tb);
	spin_unlock(&head2->lock);
	spin_unlock(&head->lock);
	return -ENOMEM;
@@ -537,7 +537,8 @@ EXPORT_SYMBOL_GPL(__inet_lookup_established);
 /* called with local bh disabled */
 static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
-				    struct inet_timewait_sock **twp)
+				    struct inet_timewait_sock **twp,
+				    bool rcu_lookup)
 {
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
@@ -551,11 +552,24 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
	unsigned int hash = inet_ehashfn(net, daddr, lport,
					 saddr, inet->inet_dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
-	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
-	struct sock *sk2;
-	const struct hlist_nulls_node *node;
	struct inet_timewait_sock *tw = NULL;
+	const struct hlist_nulls_node *node;
+	struct sock *sk2;
+	spinlock_t *lock;
+
+	if (rcu_lookup) {
+		sk_nulls_for_each(sk2, node, &head->chain) {
+			if (sk2->sk_hash != hash ||
+			    !inet_match(net, sk2, acookie, ports, dif, sdif))
+				continue;
+			if (sk2->sk_state == TCP_TIME_WAIT)
+				break;
+			return -EADDRNOTAVAIL;
+		}
+		return 0;
+	}

+	lock = inet_ehash_lockp(hinfo, hash);
	spin_lock(lock);

	sk_nulls_for_each(sk2, node, &head->chain) {
@@ -994,7 +1008,8 @@ static u32 *table_perturb;
 int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u64 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
-			struct sock *, __u16, struct inet_timewait_sock **))
+			struct sock *, __u16, struct inet_timewait_sock **,
+			bool rcu_lookup))
 {
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_bind_hashbucket *head, *head2;
@@ -1012,7 +1027,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,

	if (port) {
		local_bh_disable();
-		ret = check_established(death_row, sk, port, NULL);
+		ret = check_established(death_row, sk, port, NULL, false);
		local_bh_enable();
		return ret;
	}
@@ -1048,6 +1063,21 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
			continue;
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(tb, &head->chain, node) {
+			if (!inet_bind_bucket_match(tb, net, port, l3mdev))
+				continue;
+			if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) {
+				rcu_read_unlock();
+				goto next_port;
+			}
+			if (!check_established(death_row, sk, port, &tw, true))
+				break;
+			rcu_read_unlock();
+			goto next_port;
+		}
+		rcu_read_unlock();
+
		spin_lock_bh(&head->lock);

		/* Does not bother with rcv_saddr checks, because
@@ -1057,12 +1087,12 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
			if (inet_bind_bucket_match(tb, net, port, l3mdev)) {
				if (tb->fastreuse >= 0 ||
				    tb->fastreuseport >= 0)
-					goto next_port;
+					goto next_port_unlock;
				WARN_ON(hlist_empty(&tb->bhash2));
				if (!check_established(death_row, sk,
-						       port, &tw))
+						       port, &tw, false))
					goto ok;
-				goto next_port;
+				goto next_port_unlock;
			}
		}

@@ -1076,8 +1106,9 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		tb->fastreuse = -1;
		tb->fastreuseport = -1;
		goto ok;
-next_port:
+next_port_unlock:
		spin_unlock_bh(&head->lock);
+next_port:
		cond_resched();
	}

@@ -1149,7 +1180,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,

	spin_unlock(&head2->lock);
	if (tb_created)
-		inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb);
+		inet_bind_bucket_destroy(tb);
	spin_unlock(&head->lock);

	if (tw)
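
The inet_bind_bucket_destroy() change above is what makes the lockless walk safe: the bucket is unlinked with hlist_del_rcu() while the bucket lock is held, and kfree_rcu() defers the actual free until all concurrent rcu_read_lock() sections have exited. A minimal sketch of that retire pattern, with illustrative names rather than the kernel's structures:

#include <linux/rculist.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct bucket {
	struct hlist_node	node;
	struct rcu_head		rcu;
};

static DEFINE_SPINLOCK(chain_lock);

static void bucket_release(struct bucket *b)
{
	/* Here the helper takes the lock itself; in the kernel the caller
	 * of inet_bind_bucket_destroy() already holds the bucket lock.
	 */
	spin_lock_bh(&chain_lock);
	hlist_del_rcu(&b->node);	/* new readers can no longer find it */
	spin_unlock_bh(&chain_lock);
	kfree_rcu(b, rcu);		/* freed only after a grace period */
}

A reader that found the bucket under rcu_read_lock() keeps dereferencing valid memory until it drops the read lock, which is exactly what the new hlist_for_each_entry_rcu() scan in __inet_hash_connect() relies on.
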
net/ipv4/inet_timewait_sock.c (2 changes: 1 addition & 1 deletion)
@@ -39,7 +39,7 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
	tw->tw_tb = NULL;
	tw->tw_tb2 = NULL;
	inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
-	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+	inet_bind_bucket_destroy(tb);

	__sock_put((struct sock *)tw);
 }
net/ipv6/inet6_hashtables.c (23 changes: 19 additions & 4 deletions)
@@ -263,7 +263,8 @@ EXPORT_SYMBOL_GPL(inet6_lookup);

 static int __inet6_check_established(struct inet_timewait_death_row *death_row,
				     struct sock *sk, const __u16 lport,
-				     struct inet_timewait_sock **twp)
+				     struct inet_timewait_sock **twp,
+				     bool rcu_lookup)
 {
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
@@ -276,11 +277,25 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
	const unsigned int hash = inet6_ehashfn(net, daddr, lport, saddr,
						inet->inet_dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
-	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
-	struct sock *sk2;
-	const struct hlist_nulls_node *node;
	struct inet_timewait_sock *tw = NULL;
+	const struct hlist_nulls_node *node;
+	struct sock *sk2;
+	spinlock_t *lock;
+
+	if (rcu_lookup) {
+		sk_nulls_for_each(sk2, node, &head->chain) {
+			if (sk2->sk_hash != hash ||
+			    !inet6_match(net, sk2, saddr, daddr,
+					 ports, dif, sdif))
+				continue;
+			if (sk2->sk_state == TCP_TIME_WAIT)
+				break;
+			return -EADDRNOTAVAIL;
+		}
+		return 0;
+	}

+	lock = inet_ehash_lockp(hinfo, hash);
	spin_lock(lock);

	sk_nulls_for_each(sk2, node, &head->chain) {
