From efd402537673f9951992aea4ef0f5ff51d858f4b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 22 Jan 2024 11:25:55 +0000
Subject: [PATCH 1/9] sock_diag: annotate data-races around
sock_diag_handlers[family]
__sock_diag_cmd() and sock_diag_bind() read sock_diag_handlers[family]
without a lock held.
Use READ_ONCE()/WRITE_ONCE() annotations to avoid potential issues.
Fixes: 8ef874bfc729 ("sock_diag: Move the sock_ code to net/core/")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
net/core/sock_diag.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index b1e29e18d1d60..c53b731f2d672 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -193,7 +193,7 @@ int sock_diag_register(const struct sock_diag_handler *hndl)
if (sock_diag_handlers[hndl->family])
err = -EBUSY;
else
- sock_diag_handlers[hndl->family] = hndl;
+ WRITE_ONCE(sock_diag_handlers[hndl->family], hndl);
mutex_unlock(&sock_diag_table_mutex);
return err;
@@ -209,7 +209,7 @@ void sock_diag_unregister(const struct sock_diag_handler *hnld)
mutex_lock(&sock_diag_table_mutex);
BUG_ON(sock_diag_handlers[family] != hnld);
- sock_diag_handlers[family] = NULL;
+ WRITE_ONCE(sock_diag_handlers[family], NULL);
mutex_unlock(&sock_diag_table_mutex);
}
EXPORT_SYMBOL_GPL(sock_diag_unregister);
@@ -227,7 +227,7 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
return -EINVAL;
req->sdiag_family = array_index_nospec(req->sdiag_family, AF_MAX);
- if (sock_diag_handlers[req->sdiag_family] == NULL)
+ if (READ_ONCE(sock_diag_handlers[req->sdiag_family]) == NULL)
sock_load_diag_module(req->sdiag_family, 0);
mutex_lock(&sock_diag_table_mutex);
@@ -286,12 +286,12 @@ static int sock_diag_bind(struct net *net, int group)
switch (group) {
case SKNLGRP_INET_TCP_DESTROY:
case SKNLGRP_INET_UDP_DESTROY:
- if (!sock_diag_handlers[AF_INET])
+ if (!READ_ONCE(sock_diag_handlers[AF_INET]))
sock_load_diag_module(AF_INET, 0);
break;
case SKNLGRP_INET6_TCP_DESTROY:
case SKNLGRP_INET6_UDP_DESTROY:
- if (!sock_diag_handlers[AF_INET6])
+ if (!READ_ONCE(sock_diag_handlers[AF_INET6]))
sock_load_diag_module(AF_INET6, 0);
break;
}
From e50e10ae5d81ddb41547114bfdc5edc04422f082 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 22 Jan 2024 11:25:56 +0000
Subject: [PATCH 2/9] inet_diag: annotate data-races around inet_diag_table[]
inet_diag_lock_handler() reads inet_diag_table[proto] locklessly.
Use READ_ONCE()/WRITE_ONCE() annotations to avoid potential issues.
Fixes: d523a328fb02 ("[INET]: Fix inet_diag dead-lock regression")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
net/ipv4/inet_diag.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 8e6b6aa0579e1..9804e9608a5a0 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -57,7 +57,7 @@ static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
return ERR_PTR(-ENOENT);
}
- if (!inet_diag_table[proto])
+ if (!READ_ONCE(inet_diag_table[proto]))
sock_load_diag_module(AF_INET, proto);
mutex_lock(&inet_diag_table_mutex);
@@ -1503,7 +1503,7 @@ int inet_diag_register(const struct inet_diag_handler *h)
mutex_lock(&inet_diag_table_mutex);
err = -EEXIST;
if (!inet_diag_table[type]) {
- inet_diag_table[type] = h;
+ WRITE_ONCE(inet_diag_table[type], h);
err = 0;
}
mutex_unlock(&inet_diag_table_mutex);
@@ -1520,7 +1520,7 @@ void inet_diag_unregister(const struct inet_diag_handler *h)
return;
mutex_lock(&inet_diag_table_mutex);
- inet_diag_table[type] = NULL;
+ WRITE_ONCE(inet_diag_table[type], NULL);
mutex_unlock(&inet_diag_table_mutex);
}
EXPORT_SYMBOL_GPL(inet_diag_unregister);
From db5914695a84a7b128ec2e4e9272e6e8091753e1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 22 Jan 2024 11:25:57 +0000
Subject: [PATCH 3/9] inet_diag: add module pointer to "struct
inet_diag_handler"
Following patch is going to use RCU instead of
inet_diag_table_mutex acquisition.
This patch is a preparation, no change of behavior yet.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
include/linux/inet_diag.h | 1 +
net/dccp/diag.c | 1 +
net/ipv4/raw_diag.c | 1 +
net/ipv4/tcp_diag.c | 1 +
net/ipv4/udp_diag.c | 2 ++
net/mptcp/mptcp_diag.c | 1 +
net/sctp/diag.c | 1 +
7 files changed, 8 insertions(+)
diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h
index 84abb30a3fbb1..a9033696b0aad 100644
--- a/include/linux/inet_diag.h
+++ b/include/linux/inet_diag.h
@@ -8,6 +8,7 @@
struct inet_hashinfo;
struct inet_diag_handler {
+ struct module *owner;
void (*dump)(struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r);
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
index 8a82c5a2c5a8c..f5019d95c3ae5 100644
--- a/net/dccp/diag.c
+++ b/net/dccp/diag.c
@@ -58,6 +58,7 @@ static int dccp_diag_dump_one(struct netlink_callback *cb,
}
static const struct inet_diag_handler dccp_diag_handler = {
+ .owner = THIS_MODULE,
.dump = dccp_diag_dump,
.dump_one = dccp_diag_dump_one,
.idiag_get_info = dccp_diag_get_info,
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
index fe2140c8375c8..cc793bd8de258 100644
--- a/net/ipv4/raw_diag.c
+++ b/net/ipv4/raw_diag.c
@@ -213,6 +213,7 @@ static int raw_diag_destroy(struct sk_buff *in_skb,
#endif
static const struct inet_diag_handler raw_diag_handler = {
+ .owner = THIS_MODULE,
.dump = raw_diag_dump,
.dump_one = raw_diag_dump_one,
.idiag_get_info = raw_diag_get_info,
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 4cbe4b44425a6..f428ecf9120f2 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -222,6 +222,7 @@ static int tcp_diag_destroy(struct sk_buff *in_skb,
#endif
static const struct inet_diag_handler tcp_diag_handler = {
+ .owner = THIS_MODULE,
.dump = tcp_diag_dump,
.dump_one = tcp_diag_dump_one,
.idiag_get_info = tcp_diag_get_info,
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index dc41a22ee80e8..38cb3a28e4ed6 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -237,6 +237,7 @@ static int udplite_diag_destroy(struct sk_buff *in_skb,
#endif
static const struct inet_diag_handler udp_diag_handler = {
+ .owner = THIS_MODULE,
.dump = udp_diag_dump,
.dump_one = udp_diag_dump_one,
.idiag_get_info = udp_diag_get_info,
@@ -260,6 +261,7 @@ static int udplite_diag_dump_one(struct netlink_callback *cb,
}
static const struct inet_diag_handler udplite_diag_handler = {
+ .owner = THIS_MODULE,
.dump = udplite_diag_dump,
.dump_one = udplite_diag_dump_one,
.idiag_get_info = udp_diag_get_info,
diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c
index 5409c2ea3f572..bd8ff5950c8d3 100644
--- a/net/mptcp/mptcp_diag.c
+++ b/net/mptcp/mptcp_diag.c
@@ -225,6 +225,7 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
}
static const struct inet_diag_handler mptcp_diag_handler = {
+ .owner = THIS_MODULE,
.dump = mptcp_diag_dump,
.dump_one = mptcp_diag_dump_one,
.idiag_get_info = mptcp_diag_get_info,
diff --git a/net/sctp/diag.c b/net/sctp/diag.c
index eb05131ff1dd6..23359e522273f 100644
--- a/net/sctp/diag.c
+++ b/net/sctp/diag.c
@@ -507,6 +507,7 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
}
static const struct inet_diag_handler sctp_diag_handler = {
+ .owner = THIS_MODULE,
.dump = sctp_diag_dump,
.dump_one = sctp_diag_dump_one,
.idiag_get_info = sctp_diag_get_info,
From 223f55196bbdb182a9b8de6108a0834b5e5e832e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 22 Jan 2024 11:25:58 +0000
Subject: [PATCH 4/9] inet_diag: allow concurrent operations
inet_diag_lock_handler() current implementation uses a mutex
to protect inet_diag_table[] array against concurrent changes.
This makes inet_diag dump serialized, thus less scalable
than legacy /proc files.
It is time to switch to full RCU protection.
As a bonus, if a target is statically linked instead of being
modular, inet_diag_lock_handler() & inet_diag_unlock_handler()
reduce to reads only.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
net/ipv4/inet_diag.c | 80 ++++++++++++++++++++++----------------------
1 file changed, 40 insertions(+), 40 deletions(-)
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 9804e9608a5a0..abf7dc9827969 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -32,7 +32,7 @@
#include <linux/inet_diag.h>
#include <linux/sock_diag.h>
-static const struct inet_diag_handler **inet_diag_table;
+static const struct inet_diag_handler __rcu **inet_diag_table;
struct inet_diag_entry {
const __be32 *saddr;
@@ -48,28 +48,28 @@ struct inet_diag_entry {
#endif
};
-static DEFINE_MUTEX(inet_diag_table_mutex);
-
static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
{
- if (proto < 0 || proto >= IPPROTO_MAX) {
- mutex_lock(&inet_diag_table_mutex);
- return ERR_PTR(-ENOENT);
- }
+ const struct inet_diag_handler *handler;
+
+ if (proto < 0 || proto >= IPPROTO_MAX)
+ return NULL;
if (!READ_ONCE(inet_diag_table[proto]))
sock_load_diag_module(AF_INET, proto);
- mutex_lock(&inet_diag_table_mutex);
- if (!inet_diag_table[proto])
- return ERR_PTR(-ENOENT);
+ rcu_read_lock();
+ handler = rcu_dereference(inet_diag_table[proto]);
+ if (handler && !try_module_get(handler->owner))
+ handler = NULL;
+ rcu_read_unlock();
- return inet_diag_table[proto];
+ return handler;
}
static void inet_diag_unlock_handler(const struct inet_diag_handler *handler)
{
- mutex_unlock(&inet_diag_table_mutex);
+ module_put(handler->owner);
}
void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
@@ -104,9 +104,12 @@ static size_t inet_sk_attr_size(struct sock *sk,
const struct inet_diag_handler *handler;
size_t aux = 0;
- handler = inet_diag_table[req->sdiag_protocol];
+ rcu_read_lock();
+ handler = rcu_dereference(inet_diag_table[req->sdiag_protocol]);
+ DEBUG_NET_WARN_ON_ONCE(!handler);
if (handler && handler->idiag_get_aux_size)
aux = handler->idiag_get_aux_size(sk, net_admin);
+ rcu_read_unlock();
return nla_total_size(sizeof(struct tcp_info))
+ nla_total_size(sizeof(struct inet_diag_msg))
@@ -244,10 +247,16 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
struct nlmsghdr *nlh;
struct nlattr *attr;
void *info = NULL;
+ int protocol;
cb_data = cb->data;
- handler = inet_diag_table[inet_diag_get_protocol(req, cb_data)];
- BUG_ON(!handler);
+ protocol = inet_diag_get_protocol(req, cb_data);
+
+ /* inet_diag_lock_handler() made sure inet_diag_table[] is stable. */
+ handler = rcu_dereference_protected(inet_diag_table[protocol], 1);
+ DEBUG_NET_WARN_ON_ONCE(!handler);
+ if (!handler)
+ return -ENXIO;
nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags);
@@ -605,9 +614,10 @@ static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb,
protocol = inet_diag_get_protocol(req, &dump_data);
handler = inet_diag_lock_handler(protocol);
- if (IS_ERR(handler)) {
- err = PTR_ERR(handler);
- } else if (cmd == SOCK_DIAG_BY_FAMILY) {
+ if (!handler)
+ return -ENOENT;
+
+ if (cmd == SOCK_DIAG_BY_FAMILY) {
struct netlink_callback cb = {
.nlh = nlh,
.skb = in_skb,
@@ -1259,12 +1269,12 @@ static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
again:
prev_min_dump_alloc = cb->min_dump_alloc;
handler = inet_diag_lock_handler(protocol);
- if (!IS_ERR(handler))
+ if (handler) {
handler->dump(skb, cb, r);
- else
- err = PTR_ERR(handler);
- inet_diag_unlock_handler(handler);
-
+ inet_diag_unlock_handler(handler);
+ } else {
+ err = -ENOENT;
+ }
/* The skb is not large enough to fit one sk info and
* inet_sk_diag_fill() has requested for a larger skb.
*/
@@ -1457,10 +1467,9 @@ int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk)
}
handler = inet_diag_lock_handler(sk->sk_protocol);
- if (IS_ERR(handler)) {
- inet_diag_unlock_handler(handler);
+ if (!handler) {
nlmsg_cancel(skb, nlh);
- return PTR_ERR(handler);
+ return -ENOENT;
}
attr = handler->idiag_info_size
@@ -1495,20 +1504,12 @@ static const struct sock_diag_handler inet6_diag_handler = {
int inet_diag_register(const struct inet_diag_handler *h)
{
const __u16 type = h->idiag_type;
- int err = -EINVAL;
if (type >= IPPROTO_MAX)
- goto out;
+ return -EINVAL;
- mutex_lock(&inet_diag_table_mutex);
- err = -EEXIST;
- if (!inet_diag_table[type]) {
- WRITE_ONCE(inet_diag_table[type], h);
- err = 0;
- }
- mutex_unlock(&inet_diag_table_mutex);
-out:
- return err;
+ return !cmpxchg((const struct inet_diag_handler **)&inet_diag_table[type],
+ NULL, h) ? 0 : -EEXIST;
}
EXPORT_SYMBOL_GPL(inet_diag_register);
@@ -1519,9 +1520,8 @@ void inet_diag_unregister(const struct inet_diag_handler *h)
if (type >= IPPROTO_MAX)
return;
- mutex_lock(&inet_diag_table_mutex);
- WRITE_ONCE(inet_diag_table[type], NULL);
- mutex_unlock(&inet_diag_table_mutex);
+ xchg((const struct inet_diag_handler **)&inet_diag_table[type],
+ NULL);
}
EXPORT_SYMBOL_GPL(inet_diag_unregister);
From 114b4bb1cc19239b272d52ebbe156053483fe2f8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 22 Jan 2024 11:25:59 +0000
Subject: [PATCH 5/9] sock_diag: add module pointer to "struct
sock_diag_handler"
Following patch is going to use RCU instead of
sock_diag_table_mutex acquisition.
This patch is a preparation, no change of behavior yet.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
include/linux/sock_diag.h | 1 +
net/ipv4/inet_diag.c | 2 ++
net/netlink/diag.c | 1 +
net/packet/diag.c | 1 +
net/smc/smc_diag.c | 1 +
net/tipc/diag.c | 1 +
net/unix/diag.c | 1 +
net/vmw_vsock/diag.c | 1 +
net/xdp/xsk_diag.c | 1 +
9 files changed, 10 insertions(+)
diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h
index 0b9ecd8cf9793..7c07754d711b9 100644
--- a/include/linux/sock_diag.h
+++ b/include/linux/sock_diag.h
@@ -13,6 +13,7 @@ struct nlmsghdr;
struct sock;
struct sock_diag_handler {
+ struct module *owner;
__u8 family;
int (*dump)(struct sk_buff *skb, struct nlmsghdr *nlh);
int (*get_info)(struct sk_buff *skb, struct sock *sk);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index abf7dc9827969..52ce20691e4ef 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -1488,6 +1488,7 @@ int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk)
}
static const struct sock_diag_handler inet_diag_handler = {
+ .owner = THIS_MODULE,
.family = AF_INET,
.dump = inet_diag_handler_cmd,
.get_info = inet_diag_handler_get_info,
@@ -1495,6 +1496,7 @@ static const struct sock_diag_handler inet_diag_handler = {
};
static const struct sock_diag_handler inet6_diag_handler = {
+ .owner = THIS_MODULE,
.family = AF_INET6,
.dump = inet_diag_handler_cmd,
.get_info = inet_diag_handler_get_info,
diff --git a/net/netlink/diag.c b/net/netlink/diag.c
index 1eeff9422856e..e12c90d5f6ad2 100644
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -241,6 +241,7 @@ static int netlink_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
}
static const struct sock_diag_handler netlink_diag_handler = {
+ .owner = THIS_MODULE,
.family = AF_NETLINK,
.dump = netlink_diag_handler_dump,
};
diff --git a/net/packet/diag.c b/net/packet/diag.c
index 9a7980e3309d6..b3bd2f6c2bf7b 100644
--- a/net/packet/diag.c
+++ b/net/packet/diag.c
@@ -245,6 +245,7 @@ static int packet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
}
static const struct sock_diag_handler packet_diag_handler = {
+ .owner = THIS_MODULE,
.family = AF_PACKET,
.dump = packet_diag_handler_dump,
};
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index 52f7c4f1e7670..32bad267fa3e2 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c
@@ -255,6 +255,7 @@ static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
}
static const struct sock_diag_handler smc_diag_handler = {
+ .owner = THIS_MODULE,
.family = AF_SMC,
.dump = smc_diag_handler_dump,
};
diff --git a/net/tipc/diag.c b/net/tipc/diag.c
index 18733451c9e0c..54dde8c4e4d46 100644
--- a/net/tipc/diag.c
+++ b/net/tipc/diag.c
@@ -95,6 +95,7 @@ static int tipc_sock_diag_handler_dump(struct sk_buff *skb,
}
static const struct sock_diag_handler tipc_sock_diag_handler = {
+ .owner = THIS_MODULE,
.family = AF_TIPC,
.dump = tipc_sock_diag_handler_dump,
};
diff --git a/net/unix/diag.c b/net/unix/diag.c
index bec09a3a1d44c..c3648b7065096 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -322,6 +322,7 @@ static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
}
static const struct sock_diag_handler unix_diag_handler = {
+ .owner = THIS_MODULE,
.family = AF_UNIX,
.dump = unix_diag_handler_dump,
};
diff --git a/net/vmw_vsock/diag.c b/net/vmw_vsock/diag.c
index 2e29994f92ffa..ab87ef66c1e88 100644
--- a/net/vmw_vsock/diag.c
+++ b/net/vmw_vsock/diag.c
@@ -157,6 +157,7 @@ static int vsock_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
}
static const struct sock_diag_handler vsock_diag_handler = {
+ .owner = THIS_MODULE,
.family = AF_VSOCK,
.dump = vsock_diag_handler_dump,
};
diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c
index 9f8955367275e..09dcea0cbbed9 100644
--- a/net/xdp/xsk_diag.c
+++ b/net/xdp/xsk_diag.c
@@ -194,6 +194,7 @@ static int xsk_diag_handler_dump(struct sk_buff *nlskb, struct nlmsghdr *hdr)
}
static const struct sock_diag_handler xsk_diag_handler = {
+ .owner = THIS_MODULE,
.family = AF_XDP,
.dump = xsk_diag_handler_dump,
};
From 1d55a6974756cf3979efd2cc68bcece611a44053 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 22 Jan 2024 11:26:00 +0000
Subject: [PATCH 6/9] sock_diag: allow concurrent operations
sock_diag_broadcast_destroy_work() and __sock_diag_cmd()
are currently using sock_diag_table_mutex to protect
against concurrent sock_diag_handlers[] changes.
This makes inet_diag dump serialized, thus less scalable
than legacy /proc files.
It is time to switch to full RCU protection.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
net/core/sock_diag.c | 73 +++++++++++++++++++++++++-------------------
1 file changed, 42 insertions(+), 31 deletions(-)
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index c53b731f2d672..72009e1f4380d 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -16,7 +16,7 @@
#include <linux/inet_diag.h>
#include <linux/sock_diag.h>
-static const struct sock_diag_handler *sock_diag_handlers[AF_MAX];
+static const struct sock_diag_handler __rcu *sock_diag_handlers[AF_MAX];
static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
static DEFINE_MUTEX(sock_diag_table_mutex);
static struct workqueue_struct *broadcast_wq;
@@ -122,6 +122,24 @@ static size_t sock_diag_nlmsg_size(void)
+ nla_total_size_64bit(sizeof(struct tcp_info))); /* INET_DIAG_INFO */
}
+static const struct sock_diag_handler *sock_diag_lock_handler(int family)
+{
+ const struct sock_diag_handler *handler;
+
+ rcu_read_lock();
+ handler = rcu_dereference(sock_diag_handlers[family]);
+ if (handler && !try_module_get(handler->owner))
+ handler = NULL;
+ rcu_read_unlock();
+
+ return handler;
+}
+
+static void sock_diag_unlock_handler(const struct sock_diag_handler *handler)
+{
+ module_put(handler->owner);
+}
+
static void sock_diag_broadcast_destroy_work(struct work_struct *work)
{
struct broadcast_sk *bsk =
@@ -138,12 +156,12 @@ static void sock_diag_broadcast_destroy_work(struct work_struct *work)
if (!skb)
goto out;
- mutex_lock(&sock_diag_table_mutex);
- hndl = sock_diag_handlers[sk->sk_family];
- if (hndl && hndl->get_info)
- err = hndl->get_info(skb, sk);
- mutex_unlock(&sock_diag_table_mutex);
-
+ hndl = sock_diag_lock_handler(sk->sk_family);
+ if (hndl) {
+ if (hndl->get_info)
+ err = hndl->get_info(skb, sk);
+ sock_diag_unlock_handler(hndl);
+ }
if (!err)
nlmsg_multicast(sock_net(sk)->diag_nlsk, skb, 0, group,
GFP_KERNEL);
@@ -184,33 +202,26 @@ EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat);
int sock_diag_register(const struct sock_diag_handler *hndl)
{
- int err = 0;
+ int family = hndl->family;
- if (hndl->family >= AF_MAX)
+ if (family >= AF_MAX)
return -EINVAL;
- mutex_lock(&sock_diag_table_mutex);
- if (sock_diag_handlers[hndl->family])
- err = -EBUSY;
- else
- WRITE_ONCE(sock_diag_handlers[hndl->family], hndl);
- mutex_unlock(&sock_diag_table_mutex);
-
- return err;
+ return !cmpxchg((const struct sock_diag_handler **)
+ &sock_diag_handlers[family],
+ NULL, hndl) ? 0 : -EBUSY;
}
EXPORT_SYMBOL_GPL(sock_diag_register);
-void sock_diag_unregister(const struct sock_diag_handler *hnld)
+void sock_diag_unregister(const struct sock_diag_handler *hndl)
{
- int family = hnld->family;
+ int family = hndl->family;
if (family >= AF_MAX)
return;
- mutex_lock(&sock_diag_table_mutex);
- BUG_ON(sock_diag_handlers[family] != hnld);
- WRITE_ONCE(sock_diag_handlers[family], NULL);
- mutex_unlock(&sock_diag_table_mutex);
+ xchg((const struct sock_diag_handler **)&sock_diag_handlers[family],
+ NULL);
}
EXPORT_SYMBOL_GPL(sock_diag_unregister);
@@ -227,20 +238,20 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
return -EINVAL;
req->sdiag_family = array_index_nospec(req->sdiag_family, AF_MAX);
- if (READ_ONCE(sock_diag_handlers[req->sdiag_family]) == NULL)
+ if (!rcu_access_pointer(sock_diag_handlers[req->sdiag_family]))
sock_load_diag_module(req->sdiag_family, 0);
- mutex_lock(&sock_diag_table_mutex);
- hndl = sock_diag_handlers[req->sdiag_family];
+ hndl = sock_diag_lock_handler(req->sdiag_family);
if (hndl == NULL)
- err = -ENOENT;
- else if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY)
+ return -ENOENT;
+
+ if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY)
err = hndl->dump(skb, nlh);
else if (nlh->nlmsg_type == SOCK_DESTROY && hndl->destroy)
err = hndl->destroy(skb, nlh);
else
err = -EOPNOTSUPP;
- mutex_unlock(&sock_diag_table_mutex);
+ sock_diag_unlock_handler(hndl);
return err;
}
@@ -286,12 +297,12 @@ static int sock_diag_bind(struct net *net, int group)
switch (group) {
case SKNLGRP_INET_TCP_DESTROY:
case SKNLGRP_INET_UDP_DESTROY:
- if (!READ_ONCE(sock_diag_handlers[AF_INET]))
+ if (!rcu_access_pointer(sock_diag_handlers[AF_INET]))
sock_load_diag_module(AF_INET, 0);
break;
case SKNLGRP_INET6_TCP_DESTROY:
case SKNLGRP_INET6_UDP_DESTROY:
- if (!READ_ONCE(sock_diag_handlers[AF_INET6]))
+ if (!rcu_access_pointer(sock_diag_handlers[AF_INET6]))
sock_load_diag_module(AF_INET6, 0);
break;
}
From 86e8921df05c6e9423ab74ab8d41022775d8b83a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 22 Jan 2024 11:26:01 +0000
Subject: [PATCH 7/9] sock_diag: allow concurrent operation in
sock_diag_rcv_msg()
TCPDIAG_GETSOCK and DCCPDIAG_GETSOCK diag are serialized
on sock_diag_table_mutex.
This is to make sure inet_diag module is not unloaded
while diag was ongoing.
It is time to get rid of this mutex and use RCU protection,
allowing full parallelism.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
include/linux/sock_diag.h | 9 ++++++--
net/core/sock_diag.c | 43 +++++++++++++++++++++++----------------
net/ipv4/inet_diag.c | 9 ++++++--
3 files changed, 40 insertions(+), 21 deletions(-)
diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h
index 7c07754d711b9..110978dc9af1b 100644
--- a/include/linux/sock_diag.h
+++ b/include/linux/sock_diag.h
@@ -23,8 +23,13 @@ struct sock_diag_handler {
int sock_diag_register(const struct sock_diag_handler *h);
void sock_diag_unregister(const struct sock_diag_handler *h);
-void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
-void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
+struct sock_diag_inet_compat {
+ struct module *owner;
+ int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh);
+};
+
+void sock_diag_register_inet_compat(const struct sock_diag_inet_compat *ptr);
+void sock_diag_unregister_inet_compat(const struct sock_diag_inet_compat *ptr);
u64 __sock_gen_cookie(struct sock *sk);
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 72009e1f4380d..5c3666431df49 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -17,8 +17,9 @@
#include <linux/sock_diag.h>
static const struct sock_diag_handler __rcu *sock_diag_handlers[AF_MAX];
-static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
-static DEFINE_MUTEX(sock_diag_table_mutex);
+
+static struct sock_diag_inet_compat __rcu *inet_rcv_compat;
+
static struct workqueue_struct *broadcast_wq;
DEFINE_COOKIE(sock_cookie);
@@ -184,19 +185,20 @@ void sock_diag_broadcast_destroy(struct sock *sk)
queue_work(broadcast_wq, &bsk->work);
}
-void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
+void sock_diag_register_inet_compat(const struct sock_diag_inet_compat *ptr)
{
- mutex_lock(&sock_diag_table_mutex);
- inet_rcv_compat = fn;
- mutex_unlock(&sock_diag_table_mutex);
+ xchg((__force const struct sock_diag_inet_compat **)&inet_rcv_compat,
+ ptr);
}
EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat);
-void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
+void sock_diag_unregister_inet_compat(const struct sock_diag_inet_compat *ptr)
{
- mutex_lock(&sock_diag_table_mutex);
- inet_rcv_compat = NULL;
- mutex_unlock(&sock_diag_table_mutex);
+ const struct sock_diag_inet_compat *old;
+
+ old = xchg((__force const struct sock_diag_inet_compat **)&inet_rcv_compat,
+ NULL);
+ WARN_ON_ONCE(old != ptr);
}
EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat);
@@ -259,20 +261,27 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
+ const struct sock_diag_inet_compat *ptr;
int ret;
switch (nlh->nlmsg_type) {
case TCPDIAG_GETSOCK:
case DCCPDIAG_GETSOCK:
- if (inet_rcv_compat == NULL)
+
+ if (!rcu_access_pointer(inet_rcv_compat))
sock_load_diag_module(AF_INET, 0);
- mutex_lock(&sock_diag_table_mutex);
- if (inet_rcv_compat != NULL)
- ret = inet_rcv_compat(skb, nlh);
- else
- ret = -EOPNOTSUPP;
- mutex_unlock(&sock_diag_table_mutex);
+ rcu_read_lock();
+ ptr = rcu_dereference(inet_rcv_compat);
+ if (ptr && !try_module_get(ptr->owner))
+ ptr = NULL;
+ rcu_read_unlock();
+
+ ret = -EOPNOTSUPP;
+ if (ptr) {
+ ret = ptr->fn(skb, nlh);
+ module_put(ptr->owner);
+ }
return ret;
case SOCK_DIAG_BY_FAMILY:
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 52ce20691e4ef..2c2d8b9dd8e9b 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -1527,6 +1527,11 @@ void inet_diag_unregister(const struct inet_diag_handler *h)
}
EXPORT_SYMBOL_GPL(inet_diag_unregister);
+static const struct sock_diag_inet_compat inet_diag_compat = {
+ .owner = THIS_MODULE,
+ .fn = inet_diag_rcv_msg_compat,
+};
+
static int __init inet_diag_init(void)
{
const int inet_diag_table_size = (IPPROTO_MAX *
@@ -1545,7 +1550,7 @@ static int __init inet_diag_init(void)
if (err)
goto out_free_inet;
- sock_diag_register_inet_compat(inet_diag_rcv_msg_compat);
+ sock_diag_register_inet_compat(&inet_diag_compat);
out:
return err;
@@ -1560,7 +1565,7 @@ static void __exit inet_diag_exit(void)
{
sock_diag_unregister(&inet6_diag_handler);
sock_diag_unregister(&inet_diag_handler);
- sock_diag_unregister_inet_compat(inet_diag_rcv_msg_compat);
+ sock_diag_unregister_inet_compat(&inet_diag_compat);
kfree(inet_diag_table);
}
From f44e64990beb41167bd7c313d90bcf7e290c3582 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 22 Jan 2024 11:26:02 +0000
Subject: [PATCH 8/9] sock_diag: remove sock_diag_mutex
sock_diag_rcv() is still serializing its operations using
a mutex, for no good reason.
This came with commit 0a9c73014415 ("[INET_DIAG]: Fix oops
in netlink_rcv_skb"), but the root cause has been fixed
with commit cd40b7d3983c ("[NET]: make netlink user -> kernel
interface synchronious")
Remove this mutex to let multiple threads run concurrently.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
net/core/sock_diag.c | 4 ----
1 file changed, 4 deletions(-)
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 5c3666431df49..6541228380252 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -292,13 +292,9 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
}
}
-static DEFINE_MUTEX(sock_diag_mutex);
-
static void sock_diag_rcv(struct sk_buff *skb)
{
- mutex_lock(&sock_diag_mutex);
netlink_rcv_skb(skb, &sock_diag_rcv_msg);
- mutex_unlock(&sock_diag_mutex);
}
static int sock_diag_bind(struct net *net, int group)
From 622a08e8de9f9ad576fd56e3a2e97a84370a8100 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 22 Jan 2024 11:26:03 +0000
Subject: [PATCH 9/9] inet_diag: skip over empty buckets
After the removal of inet_diag_table_mutex, sock_diag_table_mutex
and sock_diag_mutex, I was able so see spinlock contention from
inet_diag_dump_icsk() when running 100 parallel invocations.
It is time to skip over empty buckets.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
net/ipv4/inet_diag.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 2c2d8b9dd8e9b..7adace541fe29 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -1045,6 +1045,10 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
num = 0;
ilb = &hashinfo->lhash2[i];
+ if (hlist_nulls_empty(&ilb->nulls_head)) {
+ s_num = 0;
+ continue;
+ }
spin_lock(&ilb->lock);
sk_nulls_for_each(sk, node, &ilb->nulls_head) {
struct inet_sock *inet = inet_sk(sk);
@@ -1109,6 +1113,10 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
accum = 0;
ibb = &hashinfo->bhash2[i];
+ if (hlist_empty(&ibb->chain)) {
+ s_num = 0;
+ continue;
+ }
spin_lock_bh(&ibb->lock);
inet_bind_bucket_for_each(tb2, &ibb->chain) {
if (!net_eq(ib2_net(tb2), net))