Skip to content

Commit

Permalink
Merge branch 'net-first-round-to-use-dev_net_rcu'
Browse files Browse the repository at this point in the history
Eric Dumazet says:

====================
net: first round to use dev_net_rcu()

dev_net(dev) should either be protected by RTNL or RCU.

There is no LOCKDEP support yet for this helper.

Adding it would trigger too many splats.

Instead, add dev_net_rcu() for rcu_read_lock() contexts
and start to use it to fix bugs and clearly document the
safety requirements.

v4: https://lore.kernel.org/CANn89i+AozhFhZNK0Y4e_EqXV1=yKjGuvf43Wa6JJKWMOixWQQ@mail.gmail.com
v3: https://lore.kernel.org/20250203153633.46ce0337@kernel.org/
====================

Link: https://patch.msgid.link/20250205155120.1676781-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
  • Loading branch information
Jakub Kicinski committed Feb 7, 2025
2 parents 3cf0a98 + b768294 commit 3da81cb
Show file tree
Hide file tree
Showing 11 changed files with 113 additions and 65 deletions.
6 changes: 6 additions & 0 deletions include/linux/netdevice.h
Original file line number Diff line number Diff line change
Expand Up @@ -2663,6 +2663,12 @@ struct net *dev_net(const struct net_device *dev)
return read_pnet(&dev->nd_net);
}

/*
 * dev_net_rcu - return the network namespace of @dev for RCU readers.
 *
 * Thin wrapper that hands &dev->nd_net to read_pnet_rcu(). Per the series
 * description, dev_net() must be protected by either RTNL or RCU; this
 * helper is the variant meant for rcu_read_lock() contexts, so the caller
 * is expected to be inside an RCU read-side section.
 */
static inline
struct net *dev_net_rcu(const struct net_device *dev)
{
return read_pnet_rcu(&dev->nd_net);
}

static inline
void dev_net_set(struct net_device *dev, struct net *net)
{
Expand Down
13 changes: 10 additions & 3 deletions include/net/ip.h
Original file line number Diff line number Diff line change
Expand Up @@ -471,9 +471,12 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
bool forwarding)
{
const struct rtable *rt = dst_rtable(dst);
struct net *net = dev_net(dst->dev);
unsigned int mtu;
unsigned int mtu, res;
struct net *net;

rcu_read_lock();

net = dev_net_rcu(dst->dev);
if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) ||
ip_mtu_locked(dst) ||
!forwarding) {
Expand All @@ -497,7 +500,11 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
out:
mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
res = mtu - lwtunnel_headroom(dst->lwtstate, mtu);

rcu_read_unlock();

return res;
}

static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
Expand Down
2 changes: 1 addition & 1 deletion include/net/net_namespace.h
Original file line number Diff line number Diff line change
Expand Up @@ -398,7 +398,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet)
#endif
}

static inline struct net *read_pnet_rcu(possible_net_t *pnet)
static inline struct net *read_pnet_rcu(const possible_net_t *pnet)
{
#ifdef CONFIG_NET_NS
return rcu_dereference(pnet->net);
Expand Down
9 changes: 7 additions & 2 deletions include/net/route.h
Original file line number Diff line number Diff line change
Expand Up @@ -382,10 +382,15 @@ static inline int inet_iif(const struct sk_buff *skb)
/*
 * ip4_dst_hoplimit - hop limit to use for @dst.
 *
 * Returns the RTAX_HOPLIMIT metric of @dst when it is non-zero. When the
 * metric is 0, falls back to sysctl_ip_default_ttl of the netns that owns
 * dst->dev; that netns is looked up with dev_net_rcu() and dereferenced
 * only inside a local rcu_read_lock()/rcu_read_unlock() pair.
 *
 * NOTE(review): this hunk is rendered without +/- markers, so the pre-patch
 * lines (the dev_net() initializer and the brace-less "if") appear next to
 * the lines that replace them.
 */
static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
{
int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
struct net *net = dev_net(dst->dev);

if (hoplimit == 0)
if (hoplimit == 0) {
const struct net *net;

rcu_read_lock();
net = dev_net_rcu(dst->dev);
hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl);
rcu_read_unlock();
}
return hoplimit;
}

Expand Down
21 changes: 11 additions & 10 deletions net/core/flow_dissector.c
Original file line number Diff line number Diff line change
Expand Up @@ -1108,10 +1108,12 @@ bool __skb_flow_dissect(const struct net *net,
FLOW_DISSECTOR_KEY_BASIC,
target_container);

rcu_read_lock();

if (skb) {
if (!net) {
if (skb->dev)
net = dev_net(skb->dev);
net = dev_net_rcu(skb->dev);
else if (skb->sk)
net = sock_net(skb->sk);
}
Expand All @@ -1122,7 +1124,6 @@ bool __skb_flow_dissect(const struct net *net,
enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR;
struct bpf_prog_array *run_array;

rcu_read_lock();
run_array = rcu_dereference(init_net.bpf.run_array[type]);
if (!run_array)
run_array = rcu_dereference(net->bpf.run_array[type]);
Expand Down Expand Up @@ -1150,17 +1151,17 @@ bool __skb_flow_dissect(const struct net *net,
prog = READ_ONCE(run_array->items[0].prog);
result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff,
hlen, flags);
if (result == BPF_FLOW_DISSECTOR_CONTINUE)
goto dissect_continue;
__skb_flow_bpf_to_target(&flow_keys, flow_dissector,
target_container);
rcu_read_unlock();
return result == BPF_OK;
if (result != BPF_FLOW_DISSECTOR_CONTINUE) {
__skb_flow_bpf_to_target(&flow_keys, flow_dissector,
target_container);
rcu_read_unlock();
return result == BPF_OK;
}
}
dissect_continue:
rcu_read_unlock();
}

rcu_read_unlock();

if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
struct ethhdr *eth = eth_hdr(skb);
Expand Down
3 changes: 2 additions & 1 deletion net/ipv4/devinet.c
Original file line number Diff line number Diff line change
Expand Up @@ -1371,10 +1371,11 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
__be32 addr = 0;
unsigned char localnet_scope = RT_SCOPE_HOST;
struct in_device *in_dev;
struct net *net = dev_net(dev);
struct net *net;
int master_idx;

rcu_read_lock();
net = dev_net_rcu(dev);
in_dev = __in_dev_get_rcu(dev);
if (!in_dev)
goto no_in_dev;
Expand Down
31 changes: 17 additions & 14 deletions net/ipv4/icmp.c
Original file line number Diff line number Diff line change
Expand Up @@ -399,10 +399,10 @@ static void icmp_push_reply(struct sock *sk,

static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
{
struct ipcm_cookie ipc;
struct rtable *rt = skb_rtable(skb);
struct net *net = dev_net(rt->dst.dev);
struct net *net = dev_net_rcu(rt->dst.dev);
bool apply_ratelimit = false;
struct ipcm_cookie ipc;
struct flowi4 fl4;
struct sock *sk;
struct inet_sock *inet;
Expand Down Expand Up @@ -608,12 +608,14 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
struct sock *sk;

if (!rt)
goto out;
return;

rcu_read_lock();

if (rt->dst.dev)
net = dev_net(rt->dst.dev);
net = dev_net_rcu(rt->dst.dev);
else if (skb_in->dev)
net = dev_net(skb_in->dev);
net = dev_net_rcu(skb_in->dev);
else
goto out;

Expand Down Expand Up @@ -785,7 +787,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
icmp_xmit_unlock(sk);
out_bh_enable:
local_bh_enable();
out:;
out:
rcu_read_unlock();
}
EXPORT_SYMBOL(__icmp_send);

Expand Down Expand Up @@ -834,7 +837,7 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
* avoid additional coding at protocol handlers.
*/
if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) {
__ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS);
__ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS);
return;
}

Expand Down Expand Up @@ -868,7 +871,7 @@ static enum skb_drop_reason icmp_unreach(struct sk_buff *skb)
struct net *net;
u32 info = 0;

net = dev_net(skb_dst(skb)->dev);
net = dev_net_rcu(skb_dst(skb)->dev);

/*
* Incomplete header ?
Expand Down Expand Up @@ -979,7 +982,7 @@ static enum skb_drop_reason icmp_unreach(struct sk_buff *skb)
static enum skb_drop_reason icmp_redirect(struct sk_buff *skb)
{
if (skb->len < sizeof(struct iphdr)) {
__ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS);
__ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS);
return SKB_DROP_REASON_PKT_TOO_SMALL;
}

Expand Down Expand Up @@ -1011,7 +1014,7 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb)
struct icmp_bxm icmp_param;
struct net *net;

net = dev_net(skb_dst(skb)->dev);
net = dev_net_rcu(skb_dst(skb)->dev);
/* should there be an ICMP stat for ignored echos? */
if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all))
return SKB_NOT_DROPPED_YET;
Expand Down Expand Up @@ -1040,9 +1043,9 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb)

bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr)
{
struct net *net = dev_net_rcu(skb->dev);
struct icmp_ext_hdr *ext_hdr, _ext_hdr;
struct icmp_ext_echo_iio *iio, _iio;
struct net *net = dev_net(skb->dev);
struct inet6_dev *in6_dev;
struct in_device *in_dev;
struct net_device *dev;
Expand Down Expand Up @@ -1181,7 +1184,7 @@ static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb)
return SKB_NOT_DROPPED_YET;

out_err:
__ICMP_INC_STATS(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
__ICMP_INC_STATS(dev_net_rcu(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
return SKB_DROP_REASON_PKT_TOO_SMALL;
}

Expand All @@ -1198,7 +1201,7 @@ int icmp_rcv(struct sk_buff *skb)
{
enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
struct rtable *rt = skb_rtable(skb);
struct net *net = dev_net(rt->dst.dev);
struct net *net = dev_net_rcu(rt->dst.dev);
struct icmphdr *icmph;

if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
Expand Down Expand Up @@ -1371,9 +1374,9 @@ int icmp_err(struct sk_buff *skb, u32 info)
struct iphdr *iph = (struct iphdr *)skb->data;
int offset = iph->ihl<<2;
struct icmphdr *icmph = (struct icmphdr *)(skb->data + offset);
struct net *net = dev_net_rcu(skb->dev);
int type = icmp_hdr(skb)->type;
int code = icmp_hdr(skb)->code;
struct net *net = dev_net(skb->dev);

/*
* Use ping_err to handle all icmp errors except those
Expand Down
30 changes: 21 additions & 9 deletions net/ipv4/route.c
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,13 @@ static inline int ip_rt_proc_init(void)

/*
 * rt_is_expired - report whether cached route @rth is stale.
 *
 * Returns true when rth->rt_genid differs from rt_genid_ipv4() of the netns
 * owning rth->dst.dev. The netns is obtained with dev_net_rcu() inside a
 * local rcu_read_lock() section, and the comparison result is stored in a
 * local so the section can be closed before returning.
 *
 * NOTE(review): this hunk is rendered without +/- markers, so the pre-patch
 * single-line "return" using dev_net() appears above its replacement.
 */
static inline bool rt_is_expired(const struct rtable *rth)
{
return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
bool res;

rcu_read_lock();
res = rth->rt_genid != rt_genid_ipv4(dev_net_rcu(rth->dst.dev));
rcu_read_unlock();

return res;
}

void rt_cache_flush(struct net *net)
Expand Down Expand Up @@ -1002,9 +1008,9 @@ out: kfree_skb_reason(skb, reason);
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
struct dst_entry *dst = &rt->dst;
struct net *net = dev_net(dst->dev);
struct fib_result res;
bool lock = false;
struct net *net;
u32 old_mtu;

if (ip_mtu_locked(dst))
Expand All @@ -1014,16 +1020,17 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
if (old_mtu < mtu)
return;

rcu_read_lock();
net = dev_net_rcu(dst->dev);
if (mtu < net->ipv4.ip_rt_min_pmtu) {
lock = true;
mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
}

if (rt->rt_pmtu == mtu && !lock &&
time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2))
return;
goto out;

rcu_read_lock();
if (fib_lookup(net, fl4, &res, 0) == 0) {
struct fib_nh_common *nhc;

Expand All @@ -1037,14 +1044,14 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
jiffies + net->ipv4.ip_rt_mtu_expires);
}
rcu_read_unlock();
return;
goto out;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
nhc = FIB_RES_NHC(res);
update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
jiffies + net->ipv4.ip_rt_mtu_expires);
}
out:
rcu_read_unlock();
}

Expand Down Expand Up @@ -1307,10 +1314,15 @@ static void set_class_tag(struct rtable *rt, u32 tag)

/*
 * ipv4_default_advmss - default advertised MSS for @dst.
 *
 * header_size is sizeof(struct tcphdr) + sizeof(struct iphdr). The result is
 * ipv4_mtu(dst) - header_size, raised to at least the netns
 * ip_rt_min_advmss, then capped at IPV4_MAX_PMTU - header_size.
 * The netns is looked up with dev_net_rcu() and its ip_rt_min_advmss is read
 * inside a local rcu_read_lock() section; the final cap is applied after the
 * section is closed.
 *
 * NOTE(review): this hunk is rendered without +/- markers, so the pre-patch
 * declarations of "net" and "advmss" appear next to their replacements.
 */
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
struct net *net = dev_net(dst->dev);
unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
net->ipv4.ip_rt_min_advmss);
unsigned int advmss;
struct net *net;

rcu_read_lock();
net = dev_net_rcu(dst->dev);
advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
net->ipv4.ip_rt_min_advmss);
rcu_read_unlock();

return min(advmss, IPV4_MAX_PMTU - header_size);
}
Expand Down
Loading

0 comments on commit 3da81cb

Please sign in to comment.