Skip to content

Commit

Permalink
Merge branch 'socket-option-lockless'
Browse files Browse the repository at this point in the history
Eric Dumazet says:

====================
net: more data-races fixes and lockless socket options

This is yet another round of data-races fixes,
and lockless socket options.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Oct 1, 2023
2 parents 06bc366 + eb44ad4 commit 2be825e
Show file tree
Hide file tree
Showing 30 changed files with 138 additions and 131 deletions.
2 changes: 1 addition & 1 deletion drivers/net/ppp/pppoe.c
Original file line number Diff line number Diff line change
Expand Up @@ -877,7 +877,7 @@ static int pppoe_sendmsg(struct socket *sock, struct msghdr *m,

skb->dev = dev;

skb->priority = sk->sk_priority;
skb->priority = READ_ONCE(sk->sk_priority);
skb->protocol = cpu_to_be16(ETH_P_PPP_SES);

ph = skb_put(skb, total_len + sizeof(struct pppoe_hdr));
Expand Down
2 changes: 1 addition & 1 deletion include/net/bluetooth/bluetooth.h
Original file line number Diff line number Diff line change
Expand Up @@ -541,7 +541,7 @@ static inline struct sk_buff *bt_skb_sendmsg(struct sock *sk,
return ERR_PTR(-EFAULT);
}

skb->priority = sk->sk_priority;
skb->priority = READ_ONCE(sk->sk_priority);

return skb;
}
Expand Down
26 changes: 19 additions & 7 deletions include/net/sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -2007,21 +2007,33 @@ static inline void sk_tx_queue_set(struct sock *sk, int tx_queue)
/* sk_tx_queue_mapping accept only upto a 16-bit value */
if (WARN_ON_ONCE((unsigned short)tx_queue >= USHRT_MAX))
return;
sk->sk_tx_queue_mapping = tx_queue;
/* Paired with READ_ONCE() in sk_tx_queue_get() and
* other WRITE_ONCE() because socket lock might be not held.
*/
WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue);
}

#define NO_QUEUE_MAPPING USHRT_MAX

static inline void sk_tx_queue_clear(struct sock *sk)
{
sk->sk_tx_queue_mapping = NO_QUEUE_MAPPING;
/* Paired with READ_ONCE() in sk_tx_queue_get() and
* other WRITE_ONCE() because socket lock might be not held.
*/
WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING);
}

static inline int sk_tx_queue_get(const struct sock *sk)
{
if (sk && sk->sk_tx_queue_mapping != NO_QUEUE_MAPPING)
return sk->sk_tx_queue_mapping;
if (sk) {
/* Paired with WRITE_ONCE() in sk_tx_queue_clear()
* and sk_tx_queue_set().
*/
int val = READ_ONCE(sk->sk_tx_queue_mapping);

if (val != NO_QUEUE_MAPPING)
return val;
}
return -1;
}

Expand Down Expand Up @@ -2170,7 +2182,7 @@ static inline void __dst_negative_advice(struct sock *sk)
if (ndst != dst) {
rcu_assign_pointer(sk->sk_dst_cache, ndst);
sk_tx_queue_clear(sk);
sk->sk_dst_pending_confirm = 0;
WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
}
}
}
Expand All @@ -2187,7 +2199,7 @@ __sk_dst_set(struct sock *sk, struct dst_entry *dst)
struct dst_entry *old_dst;

sk_tx_queue_clear(sk);
sk->sk_dst_pending_confirm = 0;
WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
old_dst = rcu_dereference_protected(sk->sk_dst_cache,
lockdep_sock_is_held(sk));
rcu_assign_pointer(sk->sk_dst_cache, dst);
Expand All @@ -2200,7 +2212,7 @@ sk_dst_set(struct sock *sk, struct dst_entry *dst)
struct dst_entry *old_dst;

sk_tx_queue_clear(sk);
sk->sk_dst_pending_confirm = 0;
WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
old_dst = xchg((__force struct dst_entry **)&sk->sk_dst_cache, dst);
dst_release(old_dst);
}
Expand Down
2 changes: 1 addition & 1 deletion include/trace/events/mptcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ TRACE_EVENT(mptcp_subflow_get_send,
ssk = mptcp_subflow_tcp_sock(subflow);
if (ssk && sk_fullsock(ssk)) {
__entry->snd_wnd = tcp_sk(ssk)->snd_wnd;
__entry->pace = ssk->sk_pacing_rate;
__entry->pace = READ_ONCE(ssk->sk_pacing_rate);
} else {
__entry->snd_wnd = 0;
__entry->pace = 0;
Expand Down
2 changes: 1 addition & 1 deletion net/appletalk/aarp.c
Original file line number Diff line number Diff line change
Expand Up @@ -664,7 +664,7 @@ int aarp_send_ddp(struct net_device *dev, struct sk_buff *skb,

sendit:
if (skb->sk)
skb->priority = skb->sk->sk_priority;
skb->priority = READ_ONCE(skb->sk->sk_priority);
if (dev_queue_xmit(skb))
goto drop;
sent:
Expand Down
2 changes: 1 addition & 1 deletion net/ax25/af_ax25.c
Original file line number Diff line number Diff line change
Expand Up @@ -939,7 +939,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev)
sock_init_data(NULL, sk);

sk->sk_type = osk->sk_type;
sk->sk_priority = osk->sk_priority;
sk->sk_priority = READ_ONCE(osk->sk_priority);
sk->sk_protocol = osk->sk_protocol;
sk->sk_rcvbuf = osk->sk_rcvbuf;
sk->sk_sndbuf = osk->sk_sndbuf;
Expand Down
2 changes: 1 addition & 1 deletion net/bluetooth/l2cap_sock.c
Original file line number Diff line number Diff line change
Expand Up @@ -1615,7 +1615,7 @@ static struct sk_buff *l2cap_sock_alloc_skb_cb(struct l2cap_chan *chan,
return ERR_PTR(-ENOTCONN);
}

skb->priority = sk->sk_priority;
skb->priority = READ_ONCE(sk->sk_priority);

bt_cb(skb)->l2cap.chan = chan;

Expand Down
2 changes: 1 addition & 1 deletion net/can/j1939/socket.c
Original file line number Diff line number Diff line change
Expand Up @@ -884,7 +884,7 @@ static struct sk_buff *j1939_sk_alloc_skb(struct net_device *ndev,
skcb = j1939_skb_to_cb(skb);
memset(skcb, 0, sizeof(*skcb));
skcb->addr = jsk->addr;
skcb->priority = j1939_prio(sk->sk_priority);
skcb->priority = j1939_prio(READ_ONCE(sk->sk_priority));

if (msg->msg_name) {
struct sockaddr_can *addr = msg->msg_name;
Expand Down
2 changes: 1 addition & 1 deletion net/can/raw.c
Original file line number Diff line number Diff line change
Expand Up @@ -881,7 +881,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
}

skb->dev = dev;
skb->priority = sk->sk_priority;
skb->priority = READ_ONCE(sk->sk_priority);
skb->mark = READ_ONCE(sk->sk_mark);
skb->tstamp = sockc.transmit_time;

Expand Down
163 changes: 78 additions & 85 deletions net/core/sock.c
Original file line number Diff line number Diff line change
Expand Up @@ -600,7 +600,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
dst, cookie) == NULL) {
sk_tx_queue_clear(sk);
sk->sk_dst_pending_confirm = 0;
WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
dst_release(dst);
return NULL;
Expand Down Expand Up @@ -806,9 +806,7 @@ EXPORT_SYMBOL(sock_no_linger);

void sock_set_priority(struct sock *sk, u32 priority)
{
lock_sock(sk);
WRITE_ONCE(sk->sk_priority, priority);
release_sock(sk);
}
EXPORT_SYMBOL(sock_set_priority);

Expand Down Expand Up @@ -1118,6 +1116,83 @@ int sk_setsockopt(struct sock *sk, int level, int optname,

valbool = val ? 1 : 0;

/* handle options which do not require locking the socket. */
switch (optname) {
case SO_PRIORITY:
if ((val >= 0 && val <= 6) ||
sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
sock_set_priority(sk, val);
return 0;
}
return -EPERM;
case SO_PASSSEC:
assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
return 0;
case SO_PASSCRED:
assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
return 0;
case SO_PASSPIDFD:
assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
return 0;
case SO_TYPE:
case SO_PROTOCOL:
case SO_DOMAIN:
case SO_ERROR:
return -ENOPROTOOPT;
#ifdef CONFIG_NET_RX_BUSY_POLL
case SO_BUSY_POLL:
if (val < 0)
return -EINVAL;
WRITE_ONCE(sk->sk_ll_usec, val);
return 0;
case SO_PREFER_BUSY_POLL:
if (valbool && !sockopt_capable(CAP_NET_ADMIN))
return -EPERM;
WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
return 0;
case SO_BUSY_POLL_BUDGET:
if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
!sockopt_capable(CAP_NET_ADMIN))
return -EPERM;
if (val < 0 || val > U16_MAX)
return -EINVAL;
WRITE_ONCE(sk->sk_busy_poll_budget, val);
return 0;
#endif
case SO_MAX_PACING_RATE:
{
unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
unsigned long pacing_rate;

if (sizeof(ulval) != sizeof(val) &&
optlen >= sizeof(ulval) &&
copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
return -EFAULT;
}
if (ulval != ~0UL)
cmpxchg(&sk->sk_pacing_status,
SK_PACING_NONE,
SK_PACING_NEEDED);
/* Pairs with READ_ONCE() from sk_getsockopt() */
WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
pacing_rate = READ_ONCE(sk->sk_pacing_rate);
if (ulval < pacing_rate)
WRITE_ONCE(sk->sk_pacing_rate, ulval);
return 0;
}
case SO_TXREHASH:
if (val < -1 || val > 1)
return -EINVAL;
if ((u8)val == SOCK_TXREHASH_DEFAULT)
val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
/* Paired with READ_ONCE() in tcp_rtx_synack()
* and sk_getsockopt().
*/
WRITE_ONCE(sk->sk_txrehash, (u8)val);
return 0;
}

sockopt_lock_sock(sk);

switch (optname) {
Expand All @@ -1133,12 +1208,6 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
case SO_REUSEPORT:
sk->sk_reuseport = valbool;
break;
case SO_TYPE:
case SO_PROTOCOL:
case SO_DOMAIN:
case SO_ERROR:
ret = -ENOPROTOOPT;
break;
case SO_DONTROUTE:
sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
sk_dst_reset(sk);
Expand Down Expand Up @@ -1213,15 +1282,6 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
sk->sk_no_check_tx = valbool;
break;

case SO_PRIORITY:
if ((val >= 0 && val <= 6) ||
sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
WRITE_ONCE(sk->sk_priority, val);
else
ret = -EPERM;
break;

case SO_LINGER:
if (optlen < sizeof(ling)) {
ret = -EINVAL; /* 1003.1g */
Expand All @@ -1247,14 +1307,6 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
case SO_BSDCOMPAT:
break;

case SO_PASSCRED:
assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
break;

case SO_PASSPIDFD:
assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
break;

case SO_TIMESTAMP_OLD:
case SO_TIMESTAMP_NEW:
case SO_TIMESTAMPNS_OLD:
Expand Down Expand Up @@ -1360,9 +1412,6 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
break;

case SO_PASSSEC:
assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
break;
case SO_MARK:
if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
Expand Down Expand Up @@ -1404,50 +1453,7 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
break;

#ifdef CONFIG_NET_RX_BUSY_POLL
case SO_BUSY_POLL:
if (val < 0)
ret = -EINVAL;
else
WRITE_ONCE(sk->sk_ll_usec, val);
break;
case SO_PREFER_BUSY_POLL:
if (valbool && !sockopt_capable(CAP_NET_ADMIN))
ret = -EPERM;
else
WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
break;
case SO_BUSY_POLL_BUDGET:
if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) {
ret = -EPERM;
} else {
if (val < 0 || val > U16_MAX)
ret = -EINVAL;
else
WRITE_ONCE(sk->sk_busy_poll_budget, val);
}
break;
#endif

case SO_MAX_PACING_RATE:
{
unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;

if (sizeof(ulval) != sizeof(val) &&
optlen >= sizeof(ulval) &&
copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
ret = -EFAULT;
break;
}
if (ulval != ~0UL)
cmpxchg(&sk->sk_pacing_status,
SK_PACING_NONE,
SK_PACING_NEEDED);
/* Pairs with READ_ONCE() from sk_getsockopt() */
WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
break;
}
case SO_INCOMING_CPU:
reuseport_update_incoming_cpu(sk, val);
break;
Expand Down Expand Up @@ -1532,19 +1538,6 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
break;
}

case SO_TXREHASH:
if (val < -1 || val > 1) {
ret = -EINVAL;
break;
}
if ((u8)val == SOCK_TXREHASH_DEFAULT)
val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
/* Paired with READ_ONCE() in tcp_rtx_synack()
* and sk_getsockopt().
*/
WRITE_ONCE(sk->sk_txrehash, (u8)val);
break;

default:
ret = -ENOPROTOOPT;
break;
Expand Down
2 changes: 1 addition & 1 deletion net/dccp/ipv6.c
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req
if (!opt)
opt = rcu_dereference(np->opt);
err = ip6_xmit(sk, skb, &fl6, READ_ONCE(sk->sk_mark), opt,
np->tclass, sk->sk_priority);
np->tclass, READ_ONCE(sk->sk_priority));
rcu_read_unlock();
err = net_xmit_eval(err);
}
Expand Down
2 changes: 1 addition & 1 deletion net/ipv4/inet_diag.c
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
* For cgroup2 classid is always zero.
*/
if (!classid)
classid = sk->sk_priority;
classid = READ_ONCE(sk->sk_priority);

if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid))
goto errout;
Expand Down
2 changes: 1 addition & 1 deletion net/ipv4/ip_output.c
Original file line number Diff line number Diff line change
Expand Up @@ -1449,7 +1449,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
ip_options_build(skb, opt, cork->addr, rt);
}

skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
skb->priority = (cork->tos != -1) ? cork->priority: READ_ONCE(sk->sk_priority);
skb->mark = cork->mark;
skb->tstamp = cork->transmit_time;
/*
Expand Down
Loading

0 comments on commit 2be825e

Please sign in to comment.