Merge branch 'tcp-address-KCSAN-reports-in-tcp_poll-part-I'
Eric Dumazet says:

====================
tcp: address KCSAN reports in tcp_poll() (part I)

This all started with a KCSAN report (included
in the "tcp: annotate tp->rcv_nxt lockless reads" changelog).

tcp_poll() runs locklessly. This means that almost all
accesses to TCP socket fields done in tcp_poll() context
need annotations, otherwise KCSAN will complain about data races.

While doing this detective work, I found a more serious bug,
addressed by the first patch ("tcp: add rcu protection around
tp->fastopen_rsk").
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
David S. Miller committed Oct 13, 2019
2 parents 8caf8a9 + ab4e846 commit 3f23380
Showing 21 changed files with 175 additions and 128 deletions.
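The recurring pattern across these files: a field written under the socket lock is also read by lockless paths such as tcp_poll(), so the write side becomes WRITE_ONCE() and the read side READ_ONCE(). A minimal userspace sketch of that pattern (an approximation, not kernel code; the two macros below are simplified stand-ins for the kernel's):

/* READ_ONCE()/WRITE_ONCE() boil down to single volatile accesses: the
 * compiler may not tear, fuse, or re-load the value, and KCSAN treats
 * the annotated race as intentional.
 */
#include <stdio.h>

#define READ_ONCE(x)     (*(const volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, v) (*(volatile __typeof__(x) *)&(x) = (v))

struct fake_sock {
	int sk_sndbuf;
	int sk_wmem_queued;
};

/* Writer side: in the kernel this runs with the socket lock held. */
static void queue_bytes(struct fake_sock *sk, int copy)
{
	WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + copy);
}

/* Reader side: lockless, like tcp_poll() -> sk_stream_wspace(). */
static int stream_wspace(const struct fake_sock *sk)
{
	return READ_ONCE(sk->sk_sndbuf) - READ_ONCE(sk->sk_wmem_queued);
}

int main(void)
{
	struct fake_sock sk = { .sk_sndbuf = 4096 };

	queue_bytes(&sk, 1024);
	printf("wspace = %d\n", stream_wspace(&sk));	/* prints 3072 */
	return 0;
}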
6 changes: 3 additions & 3 deletions include/linux/tcp.h
@@ -393,7 +393,7 @@ struct tcp_sock {
 	/* fastopen_rsk points to request_sock that resulted in this big
 	 * socket. Used to retransmit SYNACKs etc.
 	 */
-	struct request_sock *fastopen_rsk;
+	struct request_sock __rcu *fastopen_rsk;
 	u32 *saved_syn;
 };

@@ -447,8 +447,8 @@ static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
 
 static inline bool tcp_passive_fastopen(const struct sock *sk)
 {
-	return (sk->sk_state == TCP_SYN_RECV &&
-		tcp_sk(sk)->fastopen_rsk != NULL);
+	return sk->sk_state == TCP_SYN_RECV &&
+	       rcu_access_pointer(tcp_sk(sk)->fastopen_rsk) != NULL;
 }
 
 static inline void fastopen_queue_tune(struct sock *sk, int backlog)
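Marking the field __rcu lets sparse enforce the accessor discipline. rcu_access_pointer() fits tcp_passive_fastopen() because the function only compares the pointer against NULL and never dereferences it; a reader that follows the pointer must use rcu_dereference() inside an RCU read-side critical section. A rough sketch of that division of labor, with simplified stand-ins for the kernel macros (illustration only; the real ones add sparse checking and ordering guarantees):

#include <stddef.h>

#define READ_ONCE(x)          (*(const volatile __typeof__(x) *)&(x))
#define rcu_access_pointer(p) READ_ONCE(p)  /* NULL-test only, no deref */
#define rcu_dereference(p)    READ_ONCE(p)  /* deref, under rcu_read_lock() */

struct request_sock_demo { int tfo_listener; };

struct tcp_sock_demo {
	struct request_sock_demo *fastopen_rsk;	/* __rcu in the real struct */
};

/* Like tcp_passive_fastopen(): only asks "is a request_sock attached?" */
static int has_fastopen_req(const struct tcp_sock_demo *tp)
{
	return rcu_access_pointer(tp->fastopen_rsk) != NULL;
}

/* A reader that actually uses the pointer would look, roughly, like:
 *	rcu_read_lock();
 *	req = rcu_dereference(tp->fastopen_rsk);
 *	if (req)
 *		...use req...
 *	rcu_read_unlock();
 */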
29 changes: 19 additions & 10 deletions include/net/sock.h
@@ -878,12 +878,17 @@ static inline bool sk_acceptq_is_full(const struct sock *sk)
  */
 static inline int sk_stream_min_wspace(const struct sock *sk)
 {
-	return sk->sk_wmem_queued >> 1;
+	return READ_ONCE(sk->sk_wmem_queued) >> 1;
 }
 
 static inline int sk_stream_wspace(const struct sock *sk)
 {
-	return sk->sk_sndbuf - sk->sk_wmem_queued;
+	return READ_ONCE(sk->sk_sndbuf) - READ_ONCE(sk->sk_wmem_queued);
+}
+
+static inline void sk_wmem_queued_add(struct sock *sk, int val)
+{
+	WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val);
 }
 
 void sk_stream_write_space(struct sock *sk);
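Note the asymmetry inside the new helper: only the store is annotated. Writers are serialized by the socket lock, so the plain load in the read-modify-write cannot race with another writer; it is the lockless readers that need the store to be a single non-torn write. The same helper body, with explanatory comments added here for exposition:

static inline void sk_wmem_queued_add(struct sock *sk, int val)
{
	/* Load half of the RMW: writers hold the socket lock, so no other
	 * writer races with this plain read.  Store half: lockless readers
	 * (tcp_poll() and friends) may run at any time, so publish the new
	 * value with WRITE_ONCE().
	 */
	WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val);
}

Funneling every update through sk_wmem_queued_add() also keeps future call sites annotated by construction; the hunks below convert the existing ones.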
@@ -1207,7 +1212,7 @@ static inline void sk_refcnt_debug_release(const struct sock *sk)
 
 static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
 {
-	if (sk->sk_wmem_queued >= sk->sk_sndbuf)
+	if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf))
 		return false;
 
 	return sk->sk_prot->stream_memory_free ?
@@ -1467,7 +1472,7 @@ DECLARE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
 {
 	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
-	sk->sk_wmem_queued -= skb->truesize;
+	sk_wmem_queued_add(sk, -skb->truesize);
 	sk_mem_uncharge(sk, skb->truesize);
 	if (static_branch_unlikely(&tcp_tx_skb_cache_key) &&
 	    !sk->sk_tx_skb_cache && !skb_cloned(skb)) {
@@ -2014,7 +2019,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro
 	skb->len += copy;
 	skb->data_len += copy;
 	skb->truesize += copy;
-	sk->sk_wmem_queued += copy;
+	sk_wmem_queued_add(sk, copy);
 	sk_mem_charge(sk, copy);
 	return 0;
 }
@@ -2220,10 +2225,14 @@ static inline void sk_wake_async(const struct sock *sk, int how, int band)
 
 static inline void sk_stream_moderate_sndbuf(struct sock *sk)
 {
-	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) {
-		sk->sk_sndbuf = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1);
-		sk->sk_sndbuf = max_t(u32, sk->sk_sndbuf, SOCK_MIN_SNDBUF);
-	}
+	u32 val;
+
+	if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
+		return;
+
+	val = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1);
+
+	WRITE_ONCE(sk->sk_sndbuf, max_t(u32, val, SOCK_MIN_SNDBUF));
 }
 
 struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
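The restructuring here is more than style. The old body stored to sk->sk_sndbuf twice, so a lockless reader could observe the intermediate min() result before the SOCK_MIN_SNDBUF clamp was applied; computing into a local and publishing exactly once means readers only ever see a final, clamped value:

	/* before: two plain stores; a reader may see the un-clamped value */
	sk->sk_sndbuf = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1);
	sk->sk_sndbuf = max_t(u32, sk->sk_sndbuf, SOCK_MIN_SNDBUF);

	/* after: compute in a local, publish the final value exactly once */
	val = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1);
	WRITE_ONCE(sk->sk_sndbuf, max_t(u32, val, SOCK_MIN_SNDBUF));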
@@ -2251,7 +2260,7 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);
  */
 static inline bool sock_writeable(const struct sock *sk)
 {
-	return refcount_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf >> 1);
+	return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1);
 }
 
 static inline gfp_t gfp_any(void)
7 changes: 4 additions & 3 deletions include/net/tcp.h
@@ -1380,14 +1380,14 @@ static inline int tcp_win_from_space(const struct sock *sk, int space)
 /* Note: caller must be prepared to deal with negative returns */
 static inline int tcp_space(const struct sock *sk)
 {
-	return tcp_win_from_space(sk, sk->sk_rcvbuf -
+	return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) -
 				      READ_ONCE(sk->sk_backlog.len) -
 				      atomic_read(&sk->sk_rmem_alloc));
 }
 
 static inline int tcp_full_space(const struct sock *sk)
 {
-	return tcp_win_from_space(sk, sk->sk_rcvbuf);
+	return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf));
 }
 
 extern void tcp_openreq_init_rwin(struct request_sock *req,
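tcp_space() is what tcp_poll() and the receive path consult for advertised receive room: whatever is left of sk_rcvbuf after the backlog and already-queued receive memory are subtracted, scaled down by tcp_win_from_space(). A toy standalone model (numbers invented; tcp_win_from_space() is reduced to the "space - space/4" shape it takes with tcp_adv_win_scale = 2):

#include <stdio.h>

static int win_from_space(int space)
{
	return space - (space >> 2);	/* reserve 25% for overhead */
}

static int tcp_space_demo(int rcvbuf, int backlog_len, int rmem_alloc)
{
	return win_from_space(rcvbuf - backlog_len - rmem_alloc);
}

int main(void)
{
	/* 128 KiB buffer, 8 KiB in the backlog, 32 KiB already queued */
	printf("%d\n", tcp_space_demo(131072, 8192, 32768));	/* 67584 */
	return 0;
}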
@@ -1917,7 +1917,8 @@ static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp)
 static inline bool tcp_stream_memory_free(const struct sock *sk, int wake)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
-	u32 notsent_bytes = tp->write_seq - tp->snd_nxt;
+	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
+			    READ_ONCE(tp->snd_nxt);
 
 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
 }
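This predicate backs EPOLLOUT reporting when TCP_NOTSENT_LOWAT is in use: write_seq - snd_nxt is data the application has queued but the stack has not yet sent, and both counters move under a lockless tcp_poll() caller, hence the READ_ONCE() pair. A worked example with invented numbers, taking wake = 0:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t write_seq = 70000;	/* last byte queued by the app     */
	uint32_t snd_nxt   = 20000;	/* next byte the stack will send   */
	uint32_t lowat     = 65536;	/* hypothetical TCP_NOTSENT_LOWAT  */

	/* u32 subtraction also wraps correctly across seq rollover */
	uint32_t notsent_bytes = write_seq - snd_nxt;	/* 50000 */

	/* Socket stays "writable" while unsent data is below the threshold */
	printf("writable: %s\n", notsent_bytes < lowat ? "yes" : "no");
	return 0;
}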
4 changes: 2 additions & 2 deletions include/trace/events/sock.h
@@ -82,7 +82,7 @@ TRACE_EVENT(sock_rcvqueue_full,
 	TP_fast_assign(
 		__entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
 		__entry->truesize   = skb->truesize;
-		__entry->sk_rcvbuf  = sk->sk_rcvbuf;
+		__entry->sk_rcvbuf  = READ_ONCE(sk->sk_rcvbuf);
 	),
 
 	TP_printk("rmem_alloc=%d truesize=%u sk_rcvbuf=%d",
@@ -115,7 +115,7 @@ TRACE_EVENT(sock_exceed_buf_limit,
 		__entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
 		__entry->sysctl_wmem = sk_get_wmem0(sk, prot);
 		__entry->wmem_alloc = refcount_read(&sk->sk_wmem_alloc);
-		__entry->wmem_queued = sk->sk_wmem_queued;
+		__entry->wmem_queued = READ_ONCE(sk->sk_wmem_queued);
 		__entry->kind = kind;
 	),
 
2 changes: 1 addition & 1 deletion net/core/datagram.c
@@ -640,7 +640,7 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
 		skb->len += copied;
 		skb->truesize += truesize;
 		if (sk && sk->sk_type == SOCK_STREAM) {
-			sk->sk_wmem_queued += truesize;
+			sk_wmem_queued_add(sk, truesize);
 			sk_mem_charge(sk, truesize);
 		} else {
 			refcount_add(truesize, &skb->sk->sk_wmem_alloc);
6 changes: 4 additions & 2 deletions net/core/filter.c
@@ -4252,12 +4252,14 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 		case SO_RCVBUF:
 			val = min_t(u32, val, sysctl_rmem_max);
 			sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
-			sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
+			WRITE_ONCE(sk->sk_rcvbuf,
+				   max_t(int, val * 2, SOCK_MIN_RCVBUF));
 			break;
 		case SO_SNDBUF:
 			val = min_t(u32, val, sysctl_wmem_max);
 			sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
-			sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
+			WRITE_ONCE(sk->sk_sndbuf,
+				   max_t(int, val * 2, SOCK_MIN_SNDBUF));
 			break;
 		case SO_MAX_PACING_RATE: /* 32bit version */
 			if (val != ~0U)
2 changes: 1 addition & 1 deletion net/core/request_sock.c
@@ -96,7 +96,7 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
 
 	fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq;
 
-	tcp_sk(sk)->fastopen_rsk = NULL;
+	RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL);
 	spin_lock_bh(&fastopenq->lock);
 	fastopenq->qlen--;
 	tcp_rsk(req)->tfo_listener = false;
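Using RCU_INIT_POINTER() rather than rcu_assign_pointer() is deliberate: rcu_assign_pointer() includes a release barrier so a structure's initialization is ordered before its publication, but a NULL store publishes nothing to order against, so a plain annotated store suffices. As a rule of thumb (a sketch, not the full documented rules):

	p = kmalloc(sizeof(*p), GFP_KERNEL);
	init(p);
	rcu_assign_pointer(gp, p);	/* publishing real data: barrier needed */

	RCU_INIT_POINTER(gp, NULL);	/* storing NULL: nothing to order */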
2 changes: 1 addition & 1 deletion net/core/skbuff.c
@@ -4415,7 +4415,7 @@ static void skb_set_err_queue(struct sk_buff *skb)
 int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
 {
 	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
-	    (unsigned int)sk->sk_rcvbuf)
+	    (unsigned int)READ_ONCE(sk->sk_rcvbuf))
 		return -ENOMEM;
 
 	skb_orphan(skb);
22 changes: 13 additions & 9 deletions net/core/sock.c
@@ -785,7 +785,8 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 		 */
 		val = min_t(int, val, INT_MAX / 2);
 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
-		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
+		WRITE_ONCE(sk->sk_sndbuf,
+			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
 		/* Wake up sending tasks if we upped the value. */
 		sk->sk_write_space(sk);
 		break;
@@ -831,7 +832,8 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 		 * returning the value we actually used in getsockopt
 		 * is the most desirable behavior.
 		 */
-		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
+		WRITE_ONCE(sk->sk_rcvbuf,
+			   max_t(int, val * 2, SOCK_MIN_RCVBUF));
 		break;
 
 	case SO_RCVBUFFORCE:
@@ -2088,8 +2090,10 @@ EXPORT_SYMBOL(sock_i_ino);
 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
 			     gfp_t priority)
 {
-	if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
+	if (force ||
+	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
 		struct sk_buff *skb = alloc_skb(size, priority);
+
 		if (skb) {
 			skb_set_owner_w(skb, sk);
 			return skb;
@@ -2190,7 +2194,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
 			break;
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
-		if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
+		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
 			break;
 		if (sk->sk_shutdown & SEND_SHUTDOWN)
 			break;
@@ -2225,7 +2229,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
 		if (sk->sk_shutdown & SEND_SHUTDOWN)
 			goto failure;
 
-		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
+		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
 			break;
 
 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
@@ -2806,7 +2810,7 @@ static void sock_def_write_space(struct sock *sk)
 	/* Do not wake up a writer until he can make "significant"
 	 * progress. --DaveM
 	 */
-	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
+	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
 		wq = rcu_dereference(sk->sk_wq);
 		if (skwq_has_sleeper(wq))
 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
@@ -3204,11 +3208,11 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem)
 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
 
 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
-	mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
+	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
-	mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
+	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
-	mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
+	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
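Diagnostic paths such as sk_get_meminfo() run with no lock at all, so each field gets its own READ_ONCE(). Worth noting what that buys and what it does not: every individual load is non-torn, but the fields are sampled at slightly different instants, so the snapshot as a whole is only approximately consistent. That is fine for ss(8) and tracing, not for correctness decisions. In the style of the earlier sketch:

#define READ_ONCE(x) (*(const volatile __typeof__(x) *)&(x))

struct fake_sock { int sk_sndbuf; int sk_wmem_queued; };

/* Each load is non-torn; the pair may still be mutually inconsistent. */
static void get_meminfo_demo(const struct fake_sock *sk, int mem[2])
{
	mem[0] = READ_ONCE(sk->sk_sndbuf);
	mem[1] = READ_ONCE(sk->sk_wmem_queued);
}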
4 changes: 2 additions & 2 deletions net/ipv4/inet_connection_sock.c
@@ -906,7 +906,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req,
 	percpu_counter_inc(sk->sk_prot->orphan_count);
 
 	if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
-		BUG_ON(tcp_sk(child)->fastopen_rsk != req);
+		BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req);
 		BUG_ON(sk != req->rsk_listener);
 
 		/* Paranoid, to prevent race condition if
@@ -915,7 +915,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req,
 		 * Also to satisfy an assertion in
 		 * tcp_v4_destroy_sock().
 		 */
-		tcp_sk(child)->fastopen_rsk = NULL;
+		RCU_INIT_POINTER(tcp_sk(child)->fastopen_rsk, NULL);
 	}
 	inet_csk_destroy_sock(child);
 }
2 changes: 1 addition & 1 deletion net/ipv4/inet_diag.c
@@ -193,7 +193,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
 		struct inet_diag_meminfo minfo = {
 			.idiag_rmem = sk_rmem_alloc_get(sk),
-			.idiag_wmem = sk->sk_wmem_queued,
+			.idiag_wmem = READ_ONCE(sk->sk_wmem_queued),
 			.idiag_fmem = sk->sk_forward_alloc,
 			.idiag_tmem = sk_wmem_alloc_get(sk),
 		};