Skip to content

Commit

Permalink
Merge branch 'tcp-sender-chronographs'
Browse files Browse the repository at this point in the history
Yuchung Cheng says:

====================
tcp: sender chronographs instrumentation

This patch set provides instrumentation on TCP sender limitations.
While developing the BBR congestion control, we noticed that TCP
sending process is often limited by factors unrelated to congestion
control: insufficient sender buffer and/or insufficient receive
window/buffer to saturate the network bandwidth. Unfortunately these
limits are not visible to the users and often the poor performance
is attributed to the congestion control of choice.

This patch set aims to help users get a high-level understanding of
what the sending process is limited by, similar to the TCP_INFO design.
It is not meant to replace detailed kernel tracing and instrumentation
facilities.

In addition, this patch set provides a new option to the timestamping
work to instrument these limits per application data unit. For example,
one can use SO_TIMESTAMPING and this patch set to measure how
long a particular HTTP response is limited by a small receive window.

Patch set was initially written by Francis Yan then polished
by Yuchung Cheng, with lots of help from Eric Dumazet and Soheil
Hassas Yeganeh.
====================

Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Nov 30, 2016
2 parents a090994 + 1c88580 commit 6d5274e
Show file tree
Hide file tree
Showing 23 changed files with 217 additions and 13 deletions.
10 changes: 10 additions & 0 deletions Documentation/networking/timestamping.txt
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,16 @@ SOF_TIMESTAMPING_OPT_TSONLY:
the timestamp even if sysctl net.core.tstamp_allow_data is 0.
This option disables SOF_TIMESTAMPING_OPT_CMSG.

SOF_TIMESTAMPING_OPT_STATS:

Optional stats that are obtained along with the transmit timestamps.
It must be used together with SOF_TIMESTAMPING_OPT_TSONLY. When the
transmit timestamp is available, the stats are available in a
separate control message of type SCM_TIMESTAMPING_OPT_STATS, as a
list of typed TLVs (struct nlattr). These stats allow the
application to associate various transport layer stats with
the transmit timestamps, such as how long a certain block of
data was limited by the peer's receive window.

New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to
disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate
Expand Down
2 changes: 2 additions & 0 deletions arch/alpha/include/uapi/asm/socket.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,4 +97,6 @@

#define SO_CNX_ADVICE 53

#define SCM_TIMESTAMPING_OPT_STATS 54

#endif /* _UAPI_ASM_SOCKET_H */
2 changes: 2 additions & 0 deletions arch/frv/include/uapi/asm/socket.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,5 +90,7 @@

#define SO_CNX_ADVICE 53

#define SCM_TIMESTAMPING_OPT_STATS 54

#endif /* _ASM_SOCKET_H */

2 changes: 2 additions & 0 deletions arch/ia64/include/uapi/asm/socket.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,4 +99,6 @@

#define SO_CNX_ADVICE 53

#define SCM_TIMESTAMPING_OPT_STATS 54

#endif /* _ASM_IA64_SOCKET_H */
2 changes: 2 additions & 0 deletions arch/m32r/include/uapi/asm/socket.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,4 +90,6 @@

#define SO_CNX_ADVICE 53

#define SCM_TIMESTAMPING_OPT_STATS 54

#endif /* _ASM_M32R_SOCKET_H */
2 changes: 2 additions & 0 deletions arch/mips/include/uapi/asm/socket.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,4 +108,6 @@

#define SO_CNX_ADVICE 53

#define SCM_TIMESTAMPING_OPT_STATS 54

#endif /* _UAPI_ASM_SOCKET_H */
2 changes: 2 additions & 0 deletions arch/mn10300/include/uapi/asm/socket.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,4 +90,6 @@

#define SO_CNX_ADVICE 53

#define SCM_TIMESTAMPING_OPT_STATS 54

#endif /* _ASM_SOCKET_H */
2 changes: 2 additions & 0 deletions arch/parisc/include/uapi/asm/socket.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,4 +89,6 @@

#define SO_CNX_ADVICE 0x402E

#define SCM_TIMESTAMPING_OPT_STATS 0x402F

#endif /* _UAPI_ASM_SOCKET_H */
2 changes: 2 additions & 0 deletions arch/powerpc/include/uapi/asm/socket.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,4 +97,6 @@

#define SO_CNX_ADVICE 53

#define SCM_TIMESTAMPING_OPT_STATS 54

#endif /* _ASM_POWERPC_SOCKET_H */
2 changes: 2 additions & 0 deletions arch/s390/include/uapi/asm/socket.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,4 +96,6 @@

#define SO_CNX_ADVICE 53

#define SCM_TIMESTAMPING_OPT_STATS 54

#endif /* _ASM_SOCKET_H */
2 changes: 2 additions & 0 deletions arch/sparc/include/uapi/asm/socket.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@

#define SO_CNX_ADVICE 0x0037

#define SCM_TIMESTAMPING_OPT_STATS 0x0038

/* Security levels - as per NRL IPv6 - don't actually do anything */
#define SO_SECURITY_AUTHENTICATION 0x5001
#define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002
Expand Down
2 changes: 2 additions & 0 deletions arch/xtensa/include/uapi/asm/socket.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,4 +101,6 @@

#define SO_CNX_ADVICE 53

#define SCM_TIMESTAMPING_OPT_STATS 54

#endif /* _XTENSA_SOCKET_H */
9 changes: 7 additions & 2 deletions include/linux/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -211,8 +211,11 @@ struct tcp_sock {
u8 reord; /* reordering detected */
} rack;
u16 advmss; /* Advertised MSS */
u8 rate_app_limited:1, /* rate_{delivered,interval_us} limited? */
unused:7;
u32 chrono_start; /* Start time in jiffies of a TCP chrono */
u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */
u8 chrono_type:2, /* current chronograph type */
rate_app_limited:1, /* rate_{delivered,interval_us} limited? */
unused:5;
u8 nonagle : 4,/* Disable Nagle algorithm? */
thin_lto : 1,/* Use linear timeouts for thin streams */
thin_dupack : 1,/* Fast retransmit on first dupack */
Expand Down Expand Up @@ -425,4 +428,6 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp)
tp->saved_syn = NULL;
}

struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk);

#endif /* _LINUX_TCP_H */
20 changes: 19 additions & 1 deletion include/net/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -1516,11 +1516,26 @@ struct tcp_fastopen_context {
struct rcu_head rcu;
};

/* Latencies incurred by various limits for a sender. They are
 * chronograph-like stats that are mutually exclusive.
 *
 * TCP_CHRONO_UNSPEC (value 0) has no accumulator of its own: the
 * per-socket chrono_stat[] array has three entries and is indexed by
 * (type - 1), so only the three named types below are accounted.
 * __TCP_CHRONO_MAX is a sentinel one past the last type, used as an
 * array size and loop bound.
 */
enum tcp_chrono {
TCP_CHRONO_UNSPEC,
TCP_CHRONO_BUSY, /* Actively sending data (non-empty write queue) */
TCP_CHRONO_RWND_LIMITED, /* Stalled by insufficient receive window */
TCP_CHRONO_SNDBUF_LIMITED, /* Stalled by insufficient send buffer */
__TCP_CHRONO_MAX,
};

void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type);
void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type);

/* write queue abstraction */
static inline void tcp_write_queue_purge(struct sock *sk)
{
struct sk_buff *skb;

tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL)
sk_wmem_free_skb(sk, skb);
sk_mem_reclaim(sk);
Expand Down Expand Up @@ -1579,8 +1594,10 @@ static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *

static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked)
{
if (sk->sk_send_head == skb_unlinked)
if (sk->sk_send_head == skb_unlinked) {
sk->sk_send_head = NULL;
tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
}
if (tcp_sk(sk)->highest_sack == skb_unlinked)
tcp_sk(sk)->highest_sack = NULL;
}
Expand All @@ -1602,6 +1619,7 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb
/* Queue it, remembering where we must start sending. */
if (sk->sk_send_head == NULL) {
sk->sk_send_head = skb;
tcp_chrono_start(sk, TCP_CHRONO_BUSY);

if (tcp_sk(sk)->highest_sack == NULL)
tcp_sk(sk)->highest_sack = skb;
Expand Down
2 changes: 2 additions & 0 deletions include/uapi/asm-generic/socket.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,4 +92,6 @@

#define SO_CNX_ADVICE 53

#define SCM_TIMESTAMPING_OPT_STATS 54

#endif /* __ASM_GENERIC_SOCKET_H */
3 changes: 2 additions & 1 deletion include/uapi/linux/net_tstamp.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@ enum {
SOF_TIMESTAMPING_TX_ACK = (1<<9),
SOF_TIMESTAMPING_OPT_CMSG = (1<<10),
SOF_TIMESTAMPING_OPT_TSONLY = (1<<11),
SOF_TIMESTAMPING_OPT_STATS = (1<<12),

SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_TSONLY,
SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_STATS,
SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) |
SOF_TIMESTAMPING_LAST
};
Expand Down
12 changes: 12 additions & 0 deletions include/uapi/linux/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,18 @@ struct tcp_info {
__u32 tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */

__u64 tcpi_delivery_rate;

__u64 tcpi_busy_time; /* Time (usec) busy sending data */
__u64 tcpi_rwnd_limited; /* Time (usec) limited by receive window */
__u64 tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
};

/* Netlink attribute types for SCM_TIMESTAMPING_OPT_STATS.
 *
 * Each stat is emitted as a 64-bit attribute; TCP_NLA_PAD is the
 * attribute type passed as the padding type when those 64-bit
 * attributes are appended.
 */
enum {
TCP_NLA_PAD,
TCP_NLA_BUSY, /* Time (usec) busy sending data */
TCP_NLA_RWND_LIMITED, /* Time (usec) limited by receive window */
TCP_NLA_SNDBUF_LIMITED, /* Time (usec) limited by send buffer */
};

/* for TCP_MD5SIG socket option */
Expand Down
14 changes: 11 additions & 3 deletions net/core/skbuff.c
Original file line number Diff line number Diff line change
Expand Up @@ -3839,10 +3839,18 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (!skb_may_tx_timestamp(sk, tsonly))
return;

if (tsonly)
skb = alloc_skb(0, GFP_ATOMIC);
else
if (tsonly) {
#ifdef CONFIG_INET
if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
sk->sk_protocol == IPPROTO_TCP &&
sk->sk_type == SOCK_STREAM)
skb = tcp_get_timestamping_opt_stats(sk);
else
#endif
skb = alloc_skb(0, GFP_ATOMIC);
} else {
skb = skb_clone(orig_skb, GFP_ATOMIC);
}
if (!skb)
return;

Expand Down
7 changes: 7 additions & 0 deletions net/core/sock.c
Original file line number Diff line number Diff line change
Expand Up @@ -854,6 +854,13 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
sk->sk_tskey = 0;
}
}

if (val & SOF_TIMESTAMPING_OPT_STATS &&
!(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
ret = -EINVAL;
break;
}

sk->sk_tsflags = val;
if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
sock_enable_timestamp(sk,
Expand Down
50 changes: 48 additions & 2 deletions net/ipv4/tcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -996,8 +996,11 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
goto out;
out_err:
/* make sure we wake any epoll edge trigger waiter */
if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
err == -EAGAIN)) {
sk->sk_write_space(sk);
tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
return sk_stream_error(sk, flags, err);
}

Expand Down Expand Up @@ -1331,8 +1334,11 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
out_err:
err = sk_stream_error(sk, flags, err);
/* make sure we wake any epoll edge trigger waiter */
if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
err == -EAGAIN)) {
sk->sk_write_space(sk);
tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
release_sock(sk);
return err;
}
Expand Down Expand Up @@ -2702,6 +2708,25 @@ int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
EXPORT_SYMBOL(compat_tcp_setsockopt);
#endif

/* Fill the three sender-chronograph fields of @info from @tp.
 *
 * For every chrono type from TCP_CHRONO_BUSY up to (but excluding)
 * __TCP_CHRONO_MAX, take the accumulated tp->chrono_stat[] value, add
 * the time elapsed since tp->chrono_start if that type is the one
 * currently running (tp->chrono_type), and convert jiffies to usec.
 *
 * tcpi_busy_time is reported as the sum over all three types, while
 * tcpi_rwnd_limited and tcpi_sndbuf_limited are the individual totals.
 * No other field of @info is touched.
 */
static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
struct tcp_info *info)
{
u64 stats[__TCP_CHRONO_MAX], total = 0;
enum tcp_chrono i;

for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
/* chrono_stat[] has no slot for TCP_CHRONO_UNSPEC, hence i - 1 */
stats[i] = tp->chrono_stat[i - 1];
/* Include the still-running interval of the active chrono */
if (i == tp->chrono_type)
stats[i] += tcp_time_stamp - tp->chrono_start;
/* jiffies -> usec; integer factor, exact only if HZ divides 10^6 */
stats[i] *= USEC_PER_SEC / HZ;
total += stats[i];
}

info->tcpi_busy_time = total;
info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
}

/* Return information about state of tcp endpoint in API format. */
void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
Expand Down Expand Up @@ -2794,6 +2819,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_bytes_acked = tp->bytes_acked;
info->tcpi_bytes_received = tp->bytes_received;
info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
tcp_get_info_chrono_stats(tp, info);

unlock_sock_fast(sk, slow);

Expand All @@ -2815,6 +2841,26 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
}
EXPORT_SYMBOL_GPL(tcp_get_info);

/* Build an skb carrying the sender chronograph stats of @sk as a list
 * of 64-bit netlink attributes (TCP_NLA_BUSY, TCP_NLA_RWND_LIMITED,
 * TCP_NLA_SNDBUF_LIMITED), with TCP_NLA_PAD as the padding type.
 *
 * The skb is allocated with GFP_ATOMIC and sized for exactly three
 * 64-bit attributes.
 *
 * Returns the new skb, or NULL if the allocation fails.
 */
struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *stats;
/* Only the three chrono fields of info are written and read below */
struct tcp_info info;

stats = alloc_skb(3 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC);
if (!stats)
return NULL;

tcp_get_info_chrono_stats(tp, &info);
/* Return values of nla_put_u64_64bit() are not checked here */
nla_put_u64_64bit(stats, TCP_NLA_BUSY,
info.tcpi_busy_time, TCP_NLA_PAD);
nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
info.tcpi_rwnd_limited, TCP_NLA_PAD);
nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
info.tcpi_sndbuf_limited, TCP_NLA_PAD);
return stats;
}

static int do_tcp_getsockopt(struct sock *sk, int level,
int optname, char __user *optval, int __user *optlen)
{
Expand Down
8 changes: 7 additions & 1 deletion net/ipv4/tcp_input.c
Original file line number Diff line number Diff line change
Expand Up @@ -3178,6 +3178,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->lost_skb_hint = NULL;
}

if (!skb)
tcp_chrono_stop(sk, TCP_CHRONO_BUSY);

if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
tp->snd_up = tp->snd_una;

Expand Down Expand Up @@ -5056,8 +5059,11 @@ static void tcp_check_space(struct sock *sk)
/* pairs with tcp_poll() */
smp_mb__after_atomic();
if (sk->sk_socket &&
test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
tcp_new_space(sk);
if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
}
}

Expand Down
Loading

0 comments on commit 6d5274e

Please sign in to comment.