Skip to content

Commit

Permalink
Merge branch 'net-timestamp-next'
Browse files Browse the repository at this point in the history
Willem de Bruijn says:

====================
net-timestamp: new tx tstamps and tcp

Extend socket tx timestamping:
- allow multiple types of software timestamps aside from send (1)
- add software timestamp on enter packet scheduling (4)
- add software timestamp for TCP (5)
- add software timestamp for TCP on ACK (6)

The sk_flags option space is nearly exhausted. Also move the
many timestamp options to a new sk->sk_tsflags (2).

To disambiguate data when tstamps may arrive out of order,
optionally return a sequential ID assigned at send (3).

Extend Linux tx timestamping to monitor latency incurred within
the kernel stack and to support protocols embedded in TCP.
Complex kernel setups may have multiple layers of queueing, including
multiple instances of packet scheduling, and many classes per layer.
Many applications embed discrete payloads into TCP bytestreams for
reliability, flow control, etcetera. Detecting application tail
latency in such scenarios relies on identifying the exact queue
responsible if on the host, or the network latency if otherwise.

Changelog:
v4->v5
  - define SCM_TSTAMP_SND == 0, for legacy behavior
  - add TCP tstamps without changing the generated byte stream
    - modify GSO and ACK to find offset: slightly more complex
      than previous invariant that it is the last byte
  - consistent naming of packet scheduling
    - rename SCM_TSTAMP_ENQ to SCM_TSTAMP_SCHED
  - add unique key in ee_data
  - add id field in ee_info to disambiguate tstamps
    - optional, only on new flag SOF_TIMESTAMPING_OPT_ID
    - for bytestream, in bytes

v3->v4
  - (v3 review comment) removed skb->mark packet identification (*A)
  - (v3 review comment) fixed indentation
  - tcp: fixed poll() to return POLLERR on non-zero queue
  - rebased to work without syststamp
  - comments: removed all traces of MSG_TSTAMP_.. (*B)

v2->v3
  - extend the SO_TIMESTAMPING API, instead of defining a new one.
  - add protocol independent support to correlate tstamps with data,
    based on returning skb->mark.
  - removed no-payload optimization and documentation (for now):

    I have a follow-on patch that reintroduces MSG_TSTAMP along with a
    new socket option SOF_TIMESTAMPING_OPT_ONFLAG. This is equivalent
    to sequence setsockopt(<enable>); send(..); setsockopt(<disable>),
    but avoids the need to define a MSG_TSTAMP_<TYPE> for each type.

    I will leave these three patches as follow-on, as this patchset is
    large enough as is.

v1->v2
  - expand timestamping (existing and new) to SOCK_RAW and ping sockets
  - rename sock_errqueue_timestamping to scm_timestamping
  - change timestamp data format: do not add fields to scm_timestamping.
      Doing so could break legacy applications. Instead, communicate
      through an existing, but unused, field in the error message.
  - rename SOF_.._OPT_TX_NO_PAYLOAD to shorter SOF_.._OPT_TSONLY
  - move msg_tstamp test app out of patchset and to github
      git://github.com/wdebruij/kerneltools.git
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Aug 5, 2014
2 parents a2b81b3 + e1c8a60 commit 618896e
Show file tree
Hide file tree
Showing 13 changed files with 170 additions and 64 deletions.
18 changes: 17 additions & 1 deletion include/linux/skbuff.h
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ enum {
/* generate hardware time stamp */
SKBTX_HW_TSTAMP = 1 << 0,

/* generate software time stamp */
/* generate software time stamp when queueing packet to NIC */
SKBTX_SW_TSTAMP = 1 << 1,

/* device driver is going to provide hardware time stamp */
Expand All @@ -247,8 +247,19 @@ enum {
* all frags to avoid possible bad checksum
*/
SKBTX_SHARED_FRAG = 1 << 5,

/* generate software time stamp when entering packet scheduling */
SKBTX_SCHED_TSTAMP = 1 << 6,

/* generate software timestamp on peer data acknowledgment */
SKBTX_ACK_TSTAMP = 1 << 7,
};

/* Mask of every software tx time stamp request: when queueing the packet
 * to the NIC, when entering packet scheduling, and on peer acknowledgment.
 */
#define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \
SKBTX_SCHED_TSTAMP | \
SKBTX_ACK_TSTAMP)
/* Mask of every tx time stamp request, hardware or software. */
#define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP)

/*
* The callback notifies userspace to release buffers when skb DMA is done in
* lower device, the skb last reference should be 0 when calling this.
Expand All @@ -275,6 +286,7 @@ struct skb_shared_info {
unsigned short gso_type;
struct sk_buff *frag_list;
struct skb_shared_hwtstamps hwtstamps;
u32 tskey;
__be32 ip6_frag_id;

/*
Expand Down Expand Up @@ -2691,6 +2703,10 @@ static inline bool skb_defer_rx_timestamp(struct sk_buff *skb)
void skb_complete_tx_timestamp(struct sk_buff *skb,
struct skb_shared_hwtstamps *hwtstamps);

/*
 * __skb_tstamp_tx - queue a tx time stamp for an explicit socket and type
 * @orig_skb: the original outgoing packet
 * @hwtstamps: hardware time stamps, or NULL when there are none to report
 * @sk: socket on whose error queue the time stamp is queued
 * @tstype: SCM_TSTAMP_* value, passed to userspace in ee_info
 *
 * skb_tstamp_tx() is the wrapper that supplies orig_skb->sk and
 * SCM_TSTAMP_SND.
 */
void __skb_tstamp_tx(struct sk_buff *orig_skb,
struct skb_shared_hwtstamps *hwtstamps,
struct sock *sk, int tstype);

/**
* skb_tstamp_tx - queue clone of skb with send time stamps
* @orig_skb: the original outgoing packet
Expand Down
33 changes: 15 additions & 18 deletions include/net/sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
#include <linux/atomic.h>
#include <net/dst.h>
#include <net/checksum.h>
#include <linux/net_tstamp.h>

struct cgroup;
struct cgroup_subsys;
Expand Down Expand Up @@ -278,6 +279,8 @@ struct cg_proto;
* @sk_protinfo: private area, net family specific, when not using slab
* @sk_timer: sock cleanup timer
* @sk_stamp: time stamp of last packet received
* @sk_tsflags: SO_TIMESTAMPING socket options
* @sk_tskey: counter to disambiguate concurrent tstamp requests
* @sk_socket: Identd and reporting IO signals
* @sk_user_data: RPC layer private data
* @sk_frag: cached page frag
Expand Down Expand Up @@ -411,6 +414,8 @@ struct sock {
void *sk_protinfo;
struct timer_list sk_timer;
ktime_t sk_stamp;
u16 sk_tsflags;
u32 sk_tskey;
struct socket *sk_socket;
void *sk_user_data;
struct page_frag sk_frag;
Expand Down Expand Up @@ -701,12 +706,7 @@ enum sock_flags {
SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */
SOCK_MEMALLOC, /* VM depends on this socket for swapping */
SOCK_TIMESTAMPING_TX_HARDWARE, /* %SOF_TIMESTAMPING_TX_HARDWARE */
SOCK_TIMESTAMPING_TX_SOFTWARE, /* %SOF_TIMESTAMPING_TX_SOFTWARE */
SOCK_TIMESTAMPING_RX_HARDWARE, /* %SOF_TIMESTAMPING_RX_HARDWARE */
SOCK_TIMESTAMPING_RX_SOFTWARE, /* %SOF_TIMESTAMPING_RX_SOFTWARE */
SOCK_TIMESTAMPING_SOFTWARE, /* %SOF_TIMESTAMPING_SOFTWARE */
SOCK_TIMESTAMPING_RAW_HARDWARE, /* %SOF_TIMESTAMPING_RAW_HARDWARE */
SOCK_FASYNC, /* fasync() active */
SOCK_RXQ_OVFL,
SOCK_ZEROCOPY, /* buffers from userspace */
Expand Down Expand Up @@ -2160,18 +2160,17 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)

/*
* generate control messages if
* - receive time stamping in software requested (SOCK_RCVTSTAMP
* or SOCK_TIMESTAMPING_RX_SOFTWARE)
* - receive time stamping in software requested
* - software time stamp available and wanted
* (SOCK_TIMESTAMPING_SOFTWARE)
* - hardware time stamps available and wanted
* SOCK_TIMESTAMPING_RAW_HARDWARE
*/
if (sock_flag(sk, SOCK_RCVTSTAMP) ||
sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE) ||
(kt.tv64 && sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) ||
(sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
(kt.tv64 &&
(sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE ||
skb_shinfo(skb)->tx_flags & SKBTX_ANY_SW_TSTAMP)) ||
(hwtstamps->hwtstamp.tv64 &&
sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE)))
(sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
__sock_recv_timestamp(msg, sk, skb);
else
sk->sk_stamp = kt;
Expand All @@ -2187,11 +2186,11 @@ static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb)
{
#define FLAGS_TS_OR_DROPS ((1UL << SOCK_RXQ_OVFL) | \
(1UL << SOCK_RCVTSTAMP) | \
(1UL << SOCK_TIMESTAMPING_SOFTWARE) | \
(1UL << SOCK_TIMESTAMPING_RAW_HARDWARE))
(1UL << SOCK_RCVTSTAMP))
#define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \
SOF_TIMESTAMPING_RAW_HARDWARE)

if (sk->sk_flags & FLAGS_TS_OR_DROPS)
if (sk->sk_flags & FLAGS_TS_OR_DROPS || sk->sk_tsflags & TSFLAGS_ANY)
__sock_recv_ts_and_drops(msg, sk, skb);
else
sk->sk_stamp = skb->tstamp;
Expand All @@ -2201,8 +2200,6 @@ static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
* sock_tx_timestamp - checks whether the outgoing packet is to be time stamped
* @sk: socket sending this packet
* @tx_flags: filled with instructions for time stamping
*
* Currently only depends on SOCK_TIMESTAMPING* flags.
*/
void sock_tx_timestamp(struct sock *sk, __u8 *tx_flags);

Expand Down
20 changes: 20 additions & 0 deletions include/uapi/linux/errqueue.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,25 @@ struct sock_extended_err {

#define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1))

/**
 * struct scm_timestamping - timestamps exposed through cmsg
 * @ts: array of three timestamps. ts[0] carries the timestamp whose type
 *      is given by the SCM_TSTAMP_* value in sock_extended_err ee_info;
 *      a hardware timestamp is recorded in ts[2].
 *
 * The SO_TIMESTAMPING interface communicates network timestamps by
 * passing this struct in a cmsg with recvmsg().
 * See Documentation/networking/timestamping.txt for details.
 */
struct scm_timestamping {
struct timespec ts[3];
};

/* The type of scm_timestamping, passed in sock_extended_err ee_info.
 * This defines the type of ts[0]. For SCM_TSTAMP_SND only, if ts[0]
 * is zero, then this is a hardware timestamp and recorded in ts[2].
 *
 * SCM_TSTAMP_SND is the first enumerator and therefore 0, which keeps the
 * legacy behavior for applications that never look at ee_info.
 */
enum {
SCM_TSTAMP_SND, /* driver passed skb to NIC, or HW */
SCM_TSTAMP_SCHED, /* data entered the packet scheduler */
SCM_TSTAMP_ACK, /* data acknowledged by peer */
};

#endif /* _UAPI_LINUX_ERRQUEUE_H */
10 changes: 7 additions & 3 deletions include/uapi/linux/net_tstamp.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,13 @@ enum {
SOF_TIMESTAMPING_SOFTWARE = (1<<4),
SOF_TIMESTAMPING_SYS_HARDWARE = (1<<5),
SOF_TIMESTAMPING_RAW_HARDWARE = (1<<6),
SOF_TIMESTAMPING_MASK =
(SOF_TIMESTAMPING_RAW_HARDWARE - 1) |
SOF_TIMESTAMPING_RAW_HARDWARE
SOF_TIMESTAMPING_OPT_ID = (1<<7),
SOF_TIMESTAMPING_TX_SCHED = (1<<8),
SOF_TIMESTAMPING_TX_ACK = (1<<9),

SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_TX_ACK,
SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) |
SOF_TIMESTAMPING_LAST
};

/**
Expand Down
4 changes: 4 additions & 0 deletions net/core/dev.c
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>

#include "net-sysfs.h"

Expand Down Expand Up @@ -2876,6 +2877,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)

skb_reset_mac_header(skb);

if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);

/* Disable soft irqs for various locks below. Also
* stops preemption for RCU.
*/
Expand Down
20 changes: 17 additions & 3 deletions net/core/skbuff.c
Original file line number Diff line number Diff line change
Expand Up @@ -3490,10 +3490,10 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_queue_err_skb);

void skb_tstamp_tx(struct sk_buff *orig_skb,
struct skb_shared_hwtstamps *hwtstamps)
void __skb_tstamp_tx(struct sk_buff *orig_skb,
struct skb_shared_hwtstamps *hwtstamps,
struct sock *sk, int tstype)
{
struct sock *sk = orig_skb->sk;
struct sock_exterr_skb *serr;
struct sk_buff *skb;
int err;
Expand Down Expand Up @@ -3521,12 +3521,26 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
memset(serr, 0, sizeof(*serr));
serr->ee.ee_errno = ENOMSG;
serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
serr->ee.ee_info = tstype;
if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
serr->ee.ee_data = skb_shinfo(skb)->tskey;
if (sk->sk_protocol == IPPROTO_TCP)
serr->ee.ee_data -= sk->sk_tskey;
}

err = sock_queue_err_skb(sk, skb);

if (err)
kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(__skb_tstamp_tx);

/*
 * skb_tstamp_tx - queue a send (SCM_TSTAMP_SND) time stamp for a packet
 * @orig_skb: the original outgoing packet
 * @hwtstamps: hardware time stamps, passed through unchanged
 *
 * Legacy entry point kept for existing callers: forwards to
 * __skb_tstamp_tx() using the socket that owns @orig_skb and the
 * SCM_TSTAMP_SND type.
 */
void skb_tstamp_tx(struct sk_buff *orig_skb,
		   struct skb_shared_hwtstamps *hwtstamps)
{
	/* Plain call, not "return <expr>;": ISO C does not allow a return
	 * statement with an expression in a function returning void, even
	 * when that expression itself has type void.
	 */
	__skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk, SCM_TSTAMP_SND);
}
EXPORT_SYMBOL_GPL(skb_tstamp_tx);

void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
Expand Down
37 changes: 14 additions & 23 deletions net/core/sock.c
Original file line number Diff line number Diff line change
Expand Up @@ -848,22 +848,25 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
ret = -EINVAL;
break;
}
sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
val & SOF_TIMESTAMPING_TX_HARDWARE);
sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
val & SOF_TIMESTAMPING_TX_SOFTWARE);
sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
val & SOF_TIMESTAMPING_RX_HARDWARE);
if (val & SOF_TIMESTAMPING_OPT_ID &&
!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
if (sk->sk_protocol == IPPROTO_TCP) {
if (sk->sk_state != TCP_ESTABLISHED) {
ret = -EINVAL;
break;
}
sk->sk_tskey = tcp_sk(sk)->snd_una;
} else {
sk->sk_tskey = 0;
}
}
sk->sk_tsflags = val;
if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
sock_enable_timestamp(sk,
SOCK_TIMESTAMPING_RX_SOFTWARE);
else
sock_disable_timestamp(sk,
(1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
val & SOF_TIMESTAMPING_SOFTWARE);
sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
val & SOF_TIMESTAMPING_RAW_HARDWARE);
break;

case SO_RCVLOWAT:
Expand Down Expand Up @@ -1089,19 +1092,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;

case SO_TIMESTAMPING:
v.val = 0;
if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
v.val |= SOF_TIMESTAMPING_SOFTWARE;
if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
v.val = sk->sk_tsflags;
break;

case SO_RCVTIMEO:
Expand Down
6 changes: 6 additions & 0 deletions net/ipv4/ip_output.c
Original file line number Diff line number Diff line change
Expand Up @@ -855,11 +855,15 @@ static int __ip_append_data(struct sock *sk,
unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
int csummode = CHECKSUM_NONE;
struct rtable *rt = (struct rtable *)cork->dst;
u32 tskey = 0;

skb = skb_peek_tail(queue);

exthdrlen = !skb ? rt->dst.header_len : 0;
mtu = cork->fragsize;
if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
tskey = sk->sk_tskey++;

hh_len = LL_RESERVED_SPACE(rt->dst.dev);

Expand Down Expand Up @@ -976,6 +980,8 @@ static int __ip_append_data(struct sock *sk,
/* only the initial fragment is time stamped */
skb_shinfo(skb)->tx_flags = cork->tx_flags;
cork->tx_flags = 0;
skb_shinfo(skb)->tskey = tskey;
tskey = 0;

/*
* Find where to start putting bytes.
Expand Down
22 changes: 19 additions & 3 deletions net/ipv4/tcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,15 @@ void tcp_init_sock(struct sock *sk)
}
EXPORT_SYMBOL(tcp_init_sock);

/*
 * tcp_tx_timestamp - apply the socket's tx time stamp request to an skb
 * @sk: socket sending the data
 * @skb: buffer to mark
 *
 * Lets sock_tx_timestamp() fill in the skb's tx_flags. If any software
 * time stamp was requested, also records a key for the skb.
 */
void tcp_tx_timestamp(struct sock *sk, struct sk_buff *skb)
{
	struct skb_shared_info *info = skb_shinfo(skb);

	sock_tx_timestamp(sk, &info->tx_flags);

	/* no software time stamp requested: no key needed */
	if (!(info->tx_flags & SKBTX_ANY_SW_TSTAMP))
		return;

	/* seq + len - 1; presumably the sequence number of the last byte
	 * currently held in this skb -- confirm against TCP_SKB_CB usage
	 */
	info->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
}

/*
* Wait for a TCP event.
*
Expand Down Expand Up @@ -523,7 +532,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
}
/* This barrier is coupled with smp_wmb() in tcp_reset() */
smp_rmb();
if (sk->sk_err)
if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
mask |= POLLERR;

return mask;
Expand Down Expand Up @@ -959,8 +968,10 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,

copied += copy;
offset += copy;
if (!(size -= copy))
if (!(size -= copy)) {
tcp_tx_timestamp(sk, skb);
goto out;
}

if (skb->len < size_goal || (flags & MSG_OOB))
continue;
Expand Down Expand Up @@ -1252,8 +1263,10 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,

from += copy;
copied += copy;
if ((seglen -= copy) == 0 && iovlen == 0)
if ((seglen -= copy) == 0 && iovlen == 0) {
tcp_tx_timestamp(sk, skb);
goto out;
}

if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
continue;
Expand Down Expand Up @@ -1617,6 +1630,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
struct sk_buff *skb;
u32 urg_hole = 0;

if (unlikely(flags & MSG_ERRQUEUE))
return ip_recv_error(sk, msg, len, addr_len);

if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
(sk->sk_state == TCP_ESTABLISHED))
sk_busy_loop(sk, nonblock);
Expand Down
6 changes: 6 additions & 0 deletions net/ipv4/tcp_input.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@
#include <linux/ipsec.h>
#include <asm/unaligned.h>
#include <net/netdma.h>
#include <linux/errqueue.h>

int sysctl_tcp_timestamps __read_mostly = 1;
int sysctl_tcp_window_scaling __read_mostly = 1;
Expand Down Expand Up @@ -3106,6 +3107,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->retrans_stamp = 0;
}

if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_ACK_TSTAMP) &&
between(skb_shinfo(skb)->tskey, prior_snd_una,
tp->snd_una + 1))
__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);

if (!fully_acked)
break;

Expand Down
Loading

0 comments on commit 618896e

Please sign in to comment.