From 2e31396fa14be50a98c5d2b00416ebd74d381c1f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 23 Apr 2013 00:39:28 +0000 Subject: [PATCH 1/5] packet: tx timestamping on tpacket ring When transmit timestamping is enabled at the socket level, record a timestamp on packets written to a PACKET_TX_RING. Tx timestamps are always looped to the application over the socket error queue. Software timestamps are also written back into the packet frame header in the packet ring. Reported-by: Paul Chavent Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 12 ++++++------ net/packet/af_packet.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 898cf5c566f9..af9185d0be6a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3327,12 +3327,8 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, if (!sk) return; - skb = skb_clone(orig_skb, GFP_ATOMIC); - if (!skb) - return; - if (hwtstamps) { - *skb_hwtstamps(skb) = + *skb_hwtstamps(orig_skb) = *hwtstamps; } else { /* @@ -3340,9 +3336,13 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, * so keep the shared tx_flags and only * store software time stamp */ - skb->tstamp = ktime_get_real(); + orig_skb->tstamp = ktime_get_real(); } + skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!skb) + return; + serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 7e387ff64465..ec8ea27733ac 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -339,6 +339,37 @@ static int __packet_get_status(struct packet_sock *po, void *frame) } } +static void __packet_set_timestamp(struct packet_sock *po, void *frame, + ktime_t tstamp) +{ + union tpacket_uhdr h; + struct timespec ts; + + if (!ktime_to_timespec_cond(tstamp, &ts) || + !sock_flag(&po->sk, SOCK_TIMESTAMPING_SOFTWARE)) + return; + + h.raw = frame; + switch (po->tp_version) { + case TPACKET_V1: + h.h1->tp_sec = ts.tv_sec; + h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; + break; + case TPACKET_V2: + h.h2->tp_sec = ts.tv_sec; + h.h2->tp_nsec = ts.tv_nsec; + break; + case TPACKET_V3: + default: + WARN(1, "TPACKET version not supported.\n"); + BUG(); + } + + /* one flush is safe, as both fields always lie on the same cacheline */ + flush_dcache_page(pgv_to_page(&h.h1->tp_sec)); + smp_wmb(); +} + static void *packet_lookup_frame(struct packet_sock *po, struct packet_ring_buffer *rb, unsigned int position, @@ -1877,6 +1908,7 @@ static void tpacket_destruct_skb(struct sk_buff *skb) ph = skb_shinfo(skb)->destructor_arg; BUG_ON(atomic_read(&po->tx_ring.pending) == 0); atomic_dec(&po->tx_ring.pending); + __packet_set_timestamp(po, ph, skb->tstamp); __packet_set_status(po, ph, TP_STATUS_AVAILABLE); } @@ -1900,6 +1932,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb->dev = dev; skb->priority = po->sk.sk_priority; skb->mark = po->sk.sk_mark; + sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags); skb_shinfo(skb)->destructor_arg = ph.raw; switch (po->tp_version) { From 7a51384cc9f4145de352d266821b63f1bb0d46ff Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 23 Apr 2013 00:39:29 +0000 Subject: [PATCH 2/5] packet: enable hardware tx timestamping on tpacket ring Currently, we only have software timestamping for the TX ring buffer path, but this limitation stems rather from the implementation. By just reusing tpacket_get_timestamp(), we can also allow hardware timestamping just as in the RX path. Signed-off-by: Daniel Borkmann Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 50 +++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ec8ea27733ac..9d46a07a15c0 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -339,14 +339,33 @@ static int __packet_get_status(struct packet_sock *po, void *frame) } } +static bool tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts, + unsigned int flags) +{ + struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); + + if (shhwtstamps) { + if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) && + ktime_to_timespec_cond(shhwtstamps->syststamp, ts)) + return true; + if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) && + ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts)) + return true; + } + + if (ktime_to_timespec_cond(skb->tstamp, ts)) + return true; + + return false; +} + static void __packet_set_timestamp(struct packet_sock *po, void *frame, - ktime_t tstamp) + struct sk_buff *skb) { union tpacket_uhdr h; struct timespec ts; - if (!ktime_to_timespec_cond(tstamp, &ts) || - !sock_flag(&po->sk, SOCK_TIMESTAMPING_SOFTWARE)) + if (!tpacket_get_timestamp(skb, &ts, po->tp_tstamp)) return; h.raw = frame; @@ -1688,26 +1707,6 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, return 0; } -static void tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts, - unsigned int flags) -{ - struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); - - if (shhwtstamps) { - if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) && - ktime_to_timespec_cond(shhwtstamps->syststamp, ts)) - return; - if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) && - ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts)) - return; - } - - if (ktime_to_timespec_cond(skb->tstamp, ts)) - return; - - getnstimeofday(ts); -} - static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { @@ -1804,7 +1803,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, spin_unlock(&sk->sk_receive_queue.lock); skb_copy_bits(skb, 0, h.raw + macoff, snaplen); - tpacket_get_timestamp(skb, &ts, po->tp_tstamp); + if (!tpacket_get_timestamp(skb, &ts, po->tp_tstamp)) + getnstimeofday(&ts); switch (po->tp_version) { case TPACKET_V1: @@ -1908,7 +1908,7 @@ static void tpacket_destruct_skb(struct sk_buff *skb) ph = skb_shinfo(skb)->destructor_arg; BUG_ON(atomic_read(&po->tx_ring.pending) == 0); atomic_dec(&po->tx_ring.pending); - __packet_set_timestamp(po, ph, skb->tstamp); + __packet_set_timestamp(po, ph, skb); __packet_set_status(po, ph, TP_STATUS_AVAILABLE); } From 7276d5d743d775388bf382cd7bdea1a14e486d32 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 23 Apr 2013 00:39:30 +0000 Subject: [PATCH 3/5] packet: minor: convert status bits into shifting format This makes it more readable and clearer what bits are still free to use. The compiler reduces this to a constant for us anyway. Signed-off-by: Daniel Borkmann Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/uapi/linux/if_packet.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index 8136658ea477..4dfc234d80e5 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -86,19 +86,19 @@ struct tpacket_auxdata { }; /* Rx ring - header status */ -#define TP_STATUS_KERNEL 0x0 -#define TP_STATUS_USER 0x1 -#define TP_STATUS_COPY 0x2 -#define TP_STATUS_LOSING 0x4 -#define TP_STATUS_CSUMNOTREADY 0x8 -#define TP_STATUS_VLAN_VALID 0x10 /* auxdata has valid tp_vlan_tci */ -#define TP_STATUS_BLK_TMO 0x20 +#define TP_STATUS_KERNEL 0 +#define TP_STATUS_USER (1 << 0) +#define TP_STATUS_COPY (1 << 1) +#define TP_STATUS_LOSING (1 << 2) +#define TP_STATUS_CSUMNOTREADY (1 << 3) +#define TP_STATUS_VLAN_VALID (1 << 4) /* auxdata has valid tp_vlan_tci */ +#define TP_STATUS_BLK_TMO (1 << 5) /* Tx ring - header status */ -#define TP_STATUS_AVAILABLE 0x0 -#define TP_STATUS_SEND_REQUEST 0x1 -#define TP_STATUS_SENDING 0x2 -#define TP_STATUS_WRONG_FORMAT 0x4 +#define TP_STATUS_AVAILABLE 0 +#define TP_STATUS_SEND_REQUEST (1 << 0) +#define TP_STATUS_SENDING (1 << 1) +#define TP_STATUS_WRONG_FORMAT (1 << 2) /* Rx ring - feature request bits */ #define TP_FT_REQ_FILL_RXHASH 0x1 From b9c32fb2717094231b31a7d7dcf5fd7f3638ac2f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 23 Apr 2013 00:39:31 +0000 Subject: [PATCH 4/5] packet: if hw/sw ts enabled in rx/tx ring, report which ts we got Currently, there is no way to find out which timestamp is reported in tpacket{,2,3}_hdr's tp_sec, tp_{n,u}sec members. It can be one of SOF_TIMESTAMPING_SYS_HARDWARE, SOF_TIMESTAMPING_RAW_HARDWARE, SOF_TIMESTAMPING_SOFTWARE, or a fallback variant late call from the PF_PACKET code in software. Therefore, report in the tp_status member of the ring buffer which timestamp has been reported for RX and TX path. This should not break anything for the following reasons: i) in RX ring path, the user needs to test for tp_status & TP_STATUS_USER, and later for other flags as well such as TP_STATUS_VLAN_VALID et al, so adding other flags will do no harm; ii) in TX ring path, time stamps with PACKET_TIMESTAMP socketoption are not available resp. had no effect except that the application setting this is buggy. Next to TP_STATUS_AVAILABLE, the user also should check for other flags such as TP_STATUS_WRONG_FORMAT to reclaim frames to the application. Thus, in case TX ts are turned off (default case), nothing happens to the application logic, and in case we want to use this new feature, we now can also check which of the ts source is reported in the status field as provided in the docs. Reported-by: Richard Cochran Signed-off-by: Daniel Borkmann Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/uapi/linux/if_packet.h | 5 +++++ net/packet/af_packet.c | 36 ++++++++++++++++++++++------------ 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index 4dfc234d80e5..b950c02030c0 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -100,6 +100,11 @@ struct tpacket_auxdata { #define TP_STATUS_SENDING (1 << 1) #define TP_STATUS_WRONG_FORMAT (1 << 2) +/* Rx and Tx ring - header status */ +#define TP_STATUS_TS_SOFTWARE (1 << 29) +#define TP_STATUS_TS_SYS_HARDWARE (1 << 30) +#define TP_STATUS_TS_RAW_HARDWARE (1 << 31) + /* Rx ring - feature request bits */ #define TP_FT_REQ_FILL_RXHASH 0x1 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 9d46a07a15c0..ba8309a3e01b 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -339,34 +339,35 @@ static int __packet_get_status(struct packet_sock *po, void *frame) } } -static bool tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts, - unsigned int flags) +static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts, + unsigned int flags) { struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); if (shhwtstamps) { if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) && ktime_to_timespec_cond(shhwtstamps->syststamp, ts)) - return true; + return TP_STATUS_TS_SYS_HARDWARE; if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) && ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts)) - return true; + return TP_STATUS_TS_RAW_HARDWARE; } if (ktime_to_timespec_cond(skb->tstamp, ts)) - return true; + return TP_STATUS_TS_SOFTWARE; - return false; + return 0; } -static void __packet_set_timestamp(struct packet_sock *po, void *frame, - struct sk_buff *skb) +static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame, + struct sk_buff *skb) { union tpacket_uhdr h; struct timespec ts; + __u32 ts_status; - if (!tpacket_get_timestamp(skb, &ts, po->tp_tstamp)) - return; + if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp))) + return 0; h.raw = frame; switch (po->tp_version) { @@ -387,6 +388,8 @@ static void __packet_set_timestamp(struct packet_sock *po, void *frame, /* one flush is safe, as both fields always lie on the same cacheline */ flush_dcache_page(pgv_to_page(&h.h1->tp_sec)); smp_wmb(); + + return ts_status; } static void *packet_lookup_frame(struct packet_sock *po, @@ -1721,6 +1724,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, unsigned short macoff, netoff, hdrlen; struct sk_buff *copy_skb = NULL; struct timespec ts; + __u32 ts_status; if (skb->pkt_type == PACKET_LOOPBACK) goto drop; @@ -1803,9 +1807,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, spin_unlock(&sk->sk_receive_queue.lock); skb_copy_bits(skb, 0, h.raw + macoff, snaplen); - if (!tpacket_get_timestamp(skb, &ts, po->tp_tstamp)) + + if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp))) getnstimeofday(&ts); + status |= ts_status; + switch (po->tp_version) { case TPACKET_V1: h.h1->tp_len = skb->len; @@ -1905,11 +1912,14 @@ static void tpacket_destruct_skb(struct sk_buff *skb) void *ph; if (likely(po->tx_ring.pg_vec)) { + __u32 ts; + ph = skb_shinfo(skb)->destructor_arg; BUG_ON(atomic_read(&po->tx_ring.pending) == 0); atomic_dec(&po->tx_ring.pending); - __packet_set_timestamp(po, ph, skb); - __packet_set_status(po, ph, TP_STATUS_AVAILABLE); + + ts = __packet_set_timestamp(po, ph, skb); + __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts); } sock_wfree(skb); From 2940b26bec9fe5bf183c994678e62b55d35717e6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 23 Apr 2013 00:39:32 +0000 Subject: [PATCH 5/5] packet: doc: update timestamping part Bring the timestamping section in sync with the implementation. Signed-off-by: Daniel Borkmann Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- Documentation/networking/packet_mmap.txt | 41 ++++++++++++++++++++---- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt index 65efb85e49de..23dd80e82b8e 100644 --- a/Documentation/networking/packet_mmap.txt +++ b/Documentation/networking/packet_mmap.txt @@ -1016,10 +1016,11 @@ retry_block: ------------------------------------------------------------------------------- The PACKET_TIMESTAMP setting determines the source of the timestamp in -the packet meta information. If your NIC is capable of timestamping -packets in hardware, you can request those hardware timestamps to used. -Note: you may need to enable the generation of hardware timestamps with -SIOCSHWTSTAMP. +the packet meta information for mmap(2)ed RX_RING and TX_RINGs. If your +NIC is capable of timestamping packets in hardware, you can request those +hardware timestamps to be used. Note: you may need to enable the generation +of hardware timestamps with SIOCSHWTSTAMP (see related information from +Documentation/networking/timestamping.txt). PACKET_TIMESTAMP accepts the same integer bit field as SO_TIMESTAMPING. However, only the SOF_TIMESTAMPING_SYS_HARDWARE @@ -1031,8 +1032,36 @@ SOF_TIMESTAMPING_RAW_HARDWARE if both bits are set. req |= SOF_TIMESTAMPING_SYS_HARDWARE; setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, (void *) &req, sizeof(req)) -If PACKET_TIMESTAMP is not set, a software timestamp generated inside -the networking stack is used (the behavior before this setting was added). +For the mmap(2)ed ring buffers, such timestamps are stored in the +tpacket{,2,3}_hdr structure's tp_sec and tp_{n,u}sec members. To determine +what kind of timestamp has been reported, the tp_status field is binary |'ed +with the following possible bits ... + + TP_STATUS_TS_SYS_HARDWARE + TP_STATUS_TS_RAW_HARDWARE + TP_STATUS_TS_SOFTWARE + +... that are equivalent to its SOF_TIMESTAMPING_* counterparts. For the +RX_RING, if none of those 3 are set (i.e. PACKET_TIMESTAMP is not set), +then this means that a software fallback was invoked *within* PF_PACKET's +processing code (less precise). + +Getting timestamps for the TX_RING works as follows: i) fill the ring frames, +ii) call sendto() e.g. in blocking mode, iii) wait for status of relevant +frames to be updated resp. the frame handed over to the application, iv) walk +through the frames to pick up the individual hw/sw timestamps. + +Only (!) if transmit timestamping is enabled, then these bits are combined +with binary | with TP_STATUS_AVAILABLE, so you must check for that in your +application (e.g. !(tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING)) +in a first step to see if the frame belongs to the application, and then +one can extract the type of timestamp in a second step from tp_status)! + +If you don't care about them, thus having it disabled, checking for +TP_STATUS_AVAILABLE resp. TP_STATUS_WRONG_FORMAT is sufficient. If in the +TX_RING part only TP_STATUS_AVAILABLE is set, then the tp_sec and tp_{n,u}sec +members do not contain a valid value. For TX_RINGs, by default no timestamp +is generated! See include/linux/net_tstamp.h and Documentation/networking/timestamping for more information on hardware timestamps.