From ad377cab4966e2f9162f05be1f7eeae466d411a8 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 12 May 2015 11:56:45 -0400 Subject: [PATCH 1/6] packet: rollover prepare: move code out of callsites packet_rcv_fanout calls fanout_demux_rollover twice. Move all rollover logic into the callee to simplify these callsites, especially with upcoming changes. The main differences between the two callsites is that the FLAG variant tests whether the socket previously selected by another mode (RR, RND, HASH, ..) has room before migrating flows, whereas the rollover mode has no original socket to test. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 12c5dde8e344f..5c88ecf22cc0b 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1318,18 +1318,22 @@ static unsigned int fanout_demux_rnd(struct packet_fanout *f, static unsigned int fanout_demux_rollover(struct packet_fanout *f, struct sk_buff *skb, - unsigned int idx, unsigned int skip, + unsigned int idx, bool try_self, unsigned int num) { unsigned int i, j; + if (try_self && packet_rcv_has_room(pkt_sk(f->arr[idx]), skb)) + return idx; + i = j = min_t(int, f->next[idx], num - 1); do { - if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) { + if (i != idx && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) { if (i != j) f->next[idx] = i; return i; } + if (++i == num) i = 0; } while (i != j); @@ -1386,17 +1390,14 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, idx = fanout_demux_qm(f, skb, num); break; case PACKET_FANOUT_ROLLOVER: - idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num); + idx = fanout_demux_rollover(f, skb, 0, false, num); break; } - po = pkt_sk(f->arr[idx]); - if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) && - unlikely(!packet_rcv_has_room(po, skb))) { - idx = fanout_demux_rollover(f, skb, idx, idx, num); - po = pkt_sk(f->arr[idx]); - } + if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER)) + idx = fanout_demux_rollover(f, skb, idx, true, num); + po = pkt_sk(f->arr[idx]); return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev); } From 0648ab70afe6c3bf2369a6d779b44a85121c063d Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 12 May 2015 11:56:46 -0400 Subject: [PATCH 2/6] packet: rollover prepare: per-socket state Replace rollover state per fanout group with state per socket. Future patches will add fields to the new structure. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 21 ++++++++++++++++++--- net/packet/internal.h | 6 +++++- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 5c88ecf22cc0b..ad3d9ff565413 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1321,16 +1321,18 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, unsigned int idx, bool try_self, unsigned int num) { + struct packet_sock *po; unsigned int i, j; - if (try_self && packet_rcv_has_room(pkt_sk(f->arr[idx]), skb)) + po = pkt_sk(f->arr[idx]); + if (try_self && packet_rcv_has_room(po, skb)) return idx; - i = j = min_t(int, f->next[idx], num - 1); + i = j = min_t(int, po->rollover->sock, num - 1); do { if (i != idx && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) { if (i != j) - f->next[idx] = i; + po->rollover->sock = i; return i; } @@ -1468,6 +1470,12 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) if (po->fanout) return -EALREADY; + if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER) { + po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL); + if (!po->rollover) + return -ENOMEM; + } + mutex_lock(&fanout_mutex); match = NULL; list_for_each_entry(f, &fanout_list, list) { @@ -1516,6 +1524,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) } out: mutex_unlock(&fanout_mutex); + if (err) { + kfree(po->rollover); + po->rollover = NULL; + } return err; } @@ -1537,6 +1549,8 @@ static void fanout_release(struct sock *sk) kfree(f); } mutex_unlock(&fanout_mutex); + + kfree(po->rollover); } static const struct proto_ops packet_ops; @@ -2866,6 +2880,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, spin_lock_init(&po->bind_lock); mutex_init(&po->pg_vec_lock); + po->rollover = NULL; po->prot_hook.func = packet_rcv; if (sock->type == SOCK_PACKET) diff --git a/net/packet/internal.h b/net/packet/internal.h index fe6e20caea1d9..a9d33a28a019b 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -82,12 +82,15 @@ struct packet_fanout { atomic_t rr_cur; struct list_head list; struct sock *arr[PACKET_FANOUT_MAX]; - int next[PACKET_FANOUT_MAX]; spinlock_t lock; atomic_t sk_ref; struct packet_type prot_hook ____cacheline_aligned_in_smp; }; +struct packet_rollover { + int sock; +} ____cacheline_aligned_in_smp; + struct packet_sock { /* struct sock has to be the first member of packet_sock */ struct sock sk; @@ -104,6 +107,7 @@ struct packet_sock { has_vnet_hdr:1; int ifindex; /* bound device */ __be16 num; + struct packet_rollover *rollover; struct packet_mclist *mclist; atomic_t mapped; enum tpacket_versions tp_version; From 9954729bc3896998d53040c46a28830a3a3d5063 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 12 May 2015 11:56:47 -0400 Subject: [PATCH 3/6] packet: rollover only to socket with headroom Only migrate flows to sockets that have sufficient headroom, where sufficient is defined as having at least 25% empty space. The kernel has three different buffer types: a regular socket, a ring with frames (TPACKET_V[12]) or a ring with blocks (TPACKET_V3). The latter two do not expose a read pointer to the kernel, so headroom is not computed easily. All three needs a different implementation to estimate free space. Tested: Ran bench_rollover for 10 sec with 1.5 Mpps of single flow input. bench_rollover has as many sockets as there are NIC receive queues in the system. Each socket is owned by a process that is pinned to one of the receive cpus. RFS is disabled. RPS is enabled with an identity mapping (cpu x -> cpu x), to count drops with softnettop. lpbb5:/export/hda3/willemb# ./bench_rollover -r -l 1000 -s Press [Enter] to exit cpu rx rx.k drop.k rollover r.huge r.failed 0 16 16 0 0 0 0 1 21 21 0 0 0 0 2 5227502 5227502 0 0 0 0 3 18 18 0 0 0 0 4 6083289 6083289 0 5227496 0 0 5 22 22 0 0 0 0 6 21 21 0 0 0 0 7 9 9 0 0 0 0 Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 76 ++++++++++++++++++++++++++++++++---------- 1 file changed, 59 insertions(+), 17 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ad3d9ff565413..ffa67205f698a 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1234,27 +1234,68 @@ static void packet_free_pending(struct packet_sock *po) free_percpu(po->tx_ring.pending_refcnt); } -static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) +#define ROOM_POW_OFF 2 +#define ROOM_NONE 0x0 +#define ROOM_LOW 0x1 +#define ROOM_NORMAL 0x2 + +static bool __tpacket_has_room(struct packet_sock *po, int pow_off) { - struct sock *sk = &po->sk; - bool has_room; + int idx, len; + + len = po->rx_ring.frame_max + 1; + idx = po->rx_ring.head; + if (pow_off) + idx += len >> pow_off; + if (idx >= len) + idx -= len; + return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL); +} - if (po->prot_hook.func != tpacket_rcv) - return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize) - <= sk->sk_rcvbuf; +static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off) +{ + int idx, len; + + len = po->rx_ring.prb_bdqc.knum_blocks; + idx = po->rx_ring.prb_bdqc.kactive_blk_num; + if (pow_off) + idx += len >> pow_off; + if (idx >= len) + idx -= len; + return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL); +} + +static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) +{ + struct sock *sk = &po->sk; + int ret = ROOM_NONE; + + if (po->prot_hook.func != tpacket_rcv) { + int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc) + - skb->truesize; + if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF)) + return ROOM_NORMAL; + else if (avail > 0) + return ROOM_LOW; + else + return ROOM_NONE; + } spin_lock(&sk->sk_receive_queue.lock); - if (po->tp_version == TPACKET_V3) - has_room = prb_lookup_block(po, &po->rx_ring, - po->rx_ring.prb_bdqc.kactive_blk_num, - TP_STATUS_KERNEL); - else - has_room = packet_lookup_frame(po, &po->rx_ring, - po->rx_ring.head, - TP_STATUS_KERNEL); + if (po->tp_version == TPACKET_V3) { + if (__tpacket_v3_has_room(po, ROOM_POW_OFF)) + ret = ROOM_NORMAL; + else if (__tpacket_v3_has_room(po, 0)) + ret = ROOM_LOW; + } else { + if (__tpacket_has_room(po, ROOM_POW_OFF)) + ret = ROOM_NORMAL; + else if (__tpacket_has_room(po, 0)) + ret = ROOM_LOW; + } spin_unlock(&sk->sk_receive_queue.lock); - return has_room; + return ret; } static void packet_sock_destruct(struct sock *sk) @@ -1325,12 +1366,13 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, unsigned int i, j; po = pkt_sk(f->arr[idx]); - if (try_self && packet_rcv_has_room(po, skb)) + if (try_self && packet_rcv_has_room(po, skb) != ROOM_NONE) return idx; i = j = min_t(int, po->rollover->sock, num - 1); do { - if (i != idx && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) { + if (i != idx && + packet_rcv_has_room(pkt_sk(f->arr[i]), skb) == ROOM_NORMAL) { if (i != j) po->rollover->sock = i; return i; From 2ccdbaa6d55b0656244ba57c4b56765a0af76c0a Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 12 May 2015 11:56:48 -0400 Subject: [PATCH 4/6] packet: rollover lock contention avoidance Rollover has to call packet_rcv_has_room on sockets in the fanout group to find a socket to migrate to. This operation is expensive especially if the packet sockets use rings, when a lock has to be acquired. Avoid pounding on the lock by all sockets by temporarily marking a socket as "under memory pressure" when such pressure is detected. While set, only the socket owner may call packet_rcv_has_room on the socket. Once it detects normal conditions, it clears the flag. The socket is not used as a victim by any other socket in the meantime. Under reasonably balanced load, each socket writer frequently calls packet_rcv_has_room and clears its own pressure field. As a backup for when the socket is rarely written to, also clear the flag on reading (packet_recvmsg, packet_poll) if this can be done cheaply (i.e., without calling packet_rcv_has_room). This is only for edge cases. Tested: Ran bench_rollover: a process with 8 sockets in a single fanout group, each pinned to a single cpu that receives one nic recv interrupt. RPS and RFS are disabled. The benchmark uses packet rx_ring, which has to take a lock when determining whether a socket has room. Sent 3.5 Mpps of UDP traffic with sufficient entropy to spread uniformly across the packet sockets (and inserted an iptables rule to drop in PREROUTING to avoid protocol stack processing). Without this patch, all sockets try to migrate traffic to neighbors, causing lock contention when searching for a non- empty neighbor. The lock is the top 9 entries. perf record -a -g sleep 5 - 17.82% bench_rollover [kernel.kallsyms] [k] _raw_spin_lock - _raw_spin_lock - 99.00% spin_lock + 81.77% packet_rcv_has_room.isra.41 + 18.23% tpacket_rcv + 0.84% packet_rcv_has_room.isra.41 + 5.20% ksoftirqd/6 [kernel.kallsyms] [k] _raw_spin_lock + 5.15% ksoftirqd/1 [kernel.kallsyms] [k] _raw_spin_lock + 5.14% ksoftirqd/2 [kernel.kallsyms] [k] _raw_spin_lock + 5.12% ksoftirqd/7 [kernel.kallsyms] [k] _raw_spin_lock + 5.12% ksoftirqd/5 [kernel.kallsyms] [k] _raw_spin_lock + 5.10% ksoftirqd/4 [kernel.kallsyms] [k] _raw_spin_lock + 4.66% ksoftirqd/0 [kernel.kallsyms] [k] _raw_spin_lock + 4.45% ksoftirqd/3 [kernel.kallsyms] [k] _raw_spin_lock + 1.55% bench_rollover [kernel.kallsyms] [k] packet_rcv_has_room.isra.41 On net-next with this patch, this lock contention is no longer a top entry. Most time is spent in the actual read function. Next up are other locks: + 15.52% bench_rollover bench_rollover [.] reader + 4.68% swapper [kernel.kallsyms] [k] memcpy_erms + 2.77% swapper [kernel.kallsyms] [k] packet_lookup_frame.isra.51 + 2.56% ksoftirqd/1 [kernel.kallsyms] [k] memcpy_erms + 2.16% swapper [kernel.kallsyms] [k] tpacket_rcv + 1.93% swapper [kernel.kallsyms] [k] mlx4_en_process_rx_cq Looking closer at the remaining _raw_spin_lock, the cost of probing in rollover is now comparable to the cost of taking the lock later in tpacket_rcv. - 1.51% swapper [kernel.kallsyms] [k] _raw_spin_lock - _raw_spin_lock + 33.41% packet_rcv_has_room + 28.15% tpacket_rcv + 19.54% enqueue_to_backlog + 6.45% __free_pages_ok + 2.78% packet_rcv_fanout + 2.13% fanout_demux_rollover + 2.01% netif_receive_skb_internal Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 38 +++++++++++++++++++++++++++++++------- net/packet/internal.h | 1 + 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ffa67205f698a..3a383fd72f820 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1265,14 +1265,14 @@ static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off) return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } -static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) +static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) { struct sock *sk = &po->sk; int ret = ROOM_NONE; if (po->prot_hook.func != tpacket_rcv) { int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc) - - skb->truesize; + - (skb ? skb->truesize : 0); if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF)) return ROOM_NORMAL; else if (avail > 0) @@ -1281,7 +1281,6 @@ static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) return ROOM_NONE; } - spin_lock(&sk->sk_receive_queue.lock); if (po->tp_version == TPACKET_V3) { if (__tpacket_v3_has_room(po, ROOM_POW_OFF)) ret = ROOM_NORMAL; @@ -1293,7 +1292,26 @@ static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) else if (__tpacket_has_room(po, 0)) ret = ROOM_LOW; } - spin_unlock(&sk->sk_receive_queue.lock); + + return ret; +} + +static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) +{ + int ret; + bool has_room; + + if (po->prot_hook.func == tpacket_rcv) { + spin_lock(&po->sk.sk_receive_queue.lock); + ret = __packet_rcv_has_room(po, skb); + spin_unlock(&po->sk.sk_receive_queue.lock); + } else { + ret = __packet_rcv_has_room(po, skb); + } + + has_room = ret == ROOM_NORMAL; + if (po->pressure == has_room) + xchg(&po->pressure, !has_room); return ret; } @@ -1362,7 +1380,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, unsigned int idx, bool try_self, unsigned int num) { - struct packet_sock *po; + struct packet_sock *po, *po_next; unsigned int i, j; po = pkt_sk(f->arr[idx]); @@ -1371,8 +1389,9 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, i = j = min_t(int, po->rollover->sock, num - 1); do { - if (i != idx && - packet_rcv_has_room(pkt_sk(f->arr[i]), skb) == ROOM_NORMAL) { + po_next = pkt_sk(f->arr[i]); + if (po_next != po && !po_next->pressure && + packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) { if (i != j) po->rollover->sock = i; return i; @@ -3000,6 +3019,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (skb == NULL) goto out; + if (pkt_sk(sk)->pressure) + packet_rcv_has_room(pkt_sk(sk), NULL); + if (pkt_sk(sk)->has_vnet_hdr) { struct virtio_net_hdr vnet_hdr = { 0 }; @@ -3755,6 +3777,8 @@ static unsigned int packet_poll(struct file *file, struct socket *sock, TP_STATUS_KERNEL)) mask |= POLLIN | POLLRDNORM; } + if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) + xchg(&po->pressure, 0); spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (po->tx_ring.pg_vec) { diff --git a/net/packet/internal.h b/net/packet/internal.h index a9d33a28a019b..22d7d778c5b73 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -105,6 +105,7 @@ struct packet_sock { auxdata:1, origdev:1, has_vnet_hdr:1; + int pressure; int ifindex; /* bound device */ __be16 num; struct packet_rollover *rollover; From 3b3a5b0aab5b9ad345d4beb9a364a7dd02c23d40 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 12 May 2015 11:56:49 -0400 Subject: [PATCH 5/6] packet: rollover huge flows before small flows Migrate flows from a socket to another socket in the fanout group not only when the socket is full. Start migrating huge flows early, to divert possible 4-tuple attacks without affecting normal traffic. Introduce fanout_flow_is_huge(). This detects huge flows, which are defined as taking up more than half the load. It does so cheaply, by storing the rxhashes of the N most recent packets. If over half of these are the same rxhash as the current packet, then drop it. This only protects against 4-tuple attacks. N is chosen to fit all data in a single cache line. Tested: Ran bench_rollover for 10 sec with 1.5 Mpps of single flow input. lpbb5:/export/hda3/willemb# ./bench_rollover -l 1000 -r -s cpu rx rx.k drop.k rollover r.huge r.failed 0 14 14 0 0 0 0 1 20 20 0 0 0 0 2 16 16 0 0 0 0 3 6168824 6168824 0 4867721 4867721 0 4 4867741 4867741 0 0 0 0 5 12 12 0 0 0 0 6 15 15 0 0 0 0 7 17 17 0 0 0 0 Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 25 ++++++++++++++++++++++--- net/packet/internal.h | 2 ++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 3a383fd72f820..8f0156b10f8d4 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1341,6 +1341,20 @@ static int fanout_rr_next(struct packet_fanout *f, unsigned int num) return x; } +static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb) +{ + u32 rxhash; + int i, count = 0; + + rxhash = skb_get_hash(skb); + for (i = 0; i < ROLLOVER_HLEN; i++) + if (po->rollover->history[i] == rxhash) + count++; + + po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash; + return count > (ROLLOVER_HLEN >> 1); +} + static unsigned int fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) @@ -1381,11 +1395,16 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, unsigned int num) { struct packet_sock *po, *po_next; - unsigned int i, j; + unsigned int i, j, room; po = pkt_sk(f->arr[idx]); - if (try_self && packet_rcv_has_room(po, skb) != ROOM_NONE) - return idx; + + if (try_self) { + room = packet_rcv_has_room(po, skb); + if (room == ROOM_NORMAL || + (room == ROOM_LOW && !fanout_flow_is_huge(po, skb))) + return idx; + } i = j = min_t(int, po->rollover->sock, num - 1); do { diff --git a/net/packet/internal.h b/net/packet/internal.h index 22d7d778c5b73..a9d30a17c7147 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -89,6 +89,8 @@ struct packet_fanout { struct packet_rollover { int sock; +#define ROLLOVER_HLEN (L1_CACHE_BYTES / sizeof(u32)) + u32 history[ROLLOVER_HLEN] ____cacheline_aligned; } ____cacheline_aligned_in_smp; struct packet_sock { From a9b6391814d5d6b8668fca2dace86949b7244e2e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 12 May 2015 11:56:50 -0400 Subject: [PATCH 6/6] packet: rollover statistics Rollover indicates exceptional conditions. Export a counter to inform socket owners of this state. If no socket with sufficient room is found, rollover fails. Also count these events. Finally, also count when flows are rolled over early thanks to huge flow detection, to validate its correctness. Tested: Read counters in bench_rollover on all other tests in the patchset Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/uapi/linux/if_packet.h | 7 +++++++ net/packet/af_packet.c | 19 ++++++++++++++++++- net/packet/internal.h | 3 +++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index 053bd102fbe00..d3d715f8c88f6 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -54,6 +54,7 @@ struct sockaddr_ll { #define PACKET_FANOUT 18 #define PACKET_TX_HAS_OFF 19 #define PACKET_QDISC_BYPASS 20 +#define PACKET_ROLLOVER_STATS 21 #define PACKET_FANOUT_HASH 0 #define PACKET_FANOUT_LB 1 @@ -75,6 +76,12 @@ struct tpacket_stats_v3 { unsigned int tp_freeze_q_cnt; }; +struct tpacket_rollover_stats { + __aligned_u64 tp_all; + __aligned_u64 tp_huge; + __aligned_u64 tp_failed; +}; + union tpacket_stats_u { struct tpacket_stats stats1; struct tpacket_stats_v3 stats3; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8f0156b10f8d4..31d58565726c4 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1395,7 +1395,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, unsigned int num) { struct packet_sock *po, *po_next; - unsigned int i, j, room; + unsigned int i, j, room = ROOM_NONE; po = pkt_sk(f->arr[idx]); @@ -1413,6 +1413,9 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) { if (i != j) po->rollover->sock = i; + atomic_long_inc(&po->rollover->num); + if (room == ROOM_LOW) + atomic_long_inc(&po->rollover->num_huge); return i; } @@ -1420,6 +1423,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, i = 0; } while (i != j); + atomic_long_inc(&po->rollover->num_failed); return idx; } @@ -1554,6 +1558,9 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL); if (!po->rollover) return -ENOMEM; + atomic_long_set(&po->rollover->num, 0); + atomic_long_set(&po->rollover->num_huge, 0); + atomic_long_set(&po->rollover->num_failed, 0); } mutex_lock(&fanout_mutex); @@ -3584,6 +3591,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, struct packet_sock *po = pkt_sk(sk); void *data = &val; union tpacket_stats_u st; + struct tpacket_rollover_stats rstats; if (level != SOL_PACKET) return -ENOPROTOOPT; @@ -3659,6 +3667,15 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, ((u32)po->fanout->flags << 24)) : 0); break; + case PACKET_ROLLOVER_STATS: + if (!po->rollover) + return -EINVAL; + rstats.tp_all = atomic_long_read(&po->rollover->num); + rstats.tp_huge = atomic_long_read(&po->rollover->num_huge); + rstats.tp_failed = atomic_long_read(&po->rollover->num_failed); + data = &rstats; + lv = sizeof(rstats); + break; case PACKET_TX_HAS_OFF: val = po->tp_tx_has_off; break; diff --git a/net/packet/internal.h b/net/packet/internal.h index a9d30a17c7147..c035d263c1e8d 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -89,6 +89,9 @@ struct packet_fanout { struct packet_rollover { int sock; + atomic_long_t num; + atomic_long_t num_huge; + atomic_long_t num_failed; #define ROLLOVER_HLEN (L1_CACHE_BYTES / sizeof(u32)) u32 history[ROLLOVER_HLEN] ____cacheline_aligned; } ____cacheline_aligned_in_smp;