Skip to content

Commit

Permalink
packet: rollover only to socket with headroom
Browse files Browse the repository at this point in the history
Only migrate flows to sockets that have sufficient headroom, where
sufficient is defined as having at least 25% empty space.

The kernel has three different buffer types: a regular socket, a ring
with frames (TPACKET_V[12]) or a ring with blocks (TPACKET_V3). The
latter two do not expose a read pointer to the kernel, so headroom is
not computed easily. All three need a different implementation to
estimate free space.

Tested:
  Ran bench_rollover for 10 sec with 1.5 Mpps of single flow input.

  bench_rollover has as many sockets as there are NIC receive queues
  in the system. Each socket is owned by a process that is pinned to
  one of the receive cpus. RFS is disabled. RPS is enabled with an
  identity mapping (cpu x -> cpu x), to count drops with softnettop.

    lpbb5:/export/hda3/willemb# ./bench_rollover -r -l 1000 -s
    Press [Enter] to exit

    cpu         rx       rx.k     drop.k   rollover     r.huge   r.failed
      0         16         16          0          0          0          0
      1         21         21          0          0          0          0
      2    5227502    5227502          0          0          0          0
      3         18         18          0          0          0          0
      4    6083289    6083289          0    5227496          0          0
      5         22         22          0          0          0          0
      6         21         21          0          0          0          0
      7          9          9          0          0          0          0

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
Willem de Bruijn authored and David S. Miller committed May 13, 2015
1 parent 0648ab7 commit 9954729
Showing 1 changed file with 59 additions and 17 deletions.
76 changes: 59 additions & 17 deletions net/packet/af_packet.c
Original file line number Diff line number Diff line change
Expand Up @@ -1234,27 +1234,68 @@ static void packet_free_pending(struct packet_sock *po)
free_percpu(po->tx_ring.pending_refcnt);
}

static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
/*
 * Shift used to derive the headroom threshold from a buffer size:
 * size >> ROOM_POW_OFF, i.e. one quarter of the buffer.
 */
#define ROOM_POW_OFF 2
/*
 * Room levels returned by packet_rcv_has_room():
 *   ROOM_NONE   - no space found for the packet
 *   ROOM_LOW    - space is available, but not above the headroom threshold
 *   ROOM_NORMAL - space is available above the headroom threshold
 */
#define ROOM_NONE 0x0
#define ROOM_LOW 0x1
#define ROOM_NORMAL 0x2

/*
 * Probe one slot of the frame-based rx ring (used by packet_rcv_has_room()
 * when tp_version is not TPACKET_V3).
 *
 * @po:      packet socket whose rx_ring is inspected
 * @pow_off: 0 to probe the slot at rx_ring.head; otherwise the probe is
 *           moved (ring length >> pow_off) slots ahead of head
 *
 * Returns the result of packet_lookup_frame() for the probed slot with
 * TP_STATUS_KERNEL, converted to bool.
 * NOTE(review): presumably true means the slot is still owned by the
 * kernel, i.e. free to be filled - confirm against packet_lookup_frame().
 */
static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
{
/*
 * NOTE(review): sk and has_room are not referenced anywhere in this
 * function; they look like removed lines of the previous
 * packet_rcv_has_room() shown inline by the diff view - confirm.
 */
struct sock *sk = &po->sk;
bool has_room;
int idx, len;

/* frame_max is the highest slot index, so the ring holds frame_max + 1 slots */
len = po->rx_ring.frame_max + 1;
idx = po->rx_ring.head;
if (pow_off)
idx += len >> pow_off;
/* wrap around the end of the ring; the offset is always smaller than len */
if (idx >= len)
idx -= len;
return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

if (po->prot_hook.func != tpacket_rcv)
return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
<= sk->sk_rcvbuf;
/*
 * Probe one block of the block-based rx ring (used by packet_rcv_has_room()
 * when tp_version is TPACKET_V3).
 *
 * @po:      packet socket whose rx_ring is inspected
 * @pow_off: 0 to probe the block at prb_bdqc.kactive_blk_num; otherwise the
 *           probe is moved (number of blocks >> pow_off) blocks ahead of it
 *
 * Returns the result of prb_lookup_block() for the probed block with
 * TP_STATUS_KERNEL, converted to bool.
 * NOTE(review): presumably true means the block is still owned by the
 * kernel, i.e. free to be filled - confirm against prb_lookup_block().
 */
static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
{
int idx, len;

len = po->rx_ring.prb_bdqc.knum_blocks;
idx = po->rx_ring.prb_bdqc.kactive_blk_num;
if (pow_off)
idx += len >> pow_off;
/* wrap around the end of the ring; the offset is always smaller than len */
if (idx >= len)
idx -= len;
return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

/*
 * Classify how much receive space a packet socket has left for skb.
 *
 * @po:  packet socket to inspect
 * @skb: packet about to be queued; only its truesize is read, and only on
 *       the path where prot_hook.func is not tpacket_rcv
 *
 * Returns ROOM_NORMAL, ROOM_LOW or ROOM_NONE.
 *
 * When prot_hook.func is not tpacket_rcv the decision is made from
 * sk_rcvbuf and sk_rmem_alloc without taking a lock. Otherwise the rx ring
 * is probed with sk_receive_queue.lock held.
 */
static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
struct sock *sk = &po->sk;
int ret = ROOM_NONE;

if (po->prot_hook.func != tpacket_rcv) {
/* bytes that would remain in the receive buffer after charging this skb */
int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
- skb->truesize;
/* more than a quarter of sk_rcvbuf left counts as normal headroom */
if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
return ROOM_NORMAL;
else if (avail > 0)
return ROOM_LOW;
else
return ROOM_NONE;
}

spin_lock(&sk->sk_receive_queue.lock);
/*
 * NOTE(review): the if/else assigning has_room just below, and the later
 * "return has_room;", use a variable this function does not declare; they
 * look like the removed pre-change lines shown inline by the diff view -
 * confirm.
 */
if (po->tp_version == TPACKET_V3)
has_room = prb_lookup_block(po, &po->rx_ring,
po->rx_ring.prb_bdqc.kactive_blk_num,
TP_STATUS_KERNEL);
else
has_room = packet_lookup_frame(po, &po->rx_ring,
po->rx_ring.head,
TP_STATUS_KERNEL);
/*
 * Probe a quarter ring ahead first; if that fails, fall back to the
 * current position. ret stays ROOM_NONE when both probes fail.
 */
if (po->tp_version == TPACKET_V3) {
if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
ret = ROOM_NORMAL;
else if (__tpacket_v3_has_room(po, 0))
ret = ROOM_LOW;
} else {
if (__tpacket_has_room(po, ROOM_POW_OFF))
ret = ROOM_NORMAL;
else if (__tpacket_has_room(po, 0))
ret = ROOM_LOW;
}
spin_unlock(&sk->sk_receive_queue.lock);

return has_room;
return ret;
}

static void packet_sock_destruct(struct sock *sk)
Expand Down Expand Up @@ -1325,12 +1366,13 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f,
unsigned int i, j;

po = pkt_sk(f->arr[idx]);
if (try_self && packet_rcv_has_room(po, skb))
if (try_self && packet_rcv_has_room(po, skb) != ROOM_NONE)
return idx;

i = j = min_t(int, po->rollover->sock, num - 1);
do {
if (i != idx && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
if (i != idx &&
packet_rcv_has_room(pkt_sk(f->arr[i]), skb) == ROOM_NORMAL) {
if (i != j)
po->rollover->sock = i;
return i;
Expand Down

0 comments on commit 9954729

Please sign in to comment.