Merge branch 'tcp-remove-prequeue-and-header-prediction'
Florian Westphal says:

====================
tcp: remove prequeue and header prediction

During a hallway discussion with Eric Dumazet at Netdev 1.2 in
Tokyo, some maybe-not-so-useful-anymore TCP stack features came up,
among them header prediction and prequeueing.

In brief, TCP prequeue assumes a single-process-blocking-read design,
which is not that common anymore. The most frequently used high-performance
networking program that is an excellent fit for these features is netperf.

The idea behind prequeueing is to move part of tcp processing, including
retransmit queue cleaning, to process context.

With (e)poll designs, prequeue is always skipped, so for such programs
this is dead-code removal.
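
As a concrete illustration, here is a minimal userspace sketch (not part
of this series; socket setup and error handling are omitted, and fd is
assumed to be a connected TCP socket) of the two receive patterns.
Prequeue could only ever help the first one, where a task blocks inside
recv(); with epoll, data is already queued by the time the application
reads, so the prequeue path was skipped:

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/epoll.h>

/* Pattern 1: a single task blocking in recv().  Only here could TCP
 * processing be deferred into the reader's process context (prequeue). */
static void blocking_reader(int fd)
{
	char buf[4096];

	while (recv(fd, buf, sizeof(buf), 0) > 0)
		;	/* consume data */
}

/* Pattern 2: event loop.  epoll_wait() reports the socket only once data
 * already sits in the receive queue, so the prequeue path was not taken. */
static void epoll_reader(int fd)
{
	struct epoll_event ev = { .events = EPOLLIN };
	struct epoll_event out;
	char buf[4096];
	int ep = epoll_create1(0);

	ev.data.fd = fd;
	epoll_ctl(ep, EPOLL_CTL_ADD, fd, &ev);

	for (;;) {
		if (epoll_wait(ep, &out, 1, -1) == 1)
			recv(out.data.fd, buf, sizeof(buf), MSG_DONTWAIT);
	}
}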

Header prediction is also less useful nowadays.
For packet trains, GRO already aggregates packets, so the per-packet
benefit header prediction provided before GRO no longer applies.

Because of SACK, header prediction will also be ineffective once
a connection suffers even light packet loss.
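
A self-contained userspace sketch (an illustration, not code from this
series) of the mechanism: pred_flags packed the expected header length,
the ACK bit and the advertised window into one 32-bit word, and the
receive fast path fired only when an incoming segment's flag word matched
it exactly; anything unusual (options such as SACK blocks, a window
change, extra flags) fell back to the slow path:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define FLAG_ACK 0x00100000u	/* ACK bit in the host-order TCP flag word */

/* Mirrors the removed __tcp_fast_path_on(): data offset in the top four
 * bits, flags below it, receive window in the low 16 bits (cf. the
 * removed "0x5?10 << 16 + snd_wnd" comment). */
static uint32_t build_pred_flags(unsigned int header_len, uint16_t snd_wnd)
{
	return htonl((header_len << 26) | FLAG_ACK | snd_wnd);
}

int main(void)
{
	uint32_t pred = build_pred_flags(20, 512); /* no options, win 512 */

	/* Pure ACK, same header length and window: fast-path hit. */
	uint32_t hit  = htonl((20u << 26) | FLAG_ACK | 512);
	/* Same segment but with 12 bytes of options (e.g. timestamps): miss. */
	uint32_t miss = htonl((32u << 26) | FLAG_ACK | 512);

	printf("pred_flags = 0x%08x\n", ntohl(pred));
	printf("no options: %s\n", hit  == pred ? "fast path" : "slow path");
	printf("with opts:  %s\n", miss == pred ? "fast path" : "slow path");
	return 0;
}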

Code removal aside, after this change processing always occurs in BH
context; this allows experimenting with, e.g., bulk freeing of skb heads
when incoming ACKs clean packets from the retransmit queue.

There are no changes since the RFC, except in the last patch (I missed
another no-longer-used MIB counter). I also edited a few commit messages.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
David S. Miller committed Jul 31, 2017
2 parents 764646b + 3282e65 commit 834e0ec
Showing 14 changed files with 43 additions and 566 deletions.
7 changes: 1 addition & 6 deletions Documentation/networking/ip-sysctl.txt
@@ -353,12 +353,7 @@ tcp_l3mdev_accept - BOOLEAN
compiled with CONFIG_NET_L3_MASTER_DEV.

tcp_low_latency - BOOLEAN
If set, the TCP stack makes decisions that prefer lower
latency as opposed to higher throughput. By default, this
option is not set meaning that higher throughput is preferred.
An example of an application where this default should be
changed would be a Beowulf compute cluster.
Default: 0
This is a legacy option, it has no effect anymore.

tcp_max_orphans - INTEGER
Maximal number of TCP sockets not attached to any user file handle,
15 changes: 0 additions & 15 deletions include/linux/tcp.h
@@ -147,12 +147,6 @@ struct tcp_sock {
u16 tcp_header_len; /* Bytes of tcp header to send */
u16 gso_segs; /* Max number of segs per GSO packet */

/*
* Header prediction flags
* 0x5?10 << 16 + snd_wnd in net byte order
*/
__be32 pred_flags;

/*
* RFC793 variables by their proper names. This means you can
* read the code and the spec side by side (and laugh ...)
@@ -192,15 +186,6 @@ struct tcp_sock {

struct list_head tsq_node; /* anchor in tsq_tasklet.head list */

/* Data for direct copy to user */
struct {
struct sk_buff_head prequeue;
struct task_struct *task;
struct msghdr *msg;
int memory;
int len;
} ucopy;

u32 snd_wl1; /* Sequence for window update */
u32 snd_wnd; /* The window we expect to receive */
u32 max_window; /* Maximal window ever seen from peer */
40 changes: 2 additions & 38 deletions include/net/tcp.h
@@ -256,7 +256,6 @@ extern int sysctl_tcp_rmem[3];
extern int sysctl_tcp_app_win;
extern int sysctl_tcp_adv_win_scale;
extern int sysctl_tcp_frto;
extern int sysctl_tcp_low_latency;
extern int sysctl_tcp_nometrics_save;
extern int sysctl_tcp_moderate_rcvbuf;
extern int sysctl_tcp_tso_win_divisor;
@@ -632,29 +631,6 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us);
}

static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
{
tp->pred_flags = htonl((tp->tcp_header_len << 26) |
ntohl(TCP_FLAG_ACK) |
snd_wnd);
}

static inline void tcp_fast_path_on(struct tcp_sock *tp)
{
__tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale);
}

static inline void tcp_fast_path_check(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);

if (RB_EMPTY_ROOT(&tp->out_of_order_queue) &&
tp->rcv_wnd &&
atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
!tp->urg_data)
tcp_fast_path_on(tp);
}

/* Compute the actual rto_min value */
static inline u32 tcp_rto_min(struct sock *sk)
{
@@ -904,9 +880,8 @@ enum tcp_ca_event {

/* Information about inbound ACK, passed to cong_ops->in_ack_event() */
enum tcp_ca_ack_event_flags {
CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */
CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */
CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */
CA_ACK_WIN_UPDATE = (1 << 0), /* ACK updated window */
CA_ACK_ECE = (1 << 1), /* ECE bit is set on ack */
};

/*
@@ -1244,17 +1219,6 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb)
__tcp_checksum_complete(skb);
}

/* Prequeue for VJ style copy to user, combined with checksumming. */

static inline void tcp_prequeue_init(struct tcp_sock *tp)
{
tp->ucopy.task = NULL;
tp->ucopy.len = 0;
tp->ucopy.memory = 0;
skb_queue_head_init(&tp->ucopy.prequeue);
}

bool tcp_prequeue(struct sock *sk, struct sk_buff *skb);
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb);
int tcp_filter(struct sock *sk, struct sk_buff *skb);

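
The renumbering of the remaining CA_ACK_* bits above only matters to
congestion control modules implementing the in_ack_event() hook. A
hedged sketch (not taken from this series; the function name is made up)
of how such a callback consumes the flags that survive:

/* Hypothetical congestion control callback, written against the flags
 * that remain once CA_ACK_SLOWPATH is gone: modules simply react to the
 * events that are still reported. */
static void example_in_ack_event(struct sock *sk, u32 flags)
{
	if (flags & CA_ACK_ECE) {
		/* the peer echoed an ECN congestion mark; a DCTCP-style
		 * module would update its congestion estimate here */
	}

	if (flags & CA_ACK_WIN_UPDATE) {
		/* this ACK changed the peer's advertised window */
	}
}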
9 changes: 0 additions & 9 deletions include/uapi/linux/snmp.h
@@ -184,14 +184,7 @@ enum
LINUX_MIB_DELAYEDACKLOST, /* DelayedACKLost */
LINUX_MIB_LISTENOVERFLOWS, /* ListenOverflows */
LINUX_MIB_LISTENDROPS, /* ListenDrops */
LINUX_MIB_TCPPREQUEUED, /* TCPPrequeued */
LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, /* TCPDirectCopyFromBacklog */
LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, /* TCPDirectCopyFromPrequeue */
LINUX_MIB_TCPPREQUEUEDROPPED, /* TCPPrequeueDropped */
LINUX_MIB_TCPHPHITS, /* TCPHPHits */
LINUX_MIB_TCPHPHITSTOUSER, /* TCPHPHitsToUser */
LINUX_MIB_TCPPUREACKS, /* TCPPureAcks */
LINUX_MIB_TCPHPACKS, /* TCPHPAcks */
LINUX_MIB_TCPRENORECOVERY, /* TCPRenoRecovery */
LINUX_MIB_TCPSACKRECOVERY, /* TCPSackRecovery */
LINUX_MIB_TCPSACKRENEGING, /* TCPSACKReneging */
@@ -208,14 +201,12 @@ enum
LINUX_MIB_TCPSACKFAILURES, /* TCPSackFailures */
LINUX_MIB_TCPLOSSFAILURES, /* TCPLossFailures */
LINUX_MIB_TCPFASTRETRANS, /* TCPFastRetrans */
LINUX_MIB_TCPFORWARDRETRANS, /* TCPForwardRetrans */
LINUX_MIB_TCPSLOWSTARTRETRANS, /* TCPSlowStartRetrans */
LINUX_MIB_TCPTIMEOUTS, /* TCPTimeouts */
LINUX_MIB_TCPLOSSPROBES, /* TCPLossProbes */
LINUX_MIB_TCPLOSSPROBERECOVERY, /* TCPLossProbeRecovery */
LINUX_MIB_TCPRENORECOVERYFAIL, /* TCPRenoRecoveryFail */
LINUX_MIB_TCPSACKRECOVERYFAIL, /* TCPSackRecoveryFail */
LINUX_MIB_TCPSCHEDULERFAILED, /* TCPSchedulerFailed */
LINUX_MIB_TCPRCVCOLLAPSED, /* TCPRcvCollapsed */
LINUX_MIB_TCPDSACKOLDSENT, /* TCPDSACKOldSent */
LINUX_MIB_TCPDSACKOFOSENT, /* TCPDSACKOfoSent */
9 changes: 0 additions & 9 deletions net/ipv4/proc.c
@@ -206,14 +206,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS),
SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS),
SNMP_MIB_ITEM("TCPPrequeued", LINUX_MIB_TCPPREQUEUED),
SNMP_MIB_ITEM("TCPDirectCopyFromBacklog", LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG),
SNMP_MIB_ITEM("TCPDirectCopyFromPrequeue", LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE),
SNMP_MIB_ITEM("TCPPrequeueDropped", LINUX_MIB_TCPPREQUEUEDROPPED),
SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS),
SNMP_MIB_ITEM("TCPHPHitsToUser", LINUX_MIB_TCPHPHITSTOUSER),
SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS),
SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS),
SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY),
SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY),
SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING),
@@ -230,14 +223,12 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES),
SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES),
SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS),
SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS),
SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),
SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS),
SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES),
SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY),
SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED),
SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
3 changes: 3 additions & 0 deletions net/ipv4/sysctl_net_ipv4.c
@@ -45,6 +45,9 @@ static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
static int ip_ping_group_range_min[] = { 0, 0 };
static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };

/* obsolete */
static int sysctl_tcp_low_latency __read_mostly;

/* Update system visible IP port range */
static void set_local_port_range(struct net *net, int range[2])
{
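
For context on why the dummy variable above is kept: the sysctl itself
stays registered so that existing setups writing net.ipv4.tcp_low_latency
do not start failing. A sketch of the remaining ipv4_table[] entry (an
illustration of the idea, not a verbatim quote of the file):

	{
		.procname	= "tcp_low_latency",
		.data		= &sysctl_tcp_low_latency,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},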
109 changes: 1 addition & 108 deletions net/ipv4/tcp.c
@@ -400,7 +400,6 @@ void tcp_init_sock(struct sock *sk)

tp->out_of_order_queue = RB_ROOT;
tcp_init_xmit_timers(sk);
tcp_prequeue_init(tp);
INIT_LIST_HEAD(&tp->tsq_node);

icsk->icsk_rto = TCP_TIMEOUT_INIT;
@@ -1525,20 +1524,6 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied)
tcp_send_ack(sk);
}

static void tcp_prequeue_process(struct sock *sk)
{
struct sk_buff *skb;
struct tcp_sock *tp = tcp_sk(sk);

NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED);

while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
sk_backlog_rcv(sk, skb);

/* Clear memory counter. */
tp->ucopy.memory = 0;
}

static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
struct sk_buff *skb;
@@ -1671,7 +1656,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
int err;
int target; /* Read at least this many bytes */
long timeo;
struct task_struct *user_recv = NULL;
struct sk_buff *skb, *last;
u32 urg_hole = 0;

@@ -1806,51 +1790,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,

tcp_cleanup_rbuf(sk, copied);

if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
/* Install new reader */
if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
user_recv = current;
tp->ucopy.task = user_recv;
tp->ucopy.msg = msg;
}

tp->ucopy.len = len;

WARN_ON(tp->copied_seq != tp->rcv_nxt &&
!(flags & (MSG_PEEK | MSG_TRUNC)));

/* Ugly... If prequeue is not empty, we have to
* process it before releasing socket, otherwise
* order will be broken at second iteration.
* More elegant solution is required!!!
*
* Look: we have the following (pseudo)queues:
*
* 1. packets in flight
* 2. backlog
* 3. prequeue
* 4. receive_queue
*
* Each queue can be processed only if the next ones
* are empty. At this point we have empty receive_queue.
* But prequeue _can_ be not empty after 2nd iteration,
* when we jumped to start of loop because backlog
* processing added something to receive_queue.
* We cannot release_sock(), because backlog contains
* packets arrived _after_ prequeued ones.
*
* Shortly, algorithm is clear --- to process all
* the queues in order. We could make it more directly,
* requeueing packets from backlog to prequeue, if
* is not empty. It is more elegant, but eats cycles,
* unfortunately.
*/
if (!skb_queue_empty(&tp->ucopy.prequeue))
goto do_prequeue;

/* __ Set realtime policy in scheduler __ */
}

if (copied >= target) {
/* Do not sleep, just process backlog. */
release_sock(sk);
@@ -1859,31 +1798,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
sk_wait_data(sk, &timeo, last);
}

if (user_recv) {
int chunk;

/* __ Restore normal policy in scheduler __ */

chunk = len - tp->ucopy.len;
if (chunk != 0) {
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
len -= chunk;
copied += chunk;
}

if (tp->rcv_nxt == tp->copied_seq &&
!skb_queue_empty(&tp->ucopy.prequeue)) {
do_prequeue:
tcp_prequeue_process(sk);

chunk = len - tp->ucopy.len;
if (chunk != 0) {
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
len -= chunk;
copied += chunk;
}
}
}
if ((flags & MSG_PEEK) &&
(peek_seq - copied - urg_hole != tp->copied_seq)) {
net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
@@ -1934,10 +1848,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
tcp_rcv_space_adjust(sk);

skip_copy:
if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
if (tp->urg_data && after(tp->copied_seq, tp->urg_seq))
tp->urg_data = 0;
tcp_fast_path_check(sk);
}
if (used + offset < skb->len)
continue;

@@ -1955,25 +1867,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
break;
} while (len > 0);

if (user_recv) {
if (!skb_queue_empty(&tp->ucopy.prequeue)) {
int chunk;

tp->ucopy.len = copied > 0 ? len : 0;

tcp_prequeue_process(sk);

if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
len -= chunk;
copied += chunk;
}
}

tp->ucopy.task = NULL;
tp->ucopy.len = 0;
}

/* According to UNIX98, msg_name/msg_namelen are ignored
* on connected socket. I was just happy when found this 8) --ANK
*/
(Diffs for the remaining 7 changed files are not shown here.)