Merge rsync://rsync.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6

mariux64 · Jun 24, 2005 · a39451c · a39451c
2 parents adb7ee3 + 0e57976
commit a39451c
Show file tree

Hide file tree

Showing 24 changed files with 2,304 additions and 1,060 deletions.
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
@@ -304,57 +304,6 @@ tcp_low_latency - BOOLEAN
 	changed would be a Beowulf compute cluster.
 	Default: 0
 
-tcp_westwood - BOOLEAN
-        Enable TCP Westwood+ congestion control algorithm.
-	TCP Westwood+ is a sender-side only modification of the TCP Reno 
-	protocol stack that optimizes the performance of TCP congestion 
-	control. It is based on end-to-end bandwidth estimation to set 
-	congestion window and slow start threshold after a congestion 
-	episode. Using this estimation, TCP Westwood+ adaptively sets a 
-	slow start threshold and a congestion window which takes into 
-	account the bandwidth used  at the time congestion is experienced. 
-	TCP Westwood+ significantly increases fairness wrt TCP Reno in 
-	wired networks and throughput over wireless links.   
-        Default: 0
-
-tcp_vegas_cong_avoid - BOOLEAN
-	Enable TCP Vegas congestion avoidance algorithm.
-	TCP Vegas is a sender-side only change to TCP that anticipates
-	the onset of congestion by estimating the bandwidth. TCP Vegas
-	adjusts the sending rate by modifying the congestion
-	window. TCP Vegas should provide less packet loss, but it is
-	not as aggressive as TCP Reno.
-	Default:0
-
-tcp_bic - BOOLEAN
-	Enable BIC TCP congestion control algorithm.
-	BIC-TCP is a sender-side only change that ensures a linear RTT
-	fairness under large windows while offering both scalability and
-	bounded TCP-friendliness. The protocol combines two schemes
-	called additive increase and binary search increase. When the
-	congestion window is large, additive increase with a large
-	increment ensures linear RTT fairness as well as good
-	scalability. Under small congestion windows, binary search
-	increase provides TCP friendliness.
-	Default: 0
-
-tcp_bic_low_window - INTEGER
-	Sets the threshold window (in packets) where BIC TCP starts to
-	adjust the congestion window. Below this threshold BIC TCP behaves
-	the same as the default TCP Reno. 
-	Default: 14
-
-tcp_bic_fast_convergence - BOOLEAN
-	Forces BIC TCP to more quickly respond to changes in congestion
-	window. Allows two flows sharing the same connection to converge
-	more rapidly.
-	Default: 1
-
-tcp_default_win_scale - INTEGER
-	Sets the minimum window scale TCP will negotiate for on all
-	conections.
-	Default: 7
-
 tcp_tso_win_divisor - INTEGER
        This allows control over what percentage of the congestion window
        can be consumed by a single TSO frame.
@@ -368,6 +317,11 @@ tcp_frto - BOOLEAN
 	where packet loss is typically due to random radio interference
 	rather than intermediate router congestion.
 
+tcp_congestion_control - STRING
+	Set the congestion control algorithm to be used for new
+	connections. The algorithm "reno" is always available, but
+	additional choices may be available based on kernel configuration.
+
 somaxconn - INTEGER
 	Limit of socket listen() backlog, known in userspace as SOMAXCONN.
 	Defaults to 128.  See also tcp_max_syn_backlog for additional tuning

diff --git a/Documentation/networking/tcp.txt b/Documentation/networking/tcp.txt
@@ -1,5 +1,72 @@
-How the new TCP output machine [nyi] works.
+TCP protocol
+============
+
+Last updated: 21 June 2005
+
+Contents
+========
+
+- Congestion control
+- How the new TCP output machine [nyi] works
+
+Congestion control
+==================
+
+The following variables are used in the tcp_sock for congestion control:
+snd_cwnd		The size of the congestion window
+snd_ssthresh		Slow start threshold. We are in slow start if
+			snd_cwnd is less than this.
+snd_cwnd_cnt		A counter used to slow down the rate of increase
+			once we exceed slow start threshold.
+snd_cwnd_clamp		This is the maximum size that snd_cwnd can grow to.
+snd_cwnd_stamp		Timestamp for when congestion window last validated.
+snd_cwnd_used		Used as a highwater mark for how much of the
+			congestion window is in use. It is used to adjust
+			snd_cwnd down when the link is limited by the
+			application rather than the network.
+
+As of 2.6.13, Linux supports pluggable congestion control algorithms.
+A congestion control mechanism can be registered through functions in
+tcp_cong.c. The functions used by the congestion control mechanism are
+registered via passing a tcp_congestion_ops struct to
+tcp_register_congestion_control. As a minimum name, ssthresh,
+cong_avoid, min_cwnd must be valid.
 
+Private data for a congestion control mechanism is stored in tp->ca_priv.
+tcp_ca(tp) returns a pointer to this space.  This is preallocated space - it
+is important to check the size of your private data will fit this space, or
+alternatively space could be allocated elsewhere and a pointer to it could
+be stored here.
+
+There are three kinds of congestion control algorithms currently: The
+simplest ones are derived from TCP reno (highspeed, scalable) and just
+provide an alternative the congestion window calculation. More complex
+ones like BIC try to look at other events to provide better
+heuristics.  There are also round trip time based algorithms like
+Vegas and Westwood+.
+
+Good TCP congestion control is a complex problem because the algorithm
+needs to maintain fairness and performance. Please review current
+research and RFC's before developing new modules.
+
+The method that is used to determine which congestion control mechanism is
+determined by the setting of the sysctl net.ipv4.tcp_congestion_control.
+The default congestion control will be the last one registered (LIFO);
+so if you built everything as modules. the default will be reno. If you
+build with the default's from Kconfig, then BIC will be builtin (not a module)
+and it will end up the default.
+
+If you really want a particular default value then you will need
+to set it with the sysctl.  If you use a sysctl, the module will be autoloaded
+if needed and you will get the expected protocol. If you ask for an
+unknown congestion method, then the sysctl attempt will fail.
+
+If you remove a tcp congestion control module, then you will get the next
+available one. Since reno can not be built as a module, and can not be
+deleted, it will always be available.
+
+How the new TCP output machine [nyi] works.
+===========================================
 
 Data is kept on a single queue. The skb->users flag tells us if the frame is
 one that has been queued already. To add a frame we throw it on the end. Ack

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
@@ -333,21 +333,14 @@ enum
 	NET_TCP_FRTO=92,
 	NET_TCP_LOW_LATENCY=93,
 	NET_IPV4_IPFRAG_SECRET_INTERVAL=94,
-	NET_TCP_WESTWOOD=95,
 	NET_IPV4_IGMP_MAX_MSF=96,
 	NET_TCP_NO_METRICS_SAVE=97,
-	NET_TCP_VEGAS=98,
-	NET_TCP_VEGAS_ALPHA=99,
-	NET_TCP_VEGAS_BETA=100,
-	NET_TCP_VEGAS_GAMMA=101,
- 	NET_TCP_BIC=102,
- 	NET_TCP_BIC_FAST_CONVERGENCE=103,
-	NET_TCP_BIC_LOW_WINDOW=104,
 	NET_TCP_DEFAULT_WIN_SCALE=105,
 	NET_TCP_MODERATE_RCVBUF=106,
 	NET_TCP_TSO_WIN_DIVISOR=107,
 	NET_TCP_BIC_BETA=108,
 	NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109,
+	NET_TCP_CONG_CONTROL=110,
 };
 
 enum {

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
@@ -203,13 +203,6 @@ struct tcp_sack_block {
 	__u32	end_seq;
 };
 
-enum tcp_congestion_algo {
-	TCP_RENO=0,
-	TCP_VEGAS,
-	TCP_WESTWOOD,
-	TCP_BIC,
-};
-
 struct tcp_options_received {
 /*	PAWS/RTTM data	*/
 	long	ts_recent_stamp;/* Time we stored ts_recent (for aging) */
@@ -305,7 +298,7 @@ struct tcp_sock {
 	__u8	reordering;	/* Packet reordering metric.		*/
 	__u8	frto_counter;	/* Number of new acks after RTO */
 
-	__u8	adv_cong;	/* Using Vegas, Westwood, or BIC */
+	__u8	unused;
 	__u8	defer_accept;	/* User waits for some data after accept() */
 
 /* RTT measurement */
@@ -401,44 +394,22 @@ struct tcp_sock {
 		__u32	time;
 	} rcvq_space;
 
-/* TCP Westwood structure */
-        struct {
-                __u32    bw_ns_est;        /* first bandwidth estimation..not too smoothed 8) */
-                __u32    bw_est;           /* bandwidth estimate */
-                __u32    rtt_win_sx;       /* here starts a new evaluation... */
-                __u32    bk;
-                __u32    snd_una;          /* used for evaluating the number of acked bytes */
-                __u32    cumul_ack;
-                __u32    accounted;
-                __u32    rtt;
-                __u32    rtt_min;          /* minimum observed RTT */
-        } westwood;
-
-/* Vegas variables */
-	struct {
-		__u32	beg_snd_nxt;	/* right edge during last RTT */
-		__u32	beg_snd_una;	/* left edge  during last RTT */
-		__u32	beg_snd_cwnd;	/* saves the size of the cwnd */
-		__u8	doing_vegas_now;/* if true, do vegas for this RTT */
-		__u16	cntRTT;		/* # of RTTs measured within last RTT */
-		__u32	minRTT;		/* min of RTTs measured within last RTT (in usec) */
-		__u32	baseRTT;	/* the min of all Vegas RTT measurements seen (in usec) */
-	} vegas;
-
-	/* BI TCP Parameters */
-	struct {
-		__u32	cnt;		/* increase cwnd by 1 after this number of ACKs */
-		__u32 	last_max_cwnd;	/* last maximium snd_cwnd */
-		__u32	last_cwnd;	/* the last snd_cwnd */
-		__u32   last_stamp;     /* time when updated last_cwnd */
-	} bictcp;
+	/* Pluggable TCP congestion control hook */
+	struct tcp_congestion_ops *ca_ops;
+	u32	ca_priv[16];
+#define TCP_CA_PRIV_SIZE	(16*sizeof(u32))
 };
 
 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
 {
 	return (struct tcp_sock *)sk;
 }
 
+static inline void *tcp_ca(const struct tcp_sock *tp)
+{
+	return (void *) tp->ca_priv;
+}
+
 #endif
 
 #endif	/* _LINUX_TCP_H */
diff --git a/include/linux/tcp_diag.h b/include/linux/tcp_diag.h
@@ -99,9 +99,10 @@ enum
 	TCPDIAG_MEMINFO,
 	TCPDIAG_INFO,
 	TCPDIAG_VEGASINFO,
+	TCPDIAG_CONG,
 };
 
-#define TCPDIAG_MAX TCPDIAG_VEGASINFO
+#define TCPDIAG_MAX TCPDIAG_CONG
 
 
 /* TCPDIAG_MEM */
@@ -123,5 +124,4 @@ struct tcpvegas_info {
 	__u32	tcpv_minrtt;
 };
 
-
 #endif /* _TCP_DIAG_H_ */