From 1621b8025f323210743a1aaff584dc0f1b26ff23 Mon Sep 17 00:00:00 2001 From: David Mosberger-Tang Date: Wed, 22 Jun 2005 22:24:00 -0700 Subject: [PATCH] --- yaml --- r: 3028 b: refs/heads/master c: e608a8072b10258aa18c2e33324def225199ba1d h: refs/heads/master v: v3 --- [refs] | 2 +- trunk/Documentation/networking/ip-sysctl.txt | 56 +- trunk/Documentation/networking/tcp.txt | 69 +- trunk/arch/arm/configs/s3c2410_defconfig | 27 +- trunk/arch/arm/mach-s3c2410/mach-bast.c | 49 +- trunk/arch/arm/mach-s3c2410/mach-vr1000.c | 77 +- trunk/arch/arm/nwfpe/softfloat-macros | 22 +- trunk/arch/arm/nwfpe/softfloat.c | 12 +- trunk/drivers/serial/8250.c | 54 +- trunk/drivers/serial/8250.h | 3 - trunk/drivers/serial/s3c2410.c | 4 +- trunk/drivers/video/pxafb.c | 3 +- trunk/include/asm-ia64/mmzone.h | 4 +- trunk/include/linux/sysctl.h | 9 +- trunk/include/linux/tcp.h | 49 +- trunk/include/linux/tcp_diag.h | 4 +- trunk/include/net/tcp.h | 237 ++++-- trunk/net/ipv4/Kconfig | 90 --- trunk/net/ipv4/Makefile | 10 +- trunk/net/ipv4/sysctl_net_ipv4.c | 114 +-- trunk/net/ipv4/tcp.c | 2 - trunk/net/ipv4/tcp_bic.c | 331 --------- trunk/net/ipv4/tcp_cong.c | 195 ----- trunk/net/ipv4/tcp_diag.c | 34 +- trunk/net/ipv4/tcp_highspeed.c | 181 ----- trunk/net/ipv4/tcp_htcp.c | 289 -------- trunk/net/ipv4/tcp_hybla.c | 187 ----- trunk/net/ipv4/tcp_input.c | 737 +++++++++++++++++-- trunk/net/ipv4/tcp_ipv4.c | 3 - trunk/net/ipv4/tcp_minisocks.c | 4 +- trunk/net/ipv4/tcp_output.c | 23 +- trunk/net/ipv4/tcp_scalable.c | 68 -- trunk/net/ipv4/tcp_vegas.c | 411 ----------- trunk/net/ipv4/tcp_westwood.c | 259 ------- trunk/net/ipv6/tcp_ipv6.c | 2 +- 35 files changed, 1100 insertions(+), 2521 deletions(-) delete mode 100644 trunk/net/ipv4/tcp_bic.c delete mode 100644 trunk/net/ipv4/tcp_cong.c delete mode 100644 trunk/net/ipv4/tcp_highspeed.c delete mode 100644 trunk/net/ipv4/tcp_htcp.c delete mode 100644 trunk/net/ipv4/tcp_hybla.c delete mode 100644 trunk/net/ipv4/tcp_scalable.c delete mode 100644 trunk/net/ipv4/tcp_vegas.c delete mode 100644 trunk/net/ipv4/tcp_westwood.c diff --git a/[refs] b/[refs] index d1d5272176f1..8bc6bc569b5c 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: a39451c17f53bbae053555670c7b678d46bcebba +refs/heads/master: e608a8072b10258aa18c2e33324def225199ba1d diff --git a/trunk/Documentation/networking/ip-sysctl.txt b/trunk/Documentation/networking/ip-sysctl.txt index ab65714d95fc..a2c893a7475d 100644 --- a/trunk/Documentation/networking/ip-sysctl.txt +++ b/trunk/Documentation/networking/ip-sysctl.txt @@ -304,6 +304,57 @@ tcp_low_latency - BOOLEAN changed would be a Beowulf compute cluster. Default: 0 +tcp_westwood - BOOLEAN + Enable TCP Westwood+ congestion control algorithm. + TCP Westwood+ is a sender-side only modification of the TCP Reno + protocol stack that optimizes the performance of TCP congestion + control. It is based on end-to-end bandwidth estimation to set + congestion window and slow start threshold after a congestion + episode. Using this estimation, TCP Westwood+ adaptively sets a + slow start threshold and a congestion window which takes into + account the bandwidth used at the time congestion is experienced. + TCP Westwood+ significantly increases fairness wrt TCP Reno in + wired networks and throughput over wireless links. + Default: 0 + +tcp_vegas_cong_avoid - BOOLEAN + Enable TCP Vegas congestion avoidance algorithm. + TCP Vegas is a sender-side only change to TCP that anticipates + the onset of congestion by estimating the bandwidth. 
TCP Vegas + adjusts the sending rate by modifying the congestion + window. TCP Vegas should provide less packet loss, but it is + not as aggressive as TCP Reno. + Default:0 + +tcp_bic - BOOLEAN + Enable BIC TCP congestion control algorithm. + BIC-TCP is a sender-side only change that ensures a linear RTT + fairness under large windows while offering both scalability and + bounded TCP-friendliness. The protocol combines two schemes + called additive increase and binary search increase. When the + congestion window is large, additive increase with a large + increment ensures linear RTT fairness as well as good + scalability. Under small congestion windows, binary search + increase provides TCP friendliness. + Default: 0 + +tcp_bic_low_window - INTEGER + Sets the threshold window (in packets) where BIC TCP starts to + adjust the congestion window. Below this threshold BIC TCP behaves + the same as the default TCP Reno. + Default: 14 + +tcp_bic_fast_convergence - BOOLEAN + Forces BIC TCP to more quickly respond to changes in congestion + window. Allows two flows sharing the same connection to converge + more rapidly. + Default: 1 + +tcp_default_win_scale - INTEGER + Sets the minimum window scale TCP will negotiate for on all + conections. + Default: 7 + tcp_tso_win_divisor - INTEGER This allows control over what percentage of the congestion window can be consumed by a single TSO frame. @@ -317,11 +368,6 @@ tcp_frto - BOOLEAN where packet loss is typically due to random radio interference rather than intermediate router congestion. -tcp_congestion_control - STRING - Set the congestion control algorithm to be used for new - connections. The algorithm "reno" is always available, but - additional choices may be available based on kernel configuration. - somaxconn - INTEGER Limit of socket listen() backlog, known in userspace as SOMAXCONN. Defaults to 128. See also tcp_max_syn_backlog for additional tuning diff --git a/trunk/Documentation/networking/tcp.txt b/trunk/Documentation/networking/tcp.txt index 0fa300425575..71749007091e 100644 --- a/trunk/Documentation/networking/tcp.txt +++ b/trunk/Documentation/networking/tcp.txt @@ -1,72 +1,5 @@ -TCP protocol -============ - -Last updated: 21 June 2005 - -Contents -======== - -- Congestion control -- How the new TCP output machine [nyi] works - -Congestion control -================== - -The following variables are used in the tcp_sock for congestion control: -snd_cwnd The size of the congestion window -snd_ssthresh Slow start threshold. We are in slow start if - snd_cwnd is less than this. -snd_cwnd_cnt A counter used to slow down the rate of increase - once we exceed slow start threshold. -snd_cwnd_clamp This is the maximum size that snd_cwnd can grow to. -snd_cwnd_stamp Timestamp for when congestion window last validated. -snd_cwnd_used Used as a highwater mark for how much of the - congestion window is in use. It is used to adjust - snd_cwnd down when the link is limited by the - application rather than the network. - -As of 2.6.13, Linux supports pluggable congestion control algorithms. -A congestion control mechanism can be registered through functions in -tcp_cong.c. The functions used by the congestion control mechanism are -registered via passing a tcp_congestion_ops struct to -tcp_register_congestion_control. As a minimum name, ssthresh, -cong_avoid, min_cwnd must be valid. - -Private data for a congestion control mechanism is stored in tp->ca_priv. -tcp_ca(tp) returns a pointer to this space. 
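For illustration, a minimal module (hypothetical, not one from the tree) might
lay out its private state like this and reach it through tcp_ca():

	/* Hypothetical private state kept in tp->ca_priv; a real module
	 * must make sure it fits in TCP_CA_PRIV_SIZE (16 * sizeof(u32)),
	 * e.g. tcp_bic.c guards this with a BUG_ON at registration time. */
	struct demo_ca {
		u32 cnt;		/* ACKs counted toward next cwnd increase */
		u32 last_max_cwnd;	/* cwnd high-water mark */
	};

	static void demo_init(struct tcp_sock *tp)
	{
		struct demo_ca *ca = tcp_ca(tp);  /* points into tp->ca_priv */

		ca->cnt = 0;
		ca->last_max_cwnd = 0;
	}
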
This is preallocated space - it -is important to check the size of your private data will fit this space, or -alternatively space could be allocated elsewhere and a pointer to it could -be stored here. - -There are three kinds of congestion control algorithms currently: The -simplest ones are derived from TCP reno (highspeed, scalable) and just -provide an alternative the congestion window calculation. More complex -ones like BIC try to look at other events to provide better -heuristics. There are also round trip time based algorithms like -Vegas and Westwood+. - -Good TCP congestion control is a complex problem because the algorithm -needs to maintain fairness and performance. Please review current -research and RFC's before developing new modules. - -The method that is used to determine which congestion control mechanism is -determined by the setting of the sysctl net.ipv4.tcp_congestion_control. -The default congestion control will be the last one registered (LIFO); -so if you built everything as modules. the default will be reno. If you -build with the default's from Kconfig, then BIC will be builtin (not a module) -and it will end up the default. - -If you really want a particular default value then you will need -to set it with the sysctl. If you use a sysctl, the module will be autoloaded -if needed and you will get the expected protocol. If you ask for an -unknown congestion method, then the sysctl attempt will fail. - -If you remove a tcp congestion control module, then you will get the next -available one. Since reno can not be built as a module, and can not be -deleted, it will always be available. - How the new TCP output machine [nyi] works. -=========================================== + Data is kept on a single queue. The skb->users flag tells us if the frame is one that has been queued already. To add a frame we throw it on the end. 
Ack diff --git a/trunk/arch/arm/configs/s3c2410_defconfig b/trunk/arch/arm/configs/s3c2410_defconfig index 98b72ff38832..2a63fb277196 100644 --- a/trunk/arch/arm/configs/s3c2410_defconfig +++ b/trunk/arch/arm/configs/s3c2410_defconfig @@ -1,13 +1,14 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.12-git4 -# Wed Jun 22 15:56:42 2005 +# Linux kernel version: 2.6.12-rc1-bk2 +# Sun Mar 27 17:47:45 2005 # CONFIG_ARM=y CONFIG_MMU=y CONFIG_UID16=y CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_CALIBRATE_DELAY=y +CONFIG_GENERIC_IOMAP=y # # Code maturity level options @@ -16,7 +17,6 @@ CONFIG_EXPERIMENTAL=y # CONFIG_CLEAN_COMPILE is not set CONFIG_BROKEN=y CONFIG_BROKEN_ON_SMP=y -CONFIG_INIT_ENV_ARG_LIMIT=32 # # General setup @@ -35,8 +35,6 @@ CONFIG_KOBJECT_UEVENT=y CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_ALL is not set # CONFIG_KALLSYMS_EXTRA_PASS is not set -CONFIG_PRINTK=y -CONFIG_BUG=y CONFIG_BASE_FULL=y CONFIG_FUTEX=y CONFIG_EPOLL=y @@ -83,7 +81,6 @@ CONFIG_ARCH_S3C2410=y # CONFIG_ARCH_VERSATILE is not set # CONFIG_ARCH_IMX is not set # CONFIG_ARCH_H720X is not set -# CONFIG_ARCH_AAEC2000 is not set # # S3C24XX Implementations @@ -137,7 +134,6 @@ CONFIG_CPU_TLB_V4WBI=y # # Bus support # -CONFIG_ISA_DMA_API=y # # PCCARD (PCMCIA/CardBus) support @@ -147,9 +143,7 @@ CONFIG_ISA_DMA_API=y # # Kernel Features # -# CONFIG_SMP is not set # CONFIG_PREEMPT is not set -# CONFIG_DISCONTIGMEM is not set CONFIG_ALIGNMENT_TRAP=y # @@ -303,6 +297,7 @@ CONFIG_PARPORT_1284=y # # Block devices # +# CONFIG_BLK_DEV_FD is not set # CONFIG_PARIDE is not set # CONFIG_BLK_DEV_COW_COMMON is not set CONFIG_BLK_DEV_LOOP=y @@ -364,7 +359,6 @@ CONFIG_BLK_DEV_IDE_BAST=y # # Fusion MPT device support # -# CONFIG_FUSION is not set # # IEEE 1394 (FireWire) support @@ -384,11 +378,10 @@ CONFIG_NET=y # Networking options # # CONFIG_PACKET is not set +# CONFIG_NETLINK_DEV is not set CONFIG_UNIX=y # CONFIG_NET_KEY is not set CONFIG_INET=y -CONFIG_IP_FIB_HASH=y -# CONFIG_IP_FIB_TRIE is not set # CONFIG_IP_MULTICAST is not set # CONFIG_IP_ADVANCED_ROUTER is not set CONFIG_IP_PNP=y @@ -450,9 +443,8 @@ CONFIG_NETDEVICES=y # Ethernet (10 or 100Mbit) # CONFIG_NET_ETHERNET=y -CONFIG_MII=m +# CONFIG_MII is not set # CONFIG_SMC91X is not set -CONFIG_DM9000=m # # Ethernet (1000 Mbit) @@ -529,6 +521,7 @@ CONFIG_SERIO_SERPORT=y CONFIG_SERIO_LIBPS2=y # CONFIG_SERIO_RAW is not set # CONFIG_GAMEPORT is not set +CONFIG_SOUND_GAMEPORT=y # # Character devices @@ -612,6 +605,7 @@ CONFIG_S3C2410_RTC=y # # TPM devices # +# CONFIG_TCG_TPM is not set # # I2C support @@ -660,7 +654,6 @@ CONFIG_SENSORS_LM78=m CONFIG_SENSORS_LM85=m # CONFIG_SENSORS_LM87 is not set # CONFIG_SENSORS_LM90 is not set -# CONFIG_SENSORS_LM92 is not set # CONFIG_SENSORS_MAX1619 is not set # CONFIG_SENSORS_PC87360 is not set # CONFIG_SENSORS_SMSC47B397 is not set @@ -672,7 +665,6 @@ CONFIG_SENSORS_LM85=m # # Other I2C Chip support # -# CONFIG_SENSORS_DS1337 is not set CONFIG_SENSORS_EEPROM=m # CONFIG_SENSORS_PCF8574 is not set # CONFIG_SENSORS_PCF8591 is not set @@ -704,10 +696,8 @@ CONFIG_FB=y # CONFIG_FB_CFB_COPYAREA is not set # CONFIG_FB_CFB_IMAGEBLIT is not set # CONFIG_FB_SOFT_CURSOR is not set -# CONFIG_FB_MACMODES is not set CONFIG_FB_MODE_HELPERS=y # CONFIG_FB_TILEBLITTING is not set -# CONFIG_FB_S1D13XXX is not set # CONFIG_FB_VIRTUAL is not set # @@ -792,6 +782,7 @@ CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" # CONFIG_PROC_FS=y CONFIG_SYSFS=y +# CONFIG_DEVFS_FS is not set # CONFIG_DEVPTS_FS_XATTR is not set # CONFIG_TMPFS is not set # 
CONFIG_HUGETLBFS is not set diff --git a/trunk/arch/arm/mach-s3c2410/mach-bast.c b/trunk/arch/arm/mach-s3c2410/mach-bast.c index f3e970039b65..3bb97eb6e693 100644 --- a/trunk/arch/arm/mach-s3c2410/mach-bast.c +++ b/trunk/arch/arm/mach-s3c2410/mach-bast.c @@ -26,7 +26,6 @@ * 03-Mar-2005 BJD Ensured that bast-cpld.h is included * 10-Mar-2005 LCVR Changed S3C2410_VA to S3C24XX_VA * 14-Mar-2006 BJD Updated for __iomem changes - * 22-Jun-2006 BJD Added DM9000 platform information */ #include @@ -36,7 +35,6 @@ #include #include #include -#include #include #include @@ -55,7 +53,6 @@ #include #include #include -#include #include #include @@ -115,6 +112,7 @@ static struct map_desc bast_iodesc[] __initdata = { { VA_C2(BAST_VA_ISAMEM), PA_CS2(BAST_PA_ISAMEM), SZ_16M, MT_DEVICE }, { VA_C2(BAST_VA_ASIXNET), PA_CS3(BAST_PA_ASIXNET), SZ_1M, MT_DEVICE }, { VA_C2(BAST_VA_SUPERIO), PA_CS2(BAST_PA_SUPERIO), SZ_1M, MT_DEVICE }, + { VA_C2(BAST_VA_DM9000), PA_CS2(BAST_PA_DM9000), SZ_1M, MT_DEVICE }, { VA_C2(BAST_VA_IDEPRI), PA_CS3(BAST_PA_IDEPRI), SZ_1M, MT_DEVICE }, { VA_C2(BAST_VA_IDESEC), PA_CS3(BAST_PA_IDESEC), SZ_1M, MT_DEVICE }, { VA_C2(BAST_VA_IDEPRIAUX), PA_CS3(BAST_PA_IDEPRIAUX), SZ_1M, MT_DEVICE }, @@ -125,6 +123,7 @@ static struct map_desc bast_iodesc[] __initdata = { { VA_C3(BAST_VA_ISAMEM), PA_CS3(BAST_PA_ISAMEM), SZ_16M, MT_DEVICE }, { VA_C3(BAST_VA_ASIXNET), PA_CS3(BAST_PA_ASIXNET), SZ_1M, MT_DEVICE }, { VA_C3(BAST_VA_SUPERIO), PA_CS3(BAST_PA_SUPERIO), SZ_1M, MT_DEVICE }, + { VA_C3(BAST_VA_DM9000), PA_CS3(BAST_PA_DM9000), SZ_1M, MT_DEVICE }, { VA_C3(BAST_VA_IDEPRI), PA_CS3(BAST_PA_IDEPRI), SZ_1M, MT_DEVICE }, { VA_C3(BAST_VA_IDESEC), PA_CS3(BAST_PA_IDESEC), SZ_1M, MT_DEVICE }, { VA_C3(BAST_VA_IDEPRIAUX), PA_CS3(BAST_PA_IDEPRIAUX), SZ_1M, MT_DEVICE }, @@ -135,6 +134,7 @@ static struct map_desc bast_iodesc[] __initdata = { { VA_C4(BAST_VA_ISAMEM), PA_CS4(BAST_PA_ISAMEM), SZ_16M, MT_DEVICE }, { VA_C4(BAST_VA_ASIXNET), PA_CS5(BAST_PA_ASIXNET), SZ_1M, MT_DEVICE }, { VA_C4(BAST_VA_SUPERIO), PA_CS4(BAST_PA_SUPERIO), SZ_1M, MT_DEVICE }, + { VA_C4(BAST_VA_DM9000), PA_CS4(BAST_PA_DM9000), SZ_1M, MT_DEVICE }, { VA_C4(BAST_VA_IDEPRI), PA_CS5(BAST_PA_IDEPRI), SZ_1M, MT_DEVICE }, { VA_C4(BAST_VA_IDESEC), PA_CS5(BAST_PA_IDESEC), SZ_1M, MT_DEVICE }, { VA_C4(BAST_VA_IDEPRIAUX), PA_CS5(BAST_PA_IDEPRIAUX), SZ_1M, MT_DEVICE }, @@ -145,6 +145,7 @@ static struct map_desc bast_iodesc[] __initdata = { { VA_C5(BAST_VA_ISAMEM), PA_CS5(BAST_PA_ISAMEM), SZ_16M, MT_DEVICE }, { VA_C5(BAST_VA_ASIXNET), PA_CS5(BAST_PA_ASIXNET), SZ_1M, MT_DEVICE }, { VA_C5(BAST_VA_SUPERIO), PA_CS5(BAST_PA_SUPERIO), SZ_1M, MT_DEVICE }, + { VA_C5(BAST_VA_DM9000), PA_CS5(BAST_PA_DM9000), SZ_1M, MT_DEVICE }, { VA_C5(BAST_VA_IDEPRI), PA_CS5(BAST_PA_IDEPRI), SZ_1M, MT_DEVICE }, { VA_C5(BAST_VA_IDESEC), PA_CS5(BAST_PA_IDESEC), SZ_1M, MT_DEVICE }, { VA_C5(BAST_VA_IDEPRIAUX), PA_CS5(BAST_PA_IDEPRIAUX), SZ_1M, MT_DEVICE }, @@ -312,45 +313,6 @@ static struct s3c2410_platform_nand bast_nand_info = { .select_chip = bast_nand_select, }; -/* DM9000 */ - -static struct resource bast_dm9k_resource[] = { - [0] = { - .start = S3C2410_CS5 + BAST_PA_DM9000, - .end = S3C2410_CS5 + BAST_PA_DM9000 + 3, - .flags = IORESOURCE_MEM - }, - [1] = { - .start = S3C2410_CS5 + BAST_PA_DM9000 + 0x40, - .end = S3C2410_CS5 + BAST_PA_DM9000 + 0x40 + 0x3f, - .flags = IORESOURCE_MEM - }, - [2] = { - .start = IRQ_DM9000, - .end = IRQ_DM9000, - .flags = IORESOURCE_IRQ - } - -}; - -/* for the moment we limit ourselves to 16bit IO until some - * better IO routines can be written and 
tested -*/ - -struct dm9000_plat_data bast_dm9k_platdata = { - .flags = DM9000_PLATF_16BITONLY -}; - -static struct platform_device bast_device_dm9k = { - .name = "dm9000", - .id = 0, - .num_resources = ARRAY_SIZE(bast_dm9k_resource), - .resource = bast_dm9k_resource, - .dev = { - .platform_data = &bast_dm9k_platdata, - } -}; - /* Standard BAST devices */ @@ -362,8 +324,7 @@ static struct platform_device *bast_devices[] __initdata = { &s3c_device_iis, &s3c_device_rtc, &s3c_device_nand, - &bast_device_nor, - &bast_device_dm9k, + &bast_device_nor }; static struct clk *bast_clocks[] = { diff --git a/trunk/arch/arm/mach-s3c2410/mach-vr1000.c b/trunk/arch/arm/mach-s3c2410/mach-vr1000.c index 76be074944a0..5512146b1ce4 100644 --- a/trunk/arch/arm/mach-s3c2410/mach-vr1000.c +++ b/trunk/arch/arm/mach-s3c2410/mach-vr1000.c @@ -27,7 +27,6 @@ * 10-Feb-2005 BJD Added power-off capability * 10-Mar-2005 LCVR Changed S3C2410_VA to S3C24XX_VA * 14-Mar-2006 BJD void __iomem fixes - * 22-Jun-2006 BJD Added DM9000 platform information */ #include @@ -36,7 +35,6 @@ #include #include #include -#include #include #include @@ -100,24 +98,28 @@ static struct map_desc vr1000_iodesc[] __initdata = { * are only 8bit */ /* slow, byte */ + { VA_C2(VR1000_VA_DM9000), PA_CS2(VR1000_PA_DM9000), SZ_1M, MT_DEVICE }, { VA_C2(VR1000_VA_IDEPRI), PA_CS3(VR1000_PA_IDEPRI), SZ_1M, MT_DEVICE }, { VA_C2(VR1000_VA_IDESEC), PA_CS3(VR1000_PA_IDESEC), SZ_1M, MT_DEVICE }, { VA_C2(VR1000_VA_IDEPRIAUX), PA_CS3(VR1000_PA_IDEPRIAUX), SZ_1M, MT_DEVICE }, { VA_C2(VR1000_VA_IDESECAUX), PA_CS3(VR1000_PA_IDESECAUX), SZ_1M, MT_DEVICE }, /* slow, word */ + { VA_C3(VR1000_VA_DM9000), PA_CS3(VR1000_PA_DM9000), SZ_1M, MT_DEVICE }, { VA_C3(VR1000_VA_IDEPRI), PA_CS3(VR1000_PA_IDEPRI), SZ_1M, MT_DEVICE }, { VA_C3(VR1000_VA_IDESEC), PA_CS3(VR1000_PA_IDESEC), SZ_1M, MT_DEVICE }, { VA_C3(VR1000_VA_IDEPRIAUX), PA_CS3(VR1000_PA_IDEPRIAUX), SZ_1M, MT_DEVICE }, { VA_C3(VR1000_VA_IDESECAUX), PA_CS3(VR1000_PA_IDESECAUX), SZ_1M, MT_DEVICE }, /* fast, byte */ + { VA_C4(VR1000_VA_DM9000), PA_CS4(VR1000_PA_DM9000), SZ_1M, MT_DEVICE }, { VA_C4(VR1000_VA_IDEPRI), PA_CS5(VR1000_PA_IDEPRI), SZ_1M, MT_DEVICE }, { VA_C4(VR1000_VA_IDESEC), PA_CS5(VR1000_PA_IDESEC), SZ_1M, MT_DEVICE }, { VA_C4(VR1000_VA_IDEPRIAUX), PA_CS5(VR1000_PA_IDEPRIAUX), SZ_1M, MT_DEVICE }, { VA_C4(VR1000_VA_IDESECAUX), PA_CS5(VR1000_PA_IDESECAUX), SZ_1M, MT_DEVICE }, /* fast, word */ + { VA_C5(VR1000_VA_DM9000), PA_CS5(VR1000_PA_DM9000), SZ_1M, MT_DEVICE }, { VA_C5(VR1000_VA_IDEPRI), PA_CS5(VR1000_PA_IDEPRI), SZ_1M, MT_DEVICE }, { VA_C5(VR1000_VA_IDESEC), PA_CS5(VR1000_PA_IDESEC), SZ_1M, MT_DEVICE }, { VA_C5(VR1000_VA_IDEPRIAUX), PA_CS5(VR1000_PA_IDEPRIAUX), SZ_1M, MT_DEVICE }, @@ -244,74 +246,6 @@ static struct platform_device vr1000_nor = { .resource = vr1000_nor_resource, }; -/* DM9000 ethernet devices */ - -static struct resource vr1000_dm9k0_resource[] = { - [0] = { - .start = S3C2410_CS5 + VR1000_PA_DM9000, - .end = S3C2410_CS5 + VR1000_PA_DM9000 + 3, - .flags = IORESOURCE_MEM - }, - [1] = { - .start = S3C2410_CS5 + VR1000_PA_DM9000 + 0x40, - .end = S3C2410_CS5 + VR1000_PA_DM9000 + 0x7f, - .flags = IORESOURCE_MEM - }, - [2] = { - .start = IRQ_VR1000_DM9000A, - .end = IRQ_VR1000_DM9000A, - .flags = IORESOURCE_IRQ - } - -}; - -static struct resource vr1000_dm9k1_resource[] = { - [0] = { - .start = S3C2410_CS5 + VR1000_PA_DM9000 + 0x80, - .end = S3C2410_CS5 + VR1000_PA_DM9000 + 0x83, - .flags = IORESOURCE_MEM - }, - [1] = { - .start = S3C2410_CS5 + VR1000_PA_DM9000 + 0xC0, - .end = S3C2410_CS5 + 
VR1000_PA_DM9000 + 0xFF, - .flags = IORESOURCE_MEM - }, - [2] = { - .start = IRQ_VR1000_DM9000N, - .end = IRQ_VR1000_DM9000N, - .flags = IORESOURCE_IRQ - } -}; - -/* for the moment we limit ourselves to 16bit IO until some - * better IO routines can be written and tested -*/ - -struct dm9000_plat_data vr1000_dm9k_platdata = { - .flags = DM9000_PLATF_16BITONLY, -}; - -static struct platform_device vr1000_dm9k0 = { - .name = "dm9000", - .id = 0, - .num_resources = ARRAY_SIZE(vr1000_dm9k0_resource), - .resource = vr1000_dm9k0_resource, - .dev = { - .platform_data = &vr1000_dm9k_platdata, - } -}; - -static struct platform_device vr1000_dm9k1 = { - .name = "dm9000", - .id = 1, - .num_resources = ARRAY_SIZE(vr1000_dm9k1_resource), - .resource = vr1000_dm9k1_resource, - .dev = { - .platform_data = &vr1000_dm9k_platdata, - } -}; - -/* devices for this board */ static struct platform_device *vr1000_devices[] __initdata = { &s3c_device_usb, @@ -319,11 +253,8 @@ static struct platform_device *vr1000_devices[] __initdata = { &s3c_device_wdt, &s3c_device_i2c, &s3c_device_iis, - &s3c_device_adc, &serial_device, &vr1000_nor, - &vr1000_dm9k0, - &vr1000_dm9k1 }; static struct clk *vr1000_clocks[] = { diff --git a/trunk/arch/arm/nwfpe/softfloat-macros b/trunk/arch/arm/nwfpe/softfloat-macros index 5a060f95a58f..5469989f2c5e 100644 --- a/trunk/arch/arm/nwfpe/softfloat-macros +++ b/trunk/arch/arm/nwfpe/softfloat-macros @@ -563,14 +563,8 @@ static bits64 estimateDiv128To64( bits64 a0, bits64 a1, bits64 b ) bits64 rem0, rem1, term0, term1; bits64 z; if ( b <= a0 ) return LIT64( 0xFFFFFFFFFFFFFFFF ); - b0 = b>>32; /* hence b0 is 32 bits wide now */ - if ( b0<<32 <= a0 ) { - z = LIT64( 0xFFFFFFFF00000000 ); - } else { - z = a0; - do_div( z, b0 ); - z <<= 32; - } + b0 = b>>32; + z = ( b0<<32 <= a0 ) ? LIT64( 0xFFFFFFFF00000000 ) : ( a0 / b0 )<<32; mul64To128( b, z, &term0, &term1 ); sub128( a0, a1, term0, term1, &rem0, &rem1 ); while ( ( (sbits64) rem0 ) < 0 ) { @@ -579,12 +573,7 @@ static bits64 estimateDiv128To64( bits64 a0, bits64 a1, bits64 b ) add128( rem0, rem1, b0, b1, &rem0, &rem1 ); } rem0 = ( rem0<<32 ) | ( rem1>>32 ); - if ( b0<<32 <= rem0 ) { - z |= 0xFFFFFFFF; - } else { - do_div( rem0, b0 ); - z |= rem0; - } + z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0; return z; } @@ -612,7 +601,6 @@ static bits32 estimateSqrt32( int16 aExp, bits32 a ) }; int8 index; bits32 z; - bits64 A; index = ( a>>27 ) & 15; if ( aExp & 1 ) { @@ -626,9 +614,7 @@ static bits32 estimateSqrt32( int16 aExp, bits32 a ) z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 ); if ( z <= a ) return (bits32) ( ( (sbits32) a )>>1 ); } - A = ( (bits64) a )<<31; - do_div( A, z ); - return ( (bits32) A ) + ( z>>1 ); + return ( (bits32) ( ( ( (bits64) a )<<31 ) / z ) ) + ( z>>1 ); } diff --git a/trunk/arch/arm/nwfpe/softfloat.c b/trunk/arch/arm/nwfpe/softfloat.c index e038dd3be9b3..9d743ae29062 100644 --- a/trunk/arch/arm/nwfpe/softfloat.c +++ b/trunk/arch/arm/nwfpe/softfloat.c @@ -28,8 +28,6 @@ this code that are retained. 
=============================================================================== */ -#include - #include "fpa11.h" //#include "milieu.h" //#include "softfloat.h" @@ -1333,11 +1331,7 @@ float32 float32_div( float32 a, float32 b ) aSig >>= 1; ++zExp; } - { - bits64 tmp = ( (bits64) aSig )<<32; - do_div( tmp, bSig ); - zSig = tmp; - } + zSig = ( ( (bits64) aSig )<<32 ) / bSig; if ( ( zSig & 0x3F ) == 0 ) { zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 ); } @@ -1403,9 +1397,7 @@ float32 float32_rem( float32 a, float32 b ) q = ( bSig <= aSig ); if ( q ) aSig -= bSig; if ( 0 < expDiff ) { - bits64 tmp = ( (bits64) aSig )<<32; - do_div( tmp, bSig ); - q = tmp; + q = ( ( (bits64) aSig )<<32 ) / bSig; q >>= 32 - expDiff; bSig >>= 2; aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; diff --git a/trunk/drivers/serial/8250.c b/trunk/drivers/serial/8250.c index 79f67fd863ec..30e8beb71430 100644 --- a/trunk/drivers/serial/8250.c +++ b/trunk/drivers/serial/8250.c @@ -132,9 +132,9 @@ struct uart_8250_port { struct uart_port port; struct timer_list timer; /* "no irq" timer */ struct list_head list; /* ports on this IRQ */ - unsigned short capabilities; /* port capabilities */ - unsigned short bugs; /* port bugs */ + unsigned int capabilities; /* port capabilities */ unsigned int tx_loadsz; /* transmit fifo load size */ + unsigned short rev; unsigned char acr; unsigned char ier; unsigned char lcr; @@ -560,14 +560,7 @@ static void autoconfig_has_efr(struct uart_8250_port *up) if (id1 == 0x16 && id2 == 0xC9 && (id3 == 0x50 || id3 == 0x52 || id3 == 0x54)) { up->port.type = PORT_16C950; - - /* - * Enable work around for the Oxford Semiconductor 952 rev B - * chip which causes it to seriously miscalculate baud rates - * when DLL is 0. - */ - if (id3 == 0x52 && rev == 0x01) - up->bugs |= UART_BUG_QUOT; + up->rev = rev | (id3 << 8); return; } @@ -584,6 +577,8 @@ static void autoconfig_has_efr(struct uart_8250_port *up) id2 = id1 >> 8; if (id2 == 0x10 || id2 == 0x12 || id2 == 0x14) { + if (id2 == 0x10) + up->rev = id1 & 255; up->port.type = PORT_16850; return; } @@ -814,7 +809,6 @@ static void autoconfig(struct uart_8250_port *up, unsigned int probeflags) // save_flags(flags); cli(); up->capabilities = 0; - up->bugs = 0; if (!(up->port.flags & UPF_BUGGY_UART)) { /* @@ -1027,8 +1021,6 @@ static void serial8250_stop_tx(struct uart_port *port, unsigned int tty_stop) } } -static void transmit_chars(struct uart_8250_port *up); - static void serial8250_start_tx(struct uart_port *port, unsigned int tty_start) { struct uart_8250_port *up = (struct uart_8250_port *)port; @@ -1036,14 +1028,6 @@ static void serial8250_start_tx(struct uart_port *port, unsigned int tty_start) if (!(up->ier & UART_IER_THRI)) { up->ier |= UART_IER_THRI; serial_out(up, UART_IER, up->ier); - - if (up->bugs & UART_BUG_TXEN) { - unsigned char lsr, iir; - lsr = serial_in(up, UART_LSR); - iir = serial_in(up, UART_IIR); - if (lsr & UART_LSR_TEMT && iir & UART_IIR_NO_INT) - transmit_chars(up); - } } /* * We only do this from uart_start @@ -1449,7 +1433,6 @@ static int serial8250_startup(struct uart_port *port) { struct uart_8250_port *up = (struct uart_8250_port *)port; unsigned long flags; - unsigned char lsr, iir; int retval; up->capabilities = uart_config[up->port.type].flags; @@ -1553,26 +1536,6 @@ static int serial8250_startup(struct uart_port *port) up->port.mctrl |= TIOCM_OUT2; serial8250_set_mctrl(&up->port, up->port.mctrl); - - /* - * Do a quick test to see if we receive an - * interrupt when we enable the TX irq. 
- */ - serial_outp(up, UART_IER, UART_IER_THRI); - lsr = serial_in(up, UART_LSR); - iir = serial_in(up, UART_IIR); - serial_outp(up, UART_IER, 0); - - if (lsr & UART_LSR_TEMT && iir & UART_IIR_NO_INT) { - if (!(up->bugs & UART_BUG_TXEN)) { - up->bugs |= UART_BUG_TXEN; - pr_debug("ttyS%d - enabling bad tx status workarounds\n", - port->line); - } - } else { - up->bugs &= ~UART_BUG_TXEN; - } - spin_unlock_irqrestore(&up->port.lock, flags); /* @@ -1714,9 +1677,12 @@ serial8250_set_termios(struct uart_port *port, struct termios *termios, quot = serial8250_get_divisor(port, baud); /* - * Oxford Semi 952 rev B workaround + * Work around a bug in the Oxford Semiconductor 952 rev B + * chip which causes it to seriously miscalculate baud rates + * when DLL is 0. */ - if (up->bugs & UART_BUG_QUOT && (quot & 0xff) == 0) + if ((quot & 0xff) == 0 && up->port.type == PORT_16C950 && + up->rev == 0x5201) quot ++; if (up->capabilities & UART_CAP_FIFO && up->port.fifosize > 1) { diff --git a/trunk/drivers/serial/8250.h b/trunk/drivers/serial/8250.h index 9225c82faeb8..4f3d62f222f4 100644 --- a/trunk/drivers/serial/8250.h +++ b/trunk/drivers/serial/8250.h @@ -51,9 +51,6 @@ struct serial8250_config { #define UART_CAP_AFE (1 << 11) /* MCR-based hw flow control */ #define UART_CAP_UUE (1 << 12) /* UART needs IER bit 6 set (Xscale) */ -#define UART_BUG_QUOT (1 << 0) /* UART has buggy quot LSB */ -#define UART_BUG_TXEN (1 << 1) /* UART has buggy TX IIR status */ - #if defined(__i386__) && (defined(CONFIG_M386) || defined(CONFIG_M486)) #define _INLINE_ inline #else diff --git a/trunk/drivers/serial/s3c2410.c b/trunk/drivers/serial/s3c2410.c index 5c4678478b1d..2a9f7ade2c9d 100644 --- a/trunk/drivers/serial/s3c2410.c +++ b/trunk/drivers/serial/s3c2410.c @@ -198,7 +198,7 @@ static inline struct s3c24xx_uart_port *to_ourport(struct uart_port *port) /* translate a port to the device name */ -static inline const char *s3c24xx_serial_portname(struct uart_port *port) +static inline char *s3c24xx_serial_portname(struct uart_port *port) { return to_platform_device(port->dev)->name; } @@ -903,7 +903,7 @@ static void s3c24xx_serial_release_port(struct uart_port *port) static int s3c24xx_serial_request_port(struct uart_port *port) { - const char *name = s3c24xx_serial_portname(port); + char *name = s3c24xx_serial_portname(port); return request_mem_region(port->mapbase, MAP_SIZE, name) ? 0 : -EBUSY; } diff --git a/trunk/drivers/video/pxafb.c b/trunk/drivers/video/pxafb.c index 16e37a535d85..815fbc8317fc 100644 --- a/trunk/drivers/video/pxafb.c +++ b/trunk/drivers/video/pxafb.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include @@ -461,7 +460,7 @@ static inline unsigned int get_pcd(unsigned int pixclock) * speeds */ pcd = (unsigned long long)get_lcdclk_frequency_10khz() * pixclock; - do_div(pcd, 100000000 * 2); + pcd /= 100000000 * 2; /* no need for this, since we should subtract 1 anyway. 
they cancel */ /* pcd += 1; */ /* make up for integer math truncations */ return (unsigned int)pcd; diff --git a/trunk/include/asm-ia64/mmzone.h b/trunk/include/asm-ia64/mmzone.h index 83ca4043fc11..d32f51e3d6c2 100644 --- a/trunk/include/asm-ia64/mmzone.h +++ b/trunk/include/asm-ia64/mmzone.h @@ -15,6 +15,8 @@ #include #include +#ifdef CONFIG_DISCONTIGMEM + static inline int pfn_to_nid(unsigned long pfn) { #ifdef CONFIG_NUMA @@ -29,8 +31,6 @@ static inline int pfn_to_nid(unsigned long pfn) #endif } -#ifdef CONFIG_DISCONTIGMEM - #ifdef CONFIG_IA64_DIG /* DIG systems are small */ # define MAX_PHYSNODE_ID 8 # define NR_NODE_MEMBLKS (MAX_NUMNODES * 8) diff --git a/trunk/include/linux/sysctl.h b/trunk/include/linux/sysctl.h index 72965bfe6cfb..614e939c78a4 100644 --- a/trunk/include/linux/sysctl.h +++ b/trunk/include/linux/sysctl.h @@ -333,14 +333,21 @@ enum NET_TCP_FRTO=92, NET_TCP_LOW_LATENCY=93, NET_IPV4_IPFRAG_SECRET_INTERVAL=94, + NET_TCP_WESTWOOD=95, NET_IPV4_IGMP_MAX_MSF=96, NET_TCP_NO_METRICS_SAVE=97, + NET_TCP_VEGAS=98, + NET_TCP_VEGAS_ALPHA=99, + NET_TCP_VEGAS_BETA=100, + NET_TCP_VEGAS_GAMMA=101, + NET_TCP_BIC=102, + NET_TCP_BIC_FAST_CONVERGENCE=103, + NET_TCP_BIC_LOW_WINDOW=104, NET_TCP_DEFAULT_WIN_SCALE=105, NET_TCP_MODERATE_RCVBUF=106, NET_TCP_TSO_WIN_DIVISOR=107, NET_TCP_BIC_BETA=108, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109, - NET_TCP_CONG_CONTROL=110, }; enum { diff --git a/trunk/include/linux/tcp.h b/trunk/include/linux/tcp.h index 3ea75dd6640a..97a7c9e03df5 100644 --- a/trunk/include/linux/tcp.h +++ b/trunk/include/linux/tcp.h @@ -203,6 +203,13 @@ struct tcp_sack_block { __u32 end_seq; }; +enum tcp_congestion_algo { + TCP_RENO=0, + TCP_VEGAS, + TCP_WESTWOOD, + TCP_BIC, +}; + struct tcp_options_received { /* PAWS/RTTM data */ long ts_recent_stamp;/* Time we stored ts_recent (for aging) */ @@ -298,7 +305,7 @@ struct tcp_sock { __u8 reordering; /* Packet reordering metric. */ __u8 frto_counter; /* Number of new acks after RTO */ - __u8 unused; + __u8 adv_cong; /* Using Vegas, Westwood, or BIC */ __u8 defer_accept; /* User waits for some data after accept() */ /* RTT measurement */ @@ -394,10 +401,37 @@ struct tcp_sock { __u32 time; } rcvq_space; - /* Pluggable TCP congestion control hook */ - struct tcp_congestion_ops *ca_ops; - u32 ca_priv[16]; -#define TCP_CA_PRIV_SIZE (16*sizeof(u32)) +/* TCP Westwood structure */ + struct { + __u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */ + __u32 bw_est; /* bandwidth estimate */ + __u32 rtt_win_sx; /* here starts a new evaluation... 
*/ + __u32 bk; + __u32 snd_una; /* used for evaluating the number of acked bytes */ + __u32 cumul_ack; + __u32 accounted; + __u32 rtt; + __u32 rtt_min; /* minimum observed RTT */ + } westwood; + +/* Vegas variables */ + struct { + __u32 beg_snd_nxt; /* right edge during last RTT */ + __u32 beg_snd_una; /* left edge during last RTT */ + __u32 beg_snd_cwnd; /* saves the size of the cwnd */ + __u8 doing_vegas_now;/* if true, do vegas for this RTT */ + __u16 cntRTT; /* # of RTTs measured within last RTT */ + __u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ + __u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ + } vegas; + + /* BI TCP Parameters */ + struct { + __u32 cnt; /* increase cwnd by 1 after this number of ACKs */ + __u32 last_max_cwnd; /* last maximium snd_cwnd */ + __u32 last_cwnd; /* the last snd_cwnd */ + __u32 last_stamp; /* time when updated last_cwnd */ + } bictcp; }; static inline struct tcp_sock *tcp_sk(const struct sock *sk) @@ -405,11 +439,6 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk) return (struct tcp_sock *)sk; } -static inline void *tcp_ca(const struct tcp_sock *tp) -{ - return (void *) tp->ca_priv; -} - #endif #endif /* _LINUX_TCP_H */ diff --git a/trunk/include/linux/tcp_diag.h b/trunk/include/linux/tcp_diag.h index 7a5996743946..ceee962e1d15 100644 --- a/trunk/include/linux/tcp_diag.h +++ b/trunk/include/linux/tcp_diag.h @@ -99,10 +99,9 @@ enum TCPDIAG_MEMINFO, TCPDIAG_INFO, TCPDIAG_VEGASINFO, - TCPDIAG_CONG, }; -#define TCPDIAG_MAX TCPDIAG_CONG +#define TCPDIAG_MAX TCPDIAG_VEGASINFO /* TCPDIAG_MEM */ @@ -124,4 +123,5 @@ struct tcpvegas_info { __u32 tcpv_minrtt; }; + #endif /* _TCP_DIAG_H_ */ diff --git a/trunk/include/net/tcp.h b/trunk/include/net/tcp.h index e427cf35915c..f730935b824a 100644 --- a/trunk/include/net/tcp.h +++ b/trunk/include/net/tcp.h @@ -505,6 +505,25 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk) #else # define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG) #endif + +#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation + * max_cwnd = snd_cwnd * beta + */ +#define BICTCP_MAX_INCREMENT 32 /* + * Limit on the amount of + * increment allowed during + * binary search. 
+ */ +#define BICTCP_FUNC_OF_MIN_INCR 11 /* + * log(B/Smin)/log(B/(B-1))+1, + * Smin:min increment + * B:log factor + */ +#define BICTCP_B 4 /* + * In binary search, + * go to point (max+min)/N + */ + /* * TCP option */ @@ -577,7 +596,16 @@ extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_tw_reuse; extern int sysctl_tcp_frto; extern int sysctl_tcp_low_latency; +extern int sysctl_tcp_westwood; +extern int sysctl_tcp_vegas_cong_avoid; +extern int sysctl_tcp_vegas_alpha; +extern int sysctl_tcp_vegas_beta; +extern int sysctl_tcp_vegas_gamma; extern int sysctl_tcp_nometrics_save; +extern int sysctl_tcp_bic; +extern int sysctl_tcp_bic_fast_convergence; +extern int sysctl_tcp_bic_low_window; +extern int sysctl_tcp_bic_beta; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; @@ -1108,80 +1136,6 @@ static inline void tcp_packets_out_dec(struct tcp_sock *tp, tp->packets_out -= tcp_skb_pcount(skb); } -/* Events passed to congestion control interface */ -enum tcp_ca_event { - CA_EVENT_TX_START, /* first transmit when no packets in flight */ - CA_EVENT_CWND_RESTART, /* congestion window restart */ - CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ - CA_EVENT_FRTO, /* fast recovery timeout */ - CA_EVENT_LOSS, /* loss timeout */ - CA_EVENT_FAST_ACK, /* in sequence ack */ - CA_EVENT_SLOW_ACK, /* other ack */ -}; - -/* - * Interface for adding new TCP congestion control handlers - */ -#define TCP_CA_NAME_MAX 16 -struct tcp_congestion_ops { - struct list_head list; - - /* initialize private data (optional) */ - void (*init)(struct tcp_sock *tp); - /* cleanup private data (optional) */ - void (*release)(struct tcp_sock *tp); - - /* return slow start threshold (required) */ - u32 (*ssthresh)(struct tcp_sock *tp); - /* lower bound for congestion window (optional) */ - u32 (*min_cwnd)(struct tcp_sock *tp); - /* do new cwnd calculation (required) */ - void (*cong_avoid)(struct tcp_sock *tp, u32 ack, - u32 rtt, u32 in_flight, int good_ack); - /* round trip time sample per acked packet (optional) */ - void (*rtt_sample)(struct tcp_sock *tp, u32 usrtt); - /* call before changing ca_state (optional) */ - void (*set_state)(struct tcp_sock *tp, u8 new_state); - /* call when cwnd event occurs (optional) */ - void (*cwnd_event)(struct tcp_sock *tp, enum tcp_ca_event ev); - /* new value of cwnd after loss (optional) */ - u32 (*undo_cwnd)(struct tcp_sock *tp); - /* hook for packet ack accounting (optional) */ - void (*pkts_acked)(struct tcp_sock *tp, u32 num_acked); - /* get info for tcp_diag (optional) */ - void (*get_info)(struct tcp_sock *tp, u32 ext, struct sk_buff *skb); - - char name[TCP_CA_NAME_MAX]; - struct module *owner; -}; - -extern int tcp_register_congestion_control(struct tcp_congestion_ops *type); -extern void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); - -extern void tcp_init_congestion_control(struct tcp_sock *tp); -extern void tcp_cleanup_congestion_control(struct tcp_sock *tp); -extern int tcp_set_default_congestion_control(const char *name); -extern void tcp_get_default_congestion_control(char *name); - -extern struct tcp_congestion_ops tcp_reno; -extern u32 tcp_reno_ssthresh(struct tcp_sock *tp); -extern void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, - u32 rtt, u32 in_flight, int flag); -extern u32 tcp_reno_min_cwnd(struct tcp_sock *tp); - -static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state) -{ - if (tp->ca_ops->set_state) - tp->ca_ops->set_state(tp, ca_state); - tp->ca_state = ca_state; -} - -static inline 
void tcp_ca_event(struct tcp_sock *tp, enum tcp_ca_event event) -{ - if (tp->ca_ops->cwnd_event) - tp->ca_ops->cwnd_event(tp, event); -} - /* This determines how many packets are "in the network" to the best * of our knowledge. In many cases it is conservative, but where * detailed information is available from the receiver (via SACK @@ -1201,6 +1155,91 @@ static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) return (tp->packets_out - tp->left_out + tp->retrans_out); } +/* + * Which congestion algorithim is in use on the connection. + */ +#define tcp_is_vegas(__tp) ((__tp)->adv_cong == TCP_VEGAS) +#define tcp_is_westwood(__tp) ((__tp)->adv_cong == TCP_WESTWOOD) +#define tcp_is_bic(__tp) ((__tp)->adv_cong == TCP_BIC) + +/* Recalculate snd_ssthresh, we want to set it to: + * + * Reno: + * one half the current congestion window, but no + * less than two segments + * + * BIC: + * behave like Reno until low_window is reached, + * then increase congestion window slowly + */ +static inline __u32 tcp_recalc_ssthresh(struct tcp_sock *tp) +{ + if (tcp_is_bic(tp)) { + if (sysctl_tcp_bic_fast_convergence && + tp->snd_cwnd < tp->bictcp.last_max_cwnd) + tp->bictcp.last_max_cwnd = (tp->snd_cwnd * + (BICTCP_BETA_SCALE + + sysctl_tcp_bic_beta)) + / (2 * BICTCP_BETA_SCALE); + else + tp->bictcp.last_max_cwnd = tp->snd_cwnd; + + if (tp->snd_cwnd > sysctl_tcp_bic_low_window) + return max((tp->snd_cwnd * sysctl_tcp_bic_beta) + / BICTCP_BETA_SCALE, 2U); + } + + return max(tp->snd_cwnd >> 1U, 2U); +} + +/* Stop taking Vegas samples for now. */ +#define tcp_vegas_disable(__tp) ((__tp)->vegas.doing_vegas_now = 0) + +static inline void tcp_vegas_enable(struct tcp_sock *tp) +{ + /* There are several situations when we must "re-start" Vegas: + * + * o when a connection is established + * o after an RTO + * o after fast recovery + * o when we send a packet and there is no outstanding + * unacknowledged data (restarting an idle connection) + * + * In these circumstances we cannot do a Vegas calculation at the + * end of the first RTT, because any calculation we do is using + * stale info -- both the saved cwnd and congestion feedback are + * stale. + * + * Instead we must wait until the completion of an RTT during + * which we actually receive ACKs. + */ + + /* Begin taking Vegas samples next time we send something. */ + tp->vegas.doing_vegas_now = 1; + + /* Set the beginning of the next send window. */ + tp->vegas.beg_snd_nxt = tp->snd_nxt; + + tp->vegas.cntRTT = 0; + tp->vegas.minRTT = 0x7fffffff; +} + +/* Should we be taking Vegas samples right now? */ +#define tcp_vegas_enabled(__tp) ((__tp)->vegas.doing_vegas_now) + +extern void tcp_ca_init(struct tcp_sock *tp); + +static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state) +{ + if (tcp_is_vegas(tp)) { + if (ca_state == TCP_CA_Open) + tcp_vegas_enable(tp); + else + tcp_vegas_disable(tp); + } + tp->ca_state = ca_state; +} + /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd. * The exception is rate halving phase, when cwnd is decreasing towards * ssthresh. 
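The BIC branch above is easy to sanity-check outside the kernel. Below is a
minimal user-space sketch of the same ssthresh arithmetic, using the default
beta of 819/1024 and low window of 14 from tcp_bic.c; plain integers stand in
for the tcp_sock fields, so this is an illustration of the math, not kernel
code:

	#include <stdio.h>

	#define BICTCP_BETA_SCALE 1024

	static const unsigned beta = 819;	/* default: back off to ~80% */
	static const unsigned low_window = 14;	/* tcp_bic_low_window default */

	/* Mirrors the BIC case of tcp_recalc_ssthresh(): halve cwnd like
	 * Reno below low_window, otherwise back off by beta/1024. */
	static unsigned bic_ssthresh(unsigned snd_cwnd)
	{
		if (snd_cwnd > low_window) {
			unsigned s = snd_cwnd * beta / BICTCP_BETA_SCALE;
			return s > 2 ? s : 2;
		}
		return snd_cwnd / 2 > 2 ? snd_cwnd / 2 : 2;
	}

	int main(void)
	{
		unsigned cwnd;

		for (cwnd = 2; cwnd <= 256; cwnd *= 2)
			printf("cwnd %3u -> ssthresh %3u\n",
			       cwnd, bic_ssthresh(cwnd));
		return 0;
	}

With these defaults a cwnd of 128 yields an ssthresh of 102, i.e. roughly a
20% backoff instead of Reno's 50%.
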
@@ -1249,7 +1288,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) static inline void __tcp_enter_cwr(struct tcp_sock *tp) { tp->undo_marker = 0; - tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); + tp->snd_ssthresh = tcp_recalc_ssthresh(tp); tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1U); tp->snd_cwnd_cnt = 0; @@ -1837,4 +1876,52 @@ struct tcp_iter_state { extern int tcp_proc_register(struct tcp_seq_afinfo *afinfo); extern void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo); +/* TCP Westwood functions and constants */ + +#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */ +#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */ + +static inline void tcp_westwood_update_rtt(struct tcp_sock *tp, __u32 rtt_seq) +{ + if (tcp_is_westwood(tp)) + tp->westwood.rtt = rtt_seq; +} + +static inline __u32 __tcp_westwood_bw_rttmin(const struct tcp_sock *tp) +{ + return max((tp->westwood.bw_est) * (tp->westwood.rtt_min) / + (__u32) (tp->mss_cache_std), + 2U); +} + +static inline __u32 tcp_westwood_bw_rttmin(const struct tcp_sock *tp) +{ + return tcp_is_westwood(tp) ? __tcp_westwood_bw_rttmin(tp) : 0; +} + +static inline int tcp_westwood_ssthresh(struct tcp_sock *tp) +{ + __u32 ssthresh = 0; + + if (tcp_is_westwood(tp)) { + ssthresh = __tcp_westwood_bw_rttmin(tp); + if (ssthresh) + tp->snd_ssthresh = ssthresh; + } + + return (ssthresh != 0); +} + +static inline int tcp_westwood_cwnd(struct tcp_sock *tp) +{ + __u32 cwnd = 0; + + if (tcp_is_westwood(tp)) { + cwnd = __tcp_westwood_bw_rttmin(tp); + if (cwnd) + tp->snd_cwnd = cwnd; + } + + return (cwnd != 0); +} #endif /* _TCP_H */ diff --git a/trunk/net/ipv4/Kconfig b/trunk/net/ipv4/Kconfig index 690e88ba2484..567b03b1c349 100644 --- a/trunk/net/ipv4/Kconfig +++ b/trunk/net/ipv4/Kconfig @@ -433,95 +433,5 @@ config IP_TCPDIAG config IP_TCPDIAG_IPV6 def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) -# TCP Reno is builtin (required as fallback) -menu "TCP congestion control" - depends on INET - -config TCP_CONG_BIC - tristate "Binary Increase Congestion (BIC) control" - depends on INET - default y - ---help--- - BIC-TCP is a sender-side only change that ensures a linear RTT - fairness under large windows while offering both scalability and - bounded TCP-friendliness. The protocol combines two schemes - called additive increase and binary search increase. When the - congestion window is large, additive increase with a large - increment ensures linear RTT fairness as well as good - scalability. Under small congestion windows, binary search - increase provides TCP friendliness. - See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ - -config TCP_CONG_WESTWOOD - tristate "TCP Westwood+" - depends on INET - default m - ---help--- - TCP Westwood+ is a sender-side only modification of the TCP Reno - protocol stack that optimizes the performance of TCP congestion - control. It is based on end-to-end bandwidth estimation to set - congestion window and slow start threshold after a congestion - episode. Using this estimation, TCP Westwood+ adaptively sets a - slow start threshold and a congestion window which takes into - account the bandwidth used at the time congestion is experienced. - TCP Westwood+ significantly increases fairness wrt TCP Reno in - wired networks and throughput over wireless links. 
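The estimation step described here boils down to setting ssthresh to the
measured bandwidth-delay product in segments, which is what
__tcp_westwood_bw_rttmin() above computes. A rough user-space sketch with
illustrative units (the kernel keeps bw_est and rtt_min in its own
jiffy-based units; bytes/sec and milliseconds are used below purely for
readability):

	#include <stdio.h>

	/* Mirrors __tcp_westwood_bw_rttmin(): ssthresh, in segments, is
	 * bw_est * rtt_min / mss, floored at 2 segments. */
	static unsigned westwood_ssthresh(unsigned long long bw_bytes_per_sec,
					  unsigned rtt_min_ms, unsigned mss)
	{
		unsigned long long bdp = bw_bytes_per_sec * rtt_min_ms / 1000;
		unsigned segs = bdp / mss;

		return segs > 2 ? segs : 2;
	}

	int main(void)
	{
		/* ~10 Mbit/s path, 60 ms floor RTT, 1460-byte MSS */
		printf("ssthresh = %u segments\n",
		       westwood_ssthresh(1250000, 60, 1460));	/* -> 51 */
		return 0;
	}
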
- -config TCP_CONG_HTCP - tristate "H-TCP" - depends on INET - default m - ---help--- - H-TCP is a send-side only modifications of the TCP Reno - protocol stack that optimizes the performance of TCP - congestion control for high speed network links. It uses a - modeswitch to change the alpha and beta parameters of TCP Reno - based on network conditions and in a way so as to be fair with - other Reno and H-TCP flows. - -config TCP_CONG_HSTCP - tristate "High Speed TCP" - depends on INET && EXPERIMENTAL - default n - ---help--- - Sally Floyd's High Speed TCP (RFC 3649) congestion control. - A modification to TCP's congestion control mechanism for use - with large congestion windows. A table indicates how much to - increase the congestion window by when an ACK is received. - For more detail see http://www.icir.org/floyd/hstcp.html - -config TCP_CONG_HYBLA - tristate "TCP-Hybla congestion control algorithm" - depends on INET && EXPERIMENTAL - default n - ---help--- - TCP-Hybla is a sender-side only change that eliminates penalization of - long-RTT, large-bandwidth connections, like when satellite legs are - involved, expecially when sharing a common bottleneck with normal - terrestrial connections. - -config TCP_CONG_VEGAS - tristate "TCP Vegas" - depends on INET && EXPERIMENTAL - default n - ---help--- - TCP Vegas is a sender-side only change to TCP that anticipates - the onset of congestion by estimating the bandwidth. TCP Vegas - adjusts the sending rate by modifying the congestion - window. TCP Vegas should provide less packet loss, but it is - not as aggressive as TCP Reno. - -config TCP_CONG_SCALABLE - tristate "Scalable TCP" - depends on INET && EXPERIMENTAL - default n - ---help--- - Scalable TCP is a sender-side only change to TCP which uses a - MIMD congestion control algorithm which has some nice scaling - properties, though is known to have fairness issues. 
- See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/ - -endmenu - source "net/ipv4/ipvs/Kconfig" diff --git a/trunk/net/ipv4/Makefile b/trunk/net/ipv4/Makefile index 5718cdb3a61e..65d57d8e1add 100644 --- a/trunk/net/ipv4/Makefile +++ b/trunk/net/ipv4/Makefile @@ -5,8 +5,7 @@ obj-y := utils.o route.o inetpeer.o protocol.o \ ip_input.o ip_fragment.o ip_forward.o ip_options.o \ ip_output.o ip_sockglue.o \ - tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ - tcp_minisocks.o tcp_cong.o \ + tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \ datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ sysctl_net_ipv4.o fib_frontend.o fib_semantics.o @@ -31,13 +30,6 @@ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IP_VS) += ipvs/ obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o -obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o -obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o -obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o -obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o -obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o -obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o -obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/trunk/net/ipv4/sysctl_net_ipv4.c b/trunk/net/ipv4/sysctl_net_ipv4.c index e32894532416..23068bddbf0b 100644 --- a/trunk/net/ipv4/sysctl_net_ipv4.c +++ b/trunk/net/ipv4/sysctl_net_ipv4.c @@ -118,45 +118,6 @@ static int ipv4_sysctl_forward_strategy(ctl_table *table, return 1; } -static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - char val[TCP_CA_NAME_MAX]; - ctl_table tbl = { - .data = val, - .maxlen = TCP_CA_NAME_MAX, - }; - int ret; - - tcp_get_default_congestion_control(val); - - ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos); - if (write && ret == 0) - ret = tcp_set_default_congestion_control(val); - return ret; -} - -int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, - void **context) -{ - char val[TCP_CA_NAME_MAX]; - ctl_table tbl = { - .data = val, - .maxlen = TCP_CA_NAME_MAX, - }; - int ret; - - tcp_get_default_congestion_control(val); - ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen, - context); - if (ret == 0 && newval && newlen) - ret = tcp_set_default_congestion_control(val); - return ret; -} - - ctl_table ipv4_table[] = { { .ctl_name = NET_IPV4_TCP_TIMESTAMPS, @@ -650,6 +611,70 @@ ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = NET_TCP_WESTWOOD, + .procname = "tcp_westwood", + .data = &sysctl_tcp_westwood, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_VEGAS, + .procname = "tcp_vegas_cong_avoid", + .data = &sysctl_tcp_vegas_cong_avoid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_VEGAS_ALPHA, + .procname = "tcp_vegas_alpha", + .data = &sysctl_tcp_vegas_alpha, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_VEGAS_BETA, + .procname = "tcp_vegas_beta", + .data = &sysctl_tcp_vegas_beta, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_VEGAS_GAMMA, + .procname = "tcp_vegas_gamma", + .data = &sysctl_tcp_vegas_gamma, + 
.maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_BIC, + .procname = "tcp_bic", + .data = &sysctl_tcp_bic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_BIC_FAST_CONVERGENCE, + .procname = "tcp_bic_fast_convergence", + .data = &sysctl_tcp_bic_fast_convergence, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_BIC_LOW_WINDOW, + .procname = "tcp_bic_low_window", + .data = &sysctl_tcp_bic_low_window, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = NET_TCP_MODERATE_RCVBUF, .procname = "tcp_moderate_rcvbuf", @@ -667,14 +692,13 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec, }, { - .ctl_name = NET_TCP_CONG_CONTROL, - .procname = "tcp_congestion_control", + .ctl_name = NET_TCP_BIC_BETA, + .procname = "tcp_bic_beta", + .data = &sysctl_tcp_bic_beta, + .maxlen = sizeof(int), .mode = 0644, - .maxlen = TCP_CA_NAME_MAX, - .proc_handler = &proc_tcp_congestion_control, - .strategy = &sysctl_tcp_congestion_control, + .proc_handler = &proc_dointvec, }, - { .ctl_name = 0 } }; diff --git a/trunk/net/ipv4/tcp.c b/trunk/net/ipv4/tcp.c index f3dbc8dc1263..674bbd8cfd36 100644 --- a/trunk/net/ipv4/tcp.c +++ b/trunk/net/ipv4/tcp.c @@ -2333,8 +2333,6 @@ void __init tcp_init(void) printk(KERN_INFO "TCP: Hash tables configured " "(established %d bind %d)\n", tcp_ehash_size << 1, tcp_bhash_size); - - tcp_register_congestion_control(&tcp_reno); } EXPORT_SYMBOL(tcp_accept); diff --git a/trunk/net/ipv4/tcp_bic.c b/trunk/net/ipv4/tcp_bic.c deleted file mode 100644 index ec38d45d6649..000000000000 --- a/trunk/net/ipv4/tcp_bic.c +++ /dev/null @@ -1,331 +0,0 @@ -/* - * Binary Increase Congestion control for TCP - * - * This is from the implementation of BICTCP in - * Lison-Xu, Kahaled Harfoush, and Injong Rhee. - * "Binary Increase Congestion Control for Fast, Long Distance - * Networks" in InfoComm 2004 - * Available from: - * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf - * - * Unless BIC is enabled and congestion window is large - * this behaves the same as the original Reno. 
- */ - -#include -#include -#include -#include - - -#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation - * max_cwnd = snd_cwnd * beta - */ -#define BICTCP_B 4 /* - * In binary search, - * go to point (max+min)/N - */ - -static int fast_convergence = 1; -static int max_increment = 32; -static int low_window = 14; -static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ -static int low_utilization_threshold = 153; -static int low_utilization_period = 2; -static int initial_ssthresh = 100; -static int smooth_part = 20; - -module_param(fast_convergence, int, 0644); -MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence"); -module_param(max_increment, int, 0644); -MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search"); -module_param(low_window, int, 0644); -MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)"); -module_param(beta, int, 0644); -MODULE_PARM_DESC(beta, "beta for multiplicative increase"); -module_param(low_utilization_threshold, int, 0644); -MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode"); -module_param(low_utilization_period, int, 0644); -MODULE_PARM_DESC(low_utilization_period, "if average delay exceeds then goto to low utilization mode (seconds)"); -module_param(initial_ssthresh, int, 0644); -MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); -module_param(smooth_part, int, 0644); -MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax"); - - -/* BIC TCP Parameters */ -struct bictcp { - u32 cnt; /* increase cwnd by 1 after ACKs */ - u32 last_max_cwnd; /* last maximum snd_cwnd */ - u32 loss_cwnd; /* congestion window at last loss */ - u32 last_cwnd; /* the last snd_cwnd */ - u32 last_time; /* time when updated last_cwnd */ - u32 delay_min; /* min delay */ - u32 delay_max; /* max delay */ - u32 last_delay; - u8 low_utilization;/* 0: high; 1: low */ - u32 low_utilization_start; /* starting time of low utilization detection*/ - u32 epoch_start; /* beginning of an epoch */ -#define ACK_RATIO_SHIFT 4 - u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ -}; - -static inline void bictcp_reset(struct bictcp *ca) -{ - ca->cnt = 0; - ca->last_max_cwnd = 0; - ca->loss_cwnd = 0; - ca->last_cwnd = 0; - ca->last_time = 0; - ca->delay_min = 0; - ca->delay_max = 0; - ca->last_delay = 0; - ca->low_utilization = 0; - ca->low_utilization_start = 0; - ca->epoch_start = 0; - ca->delayed_ack = 2 << ACK_RATIO_SHIFT; -} - -static void bictcp_init(struct tcp_sock *tp) -{ - bictcp_reset(tcp_ca(tp)); - if (initial_ssthresh) - tp->snd_ssthresh = initial_ssthresh; -} - -/* - * Compute congestion window to use. 
- */ -static inline void bictcp_update(struct bictcp *ca, u32 cwnd) -{ - if (ca->last_cwnd == cwnd && - (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) - return; - - ca->last_cwnd = cwnd; - ca->last_time = tcp_time_stamp; - - if (ca->epoch_start == 0) /* record the beginning of an epoch */ - ca->epoch_start = tcp_time_stamp; - - /* start off normal */ - if (cwnd <= low_window) { - ca->cnt = cwnd; - return; - } - - /* binary increase */ - if (cwnd < ca->last_max_cwnd) { - __u32 dist = (ca->last_max_cwnd - cwnd) - / BICTCP_B; - - if (dist > max_increment) - /* linear increase */ - ca->cnt = cwnd / max_increment; - else if (dist <= 1U) - /* binary search increase */ - ca->cnt = (cwnd * smooth_part) / BICTCP_B; - else - /* binary search increase */ - ca->cnt = cwnd / dist; - } else { - /* slow start AMD linear increase */ - if (cwnd < ca->last_max_cwnd + BICTCP_B) - /* slow start */ - ca->cnt = (cwnd * smooth_part) / BICTCP_B; - else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1)) - /* slow start */ - ca->cnt = (cwnd * (BICTCP_B-1)) - / cwnd-ca->last_max_cwnd; - else - /* linear increase */ - ca->cnt = cwnd / max_increment; - } - - /* if in slow start or link utilization is very low */ - if ( ca->loss_cwnd == 0 || - (cwnd > ca->loss_cwnd && ca->low_utilization)) { - if (ca->cnt > 20) /* increase cwnd 5% per RTT */ - ca->cnt = 20; - } - - ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack; - if (ca->cnt == 0) /* cannot be zero */ - ca->cnt = 1; -} - - -/* Detect low utilization in congestion avoidance */ -static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag) -{ - struct bictcp *ca = tcp_ca(tp); - u32 dist, delay; - - /* No time stamp */ - if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) || - /* Discard delay samples right after fast recovery */ - tcp_time_stamp < ca->epoch_start + HZ || - /* this delay samples may not be accurate */ - flag == 0) { - ca->last_delay = 0; - goto notlow; - } - - delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/ - ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr; - if (delay == 0) /* no previous delay sample */ - goto notlow; - - /* first time call or link delay decreases */ - if (ca->delay_min == 0 || ca->delay_min > delay) { - ca->delay_min = ca->delay_max = delay; - goto notlow; - } - - if (ca->delay_max < delay) - ca->delay_max = delay; - - /* utilization is low, if avg delay < dist*threshold - for checking_period time */ - dist = ca->delay_max - ca->delay_min; - if (dist <= ca->delay_min>>6 || - tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10) - goto notlow; - - if (ca->low_utilization_start == 0) { - ca->low_utilization = 0; - ca->low_utilization_start = tcp_time_stamp; - } else if ((s32)(tcp_time_stamp - ca->low_utilization_start) - > low_utilization_period*HZ) { - ca->low_utilization = 1; - } - - return; - - notlow: - ca->low_utilization = 0; - ca->low_utilization_start = 0; - -} - -static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack, - u32 seq_rtt, u32 in_flight, int data_acked) -{ - struct bictcp *ca = tcp_ca(tp); - - bictcp_low_utilization(tp, data_acked); - - if (in_flight < tp->snd_cwnd) - return; - - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { - bictcp_update(ca, tp->snd_cwnd); - - /* In dangerous area, increase slowly. 
-
-static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack,
-			      u32 seq_rtt, u32 in_flight, int data_acked)
-{
-	struct bictcp *ca = tcp_ca(tp);
-
-	bictcp_low_utilization(tp, data_acked);
-
-	if (in_flight < tp->snd_cwnd)
-		return;
-
-	if (tp->snd_cwnd <= tp->snd_ssthresh) {
-		/* In "safe" area, increase. */
-		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-			tp->snd_cwnd++;
-	} else {
-		bictcp_update(ca, tp->snd_cwnd);
-
-		/* In dangerous area, increase slowly.
-		 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
-		 */
-		if (tp->snd_cwnd_cnt >= ca->cnt) {
-			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-				tp->snd_cwnd++;
-			tp->snd_cwnd_cnt = 0;
-		} else
-			tp->snd_cwnd_cnt++;
-	}
-
-}
-
-/*
- *	behave like Reno until low_window is reached,
- *	then increase congestion window slowly
- */
-static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp)
-{
-	struct bictcp *ca = tcp_ca(tp);
-
-	ca->epoch_start = 0;	/* end of epoch */
-
-	/* in case of wrong delay_max */
-	if (ca->delay_min > 0 && ca->delay_max > ca->delay_min)
-		ca->delay_max = ca->delay_min
-			+ ((ca->delay_max - ca->delay_min) * 90) / 100;
-
-	/* Wmax and fast convergence */
-	if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
-		ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
-			/ (2 * BICTCP_BETA_SCALE);
-	else
-		ca->last_max_cwnd = tp->snd_cwnd;
-
-	ca->loss_cwnd = tp->snd_cwnd;
-
-
-	if (tp->snd_cwnd <= low_window)
-		return max(tp->snd_cwnd >> 1U, 2U);
-	else
-		return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
-}
-
-static u32 bictcp_undo_cwnd(struct tcp_sock *tp)
-{
-	struct bictcp *ca = tcp_ca(tp);
-
-	return max(tp->snd_cwnd, ca->last_max_cwnd);
-}
-
-static u32 bictcp_min_cwnd(struct tcp_sock *tp)
-{
-	return tp->snd_ssthresh;
-}
-
-static void bictcp_state(struct tcp_sock *tp, u8 new_state)
-{
-	if (new_state == TCP_CA_Loss)
-		bictcp_reset(tcp_ca(tp));
-}
-
-/* Track delayed acknowledgement ratio using sliding window
- * ratio = (15*ratio + sample) / 16
- */
-static void bictcp_acked(struct tcp_sock *tp, u32 cnt)
-{
-	if (cnt > 0 && tp->ca_state == TCP_CA_Open) {
-		struct bictcp *ca = tcp_ca(tp);
-		cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
-		ca->delayed_ack += cnt;
-	}
-}
-
-
-static struct tcp_congestion_ops bictcp = {
-	.init		= bictcp_init,
-	.ssthresh	= bictcp_recalc_ssthresh,
-	.cong_avoid	= bictcp_cong_avoid,
-	.set_state	= bictcp_state,
-	.undo_cwnd	= bictcp_undo_cwnd,
-	.min_cwnd	= bictcp_min_cwnd,
-	.pkts_acked	= bictcp_acked,
-	.owner		= THIS_MODULE,
-	.name		= "bic",
-};
-
-static int __init bictcp_register(void)
-{
-	BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE);
-	return tcp_register_congestion_control(&bictcp);
-}
-
-static void __exit bictcp_unregister(void)
-{
-	tcp_unregister_congestion_control(&bictcp);
-}
-
-module_init(bictcp_register);
-module_exit(bictcp_unregister);
-
-MODULE_AUTHOR("Stephen Hemminger");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("BIC TCP");
diff --git a/trunk/net/ipv4/tcp_cong.c b/trunk/net/ipv4/tcp_cong.c
deleted file mode 100644
index 665394a63ae4..000000000000
--- a/trunk/net/ipv4/tcp_cong.c
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Pluggable TCP congestion control support and newReno
- * congestion control.
- * Based on ideas from I/O scheduler support and Web100.
- *
- * Copyright (C) 2005 Stephen Hemminger
- */
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/types.h>
-#include <linux/list.h>
-#include <net/tcp.h>
-
-static DEFINE_SPINLOCK(tcp_cong_list_lock);
-static LIST_HEAD(tcp_cong_list);
-
-/* Simple linear search, don't expect many entries! */
-static struct tcp_congestion_ops *tcp_ca_find(const char *name)
-{
-	struct tcp_congestion_ops *e;
-
-	list_for_each_entry(e, &tcp_cong_list, list) {
-		if (strcmp(e->name, name) == 0)
-			return e;
-	}
-
-	return NULL;
-}
-
-/*
- * Attach new congestion control algorithm to the list
- * of available options.
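 *
 * A minimal sketch of a module built against this interface (ours, not
 * from the kernel tree; every name here is hypothetical, and the Reno
 * helpers are the ones exported further below):
 *
 *	static struct tcp_congestion_ops my_cc = {
 *		.name		= "my_cc",
 *		.owner		= THIS_MODULE,
 *		.ssthresh	= tcp_reno_ssthresh,
 *		.cong_avoid	= tcp_reno_cong_avoid,
 *		.min_cwnd	= tcp_reno_min_cwnd,
 *	};
 *
 *	static int __init my_cc_register(void)
 *	{
 *		return tcp_register_congestion_control(&my_cc);
 *	}
 *
 *	static void __exit my_cc_unregister(void)
 *	{
 *		tcp_unregister_congestion_control(&my_cc);
 *	}
 *
 *	module_init(my_cc_register);
 *	module_exit(my_cc_unregister);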
- */ -int tcp_register_congestion_control(struct tcp_congestion_ops *ca) -{ - int ret = 0; - - /* all algorithms must implement ssthresh and cong_avoid ops */ - if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) { - printk(KERN_ERR "TCP %s does not implement required ops\n", - ca->name); - return -EINVAL; - } - - spin_lock(&tcp_cong_list_lock); - if (tcp_ca_find(ca->name)) { - printk(KERN_NOTICE "TCP %s already registered\n", ca->name); - ret = -EEXIST; - } else { - list_add_rcu(&ca->list, &tcp_cong_list); - printk(KERN_INFO "TCP %s registered\n", ca->name); - } - spin_unlock(&tcp_cong_list_lock); - - return ret; -} -EXPORT_SYMBOL_GPL(tcp_register_congestion_control); - -/* - * Remove congestion control algorithm, called from - * the module's remove function. Module ref counts are used - * to ensure that this can't be done till all sockets using - * that method are closed. - */ -void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) -{ - spin_lock(&tcp_cong_list_lock); - list_del_rcu(&ca->list); - spin_unlock(&tcp_cong_list_lock); -} -EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); - -/* Assign choice of congestion control. */ -void tcp_init_congestion_control(struct tcp_sock *tp) -{ - struct tcp_congestion_ops *ca; - - rcu_read_lock(); - list_for_each_entry_rcu(ca, &tcp_cong_list, list) { - if (try_module_get(ca->owner)) { - tp->ca_ops = ca; - break; - } - - } - rcu_read_unlock(); - - if (tp->ca_ops->init) - tp->ca_ops->init(tp); -} - -/* Manage refcounts on socket close. */ -void tcp_cleanup_congestion_control(struct tcp_sock *tp) -{ - if (tp->ca_ops->release) - tp->ca_ops->release(tp); - module_put(tp->ca_ops->owner); -} - -/* Used by sysctl to change default congestion control */ -int tcp_set_default_congestion_control(const char *name) -{ - struct tcp_congestion_ops *ca; - int ret = -ENOENT; - - spin_lock(&tcp_cong_list_lock); - ca = tcp_ca_find(name); -#ifdef CONFIG_KMOD - if (!ca) { - spin_unlock(&tcp_cong_list_lock); - - request_module("tcp_%s", name); - spin_lock(&tcp_cong_list_lock); - ca = tcp_ca_find(name); - } -#endif - - if (ca) { - list_move(&ca->list, &tcp_cong_list); - ret = 0; - } - spin_unlock(&tcp_cong_list_lock); - - return ret; -} - -/* Get current default congestion control */ -void tcp_get_default_congestion_control(char *name) -{ - struct tcp_congestion_ops *ca; - /* We will always have reno... */ - BUG_ON(list_empty(&tcp_cong_list)); - - rcu_read_lock(); - ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); - strncpy(name, ca->name, TCP_CA_NAME_MAX); - rcu_read_unlock(); -} - -/* - * TCP Reno congestion control - * This is special case used for fallback as well. - */ -/* This is Jacobson's slow start and congestion avoidance. - * SIGCOMM '88, p. 328. - */ -void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight, - int flag) -{ - if (in_flight < tp->snd_cwnd) - return; - - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { - /* In dangerous area, increase slowly. 
- * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd - */ - if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } else - tp->snd_cwnd_cnt++; - } -} -EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); - -/* Slow start threshold is half the congestion window (min 2) */ -u32 tcp_reno_ssthresh(struct tcp_sock *tp) -{ - return max(tp->snd_cwnd >> 1U, 2U); -} -EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); - -/* Lower bound on congestion window. */ -u32 tcp_reno_min_cwnd(struct tcp_sock *tp) -{ - return tp->snd_ssthresh/2; -} -EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); - -struct tcp_congestion_ops tcp_reno = { - .name = "reno", - .owner = THIS_MODULE, - .ssthresh = tcp_reno_ssthresh, - .cong_avoid = tcp_reno_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, -}; - -EXPORT_SYMBOL_GPL(tcp_reno); diff --git a/trunk/net/ipv4/tcp_diag.c b/trunk/net/ipv4/tcp_diag.c index f66945cb158f..634befc07921 100644 --- a/trunk/net/ipv4/tcp_diag.c +++ b/trunk/net/ipv4/tcp_diag.c @@ -42,8 +42,15 @@ struct tcpdiag_entry static struct sock *tcpnl; + #define TCPDIAG_PUT(skb, attrtype, attrlen) \ - RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) +({ int rtalen = RTA_LENGTH(attrlen); \ + struct rtattr *rta; \ + if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \ + rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \ + rta->rta_type = attrtype; \ + rta->rta_len = rtalen; \ + RTA_DATA(rta); }) static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, int ext, u32 pid, u32 seq, u16 nlmsg_flags) @@ -54,6 +61,7 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, struct nlmsghdr *nlh; struct tcp_info *info = NULL; struct tcpdiag_meminfo *minfo = NULL; + struct tcpvegas_info *vinfo = NULL; unsigned char *b = skb->tail; nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); @@ -65,11 +73,9 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, if (ext & (1<<(TCPDIAG_INFO-1))) info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); - if (ext & (1<<(TCPDIAG_CONG-1))) { - size_t len = strlen(tp->ca_ops->name); - strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1), - tp->ca_ops->name); - } + if ((tcp_is_westwood(tp) || tcp_is_vegas(tp)) + && (ext & (1<<(TCPDIAG_VEGASINFO-1)))) + vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo)); } r->tcpdiag_family = sk->sk_family; r->tcpdiag_state = sk->sk_state; @@ -160,13 +166,23 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, if (info) tcp_get_info(sk, info); - if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info) - tp->ca_ops->get_info(tp, ext, skb); + if (vinfo) { + if (tcp_is_vegas(tp)) { + vinfo->tcpv_enabled = tp->vegas.doing_vegas_now; + vinfo->tcpv_rttcnt = tp->vegas.cntRTT; + vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT); + vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT); + } else { + vinfo->tcpv_enabled = 0; + vinfo->tcpv_rttcnt = 0; + vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt); + vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min); + } + } nlh->nlmsg_len = skb->tail - b; return skb->len; -rtattr_failure: nlmsg_failure: skb_trim(skb, b - skb->data); return -1; diff --git a/trunk/net/ipv4/tcp_highspeed.c b/trunk/net/ipv4/tcp_highspeed.c deleted file mode 100644 index 36c51f8136bf..000000000000 --- a/trunk/net/ipv4/tcp_highspeed.c +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Sally Floyd's High Speed TCP (RFC 3649) congestion control - * - * See http://www.icir.org/floyd/hstcp.html - * - * John Heffner - */ - -#include -#include -#include - - -/* From AIMD 
tables from RFC 3649 appendix B, - * with fixed-point MD scaled <<8. - */ -static const struct hstcp_aimd_val { - unsigned int cwnd; - unsigned int md; -} hstcp_aimd_vals[] = { - { 38, 128, /* 0.50 */ }, - { 118, 112, /* 0.44 */ }, - { 221, 104, /* 0.41 */ }, - { 347, 98, /* 0.38 */ }, - { 495, 93, /* 0.37 */ }, - { 663, 89, /* 0.35 */ }, - { 851, 86, /* 0.34 */ }, - { 1058, 83, /* 0.33 */ }, - { 1284, 81, /* 0.32 */ }, - { 1529, 78, /* 0.31 */ }, - { 1793, 76, /* 0.30 */ }, - { 2076, 74, /* 0.29 */ }, - { 2378, 72, /* 0.28 */ }, - { 2699, 71, /* 0.28 */ }, - { 3039, 69, /* 0.27 */ }, - { 3399, 68, /* 0.27 */ }, - { 3778, 66, /* 0.26 */ }, - { 4177, 65, /* 0.26 */ }, - { 4596, 64, /* 0.25 */ }, - { 5036, 62, /* 0.25 */ }, - { 5497, 61, /* 0.24 */ }, - { 5979, 60, /* 0.24 */ }, - { 6483, 59, /* 0.23 */ }, - { 7009, 58, /* 0.23 */ }, - { 7558, 57, /* 0.22 */ }, - { 8130, 56, /* 0.22 */ }, - { 8726, 55, /* 0.22 */ }, - { 9346, 54, /* 0.21 */ }, - { 9991, 53, /* 0.21 */ }, - { 10661, 52, /* 0.21 */ }, - { 11358, 52, /* 0.20 */ }, - { 12082, 51, /* 0.20 */ }, - { 12834, 50, /* 0.20 */ }, - { 13614, 49, /* 0.19 */ }, - { 14424, 48, /* 0.19 */ }, - { 15265, 48, /* 0.19 */ }, - { 16137, 47, /* 0.19 */ }, - { 17042, 46, /* 0.18 */ }, - { 17981, 45, /* 0.18 */ }, - { 18955, 45, /* 0.18 */ }, - { 19965, 44, /* 0.17 */ }, - { 21013, 43, /* 0.17 */ }, - { 22101, 43, /* 0.17 */ }, - { 23230, 42, /* 0.17 */ }, - { 24402, 41, /* 0.16 */ }, - { 25618, 41, /* 0.16 */ }, - { 26881, 40, /* 0.16 */ }, - { 28193, 39, /* 0.16 */ }, - { 29557, 39, /* 0.15 */ }, - { 30975, 38, /* 0.15 */ }, - { 32450, 38, /* 0.15 */ }, - { 33986, 37, /* 0.15 */ }, - { 35586, 36, /* 0.14 */ }, - { 37253, 36, /* 0.14 */ }, - { 38992, 35, /* 0.14 */ }, - { 40808, 35, /* 0.14 */ }, - { 42707, 34, /* 0.13 */ }, - { 44694, 33, /* 0.13 */ }, - { 46776, 33, /* 0.13 */ }, - { 48961, 32, /* 0.13 */ }, - { 51258, 32, /* 0.13 */ }, - { 53677, 31, /* 0.12 */ }, - { 56230, 30, /* 0.12 */ }, - { 58932, 30, /* 0.12 */ }, - { 61799, 29, /* 0.12 */ }, - { 64851, 28, /* 0.11 */ }, - { 68113, 28, /* 0.11 */ }, - { 71617, 27, /* 0.11 */ }, - { 75401, 26, /* 0.10 */ }, - { 79517, 26, /* 0.10 */ }, - { 84035, 25, /* 0.10 */ }, - { 89053, 24, /* 0.10 */ }, -}; - -#define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals) - -struct hstcp { - u32 ai; -}; - -static void hstcp_init(struct tcp_sock *tp) -{ - struct hstcp *ca = tcp_ca(tp); - - ca->ai = 0; - - /* Ensure the MD arithmetic works. This is somewhat pedantic, - * since I don't think we will see a cwnd this large. 
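 *
 * A worked example (ours, not from the patch): with snd_cwnd = 2000 the
 * update loop in hstcp_cong_avoid() settles on the { 2076, 74 } row, so
 * the multiplicative decrease is 74/256, roughly 0.29, and
 * hstcp_ssthresh() returns 2000 - (2000*74 >> 8) = 1422 segments, a far
 * gentler backoff than Reno's halving to 1000.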
:) */
-	tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
-}
-
-static void hstcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
-			     u32 in_flight, int good)
-{
-	struct hstcp *ca = tcp_ca(tp);
-
-	if (in_flight < tp->snd_cwnd)
-		return;
-
-	if (tp->snd_cwnd <= tp->snd_ssthresh) {
-		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-			tp->snd_cwnd++;
-	} else {
-		/* Update AIMD parameters */
-		if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
-			while (ca->ai < HSTCP_AIMD_MAX - 1 &&
-			       tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd)
-				ca->ai++;
-		} else if (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd) {
-			while (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd)
-				ca->ai--;
-		}
-
-		/* Do additive increase */
-		if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
-			tp->snd_cwnd_cnt += ca->ai;
-			if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
-				tp->snd_cwnd++;
-				tp->snd_cwnd_cnt -= tp->snd_cwnd;
-			}
-		}
-	}
-}
-
-static u32 hstcp_ssthresh(struct tcp_sock *tp)
-{
-	struct hstcp *ca = tcp_ca(tp);
-
-	/* Do multiplicative decrease */
-	return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
-}
-
-
-static struct tcp_congestion_ops tcp_highspeed = {
-	.init		= hstcp_init,
-	.ssthresh	= hstcp_ssthresh,
-	.cong_avoid	= hstcp_cong_avoid,
-	.min_cwnd	= tcp_reno_min_cwnd,
-
-	.owner		= THIS_MODULE,
-	.name		= "highspeed"
-};
-
-static int __init hstcp_register(void)
-{
-	BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE);
-	return tcp_register_congestion_control(&tcp_highspeed);
-}
-
-static void __exit hstcp_unregister(void)
-{
-	tcp_unregister_congestion_control(&tcp_highspeed);
-}
-
-module_init(hstcp_register);
-module_exit(hstcp_unregister);
-
-MODULE_AUTHOR("John Heffner");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("High Speed TCP");
diff --git a/trunk/net/ipv4/tcp_htcp.c b/trunk/net/ipv4/tcp_htcp.c
deleted file mode 100644
index 40168275acf9..000000000000
--- a/trunk/net/ipv4/tcp_htcp.c
+++ /dev/null
@@ -1,289 +0,0 @@
-/*
- * H-TCP congestion control. The algorithm is detailed in:
- * R.N.Shorten, D.J.Leith:
- *   "H-TCP: TCP for high-speed and long-distance networks"
- *   Proc. PFLDnet, Argonne, 2004.
- * http://www.hamilton.ie/net/htcp3.pdf - */ - -#include -#include -#include -#include - -#define ALPHA_BASE (1<<7) /* 1.0 with shift << 7 */ -#define BETA_MIN (1<<6) /* 0.5 with shift << 7 */ -#define BETA_MAX 102 /* 0.8 with shift << 7 */ - -static int use_rtt_scaling = 1; -module_param(use_rtt_scaling, int, 0644); -MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling"); - -static int use_bandwidth_switch = 1; -module_param(use_bandwidth_switch, int, 0644); -MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher"); - -struct htcp { - u16 alpha; /* Fixed point arith, << 7 */ - u8 beta; /* Fixed point arith, << 7 */ - u8 modeswitch; /* Delay modeswitch until we had at least one congestion event */ - u8 ccount; /* Number of RTTs since last congestion event */ - u8 undo_ccount; - u16 packetcount; - u32 minRTT; - u32 maxRTT; - u32 snd_cwnd_cnt2; - - u32 undo_maxRTT; - u32 undo_old_maxB; - - /* Bandwidth estimation */ - u32 minB; - u32 maxB; - u32 old_maxB; - u32 Bi; - u32 lasttime; -}; - -static inline void htcp_reset(struct htcp *ca) -{ - ca->undo_ccount = ca->ccount; - ca->undo_maxRTT = ca->maxRTT; - ca->undo_old_maxB = ca->old_maxB; - - ca->ccount = 0; - ca->snd_cwnd_cnt2 = 0; -} - -static u32 htcp_cwnd_undo(struct tcp_sock *tp) -{ - struct htcp *ca = tcp_ca(tp); - ca->ccount = ca->undo_ccount; - ca->maxRTT = ca->undo_maxRTT; - ca->old_maxB = ca->undo_old_maxB; - return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta); -} - -static inline void measure_rtt(struct tcp_sock *tp) -{ - struct htcp *ca = tcp_ca(tp); - u32 srtt = tp->srtt>>3; - - /* keep track of minimum RTT seen so far, minRTT is zero at first */ - if (ca->minRTT > srtt || !ca->minRTT) - ca->minRTT = srtt; - - /* max RTT */ - if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) { - if (ca->maxRTT < ca->minRTT) - ca->maxRTT = ca->minRTT; - if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50) - ca->maxRTT = srtt; - } -} - -static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked) -{ - struct htcp *ca = tcp_ca(tp); - u32 now = tcp_time_stamp; - - /* achieved throughput calculations */ - if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) { - ca->packetcount = 0; - ca->lasttime = now; - return; - } - - ca->packetcount += pkts_acked; - - if (ca->packetcount >= tp->snd_cwnd - (ca->alpha>>7? 
: 1) - && now - ca->lasttime >= ca->minRTT - && ca->minRTT > 0) { - __u32 cur_Bi = ca->packetcount*HZ/(now - ca->lasttime); - if (ca->ccount <= 3) { - /* just after backoff */ - ca->minB = ca->maxB = ca->Bi = cur_Bi; - } else { - ca->Bi = (3*ca->Bi + cur_Bi)/4; - if (ca->Bi > ca->maxB) - ca->maxB = ca->Bi; - if (ca->minB > ca->maxB) - ca->minB = ca->maxB; - } - ca->packetcount = 0; - ca->lasttime = now; - } -} - -static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT) -{ - if (use_bandwidth_switch) { - u32 maxB = ca->maxB; - u32 old_maxB = ca->old_maxB; - ca->old_maxB = ca->maxB; - - if (!between(5*maxB, 4*old_maxB, 6*old_maxB)) { - ca->beta = BETA_MIN; - ca->modeswitch = 0; - return; - } - } - - if (ca->modeswitch && minRTT > max(HZ/100, 1) && maxRTT) { - ca->beta = (minRTT<<7)/maxRTT; - if (ca->beta < BETA_MIN) - ca->beta = BETA_MIN; - else if (ca->beta > BETA_MAX) - ca->beta = BETA_MAX; - } else { - ca->beta = BETA_MIN; - ca->modeswitch = 1; - } -} - -static inline void htcp_alpha_update(struct htcp *ca) -{ - u32 minRTT = ca->minRTT; - u32 factor = 1; - u32 diff = ca->ccount * minRTT; /* time since last backoff */ - - if (diff > HZ) { - diff -= HZ; - factor = 1+ ( 10*diff + ((diff/2)*(diff/2)/HZ) )/HZ; - } - - if (use_rtt_scaling && minRTT) { - u32 scale = (HZ<<3)/(10*minRTT); - scale = min(max(scale, 1U<<2), 10U<<3); /* clamping ratio to interval [0.5,10]<<3 */ - factor = (factor<<3)/scale; - if (!factor) - factor = 1; - } - - ca->alpha = 2*factor*((1<<7)-ca->beta); - if (!ca->alpha) - ca->alpha = ALPHA_BASE; -} - -/* After we have the rtt data to calculate beta, we'd still prefer to wait one - * rtt before we adjust our beta to ensure we are working from a consistent - * data. - * - * This function should be called when we hit a congestion event since only at - * that point do we really have a real sense of maxRTT (the queues en route - * were getting just too full now). - */ -static void htcp_param_update(struct tcp_sock *tp) -{ - struct htcp *ca = tcp_ca(tp); - u32 minRTT = ca->minRTT; - u32 maxRTT = ca->maxRTT; - - htcp_beta_update(ca, minRTT, maxRTT); - htcp_alpha_update(ca); - - /* add slowly fading memory for maxRTT to accommodate routing changes etc */ - if (minRTT > 0 && maxRTT > minRTT) - ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100; -} - -static u32 htcp_recalc_ssthresh(struct tcp_sock *tp) -{ - struct htcp *ca = tcp_ca(tp); - htcp_param_update(tp); - return max((tp->snd_cwnd * ca->beta) >> 7, 2U); -} - -static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, - u32 in_flight, int data_acked) -{ - struct htcp *ca = tcp_ca(tp); - - if (in_flight < tp->snd_cwnd) - return; - - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { - measure_rtt(tp); - - /* keep track of number of round-trip times since last backoff event */ - if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) { - ca->ccount++; - ca->snd_cwnd_cnt2 = 0; - htcp_alpha_update(ca); - } - - /* In dangerous area, increase slowly. - * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd - */ - if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - ca->ccount++; - } - } -} - -/* Lower bound on congestion window. 
*/ -static u32 htcp_min_cwnd(struct tcp_sock *tp) -{ - return tp->snd_ssthresh; -} - - -static void htcp_init(struct tcp_sock *tp) -{ - struct htcp *ca = tcp_ca(tp); - - memset(ca, 0, sizeof(struct htcp)); - ca->alpha = ALPHA_BASE; - ca->beta = BETA_MIN; -} - -static void htcp_state(struct tcp_sock *tp, u8 new_state) -{ - switch (new_state) { - case TCP_CA_CWR: - case TCP_CA_Recovery: - case TCP_CA_Loss: - htcp_reset(tcp_ca(tp)); - break; - } -} - -static struct tcp_congestion_ops htcp = { - .init = htcp_init, - .ssthresh = htcp_recalc_ssthresh, - .min_cwnd = htcp_min_cwnd, - .cong_avoid = htcp_cong_avoid, - .set_state = htcp_state, - .undo_cwnd = htcp_cwnd_undo, - .pkts_acked = measure_achieved_throughput, - .owner = THIS_MODULE, - .name = "htcp", -}; - -static int __init htcp_register(void) -{ - BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE); - BUILD_BUG_ON(BETA_MIN >= BETA_MAX); - if (!use_bandwidth_switch) - htcp.pkts_acked = NULL; - return tcp_register_congestion_control(&htcp); -} - -static void __exit htcp_unregister(void) -{ - tcp_unregister_congestion_control(&htcp); -} - -module_init(htcp_register); -module_exit(htcp_unregister); - -MODULE_AUTHOR("Baruch Even"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("H-TCP"); diff --git a/trunk/net/ipv4/tcp_hybla.c b/trunk/net/ipv4/tcp_hybla.c deleted file mode 100644 index 13a66342c304..000000000000 --- a/trunk/net/ipv4/tcp_hybla.c +++ /dev/null @@ -1,187 +0,0 @@ -/* - * TCP HYBLA - * - * TCP-HYBLA Congestion control algorithm, based on: - * C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement - * for Heterogeneous Networks", - * International Journal on satellite Communications, - * September 2004 - * Daniele Lacamera - * root at danielinux.net - */ - -#include -#include -#include - -/* Tcp Hybla structure. */ -struct hybla { - u8 hybla_en; - u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */ - u32 rho; /* Rho parameter, integer part */ - u32 rho2; /* Rho * Rho, integer part */ - u32 rho_3ls; /* Rho parameter, <<3 */ - u32 rho2_7ls; /* Rho^2, <<7 */ - u32 minrtt; /* Minimum smoothed round trip time value seen */ -}; - -/* Hybla reference round trip time (default= 1/40 sec = 25 ms), - expressed in jiffies */ -static int rtt0 = 25; -module_param(rtt0, int, 0644); -MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)"); - - -/* This is called to refresh values for hybla parameters */ -static inline void hybla_recalc_param (struct tcp_sock *tp) -{ - struct hybla *ca = tcp_ca(tp); - - ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8); - ca->rho = ca->rho_3ls >> 3; - ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; - ca->rho2 = ca->rho2_7ls >>7; -} - -static void hybla_init(struct tcp_sock *tp) -{ - struct hybla *ca = tcp_ca(tp); - - ca->rho = 0; - ca->rho2 = 0; - ca->rho_3ls = 0; - ca->rho2_7ls = 0; - ca->snd_cwnd_cents = 0; - ca->hybla_en = 1; - tp->snd_cwnd = 2; - tp->snd_cwnd_clamp = 65535; - - /* 1st Rho measurement based on initial srtt */ - hybla_recalc_param(tp); - - /* set minimum rtt as this is the 1st ever seen */ - ca->minrtt = tp->srtt; - tp->snd_cwnd = ca->rho; -} - -static void hybla_state(struct tcp_sock *tp, u8 ca_state) -{ - struct hybla *ca = tcp_ca(tp); - - ca->hybla_en = (ca_state == TCP_CA_Open); -} - -static inline u32 hybla_fraction(u32 odds) -{ - static const u32 fractions[] = { - 128, 139, 152, 165, 181, 197, 215, 234, - }; - - return (odds < ARRAY_SIZE(fractions)) ? fractions[odds] : 128; -} - -/* TCP Hybla main routine. 
- * This is the algorithm behavior:
- *      o Recalc Hybla parameters if min_rtt has changed
- *      o Give cwnd a new value based on the model proposed
- *      o remember increments <1
- */
-static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
-			    u32 in_flight, int flag)
-{
-	struct hybla *ca = tcp_ca(tp);
-	u32 increment, odd, rho_fractions;
-	int is_slowstart = 0;
-
-	/* Recalculate rho only if this srtt is the lowest */
-	if (tp->srtt < ca->minrtt){
-		hybla_recalc_param(tp);
-		ca->minrtt = tp->srtt;
-	}
-
-	if (!ca->hybla_en)
-		return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag);
-
-	if (in_flight < tp->snd_cwnd)
-		return;
-
-	if (ca->rho == 0)
-		hybla_recalc_param(tp);
-
-	rho_fractions = ca->rho_3ls - (ca->rho << 3);
-
-	if (tp->snd_cwnd < tp->snd_ssthresh) {
-		/*
-		 * slow start
-		 *      INC = 2^RHO - 1
-		 * This is done by splitting the rho parameter
-		 * into 2 parts: an integer part and a fraction part.
-		 * Increment<<7 is estimated by doing:
-		 *	       [2^(int+fract)]<<7
-		 * that is equal to:
-		 *	       (2^int)    *  [(2^fract) <<7]
-		 * 2^int is computed directly as 1<<int, while
-		 * hybla_fraction() gives 2^fract in <<7 fixed point.
-		 */
-		is_slowstart = 1;
-		increment = ((1 << ca->rho) * hybla_fraction(rho_fractions))
-			    - 128;
-	} else {
-		/*
-		 * congestion avoidance
-		 * INC = RHO^2 / W
-		 * as long as increment is estimated as (rho<<7)/window
-		 * it already is <<7 and we can easily count its fractions.
-		 */
-		increment = ca->rho2_7ls / tp->snd_cwnd;
-		if (increment < 128)
-			tp->snd_cwnd_cnt++;
-	}
-
-	odd = increment % 128;
-	tp->snd_cwnd += increment >> 7;
-	ca->snd_cwnd_cents += odd;
-
-	/* check when fractions goes >=128 and increase cwnd by 1. */
-	while(ca->snd_cwnd_cents >= 128) {
-		tp->snd_cwnd++;
-		ca->snd_cwnd_cents -= 128;
-		tp->snd_cwnd_cnt = 0;
-	}
-
-	/* clamp down slowstart cwnd to ssthresh value. */
-	if (is_slowstart)
-		tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
-
-	tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
-}
-
-static struct tcp_congestion_ops tcp_hybla = {
-	.init		= hybla_init,
-	.ssthresh	= tcp_reno_ssthresh,
-	.min_cwnd	= tcp_reno_min_cwnd,
-	.cong_avoid	= hybla_cong_avoid,
-	.set_state	= hybla_state,
-
-	.owner		= THIS_MODULE,
-	.name		= "hybla"
-};
-
-static int __init hybla_register(void)
-{
-	BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE);
-	return tcp_register_congestion_control(&tcp_hybla);
-}
-
-static void __exit hybla_unregister(void)
-{
-	tcp_unregister_congestion_control(&tcp_hybla);
-}
-
-module_init(hybla_register);
-module_exit(hybla_unregister);
-
-MODULE_AUTHOR("Daniele Lacamera");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("TCP Hybla");
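/*
 * Illustrative sketch, not part of the original patch: Hybla scales
 * Reno's growth by rho = RTT/RTT0 (RTT0 = rtt0 = 25 ms above), so
 * long-RTT paths gain cwnd at the same wall-clock rate as a 25 ms
 * reference flow.  The standalone program below redoes the <<3 / <<7
 * fixed-point arithmetic of hybla_recalc_param() and the slow-start
 * branch of hybla_cong_avoid(); the 100 ms example values are ours.
 */
#include <stdio.h>

static const unsigned int fractions[] = {
	128, 139, 152, 165, 181, 197, 215, 234,	/* (2^(i/8)) << 7 */
};

int main(void)
{
	unsigned int srtt_ms = 100, rtt0_ms = 25;
	unsigned int rho_3ls, rho, rho_fractions, increment;

	rho_3ls = (srtt_ms << 3) / rtt0_ms;	/* rho in <<3 fixed point */
	if (rho_3ls < 8)
		rho_3ls = 8;			/* clamp rho to >= 1 */
	rho = rho_3ls >> 3;
	rho_fractions = rho_3ls & 7;

	/* slow-start increment per ACK, <<7: (2^rho * 2^fract) - 1 */
	increment = ((1u << rho) * fractions[rho_fractions]) - 128;

	printf("rho = %u.%03u -> cwnd += %u/128 segments per ACK in slow start\n",
	       rho, rho_fractions * 125, increment);
	/* prints 1920/128, i.e. 15 = 2^4 - 1: Reno's +1 compounded for a
	 * path whose RTT is four times the reference RTT. */
	return 0;
}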
diff --git a/trunk/net/ipv4/tcp_input.c b/trunk/net/ipv4/tcp_input.c
index 7bbbbc33eb4b..5bad504630a3 100644
--- a/trunk/net/ipv4/tcp_input.c
+++ b/trunk/net/ipv4/tcp_input.c
@@ -61,6 +61,7 @@
  *		Panu Kuhlberg:		Experimental audit of TCP (re)transmission
  *					engine. Lots of bugs are found.
  *		Pasi Sarolahti:		F-RTO for dealing with spurious RTOs
+ *		Angelo Dell'Aera:	TCP Westwood+ support
  */
 
 #include 
@@ -87,9 +88,23 @@ int sysctl_tcp_rfc1337;
 int sysctl_tcp_max_orphans = NR_FILE;
 int sysctl_tcp_frto;
 int sysctl_tcp_nometrics_save;
+int sysctl_tcp_westwood;
+int sysctl_tcp_vegas_cong_avoid;
 
 int sysctl_tcp_moderate_rcvbuf = 1;
 
+/* Default values of the Vegas variables, in fixed-point representation
+ * with V_PARAM_SHIFT bits to the right of the binary point.
+ */
+#define V_PARAM_SHIFT 1
+int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT;
+int sysctl_tcp_vegas_beta  = 3<<V_PARAM_SHIFT;
+int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT;
+int sysctl_tcp_bic = 1;
+int sysctl_tcp_bic_fast_convergence = 1;
+int sysctl_tcp_bic_low_window = 14;
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
+static void init_bictcp(struct tcp_sock *tp)
+{
+	tp->bictcp.cnt = 0;
+
+	tp->bictcp.last_max_cwnd = 0;
+	tp->bictcp.last_cwnd = 0;
+	tp->bictcp.last_stamp = 0;
+}
+
 /* 5. Recalculate window clamp after socket hit its memory bounds. */
 static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
 {
@@ -534,6 +558,45 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
 	tcp_grow_window(sk, tp, skb);
 }
 
+/* When starting a new connection, pin down the current choice of
+ * congestion algorithm.
+ */
+void tcp_ca_init(struct tcp_sock *tp)
+{
+	if (sysctl_tcp_westwood)
+		tp->adv_cong = TCP_WESTWOOD;
+	else if (sysctl_tcp_bic)
+		tp->adv_cong = TCP_BIC;
+	else if (sysctl_tcp_vegas_cong_avoid) {
+		tp->adv_cong = TCP_VEGAS;
+		tp->vegas.baseRTT = 0x7fffffff;
+		tcp_vegas_enable(tp);
+	}
+}
+
+/* Do RTT sampling needed for Vegas.
+ * Basically we:
+ *   o min-filter RTT samples from within an RTT to get the current
+ *     propagation delay + queuing delay (we are min-filtering to try to
+ *     avoid the effects of delayed ACKs)
+ *   o min-filter RTT samples from a much longer window (forever for now)
+ *     to find the propagation delay (baseRTT)
+ */
+static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
+{
+	__u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */
+
+	/* Filter to find propagation delay: */
+	if (vrtt < tp->vegas.baseRTT)
+		tp->vegas.baseRTT = vrtt;
+
+	/* Find the min RTT during the last RTT to find
+	 * the current prop. delay + queuing delay:
+	 */
+	tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt);
+	tp->vegas.cntRTT++;
+}
+
 /* Called to compute a smoothed rtt estimate. The data fed to this
  * routine either comes from timestamps, or from segments that were
  * known _not_ to have been retransmitted [see Karn/Partridge
@@ -543,10 +606,13 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
  * To save cycles in the RFC 1323 implementation it was better to break
  * it up into three procedures. -- erics
  */
-static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt)
+static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt)
 {
 	long m = mrtt; /* RTT */
 
+	if (tcp_vegas_enabled(tp))
+		vegas_rtt_calc(tp, mrtt);
+
 	/* The following amusing code comes from Jacobson's
 	 * article in SIGCOMM '88. Note that rtt and mdev
 	 * are scaled versions of rtt and mean deviation.
@@ -604,8 +670,7 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt)
 		tp->rtt_seq = tp->snd_nxt;
 	}
 
-	if (tp->ca_ops->rtt_sample)
-		tp->ca_ops->rtt_sample(tp, *usrtt);
+	tcp_westwood_update_rtt(tp, tp->srtt >> 3);
 }
 
 /* Calculate rto without backoff.
This is the second half of Van Jacobson's @@ -1120,8 +1185,8 @@ void tcp_enter_frto(struct sock *sk) tp->snd_una == tp->high_seq || (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); - tcp_ca_event(tp, CA_EVENT_FRTO); + if (!tcp_westwood_ssthresh(tp)) + tp->snd_ssthresh = tcp_recalc_ssthresh(tp); } /* Have to clear retransmission markers here to keep the bookkeeping @@ -1187,6 +1252,8 @@ static void tcp_enter_frto_loss(struct sock *sk) tcp_set_ca_state(tp, TCP_CA_Loss); tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); + + init_bictcp(tp); } void tcp_clear_retrans(struct tcp_sock *tp) @@ -1216,8 +1283,7 @@ void tcp_enter_loss(struct sock *sk, int how) if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); - tcp_ca_event(tp, CA_EVENT_LOSS); + tp->snd_ssthresh = tcp_recalc_ssthresh(tp); } tp->snd_cwnd = 1; tp->snd_cwnd_cnt = 0; @@ -1530,14 +1596,28 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) } /* Decrease cwnd each second ack. */ + static void tcp_cwnd_down(struct tcp_sock *tp) { int decr = tp->snd_cwnd_cnt + 1; + __u32 limit; + + /* + * TCP Westwood + * Here limit is evaluated as BWestimation*RTTmin (for obtaining it + * in packets we use mss_cache). If sysctl_tcp_westwood is off + * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is + * still used as usual. It prevents other strange cases in which + * BWE*RTTmin could assume value 0. It should not happen but... + */ + + if (!(limit = tcp_westwood_bw_rttmin(tp))) + limit = tp->snd_ssthresh/2; tp->snd_cwnd_cnt = decr&1; decr >>= 1; - if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp)) + if (decr && tp->snd_cwnd > limit) tp->snd_cwnd -= decr; tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); @@ -1574,8 +1654,8 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg) static void tcp_undo_cwr(struct tcp_sock *tp, int undo) { if (tp->prior_ssthresh) { - if (tp->ca_ops->undo_cwnd) - tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp); + if (tcp_is_bic(tp)) + tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd); else tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); @@ -1687,9 +1767,11 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) static inline void tcp_complete_cwr(struct tcp_sock *tp) { - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + if (tcp_westwood_cwnd(tp)) + tp->snd_ssthresh = tp->snd_cwnd; + else + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); tp->snd_cwnd_stamp = tcp_time_stamp; - tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR); } static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) @@ -1864,7 +1946,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, if (tp->ca_state < TCP_CA_CWR) { if (!(flag&FLAG_ECE)) tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); + tp->snd_ssthresh = tcp_recalc_ssthresh(tp); TCP_ECN_queue_cwr(tp); } @@ -1881,7 +1963,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* Read draft-ietf-tcplw-high-performance before mucking * with this code. 
(Supersedes RFC1323) */
-static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag)
+static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag)
 {
 	__u32 seq_rtt;
 
@@ -1901,13 +1983,13 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag)
 	 * in window is lost... Voila. 	 --ANK (010210)
 	 */
 	seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
-	tcp_rtt_estimator(tp, seq_rtt, usrtt);
+	tcp_rtt_estimator(tp, seq_rtt);
 	tcp_set_rto(tp);
 	tp->backoff = 0;
 	tcp_bound_rto(tp);
 }
 
-static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag)
+static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag)
 {
 	/* We don't have a timestamp. Can only use
 	 * packets that are not retransmitted to determine
@@ -1921,29 +2003,338 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int
 	if (flag & FLAG_RETRANS_DATA_ACKED)
 		return;
 
-	tcp_rtt_estimator(tp, seq_rtt, usrtt);
+	tcp_rtt_estimator(tp, seq_rtt);
 	tcp_set_rto(tp);
 	tp->backoff = 0;
 	tcp_bound_rto(tp);
 }
 
 static inline void tcp_ack_update_rtt(struct tcp_sock *tp,
-				      int flag, s32 seq_rtt, u32 *usrtt)
+				      int flag, s32 seq_rtt)
 {
 	/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
 	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
-		tcp_ack_saw_tstamp(tp, usrtt, flag);
+		tcp_ack_saw_tstamp(tp, flag);
 	else if (seq_rtt >= 0)
-		tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag);
+		tcp_ack_no_tstamp(tp, seq_rtt, flag);
 }
 
-static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
-				  u32 in_flight, int good)
+/*
+ * Compute congestion window to use.
+ *
+ * This is from the implementation of BICTCP in
+ * Lisong Xu, Khaled Harfoush, and Injong Rhee.
+ *  "Binary Increase Congestion Control for Fast, Long Distance
+ *  Networks" in IEEE INFOCOM 2004
+ * Available from:
+ *  http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
+ *
+ * Unless BIC is enabled and congestion window is large
+ * this behaves the same as the original Reno.
+ */
+static inline __u32 bictcp_cwnd(struct tcp_sock *tp)
+{
+	/* original Reno behaviour */
+	if (!tcp_is_bic(tp))
+		return tp->snd_cwnd;
+
+	if (tp->bictcp.last_cwnd == tp->snd_cwnd &&
+	   (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5))
+		return tp->bictcp.cnt;
+
+	tp->bictcp.last_cwnd = tp->snd_cwnd;
+	tp->bictcp.last_stamp = tcp_time_stamp;
+
+	/* start off normal */
+	if (tp->snd_cwnd <= sysctl_tcp_bic_low_window)
+		tp->bictcp.cnt = tp->snd_cwnd;
+
+	/* binary increase */
+	else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) {
+		__u32	dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd)
+			/ BICTCP_B;
+
+		if (dist > BICTCP_MAX_INCREMENT)
+			/* linear increase */
+			tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
+		else if (dist <= 1U)
+			/* binary search increase */
+			tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
+				/ BICTCP_B;
+		else
+			/* binary search increase */
+			tp->bictcp.cnt = tp->snd_cwnd / dist;
+	} else {
+		/* slow start and linear increase */
+		if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B)
+			/* slow start */
+			tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
+				/ BICTCP_B;
+		else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd
+					+ BICTCP_MAX_INCREMENT*(BICTCP_B-1))
+			/* slow start */
+			tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1)
+				/ (tp->snd_cwnd-tp->bictcp.last_max_cwnd);
+		else
+			/* linear increase */
+			tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
+	}
+	return tp->bictcp.cnt;
+}
+
+/* This is Jacobson's slow start and congestion avoidance.
+ * SIGCOMM '88, p. 328.
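 *
 * (A worked example, ours: in congestion avoidance with snd_cwnd = 10
 * and BIC disabled, bictcp_cwnd() above returns 10, so snd_cwnd_cnt
 * must count ten ACKs before cwnd becomes 11 -- the classical one
 * segment per RTT.)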
+ */ +static inline void reno_cong_avoid(struct tcp_sock *tp) { - tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good); + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* In "safe" area, increase. */ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } else { + /* In dangerous area, increase slowly. + * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd + */ + if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt=0; + } else + tp->snd_cwnd_cnt++; + } tp->snd_cwnd_stamp = tcp_time_stamp; } +/* This is based on the congestion detection/avoidance scheme described in + * Lawrence S. Brakmo and Larry L. Peterson. + * "TCP Vegas: End to end congestion avoidance on a global internet." + * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, + * October 1995. Available from: + * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps + * + * See http://www.cs.arizona.edu/xkernel/ for their implementation. + * The main aspects that distinguish this implementation from the + * Arizona Vegas implementation are: + * o We do not change the loss detection or recovery mechanisms of + * Linux in any way. Linux already recovers from losses quite well, + * using fine-grained timers, NewReno, and FACK. + * o To avoid the performance penalty imposed by increasing cwnd + * only every-other RTT during slow start, we increase during + * every RTT during slow start, just like Reno. + * o Largely to allow continuous cwnd growth during slow start, + * we use the rate at which ACKs come back as the "actual" + * rate, rather than the rate at which data is sent. + * o To speed convergence to the right rate, we set the cwnd + * to achieve the right ("actual") rate when we exit slow start. + * o To filter out the noise caused by delayed ACKs, we use the + * minimum RTT sample observed during the last RTT to calculate + * the actual rate. + * o When the sender re-starts from idle, it waits until it has + * received ACKs for an entire flight of new data before making + * a cwnd adjustment decision. The original Vegas implementation + * assumed senders never went idle. + */ +static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) +{ + /* The key players are v_beg_snd_una and v_beg_snd_nxt. + * + * These are so named because they represent the approximate values + * of snd_una and snd_nxt at the beginning of the current RTT. More + * precisely, they represent the amount of data sent during the RTT. + * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, + * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding + * bytes of data have been ACKed during the course of the RTT, giving + * an "actual" rate of: + * + * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) + * + * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, + * because delayed ACKs can cover more than one segment, so they + * don't line up nicely with the boundaries of RTTs. + * + * Another unfortunate fact of life is that delayed ACKs delay the + * advance of the left edge of our send window, so that the number + * of bytes we send in an RTT is often less than our cwnd will allow. + * So we keep track of our cwnd separately, in v_beg_snd_cwnd. + */ + + if (after(ack, tp->vegas.beg_snd_nxt)) { + /* Do the Vegas once-per-RTT cwnd adjustment. 
*/ + u32 old_wnd, old_snd_cwnd; + + + /* Here old_wnd is essentially the window of data that was + * sent during the previous RTT, and has all + * been acknowledged in the course of the RTT that ended + * with the ACK we just received. Likewise, old_snd_cwnd + * is the cwnd during the previous RTT. + */ + old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) / + tp->mss_cache_std; + old_snd_cwnd = tp->vegas.beg_snd_cwnd; + + /* Save the extent of the current window so we can use this + * at the end of the next RTT. + */ + tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt; + tp->vegas.beg_snd_nxt = tp->snd_nxt; + tp->vegas.beg_snd_cwnd = tp->snd_cwnd; + + /* Take into account the current RTT sample too, to + * decrease the impact of delayed acks. This double counts + * this sample since we count it for the next window as well, + * but that's not too awful, since we're taking the min, + * rather than averaging. + */ + vegas_rtt_calc(tp, seq_rtt); + + /* We do the Vegas calculations only if we got enough RTT + * samples that we can be reasonably sure that we got + * at least one RTT sample that wasn't from a delayed ACK. + * If we only had 2 samples total, + * then that means we're getting only 1 ACK per RTT, which + * means they're almost certainly delayed ACKs. + * If we have 3 samples, we should be OK. + */ + + if (tp->vegas.cntRTT <= 2) { + /* We don't have enough RTT samples to do the Vegas + * calculation, so we'll behave like Reno. + */ + if (tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd++; + } else { + u32 rtt, target_cwnd, diff; + + /* We have enough RTT samples, so, using the Vegas + * algorithm, we determine if we should increase or + * decrease cwnd, and by how much. + */ + + /* Pluck out the RTT we are using for the Vegas + * calculations. This is the min RTT seen during the + * last RTT. Taking the min filters out the effects + * of delayed ACKs, at the cost of noticing congestion + * a bit later. + */ + rtt = tp->vegas.minRTT; + + /* Calculate the cwnd we should have, if we weren't + * going too fast. + * + * This is: + * (actual rate in segments) * baseRTT + * We keep it as a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary point. + */ + target_cwnd = ((old_wnd * tp->vegas.baseRTT) + << V_PARAM_SHIFT) / rtt; + + /* Calculate the difference between the window we had, + * and the window we would like to have. This quantity + * is the "Diff" from the Arizona Vegas papers. + * + * Again, this is a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary + * point. + */ + diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; + + if (tp->snd_cwnd < tp->snd_ssthresh) { + /* Slow start. */ + if (diff > sysctl_tcp_vegas_gamma) { + /* Going too fast. Time to slow down + * and switch to congestion avoidance. + */ + tp->snd_ssthresh = 2; + + /* Set cwnd to match the actual rate + * exactly: + * cwnd = (actual rate) * baseRTT + * Then we add 1 because the integer + * truncation robs us of full link + * utilization. + */ + tp->snd_cwnd = min(tp->snd_cwnd, + (target_cwnd >> + V_PARAM_SHIFT)+1); + + } + } else { + /* Congestion avoidance. */ + u32 next_snd_cwnd; + + /* Figure out where we would like cwnd + * to be. + */ + if (diff > sysctl_tcp_vegas_beta) { + /* The old window was too fast, so + * we slow down. + */ + next_snd_cwnd = old_snd_cwnd - 1; + } else if (diff < sysctl_tcp_vegas_alpha) { + /* We don't have enough extra packets + * in the network, so speed up. 
+ */ + next_snd_cwnd = old_snd_cwnd + 1; + } else { + /* Sending just as fast as we + * should be. + */ + next_snd_cwnd = old_snd_cwnd; + } + + /* Adjust cwnd upward or downward, toward the + * desired value. + */ + if (next_snd_cwnd > tp->snd_cwnd) + tp->snd_cwnd++; + else if (next_snd_cwnd < tp->snd_cwnd) + tp->snd_cwnd--; + } + } + + /* Wipe the slate clean for the next RTT. */ + tp->vegas.cntRTT = 0; + tp->vegas.minRTT = 0x7fffffff; + } + + /* The following code is executed for every ack we receive, + * except for conditions checked in should_advance_cwnd() + * before the call to tcp_cong_avoid(). Mainly this means that + * we only execute this code if the ack actually acked some + * data. + */ + + /* If we are in slow start, increase our cwnd in response to this ACK. + * (If we are not in slow start then we are in congestion avoidance, + * and adjust our congestion window only once per RTT. See the code + * above.) + */ + if (tp->snd_cwnd <= tp->snd_ssthresh) + tp->snd_cwnd++; + + /* to keep cwnd from growing without bound */ + tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); + + /* Make sure that we are never so timid as to reduce our cwnd below + * 2 MSS. + * + * Going below 2 MSS would risk huge delayed ACKs from our receiver. + */ + tp->snd_cwnd = max(tp->snd_cwnd, 2U); + + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) +{ + if (tcp_vegas_enabled(tp)) + vegas_cong_avoid(tp, ack, seq_rtt); + else + reno_cong_avoid(tp); +} + /* Restart timer after forward progress on connection. * RFC2988 recommends to restart timer to now+rto. */ @@ -2024,18 +2415,13 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, /* Remove acknowledged frames from the retransmission queue. */ -static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt) +static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; __u32 now = tcp_time_stamp; int acked = 0; __s32 seq_rtt = -1; - struct timeval usnow; - u32 pkts_acked = 0; - - if (seq_usrtt) - do_gettimeofday(&usnow); while ((skb = skb_peek(&sk->sk_write_queue)) && skb != sk->sk_send_head) { @@ -2062,7 +2448,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt */ if (!(scb->flags & TCPCB_FLAG_SYN)) { acked |= FLAG_DATA_ACKED; - ++pkts_acked; } else { acked |= FLAG_SYN_ACKED; tp->retrans_stamp = 0; @@ -2076,10 +2461,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt seq_rtt = -1; } else if (seq_rtt < 0) seq_rtt = now - scb->when; - if (seq_usrtt) - *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000 - + (usnow.tv_usec - skb->stamp.tv_usec); - if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= tcp_skb_pcount(skb); if (sacked & TCPCB_LOST) @@ -2098,11 +2479,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt } if (acked&FLAG_ACKED) { - tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt); + tcp_ack_update_rtt(tp, acked, seq_rtt); tcp_ack_packets_out(sk, tp); - - if (tp->ca_ops->pkts_acked) - tp->ca_ops->pkts_acked(tp, pkts_acked); } #if FASTRETRANS_DEBUG > 0 @@ -2246,6 +2624,257 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) tp->frto_counter = (tp->frto_counter + 1) % 3; } +/* + * TCP Westwood+ + */ + +/* + * @init_westwood + * This function initializes fields used in TCP Westwood+. 
We can't + * get no information about RTTmin at this time so we simply set it to + * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative + * since in this way we're sure it will be updated in a consistent + * way as soon as possible. It will reasonably happen within the first + * RTT period of the connection lifetime. + */ + +static void init_westwood(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->westwood.bw_ns_est = 0; + tp->westwood.bw_est = 0; + tp->westwood.accounted = 0; + tp->westwood.cumul_ack = 0; + tp->westwood.rtt_win_sx = tcp_time_stamp; + tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT; + tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT; + tp->westwood.snd_una = tp->snd_una; +} + +/* + * @westwood_do_filter + * Low-pass filter. Implemented using constant coeffients. + */ + +static inline __u32 westwood_do_filter(__u32 a, __u32 b) +{ + return (((7 * a) + b) >> 3); +} + +static void westwood_filter(struct sock *sk, __u32 delta) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->westwood.bw_ns_est = + westwood_do_filter(tp->westwood.bw_ns_est, + tp->westwood.bk / delta); + tp->westwood.bw_est = + westwood_do_filter(tp->westwood.bw_est, + tp->westwood.bw_ns_est); +} + +/* + * @westwood_update_rttmin + * It is used to update RTTmin. In this case we MUST NOT use + * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN! + */ + +static inline __u32 westwood_update_rttmin(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + __u32 rttmin = tp->westwood.rtt_min; + + if (tp->westwood.rtt != 0 && + (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin)) + rttmin = tp->westwood.rtt; + + return rttmin; +} + +/* + * @westwood_acked + * Evaluate increases for dk. + */ + +static inline __u32 westwood_acked(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + return tp->snd_una - tp->westwood.snd_una; +} + +/* + * @westwood_new_window + * It evaluates if we are receiving data inside the same RTT window as + * when we started. + * Return value: + * It returns 0 if we are still evaluating samples in the same RTT + * window, 1 if the sample has to be considered in the next window. + */ + +static int westwood_new_window(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + __u32 left_bound; + __u32 rtt; + int ret = 0; + + left_bound = tp->westwood.rtt_win_sx; + rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN); + + /* + * A RTT-window has passed. Be careful since if RTT is less than + * 50ms we don't filter but we continue 'building the sample'. + * This minimum limit was choosen since an estimation on small + * time intervals is better to avoid... + * Obvioulsy on a LAN we reasonably will always have + * right_bound = left_bound + WESTWOOD_RTT_MIN + */ + + if ((left_bound + rtt) < tcp_time_stamp) + ret = 1; + + return ret; +} + +/* + * @westwood_update_window + * It updates RTT evaluation window if it is the right moment to do + * it. If so it calls filter for evaluating bandwidth. + */ + +static void __westwood_update_window(struct sock *sk, __u32 now) +{ + struct tcp_sock *tp = tcp_sk(sk); + __u32 delta = now - tp->westwood.rtt_win_sx; + + if (delta) { + if (tp->westwood.rtt) + westwood_filter(sk, delta); + + tp->westwood.bk = 0; + tp->westwood.rtt_win_sx = tcp_time_stamp; + } +} + + +static void westwood_update_window(struct sock *sk, __u32 now) +{ + if (westwood_new_window(sk)) + __westwood_update_window(sk, now); +} + +/* + * @__tcp_westwood_fast_bw + * It is called when we are in fast path. 
In particular it is called when + * header prediction is successfull. In such case infact update is + * straight forward and doesn't need any particular care. + */ + +static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + westwood_update_window(sk, tcp_time_stamp); + + tp->westwood.bk += westwood_acked(sk); + tp->westwood.snd_una = tp->snd_una; + tp->westwood.rtt_min = westwood_update_rttmin(sk); +} + +static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) +{ + if (tcp_is_westwood(tcp_sk(sk))) + __tcp_westwood_fast_bw(sk, skb); +} + + +/* + * @westwood_dupack_update + * It updates accounted and cumul_ack when receiving a dupack. + */ + +static void westwood_dupack_update(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->westwood.accounted += tp->mss_cache_std; + tp->westwood.cumul_ack = tp->mss_cache_std; +} + +static inline int westwood_may_change_cumul(struct tcp_sock *tp) +{ + return (tp->westwood.cumul_ack > tp->mss_cache_std); +} + +static inline void westwood_partial_update(struct tcp_sock *tp) +{ + tp->westwood.accounted -= tp->westwood.cumul_ack; + tp->westwood.cumul_ack = tp->mss_cache_std; +} + +static inline void westwood_complete_update(struct tcp_sock *tp) +{ + tp->westwood.cumul_ack -= tp->westwood.accounted; + tp->westwood.accounted = 0; +} + +/* + * @westwood_acked_count + * This function evaluates cumul_ack for evaluating dk in case of + * delayed or partial acks. + */ + +static inline __u32 westwood_acked_count(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->westwood.cumul_ack = westwood_acked(sk); + + /* If cumul_ack is 0 this is a dupack since it's not moving + * tp->snd_una. + */ + if (!(tp->westwood.cumul_ack)) + westwood_dupack_update(sk); + + if (westwood_may_change_cumul(tp)) { + /* Partial or delayed ack */ + if (tp->westwood.accounted >= tp->westwood.cumul_ack) + westwood_partial_update(tp); + else + westwood_complete_update(tp); + } + + tp->westwood.snd_una = tp->snd_una; + + return tp->westwood.cumul_ack; +} + + +/* + * @__tcp_westwood_slow_bw + * It is called when something is going wrong..even if there could + * be no problems! Infact a simple delayed packet may trigger a + * dupack. But we need to be careful in such case. + */ + +static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + westwood_update_window(sk, tcp_time_stamp); + + tp->westwood.bk += westwood_acked_count(sk); + tp->westwood.rtt_min = westwood_update_rttmin(sk); +} + +static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) +{ + if (tcp_is_westwood(tcp_sk(sk))) + __tcp_westwood_slow_bw(sk, skb); +} + /* This routine deals with incoming acks, but not outgoing ones. 
*/ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) { @@ -2255,7 +2884,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) u32 ack = TCP_SKB_CB(skb)->ack_seq; u32 prior_in_flight; s32 seq_rtt; - s32 seq_usrtt = 0; int prior_packets; /* If the ack is newer than sent or older than previous acks @@ -2274,10 +2902,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) */ tcp_update_wl(tp, ack, ack_seq); tp->snd_una = ack; + tcp_westwood_fast_bw(sk, skb); flag |= FLAG_WIN_UPDATE; - tcp_ca_event(tp, CA_EVENT_FAST_ACK); - NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); } else { if (ack_seq != TCP_SKB_CB(skb)->end_seq) @@ -2293,7 +2920,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) flag |= FLAG_ECE; - tcp_ca_event(tp, CA_EVENT_SLOW_ACK); + tcp_westwood_slow_bw(sk,skb); } /* We passed data and got it acked, remove any soft error @@ -2308,20 +2935,22 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) prior_in_flight = tcp_packets_in_flight(tp); /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, &seq_rtt, - tp->ca_ops->rtt_sample ? &seq_usrtt : NULL); + flag |= tcp_clean_rtx_queue(sk, &seq_rtt); if (tp->frto_counter) tcp_process_frto(sk, prior_snd_una); if (tcp_ack_is_dubious(tp, flag)) { /* Advanve CWND, if state allows this. */ - if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag)) - tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0); + if ((flag & FLAG_DATA_ACKED) && + (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) && + tcp_may_raise_cwnd(tp, flag)) + tcp_cong_avoid(tp, ack, seq_rtt); tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); } else { - if ((flag & FLAG_DATA_ACKED)) - tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1); + if ((flag & FLAG_DATA_ACKED) && + (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd)) + tcp_cong_avoid(tp, ack, seq_rtt); } if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) @@ -3923,8 +4552,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_metrics(sk); - tcp_init_congestion_control(tp); - /* Prevent spurious tcp_cwnd_restart() on first data * packet. */ @@ -4081,6 +4708,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if(tp->af_specific->conn_request(sk, skb) < 0) return 1; + init_westwood(sk); + init_bictcp(tp); + /* Now we have several options: In theory there is * nothing else in the frame. KA9Q has an option to * send data with the syn, BSD accepts data with the @@ -4102,6 +4732,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; case TCP_SYN_SENT: + init_westwood(sk); + init_bictcp(tp); + queued = tcp_rcv_synsent_state_process(sk, skb, th, len); if (queued >= 0) return queued; @@ -4183,7 +4816,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !tp->srtt) - tcp_ack_saw_tstamp(tp, 0, 0); + tcp_ack_saw_tstamp(tp, 0); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; @@ -4195,8 +4828,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_metrics(sk); - tcp_init_congestion_control(tp); - /* Prevent spurious tcp_cwnd_restart() on * first data packet. 
*/ diff --git a/trunk/net/ipv4/tcp_ipv4.c b/trunk/net/ipv4/tcp_ipv4.c index 9122814c13ad..2d41d5d6ad19 100644 --- a/trunk/net/ipv4/tcp_ipv4.c +++ b/trunk/net/ipv4/tcp_ipv4.c @@ -2048,7 +2048,6 @@ static int tcp_v4_init_sock(struct sock *sk) tp->mss_cache_std = tp->mss_cache = 536; tp->reordering = sysctl_tcp_reordering; - tp->ca_ops = &tcp_reno; sk->sk_state = TCP_CLOSE; @@ -2071,8 +2070,6 @@ int tcp_v4_destroy_sock(struct sock *sk) tcp_clear_xmit_timers(sk); - tcp_cleanup_congestion_control(tp); - /* Cleanup up the write buffer. */ sk_stream_writequeue_purge(sk); diff --git a/trunk/net/ipv4/tcp_minisocks.c b/trunk/net/ipv4/tcp_minisocks.c index f42a284164b7..b3943e7562f3 100644 --- a/trunk/net/ipv4/tcp_minisocks.c +++ b/trunk/net/ipv4/tcp_minisocks.c @@ -774,8 +774,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->frto_counter = 0; newtp->frto_highmark = 0; - newtp->ca_ops = &tcp_reno; - tcp_set_ca_state(newtp, TCP_CA_Open); tcp_init_xmit_timers(newsk); skb_queue_head_init(&newtp->out_of_order_queue); @@ -844,6 +842,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, if (newtp->ecn_flags&TCP_ECN_OK) sock_set_flag(newsk, SOCK_NO_LARGESEND); + tcp_ca_init(newtp); + TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS); } return newsk; diff --git a/trunk/net/ipv4/tcp_output.c b/trunk/net/ipv4/tcp_output.c index 0e17c244875c..f17c6577e337 100644 --- a/trunk/net/ipv4/tcp_output.c +++ b/trunk/net/ipv4/tcp_output.c @@ -111,7 +111,8 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) u32 restart_cwnd = tcp_init_cwnd(tp, dst); u32 cwnd = tp->snd_cwnd; - tcp_ca_event(tp, CA_EVENT_CWND_RESTART); + if (tcp_is_vegas(tp)) + tcp_vegas_enable(tp); tp->snd_ssthresh = tcp_current_ssthresh(tp); restart_cwnd = min(restart_cwnd, cwnd); @@ -279,10 +280,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) #define SYSCTL_FLAG_WSCALE 0x2 #define SYSCTL_FLAG_SACK 0x4 - /* If congestion control is doing timestamping */ - if (tp->ca_ops->rtt_sample) - do_gettimeofday(&skb->stamp); - sysctl_flags = 0; if (tcb->flags & TCPCB_FLAG_SYN) { tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; @@ -307,8 +304,17 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); } - if (tcp_packets_in_flight(tp) == 0) - tcp_ca_event(tp, CA_EVENT_TX_START); + /* + * If the connection is idle and we are restarting, + * then we don't want to do any Vegas calculations + * until we get fresh RTT samples. So when we + * restart, we reset our Vegas state to a clean + * slate. After we get acks for this flight of + * packets, _then_ we can make Vegas calculations + * again. + */ + if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0) + tcp_vegas_enable(tp); th = (struct tcphdr *) skb_push(skb, tcp_header_size); skb->h.th = th; @@ -515,7 +521,6 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) * skbs, which it never sent before. --ANK */ TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; - buff->stamp = skb->stamp; if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { tp->lost_out -= tcp_skb_pcount(skb); @@ -1444,6 +1449,7 @@ static inline void tcp_connect_init(struct sock *sk) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); tp->advmss = dst_metric(dst, RTAX_ADVMSS); tcp_initialize_rcv_mss(sk); + tcp_ca_init(tp); tcp_select_initial_window(tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? 
				  tp->tcp_header_len - sizeof(struct tcphdr) : 0),
@@ -1497,6 +1503,7 @@ int tcp_connect(struct sock *sk)
 	TCP_SKB_CB(buff)->end_seq = tp->write_seq;
 	tp->snd_nxt = tp->write_seq;
 	tp->pushed_seq = tp->write_seq;
+	tcp_ca_init(tp);
 
 	/* Send it off. */
 	TCP_SKB_CB(buff)->when = tcp_time_stamp;
diff --git a/trunk/net/ipv4/tcp_scalable.c b/trunk/net/ipv4/tcp_scalable.c
deleted file mode 100644
index 70e108e15c71..000000000000
--- a/trunk/net/ipv4/tcp_scalable.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Tom Kelly's Scalable TCP
- *
- * See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
- *
- * John Heffner
- */
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <net/tcp.h>
-
-/* These factors derived from the recommended values in the paper:
- * .01 and 7/8. We use 50 instead of 100 to account for
- * delayed ack.
- */
-#define TCP_SCALABLE_AI_CNT	50U
-#define TCP_SCALABLE_MD_SCALE	3
-
-static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
-				    u32 in_flight, int flag)
-{
-	if (in_flight < tp->snd_cwnd)
-		return;
-
-	if (tp->snd_cwnd <= tp->snd_ssthresh) {
-		tp->snd_cwnd++;
-	} else {
-		tp->snd_cwnd_cnt++;
-		if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
-			tp->snd_cwnd++;
-			tp->snd_cwnd_cnt = 0;
-		}
-	}
-	tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
-	tp->snd_cwnd_stamp = tcp_time_stamp;
-}
-
-static u32 tcp_scalable_ssthresh(struct tcp_sock *tp)
-{
-	return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
-}
-
-
-static struct tcp_congestion_ops tcp_scalable = {
-	.ssthresh	= tcp_scalable_ssthresh,
-	.cong_avoid	= tcp_scalable_cong_avoid,
-	.min_cwnd	= tcp_reno_min_cwnd,
-
-	.owner		= THIS_MODULE,
-	.name		= "scalable",
-};
-
-static int __init tcp_scalable_register(void)
-{
-	return tcp_register_congestion_control(&tcp_scalable);
-}
-
-static void __exit tcp_scalable_unregister(void)
-{
-	tcp_unregister_congestion_control(&tcp_scalable);
-}
-
-module_init(tcp_scalable_register);
-module_exit(tcp_scalable_unregister);
-
-MODULE_AUTHOR("John Heffner");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Scalable TCP");
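[Editor's sketch, not part of the patch.] As a sanity check on the constants in
the deleted tcp_scalable.c above, this standalone program (plain C, nothing
kernel-specific) replays one congestion-avoidance RTT and one loss event:

    #include <stdint.h>
    #include <stdio.h>

    /* With AI_CNT = 50, congestion avoidance adds one segment per ~50
     * ACKs, i.e. roughly 1-2% window growth per RTT; MD_SCALE = 3 cuts
     * cwnd by 1/8 on loss, leaving the recommended 7/8. */
    #define AI_CNT   50U
    #define MD_SCALE 3

    int main(void)
    {
            uint32_t cwnd = 100, cnt = 0;

            /* One RTT's worth of ACKs (one per segment). */
            for (uint32_t acks = 0; acks < 100; acks++) {
                    if (++cnt > (cwnd < AI_CNT ? cwnd : AI_CNT)) {
                            cwnd++;
                            cnt = 0;
                    }
            }
            printf("after one RTT: cwnd = %u\n", cwnd); /* 101, cnt carries */

            cwnd -= cwnd >> MD_SCALE;                   /* multiplicative decrease */
            printf("after loss:    cwnd = %u\n", cwnd); /* 89, about 7/8 of 101 */
            return 0;
    }
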
diff --git a/trunk/net/ipv4/tcp_vegas.c b/trunk/net/ipv4/tcp_vegas.c
deleted file mode 100644
index 9bd443db5193..000000000000
--- a/trunk/net/ipv4/tcp_vegas.c
+++ /dev/null
@@ -1,411 +0,0 @@
-/*
- * TCP Vegas congestion control
- *
- * This is based on the congestion detection/avoidance scheme described in
- *    Lawrence S. Brakmo and Larry L. Peterson.
- *    "TCP Vegas: End to end congestion avoidance on a global internet."
- *    IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
- *    October 1995. Available from:
- *    ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
- *
- * See http://www.cs.arizona.edu/xkernel/ for their implementation.
- * The main aspects that distinguish this implementation from the
- * Arizona Vegas implementation are:
- *   o We do not change the loss detection or recovery mechanisms of
- *     Linux in any way. Linux already recovers from losses quite well,
- *     using fine-grained timers, NewReno, and FACK.
- *   o To avoid the performance penalty imposed by increasing cwnd
- *     only every-other RTT during slow start, we increase during
- *     every RTT during slow start, just like Reno.
- *   o Largely to allow continuous cwnd growth during slow start,
- *     we use the rate at which ACKs come back as the "actual"
- *     rate, rather than the rate at which data is sent.
- *   o To speed convergence to the right rate, we set the cwnd
- *     to achieve the right ("actual") rate when we exit slow start.
- *   o To filter out the noise caused by delayed ACKs, we use the
- *     minimum RTT sample observed during the last RTT to calculate
- *     the actual rate.
- *   o When the sender re-starts from idle, it waits until it has
- *     received ACKs for an entire flight of new data before making
- *     a cwnd adjustment decision. The original Vegas implementation
- *     assumed senders never went idle.
- */
-
-#include <linux/config.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/tcp_diag.h>
-
-#include <net/tcp.h>
-
-/* Default values of the Vegas variables, in fixed-point representation
- * with V_PARAM_SHIFT bits to the right of the binary point.
- */
-#define V_PARAM_SHIFT 1
-static int alpha = 1<<V_PARAM_SHIFT;
-static int beta  = 3<<V_PARAM_SHIFT;
-static int gamma = 1<<V_PARAM_SHIFT;
-
-module_param(alpha, int, 0644);
-MODULE_PARM_DESC(alpha, "lower bound of packets in network (scale by 2)");
-module_param(beta, int, 0644);
-MODULE_PARM_DESC(beta, "upper bound of packets in network (scale by 2)");
-module_param(gamma, int, 0644);
-MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
-
-/* Vegas variables */
-struct vegas {
-	u32	beg_snd_nxt;	/* right edge during last RTT */
-	u32	beg_snd_una;	/* left edge  during last RTT */
-	u32	beg_snd_cwnd;	/* saves the size of the cwnd */
-	u8	doing_vegas_now;/* if true, do vegas for this RTT */
-	u16	cntRTT;		/* # of RTTs measured within last RTT */
-	u32	minRTT;		/* min of RTTs measured within last RTT (in usec) */
-	u32	baseRTT;	/* the min of all Vegas RTT measurements seen (in usec) */
-};
-
-/* There are several situations when we must "re-start" Vegas:
- *
- *  o when a connection is established
- *  o after an RTO
- *  o after fast recovery
- *  o when we send a packet and there is no outstanding
- *    unacknowledged data (restarting an idle connection)
- *
- * In these circumstances we cannot do a Vegas calculation at the
- * end of the first RTT, because any calculation we do is using
- * stale info -- both the saved cwnd and congestion feedback are
- * stale.
- *
- * Instead we must wait until the completion of an RTT during
- * which we actually receive ACKs.
- */
-static inline void vegas_enable(struct tcp_sock *tp)
-{
-	struct vegas *vegas = tcp_ca(tp);
-
-	/* Begin taking Vegas samples next time we send something. */
-	vegas->doing_vegas_now = 1;
-
-	/* Set the beginning of the next send window. */
-	vegas->beg_snd_nxt = tp->snd_nxt;
-
-	vegas->cntRTT = 0;
-	vegas->minRTT = 0x7fffffff;
-}
-
-/* Stop taking Vegas samples for now. */
-static inline void vegas_disable(struct tcp_sock *tp)
-{
-	struct vegas *vegas = tcp_ca(tp);
-
-	vegas->doing_vegas_now = 0;
-}
-
-static void tcp_vegas_init(struct tcp_sock *tp)
-{
-	struct vegas *vegas = tcp_ca(tp);
-
-	vegas->baseRTT = 0x7fffffff;
-	vegas_enable(tp);
-}
-
-/* Do RTT sampling needed for Vegas.
- * Basically we:
- *   o min-filter RTT samples from within an RTT to get the current
- *     propagation delay + queuing delay (we are min-filtering to try to
- *     avoid the effects of delayed ACKs)
- *   o min-filter RTT samples from a much longer window (forever for now)
- *     to find the propagation delay (baseRTT)
- */
-static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt)
-{
-	struct vegas *vegas = tcp_ca(tp);
-	u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */
-
-	/* Filter to find propagation delay: */
-	if (vrtt < vegas->baseRTT)
-		vegas->baseRTT = vrtt;
-
-	/* Find the min RTT during the last RTT to find
-	 * the current prop. delay + queuing delay:
-	 */
-	vegas->minRTT = min(vegas->minRTT, vrtt);
-	vegas->cntRTT++;
-}
-
-static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state)
-{
-
-	if (ca_state == TCP_CA_Open)
-		vegas_enable(tp);
-	else
-		vegas_disable(tp);
-}
-
-/*
- * If the connection is idle and we are restarting,
- * then we don't want to do any Vegas calculations
- * until we get fresh RTT samples.  So when we
- * restart, we reset our Vegas state to a clean
- * slate. After we get acks for this flight of
- * packets, _then_ we can make Vegas calculations
- * again.
- */
-static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event)
-{
-	if (event == CA_EVENT_CWND_RESTART ||
-	    event == CA_EVENT_TX_START)
-		tcp_vegas_init(tp);
-}
-
-static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
-				 u32 seq_rtt, u32 in_flight, int flag)
-{
-	struct vegas *vegas = tcp_ca(tp);
-
-	if (!vegas->doing_vegas_now)
-		return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag);
-
-	/* The key players are v_beg_snd_una and v_beg_snd_nxt.
-	 *
-	 * These are so named because they represent the approximate values
-	 * of snd_una and snd_nxt at the beginning of the current RTT. More
-	 * precisely, they represent the amount of data sent during the RTT.
- * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, - * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding - * bytes of data have been ACKed during the course of the RTT, giving - * an "actual" rate of: - * - * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) - * - * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, - * because delayed ACKs can cover more than one segment, so they - * don't line up nicely with the boundaries of RTTs. - * - * Another unfortunate fact of life is that delayed ACKs delay the - * advance of the left edge of our send window, so that the number - * of bytes we send in an RTT is often less than our cwnd will allow. - * So we keep track of our cwnd separately, in v_beg_snd_cwnd. - */ - - if (after(ack, vegas->beg_snd_nxt)) { - /* Do the Vegas once-per-RTT cwnd adjustment. */ - u32 old_wnd, old_snd_cwnd; - - - /* Here old_wnd is essentially the window of data that was - * sent during the previous RTT, and has all - * been acknowledged in the course of the RTT that ended - * with the ACK we just received. Likewise, old_snd_cwnd - * is the cwnd during the previous RTT. - */ - old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) / - tp->mss_cache; - old_snd_cwnd = vegas->beg_snd_cwnd; - - /* Save the extent of the current window so we can use this - * at the end of the next RTT. - */ - vegas->beg_snd_una = vegas->beg_snd_nxt; - vegas->beg_snd_nxt = tp->snd_nxt; - vegas->beg_snd_cwnd = tp->snd_cwnd; - - /* Take into account the current RTT sample too, to - * decrease the impact of delayed acks. This double counts - * this sample since we count it for the next window as well, - * but that's not too awful, since we're taking the min, - * rather than averaging. - */ - tcp_vegas_rtt_calc(tp, seq_rtt*1000); - - /* We do the Vegas calculations only if we got enough RTT - * samples that we can be reasonably sure that we got - * at least one RTT sample that wasn't from a delayed ACK. - * If we only had 2 samples total, - * then that means we're getting only 1 ACK per RTT, which - * means they're almost certainly delayed ACKs. - * If we have 3 samples, we should be OK. - */ - - if (vegas->cntRTT <= 2) { - /* We don't have enough RTT samples to do the Vegas - * calculation, so we'll behave like Reno. - */ - if (tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd++; - } else { - u32 rtt, target_cwnd, diff; - - /* We have enough RTT samples, so, using the Vegas - * algorithm, we determine if we should increase or - * decrease cwnd, and by how much. - */ - - /* Pluck out the RTT we are using for the Vegas - * calculations. This is the min RTT seen during the - * last RTT. Taking the min filters out the effects - * of delayed ACKs, at the cost of noticing congestion - * a bit later. - */ - rtt = vegas->minRTT; - - /* Calculate the cwnd we should have, if we weren't - * going too fast. - * - * This is: - * (actual rate in segments) * baseRTT - * We keep it as a fixed point number with - * V_PARAM_SHIFT bits to the right of the binary point. - */ - target_cwnd = ((old_wnd * vegas->baseRTT) - << V_PARAM_SHIFT) / rtt; - - /* Calculate the difference between the window we had, - * and the window we would like to have. This quantity - * is the "Diff" from the Arizona Vegas papers. - * - * Again, this is a fixed point number with - * V_PARAM_SHIFT bits to the right of the binary - * point. - */ - diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; - - if (tp->snd_cwnd < tp->snd_ssthresh) { - /* Slow start. */ - if (diff > gamma) { - /* Going too fast. 
Time to slow down
-					 * and switch to congestion avoidance.
-					 */
-					tp->snd_ssthresh = 2;
-
-					/* Set cwnd to match the actual rate
-					 * exactly:
-					 *   cwnd = (actual rate) * baseRTT
-					 * Then we add 1 because the integer
-					 * truncation robs us of full link
-					 * utilization.
-					 */
-					tp->snd_cwnd = min(tp->snd_cwnd,
-							   (target_cwnd >>
-							    V_PARAM_SHIFT)+1);
-
-				}
-			} else {
-				/* Congestion avoidance. */
-				u32 next_snd_cwnd;
-
-				/* Figure out where we would like cwnd
-				 * to be.
-				 */
-				if (diff > beta) {
-					/* The old window was too fast, so
-					 * we slow down.
-					 */
-					next_snd_cwnd = old_snd_cwnd - 1;
-				} else if (diff < alpha) {
-					/* We don't have enough extra packets
-					 * in the network, so speed up.
-					 */
-					next_snd_cwnd = old_snd_cwnd + 1;
-				} else {
-					/* Sending just as fast as we
-					 * should be.
-					 */
-					next_snd_cwnd = old_snd_cwnd;
-				}
-
-				/* Adjust cwnd upward or downward, toward the
-				 * desired value.
-				 */
-				if (next_snd_cwnd > tp->snd_cwnd)
-					tp->snd_cwnd++;
-				else if (next_snd_cwnd < tp->snd_cwnd)
-					tp->snd_cwnd--;
-			}
-		}
-
-		/* Wipe the slate clean for the next RTT. */
-		vegas->cntRTT = 0;
-		vegas->minRTT = 0x7fffffff;
-	}
-
-	/* The following code is executed for every ack we receive,
-	 * except for conditions checked in should_advance_cwnd()
-	 * before the call to tcp_cong_avoid(). Mainly this means that
-	 * we only execute this code if the ack actually acked some
-	 * data.
-	 */
-
-	/* If we are in slow start, increase our cwnd in response to this ACK.
-	 * (If we are not in slow start then we are in congestion avoidance,
-	 * and adjust our congestion window only once per RTT. See the code
-	 * above.)
-	 */
-	if (tp->snd_cwnd <= tp->snd_ssthresh)
-		tp->snd_cwnd++;
-
-	/* to keep cwnd from growing without bound */
-	tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
-
-	/* Make sure that we are never so timid as to reduce our cwnd below
-	 * 2 MSS.
-	 *
-	 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
-	 */
-	tp->snd_cwnd = max(tp->snd_cwnd, 2U);
-}
-
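[Editor's sketch, not part of the patch.] A worked example may help with the
fixed-point bookkeeping in the cong_avoid code above. The traffic numbers are
invented; V_PARAM_SHIFT, alpha and beta are as defined earlier in the deleted
file:

    #include <stdint.h>
    #include <stdio.h>

    #define V_PARAM_SHIFT 1

    int main(void)
    {
            uint32_t old_wnd  = 20;      /* segments sent last RTT        */
            uint32_t base_rtt = 100000;  /* propagation delay, usec       */
            uint32_t rtt      = 125000;  /* min RTT seen last RTT, usec   */
            uint32_t alpha = 1 << V_PARAM_SHIFT;  /* 1 extra pkt, fixed pt */
            uint32_t beta  = 3 << V_PARAM_SHIFT;  /* 3 extra pkts          */

            /* cwnd we "should" have: actual rate * baseRTT, fixed point. */
            uint32_t target_cwnd =
                    ((uint64_t)old_wnd * base_rtt << V_PARAM_SHIFT) / rtt;
            uint32_t diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;

            printf("target_cwnd = %u (fixed pt) = %u segments\n",
                   target_cwnd, target_cwnd >> V_PARAM_SHIFT); /* 32 -> 16 */
            printf("diff = %u -> %u packets queued in the network\n",
                   diff, diff >> V_PARAM_SHIFT);               /* 8 -> 4  */

            if (diff > beta)
                    printf("diff > beta: shrink cwnd by one\n"); /* taken */
            else if (diff < alpha)
                    printf("diff < alpha: grow cwnd by one\n");
            return 0;
    }

Four packets sitting in queues exceeds beta's three, so this sender would back
off by one segment rather than wait for a loss.
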
-/* Extract info for TCP socket info provided via netlink. */
-static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext,
-			       struct sk_buff *skb)
-{
-	const struct vegas *ca = tcp_ca(tp);
-	if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
-		struct tcpvegas_info *info;
-
-		info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO,
-					  sizeof(*info)));
-
-		info->tcpv_enabled = ca->doing_vegas_now;
-		info->tcpv_rttcnt = ca->cntRTT;
-		info->tcpv_rtt = ca->baseRTT;
-		info->tcpv_minrtt = ca->minRTT;
-	rtattr_failure:	;
-	}
-}
-
-static struct tcp_congestion_ops tcp_vegas = {
-	.init		= tcp_vegas_init,
-	.ssthresh	= tcp_reno_ssthresh,
-	.cong_avoid	= tcp_vegas_cong_avoid,
-	.min_cwnd	= tcp_reno_min_cwnd,
-	.rtt_sample	= tcp_vegas_rtt_calc,
-	.set_state	= tcp_vegas_state,
-	.cwnd_event	= tcp_vegas_cwnd_event,
-	.get_info	= tcp_vegas_get_info,
-
-	.owner		= THIS_MODULE,
-	.name		= "vegas",
-};
-
-static int __init tcp_vegas_register(void)
-{
-	BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE);
-	tcp_register_congestion_control(&tcp_vegas);
-	return 0;
-}
-
-static void __exit tcp_vegas_unregister(void)
-{
-	tcp_unregister_congestion_control(&tcp_vegas);
-}
-
-module_init(tcp_vegas_register);
-module_exit(tcp_vegas_unregister);
-
-MODULE_AUTHOR("Stephen Hemminger");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("TCP Vegas");
diff --git a/trunk/net/ipv4/tcp_westwood.c b/trunk/net/ipv4/tcp_westwood.c
deleted file mode 100644
index ef827242c940..000000000000
--- a/trunk/net/ipv4/tcp_westwood.c
+++ /dev/null
@@ -1,259 +0,0 @@
-/*
- * TCP Westwood+
- *
- *	Angelo Dell'Aera:	TCP Westwood+ support
- */
-
-#include <linux/config.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/tcp_diag.h>
-#include <net/tcp.h>
-
-/* TCP Westwood structure */
-struct westwood {
-	u32    bw_ns_est;        /* first bandwidth estimation..not too smoothed 8) */
-	u32    bw_est;           /* bandwidth estimate */
-	u32    rtt_win_sx;       /* here starts a new evaluation... */
-	u32    bk;
-	u32    snd_una;          /* used for evaluating the number of acked bytes */
-	u32    cumul_ack;
-	u32    accounted;
-	u32    rtt;
-	u32    rtt_min;          /* minimum observed RTT */
-};
-
-
-/* TCP Westwood functions and constants */
-#define TCP_WESTWOOD_RTT_MIN   (HZ/20)	/* 50ms */
-#define TCP_WESTWOOD_INIT_RTT  (20*HZ)	/* maybe too conservative?! */
-
-/*
- * @tcp_westwood_create
- * This function initializes fields used in TCP Westwood+; it is
- * called after the initial SYN, so the sequence numbers are correct.
- * For new passive connections we have no information about RTTmin
- * at this time, so we simply set it to TCP_WESTWOOD_INIT_RTT. This
- * value was deliberately chosen to be conservative, since that way
- * we are sure it will be updated in a consistent way as soon as
- * possible. That will reasonably happen within the first RTT period
- * of the connection lifetime.
- */
-static void tcp_westwood_init(struct tcp_sock *tp)
-{
-	struct westwood *w = tcp_ca(tp);
-
-	w->bk = 0;
-	w->bw_ns_est = 0;
-	w->bw_est = 0;
-	w->accounted = 0;
-	w->cumul_ack = 0;
-	w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT;
-	w->rtt_win_sx = tcp_time_stamp;
-	w->snd_una = tp->snd_una;
-}
-
-/*
- * @westwood_do_filter
- * Low-pass filter. Implemented using constant coefficients.
- */
-static inline u32 westwood_do_filter(u32 a, u32 b)
-{
-	return (((7 * a) + b) >> 3);
-}
-
-static inline void westwood_filter(struct westwood *w, u32 delta)
-{
-	w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta);
-	w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est);
-}
-
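[Editor's sketch, not part of the patch.] The effect of the constant-coefficient
filter above is easier to see with numbers; this standalone snippet replays it:

    #include <stdint.h>
    #include <stdio.h>

    /* new_est = (7*old + sample) / 8: each fresh bandwidth sample only
     * moves the estimate by 1/8 of the difference, smoothing out bursts
     * of ACKs while still tracking sustained change. */
    static uint32_t do_filter(uint32_t a, uint32_t b)
    {
            return ((7 * a) + b) >> 3;
    }

    int main(void)
    {
            uint32_t est = 1000;  /* current estimate, bytes per unit time */

            /* A single spiky sample barely disturbs the estimate... */
            printf("%u\n", do_filter(est, 9000)); /* 2000: 1/8 of the way */

            /* ...but a sustained change is tracked within a few windows. */
            for (int i = 0; i < 8; i++)
                    est = do_filter(est, 2000);
            printf("%u\n", est);                  /* 1654: most of the way */
            return 0;
    }
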
-/*
- * @westwood_pkts_acked
- * Called after processing a group of packets,
- * but all Westwood needs is the last sample of srtt.
- */
-static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt)
-{
-	struct westwood *w = tcp_ca(tp);
-	if (cnt > 0)
-		w->rtt = tp->srtt >> 3;
-}
-
-/*
- * @westwood_update_window
- * It updates RTT evaluation window if it is the right moment to do
- * it. If so it calls filter for evaluating bandwidth.
- */
-static void westwood_update_window(struct tcp_sock *tp)
-{
-	struct westwood *w = tcp_ca(tp);
-	s32 delta = tcp_time_stamp - w->rtt_win_sx;
-
-	/*
-	 * See if an RTT-window has passed.
-	 * Be careful since if RTT is less than
-	 * 50ms we don't filter but we continue 'building the sample'.
-	 * This minimum limit was chosen since an estimation on small
-	 * time intervals is better to avoid...
-	 * Obviously on a LAN we reasonably will always have
-	 * right_bound = left_bound + WESTWOOD_RTT_MIN
-	 */
-	if (w->rtt && delta > max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN)) {
-		westwood_filter(w, delta);
-
-		w->bk = 0;
-		w->rtt_win_sx = tcp_time_stamp;
-	}
-}
-
-/*
- * @westwood_fast_bw
- * It is called when we are in fast path. In particular it is called when
- * header prediction is successful. In such a case the update is
- * straightforward and doesn't need any particular care.
- */
-static inline void westwood_fast_bw(struct tcp_sock *tp)
-{
-	struct westwood *w = tcp_ca(tp);
-
-	westwood_update_window(tp);
-
-	w->bk += tp->snd_una - w->snd_una;
-	w->snd_una = tp->snd_una;
-	w->rtt_min = min(w->rtt, w->rtt_min);
-}
-
-/*
- * @westwood_acked_count
- * This function evaluates cumul_ack for evaluating bk in case of
- * delayed or partial acks.
- */
-static inline u32 westwood_acked_count(struct tcp_sock *tp)
-{
-	struct westwood *w = tcp_ca(tp);
-
-	w->cumul_ack = tp->snd_una - w->snd_una;
-
-	/* If cumul_ack is 0 this is a dupack since it's not moving
-	 * tp->snd_una.
-	 */
-	if (!w->cumul_ack) {
-		w->accounted += tp->mss_cache;
-		w->cumul_ack = tp->mss_cache;
-	}
-
-	if (w->cumul_ack > tp->mss_cache) {
-		/* Partial or delayed ack */
-		if (w->accounted >= w->cumul_ack) {
-			w->accounted -= w->cumul_ack;
-			w->cumul_ack = tp->mss_cache;
-		} else {
-			w->cumul_ack -= w->accounted;
-			w->accounted = 0;
-		}
-	}
-
-	w->snd_una = tp->snd_una;
-
-	return w->cumul_ack;
-}
-
-static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp)
-{
-	struct westwood *w = tcp_ca(tp);
-	return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
-}
-
-/*
- * TCP Westwood
- * Here the limit is evaluated as bandwidth estimate * RTTmin (we use
- * mss_cache to obtain it in packets). The result is clamped to be at
- * least 2 packets, so this never returns 0.
- */
-static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp)
-{
-	return westwood_bw_rttmin(tp);
-}
-
-static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event)
-{
-	struct westwood *w = tcp_ca(tp);
-
-	switch (event) {
-	case CA_EVENT_FAST_ACK:
-		westwood_fast_bw(tp);
-		break;
-
-	case CA_EVENT_COMPLETE_CWR:
-		tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp);
-		break;
-
-	case CA_EVENT_FRTO:
-		tp->snd_ssthresh = westwood_bw_rttmin(tp);
-		break;
-
-	case CA_EVENT_SLOW_ACK:
-		westwood_update_window(tp);
-		w->bk += westwood_acked_count(tp);
-		w->rtt_min = min(w->rtt, w->rtt_min);
-		break;
-
-	default:
-		/* don't care */
-		break;
-	}
-}
-
-
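[Editor's sketch, not part of the patch.] To see why the bw_est * rtt_min
product above is the interesting quantity, here is a standalone sketch with
invented numbers (assuming HZ=1000, so one jiffy is one millisecond):

    #include <stdint.h>
    #include <stdio.h>

    /* On congestion, Westwood+ resets the window to the estimated
     * bandwidth-delay product in packets, rather than blindly halving
     * cwnd as Reno does. */
    int main(void)
    {
            uint32_t bw_est    = 1500;  /* bytes/jiffy: ~12 Mbit/s          */
            uint32_t rtt_min   = 80;    /* jiffies: 80 ms propagation delay */
            uint32_t mss_cache = 1500;  /* bytes per segment                */

            uint32_t bdp = (bw_est * rtt_min) / mss_cache;
            if (bdp < 2)
                    bdp = 2;            /* never shrink below 2 segments    */

            /* A Reno sender at cwnd=100 would drop to 50 on loss;
             * Westwood+ drops only to what the link demonstrably
             * sustains over random (e.g. radio) losses. */
            printf("ssthresh = cwnd = %u packets\n", bdp); /* 80 */
            return 0;
    }
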
-/* Extract info for TCP socket info provided via netlink. */
-static void tcp_westwood_info(struct tcp_sock *tp, u32 ext,
-			      struct sk_buff *skb)
-{
-	const struct westwood *ca = tcp_ca(tp);
-	if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
-		struct rtattr *rta;
-		struct tcpvegas_info *info;
-
-		rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info));
-		info = RTA_DATA(rta);
-		info->tcpv_enabled = 1;
-		info->tcpv_rttcnt = 0;
-		info->tcpv_rtt = jiffies_to_usecs(ca->rtt);
-		info->tcpv_minrtt = jiffies_to_usecs(ca->rtt_min);
-	rtattr_failure:	;
-	}
-}
-
-
-static struct tcp_congestion_ops tcp_westwood = {
-	.init		= tcp_westwood_init,
-	.ssthresh	= tcp_reno_ssthresh,
-	.cong_avoid	= tcp_reno_cong_avoid,
-	.min_cwnd	= tcp_westwood_cwnd_min,
-	.cwnd_event	= tcp_westwood_event,
-	.get_info	= tcp_westwood_info,
-	.pkts_acked	= tcp_westwood_pkts_acked,
-
-	.owner		= THIS_MODULE,
-	.name		= "westwood"
-};
-
-static int __init tcp_westwood_register(void)
-{
-	BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE);
-	return tcp_register_congestion_control(&tcp_westwood);
-}
-
-static void __exit tcp_westwood_unregister(void)
-{
-	tcp_unregister_congestion_control(&tcp_westwood);
-}
-
-module_init(tcp_westwood_register);
-module_exit(tcp_westwood_unregister);
-
-MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("TCP Westwood+");
diff --git a/trunk/net/ipv6/tcp_ipv6.c b/trunk/net/ipv6/tcp_ipv6.c
index fce56039b0e9..2414937f2a83 100644
--- a/trunk/net/ipv6/tcp_ipv6.c
+++ b/trunk/net/ipv6/tcp_ipv6.c
@@ -2025,7 +2025,7 @@ static int tcp_v6_init_sock(struct sock *sk)
 
 	sk->sk_state = TCP_CLOSE;
 	tp->af_specific = &ipv6_specific;
-	tp->ca_ops = &tcp_reno;
+
 	sk->sk_write_space = sk_stream_write_space;
 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);