Skip to content

Commit

Permalink
[INET]: Generalise tcp_tw_bucket, aka TIME_WAIT sockets
Browse files Browse the repository at this point in the history
This paves the way to generalise the rest of the sock ID lookup
routines and saves some bytes in TCPv4 TIME_WAIT sockets on distro
kernels (where IPv6 is always built as a module):

[root@qemu ~]# grep tw_sock /proc/slabinfo
tw_sock_TCPv6  0  0  128  31  1
tw_sock_TCP    0  0   96  41  1
[root@qemu ~]#

Now if a protocol wants to use the TIME_WAIT generic infrastructure it
only has to set the sk_prot->twsk_obj_size field with the size of its
inet_timewait_sock derived sock and proto_register will create
sk_prot->twsk_slab, for now its only for INET sockets, but we can
introduce timewait_sock later if some non INET transport protocolo
wants to use this stuff.

Next changesets will take advantage of this new infrastructure to
generalise even more TCP code.

[acme@toy net-2.6.14]$ grep built-in /tmp/before.size /tmp/after.size
/tmp/before.size: 188646   11764    5068  205478   322a6 net/ipv4/built-in.o
/tmp/after.size:  188144   11764    5068  204976   320b0 net/ipv4/built-in.o
[acme@toy net-2.6.14]$

Tested with both IPv4 & IPv6 (::1 (localhost) & ::ffff:172.20.0.1
(qemu host)).

Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
Arnaldo Carvalho de Melo authored and David S. Miller committed Aug 29, 2005
1 parent 33b6223 commit 8feaf0c
Show file tree
Hide file tree
Showing 13 changed files with 484 additions and 391 deletions.
52 changes: 49 additions & 3 deletions include/linux/ipv6.h
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,41 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to,

#define __ipv6_only_sock(sk) (inet6_sk(sk)->ipv6only)
#define ipv6_only_sock(sk) ((sk)->sk_family == PF_INET6 && __ipv6_only_sock(sk))

#include <linux/tcp.h>

struct tcp6_timewait_sock {
struct tcp_timewait_sock tw_v6_sk;
struct in6_addr tw_v6_daddr;
struct in6_addr tw_v6_rcv_saddr;
};

static inline struct tcp6_timewait_sock *tcp6_twsk(const struct sock *sk)
{
return (struct tcp6_timewait_sock *)sk;
}

static inline struct in6_addr *__tcp_v6_rcv_saddr(const struct sock *sk)
{
return likely(sk->sk_state != TCP_TIME_WAIT) ?
&inet6_sk(sk)->rcv_saddr : &tcp6_twsk(sk)->tw_v6_rcv_saddr;
}

static inline struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk)
{
return sk->sk_family == AF_INET6 ? __tcp_v6_rcv_saddr(sk) : NULL;
}

static inline int tcp_twsk_ipv6only(const struct sock *sk)
{
return inet_twsk(sk)->tw_ipv6only;
}

static inline int tcp_v6_ipv6only(const struct sock *sk)
{
return likely(sk->sk_state != TCP_TIME_WAIT) ?
ipv6_only_sock(sk) : tcp_twsk_ipv6only(sk);
}
#else
#define __ipv6_only_sock(sk) 0
#define ipv6_only_sock(sk) 0
Expand All @@ -322,8 +357,19 @@ static inline struct raw6_sock *raw6_sk(const struct sock *sk)
return NULL;
}

#endif
#define __tcp_v6_rcv_saddr(__sk) NULL
#define tcp_v6_rcv_saddr(__sk) NULL
#define tcp_twsk_ipv6only(__sk) 0
#define tcp_v6_ipv6only(__sk) 0
#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */

#endif
#define INET6_MATCH(__sk, __saddr, __daddr, __ports, __dif) \
(((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \
((__sk)->sk_family == AF_INET6) && \
ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr)) && \
ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr)) && \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))

#endif
#endif /* __KERNEL__ */

#endif /* _IPV6_H */
15 changes: 15 additions & 0 deletions include/linux/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ struct tcp_info
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <net/sock.h>
#include <net/inet_timewait_sock.h>

/* This defines a selective acknowledgement block. */
struct tcp_sack_block {
Expand Down Expand Up @@ -387,6 +388,20 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk)
return (struct tcp_sock *)sk;
}

struct tcp_timewait_sock {
struct inet_timewait_sock tw_sk;
__u32 tw_rcv_nxt;
__u32 tw_snd_nxt;
__u32 tw_rcv_wnd;
__u32 tw_ts_recent;
long tw_ts_recent_stamp;
};

static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
{
return (struct tcp_timewait_sock *)sk;
}

static inline void *tcp_ca(const struct tcp_sock *tp)
{
return (void *) tp->ca_priv;
Expand Down
41 changes: 41 additions & 0 deletions include/net/inet_hashtables.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
#ifndef _INET_HASHTABLES_H
#define _INET_HASHTABLES_H

#include <linux/config.h>

#include <linux/interrupt.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
Expand Down Expand Up @@ -310,4 +312,43 @@ static inline struct sock *inet_lookup_listener(struct inet_hashinfo *hashinfo,
read_unlock(&hashinfo->lhash_lock);
return sk;
}

/* Socket demux engine toys. */
#ifdef __BIG_ENDIAN
#define INET_COMBINED_PORTS(__sport, __dport) \
(((__u32)(__sport) << 16) | (__u32)(__dport))
#else /* __LITTLE_ENDIAN */
#define INET_COMBINED_PORTS(__sport, __dport) \
(((__u32)(__dport) << 16) | (__u32)(__sport))
#endif

#if (BITS_PER_LONG == 64)
#ifdef __BIG_ENDIAN
#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \
const __u64 __name = (((__u64)(__saddr)) << 32) | ((__u64)(__daddr));
#else /* __LITTLE_ENDIAN */
#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \
const __u64 __name = (((__u64)(__daddr)) << 32) | ((__u64)(__saddr));
#endif /* __BIG_ENDIAN */
#define INET_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
(((*((__u64 *)&(inet_sk(__sk)->daddr))) == (__cookie)) && \
((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
#define INET_TW_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
(((*((__u64 *)&(inet_twsk(__sk)->tw_daddr))) == (__cookie)) && \
((*((__u32 *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) && \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
#else /* 32-bit arch */
#define INET_ADDR_COOKIE(__name, __saddr, __daddr)
#define INET_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif) \
((inet_sk(__sk)->daddr == (__saddr)) && \
(inet_sk(__sk)->rcv_saddr == (__daddr)) && \
((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
#define INET_TW_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif) \
((inet_twsk(__sk)->tw_daddr == (__saddr)) && \
(inet_twsk(__sk)->tw_rcv_saddr == (__daddr)) && \
((*((__u32 *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) && \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
#endif /* 64-bit arch */
#endif /* _INET_HASHTABLES_H */
142 changes: 142 additions & 0 deletions include/net/inet_timewait_sock.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* Definitions for a generic INET TIMEWAIT sock
*
* From code originally in net/tcp.h
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef _INET_TIMEWAIT_SOCK_
#define _INET_TIMEWAIT_SOCK_

#include <linux/config.h>

#include <linux/list.h>
#include <linux/types.h>

#include <net/sock.h>
#include <net/tcp_states.h>

#include <asm/atomic.h>

#if (BITS_PER_LONG == 64)
#define INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES 8
#else
#define INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES 4
#endif

struct inet_bind_bucket;

/*
* This is a TIME_WAIT sock. It works around the memory consumption
* problems of sockets in such a state on heavily loaded servers, but
* without violating the protocol specification.
*/
struct inet_timewait_sock {
/*
* Now struct sock also uses sock_common, so please just
* don't add nothing before this first member (__tw_common) --acme
*/
struct sock_common __tw_common;
#define tw_family __tw_common.skc_family
#define tw_state __tw_common.skc_state
#define tw_reuse __tw_common.skc_reuse
#define tw_bound_dev_if __tw_common.skc_bound_dev_if
#define tw_node __tw_common.skc_node
#define tw_bind_node __tw_common.skc_bind_node
#define tw_refcnt __tw_common.skc_refcnt
#define tw_prot __tw_common.skc_prot
volatile unsigned char tw_substate;
/* 3 bits hole, try to pack */
unsigned char tw_rcv_wscale;
/* Socket demultiplex comparisons on incoming packets. */
/* these five are in inet_sock */
__u16 tw_sport;
__u32 tw_daddr __attribute__((aligned(INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES)));
__u32 tw_rcv_saddr;
__u16 tw_dport;
__u16 tw_num;
/* And these are ours. */
__u8 tw_ipv6only:1;
/* 31 bits hole, try to pack */
int tw_hashent;
int tw_timeout;
unsigned long tw_ttd;
struct inet_bind_bucket *tw_tb;
struct hlist_node tw_death_node;
};

static inline void inet_twsk_add_node(struct inet_timewait_sock *tw,
struct hlist_head *list)
{
hlist_add_head(&tw->tw_node, list);
}

static inline void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
struct hlist_head *list)
{
hlist_add_head(&tw->tw_bind_node, list);
}

static inline int inet_twsk_dead_hashed(const struct inet_timewait_sock *tw)
{
return tw->tw_death_node.pprev != NULL;
}

static inline void inet_twsk_dead_node_init(struct inet_timewait_sock *tw)
{
tw->tw_death_node.pprev = NULL;
}

static inline void __inet_twsk_del_dead_node(struct inet_timewait_sock *tw)
{
__hlist_del(&tw->tw_death_node);
inet_twsk_dead_node_init(tw);
}

static inline int inet_twsk_del_dead_node(struct inet_timewait_sock *tw)
{
if (inet_twsk_dead_hashed(tw)) {
__inet_twsk_del_dead_node(tw);
return 1;
}
return 0;
}

#define inet_twsk_for_each(tw, node, head) \
hlist_for_each_entry(tw, node, head, tw_node)

#define inet_twsk_for_each_inmate(tw, node, jail) \
hlist_for_each_entry(tw, node, jail, tw_death_node)

#define inet_twsk_for_each_inmate_safe(tw, node, safe, jail) \
hlist_for_each_entry_safe(tw, node, safe, jail, tw_death_node)

static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
{
return (struct inet_timewait_sock *)sk;
}

static inline u32 inet_rcv_saddr(const struct sock *sk)
{
return likely(sk->sk_state != TCP_TIME_WAIT) ?
inet_sk(sk)->rcv_saddr : inet_twsk(sk)->tw_rcv_saddr;
}

static inline void inet_twsk_put(struct inet_timewait_sock *tw)
{
if (atomic_dec_and_test(&tw->tw_refcnt)) {
#ifdef SOCK_REFCNT_DEBUG
printk(KERN_DEBUG "%s timewait_sock %p released\n",
tw->tw_prot->name, tw);
#endif
kmem_cache_free(tw->tw_prot->twsk_slab, tw);
}
}
#endif /* _INET_TIMEWAIT_SOCK_ */
17 changes: 11 additions & 6 deletions include/net/sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ do { spin_lock_init(&((__sk)->sk_lock.slock)); \
} while(0)

struct sock;
struct proto;

/**
* struct sock_common - minimal network layer representation of sockets
Expand All @@ -98,10 +99,11 @@ struct sock;
* @skc_node: main hash linkage for various protocol lookup tables
* @skc_bind_node: bind hash linkage for various protocol lookup tables
* @skc_refcnt: reference count
* @skc_prot: protocol handlers inside a network family
*
* This is the minimal network layer representation of sockets, the header
* for struct sock and struct tcp_tw_bucket.
*/
* for struct sock and struct inet_timewait_sock.
*/
struct sock_common {
unsigned short skc_family;
volatile unsigned char skc_state;
Expand All @@ -110,11 +112,12 @@ struct sock_common {
struct hlist_node skc_node;
struct hlist_node skc_bind_node;
atomic_t skc_refcnt;
struct proto *skc_prot;
};

/**
* struct sock - network layer representation of sockets
* @__sk_common: shared layout with tcp_tw_bucket
* @__sk_common: shared layout with inet_timewait_sock
* @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN
* @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
* @sk_lock: synchronizer
Expand All @@ -140,7 +143,6 @@ struct sock_common {
* @sk_backlog: always used with the per-socket spinlock held
* @sk_callback_lock: used with the callbacks in the end of this struct
* @sk_error_queue: rarely used
* @sk_prot: protocol handlers inside a network family
* @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt, IPV6_ADDRFORM for instance)
* @sk_err: last error
* @sk_err_soft: errors that don't cause failure but are the cause of a persistent failure not just 'timed out'
Expand Down Expand Up @@ -173,7 +175,7 @@ struct sock_common {
*/
struct sock {
/*
* Now struct tcp_tw_bucket also uses sock_common, so please just
* Now struct inet_timewait_sock also uses sock_common, so please just
* don't add nothing before this first member (__sk_common) --acme
*/
struct sock_common __sk_common;
Expand All @@ -184,6 +186,7 @@ struct sock {
#define sk_node __sk_common.skc_node
#define sk_bind_node __sk_common.skc_bind_node
#define sk_refcnt __sk_common.skc_refcnt
#define sk_prot __sk_common.skc_prot
unsigned char sk_shutdown : 2,
sk_no_check : 2,
sk_userlocks : 4;
Expand Down Expand Up @@ -218,7 +221,6 @@ struct sock {
struct sk_buff *tail;
} sk_backlog;
struct sk_buff_head sk_error_queue;
struct proto *sk_prot;
struct proto *sk_prot_creator;
rwlock_t sk_callback_lock;
int sk_err,
Expand Down Expand Up @@ -557,6 +559,9 @@ struct proto {
kmem_cache_t *slab;
unsigned int obj_size;

kmem_cache_t *twsk_slab;
unsigned int twsk_obj_size;

struct request_sock_ops *rsk_prot;

struct module *owner;
Expand Down
Loading

0 comments on commit 8feaf0c

Please sign in to comment.