From b43309578477ff3271945d4efb920f558a309b4a Mon Sep 17 00:00:00 2001 From: Patrick McHardy <kaber@trash.net> Date: Tue, 3 May 2005 14:23:13 -0700 Subject: [PATCH 01/28] [NETFILTER]: Missing owner-field initialization in iptable_raw Signed-off-by: Patrick McHardy <kaber@trash.net> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/netfilter/iptable_raw.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 01b4a3c814d39..47449ba83eb94 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -103,13 +103,15 @@ static struct nf_hook_ops ipt_ops[] = { .hook = ipt_hook, .pf = PF_INET, .hooknum = NF_IP_PRE_ROUTING, - .priority = NF_IP_PRI_RAW + .priority = NF_IP_PRI_RAW, + .owner = THIS_MODULE, }, { .hook = ipt_hook, .pf = PF_INET, .hooknum = NF_IP_LOCAL_OUT, - .priority = NF_IP_PRI_RAW + .priority = NF_IP_PRI_RAW, + .owner = THIS_MODULE, }, }; From 31da185d8162ae0f30a13ed945f1f4d28d158133 Mon Sep 17 00:00:00 2001 From: Patrick McHardy <kaber@trash.net> Date: Tue, 3 May 2005 14:23:50 -0700 Subject: [PATCH 02/28] [NETFILTER]: Don't checksum CHECKSUM_UNNECESSARY skbs in TCP connection tracking Signed-off-by: Patrick McHardy <kaber@trash.net> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 2b87c1974be60..721ddbf522b42 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -819,6 +819,7 @@ static int tcp_error(struct sk_buff *skb, */ /* FIXME: Source route IP option packets --RR */ if (hooknum == NF_IP_PRE_ROUTING + && skb->ip_summed != CHECKSUM_UNNECESSARY && csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP, skb->ip_summed == CHECKSUM_HW ? skb->csum : skb_checksum(skb, iph->ihl*4, tcplen, 0))) { From 679a87382433cf12a28f07a7d5c240f30f0daa08 Mon Sep 17 00:00:00 2001 From: Herbert Xu <herbert@gondor.apana.org.au> Date: Tue, 3 May 2005 14:24:36 -0700 Subject: [PATCH 03/28] [IPV6]: Fix raw socket checksums with IPsec I made a mistake in my last patch to the raw socket checksum code. I used the value of inet->cork.length as the length of the payload. While this works with normal packets, it breaks down when IPsec is present since the cork length includes the extension header length. So here is a patch to fix the length calculations. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv6/raw.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 1352c1d9bf4d3..617645bc5ed6a 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -455,11 +455,11 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl, struct raw6_sock *rp) { - struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; int err = 0; int offset; int len; + int total_len; u32 tmp_csum; u16 csum; @@ -470,7 +470,8 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl, goto out; offset = rp->offset; - if (offset >= inet->cork.length - 1) { + total_len = inet_sk(sk)->cork.length - (skb->nh.raw - skb->data); + if (offset >= total_len - 1) { err = -EINVAL; ip6_flush_pending_frames(sk); goto out; @@ -514,7 +515,7 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl, tmp_csum = csum_ipv6_magic(&fl->fl6_src, &fl->fl6_dst, - inet->cork.length, fl->proto, tmp_csum); + total_len, fl->proto, tmp_csum); if (tmp_csum == 0) tmp_csum = -1; From e4553eddae592b948c9695c9a0002169b0cab6fc Mon Sep 17 00:00:00 2001 From: Herbert Xu <herbert@gondor.apana.org.au> Date: Tue, 3 May 2005 14:25:13 -0700 Subject: [PATCH 04/28] [IPV6]: Include ipv6.h for ipv6_addr_set This patch includes net/ipv6.h from addrconf.h since it needs ipv6_addr_set. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/net/addrconf.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 7af9a13cb9beb..f1e5af4be98e4 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -46,6 +46,7 @@ struct prefix_info { #include <linux/in6.h> #include <linux/netdevice.h> #include <net/if_inet6.h> +#include <net/ipv6.h> #define IN6_ADDR_HSIZE 16 From 526bdb80a23b2e10ed4ccc3fcf309c9118d892d6 Mon Sep 17 00:00:00 2001 From: Thomas Graf <tgraf@suug.ch> Date: Tue, 3 May 2005 14:26:01 -0700 Subject: [PATCH 05/28] [XFRM]: Prevent off-by-one access to xfrm_dispatch Makes the type > XFRM_MSG_MAX check behave correctly to protect access to xfrm_dispatch. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/linux/xfrm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h index f0df02ae68a41..4d19b9e65317c 100644 --- a/include/linux/xfrm.h +++ b/include/linux/xfrm.h @@ -140,8 +140,9 @@ enum { XFRM_MSG_FLUSHPOLICY, #define XFRM_MSG_FLUSHPOLICY XFRM_MSG_FLUSHPOLICY - XFRM_MSG_MAX + __XFRM_MSG_MAX }; +#define XFRM_MSG_MAX (__XFRM_MSG_MAX - 1) struct xfrm_user_tmpl { struct xfrm_id id; From 492b558b3191319cbc859a9e025bc354d336c261 Mon Sep 17 00:00:00 2001 From: Thomas Graf <tgraf@suug.ch> Date: Tue, 3 May 2005 14:26:40 -0700 Subject: [PATCH 06/28] [XFRM]: Cleanup xfrm_msg_min and xfrm_dispatch Converts xfrm_msg_min and xfrm_dispatch to use c99 designated initializers to make greping a little bit easier. Also replaces two hardcoded message type with meaningful names. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/linux/xfrm.h | 2 ++ net/xfrm/xfrm_user.c | 73 ++++++++++++++++++++++---------------------- 2 files changed, 38 insertions(+), 37 deletions(-) diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h index 4d19b9e65317c..fd2ef742a9fd1 100644 --- a/include/linux/xfrm.h +++ b/include/linux/xfrm.h @@ -144,6 +144,8 @@ enum { }; #define XFRM_MSG_MAX (__XFRM_MSG_MAX - 1) +#define XFRM_NR_MSGTYPES (XFRM_MSG_MAX + 1 - XFRM_MSG_BASE) + struct xfrm_user_tmpl { struct xfrm_id id; __u16 family; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 63661b0fd736a..52b5843937c58 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -855,47 +855,44 @@ static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **x return 0; } -static const int xfrm_msg_min[(XFRM_MSG_MAX + 1 - XFRM_MSG_BASE)] = { - NLMSG_LENGTH(sizeof(struct xfrm_usersa_info)), /* NEW SA */ - NLMSG_LENGTH(sizeof(struct xfrm_usersa_id)), /* DEL SA */ - NLMSG_LENGTH(sizeof(struct xfrm_usersa_id)), /* GET SA */ - NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_info)),/* NEW POLICY */ - NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_id)), /* DEL POLICY */ - NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_id)), /* GET POLICY */ - NLMSG_LENGTH(sizeof(struct xfrm_userspi_info)), /* ALLOC SPI */ - NLMSG_LENGTH(sizeof(struct xfrm_user_acquire)), /* ACQUIRE */ - NLMSG_LENGTH(sizeof(struct xfrm_user_expire)), /* EXPIRE */ - NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_info)),/* UPD POLICY */ - NLMSG_LENGTH(sizeof(struct xfrm_usersa_info)), /* UPD SA */ - NLMSG_LENGTH(sizeof(struct xfrm_user_polexpire)), /* POLEXPIRE */ - NLMSG_LENGTH(sizeof(struct xfrm_usersa_flush)), /* FLUSH SA */ - NLMSG_LENGTH(0), /* FLUSH POLICY */ +#define XMSGSIZE(type) NLMSG_LENGTH(sizeof(struct type)) + +static const int xfrm_msg_min[XFRM_NR_MSGTYPES] = { + [XFRM_MSG_NEWSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_info), + [XFRM_MSG_DELSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id), + [XFRM_MSG_GETSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id), + [XFRM_MSG_NEWPOLICY - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_info), + [XFRM_MSG_DELPOLICY - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id), + [XFRM_MSG_GETPOLICY - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id), + [XFRM_MSG_ALLOCSPI - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userspi_info), + [XFRM_MSG_ACQUIRE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_acquire), + [XFRM_MSG_EXPIRE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_expire), + [XFRM_MSG_UPDPOLICY - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_info), + [XFRM_MSG_UPDSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_info), + [XFRM_MSG_POLEXPIRE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_polexpire), + [XFRM_MSG_FLUSHSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_flush), + [XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = NLMSG_LENGTH(0), }; +#undef XMSGSIZE + static struct xfrm_link { int (*doit)(struct sk_buff *, struct nlmsghdr *, void **); int (*dump)(struct sk_buff *, struct netlink_callback *); -} xfrm_dispatch[] = { - { .doit = xfrm_add_sa, }, - { .doit = xfrm_del_sa, }, - { - .doit = xfrm_get_sa, - .dump = xfrm_dump_sa, - }, - { .doit = xfrm_add_policy }, - { .doit = xfrm_get_policy }, - { - .doit = xfrm_get_policy, - .dump = xfrm_dump_policy, - }, - { .doit = xfrm_alloc_userspi }, - {}, - {}, - { .doit = xfrm_add_policy }, - { .doit = xfrm_add_sa, }, - {}, - { .doit = xfrm_flush_sa }, - { .doit = xfrm_flush_policy }, +} xfrm_dispatch[XFRM_NR_MSGTYPES] = { + [XFRM_MSG_NEWSA - XFRM_MSG_BASE] = { .doit = xfrm_add_sa }, + [XFRM_MSG_DELSA - XFRM_MSG_BASE] = { .doit = xfrm_del_sa }, + [XFRM_MSG_GETSA - XFRM_MSG_BASE] = { .doit = xfrm_get_sa, + .dump = xfrm_dump_sa }, + [XFRM_MSG_NEWPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_add_policy }, + [XFRM_MSG_DELPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_get_policy }, + [XFRM_MSG_GETPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_get_policy, + .dump = xfrm_dump_policy }, + [XFRM_MSG_ALLOCSPI - XFRM_MSG_BASE] = { .doit = xfrm_alloc_userspi }, + [XFRM_MSG_UPDPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_add_policy }, + [XFRM_MSG_UPDSA - XFRM_MSG_BASE] = { .doit = xfrm_add_sa }, + [XFRM_MSG_FLUSHSA - XFRM_MSG_BASE] = { .doit = xfrm_flush_sa }, + [XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_flush_policy }, }; static int xfrm_done(struct netlink_callback *cb) @@ -931,7 +928,9 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *err return -1; } - if ((type == 2 || type == 5) && (nlh->nlmsg_flags & NLM_F_DUMP)) { + if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) || + type == (XFRM_MSG_GETPOLICY - XFRM_MSG_BASE)) && + (nlh->nlmsg_flags & NLM_F_DUMP)) { u32 rlen; if (link->dump == NULL) From d775fc09f16f4b88cd0373006b112c4772589778 Mon Sep 17 00:00:00 2001 From: Thomas Graf <tgraf@suug.ch> Date: Tue, 3 May 2005 14:27:35 -0700 Subject: [PATCH 07/28] [RTNETLINK] Fix RTM_MAX to represent the maximum valid message type RTM_MAX is currently set to the maximum reserverd message type plus one thus being the cause of two bugs for new types being assigned a) given the new family registers only the NEW command in its reserved block the array size for per family entries is calculated one entry short and b) given the new family registers all commands RTM_MAX would point to the first entry of the block following this one and the rtnetlink receive path would accept a message type for a nonexisting family. This patch changes RTM_MAX to point to the maximum valid message type by aligning it to the start of the next block and subtracting one. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/linux/rtnetlink.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 32e52769a00b6..d607219af6ac9 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -89,8 +89,8 @@ enum { RTM_GETANYCAST = 62, #define RTM_GETANYCAST RTM_GETANYCAST - RTM_MAX, -#define RTM_MAX RTM_MAX + __RTM_MAX, +#define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) }; /* From f90a0a74b864fdc46737614f03b8868f4f31e3bf Mon Sep 17 00:00:00 2001 From: Thomas Graf <tgraf@suug.ch> Date: Tue, 3 May 2005 14:29:00 -0700 Subject: [PATCH 08/28] [RTNETLINK] Fix & cleanup rtm_min/rtm_max Converts rtm_min and rtm_max arrays to use c99 designated initializers for easier insertion of new message families. RTM_GETMULTICAST and RTM_GETANYCAST did not have the minimal message size specified which means that the netlink message was parsed for routing attributes starting from the header. Adds the proper minimal message sizes for these messages (netlink header + common rtnetlink header) to fix this issue. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/linux/rtnetlink.h | 2 ++ net/core/rtnetlink.c | 39 +++++++++++++++++++++------------------ 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index d607219af6ac9..1ecaea74d55a6 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -93,6 +93,8 @@ enum { #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) }; +#define RTM_FAM(cmd) (((cmd) - RTM_BASE) >> 2) + /* Generic structure for encapsulation of optional route information. It is reminiscent of sockaddr, but with sa_family replaced diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d8c198e42f905..58a981806268a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -88,28 +88,31 @@ struct rtnetlink_link * rtnetlink_links[NPROTO]; static const int rtm_min[(RTM_MAX+1-RTM_BASE)/4] = { - NLMSG_LENGTH(sizeof(struct ifinfomsg)), - NLMSG_LENGTH(sizeof(struct ifaddrmsg)), - NLMSG_LENGTH(sizeof(struct rtmsg)), - NLMSG_LENGTH(sizeof(struct ndmsg)), - NLMSG_LENGTH(sizeof(struct rtmsg)), - NLMSG_LENGTH(sizeof(struct tcmsg)), - NLMSG_LENGTH(sizeof(struct tcmsg)), - NLMSG_LENGTH(sizeof(struct tcmsg)), - NLMSG_LENGTH(sizeof(struct tcamsg)) + [RTM_FAM(RTM_NEWLINK)] = NLMSG_LENGTH(sizeof(struct ifinfomsg)), + [RTM_FAM(RTM_NEWADDR)] = NLMSG_LENGTH(sizeof(struct ifaddrmsg)), + [RTM_FAM(RTM_NEWROUTE)] = NLMSG_LENGTH(sizeof(struct rtmsg)), + [RTM_FAM(RTM_NEWNEIGH)] = NLMSG_LENGTH(sizeof(struct ndmsg)), + [RTM_FAM(RTM_NEWRULE)] = NLMSG_LENGTH(sizeof(struct rtmsg)), + [RTM_FAM(RTM_NEWQDISC)] = NLMSG_LENGTH(sizeof(struct tcmsg)), + [RTM_FAM(RTM_NEWTCLASS)] = NLMSG_LENGTH(sizeof(struct tcmsg)), + [RTM_FAM(RTM_NEWTFILTER)] = NLMSG_LENGTH(sizeof(struct tcmsg)), + [RTM_FAM(RTM_NEWACTION)] = NLMSG_LENGTH(sizeof(struct tcamsg)), + [RTM_FAM(RTM_NEWPREFIX)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)), + [RTM_FAM(RTM_GETMULTICAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)), + [RTM_FAM(RTM_GETANYCAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)), }; static const int rta_max[(RTM_MAX+1-RTM_BASE)/4] = { - IFLA_MAX, - IFA_MAX, - RTA_MAX, - NDA_MAX, - RTA_MAX, - TCA_MAX, - TCA_MAX, - TCA_MAX, - TCAA_MAX + [RTM_FAM(RTM_NEWLINK)] = IFLA_MAX, + [RTM_FAM(RTM_NEWADDR)] = IFA_MAX, + [RTM_FAM(RTM_NEWROUTE)] = RTA_MAX, + [RTM_FAM(RTM_NEWNEIGH)] = NDA_MAX, + [RTM_FAM(RTM_NEWRULE)] = RTA_MAX, + [RTM_FAM(RTM_NEWQDISC)] = TCA_MAX, + [RTM_FAM(RTM_NEWTCLASS)] = TCA_MAX, + [RTM_FAM(RTM_NEWTFILTER)] = TCA_MAX, + [RTM_FAM(RTM_NEWACTION)] = TCAA_MAX, }; void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data) From db46edc6d3b66bf708a8f23a9aa89f63a49ebe33 Mon Sep 17 00:00:00 2001 From: Thomas Graf <tgraf@suug.ch> Date: Tue, 3 May 2005 14:29:39 -0700 Subject: [PATCH 09/28] [RTNETLINK] Cleanup rtnetlink_link tables Converts remaining rtnetlink_link tables to use c99 designated initializers to make greping a little bit easier. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/linux/rtnetlink.h | 2 ++ net/core/rtnetlink.c | 6 +++--- net/decnet/dn_dev.c | 25 +++++++++++++------------ net/ipv4/devinet.c | 21 +++++++++++---------- net/ipv6/addrconf.c | 2 +- 5 files changed, 30 insertions(+), 26 deletions(-) diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 1ecaea74d55a6..91ac97c207771 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -93,6 +93,8 @@ enum { #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) }; +#define RTM_NR_MSGTYPES (RTM_MAX + 1 - RTM_BASE) +#define RTM_NR_FAMILIES (RTM_NR_MSGTYPES >> 2) #define RTM_FAM(cmd) (((cmd) - RTM_BASE) >> 2) /* diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 58a981806268a..5fb70cfa10850 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -86,7 +86,7 @@ struct sock *rtnl; struct rtnetlink_link * rtnetlink_links[NPROTO]; -static const int rtm_min[(RTM_MAX+1-RTM_BASE)/4] = +static const int rtm_min[RTM_NR_FAMILIES] = { [RTM_FAM(RTM_NEWLINK)] = NLMSG_LENGTH(sizeof(struct ifinfomsg)), [RTM_FAM(RTM_NEWADDR)] = NLMSG_LENGTH(sizeof(struct ifaddrmsg)), @@ -102,7 +102,7 @@ static const int rtm_min[(RTM_MAX+1-RTM_BASE)/4] = [RTM_FAM(RTM_GETANYCAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)), }; -static const int rta_max[(RTM_MAX+1-RTM_BASE)/4] = +static const int rta_max[RTM_NR_FAMILIES] = { [RTM_FAM(RTM_NEWLINK)] = IFLA_MAX, [RTM_FAM(RTM_NEWADDR)] = IFA_MAX, @@ -641,7 +641,7 @@ static void rtnetlink_rcv(struct sock *sk, int len) } while (rtnl && rtnl->sk_receive_queue.qlen); } -static struct rtnetlink_link link_rtnetlink_table[RTM_MAX-RTM_BASE+1] = +static struct rtnetlink_link link_rtnetlink_table[RTM_NR_MSGTYPES] = { [RTM_GETLINK - RTM_BASE] = { .dumpit = rtnetlink_dump_ifinfo }, [RTM_SETLINK - RTM_BASE] = { .doit = do_setlink }, diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index c2a0346f423b4..e6e23eb144280 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -1411,21 +1411,22 @@ static struct file_operations dn_dev_seq_fops = { #endif /* CONFIG_PROC_FS */ -static struct rtnetlink_link dnet_rtnetlink_table[RTM_MAX-RTM_BASE+1] = +static struct rtnetlink_link dnet_rtnetlink_table[RTM_NR_MSGTYPES] = { - [4] = { .doit = dn_dev_rtm_newaddr, }, - [5] = { .doit = dn_dev_rtm_deladdr, }, - [6] = { .dumpit = dn_dev_dump_ifaddr, }, - + [RTM_NEWADDR - RTM_BASE] = { .doit = dn_dev_rtm_newaddr, }, + [RTM_DELADDR - RTM_BASE] = { .doit = dn_dev_rtm_deladdr, }, + [RTM_GETADDR - RTM_BASE] = { .dumpit = dn_dev_dump_ifaddr, }, #ifdef CONFIG_DECNET_ROUTER - [8] = { .doit = dn_fib_rtm_newroute, }, - [9] = { .doit = dn_fib_rtm_delroute, }, - [10] = { .doit = dn_cache_getroute, .dumpit = dn_fib_dump, }, - [16] = { .doit = dn_fib_rtm_newrule, }, - [17] = { .doit = dn_fib_rtm_delrule, }, - [18] = { .dumpit = dn_fib_dump_rules, }, + [RTM_NEWROUTE - RTM_BASE] = { .doit = dn_fib_rtm_newroute, }, + [RTM_DELROUTE - RTM_BASE] = { .doit = dn_fib_rtm_delroute, }, + [RTM_GETROUTE - RTM_BASE] = { .doit = dn_cache_getroute, + .dumpit = dn_fib_dump, }, + [RTM_NEWRULE - RTM_BASE] = { .doit = dn_fib_rtm_newrule, }, + [RTM_DELRULE - RTM_BASE] = { .doit = dn_fib_rtm_delrule, }, + [RTM_GETRULE - RTM_BASE] = { .dumpit = dn_fib_dump_rules, }, #else - [10] = { .doit = dn_cache_getroute, .dumpit = dn_cache_dump, }, + [RTM_GETROUTE - RTM_BASE] = { .doit = dn_cache_getroute, + .dumpit = dn_cache_dump, #endif }; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index eea7ef0107767..abbc6d5c183e3 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1107,17 +1107,18 @@ static void rtmsg_ifa(int event, struct in_ifaddr* ifa) } } -static struct rtnetlink_link inet_rtnetlink_table[RTM_MAX - RTM_BASE + 1] = { - [4] = { .doit = inet_rtm_newaddr, }, - [5] = { .doit = inet_rtm_deladdr, }, - [6] = { .dumpit = inet_dump_ifaddr, }, - [8] = { .doit = inet_rtm_newroute, }, - [9] = { .doit = inet_rtm_delroute, }, - [10] = { .doit = inet_rtm_getroute, .dumpit = inet_dump_fib, }, +static struct rtnetlink_link inet_rtnetlink_table[RTM_NR_MSGTYPES] = { + [RTM_NEWADDR - RTM_BASE] = { .doit = inet_rtm_newaddr, }, + [RTM_DELADDR - RTM_BASE] = { .doit = inet_rtm_deladdr, }, + [RTM_GETADDR - RTM_BASE] = { .dumpit = inet_dump_ifaddr, }, + [RTM_NEWROUTE - RTM_BASE] = { .doit = inet_rtm_newroute, }, + [RTM_DELROUTE - RTM_BASE] = { .doit = inet_rtm_delroute, }, + [RTM_GETROUTE - RTM_BASE] = { .doit = inet_rtm_getroute, + .dumpit = inet_dump_fib, }, #ifdef CONFIG_IP_MULTIPLE_TABLES - [16] = { .doit = inet_rtm_newrule, }, - [17] = { .doit = inet_rtm_delrule, }, - [18] = { .dumpit = inet_dump_rules, }, + [RTM_NEWRULE - RTM_BASE] = { .doit = inet_rtm_newrule, }, + [RTM_DELRULE - RTM_BASE] = { .doit = inet_rtm_delrule, }, + [RTM_GETRULE - RTM_BASE] = { .dumpit = inet_dump_rules, }, #endif }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 7196ac2f2d168..7744a2592693f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3076,7 +3076,7 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev, netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_PREFIX, GFP_ATOMIC); } -static struct rtnetlink_link inet6_rtnetlink_table[RTM_MAX - RTM_BASE + 1] = { +static struct rtnetlink_link inet6_rtnetlink_table[RTM_NR_MSGTYPES] = { [RTM_GETLINK - RTM_BASE] = { .dumpit = inet6_dump_ifinfo, }, [RTM_NEWADDR - RTM_BASE] = { .doit = inet6_rtm_newaddr, }, [RTM_DELADDR - RTM_BASE] = { .doit = inet6_rtm_deladdr, }, From 6a5d362120a61a719095443194cc2d5e9a7027dd Mon Sep 17 00:00:00 2001 From: Jesper Juhl <juhl-lkml@dif.dk> Date: Tue, 3 May 2005 14:33:27 -0700 Subject: [PATCH 10/28] [WAN]: kfree of NULL pointer is valid kfree(0) is perfectly valid, checking pointers for NULL before calling kfree() on them is redundant. The patch below cleans away a few such redundant checks (and while I was around some of those bits I couldn't stop myself from making a few tiny whitespace changes as well). Signed-off-by: Jesper Juhl <juhl-lkml@dif.dk> Acked-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/wan/cycx_x25.c | 8 ++------ drivers/net/wan/pc300_tty.c | 27 +++++++++++---------------- drivers/net/wan/sdla_chdlc.c | 13 ++++--------- drivers/net/wan/x25_asy.c | 20 ++++++-------------- 4 files changed, 23 insertions(+), 45 deletions(-) diff --git a/drivers/net/wan/cycx_x25.c b/drivers/net/wan/cycx_x25.c index 5b48cd8568f51..02d57c0b4243a 100644 --- a/drivers/net/wan/cycx_x25.c +++ b/drivers/net/wan/cycx_x25.c @@ -436,9 +436,7 @@ static int cycx_wan_new_if(struct wan_device *wandev, struct net_device *dev, } if (err) { - if (chan->local_addr) - kfree(chan->local_addr); - + kfree(chan->local_addr); kfree(chan); return err; } @@ -458,9 +456,7 @@ static int cycx_wan_del_if(struct wan_device *wandev, struct net_device *dev) struct cycx_x25_channel *chan = dev->priv; if (chan->svc) { - if (chan->local_addr) - kfree(chan->local_addr); - + kfree(chan->local_addr); if (chan->state == WAN_CONNECTED) del_timer(&chan->timer); } diff --git a/drivers/net/wan/pc300_tty.c b/drivers/net/wan/pc300_tty.c index 29f84ad087303..8454bf6caaa70 100644 --- a/drivers/net/wan/pc300_tty.c +++ b/drivers/net/wan/pc300_tty.c @@ -400,10 +400,8 @@ static void cpc_tty_close(struct tty_struct *tty, struct file *flip) cpc_tty->buf_rx.last = NULL; } - if (cpc_tty->buf_tx) { - kfree(cpc_tty->buf_tx); - cpc_tty->buf_tx = NULL; - } + kfree(cpc_tty->buf_tx); + cpc_tty->buf_tx = NULL; CPC_TTY_DBG("%s: TTY closed\n",cpc_tty->name); @@ -666,7 +664,7 @@ static void cpc_tty_rx_work(void * data) unsigned long port; int i, j; st_cpc_tty_area *cpc_tty; - volatile st_cpc_rx_buf * buf; + volatile st_cpc_rx_buf *buf; char flags=0,flg_rx=1; struct tty_ldisc *ld; @@ -680,9 +678,9 @@ static void cpc_tty_rx_work(void * data) cpc_tty = &cpc_tty_area[port]; if ((buf=cpc_tty->buf_rx.first) != 0) { - if(cpc_tty->tty) { + if (cpc_tty->tty) { ld = tty_ldisc_ref(cpc_tty->tty); - if(ld) { + if (ld) { if (ld->receive_buf) { CPC_TTY_DBG("%s: call line disc. receive_buf\n",cpc_tty->name); ld->receive_buf(cpc_tty->tty, (char *)(buf->data), &flags, buf->size); @@ -691,7 +689,7 @@ static void cpc_tty_rx_work(void * data) } } cpc_tty->buf_rx.first = cpc_tty->buf_rx.first->next; - kfree((unsigned char *)buf); + kfree(buf); buf = cpc_tty->buf_rx.first; flg_rx = 1; } @@ -733,7 +731,7 @@ static void cpc_tty_rx_disc_frame(pc300ch_t *pc300chan) void cpc_tty_receive(pc300dev_t *pc300dev) { - st_cpc_tty_area *cpc_tty; + st_cpc_tty_area *cpc_tty; pc300ch_t *pc300chan = (pc300ch_t *)pc300dev->chan; pc300_t *card = (pc300_t *)pc300chan->card; int ch = pc300chan->channel; @@ -742,7 +740,7 @@ void cpc_tty_receive(pc300dev_t *pc300dev) int rx_len, rx_aux; volatile unsigned char status; unsigned short first_bd = pc300chan->rx_first_bd; - st_cpc_rx_buf *new=NULL; + st_cpc_rx_buf *new = NULL; unsigned char dsr_rx; if (pc300dev->cpc_tty == NULL) { @@ -762,7 +760,7 @@ void cpc_tty_receive(pc300dev_t *pc300dev) if (status & DST_EOM) { break; } - ptdescr=(pcsca_bd_t __iomem *)(card->hw.rambase+cpc_readl(&ptdescr->next)); + ptdescr = (pcsca_bd_t __iomem *)(card->hw.rambase+cpc_readl(&ptdescr->next)); } if (!rx_len) { @@ -771,10 +769,7 @@ void cpc_tty_receive(pc300dev_t *pc300dev) cpc_writel(card->hw.scabase + DRX_REG(EDAL, ch), RX_BD_ADDR(ch, pc300chan->rx_last_bd)); } - if (new) { - kfree(new); - new = NULL; - } + kfree(new); return; } @@ -787,7 +782,7 @@ void cpc_tty_receive(pc300dev_t *pc300dev) continue; } - new = (st_cpc_rx_buf *) kmalloc(rx_len + sizeof(st_cpc_rx_buf), GFP_ATOMIC); + new = (st_cpc_rx_buf *)kmalloc(rx_len + sizeof(st_cpc_rx_buf), GFP_ATOMIC); if (new == 0) { cpc_tty_rx_disc_frame(pc300chan); continue; diff --git a/drivers/net/wan/sdla_chdlc.c b/drivers/net/wan/sdla_chdlc.c index afbe0024e3e15..496d29237e927 100644 --- a/drivers/net/wan/sdla_chdlc.c +++ b/drivers/net/wan/sdla_chdlc.c @@ -3664,15 +3664,10 @@ static void wanpipe_tty_close(struct tty_struct *tty, struct file * filp) chdlc_disable_comm_shutdown(card); unlock_adapter_irq(&card->wandev.lock,&smp_flags); - if (card->tty_buf){ - kfree(card->tty_buf); - card->tty_buf=NULL; - } - - if (card->tty_rx){ - kfree(card->tty_rx); - card->tty_rx=NULL; - } + kfree(card->tty_buf); + card->tty_buf = NULL; + kfree(card->tty_rx); + card->tty_rx = NULL; } return; } diff --git a/drivers/net/wan/x25_asy.c b/drivers/net/wan/x25_asy.c index 8c5cfcb55826b..1c540d825551f 100644 --- a/drivers/net/wan/x25_asy.c +++ b/drivers/net/wan/x25_asy.c @@ -107,13 +107,9 @@ static struct x25_asy *x25_asy_alloc(void) static void x25_asy_free(struct x25_asy *sl) { /* Free all X.25 frame buffers. */ - if (sl->rbuff) { - kfree(sl->rbuff); - } + kfree(sl->rbuff); sl->rbuff = NULL; - if (sl->xbuff) { - kfree(sl->xbuff); - } + kfree(sl->xbuff); sl->xbuff = NULL; if (!test_and_clear_bit(SLF_INUSE, &sl->flags)) { @@ -134,10 +130,8 @@ static int x25_asy_change_mtu(struct net_device *dev, int newmtu) { printk("%s: unable to grow X.25 buffers, MTU change cancelled.\n", dev->name); - if (xbuff != NULL) - kfree(xbuff); - if (rbuff != NULL) - kfree(rbuff); + kfree(xbuff); + kfree(rbuff); return -ENOMEM; } @@ -169,10 +163,8 @@ static int x25_asy_change_mtu(struct net_device *dev, int newmtu) spin_unlock_bh(&sl->lock); - if (xbuff != NULL) - kfree(xbuff); - if (rbuff != NULL) - kfree(rbuff); + kfree(xbuff); + kfree(rbuff); return 0; } From 20cc6befa23bb993cf4a4c58becb1dd99e7fc927 Mon Sep 17 00:00:00 2001 From: Lucas Correia Villa Real <lucasvr@gobolinux.org> Date: Tue, 3 May 2005 14:34:20 -0700 Subject: [PATCH 11/28] [PKT_SCHED]: fix typo on Kconfig This is a trivial fix for a typo on Kconfig, where the Generic Random Early Detection algorithm is abbreviated as RED instead of GRED. Signed-off-by: Lucas Correia Villa Real <lucasvr@gobolinux.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/sched/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 9c118baed9dc5..b0941186f8677 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -185,7 +185,7 @@ config NET_SCH_GRED depends on NET_SCHED help Say Y here if you want to use the Generic Random Early Detection - (RED) packet scheduling algorithm for some of your network devices + (GRED) packet scheduling algorithm for some of your network devices (see the top of <file:net/sched/sch_red.c> for details and references about the algorithm). From 0b2531bdc54e19717de5cb161d57e5ee0a7725ff Mon Sep 17 00:00:00 2001 From: Folkert van Heusden <folkert@vanheusden.com> Date: Tue, 3 May 2005 14:36:08 -0700 Subject: [PATCH 12/28] [TCP]: Optimize check in port-allocation code. Signed-off-by: Folkert van Heusden <folkert@vanheusden.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/tcp_ipv4.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3ac6659869c41..dad98e4a50431 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -222,10 +222,13 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum) int rover; spin_lock(&tcp_portalloc_lock); - rover = tcp_port_rover; + if (tcp_port_rover < low) + rover = low; + else + rover = tcp_port_rover; do { rover++; - if (rover < low || rover > high) + if (rover > high) rover = low; head = &tcp_bhash[tcp_bhashfn(rover)]; spin_lock(&head->lock); From c3924c70dd3bddc28b99ccd1688bd281bad1a9be Mon Sep 17 00:00:00 2001 From: Folkert van Heusden <folkert@vanheusden.com> Date: Tue, 3 May 2005 14:36:45 -0700 Subject: [PATCH 13/28] [TCP]: Optimize check in port-allocation code, v6 version. Signed-off-by: Folkert van Heusden <folkert@vanheusden.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv6/tcp_ipv6.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4760c85e19db8..0f69e800a0ad6 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -139,9 +139,12 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) int rover; spin_lock(&tcp_portalloc_lock); - rover = tcp_port_rover; + if (tcp_port_rover < low) + rover = low; + else + rover = tcp_port_rover; do { rover++; - if ((rover < low) || (rover > high)) + if (rover > high) rover = low; head = &tcp_bhash[tcp_bhashfn(rover)]; spin_lock(&head->lock); From 96edf83c4e284c08584f97623f7c7f029759459e Mon Sep 17 00:00:00 2001 From: Jesper Juhl <juhl-lkml@dif.dk> Date: Tue, 3 May 2005 14:38:09 -0700 Subject: [PATCH 14/28] [PPP]: remove redundant NULL pointer checks before kfree & vfree kfree() and vfree() can both deal with NULL pointers. This patch removes redundant NULL pointer checks from the ppp code in drivers/net/ Signed-off-by: Jesper Juhl <juhl-lkml@dif.dk> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/ppp_deflate.c | 6 ++---- drivers/net/ppp_generic.c | 12 ++++-------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/net/ppp_deflate.c b/drivers/net/ppp_deflate.c index 507d6328d4eb0..3872088fdd10c 100644 --- a/drivers/net/ppp_deflate.c +++ b/drivers/net/ppp_deflate.c @@ -87,8 +87,7 @@ static void z_comp_free(void *arg) if (state) { zlib_deflateEnd(&state->strm); - if (state->strm.workspace) - vfree(state->strm.workspace); + vfree(state->strm.workspace); kfree(state); } } @@ -308,8 +307,7 @@ static void z_decomp_free(void *arg) if (state) { zlib_inflateEnd(&state->strm); - if (state->strm.workspace) - kfree(state->strm.workspace); + kfree(state->strm.workspace); kfree(state); } } diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c index c456dc81b8730..3b377f6cd4a0b 100644 --- a/drivers/net/ppp_generic.c +++ b/drivers/net/ppp_generic.c @@ -2467,14 +2467,10 @@ static void ppp_destroy_interface(struct ppp *ppp) skb_queue_purge(&ppp->mrq); #endif /* CONFIG_PPP_MULTILINK */ #ifdef CONFIG_PPP_FILTER - if (ppp->pass_filter) { - kfree(ppp->pass_filter); - ppp->pass_filter = NULL; - } - if (ppp->active_filter) { - kfree(ppp->active_filter); - ppp->active_filter = NULL; - } + kfree(ppp->pass_filter); + ppp->pass_filter = NULL; + kfree(ppp->active_filter); + ppp->active_filter = NULL; #endif /* CONFIG_PPP_FILTER */ kfree(ppp); From 033d899904792d3501b7dd469495ca9138424ec3 Mon Sep 17 00:00:00 2001 From: Asim Shankar <asimshankar@gmail.com> Date: Tue, 3 May 2005 14:39:33 -0700 Subject: [PATCH 15/28] [PKT_SCHED]: HTB: Drop packet when direct queue is full htb_enqueue(): Free skb and return NET_XMIT_DROP if a packet is destined for the direct_queue but the direct_queue is full. (Before this: erroneously returned NET_XMIT_SUCCESS even though the packet was not enqueued) Signed-off-by: Asim Shankar <asimshankar@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/sched/sch_htb.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index a85935e7d53d2..558cc087e6023 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -717,6 +717,10 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (q->direct_queue.qlen < q->direct_qlen) { __skb_queue_tail(&q->direct_queue, skb); q->direct_pkts++; + } else { + kfree_skb(skb); + sch->qstats.drops++; + return NET_XMIT_DROP; } #ifdef CONFIG_NET_CLS_ACT } else if (!cl) { From 9dfa277f88388a94993b121db46b80df66f48d9e Mon Sep 17 00:00:00 2001 From: Patrick McHardy <kaber@trash.net> Date: Tue, 3 May 2005 14:41:18 -0700 Subject: [PATCH 16/28] [PKT_SCHED]: Fix range in PSCHED_TDIFF_SAFE to 0..bound Signed-off-by: Patrick McHardy <kaber@trash.net> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/net/pkt_sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 7352e455053cc..fcb05a387dbee 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -157,7 +157,8 @@ psched_tod_diff(int delta_sec, int bound) case 1: \ __delta += 1000000; \ case 0: \ - __delta = abs(__delta); \ + if (__delta > bound || __delta < 0) \ + __delta = bound; \ } \ __delta; \ }) From 96c36023434b7b6824b1da72a6b7b1ca61d7310c Mon Sep 17 00:00:00 2001 From: Herbert Xu <herbert@gondor.apana.org.au> Date: Tue, 3 May 2005 14:43:27 -0700 Subject: [PATCH 17/28] [NETLINK]: cb_lock does not needs ref count on sk Here is a little optimisation for the cb_lock used by netlink_dump. While fixing that race earlier, I noticed that the reference count held by cb_lock is completely useless. The reason is that in order to obtain the protection of the reference count, you have to take the cb_lock. But the only way to take the cb_lock is through dereferencing the socket. That is, you must already possess a reference count on the socket before you can take advantage of the reference count held by cb_lock. As a corollary, we can remve the reference count held by the cb_lock. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/netlink/af_netlink.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 29a5fd231eac9..4ee392066148e 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -373,7 +373,6 @@ static int netlink_release(struct socket *sock) nlk->cb->done(nlk->cb); netlink_destroy_callback(nlk->cb); nlk->cb = NULL; - __sock_put(sk); } spin_unlock(&nlk->cb_lock); @@ -1099,7 +1098,6 @@ static int netlink_dump(struct sock *sk) spin_unlock(&nlk->cb_lock); netlink_destroy_callback(cb); - __sock_put(sk); return 0; } @@ -1138,7 +1136,6 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, return -EBUSY; } nlk->cb = cb; - sock_hold(sk); spin_unlock(&nlk->cb_lock); netlink_dump(sk); From 2a0a6ebee1d68552152ae8d4aeda91d806995dec Mon Sep 17 00:00:00 2001 From: Herbert Xu <herbert@gondor.apana.org.au> Date: Tue, 3 May 2005 14:55:09 -0700 Subject: [PATCH 18/28] [NETLINK]: Synchronous message processing. Let's recap the problem. The current asynchronous netlink kernel message processing is vulnerable to these attacks: 1) Hit and run: Attacker sends one or more messages and then exits before they're processed. This may confuse/disable the next netlink user that gets the netlink address of the attacker since it may receive the responses to the attacker's messages. Proposed solutions: a) Synchronous processing. b) Stream mode socket. c) Restrict/prohibit binding. 2) Starvation: Because various netlink rcv functions were written to not return until all messages have been processed on a socket, it is possible for these functions to execute for an arbitrarily long period of time. If this is successfully exploited it could also be used to hold rtnl forever. Proposed solutions: a) Synchronous processing. b) Stream mode socket. Firstly let's cross off solution c). It only solves the first problem and it has user-visible impacts. In particular, it'll break user space applications that expect to bind or communicate with specific netlink addresses (pid's). So we're left with a choice of synchronous processing versus SOCK_STREAM for netlink. For the moment I'm sticking with the synchronous approach as suggested by Alexey since it's simpler and I'd rather spend my time working on other things. However, it does have a number of deficiencies compared to the stream mode solution: 1) User-space to user-space netlink communication is still vulnerable. 2) Inefficient use of resources. This is especially true for rtnetlink since the lock is shared with other users such as networking drivers. The latter could hold the rtnl while communicating with hardware which causes the rtnetlink user to wait when it could be doing other things. 3) It is still possible to DoS all netlink users by flooding the kernel netlink receive queue. The attacker simply fills the receive socket with a single netlink message that fills up the entire queue. The attacker then continues to call sendmsg with the same message in a loop. Point 3) can be countered by retransmissions in user-space code, however it is pretty messy. In light of these problems (in particular, point 3), we should implement stream mode netlink at some point. In the mean time, here is a patch that implements synchronous processing. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: David S. Miller <davem@davemloft.net> --- kernel/audit.c | 19 ++++++++----------- net/core/rtnetlink.c | 23 ++++++++++++++--------- net/decnet/netfilter/dn_rtmsg.c | 3 ++- net/ipv4/netfilter/ip_queue.c | 20 +++++++++----------- net/ipv4/tcp_diag.c | 3 ++- net/ipv6/netfilter/ip6_queue.c | 20 +++++++++----------- net/xfrm/xfrm_user.c | 15 +++++++++++---- 7 files changed, 55 insertions(+), 48 deletions(-) diff --git a/kernel/audit.c b/kernel/audit.c index 0f84dd7af2c8d..ac26d4d960d33 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -427,7 +427,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) /* Get message from skb (based on rtnetlink_rcv_skb). Each message is * processed by audit_receive_msg. Malformed skbs with wrong length are * discarded silently. */ -static int audit_receive_skb(struct sk_buff *skb) +static void audit_receive_skb(struct sk_buff *skb) { int err; struct nlmsghdr *nlh; @@ -436,7 +436,7 @@ static int audit_receive_skb(struct sk_buff *skb) while (skb->len >= NLMSG_SPACE(0)) { nlh = (struct nlmsghdr *)skb->data; if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) - return 0; + return; rlen = NLMSG_ALIGN(nlh->nlmsg_len); if (rlen > skb->len) rlen = skb->len; @@ -446,23 +446,20 @@ static int audit_receive_skb(struct sk_buff *skb) netlink_ack(skb, nlh, 0); skb_pull(skb, rlen); } - return 0; } /* Receive messages from netlink socket. */ static void audit_receive(struct sock *sk, int length) { struct sk_buff *skb; + unsigned int qlen; - if (down_trylock(&audit_netlink_sem)) - return; + down(&audit_netlink_sem); - /* FIXME: this must not cause starvation */ - while ((skb = skb_dequeue(&sk->sk_receive_queue))) { - if (audit_receive_skb(skb) && skb->len) - skb_queue_head(&sk->sk_receive_queue, skb); - else - kfree_skb(skb); + for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { + skb = skb_dequeue(&sk->sk_receive_queue); + audit_receive_skb(skb); + kfree_skb(skb); } up(&audit_netlink_sem); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 5fb70cfa10850..6e1ab1e34b2ec 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -609,26 +609,31 @@ static inline int rtnetlink_rcv_skb(struct sk_buff *skb) /* * rtnetlink input queue processing routine: - * - try to acquire shared lock. If it is failed, defer processing. + * - process as much as there was in the queue upon entry. * - feed skbs to rtnetlink_rcv_skb, until it refuse a message, - * that will occur, when a dump started and/or acquisition of - * exclusive lock failed. + * that will occur, when a dump started. */ static void rtnetlink_rcv(struct sock *sk, int len) { + unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); + do { struct sk_buff *skb; - if (rtnl_shlock_nowait()) - return; + rtnl_lock(); + + if (qlen > skb_queue_len(&sk->sk_receive_queue)) + qlen = skb_queue_len(&sk->sk_receive_queue); - while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + while (qlen--) { + skb = skb_dequeue(&sk->sk_receive_queue); if (rtnetlink_rcv_skb(skb)) { - if (skb->len) + if (skb->len) { skb_queue_head(&sk->sk_receive_queue, skb); - else + qlen++; + } else kfree_skb(skb); break; } @@ -638,7 +643,7 @@ static void rtnetlink_rcv(struct sock *sk, int len) up(&rtnl_sem); netdev_run_todo(); - } while (rtnl && rtnl->sk_receive_queue.qlen); + } while (qlen); } static struct rtnetlink_link link_rtnetlink_table[RTM_NR_MSGTYPES] = diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index f86a6259fd121..101ddef9ba9aa 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -119,8 +119,9 @@ static inline void dnrmg_receive_user_skb(struct sk_buff *skb) static void dnrmg_receive_user_sk(struct sock *sk, int len) { struct sk_buff *skb; + unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); - while((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) { dnrmg_receive_user_skb(skb); kfree_skb(skb); } diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 9e40dffc204f3..e5746b6744134 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -546,20 +546,18 @@ ipq_rcv_skb(struct sk_buff *skb) static void ipq_rcv_sk(struct sock *sk, int len) { - do { - struct sk_buff *skb; + struct sk_buff *skb; + unsigned int qlen; - if (down_trylock(&ipqnl_sem)) - return; + down(&ipqnl_sem); - while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { - ipq_rcv_skb(skb); - kfree_skb(skb); - } + for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { + skb = skb_dequeue(&sk->sk_receive_queue); + ipq_rcv_skb(skb); + kfree_skb(skb); + } - up(&ipqnl_sem); - - } while (ipqnl && ipqnl->sk_receive_queue.qlen); + up(&ipqnl_sem); } static int diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 313c1408da33a..8faa8948f75c2 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -777,8 +777,9 @@ static inline void tcpdiag_rcv_skb(struct sk_buff *skb) static void tcpdiag_rcv(struct sock *sk, int len) { struct sk_buff *skb; + unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); - while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) { tcpdiag_rcv_skb(skb); kfree_skb(skb); } diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index c54830b895939..750943e2d34ee 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -549,20 +549,18 @@ ipq_rcv_skb(struct sk_buff *skb) static void ipq_rcv_sk(struct sock *sk, int len) { - do { - struct sk_buff *skb; + struct sk_buff *skb; + unsigned int qlen; - if (down_trylock(&ipqnl_sem)) - return; + down(&ipqnl_sem); - while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { - ipq_rcv_skb(skb); - kfree_skb(skb); - } + for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { + skb = skb_dequeue(&sk->sk_receive_queue); + ipq_rcv_skb(skb); + kfree_skb(skb); + } - up(&ipqnl_sem); - - } while (ipqnl && ipqnl->sk_receive_queue.qlen); + up(&ipqnl_sem); } static int diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 52b5843937c58..dab112f1dd8a8 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1008,17 +1008,24 @@ static int xfrm_user_rcv_skb(struct sk_buff *skb) static void xfrm_netlink_rcv(struct sock *sk, int len) { + unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); + do { struct sk_buff *skb; down(&xfrm_cfg_sem); - while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + if (qlen > skb_queue_len(&sk->sk_receive_queue)) + qlen = skb_queue_len(&sk->sk_receive_queue); + + while (qlen--) { + skb = skb_dequeue(&sk->sk_receive_queue); if (xfrm_user_rcv_skb(skb)) { - if (skb->len) + if (skb->len) { skb_queue_head(&sk->sk_receive_queue, skb); - else + qlen++; + } else kfree_skb(skb); break; } @@ -1027,7 +1034,7 @@ static void xfrm_netlink_rcv(struct sock *sk, int len) up(&xfrm_cfg_sem); - } while (xfrm_nl && xfrm_nl->sk_receive_queue.qlen); + } while (qlen); } static int build_expire(struct sk_buff *skb, struct xfrm_state *x, int hard) From 09e14305982efc2f3b509d3c50ef5dcbff64a998 Mon Sep 17 00:00:00 2001 From: "David S. Miller" <davem@sunset.davemloft.net> Date: Tue, 3 May 2005 15:30:05 -0700 Subject: [PATCH 19/28] [NETLINK]: Fix infinite loops in synchronous netlink changes. The qlen should continue to decrement, even if we pop partially processed SKBs back onto the receive queue. Signed-off-by: David S. Miller <davem@davemloft.net> --- net/core/rtnetlink.c | 7 +++---- net/decnet/netfilter/dn_rtmsg.c | 2 +- net/xfrm/xfrm_user.c | 7 +++---- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 6e1ab1e34b2ec..75b6d33b52924 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -626,14 +626,13 @@ static void rtnetlink_rcv(struct sock *sk, int len) if (qlen > skb_queue_len(&sk->sk_receive_queue)) qlen = skb_queue_len(&sk->sk_receive_queue); - while (qlen--) { + for (; qlen; qlen--) { skb = skb_dequeue(&sk->sk_receive_queue); if (rtnetlink_rcv_skb(skb)) { - if (skb->len) { + if (skb->len) skb_queue_head(&sk->sk_receive_queue, skb); - qlen++; - } else + else kfree_skb(skb); break; } diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index 101ddef9ba9aa..284a9998e53d7 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -121,7 +121,7 @@ static void dnrmg_receive_user_sk(struct sock *sk, int len) struct sk_buff *skb; unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); - while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) { + for (; qlen && (skb = skb_dequeue(&sk->sk_receive_queue)); qlen--) { dnrmg_receive_user_skb(skb); kfree_skb(skb); } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index dab112f1dd8a8..e8740a4a1d784 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1018,14 +1018,13 @@ static void xfrm_netlink_rcv(struct sock *sk, int len) if (qlen > skb_queue_len(&sk->sk_receive_queue)) qlen = skb_queue_len(&sk->sk_receive_queue); - while (qlen--) { + for (; qlen; qlen--) { skb = skb_dequeue(&sk->sk_receive_queue); if (xfrm_user_rcv_skb(skb)) { - if (skb->len) { + if (skb->len) skb_queue_head(&sk->sk_receive_queue, skb); - qlen++; - } else + else kfree_skb(skb); break; } From 0f4821e7b93fe72e89b8ff393bd8e705bd178aa5 Mon Sep 17 00:00:00 2001 From: "David S. Miller" <davem@sunset.davemloft.net> Date: Tue, 3 May 2005 16:15:59 -0700 Subject: [PATCH 20/28] [XFRM/RTNETLINK]: Decrement qlen properly in {xfrm_,rt}netlink_rcv(). If we free up a partially processed packet because it's skb->len dropped to zero, we need to decrement qlen because we are dropping out of the top-level loop so it will do the decrement for us. Spotted by Herbert Xu. Signed-off-by: David S. Miller <davem@davemloft.net> --- net/core/rtnetlink.c | 4 +++- net/xfrm/xfrm_user.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 75b6d33b52924..00caf4b318b20 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -632,8 +632,10 @@ static void rtnetlink_rcv(struct sock *sk, int len) if (skb->len) skb_queue_head(&sk->sk_receive_queue, skb); - else + else { kfree_skb(skb); + qlen--; + } break; } kfree_skb(skb); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index e8740a4a1d784..5ddda2c98af9c 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1024,8 +1024,10 @@ static void xfrm_netlink_rcv(struct sock *sk, int len) if (skb->len) skb_queue_head(&sk->sk_receive_queue, skb); - else + else { kfree_skb(skb); + qlen--; + } break; } kfree_skb(skb); From cacaddf57ed4d5ca994e9a7e2bd5558061f5d89d Mon Sep 17 00:00:00 2001 From: "Tommy S. Christensen" <tommy.christensen@tpack.net> Date: Tue, 3 May 2005 16:18:52 -0700 Subject: [PATCH 21/28] [NET]: Disable queueing when carrier is lost. Some network drivers call netif_stop_queue() when detecting loss of carrier. This leads to packets being queued up at the qdisc level for an unbound period of time. In order to prevent this effect, the core networking stack will now cease to queue packets for any device, that is operationally down (i.e. the queue is flushed and disabled). Signed-off-by: Tommy S. Christensen <tommy.christensen@tpack.net> Acked-by: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/core/link_watch.c | 7 +++++++ net/sched/sch_generic.c | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 4859b7446c6fc..d43d1201275c1 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -16,6 +16,7 @@ #include <linux/netdevice.h> #include <linux/if.h> #include <net/sock.h> +#include <net/pkt_sched.h> #include <linux/rtnetlink.h> #include <linux/jiffies.h> #include <linux/spinlock.h> @@ -74,6 +75,12 @@ void linkwatch_run_queue(void) clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); if (dev->flags & IFF_UP) { + if (netif_carrier_ok(dev)) { + WARN_ON(dev->qdisc_sleeping == &noop_qdisc); + dev_activate(dev); + } else + dev_deactivate(dev); + netdev_state_change(dev); } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 8c01e023f02e7..9a2f8e41a26ee 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -539,6 +539,10 @@ void dev_activate(struct net_device *dev) write_unlock_bh(&qdisc_tree_lock); } + if (!netif_carrier_ok(dev)) + /* Delay activation until next carrier-on event */ + return; + spin_lock_bh(&dev->queue_lock); rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping); if (dev->qdisc != &noqueue_qdisc) { From e4f8ab00cf3599ecb8110c0a838cd15d013b79e5 Mon Sep 17 00:00:00 2001 From: Patrick McHardy <kaber@trash.net> Date: Tue, 3 May 2005 16:20:39 -0700 Subject: [PATCH 22/28] [NETFILTER]: Fix nf_debug_ip_local_deliver() Signed-off-by: Patrick McHardy <kaber@trash.net> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/core/netfilter.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/net/core/netfilter.c b/net/core/netfilter.c index e51cfa46950cf..92c51824797dd 100644 --- a/net/core/netfilter.c +++ b/net/core/netfilter.c @@ -217,21 +217,10 @@ void nf_debug_ip_local_deliver(struct sk_buff *skb) * NF_IP_RAW_INPUT and NF_IP_PRE_ROUTING. */ if (!skb->dev) { printk("ip_local_deliver: skb->dev is NULL.\n"); - } - else if (strcmp(skb->dev->name, "lo") == 0) { - if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT) - | (1 << NF_IP_POST_ROUTING) - | (1 << NF_IP_PRE_ROUTING) - | (1 << NF_IP_LOCAL_IN))) { - printk("ip_local_deliver: bad loopback skb: "); - debug_print_hooks_ip(skb->nf_debug); - nf_dump_skb(PF_INET, skb); - } - } - else { + } else { if (skb->nf_debug != ((1<<NF_IP_PRE_ROUTING) | (1<<NF_IP_LOCAL_IN))) { - printk("ip_local_deliver: bad non-lo skb: "); + printk("ip_local_deliver: bad skb: "); debug_print_hooks_ip(skb->nf_debug); nf_dump_skb(PF_INET, skb); } From bd96535b81ad09d7593cc75093534acb984d3dc9 Mon Sep 17 00:00:00 2001 From: Patrick McHardy <kaber@trash.net> Date: Tue, 3 May 2005 16:21:37 -0700 Subject: [PATCH 23/28] [NETFILTER]: Drop conntrack reference in ip_dev_loopback_xmit() Signed-off-by: Patrick McHardy <kaber@trash.net> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/core/netfilter.c | 2 -- net/ipv4/ip_output.c | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/net/core/netfilter.c b/net/core/netfilter.c index 92c51824797dd..22a8f127c4aad 100644 --- a/net/core/netfilter.c +++ b/net/core/netfilter.c @@ -236,8 +236,6 @@ void nf_debug_ip_loopback_xmit(struct sk_buff *newskb) debug_print_hooks_ip(newskb->nf_debug); nf_dump_skb(PF_INET, newskb); } - /* Clear to avoid confusing input check */ - newskb->nf_debug = 0; } void nf_debug_ip_finish_output2(struct sk_buff *skb) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 38f69532a029e..24fe3e00b42b0 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -111,6 +111,7 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb) #ifdef CONFIG_NETFILTER_DEBUG nf_debug_ip_loopback_xmit(newskb); #endif + nf_reset(newskb); netif_rx(newskb); return 0; } From 8cbe1d46d69f9e2c49f284fe0e9aee3387bd2c71 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger <shemminger@osdl.org> Date: Tue, 3 May 2005 16:24:03 -0700 Subject: [PATCH 24/28] [PKT_SCHED]: netetm: trap infinite loop hange on qlen underflow Due to bugs in netem (fixed by later patches), it is possible to get qdisc qlen to go negative. If this happens the CPU ends up spinning forever in qdisc_run(). So add a BUG_ON() to trap it. Signed-off-by: Stephen Hemminger <shemminger@osdl.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/sched/sch_generic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 9a2f8e41a26ee..87e48a4e10513 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -179,6 +179,7 @@ int qdisc_restart(struct net_device *dev) netif_schedule(dev); return 1; } + BUG_ON((int) q->q.qlen < 0); return q->q.qlen; } From 771018e76aaa6474be20a53c20458bcae8b00485 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger <shemminger@osdl.org> Date: Tue, 3 May 2005 16:24:32 -0700 Subject: [PATCH 25/28] [PKT_SCHED]: netetm: make qdisc friendly to outer disciplines Netem currently dumps packets into the queue when timer expires. This patch makes work by self-clocking (more like TBF). It fixes a bug when 0 delay is requested (only doing loss or duplication). Signed-off-by: Stephen Hemminger <shemminger@osdl.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/sched/sch_netem.c | 113 +++++++++++++++++++++++++----------------- 1 file changed, 67 insertions(+), 46 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 31c29deb139d3..864b8d353ffae 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -138,38 +138,78 @@ static long tabledist(unsigned long mu, long sigma, } /* Put skb in the private delayed queue. */ -static int delay_skb(struct Qdisc *sch, struct sk_buff *skb) +static int netem_delay(struct Qdisc *sch, struct sk_buff *skb) { struct netem_sched_data *q = qdisc_priv(sch); - struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb; psched_tdiff_t td; psched_time_t now; PSCHED_GET_TIME(now); td = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist); - PSCHED_TADD2(now, td, cb->time_to_send); /* Always queue at tail to keep packets in order */ if (likely(q->delayed.qlen < q->limit)) { + struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb; + + PSCHED_TADD2(now, td, cb->time_to_send); + + pr_debug("netem_delay: skb=%p now=%llu tosend=%llu\n", skb, + now, cb->time_to_send); + __skb_queue_tail(&q->delayed, skb); - if (!timer_pending(&q->timer)) { - q->timer.expires = jiffies + PSCHED_US2JIFFIE(td); - add_timer(&q->timer); - } return NET_XMIT_SUCCESS; } + pr_debug("netem_delay: queue over limit %d\n", q->limit); + sch->qstats.overlimits++; kfree_skb(skb); return NET_XMIT_DROP; } +/* + * Move a packet that is ready to send from the delay holding + * list to the underlying qdisc. + */ +static int netem_run(struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + psched_time_t now; + + PSCHED_GET_TIME(now); + + skb = skb_peek(&q->delayed); + if (skb) { + const struct netem_skb_cb *cb + = (const struct netem_skb_cb *)skb->cb; + long delay + = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now)); + pr_debug("netem_run: skb=%p delay=%ld\n", skb, delay); + + /* if more time remaining? */ + if (delay > 0) { + mod_timer(&q->timer, jiffies + delay); + return 1; + } + + __skb_unlink(skb, &q->delayed); + + if (q->qdisc->enqueue(skb, q->qdisc)) { + sch->q.qlen--; + sch->qstats.drops++; + } + } + + return 0; +} + static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); struct sk_buff *skb2; int ret; - pr_debug("netem_enqueue skb=%p @%lu\n", skb, jiffies); + pr_debug("netem_enqueue skb=%p\n", skb); /* Random packet drop 0 => none, ~0 => all */ if (q->loss && q->loss >= get_crandom(&q->loss_cor)) { @@ -184,7 +224,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) { pr_debug("netem_enqueue: dup %p\n", skb2); - if (delay_skb(sch, skb2)) { + if (netem_delay(sch, skb2)) { sch->q.qlen++; sch->bstats.bytes += skb2->len; sch->bstats.packets++; @@ -202,7 +242,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) ret = q->qdisc->enqueue(skb, q->qdisc); } else { q->counter = 0; - ret = delay_skb(sch, skb); + ret = netem_delay(sch, skb); + netem_run(sch); } if (likely(ret == NET_XMIT_SUCCESS)) { @@ -241,56 +282,35 @@ static unsigned int netem_drop(struct Qdisc* sch) return len; } -/* Dequeue packet. - * Move all packets that are ready to send from the delay holding - * list to the underlying qdisc, then just call dequeue - */ static struct sk_buff *netem_dequeue(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; + int pending; + + pending = netem_run(sch); skb = q->qdisc->dequeue(q->qdisc); - if (skb) + if (skb) { + pr_debug("netem_dequeue: return skb=%p\n", skb); sch->q.qlen--; + sch->flags &= ~TCQ_F_THROTTLED; + } + else if (pending) { + pr_debug("netem_dequeue: throttling\n"); + sch->flags |= TCQ_F_THROTTLED; + } + return skb; } static void netem_watchdog(unsigned long arg) { struct Qdisc *sch = (struct Qdisc *)arg; - struct netem_sched_data *q = qdisc_priv(sch); - struct net_device *dev = sch->dev; - struct sk_buff *skb; - psched_time_t now; - pr_debug("netem_watchdog: fired @%lu\n", jiffies); - - spin_lock_bh(&dev->queue_lock); - PSCHED_GET_TIME(now); - - while ((skb = skb_peek(&q->delayed)) != NULL) { - const struct netem_skb_cb *cb - = (const struct netem_skb_cb *)skb->cb; - long delay - = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now)); - pr_debug("netem_watchdog: skb %p@%lu %ld\n", - skb, jiffies, delay); - - /* if more time remaining? */ - if (delay > 0) { - mod_timer(&q->timer, jiffies + delay); - break; - } - __skb_unlink(skb, &q->delayed); - - if (q->qdisc->enqueue(skb, q->qdisc)) { - sch->q.qlen--; - sch->qstats.drops++; - } - } - qdisc_run(dev); - spin_unlock_bh(&dev->queue_lock); + pr_debug("netem_watchdog qlen=%d\n", sch->q.qlen); + sch->flags &= ~TCQ_F_THROTTLED; + netif_schedule(sch->dev); } static void netem_reset(struct Qdisc *sch) @@ -301,6 +321,7 @@ static void netem_reset(struct Qdisc *sch) skb_queue_purge(&q->delayed); sch->q.qlen = 0; + sch->flags &= ~TCQ_F_THROTTLED; del_timer_sync(&q->timer); } From d5d75cd6b10ddad2f375b61092754474ad78aec7 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger <shemminger@osdl.org> Date: Tue, 3 May 2005 16:24:57 -0700 Subject: [PATCH 26/28] [PKT_SCHED]: netetm: adjust parent qlen when duplicating Fix qlen underrun when doing duplication with netem. If netem is used as leaf discipline, then the parent needs to be tweaked when packets are duplicated. Signed-off-by: Stephen Hemminger <shemminger@osdl.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/sched/sch_api.c | 1 + net/sched/sch_netem.c | 20 +++++++++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 4323a74eea30b..07977f8f2679b 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1289,6 +1289,7 @@ static int __init pktsched_init(void) subsys_initcall(pktsched_init); +EXPORT_SYMBOL(qdisc_lookup); EXPORT_SYMBOL(qdisc_get_rtab); EXPORT_SYMBOL(qdisc_put_rtab); EXPORT_SYMBOL(register_qdisc); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 864b8d353ffae..e0c9fbe73b15c 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -206,7 +206,6 @@ static int netem_run(struct Qdisc *sch) static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); - struct sk_buff *skb2; int ret; pr_debug("netem_enqueue skb=%p\n", skb); @@ -220,11 +219,21 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) } /* Random duplication */ - if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor) - && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) { - pr_debug("netem_enqueue: dup %p\n", skb2); + if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) { + struct sk_buff *skb2; + + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2 && netem_delay(sch, skb2) == NET_XMIT_SUCCESS) { + struct Qdisc *qp; + + /* Since one packet can generate two packets in the + * queue, the parent's qlen accounting gets confused, + * so fix it. + */ + qp = qdisc_lookup(sch->dev, TC_H_MAJ(sch->parent)); + if (qp) + qp->q.qlen++; - if (netem_delay(sch, skb2)) { sch->q.qlen++; sch->bstats.bytes += skb2->len; sch->bstats.packets++; @@ -253,6 +262,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) } else sch->qstats.drops++; + pr_debug("netem: enqueue ret %d\n", ret); return ret; } From aabc9761b69f1bfa30a78f7005be95cc9cc06175 Mon Sep 17 00:00:00 2001 From: Herbert Xu <herbert@gondor.apana.org.au> Date: Tue, 3 May 2005 16:27:10 -0700 Subject: [PATCH 27/28] [IPSEC]: Store idev entries I found a bug that stopped IPsec/IPv6 from working. About a month ago IPv6 started using rt6i_idev->dev on the cached socket dst entries. If the cached socket dst entry is IPsec, then rt6i_idev will be NULL. Since we want to look at the rt6i_idev of the original route in this case, the easiest fix is to store rt6i_idev in the IPsec dst entry just as we do for a number of other IPv6 route attributes. Unfortunately this means that we need some new code to handle the references to rt6i_idev. That's why this patch is bigger than it would otherwise be. I've also done the same thing for IPv4 since it is conceivable that once these idev attributes start getting used for accounting, we probably need to dereference them for IPv4 IPsec entries too. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/net/xfrm.h | 10 ++++++++++ net/ipv4/xfrm4_policy.c | 42 ++++++++++++++++++++++++++++++++++++++++ net/ipv6/xfrm6_policy.c | 43 +++++++++++++++++++++++++++++++++++++++++ net/xfrm/xfrm_policy.c | 25 ++---------------------- 4 files changed, 97 insertions(+), 23 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 73e9a8ca3d3b7..e142a256d5dc3 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1,6 +1,7 @@ #ifndef _NET_XFRM_H #define _NET_XFRM_H +#include <linux/compiler.h> #include <linux/xfrm.h> #include <linux/spinlock.h> #include <linux/list.h> @@ -516,6 +517,15 @@ struct xfrm_dst u32 child_mtu_cached; }; +static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) +{ + dst_release(xdst->route); + if (likely(xdst->u.dst.xfrm)) + xfrm_state_put(xdst->u.dst.xfrm); +} + +extern void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev); + /* Decapsulation state, used by the input to store data during * decapsulation procedure, to be used later (during the policy * check diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 7fe2afd2e6695..b2b60f3e9cdd6 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -8,7 +8,10 @@ * */ +#include <asm/bug.h> +#include <linux/compiler.h> #include <linux/config.h> +#include <linux/inetdevice.h> #include <net/xfrm.h> #include <net/ip.h> @@ -152,6 +155,8 @@ __xfrm4_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int x->u.rt.rt_dst = rt0->rt_dst; x->u.rt.rt_gateway = rt->rt_gateway; x->u.rt.rt_spec_dst = rt0->rt_spec_dst; + x->u.rt.idev = rt0->idev; + in_dev_hold(rt0->idev); header_len -= x->u.dst.xfrm->props.header_len; trailer_len -= x->u.dst.xfrm->props.trailer_len; } @@ -243,11 +248,48 @@ static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) path->ops->update_pmtu(path, mtu); } +static void xfrm4_dst_destroy(struct dst_entry *dst) +{ + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + + if (likely(xdst->u.rt.idev)) + in_dev_put(xdst->u.rt.idev); + xfrm_dst_destroy(xdst); +} + +static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, + int unregister) +{ + struct xfrm_dst *xdst; + + if (!unregister) + return; + + xdst = (struct xfrm_dst *)dst; + if (xdst->u.rt.idev->dev == dev) { + struct in_device *loopback_idev = in_dev_get(&loopback_dev); + BUG_ON(!loopback_idev); + + do { + in_dev_put(xdst->u.rt.idev); + xdst->u.rt.idev = loopback_idev; + in_dev_hold(loopback_idev); + xdst = (struct xfrm_dst *)xdst->u.dst.child; + } while (xdst->u.dst.xfrm); + + __in_dev_put(loopback_idev); + } + + xfrm_dst_ifdown(dst, dev); +} + static struct dst_ops xfrm4_dst_ops = { .family = AF_INET, .protocol = __constant_htons(ETH_P_IP), .gc = xfrm4_garbage_collect, .update_pmtu = xfrm4_update_pmtu, + .destroy = xfrm4_dst_destroy, + .ifdown = xfrm4_dst_ifdown, .gc_thresh = 1024, .entry_size = sizeof(struct xfrm_dst), }; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 8a4f37de4d2da..4429b1a1fe5fe 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -11,7 +11,11 @@ * */ +#include <asm/bug.h> +#include <linux/compiler.h> #include <linux/config.h> +#include <linux/netdevice.h> +#include <net/addrconf.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/ipv6.h> @@ -166,6 +170,8 @@ __xfrm6_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int memcpy(&x->u.rt6.rt6i_gateway, &rt0->rt6i_gateway, sizeof(x->u.rt6.rt6i_gateway)); x->u.rt6.rt6i_dst = rt0->rt6i_dst; x->u.rt6.rt6i_src = rt0->rt6i_src; + x->u.rt6.rt6i_idev = rt0->rt6i_idev; + in6_dev_hold(rt0->rt6i_idev); header_len -= x->u.dst.xfrm->props.header_len; trailer_len -= x->u.dst.xfrm->props.trailer_len; } @@ -251,11 +257,48 @@ static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu) path->ops->update_pmtu(path, mtu); } +static void xfrm6_dst_destroy(struct dst_entry *dst) +{ + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + + if (likely(xdst->u.rt6.rt6i_idev)) + in6_dev_put(xdst->u.rt6.rt6i_idev); + xfrm_dst_destroy(xdst); +} + +static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, + int unregister) +{ + struct xfrm_dst *xdst; + + if (!unregister) + return; + + xdst = (struct xfrm_dst *)dst; + if (xdst->u.rt6.rt6i_idev->dev == dev) { + struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev); + BUG_ON(!loopback_idev); + + do { + in6_dev_put(xdst->u.rt6.rt6i_idev); + xdst->u.rt6.rt6i_idev = loopback_idev; + in6_dev_hold(loopback_idev); + xdst = (struct xfrm_dst *)xdst->u.dst.child; + } while (xdst->u.dst.xfrm); + + __in6_dev_put(loopback_idev); + } + + xfrm_dst_ifdown(dst, dev); +} + static struct dst_ops xfrm6_dst_ops = { .family = AF_INET6, .protocol = __constant_htons(ETH_P_IPV6), .gc = xfrm6_garbage_collect, .update_pmtu = xfrm6_update_pmtu, + .destroy = xfrm6_dst_destroy, + .ifdown = xfrm6_dst_ifdown, .gc_thresh = 1024, .entry_size = sizeof(struct xfrm_dst), }; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 80828078733dd..55ed979db1445 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1028,30 +1028,15 @@ static int stale_bundle(struct dst_entry *dst) return !xfrm_bundle_ok((struct xfrm_dst *)dst, NULL, AF_UNSPEC); } -static void xfrm_dst_destroy(struct dst_entry *dst) +void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { - struct xfrm_dst *xdst = (struct xfrm_dst *)dst; - - dst_release(xdst->route); - - if (!dst->xfrm) - return; - xfrm_state_put(dst->xfrm); - dst->xfrm = NULL; -} - -static void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev, - int unregister) -{ - if (!unregister) - return; - while ((dst = dst->child) && dst->xfrm && dst->dev == dev) { dst->dev = &loopback_dev; dev_hold(&loopback_dev); dev_put(dev); } } +EXPORT_SYMBOL(xfrm_dst_ifdown); static void xfrm_link_failure(struct sk_buff *skb) { @@ -1262,10 +1247,6 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) dst_ops->kmem_cachep = xfrm_dst_cache; if (likely(dst_ops->check == NULL)) dst_ops->check = xfrm_dst_check; - if (likely(dst_ops->destroy == NULL)) - dst_ops->destroy = xfrm_dst_destroy; - if (likely(dst_ops->ifdown == NULL)) - dst_ops->ifdown = xfrm_dst_ifdown; if (likely(dst_ops->negative_advice == NULL)) dst_ops->negative_advice = xfrm_negative_advice; if (likely(dst_ops->link_failure == NULL)) @@ -1297,8 +1278,6 @@ int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo) xfrm_policy_afinfo[afinfo->family] = NULL; dst_ops->kmem_cachep = NULL; dst_ops->check = NULL; - dst_ops->destroy = NULL; - dst_ops->ifdown = NULL; dst_ops->negative_advice = NULL; dst_ops->link_failure = NULL; dst_ops->get_mss = NULL; From 14d50e78f947d340066ee0465dd892ad1d9162c0 Mon Sep 17 00:00:00 2001 From: J Hadi Salim <hadi@cyberus.ca> Date: Tue, 3 May 2005 16:29:13 -0700 Subject: [PATCH 28/28] [PKT_SCHED]: Action repeat Long standing bug. Policy to repeat an action never worked. Signed-off-by: J Hadi Salim <hadi@cyberus.ca> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/sched/act_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 5e6cc371b39ea..cafcb084098d1 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -171,10 +171,10 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action *act, skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd); } - if (ret != TC_ACT_PIPE) - goto exec_done; if (ret == TC_ACT_REPEAT) goto repeat; /* we need a ttl - JHS */ + if (ret != TC_ACT_PIPE) + goto exec_done; } act = a->next; }