Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next
Pablo Neira Ayuso says:

====================
Netfilter updates for net-next

A small batch with accumulated updates in nf-next, mostly IPVS updates.
They are:

1) Add 64-bit stats counters to IPVS, from Julian Anastasov.

2) Move NETFILTER_XT_MATCH_ADDRTYPE out of NETFILTER_ADVANCED, as Docker
seems to require it, from Anton Blanchard.

3) Use a boolean instead of a numeric value in set_match_v*(), from
coccinelle via Fengguang Wu.

4) Allow rescheduling of new connections in IPVS when port reuse is
detected, from Marcelo Ricardo Leitner.

5) Add missing bits to support arptables extensions from nft_compat,
from Arturo Borrero.

Patrick is preparing a large batch to enhance the set infrastructure and
add named expressions, among other things; it should follow soon after
this batch.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
David S. Miller committed Mar 2, 2015
2 parents 49b31e5 + 5f15893 commit 77f0379
Showing 10 changed files with 326 additions and 152 deletions.
21 changes: 21 additions & 0 deletions Documentation/networking/ipvs-sysctl.txt
@@ -22,6 +22,27 @@ backup_only - BOOLEAN
If set, disable the director function while the server is
in backup mode to avoid packet loops for DR/TUN methods.

conn_reuse_mode - INTEGER
1 - default

Controls how ipvs deals with connections on which port reuse is
detected. It is a bitmap, with the values being:

0: disable any special handling of port reuse. The new
connection will be delivered to the same real server that was
servicing the previous connection. This effectively disables
expire_nodest_conn.

bit 1: enable rescheduling of new connections when it is safe.
That is, whenever expire_nodest_conn applies and, for TCP
sockets, when the connection is in TIME_WAIT state (which is
only possible in NAT mode).

bit 2: same as bit 1, plus, for TCP connections, when the
connection is in FIN_WAIT state, as this is the last state seen
by the load balancer in Direct Routing mode. This bit helps when
adding new real servers to a very busy cluster.
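
As an illustration only (not part of this patch), a tiny userspace check
against the /proc/sys/net/ipv4/vs/conn_reuse_mode path documented here
could decode the bitmap like this:

/* Illustrative only (not part of this commit): decode the
 * conn_reuse_mode bitmap documented above from procfs. */
#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/net/ipv4/vs/conn_reuse_mode";
	FILE *f = fopen(path, "r");
	int mode;

	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%d", &mode) != 1) {
		fprintf(stderr, "could not parse %s\n", path);
		fclose(f);
		return 1;
	}
	fclose(f);

	printf("conn_reuse_mode = %d\n", mode);
	if (mode == 0)
		printf("  port reuse: no special handling\n");
	if (mode & 1)
		printf("  bit 1: reschedule on reuse of TIME_WAIT connections (NAT)\n");
	if (mode & 2)
		printf("  bit 2: also reschedule on reuse of FIN_WAIT connections (DR)\n");
	return 0;
}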

conntrack - BOOLEAN
0 - disabled (default)
not 0 - enabled
61 changes: 44 additions & 17 deletions include/net/ip_vs.h
@@ -365,15 +365,15 @@ struct ip_vs_seq {

/* counters per cpu */
struct ip_vs_counters {
__u32 conns; /* connections scheduled */
__u32 inpkts; /* incoming packets */
__u32 outpkts; /* outgoing packets */
__u64 conns; /* connections scheduled */
__u64 inpkts; /* incoming packets */
__u64 outpkts; /* outgoing packets */
__u64 inbytes; /* incoming bytes */
__u64 outbytes; /* outgoing bytes */
};
/* Stats per cpu */
struct ip_vs_cpu_stats {
struct ip_vs_counters ustats;
struct ip_vs_counters cnt;
struct u64_stats_sync syncp;
};

@@ -383,23 +383,40 @@ struct ip_vs_estimator {

u64 last_inbytes;
u64 last_outbytes;
u32 last_conns;
u32 last_inpkts;
u32 last_outpkts;

u32 cps;
u32 inpps;
u32 outpps;
u32 inbps;
u32 outbps;
u64 last_conns;
u64 last_inpkts;
u64 last_outpkts;

u64 cps;
u64 inpps;
u64 outpps;
u64 inbps;
u64 outbps;
};

/*
* IPVS statistics object, 64-bit kernel version of struct ip_vs_stats_user
*/
struct ip_vs_kstats {
u64 conns; /* connections scheduled */
u64 inpkts; /* incoming packets */
u64 outpkts; /* outgoing packets */
u64 inbytes; /* incoming bytes */
u64 outbytes; /* outgoing bytes */

u64 cps; /* current connection rate */
u64 inpps; /* current in packet rate */
u64 outpps; /* current out packet rate */
u64 inbps; /* current in byte rate */
u64 outbps; /* current out byte rate */
};

struct ip_vs_stats {
struct ip_vs_stats_user ustats; /* statistics */
struct ip_vs_kstats kstats; /* kernel statistics */
struct ip_vs_estimator est; /* estimator */
struct ip_vs_cpu_stats __percpu *cpustats; /* per cpu counters */
spinlock_t lock; /* spin lock */
struct ip_vs_stats_user ustats0; /* reset values */
struct ip_vs_kstats kstats0; /* reset values */
};

struct dst_entry;
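
With the counters widened to 64 bits and guarded by u64_stats_sync,
readers are expected to sample each CPU inside a fetch/retry loop so that
32-bit architectures see consistent values while the packet-path writers
in ip_vs_core.c (below) stay lock-free. A minimal sketch of such a reader,
using the structures above; the helper name is illustrative and is not the
commit's actual aggregation code:

/* Illustrative sketch (not the commit's actual helper): sum the per-cpu
 * 64-bit counters into one ip_vs_kstats.  The fetch/retry loop keeps the
 * 64-bit reads consistent on 32-bit SMP while the writers only pay for
 * u64_stats_update_begin()/end(). */
static void ip_vs_sum_cpu_counters(struct ip_vs_cpu_stats __percpu *cpustats,
				   struct ip_vs_kstats *sum)
{
	u64 conns = 0, inpkts = 0, outpkts = 0, inbytes = 0, outbytes = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct ip_vs_cpu_stats *s = per_cpu_ptr(cpustats, cpu);
		struct ip_vs_counters snap;
		unsigned int start;

		/* Retry until the writer was not mid-update on this CPU. */
		do {
			start = u64_stats_fetch_begin(&s->syncp);
			snap = s->cnt;
		} while (u64_stats_fetch_retry(&s->syncp, start));

		conns    += snap.conns;
		inpkts   += snap.inpkts;
		outpkts  += snap.outpkts;
		inbytes  += snap.inbytes;
		outbytes += snap.outbytes;
	}

	sum->conns    = conns;
	sum->inpkts   = inpkts;
	sum->outpkts  = outpkts;
	sum->inbytes  = inbytes;
	sum->outbytes = outbytes;
}
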
@@ -924,6 +941,7 @@ struct netns_ipvs {
int sysctl_nat_icmp_send;
int sysctl_pmtu_disc;
int sysctl_backup_only;
int sysctl_conn_reuse_mode;

/* ip_vs_lblc */
int sysctl_lblc_expiration;
@@ -1042,6 +1060,11 @@ static inline int sysctl_backup_only(struct netns_ipvs *ipvs)
ipvs->sysctl_backup_only;
}

static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs)
{
return ipvs->sysctl_conn_reuse_mode;
}

#else

static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs)
@@ -1109,6 +1132,11 @@ static inline int sysctl_backup_only(struct netns_ipvs *ipvs)
return 0;
}

static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs)
{
return 1;
}

#endif

/* IPVS core functions
@@ -1388,8 +1416,7 @@ void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts);
void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats);
void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats);
void ip_vs_zero_estimator(struct ip_vs_stats *stats);
void ip_vs_read_estimator(struct ip_vs_stats_user *dst,
struct ip_vs_stats *stats);
void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats);

/* Various IPVS packet transmitters (from ip_vs_xmit.c) */
int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
7 changes: 6 additions & 1 deletion include/uapi/linux/ip_vs.h
@@ -358,6 +358,8 @@ enum {

IPVS_SVC_ATTR_PE_NAME, /* name of ct retriever */

IPVS_SVC_ATTR_STATS64, /* nested attribute for service stats */

__IPVS_SVC_ATTR_MAX,
};

@@ -387,6 +389,8 @@ enum {

IPVS_DEST_ATTR_ADDR_FAMILY, /* Address family of address */

IPVS_DEST_ATTR_STATS64, /* nested attribute for dest stats */

__IPVS_DEST_ATTR_MAX,
};

@@ -410,7 +414,8 @@ enum {
/*
* Attributes used to describe service or destination entry statistics
*
* Used inside nested attributes IPVS_SVC_ATTR_STATS and IPVS_DEST_ATTR_STATS
* Used inside nested attributes IPVS_SVC_ATTR_STATS, IPVS_DEST_ATTR_STATS,
* IPVS_SVC_ATTR_STATS64 and IPVS_DEST_ATTR_STATS64.
*/
enum {
IPVS_STATS_ATTR_UNSPEC = 0,
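The new STATS64 attributes are meant to carry the same IPVS_STATS_ATTR_*
members as the existing STATS nests, only as 64-bit netlink attributes. A
rough sketch of a kernel-side filler, assuming the ip_vs_kstats structure
from ip_vs.h above; the function name is made up and error handling is
kept minimal:

/* Illustrative sketch of filling the new nested 64-bit stats attribute;
 * attribute names come from this header, the helper name is made up. */
static int ip_vs_fill_svc_stats64(struct sk_buff *skb,
				  const struct ip_vs_kstats *k)
{
	struct nlattr *nest = nla_nest_start(skb, IPVS_SVC_ATTR_STATS64);

	if (!nest)
		return -EMSGSIZE;

	if (nla_put_u64(skb, IPVS_STATS_ATTR_CONNS, k->conns) ||
	    nla_put_u64(skb, IPVS_STATS_ATTR_INPKTS, k->inpkts) ||
	    nla_put_u64(skb, IPVS_STATS_ATTR_OUTPKTS, k->outpkts) ||
	    nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, k->inbytes) ||
	    nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, k->outbytes) ||
	    nla_put_u64(skb, IPVS_STATS_ATTR_CPS, k->cps) ||
	    nla_put_u64(skb, IPVS_STATS_ATTR_INPPS, k->inpps) ||
	    nla_put_u64(skb, IPVS_STATS_ATTR_OUTPPS, k->outpps) ||
	    nla_put_u64(skb, IPVS_STATS_ATTR_INBPS, k->inbps) ||
	    nla_put_u64(skb, IPVS_STATS_ATTR_OUTBPS, k->outbps)) {
		nla_nest_cancel(skb, nest);
		return -EMSGSIZE;
	}

	nla_nest_end(skb, nest);
	return 0;
}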
2 changes: 1 addition & 1 deletion net/netfilter/Kconfig
@@ -951,7 +951,7 @@ comment "Xtables matches"

config NETFILTER_XT_MATCH_ADDRTYPE
tristate '"addrtype" address type match support'
depends on NETFILTER_ADVANCED
default m if NETFILTER_ADVANCED=n
---help---
This option allows you to match what routing thinks of an address,
eg. UNICAST, LOCAL, BROADCAST, ...
69 changes: 50 additions & 19 deletions net/netfilter/ipvs/ip_vs_core.c
@@ -119,24 +119,24 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
struct ip_vs_service *svc;

s = this_cpu_ptr(dest->stats.cpustats);
s->ustats.inpkts++;
u64_stats_update_begin(&s->syncp);
s->ustats.inbytes += skb->len;
s->cnt.inpkts++;
s->cnt.inbytes += skb->len;
u64_stats_update_end(&s->syncp);

rcu_read_lock();
svc = rcu_dereference(dest->svc);
s = this_cpu_ptr(svc->stats.cpustats);
s->ustats.inpkts++;
u64_stats_update_begin(&s->syncp);
s->ustats.inbytes += skb->len;
s->cnt.inpkts++;
s->cnt.inbytes += skb->len;
u64_stats_update_end(&s->syncp);
rcu_read_unlock();

s = this_cpu_ptr(ipvs->tot_stats.cpustats);
s->ustats.inpkts++;
u64_stats_update_begin(&s->syncp);
s->ustats.inbytes += skb->len;
s->cnt.inpkts++;
s->cnt.inbytes += skb->len;
u64_stats_update_end(&s->syncp);
}
}
@@ -153,24 +153,24 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
struct ip_vs_service *svc;

s = this_cpu_ptr(dest->stats.cpustats);
s->ustats.outpkts++;
u64_stats_update_begin(&s->syncp);
s->ustats.outbytes += skb->len;
s->cnt.outpkts++;
s->cnt.outbytes += skb->len;
u64_stats_update_end(&s->syncp);

rcu_read_lock();
svc = rcu_dereference(dest->svc);
s = this_cpu_ptr(svc->stats.cpustats);
s->ustats.outpkts++;
u64_stats_update_begin(&s->syncp);
s->ustats.outbytes += skb->len;
s->cnt.outpkts++;
s->cnt.outbytes += skb->len;
u64_stats_update_end(&s->syncp);
rcu_read_unlock();

s = this_cpu_ptr(ipvs->tot_stats.cpustats);
s->ustats.outpkts++;
u64_stats_update_begin(&s->syncp);
s->ustats.outbytes += skb->len;
s->cnt.outpkts++;
s->cnt.outbytes += skb->len;
u64_stats_update_end(&s->syncp);
}
}
@@ -183,13 +183,19 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
struct ip_vs_cpu_stats *s;

s = this_cpu_ptr(cp->dest->stats.cpustats);
s->ustats.conns++;
u64_stats_update_begin(&s->syncp);
s->cnt.conns++;
u64_stats_update_end(&s->syncp);

s = this_cpu_ptr(svc->stats.cpustats);
s->ustats.conns++;
u64_stats_update_begin(&s->syncp);
s->cnt.conns++;
u64_stats_update_end(&s->syncp);

s = this_cpu_ptr(ipvs->tot_stats.cpustats);
s->ustats.conns++;
u64_stats_update_begin(&s->syncp);
s->cnt.conns++;
u64_stats_update_end(&s->syncp);
}


@@ -1046,6 +1052,26 @@ static inline bool is_new_conn(const struct sk_buff *skb,
}
}

static inline bool is_new_conn_expected(const struct ip_vs_conn *cp,
int conn_reuse_mode)
{
/* Controlled (FTP DATA or persistence)? */
if (cp->control)
return false;

switch (cp->protocol) {
case IPPROTO_TCP:
return (cp->state == IP_VS_TCP_S_TIME_WAIT) ||
((conn_reuse_mode & 2) &&
(cp->state == IP_VS_TCP_S_FIN_WAIT) &&
(cp->flags & IP_VS_CONN_F_NOOUTPUT));
case IPPROTO_SCTP:
return cp->state == IP_VS_SCTP_S_CLOSED;
default:
return false;
}
}

/* Handle response packets: rewrite addresses and send away...
*/
static unsigned int
@@ -1585,6 +1611,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
struct ip_vs_conn *cp;
int ret, pkts;
struct netns_ipvs *ipvs;
int conn_reuse_mode;

/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
@@ -1653,10 +1680,14 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
*/
cp = pp->conn_in_get(af, skb, &iph, 0);

if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest &&
unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs &&
is_new_conn(skb, &iph)) {
ip_vs_conn_expire_now(cp);
conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
if (conn_reuse_mode && !iph.fragoffs &&
is_new_conn(skb, &iph) && cp &&
((unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
unlikely(!atomic_read(&cp->dest->weight))) ||
unlikely(is_new_conn_expected(cp, conn_reuse_mode)))) {
if (!atomic_read(&cp->n_control))
ip_vs_conn_expire_now(cp);
__ip_vs_conn_put(cp);
cp = NULL;
}
