Skip to content

Commit

Permalink
Merge branch 'rt_cong_ctrl'
Browse files Browse the repository at this point in the history
Daniel Borkmann says:

====================
net: allow setting congctl via routing table

This is the second part of our work and allows for setting the congestion
control algorithm via routing table. For details, please see individual
patches.

Since patch 1 is a bug fix, we suggest applying patch 1 to net, and then
merging net into net-next, for example, and following up with the remaining
feature patches wrt dependencies.

Joint work with Florian Westphal, suggested by Hannes Frederic Sowa.

Patch for iproute2 is available under [1], but will be reposted along
with the man-page update when this set hits net-next.

  [1] http://patchwork.ozlabs.org/patch/418149/

Thanks!

v2 -> v3:
 - Added module auto-loading as suggested by David Miller, thanks!
  - Added patch 2 for handling possible sleeps in fib6
  - While working on this, we discovered a bug, hence fix in patch 1
  - Added auto-loading to patch 4
 - Rebased, retested, rest the same.
v1 -> v2:
 - Very sorry, I noticed I had decnet disabled during testing.
   Added missing header include in decnet, rest as is.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Jan 6, 2015
2 parents 6cb6974 + 8116441 commit a918eb9
Show file tree
Hide file tree
Showing 15 changed files with 304 additions and 85 deletions.
3 changes: 2 additions & 1 deletion include/net/inet_connection_sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,8 @@ struct inet_connection_sock {
const struct tcp_congestion_ops *icsk_ca_ops;
const struct inet_connection_sock_af_ops *icsk_af_ops;
unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
__u8 icsk_ca_state;
__u8 icsk_ca_state:7,
icsk_ca_dst_locked:1;
__u8 icsk_retransmits;
__u8 icsk_pending;
__u8 icsk_backoff;
Expand Down
10 changes: 7 additions & 3 deletions include/net/ip6_fib.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,11 @@ struct fib6_node {
#define FIB6_SUBTREE(fn) ((fn)->subtree)
#endif

/* Metrics configuration passed to fib6_add() in place of a raw netlink
 * attribute and its length.
 */
struct mx6_config {
const u32 *mx; /* presumably the metric values, one slot per RTAX_* - TODO confirm against fib6 code */
DECLARE_BITMAP(mx_valid, RTAX_MAX); /* RTAX_MAX-bit bitmap; presumably flags which mx slots are set - TODO confirm */
};

/*
* routing information
*
Expand Down Expand Up @@ -291,9 +296,8 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg),
void *arg);

int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
struct nlattr *mx, int mx_len);

int fib6_add(struct fib6_node *root, struct rt6_info *rt,
struct nl_info *info, struct mx6_config *mxc);
int fib6_del(struct rt6_info *rt, struct nl_info *info);

void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info);
Expand Down
22 changes: 21 additions & 1 deletion include/net/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
struct sock *tcp_create_openreq_child(struct sock *sk,
struct request_sock *req,
struct sk_buff *skb);
void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst);
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst);
Expand Down Expand Up @@ -636,6 +637,11 @@ static inline u32 tcp_rto_min_us(struct sock *sk)
return jiffies_to_usecs(tcp_rto_min(sk));
}

/* Report whether the RTAX_CC_ALGO metric of @dst is marked as locked,
 * as determined by dst_metric_locked().
 */
static inline bool tcp_ca_dst_locked(const struct dst_entry *dst)
{
	bool cc_locked = dst_metric_locked(dst, RTAX_CC_ALGO);

	return cc_locked;
}

/* Compute the actual receive window we are currently advertising.
* Rcv_nxt can be after the window if our peer push more data
* than the offered window.
Expand Down Expand Up @@ -787,14 +793,17 @@ enum tcp_ca_ack_event_flags {
#define TCP_CA_MAX 128
#define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX)

#define TCP_CA_UNSPEC 0

/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
#define TCP_CONG_NON_RESTRICTED 0x1
/* Requires ECN/ECT set on all packets */
#define TCP_CONG_NEEDS_ECN 0x2

struct tcp_congestion_ops {
struct list_head list;
unsigned long flags;
u32 key;
u32 flags;

/* initialize private data (optional) */
void (*init)(struct sock *sk);
Expand Down Expand Up @@ -841,6 +850,17 @@ u32 tcp_reno_ssthresh(struct sock *sk);
void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
extern struct tcp_congestion_ops tcp_reno;

/* Look up a registered congestion control algorithm by its u32 key;
 * returns NULL when no registered algorithm carries that key.
 */
struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
/* Translate an algorithm name into its key; yields TCP_CA_UNSPEC when the
 * name cannot be resolved.
 */
u32 tcp_ca_get_key_by_name(const char *name);
#ifdef CONFIG_INET
/* Translate a key back into the algorithm name, copied into @buffer;
 * returns @buffer on success and NULL when the key is unknown.
 */
char *tcp_ca_get_name_by_key(u32 key, char *buffer);
#else
/* Stub for builds without CONFIG_INET: never resolves a name and leaves
 * @buffer untouched.
 */
static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer)
{
return NULL;
}
#endif

static inline bool tcp_ca_needs_ecn(const struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
Expand Down
2 changes: 2 additions & 0 deletions include/uapi/linux/rtnetlink.h
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,8 @@ enum {
#define RTAX_INITRWND RTAX_INITRWND
RTAX_QUICKACK,
#define RTAX_QUICKACK RTAX_QUICKACK
RTAX_CC_ALGO,
#define RTAX_CC_ALGO RTAX_CC_ALGO
__RTAX_MAX
};

Expand Down
15 changes: 13 additions & 2 deletions net/core/rtnetlink.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
#include <net/arp.h>
#include <net/route.h>
#include <net/udp.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/fib_rules.h>
Expand Down Expand Up @@ -669,9 +670,19 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)

for (i = 0; i < RTAX_MAX; i++) {
if (metrics[i]) {
if (i == RTAX_CC_ALGO - 1) {
char tmp[TCP_CA_NAME_MAX], *name;

name = tcp_ca_get_name_by_key(metrics[i], tmp);
if (!name)
continue;
if (nla_put_string(skb, i + 1, name))
goto nla_put_failure;
} else {
if (nla_put_u32(skb, i + 1, metrics[i]))
goto nla_put_failure;
}
valid++;
if (nla_put_u32(skb, i+1, metrics[i]))
goto nla_put_failure;
}
}

Expand Down
3 changes: 2 additions & 1 deletion net/decnet/dn_fib.c
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,8 @@ struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct nlattr *att
int type = nla_type(attr);

if (type) {
if (type > RTAX_MAX || nla_len(attr) < 4)
if (type > RTAX_MAX || type == RTAX_CC_ALGO ||
nla_len(attr) < 4)
goto err_inval;

fi->fib_metrics[type-1] = nla_get_u32(attr);
Expand Down
4 changes: 3 additions & 1 deletion net/decnet/dn_table.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include <linux/route.h> /* RTF_xxx */
#include <net/neighbour.h>
#include <net/netlink.h>
#include <net/tcp.h>
#include <net/dst.h>
#include <net/flow.h>
#include <net/fib_rules.h>
Expand Down Expand Up @@ -273,7 +274,8 @@ static inline size_t dn_fib_nlmsg_size(struct dn_fib_info *fi)
size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
+ nla_total_size(4) /* RTA_TABLE */
+ nla_total_size(2) /* RTA_DST */
+ nla_total_size(4); /* RTA_PRIORITY */
+ nla_total_size(4) /* RTA_PRIORITY */
+ nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */

/* space for nested metrics */
payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
Expand Down
14 changes: 12 additions & 2 deletions net/ipv4/fib_semantics.c
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,8 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
+ nla_total_size(4) /* RTA_TABLE */
+ nla_total_size(4) /* RTA_DST */
+ nla_total_size(4) /* RTA_PRIORITY */
+ nla_total_size(4); /* RTA_PREFSRC */
+ nla_total_size(4) /* RTA_PREFSRC */
+ nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */

/* space for nested metrics */
payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
Expand Down Expand Up @@ -859,7 +860,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg)

if (type > RTAX_MAX)
goto err_inval;
val = nla_get_u32(nla);
if (type == RTAX_CC_ALGO) {
char tmp[TCP_CA_NAME_MAX];

nla_strlcpy(tmp, nla, sizeof(tmp));
val = tcp_ca_get_key_by_name(tmp);
if (val == TCP_CA_UNSPEC)
goto err_inval;
} else {
val = nla_get_u32(nla);
}
if (type == RTAX_ADVMSS && val > 65535 - 40)
val = 65535 - 40;
if (type == RTAX_MTU && val > 65535 - 15)
Expand Down
121 changes: 95 additions & 26 deletions net/ipv4/tcp_cong.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <linux/types.h>
#include <linux/list.h>
#include <linux/gfp.h>
#include <linux/jhash.h>
#include <net/tcp.h>

static DEFINE_SPINLOCK(tcp_cong_list_lock);
Expand All @@ -31,6 +32,34 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name)
return NULL;
}

/* Find a congestion control algorithm by name, trying to load the
 * "tcp_<name>" module when it is not registered yet.
 *
 * Must be called with the RCU read lock held. With CONFIG_MODULES, the
 * lock is dropped around request_module() and re-acquired before the
 * second lookup, so the caller still holds it on return. The module
 * request is only attempted for callers with CAP_NET_ADMIN.
 *
 * Returns the matching ops, or NULL if none could be found.
 */
static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name)
{
	const struct tcp_congestion_ops *found = tcp_ca_find(name);

#ifdef CONFIG_MODULES
	if (found)
		return found;
	if (!capable(CAP_NET_ADMIN))
		return NULL;

	/* Not registered yet: ask for the module, then look again. */
	rcu_read_unlock();
	request_module("tcp_%s", name);
	rcu_read_lock();
	found = tcp_ca_find(name);
#endif
	return found;
}

/* Walk the list of registered congestion control algorithms and return
 * the first one whose key equals @key, or NULL when there is none.
 * A plain linear scan is sufficient; the list is short.
 */
struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
{
	struct tcp_congestion_ops *pos;
	struct tcp_congestion_ops *match = NULL;

	list_for_each_entry_rcu(pos, &tcp_cong_list, list) {
		if (pos->key != key)
			continue;
		match = pos;
		break;
	}

	return match;
}

/*
* Attach new congestion control algorithm to the list
* of available options.
Expand All @@ -45,9 +74,12 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
return -EINVAL;
}

ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));

spin_lock(&tcp_cong_list_lock);
if (tcp_ca_find(ca->name)) {
pr_notice("%s already registered\n", ca->name);
if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) {
pr_notice("%s already registered or non-unique key\n",
ca->name);
ret = -EEXIST;
} else {
list_add_tail_rcu(&ca->list, &tcp_cong_list);
Expand All @@ -70,9 +102,50 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
spin_lock(&tcp_cong_list_lock);
list_del_rcu(&ca->list);
spin_unlock(&tcp_cong_list_lock);

/* Wait for outstanding readers to complete before the
* module gets removed entirely.
*
* A try_module_get() should fail by now as our module is
* in "going" state since no refs are held anymore and
* module_exit() handler being called.
*/
synchronize_rcu();
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);

/* Resolve a congestion control algorithm name to its key.
 *
 * The lookup goes through __tcp_ca_find_autoload(), which may request a
 * module, hence the might_sleep() annotation.
 *
 * Returns the algorithm's key, or TCP_CA_UNSPEC when @name is unknown.
 */
u32 tcp_ca_get_key_by_name(const char *name)
{
	const struct tcp_congestion_ops *ops;
	u32 result = TCP_CA_UNSPEC;

	might_sleep();

	rcu_read_lock();
	ops = __tcp_ca_find_autoload(name);
	if (ops)
		result = ops->key;
	rcu_read_unlock();

	return result;
}
EXPORT_SYMBOL_GPL(tcp_ca_get_key_by_name);

/* Copy the name of the congestion control algorithm registered under
 * @key into @buffer.
 *
 * @buffer must provide room for at least TCP_CA_NAME_MAX bytes. On
 * success it always holds a NUL-terminated string.
 *
 * Returns @buffer on success, or NULL when no algorithm has that key
 * (in which case @buffer is left untouched).
 */
char *tcp_ca_get_name_by_key(u32 key, char *buffer)
{
	const struct tcp_congestion_ops *ca;
	char *ret = NULL;

	rcu_read_lock();
	ca = tcp_ca_find_key(key);
	if (ca) {
		ret = strncpy(buffer, ca->name, TCP_CA_NAME_MAX);
		/* strncpy() does not terminate the destination when the
		 * source occupies all TCP_CA_NAME_MAX bytes. Callers hand
		 * the result to string helpers such as nla_put_string(),
		 * so force termination here.
		 */
		ret[TCP_CA_NAME_MAX - 1] = '\0';
	}
	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key);

/* Assign choice of congestion control. */
void tcp_assign_congestion_control(struct sock *sk)
{
Expand Down Expand Up @@ -107,6 +180,18 @@ void tcp_init_congestion_control(struct sock *sk)
icsk->icsk_ca_ops->init(sk);
}

static void tcp_reinit_congestion_control(struct sock *sk,
const struct tcp_congestion_ops *ca)
{
struct inet_connection_sock *icsk = inet_csk(sk);

tcp_cleanup_congestion_control(sk);
icsk->icsk_ca_ops = ca;

if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
icsk->icsk_ca_ops->init(sk);
}

/* Manage refcounts on socket close. */
void tcp_cleanup_congestion_control(struct sock *sk)
{
Expand Down Expand Up @@ -241,42 +326,26 @@ int tcp_set_allowed_congestion_control(char *val)
int tcp_set_congestion_control(struct sock *sk, const char *name)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_congestion_ops *ca;
const struct tcp_congestion_ops *ca;
int err = 0;

rcu_read_lock();
ca = tcp_ca_find(name);
if (icsk->icsk_ca_dst_locked)
return -EPERM;

/* no change asking for existing value */
rcu_read_lock();
ca = __tcp_ca_find_autoload(name);
/* No change asking for existing value */
if (ca == icsk->icsk_ca_ops)
goto out;

#ifdef CONFIG_MODULES
/* not found attempt to autoload module */
if (!ca && capable(CAP_NET_ADMIN)) {
rcu_read_unlock();
request_module("tcp_%s", name);
rcu_read_lock();
ca = tcp_ca_find(name);
}
#endif
if (!ca)
err = -ENOENT;

else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))
err = -EPERM;

else if (!try_module_get(ca->owner))
err = -EBUSY;

else {
tcp_cleanup_congestion_control(sk);
icsk->icsk_ca_ops = ca;

if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
icsk->icsk_ca_ops->init(sk);
}
else
tcp_reinit_congestion_control(sk, ca);
out:
rcu_read_unlock();
return err;
Expand Down
2 changes: 2 additions & 0 deletions net/ipv4/tcp_ipv4.c
Original file line number Diff line number Diff line change
Expand Up @@ -1340,6 +1340,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
}
sk_setup_caps(newsk, dst);

tcp_ca_openreq_child(newsk, dst);

tcp_sync_mss(newsk, dst_mtu(dst));
newtp->advmss = dst_metric_advmss(dst);
if (tcp_sk(sk)->rx_opt.user_mss &&
Expand Down
Loading

0 comments on commit a918eb9

Please sign in to comment.