Skip to content

Commit

Permalink
Merge branch 'tcp-add-tos-reflection-feature'
Browse files Browse the repository at this point in the history
Wei Wang says:

====================
tcp: add tos reflection feature

This patch series adds a new TCP feature that reflects the TOS value
received in a SYN: the value is sent back in the SYN-ACK and is then set
as the TOS value of the established socket. This provides a way to give
all traffic in a connection the same traffic class/QoS level as the
incoming SYN. It can be useful for datacenters that want to provide QoS
equivalent to that of the incoming request.
This feature is guarded by /proc/sys/net/ipv4/tcp_reflect_tos and is
turned off by default.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Sep 10, 2020
2 parents 3a8c4ad + ac8f171 commit d095c46
Show file tree
Hide file tree
Showing 10 changed files with 42 additions and 10 deletions.
1 change: 1 addition & 0 deletions include/linux/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ struct tcp_request_sock {
* FastOpen it's the seq#
* after data-in-SYN.
*/
u8 syn_tos;
};

static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
Expand Down
2 changes: 1 addition & 1 deletion include/net/ip.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ int igmp_mc_init(void);

int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
__be32 saddr, __be32 daddr,
struct ip_options_rcu *opt);
struct ip_options_rcu *opt, u8 tos);
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
struct net_device *orig_dev);
void ip_list_rcv(struct list_head *head, struct packet_type *pt,
Expand Down
1 change: 1 addition & 0 deletions include/net/netns/ipv4.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ struct netns_ipv4 {
unsigned int sysctl_tcp_fastopen_blackhole_timeout;
atomic_t tfo_active_disable_times;
unsigned long tfo_active_disable_stamp;
int sysctl_tcp_reflect_tos;

int sysctl_udp_wmem_min;
int sysctl_udp_rmem_min;
Expand Down
6 changes: 4 additions & 2 deletions net/dccp/ipv4.c
Original file line number Diff line number Diff line change
Expand Up @@ -495,7 +495,8 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req
rcu_read_lock();
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
ireq->ir_rmt_addr,
rcu_dereference(ireq->ireq_opt));
rcu_dereference(ireq->ireq_opt),
inet_sk(sk)->tos);
rcu_read_unlock();
err = net_xmit_eval(err);
}
Expand Down Expand Up @@ -537,7 +538,8 @@ static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
local_bh_disable();
bh_lock_sock(ctl_sk);
err = ip_build_and_send_pkt(skb, ctl_sk,
rxiph->daddr, rxiph->saddr, NULL);
rxiph->daddr, rxiph->saddr, NULL,
inet_sk(ctl_sk)->tos);
bh_unlock_sock(ctl_sk);

if (net_xmit_eval(err) == 0) {
Expand Down
5 changes: 3 additions & 2 deletions net/ipv4/ip_output.c
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,8 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
*
*/
int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
__be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
__be32 saddr, __be32 daddr, struct ip_options_rcu *opt,
u8 tos)
{
struct inet_sock *inet = inet_sk(sk);
struct rtable *rt = skb_rtable(skb);
Expand All @@ -155,7 +156,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
iph = ip_hdr(skb);
iph->version = 4;
iph->ihl = 5;
iph->tos = inet->tos;
iph->tos = tos;
iph->ttl = ip_select_ttl(inet, &rt->dst);
iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
iph->saddr = saddr;
Expand Down
6 changes: 3 additions & 3 deletions net/ipv4/syncookies.c
Original file line number Diff line number Diff line change
Expand Up @@ -286,11 +286,10 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
struct sock *sk,
struct sk_buff *skb)
{
struct tcp_request_sock *treq;
struct request_sock *req;

#ifdef CONFIG_MPTCP
struct tcp_request_sock *treq;

if (sk_is_mptcp(sk))
ops = &mptcp_subflow_request_sock_ops;
#endif
Expand All @@ -299,8 +298,9 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
if (!req)
return NULL;

#if IS_ENABLED(CONFIG_MPTCP)
treq = tcp_rsk(req);
treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
#if IS_ENABLED(CONFIG_MPTCP)
treq->is_mptcp = sk_is_mptcp(sk);
if (treq->is_mptcp) {
int err = mptcp_subflow_init_cookie_req(req, sk, skb);
Expand Down
9 changes: 9 additions & 0 deletions net/ipv4/sysctl_net_ipv4.c
Original file line number Diff line number Diff line change
Expand Up @@ -1329,6 +1329,15 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = &comp_sack_nr_max,
},
{
.procname = "tcp_reflect_tos",
.data = &init_net.ipv4.sysctl_tcp_reflect_tos,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
{
.procname = "udp_rmem_min",
.data = &init_net.ipv4.sysctl_udp_rmem_min,
Expand Down
1 change: 1 addition & 0 deletions net/ipv4/tcp_input.c
Original file line number Diff line number Diff line change
Expand Up @@ -6834,6 +6834,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,

tcp_rsk(req)->snt_isn = isn;
tcp_rsk(req)->txhash = net_tx_rndhash();
tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
tcp_openreq_init_rwin(req, sk, dst);
sk_rx_queue_set(req_to_sk(req), skb);
if (!want_cookie) {
Expand Down
11 changes: 10 additions & 1 deletion net/ipv4/tcp_ipv4.c
Original file line number Diff line number Diff line change
Expand Up @@ -972,20 +972,25 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi4 fl4;
int err = -1;
struct sk_buff *skb;
u8 tos;

/* First, grab a route. */
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
return -1;

skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
tcp_rsk(req)->syn_tos : inet_sk(sk)->tos;

if (skb) {
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

rcu_read_lock();
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
ireq->ir_rmt_addr,
rcu_dereference(ireq->ireq_opt));
rcu_dereference(ireq->ireq_opt),
tos & ~INET_ECN_MASK);
rcu_read_unlock();
err = net_xmit_eval(err);
}
Expand Down Expand Up @@ -1530,6 +1535,10 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
newinet->inet_id = prandom_u32();

/* Set ToS of the new socket based upon the value of incoming SYN. */
if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

if (!dst) {
dst = inet_csk_route_child_sock(sk, newsk, req);
if (!dst)
Expand Down
10 changes: 9 additions & 1 deletion net/ipv6/tcp_ipv6.c
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi6 *fl6 = &fl->u.ip6;
struct sk_buff *skb;
int err = -ENOMEM;
u8 tclass;

/* First, grab a route. */
if (!dst && (dst = inet6_csk_route_req(sk, fl6, req,
Expand All @@ -528,9 +529,12 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,

rcu_read_lock();
opt = ireq->ipv6_opt;
tclass = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
tcp_rsk(req)->syn_tos : np->tclass;
if (!opt)
opt = rcu_dereference(np->opt);
err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass,
err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt,
tclass & ~INET_ECN_MASK,
sk->sk_priority);
rcu_read_unlock();
err = net_xmit_eval(err);
Expand Down Expand Up @@ -1310,6 +1314,10 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
if (np->repflow)
newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb));

/* Set ToS of the new socket based upon the value of incoming SYN. */
if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

/* Clone native IPv6 options from listening socket (if any)
Yes, keeping reference count would be much more clever,
Expand Down

0 comments on commit d095c46

Please sign in to comment.