Skip to content

Commit

Permalink
bpf: Add bpf_dctcp example
Browse files Browse the repository at this point in the history
This patch adds a bpf_dctcp example.  It currently does not do
no-ECN fallback but the same could be done through the cgrp2-bpf.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200109003517.3856825-1-kafai@fb.com
  • Loading branch information
Martin KaFai Lau authored and Alexei Starovoitov committed Jan 9, 2020
1 parent 590a008 commit 0990386
Show file tree
Hide file tree
Showing 3 changed files with 625 additions and 0 deletions.
228 changes: 228 additions & 0 deletions tools/testing/selftests/bpf/bpf_tcp_helpers.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __BPF_TCP_HELPERS_H
#define __BPF_TCP_HELPERS_H

#include <stdbool.h>
#include <linux/types.h>
#include <bpf_helpers.h>
#include <bpf_core_read.h>
#include "bpf_trace_helpers.h"

/* "struct_ops/" is only a convention. not a requirement. */
#define BPF_TCP_OPS_0(fname, ret_type, ...) BPF_TRACE_x(0, "struct_ops/"#fname, fname, ret_type, __VA_ARGS__)
#define BPF_TCP_OPS_1(fname, ret_type, ...) BPF_TRACE_x(1, "struct_ops/"#fname, fname, ret_type, __VA_ARGS__)
#define BPF_TCP_OPS_2(fname, ret_type, ...) BPF_TRACE_x(2, "struct_ops/"#fname, fname, ret_type, __VA_ARGS__)
#define BPF_TCP_OPS_3(fname, ret_type, ...) BPF_TRACE_x(3, "struct_ops/"#fname, fname, ret_type, __VA_ARGS__)
#define BPF_TCP_OPS_4(fname, ret_type, ...) BPF_TRACE_x(4, "struct_ops/"#fname, fname, ret_type, __VA_ARGS__)
#define BPF_TCP_OPS_5(fname, ret_type, ...) BPF_TRACE_x(5, "struct_ops/"#fname, fname, ret_type, __VA_ARGS__)

struct sock_common {
unsigned char skc_state;
} __attribute__((preserve_access_index));

struct sock {
struct sock_common __sk_common;
} __attribute__((preserve_access_index));

struct inet_sock {
struct sock sk;
} __attribute__((preserve_access_index));

struct inet_connection_sock {
struct inet_sock icsk_inet;
__u8 icsk_ca_state:6,
icsk_ca_setsockopt:1,
icsk_ca_dst_locked:1;
struct {
__u8 pending;
} icsk_ack;
__u64 icsk_ca_priv[104 / sizeof(__u64)];
} __attribute__((preserve_access_index));

struct tcp_sock {
struct inet_connection_sock inet_conn;

__u32 rcv_nxt;
__u32 snd_nxt;
__u32 snd_una;
__u8 ecn_flags;
__u32 delivered;
__u32 delivered_ce;
__u32 snd_cwnd;
__u32 snd_cwnd_cnt;
__u32 snd_cwnd_clamp;
__u32 snd_ssthresh;
__u8 syn_data:1, /* SYN includes data */
syn_fastopen:1, /* SYN includes Fast Open option */
syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
syn_fastopen_ch:1, /* Active TFO re-enabling probe */
syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
save_syn:1, /* Save headers of SYN packet */
is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
syn_smc:1; /* SYN includes SMC */
__u32 max_packets_out;
__u32 lsndtime;
__u32 prior_cwnd;
} __attribute__((preserve_access_index));

static __always_inline struct inet_connection_sock *inet_csk(const struct sock *sk)
{
return (struct inet_connection_sock *)sk;
}

static __always_inline void *inet_csk_ca(const struct sock *sk)
{
return (void *)inet_csk(sk)->icsk_ca_priv;
}

static __always_inline struct tcp_sock *tcp_sk(const struct sock *sk)
{
return (struct tcp_sock *)sk;
}

static __always_inline bool before(__u32 seq1, __u32 seq2)
{
return (__s32)(seq1-seq2) < 0;
}
#define after(seq2, seq1) before(seq1, seq2)

#define TCP_ECN_OK 1
#define TCP_ECN_QUEUE_CWR 2
#define TCP_ECN_DEMAND_CWR 4
#define TCP_ECN_SEEN 8

enum inet_csk_ack_state_t {
ICSK_ACK_SCHED = 1,
ICSK_ACK_TIMER = 2,
ICSK_ACK_PUSHED = 4,
ICSK_ACK_PUSHED2 = 8,
ICSK_ACK_NOW = 16 /* Send the next ACK immediately (once) */
};

enum tcp_ca_event {
CA_EVENT_TX_START = 0,
CA_EVENT_CWND_RESTART = 1,
CA_EVENT_COMPLETE_CWR = 2,
CA_EVENT_LOSS = 3,
CA_EVENT_ECN_NO_CE = 4,
CA_EVENT_ECN_IS_CE = 5,
};

enum tcp_ca_state {
TCP_CA_Open = 0,
TCP_CA_Disorder = 1,
TCP_CA_CWR = 2,
TCP_CA_Recovery = 3,
TCP_CA_Loss = 4
};

struct ack_sample {
__u32 pkts_acked;
__s32 rtt_us;
__u32 in_flight;
} __attribute__((preserve_access_index));

struct rate_sample {
__u64 prior_mstamp; /* starting timestamp for interval */
__u32 prior_delivered; /* tp->delivered at "prior_mstamp" */
__s32 delivered; /* number of packets delivered over interval */
long interval_us; /* time for tp->delivered to incr "delivered" */
__u32 snd_interval_us; /* snd interval for delivered packets */
__u32 rcv_interval_us; /* rcv interval for delivered packets */
long rtt_us; /* RTT of last (S)ACKed packet (or -1) */
int losses; /* number of packets marked lost upon ACK */
__u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */
__u32 prior_in_flight; /* in flight before this ACK */
bool is_app_limited; /* is sample from packet with bubble in pipe? */
bool is_retrans; /* is sample from retransmission? */
bool is_ack_delayed; /* is this (likely) a delayed ACK? */
} __attribute__((preserve_access_index));

#define TCP_CA_NAME_MAX 16
#define TCP_CONG_NEEDS_ECN 0x2

struct tcp_congestion_ops {
char name[TCP_CA_NAME_MAX];
__u32 flags;

/* initialize private data (optional) */
void (*init)(struct sock *sk);
/* cleanup private data (optional) */
void (*release)(struct sock *sk);

/* return slow start threshold (required) */
__u32 (*ssthresh)(struct sock *sk);
/* do new cwnd calculation (required) */
void (*cong_avoid)(struct sock *sk, __u32 ack, __u32 acked);
/* call before changing ca_state (optional) */
void (*set_state)(struct sock *sk, __u8 new_state);
/* call when cwnd event occurs (optional) */
void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
/* call when ack arrives (optional) */
void (*in_ack_event)(struct sock *sk, __u32 flags);
/* new value of cwnd after loss (required) */
__u32 (*undo_cwnd)(struct sock *sk);
/* hook for packet ack accounting (optional) */
void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
/* override sysctl_tcp_min_tso_segs */
__u32 (*min_tso_segs)(struct sock *sk);
/* returns the multiplier used in tcp_sndbuf_expand (optional) */
__u32 (*sndbuf_expand)(struct sock *sk);
/* call when packets are delivered to update cwnd and pacing rate,
* after all the ca_state processing. (optional)
*/
void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
};

#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))
#define min_not_zero(x, y) ({ \
typeof(x) __x = (x); \
typeof(y) __y = (y); \
__x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })

static __always_inline __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked)
{
__u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh);

acked -= cwnd - tp->snd_cwnd;
tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);

return acked;
}

static __always_inline bool tcp_in_slow_start(const struct tcp_sock *tp)
{
return tp->snd_cwnd < tp->snd_ssthresh;
}

static __always_inline bool tcp_is_cwnd_limited(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);

/* If in slow start, ensure cwnd grows to twice what was ACKed. */
if (tcp_in_slow_start(tp))
return tp->snd_cwnd < 2 * tp->max_packets_out;

return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited);
}

static __always_inline void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked)
{
/* If credits accumulated at a higher w, apply them gently now. */
if (tp->snd_cwnd_cnt >= w) {
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd++;
}

tp->snd_cwnd_cnt += acked;
if (tp->snd_cwnd_cnt >= w) {
__u32 delta = tp->snd_cwnd_cnt / w;

tp->snd_cwnd_cnt -= delta * w;
tp->snd_cwnd += delta;
}
tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp);
}

#endif
Loading

0 comments on commit 0990386

Please sign in to comment.