Skip to content

Commit

Permalink
Merge branch 'bpf-more-sock_ops-callbacks'
Browse files Browse the repository at this point in the history
Lawrence Brakmo says:

====================
This patchset adds support for:

- direct R or R/W access to many tcp_sock fields
- passing up to 4 arguments to sock_ops BPF functions
- tcp_sock field bpf_sock_ops_cb_flags for controlling callbacks
- optionally calling sock_ops BPF program when RTO fires
- optionally calling sock_ops BPF program when packet is retransmitted
- optionally calling sock_ops BPF program when TCP state changes
- access to tclass and sk_txhash
- new selftest

v2: Fixed commit message 0/11. The commit is to "bpf-next" but the patch
    below used "bpf" and Patchwork didn't work correctly.
v3: Cleaned RTO callback as per  Yuchung's comment
    Added BPF enum for TCP states as per  Alexei's comment
v4: Fixed compile warnings related to detecting changes between TCP
    internal states and the BPF defined states.
v5: Fixed comment issues in some selftest files
    Fixed accesss issue with u64 fields in bpf_sock_ops struct
v6: Made fixes based on comments form Eric Dumazet:
    The field bpf_sock_ops_cb_flags was addded in a hole on 64bit kernels
    Field bpf_sock_ops_cb_flags is now set through a helper function
    which returns an error when a BPF program tries to set bits for
    callbacks that are not supported in the current kernel.
    Added a comment indicating that when adding fields to bpf_sock_ops_kern
    they should be added before the field named "temp" if they need to be
    cleared before calling the BPF function.
v7: Enfornced fields "op" and "replylong[1] .. replylong[3]" not be writable
    based on comments form Eric Dumazet and Alexei Starovoitov.
    Filled 32 bit hole in bpf_sock_ops struct with sk_txhash based on
    comments from Daniel Borkmann.
    Removed unused functions (tcp_call_bpf_1arg, tcp_call_bpf_4arg) based
    on comments from Daniel Borkmann.
v8: Add commit message 00/12
    Add Acked-by as appropriate
v9: Moved the bug fix to the front of the patchset
    Changed RETRANS_CB so it is always called (before it was only called if
    the retransmit succeeded). It is now called with an extra argument, the
    return value of tcp_transmit_skb (0 => success). Based on comments
    from Yuchung Cheng.
    Added support for reading 2 new fields, sacked_out and lost_out, based on
    comments from Yuchung Cheng.
v10: Moved the callback flags from include/uapi/linux/tcp.h to
     include/uapi/linux/bpf.h
     Cleaned up the test in selftest. Added a timeout so it always completes,
     even if the client is not communicating with the server. Made it faster
     by removing the sleeps. Made sure it works even when called back-to-back
     20 times.

Consists of the following patches:
[PATCH bpf-next v10 01/12] bpf: Only reply field should be writeable
[PATCH bpf-next v10 02/12] bpf: Make SOCK_OPS_GET_TCP size
[PATCH bpf-next v10 03/12] bpf: Make SOCK_OPS_GET_TCP struct
[PATCH bpf-next v10 04/12] bpf: Add write access to tcp_sock and sock
[PATCH bpf-next v10 05/12] bpf: Support passing args to sock_ops bpf
[PATCH bpf-next v10 06/12] bpf: Adds field bpf_sock_ops_cb_flags to
[PATCH bpf-next v10 07/12] bpf: Add sock_ops RTO callback
[PATCH bpf-next v10 08/12] bpf: Add support for reading sk_state and
[PATCH bpf-next v10 09/12] bpf: Add sock_ops R/W access to tclass
[PATCH bpf-next v10 10/12] bpf: Add BPF_SOCK_OPS_RETRANS_CB
[PATCH bpf-next v10 11/12] bpf: Add BPF_SOCK_OPS_STATE_CB
[PATCH bpf-next v10 12/12] bpf: add selftest for tcpbpf
====================

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
  • Loading branch information
Alexei Starovoitov committed Jan 26, 2018
2 parents e9dcd80 + d6d4f60 commit 82f1e0f
Show file tree
Hide file tree
Showing 17 changed files with 925 additions and 39 deletions.
10 changes: 10 additions & 0 deletions include/linux/filter.h
Original file line number Diff line number Diff line change
Expand Up @@ -1003,10 +1003,20 @@ struct bpf_sock_ops_kern {
struct sock *sk;
u32 op;
union {
u32 args[4];
u32 reply;
u32 replylong[4];
};
u32 is_fullsock;
u64 temp; /* temp and everything after is not
* initialized to 0 before calling
* the BPF program. New fields that
* should be initialized to 0 should
* be inserted before temp.
* temp is scratch storage used by
* sock_ops_convert_ctx_access
* as temporary storage of a register.
*/
};

#endif /* __LINUX_FILTER_H__ */
11 changes: 11 additions & 0 deletions include/linux/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,17 @@ struct tcp_sock {

int linger2;


/* Sock_ops bpf program related variables */
#ifdef CONFIG_BPF
u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs
* values defined in uapi/linux/tcp.h
*/
#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG)
#else
#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0
#endif

/* Receiver side RTT estimation */
struct {
u32 rtt_us;
Expand Down
42 changes: 36 additions & 6 deletions include/net/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -2006,19 +2006,21 @@ void tcp_cleanup_ulp(struct sock *sk);
* program loaded).
*/
#ifdef CONFIG_BPF
static inline int tcp_call_bpf(struct sock *sk, int op)
static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
{
struct bpf_sock_ops_kern sock_ops;
int ret;

memset(&sock_ops, 0, sizeof(sock_ops));
memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
if (sk_fullsock(sk)) {
sock_ops.is_fullsock = 1;
sock_owned_by_me(sk);
}

sock_ops.sk = sk;
sock_ops.op = op;
if (nargs > 0)
memcpy(sock_ops.args, args, nargs * sizeof(*args));

ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
if (ret == 0)
Expand All @@ -2027,18 +2029,46 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
ret = -1;
return ret;
}

static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
{
u32 args[2] = {arg1, arg2};

return tcp_call_bpf(sk, op, 2, args);
}

static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
u32 arg3)
{
u32 args[3] = {arg1, arg2, arg3};

return tcp_call_bpf(sk, op, 3, args);
}

#else
static inline int tcp_call_bpf(struct sock *sk, int op)
static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
{
return -EPERM;
}

static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
{
return -EPERM;
}

static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
u32 arg3)
{
return -EPERM;
}

#endif

static inline u32 tcp_timeout_init(struct sock *sk)
{
int timeout;

timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT);
timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL);

if (timeout <= 0)
timeout = TCP_TIMEOUT_INIT;
Expand All @@ -2049,7 +2079,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
{
int rwnd;

rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT);
rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL);

if (rwnd < 0)
rwnd = 0;
Expand All @@ -2058,7 +2088,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk)

static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
{
return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1);
return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1);
}

#if IS_ENABLED(CONFIG_SMC)
Expand Down
84 changes: 81 additions & 3 deletions include/uapi/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -642,6 +642,14 @@ union bpf_attr {
* @optlen: length of optval in bytes
* Return: 0 or negative error
*
* int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags)
* Set callback flags for sock_ops
* @bpf_sock_ops: pointer to bpf_sock_ops_kern struct
* @flags: flags value
* Return: 0 for no error
* -EINVAL if there is no full tcp socket
* bits in flags that are not supported by current kernel
*
* int bpf_skb_adjust_room(skb, len_diff, mode, flags)
* Grow or shrink room in sk_buff.
* @skb: pointer to skb
Expand Down Expand Up @@ -748,7 +756,8 @@ union bpf_attr {
FN(perf_event_read_value), \
FN(perf_prog_read_value), \
FN(getsockopt), \
FN(override_return),
FN(override_return), \
FN(sock_ops_cb_flags_set),

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
Expand Down Expand Up @@ -952,8 +961,9 @@ struct bpf_map_info {
struct bpf_sock_ops {
__u32 op;
union {
__u32 reply;
__u32 replylong[4];
__u32 args[4]; /* Optionally passed to bpf program */
__u32 reply; /* Returned by bpf program */
__u32 replylong[4]; /* Optionally returned by bpf prog */
};
__u32 family;
__u32 remote_ip4; /* Stored in network byte order */
Expand All @@ -968,8 +978,39 @@ struct bpf_sock_ops {
*/
__u32 snd_cwnd;
__u32 srtt_us; /* Averaged RTT << 3 in usecs */
__u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */
__u32 state;
__u32 rtt_min;
__u32 snd_ssthresh;
__u32 rcv_nxt;
__u32 snd_nxt;
__u32 snd_una;
__u32 mss_cache;
__u32 ecn_flags;
__u32 rate_delivered;
__u32 rate_interval_us;
__u32 packets_out;
__u32 retrans_out;
__u32 total_retrans;
__u32 segs_in;
__u32 data_segs_in;
__u32 segs_out;
__u32 data_segs_out;
__u32 lost_out;
__u32 sacked_out;
__u32 sk_txhash;
__u64 bytes_received;
__u64 bytes_acked;
};

/* Definitions for bpf_sock_ops_cb_flags */
#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0)
#define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1)
#define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2)
#define BPF_SOCK_OPS_ALL_CB_FLAGS 0x7 /* Mask of all currently
* supported cb flags
*/

/* List of known BPF sock_ops operators.
* New entries can only be added at the end
*/
Expand Down Expand Up @@ -1003,6 +1044,43 @@ enum {
* a congestion threshold. RTTs above
* this indicate congestion
*/
BPF_SOCK_OPS_RTO_CB, /* Called when an RTO has triggered.
* Arg1: value of icsk_retransmits
* Arg2: value of icsk_rto
* Arg3: whether RTO has expired
*/
BPF_SOCK_OPS_RETRANS_CB, /* Called when skb is retransmitted.
* Arg1: sequence number of 1st byte
* Arg2: # segments
* Arg3: return value of
* tcp_transmit_skb (0 => success)
*/
BPF_SOCK_OPS_STATE_CB, /* Called when TCP changes state.
* Arg1: old_state
* Arg2: new_state
*/
};

/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
* changes between the TCP and BPF versions. Ideally this should never happen.
* If it does, we need to add code to convert them before calling
* the BPF sock_ops function.
*/
enum {
BPF_TCP_ESTABLISHED = 1,
BPF_TCP_SYN_SENT,
BPF_TCP_SYN_RECV,
BPF_TCP_FIN_WAIT1,
BPF_TCP_FIN_WAIT2,
BPF_TCP_TIME_WAIT,
BPF_TCP_CLOSE,
BPF_TCP_CLOSE_WAIT,
BPF_TCP_LAST_ACK,
BPF_TCP_LISTEN,
BPF_TCP_CLOSING, /* Now a valid state */
BPF_TCP_NEW_SYN_RECV,

BPF_TCP_MAX_STATES /* Leave at the end! */
};

#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */
Expand Down
Loading

0 comments on commit 82f1e0f

Please sign in to comment.