Merge branch 'net-timestamp-bpf-extension-to-equip-applications-transparently'

Jason Xing says:

====================
net-timestamp: bpf extension to equip applications transparently

"Timestamping is key to debugging network stack latency. With
SO_TIMESTAMPING, bugs that are otherwise incorrectly assumed to be
network issues can be attributed to the kernel." This is extracted
from the talk "SO_TIMESTAMPING: Powering Fleetwide RPC Monitoring"
addressed by Willem de Bruijn at netdevconf 0x17).

There are a few areas that need optimization, with ease of use and
minimal performance impact in mind, which I highlighted and mainly
discussed at netconf 2024 with Willem de Bruijn and John Fastabend:
uAPI compatibility, extra system call overhead, and the need for
application modification. I initially solved these issues by writing
a kernel module that hooks various key functions, but that approach
is not suitable for the next kernel release. Therefore, a BPF
extension is proposed instead. Throughout, Martin KaFai Lau has
provided invaluable suggestions about BPF. Many thanks!

This series adds the BPF networking timestamping infrastructure by
reusing most of the TX timestamping callbacks currently enabled by
SO_TIMESTAMPING. It also adds TX timestamping support for TCP. RX
timestamping and UDP support will be added in the future.
====================

Link: https://patch.msgid.link/20250220072940.99994-1-kerneljasonxing@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Martin KaFai Lau committed Feb 20, 2025
2 parents 09bc97b + f4924ae commit 68b92ac
Showing 18 changed files with 727 additions and 14 deletions.
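
To see how the pieces in this diff fit together: a sockops program first opts a socket in via bpf_setsockopt(SK_BPF_CB_FLAGS), then receives the new BPF_SOCK_OPS_TSTAMP_* callbacks. A minimal, untested sketch (standard libbpf boilerplate assumed; SK_BPF_CB_FLAGS and SK_BPF_CB_TX_TIMESTAMPING come from the UAPI hunks below):

/* Enable BPF TX timestamping for each connection this cgroup makes.
 * SK_BPF_CB_FLAGS is served at the SOL_SOCKET level, see
 * sol_socket_sockopt() in net/core/filter.c below.
 */
SEC("sockops")
int enable_bpf_tx_tstamp(struct bpf_sock_ops *skops)
{
	int flags = SK_BPF_CB_TX_TIMESTAMPING;

	if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB)
		bpf_setsockopt(skops, SOL_SOCKET, SK_BPF_CB_FLAGS,
			       &flags, sizeof(flags));
	return 1;
}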
1 change: 1 addition & 0 deletions include/linux/filter.h
@@ -1508,6 +1508,7 @@ struct bpf_sock_ops_kern {
void *skb_data_end;
u8 op;
u8 is_fullsock;
u8 is_locked_tcp_sock;
u8 remaining_opt_len;
u64 temp; /* temp and everything after is not
* initialized to 0 before calling
12 changes: 9 additions & 3 deletions include/linux/skbuff.h
@@ -470,7 +470,7 @@ struct skb_shared_hwtstamps {
/* Definitions for tx_flags in struct skb_shared_info */
enum {
/* generate hardware time stamp */
SKBTX_HW_TSTAMP = 1 << 0,
SKBTX_HW_TSTAMP_NOBPF = 1 << 0,

/* generate software time stamp when queueing packet to NIC */
SKBTX_SW_TSTAMP = 1 << 1,
@@ -489,10 +489,16 @@ enum {

/* generate software time stamp when entering packet scheduling */
SKBTX_SCHED_TSTAMP = 1 << 6,

/* used for bpf extension when a bpf program is loaded */
SKBTX_BPF = 1 << 7,
};

#define SKBTX_HW_TSTAMP (SKBTX_HW_TSTAMP_NOBPF | SKBTX_BPF)

#define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \
SKBTX_SCHED_TSTAMP)
SKBTX_SCHED_TSTAMP | \
SKBTX_BPF)
#define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | \
SKBTX_HW_TSTAMP_USE_CYCLES | \
SKBTX_ANY_SW_TSTAMP)
@@ -4564,7 +4570,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
static inline void skb_tx_timestamp(struct sk_buff *skb)
{
skb_clone_tx_timestamp(skb);
if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP)
if (skb_shinfo(skb)->tx_flags & (SKBTX_SW_TSTAMP | SKBTX_BPF))
skb_tstamp_tx(skb, NULL);
}

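
skb_tx_timestamp() is the hook drivers already call on their transmit path, so widening its flag test reaches every existing driver without driver changes. Roughly (illustration only; foo_start_xmit is a made-up driver):

/* A typical ndo_start_xmit. With this change, the same call also
 * triggers a software TX timestamp when a BPF program set SKBTX_BPF.
 */
static netdev_tx_t foo_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	/* ... set up DMA descriptors ... */
	skb_tx_timestamp(skb);
	/* ... tell the hardware to send ... */
	return NETDEV_TX_OK;
}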
10 changes: 10 additions & 0 deletions include/net/sock.h
@@ -303,6 +303,7 @@ struct sk_filter;
* @sk_stamp: time stamp of last packet received
* @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
* @sk_tsflags: SO_TIMESTAMPING flags
* @sk_bpf_cb_flags: used in bpf_setsockopt()
* @sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
* Sockets that can be used under memory reclaim should
* set this to false.
@@ -525,6 +526,8 @@ struct sock {
u8 sk_txtime_deadline_mode : 1,
sk_txtime_report_errors : 1,
sk_txtime_unused : 6;
#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
u8 sk_bpf_cb_flags;

void *sk_user_data;
#ifdef CONFIG_SECURITY
@@ -2921,6 +2924,13 @@ int sock_set_timestamping(struct sock *sk, int optname,
struct so_timestamping timestamping);

void sock_enable_timestamps(struct sock *sk);
#if defined(CONFIG_CGROUP_BPF)
void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op);
#else
static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
{
}
#endif
void sock_no_linger(struct sock *sk);
void sock_set_keepalive(struct sock *sk);
void sock_set_priority(struct sock *sk, u32 priority);
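
bpf_skops_tx_timestamping(), declared above, is the single entry point through which the timestamp reporting path calls into the sockops program; the empty stub keeps !CONFIG_CGROUP_BPF builds compiling, since sockops programs attach via cgroups.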
7 changes: 5 additions & 2 deletions include/net/tcp.h
@@ -964,10 +964,12 @@ struct tcp_skb_cb {

__u8 sacked; /* State flags for SACK. */
__u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */
__u8 txstamp_ack:1, /* Record TX timestamp for ack? */
#define TSTAMP_ACK_SK 0x1
#define TSTAMP_ACK_BPF 0x2
__u8 txstamp_ack:2, /* Record TX timestamp for ack? */
eor:1, /* Is skb MSG_EOR marked? */
has_rxtstamp:1, /* SKB has a RX timestamp */
unused:5;
unused:4;
__u32 ack_seq; /* Sequence number ACK'd */
union {
struct {
@@ -2657,6 +2659,7 @@ static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
if (sk_fullsock(sk)) {
sock_ops.is_fullsock = 1;
sock_ops.is_locked_tcp_sock = 1;
sock_owned_by_me(sk);
}

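
Widening txstamp_ack to two bits lets the ACK reporting path tell apart who asked for the timestamp. Conceptually (illustration, not code from this series):

	if (TCP_SKB_CB(skb)->txstamp_ack & TSTAMP_ACK_BPF)
		/* report the ACK timestamp to the BPF program */;
	if (TCP_SKB_CB(skb)->txstamp_ack & TSTAMP_ACK_SK)
		/* queue the cmsg for SO_TIMESTAMPING as before */;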
30 changes: 30 additions & 0 deletions include/uapi/linux/bpf.h
@@ -6913,6 +6913,12 @@ enum {
BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F,
};

enum {
SK_BPF_CB_TX_TIMESTAMPING = 1<<0,
SK_BPF_CB_MASK = (SK_BPF_CB_TX_TIMESTAMPING - 1) |
SK_BPF_CB_TX_TIMESTAMPING
};

/* List of known BPF sock_ops operators.
* New entries can only be added at the end
*/
@@ -7025,6 +7031,29 @@ enum {
* by the kernel or the
* earlier bpf-progs.
*/
BPF_SOCK_OPS_TSTAMP_SCHED_CB, /* Called when skb is passing
* through dev layer when
* SK_BPF_CB_TX_TIMESTAMPING
* feature is on.
*/
BPF_SOCK_OPS_TSTAMP_SND_SW_CB, /* Called when skb is about to send
* to the nic when SK_BPF_CB_TX_TIMESTAMPING
* feature is on.
*/
BPF_SOCK_OPS_TSTAMP_SND_HW_CB, /* Called in hardware phase when
* SK_BPF_CB_TX_TIMESTAMPING feature
* is on.
*/
BPF_SOCK_OPS_TSTAMP_ACK_CB, /* Called when all the skbs in the
* same sendmsg call are acked
* when SK_BPF_CB_TX_TIMESTAMPING
* feature is on.
*/
BPF_SOCK_OPS_TSTAMP_SENDMSG_CB, /* Called when every sendmsg syscall
* is triggered. It's used to correlate
* sendmsg timestamp with corresponding
* tskey.
*/
};
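
A typical consumer records a timestamp at each stage and correlates them through the tskey reported alongside. A minimal, untested dispatcher sketch:

SEC("sockops")
int tx_tstamp_cb(struct bpf_sock_ops *skops)
{
	__u64 now = bpf_ktime_get_ns();

	switch (skops->op) {
	case BPF_SOCK_OPS_TSTAMP_SCHED_CB:	/* entered packet scheduling */
	case BPF_SOCK_OPS_TSTAMP_SND_SW_CB:	/* handed to the NIC */
	case BPF_SOCK_OPS_TSTAMP_SND_HW_CB:	/* stamped by hardware */
	case BPF_SOCK_OPS_TSTAMP_ACK_CB:	/* fully acknowledged */
		/* e.g. store 'now' in a map keyed by socket and tskey */
		break;
	}
	return 1;
}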

/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
@@ -7091,6 +7120,7 @@ enum {
TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */
TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */
TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */
SK_BPF_CB_FLAGS = 1009, /* Get or set sock ops flags in socket */
};

enum {
1 change: 1 addition & 0 deletions kernel/bpf/btf.c
@@ -8524,6 +8524,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_SOCK_OPS:
return BTF_KFUNC_HOOK_CGROUP;
case BPF_PROG_TYPE_SCHED_ACT:
return BTF_KFUNC_HOOK_SCHED_ACT;
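
Mapping BPF_PROG_TYPE_SOCK_OPS to BTF_KFUNC_HOOK_CGROUP is what lets the kfunc set registered for sockops programs (bpf_sock_ops_enable_tx_tstamp(), added in net/core/filter.c below) be resolved for this program type.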
3 changes: 2 additions & 1 deletion net/core/dev.c
@@ -4501,7 +4501,8 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
skb_reset_mac_header(skb);
skb_assert_len(skb);

if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
if (unlikely(skb_shinfo(skb)->tx_flags &
(SKBTX_SCHED_TSTAMP | SKBTX_BPF)))
__skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);

/* Disable soft irqs for various locks below. Also
79 changes: 74 additions & 5 deletions net/core/filter.c
@@ -5222,6 +5222,25 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};

static int sk_bpf_set_get_cb_flags(struct sock *sk, char *optval, bool getopt)
{
u32 sk_bpf_cb_flags;

if (getopt) {
*(u32 *)optval = sk->sk_bpf_cb_flags;
return 0;
}

sk_bpf_cb_flags = *(u32 *)optval;

if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK)
return -EINVAL;

sk->sk_bpf_cb_flags = sk_bpf_cb_flags;

return 0;
}

static int sol_socket_sockopt(struct sock *sk, int optname,
char *optval, int *optlen,
bool getopt)
@@ -5238,6 +5257,7 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
case SO_MAX_PACING_RATE:
case SO_BINDTOIFINDEX:
case SO_TXREHASH:
case SK_BPF_CB_FLAGS:
if (*optlen != sizeof(int))
return -EINVAL;
break;
@@ -5247,6 +5267,9 @@
return -EINVAL;
}

if (optname == SK_BPF_CB_FLAGS)
return sk_bpf_set_get_cb_flags(sk, optval, getopt);

if (getopt) {
if (optname == SO_BINDTODEVICE)
return -EINVAL;
@@ -5501,6 +5524,11 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname,
return -EINVAL;
}

static bool is_locked_tcp_sock_ops(struct bpf_sock_ops_kern *bpf_sock)
{
return bpf_sock->op <= BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
}
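
/* Ops up to BPF_SOCK_OPS_WRITE_HDR_OPT_CB run with the TCP socket lock
 * held, while the new TSTAMP_* callbacks do not; the helper above is how
 * the lock-dependent setsockopt/getsockopt/cb_flags_set/load_hdr_opt
 * helpers below detect that and return -EOPNOTSUPP.
 */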

static int _bpf_setsockopt(struct sock *sk, int level, int optname,
char *optval, int optlen)
{
@@ -5651,6 +5679,9 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = {
BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
int, level, int, optname, char *, optval, int, optlen)
{
if (!is_locked_tcp_sock_ops(bpf_sock))
return -EOPNOTSUPP;

return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen);
}

@@ -5736,6 +5767,9 @@ static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock,
BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
int, level, int, optname, char *, optval, int, optlen)
{
if (!is_locked_tcp_sock_ops(bpf_sock))
return -EOPNOTSUPP;

if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP &&
optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) {
int ret, copy_len = 0;
@@ -5778,6 +5812,9 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
struct sock *sk = bpf_sock->sk;
int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;

if (!is_locked_tcp_sock_ops(bpf_sock))
return -EOPNOTSUPP;

if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk))
return -EINVAL;

@@ -7587,6 +7624,9 @@ BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
u8 search_kind, search_len, copy_len, magic_len;
int ret;

if (!is_locked_tcp_sock_ops(bpf_sock))
return -EOPNOTSUPP;

/* 2 byte is the minimal option len except TCPOPT_NOP and
* TCPOPT_EOL which are useless for the bpf prog to learn
* and this helper disallow loading them also.
@@ -10359,10 +10399,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
} \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, \
is_fullsock), \
is_locked_tcp_sock), \
fullsock_reg, si->src_reg, \
offsetof(struct bpf_sock_ops_kern, \
is_fullsock)); \
is_locked_tcp_sock)); \
*insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \
if (si->dst_reg == si->src_reg) \
*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
@@ -10447,10 +10487,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
temp)); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, \
is_fullsock), \
is_locked_tcp_sock), \
reg, si->dst_reg, \
offsetof(struct bpf_sock_ops_kern, \
is_fullsock)); \
is_locked_tcp_sock)); \
*insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, sk),\
@@ -12063,6 +12103,25 @@ __bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk,
#endif
}

__bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops,
u64 flags)
{
struct sk_buff *skb;

if (skops->op != BPF_SOCK_OPS_TSTAMP_SENDMSG_CB)
return -EOPNOTSUPP;

if (flags)
return -EINVAL;

skb = skops->skb;
skb_shinfo(skb)->tx_flags |= SKBTX_BPF;
TCP_SKB_CB(skb)->txstamp_ack |= TSTAMP_ACK_BPF;
skb_shinfo(skb)->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;

return 0;
}

__bpf_kfunc_end_defs();
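
From the BPF side, this kfunc is meant to be called from the BPF_SOCK_OPS_TSTAMP_SENDMSG_CB hook to opt an individual skb into tracking. A sketch of the calling convention (usual __ksym declarations; untested):

struct bpf_sock_ops_kern;
extern void *bpf_cast_to_kern_ctx(void *obj) __ksym;
extern int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops,
					 __u64 flags) __ksym;

SEC("sockops")
int tag_sendmsg(struct bpf_sock_ops *skops)
{
	if (skops->op == BPF_SOCK_OPS_TSTAMP_SENDMSG_CB)
		/* flags must be 0 for now, per the -EINVAL check above */
		bpf_sock_ops_enable_tx_tstamp(bpf_cast_to_kern_ctx(skops), 0);
	return 1;
}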

int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags,
Expand Down Expand Up @@ -12096,6 +12155,10 @@ BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk)
BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk)

BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops)
BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops)

static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
.owner = THIS_MODULE,
.set = &bpf_kfunc_check_set_skb,
@@ -12116,6 +12179,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = {
.set = &bpf_kfunc_check_set_tcp_reqsk,
};

static const struct btf_kfunc_id_set bpf_kfunc_set_sock_ops = {
.owner = THIS_MODULE,
.set = &bpf_kfunc_check_set_sock_ops,
};

static int __init bpf_kfunc_init(void)
{
int ret;
@@ -12134,7 +12202,8 @@ static int __init bpf_kfunc_init(void)
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
&bpf_kfunc_set_sock_addr);
return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk);
return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCK_OPS, &bpf_kfunc_set_sock_ops);
}
late_initcall(bpf_kfunc_init);
