diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index bcd31f458d1ac..5895905b6aa14 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -579,6 +579,7 @@ config NETDEVSIM depends on DEBUG_FS depends on INET depends on IPV6 || IPV6=n + depends on PSAMPLE || PSAMPLE=n select NET_DEVLINK help This driver is a developer testing tool and software model that can diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h index 8af7d9d03475e..80712dc803d0f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.h +++ b/drivers/net/ethernet/mellanox/mlxsw/core.h @@ -58,6 +58,25 @@ struct mlxsw_tx_info { bool is_emad; }; +struct mlxsw_rx_md_info { + u32 cookie_index; + u32 latency; + u32 tx_congestion; + union { + /* Valid when 'tx_port_valid' is set. */ + u16 tx_sys_port; + u16 tx_lag_id; + }; + u8 tx_lag_port_index; /* Valid when 'tx_port_is_lag' is set. */ + u8 tx_tc; + u8 latency_valid:1, + tx_congestion_valid:1, + tx_tc_valid:1, + tx_port_valid:1, + tx_port_is_lag:1, + unused:3; +}; + bool mlxsw_core_skb_transmit_busy(struct mlxsw_core *mlxsw_core, const struct mlxsw_tx_info *tx_info); int mlxsw_core_skb_transmit(struct mlxsw_core *mlxsw_core, struct sk_buff *skb, @@ -515,7 +534,7 @@ enum mlxsw_devlink_param_id { struct mlxsw_skb_cb { union { struct mlxsw_tx_info tx_info; - u32 cookie_index; /* Only used during receive */ + struct mlxsw_rx_md_info rx_md_info; }; }; diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c index d0052537e627e..8e84568113843 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci.c +++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c @@ -540,6 +540,55 @@ static void mlxsw_pci_cqe_sdq_handle(struct mlxsw_pci *mlxsw_pci, spin_unlock(&q->lock); } +static void mlxsw_pci_cqe_rdq_md_tx_port_init(struct sk_buff *skb, + const char *cqe) +{ + struct mlxsw_skb_cb *cb = mlxsw_skb_cb(skb); + + if (mlxsw_pci_cqe2_tx_lag_get(cqe)) { + cb->rx_md_info.tx_port_is_lag = true; + cb->rx_md_info.tx_lag_id = mlxsw_pci_cqe2_tx_lag_id_get(cqe); + cb->rx_md_info.tx_lag_port_index = + mlxsw_pci_cqe2_tx_lag_subport_get(cqe); + } else { + cb->rx_md_info.tx_port_is_lag = false; + cb->rx_md_info.tx_sys_port = + mlxsw_pci_cqe2_tx_system_port_get(cqe); + } + + if (cb->rx_md_info.tx_sys_port != MLXSW_PCI_CQE2_TX_PORT_MULTI_PORT && + cb->rx_md_info.tx_sys_port != MLXSW_PCI_CQE2_TX_PORT_INVALID) + cb->rx_md_info.tx_port_valid = 1; + else + cb->rx_md_info.tx_port_valid = 0; +} + +static void mlxsw_pci_cqe_rdq_md_init(struct sk_buff *skb, const char *cqe) +{ + struct mlxsw_skb_cb *cb = mlxsw_skb_cb(skb); + + cb->rx_md_info.tx_congestion = mlxsw_pci_cqe2_mirror_cong_get(cqe); + if (cb->rx_md_info.tx_congestion != MLXSW_PCI_CQE2_MIRROR_CONG_INVALID) + cb->rx_md_info.tx_congestion_valid = 1; + else + cb->rx_md_info.tx_congestion_valid = 0; + cb->rx_md_info.tx_congestion <<= MLXSW_PCI_CQE2_MIRROR_CONG_SHIFT; + + cb->rx_md_info.latency = mlxsw_pci_cqe2_mirror_latency_get(cqe); + if (cb->rx_md_info.latency != MLXSW_PCI_CQE2_MIRROR_LATENCY_INVALID) + cb->rx_md_info.latency_valid = 1; + else + cb->rx_md_info.latency_valid = 0; + + cb->rx_md_info.tx_tc = mlxsw_pci_cqe2_mirror_tclass_get(cqe); + if (cb->rx_md_info.tx_tc != MLXSW_PCI_CQE2_MIRROR_TCLASS_INVALID) + cb->rx_md_info.tx_tc_valid = 1; + else + cb->rx_md_info.tx_tc_valid = 0; + + mlxsw_pci_cqe_rdq_md_tx_port_init(skb, cqe); +} + static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci, struct mlxsw_pci_queue *q, u16 consumer_counter_limit, @@ -581,11 +630,15 @@ static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci, if (mlxsw_pci->max_cqe_ver >= MLXSW_PCI_CQE_V2) cookie_index = mlxsw_pci_cqe2_user_def_val_orig_pkt_len_get(cqe); - mlxsw_skb_cb(skb)->cookie_index = cookie_index; + mlxsw_skb_cb(skb)->rx_md_info.cookie_index = cookie_index; } else if (rx_info.trap_id >= MLXSW_TRAP_ID_MIRROR_SESSION0 && rx_info.trap_id <= MLXSW_TRAP_ID_MIRROR_SESSION7 && mlxsw_pci->max_cqe_ver >= MLXSW_PCI_CQE_V2) { rx_info.mirror_reason = mlxsw_pci_cqe2_mirror_reason_get(cqe); + mlxsw_pci_cqe_rdq_md_init(skb, cqe); + } else if (rx_info.trap_id == MLXSW_TRAP_ID_PKT_SAMPLE && + mlxsw_pci->max_cqe_ver >= MLXSW_PCI_CQE_V2) { + mlxsw_pci_cqe_rdq_md_tx_port_init(skb, cqe); } byte_count = mlxsw_pci_cqe_byte_count_get(cqe); diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h index a2c1fbd3e0d13..7b531228d6c0f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h +++ b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h @@ -173,6 +173,15 @@ MLXSW_ITEM32(pci, cqe, wqe_counter, 0x04, 16, 16); */ MLXSW_ITEM32(pci, cqe, byte_count, 0x04, 0, 14); +#define MLXSW_PCI_CQE2_MIRROR_CONG_INVALID 0xFFFF + +/* pci_cqe_mirror_cong_high + * Congestion level in units of 8KB of the egress traffic class of the original + * packet that does mirroring to the CPU. Value of 0xFFFF means that the + * congestion level is invalid. + */ +MLXSW_ITEM32(pci, cqe2, mirror_cong_high, 0x08, 16, 4); + /* pci_cqe_trap_id * Trap ID that captured the packet. */ @@ -208,6 +217,59 @@ MLXSW_ITEM32(pci, cqe0, dqn, 0x0C, 1, 5); MLXSW_ITEM32(pci, cqe12, dqn, 0x0C, 1, 6); mlxsw_pci_cqe_item_helpers(dqn, 0, 12, 12); +#define MLXSW_PCI_CQE2_MIRROR_TCLASS_INVALID 0x1F + +/* pci_cqe_mirror_tclass + * The egress traffic class of the original packet that does mirroring to the + * CPU. Value of 0x1F means that the traffic class is invalid. + */ +MLXSW_ITEM32(pci, cqe2, mirror_tclass, 0x10, 27, 5); + +/* pci_cqe_tx_lag + * The Tx port of a packet that is mirrored / sampled to the CPU is a LAG. + */ +MLXSW_ITEM32(pci, cqe2, tx_lag, 0x10, 24, 1); + +/* pci_cqe_tx_lag_subport + * The port index within the LAG of a packet that is mirrored / sampled to the + * CPU. Reserved when tx_lag is 0. + */ +MLXSW_ITEM32(pci, cqe2, tx_lag_subport, 0x10, 16, 8); + +#define MLXSW_PCI_CQE2_TX_PORT_MULTI_PORT 0xFFFE +#define MLXSW_PCI_CQE2_TX_PORT_INVALID 0xFFFF + +/* pci_cqe_tx_lag_id + * The Tx LAG ID of the original packet that is mirrored / sampled to the CPU. + * Value of 0xFFFE means multi-port. Value fo 0xFFFF means that the Tx LAG ID + * is invalid. Reserved when tx_lag is 0. + */ +MLXSW_ITEM32(pci, cqe2, tx_lag_id, 0x10, 0, 16); + +/* pci_cqe_tx_system_port + * The Tx port of the original packet that is mirrored / sampled to the CPU. + * Value of 0xFFFE means multi-port. Value fo 0xFFFF means that the Tx port is + * invalid. Reserved when tx_lag is 1. + */ +MLXSW_ITEM32(pci, cqe2, tx_system_port, 0x10, 0, 16); + +/* pci_cqe_mirror_cong_low + * Congestion level in units of 8KB of the egress traffic class of the original + * packet that does mirroring to the CPU. Value of 0xFFFF means that the + * congestion level is invalid. + */ +MLXSW_ITEM32(pci, cqe2, mirror_cong_low, 0x14, 20, 12); + +#define MLXSW_PCI_CQE2_MIRROR_CONG_SHIFT 13 /* Units of 8KB. */ + +static inline u16 mlxsw_pci_cqe2_mirror_cong_get(const char *cqe) +{ + u16 cong_high = mlxsw_pci_cqe2_mirror_cong_high_get(cqe); + u16 cong_low = mlxsw_pci_cqe2_mirror_cong_low_get(cqe); + + return cong_high << 12 | cong_low; +} + /* pci_cqe_user_def_val_orig_pkt_len * When trap_id is an ACL: User defined value from policy engine action. */ @@ -218,6 +280,15 @@ MLXSW_ITEM32(pci, cqe2, user_def_val_orig_pkt_len, 0x14, 0, 20); */ MLXSW_ITEM32(pci, cqe2, mirror_reason, 0x18, 24, 8); +#define MLXSW_PCI_CQE2_MIRROR_LATENCY_INVALID 0xFFFFFF + +/* pci_cqe_mirror_latency + * End-to-end latency of the original packet that does mirroring to the CPU. + * Value of 0xFFFFFF means that the latency is invalid. Units are according to + * MOGCR.mirror_latency_units. + */ +MLXSW_ITEM32(pci, cqe2, mirror_latency, 0x1C, 8, 24); + /* pci_cqe_owner * Ownership bit. */ diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 93b15b8c007e6..6054147fd51ca 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -2212,32 +2212,6 @@ void mlxsw_sp_ptp_receive(struct mlxsw_sp *mlxsw_sp, struct sk_buff *skb, mlxsw_sp->ptp_ops->receive(mlxsw_sp, skb, local_port); } -void mlxsw_sp_sample_receive(struct mlxsw_sp *mlxsw_sp, struct sk_buff *skb, - u8 local_port) -{ - struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp->ports[local_port]; - struct mlxsw_sp_port_sample *sample; - u32 size; - - if (unlikely(!mlxsw_sp_port)) { - dev_warn_ratelimited(mlxsw_sp->bus_info->dev, "Port %d: sample skb received for non-existent port\n", - local_port); - goto out; - } - - rcu_read_lock(); - sample = rcu_dereference(mlxsw_sp_port->sample); - if (!sample) - goto out_unlock; - size = sample->truncate ? sample->trunc_size : skb->len; - psample_sample_packet(sample->psample_group, skb, size, - mlxsw_sp_port->dev->ifindex, 0, sample->rate); -out_unlock: - rcu_read_unlock(); -out: - consume_skb(skb); -} - #define MLXSW_SP_RXL_NO_MARK(_trap_id, _action, _trap_group, _is_ctrl) \ MLXSW_RXL(mlxsw_sp_rx_listener_no_mark_func, _trap_id, _action, \ _is_ctrl, SP_##_trap_group, DISCARD) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index bc7006de7873b..0082f70daff3a 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -570,8 +570,6 @@ void mlxsw_sp_rx_listener_no_mark_func(struct sk_buff *skb, u8 local_port, void *priv); void mlxsw_sp_ptp_receive(struct mlxsw_sp *mlxsw_sp, struct sk_buff *skb, u8 local_port); -void mlxsw_sp_sample_receive(struct mlxsw_sp *mlxsw_sp, struct sk_buff *skb, - u8 local_port); int mlxsw_sp_port_speed_get(struct mlxsw_sp_port *mlxsw_sp_port, u32 *speed); int mlxsw_sp_port_ets_set(struct mlxsw_sp_port *mlxsw_sp_port, enum mlxsw_reg_qeec_hr hr, u8 index, u8 next_index, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c index e0e6ee58d31a7..056201029ce57 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c @@ -108,7 +108,7 @@ static void mlxsw_sp_rx_drop_listener(struct sk_buff *skb, u8 local_port, static void mlxsw_sp_rx_acl_drop_listener(struct sk_buff *skb, u8 local_port, void *trap_ctx) { - u32 cookie_index = mlxsw_skb_cb(skb)->cookie_index; + u32 cookie_index = mlxsw_skb_cb(skb)->rx_md_info.cookie_index; const struct flow_action_cookie *fa_cookie; struct devlink_port *in_devlink_port; struct mlxsw_sp_port *mlxsw_sp_port; @@ -204,21 +204,86 @@ static void mlxsw_sp_rx_ptp_listener(struct sk_buff *skb, u8 local_port, mlxsw_sp_ptp_receive(mlxsw_sp, skb, local_port); } +static struct mlxsw_sp_port * +mlxsw_sp_sample_tx_port_get(struct mlxsw_sp *mlxsw_sp, + const struct mlxsw_rx_md_info *rx_md_info) +{ + u8 local_port; + + if (!rx_md_info->tx_port_valid) + return NULL; + + if (rx_md_info->tx_port_is_lag) + local_port = mlxsw_core_lag_mapping_get(mlxsw_sp->core, + rx_md_info->tx_lag_id, + rx_md_info->tx_lag_port_index); + else + local_port = rx_md_info->tx_sys_port; + + if (local_port >= mlxsw_core_max_ports(mlxsw_sp->core)) + return NULL; + + return mlxsw_sp->ports[local_port]; +} + +/* The latency units are determined according to MOGCR.mirror_latency_units. It + * defaults to 64 nanoseconds. + */ +#define MLXSW_SP_MIRROR_LATENCY_SHIFT 6 + +static void mlxsw_sp_psample_md_init(struct mlxsw_sp *mlxsw_sp, + struct psample_metadata *md, + struct sk_buff *skb, int in_ifindex, + bool truncate, u32 trunc_size) +{ + struct mlxsw_rx_md_info *rx_md_info = &mlxsw_skb_cb(skb)->rx_md_info; + struct mlxsw_sp_port *mlxsw_sp_port; + + md->trunc_size = truncate ? trunc_size : skb->len; + md->in_ifindex = in_ifindex; + mlxsw_sp_port = mlxsw_sp_sample_tx_port_get(mlxsw_sp, rx_md_info); + md->out_ifindex = mlxsw_sp_port && mlxsw_sp_port->dev ? + mlxsw_sp_port->dev->ifindex : 0; + md->out_tc_valid = rx_md_info->tx_tc_valid; + md->out_tc = rx_md_info->tx_tc; + md->out_tc_occ_valid = rx_md_info->tx_congestion_valid; + md->out_tc_occ = rx_md_info->tx_congestion; + md->latency_valid = rx_md_info->latency_valid; + md->latency = rx_md_info->latency; + md->latency <<= MLXSW_SP_MIRROR_LATENCY_SHIFT; +} + static void mlxsw_sp_rx_sample_listener(struct sk_buff *skb, u8 local_port, void *trap_ctx) { struct mlxsw_sp *mlxsw_sp = devlink_trap_ctx_priv(trap_ctx); + struct mlxsw_sp_port *mlxsw_sp_port; + struct mlxsw_sp_port_sample *sample; + struct psample_metadata md = {}; int err; err = __mlxsw_sp_rx_no_mark_listener(skb, local_port, trap_ctx); if (err) return; - /* The sample handler expects skb->data to point to the start of the + mlxsw_sp_port = mlxsw_sp->ports[local_port]; + if (!mlxsw_sp_port) + goto out; + + sample = rcu_dereference(mlxsw_sp_port->sample); + if (!sample) + goto out; + + /* The psample module expects skb->data to point to the start of the * Ethernet header. */ skb_push(skb, ETH_HLEN); - mlxsw_sp_sample_receive(mlxsw_sp, skb, local_port); + mlxsw_sp_psample_md_init(mlxsw_sp, &md, skb, + mlxsw_sp_port->dev->ifindex, sample->truncate, + sample->trunc_size); + psample_sample_packet(sample->psample_group, skb, sample->rate, &md); +out: + consume_skb(skb); } #define MLXSW_SP_TRAP_DROP(_id, _group_id) \ diff --git a/drivers/net/netdevsim/Makefile b/drivers/net/netdevsim/Makefile index ade086eed9551..a1cbfa44a1e13 100644 --- a/drivers/net/netdevsim/Makefile +++ b/drivers/net/netdevsim/Makefile @@ -13,3 +13,7 @@ endif ifneq ($(CONFIG_XFRM_OFFLOAD),) netdevsim-objs += ipsec.o endif + +ifneq ($(CONFIG_PSAMPLE),) +netdevsim-objs += psample.o +endif diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index dbeb29fa16e81..6189a4c0d39e9 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -1032,10 +1032,14 @@ static int nsim_dev_reload_create(struct nsim_dev *nsim_dev, if (err) goto err_fib_destroy; - err = nsim_dev_port_add_all(nsim_dev, nsim_bus_dev->port_count); + err = nsim_dev_psample_init(nsim_dev); if (err) goto err_health_exit; + err = nsim_dev_port_add_all(nsim_dev, nsim_bus_dev->port_count); + if (err) + goto err_psample_exit; + nsim_dev->take_snapshot = debugfs_create_file("take_snapshot", 0200, nsim_dev->ddir, @@ -1043,6 +1047,8 @@ static int nsim_dev_reload_create(struct nsim_dev *nsim_dev, &nsim_dev_take_snapshot_fops); return 0; +err_psample_exit: + nsim_dev_psample_exit(nsim_dev); err_health_exit: nsim_dev_health_exit(nsim_dev); err_fib_destroy: @@ -1118,14 +1124,20 @@ int nsim_dev_probe(struct nsim_bus_dev *nsim_bus_dev) if (err) goto err_health_exit; - err = nsim_dev_port_add_all(nsim_dev, nsim_bus_dev->port_count); + err = nsim_dev_psample_init(nsim_dev); if (err) goto err_bpf_dev_exit; + err = nsim_dev_port_add_all(nsim_dev, nsim_bus_dev->port_count); + if (err) + goto err_psample_exit; + devlink_params_publish(devlink); devlink_reload_enable(devlink); return 0; +err_psample_exit: + nsim_dev_psample_exit(nsim_dev); err_bpf_dev_exit: nsim_bpf_dev_exit(nsim_dev); err_health_exit: @@ -1158,6 +1170,7 @@ static void nsim_dev_reload_destroy(struct nsim_dev *nsim_dev) return; debugfs_remove(nsim_dev->take_snapshot); nsim_dev_port_del_all(nsim_dev); + nsim_dev_psample_exit(nsim_dev); nsim_dev_health_exit(nsim_dev); nsim_fib_destroy(devlink, nsim_dev->fib_data); nsim_dev_traps_exit(devlink); diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h index 48163c5f2ec91..d735c21def4b1 100644 --- a/drivers/net/netdevsim/netdevsim.h +++ b/drivers/net/netdevsim/netdevsim.h @@ -180,6 +180,20 @@ struct nsim_dev_health { int nsim_dev_health_init(struct nsim_dev *nsim_dev, struct devlink *devlink); void nsim_dev_health_exit(struct nsim_dev *nsim_dev); +#if IS_ENABLED(CONFIG_PSAMPLE) +int nsim_dev_psample_init(struct nsim_dev *nsim_dev); +void nsim_dev_psample_exit(struct nsim_dev *nsim_dev); +#else +static inline int nsim_dev_psample_init(struct nsim_dev *nsim_dev) +{ + return 0; +} + +static inline void nsim_dev_psample_exit(struct nsim_dev *nsim_dev) +{ +} +#endif + struct nsim_dev_port { struct list_head list; struct devlink_port devlink_port; @@ -229,6 +243,7 @@ struct nsim_dev { bool static_iana_vxlan; u32 sleep; } udp_ports; + struct nsim_dev_psample *psample; }; static inline struct net *nsim_dev_net(struct nsim_dev *nsim_dev) diff --git a/drivers/net/netdevsim/psample.c b/drivers/net/netdevsim/psample.c new file mode 100644 index 0000000000000..5ec3bd7f891b7 --- /dev/null +++ b/drivers/net/netdevsim/psample.c @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Mellanox Technologies. All rights reserved */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "netdevsim.h" + +#define NSIM_PSAMPLE_REPORT_INTERVAL_MS 100 +#define NSIM_PSAMPLE_INVALID_TC 0xFFFF +#define NSIM_PSAMPLE_L4_DATA_LEN 100 + +struct nsim_dev_psample { + struct delayed_work psample_dw; + struct dentry *ddir; + struct psample_group *group; + u32 rate; + u32 group_num; + u32 trunc_size; + int in_ifindex; + int out_ifindex; + u16 out_tc; + u64 out_tc_occ_max; + u64 latency_max; + bool is_active; +}; + +static struct sk_buff *nsim_dev_psample_skb_build(void) +{ + int tot_len, data_len = NSIM_PSAMPLE_L4_DATA_LEN; + struct sk_buff *skb; + struct udphdr *udph; + struct ethhdr *eth; + struct iphdr *iph; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return NULL; + tot_len = sizeof(struct iphdr) + sizeof(struct udphdr) + data_len; + + skb_reset_mac_header(skb); + eth = skb_put(skb, sizeof(struct ethhdr)); + eth_random_addr(eth->h_dest); + eth_random_addr(eth->h_source); + eth->h_proto = htons(ETH_P_IP); + skb->protocol = htons(ETH_P_IP); + + skb_set_network_header(skb, skb->len); + iph = skb_put(skb, sizeof(struct iphdr)); + iph->protocol = IPPROTO_UDP; + iph->saddr = in_aton("192.0.2.1"); + iph->daddr = in_aton("198.51.100.1"); + iph->version = 0x4; + iph->frag_off = 0; + iph->ihl = 0x5; + iph->tot_len = htons(tot_len); + iph->id = 0; + iph->ttl = 100; + iph->check = 0; + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + + skb_set_transport_header(skb, skb->len); + udph = skb_put_zero(skb, sizeof(struct udphdr) + data_len); + get_random_bytes(&udph->source, sizeof(u16)); + get_random_bytes(&udph->dest, sizeof(u16)); + udph->len = htons(sizeof(struct udphdr) + data_len); + + return skb; +} + +static void nsim_dev_psample_md_prepare(const struct nsim_dev_psample *psample, + struct psample_metadata *md) +{ + md->trunc_size = psample->trunc_size; + md->in_ifindex = psample->in_ifindex; + md->out_ifindex = psample->out_ifindex; + + if (psample->out_tc != NSIM_PSAMPLE_INVALID_TC) { + md->out_tc = psample->out_tc; + md->out_tc_valid = 1; + } + + if (psample->out_tc_occ_max) { + u64 out_tc_occ; + + get_random_bytes(&out_tc_occ, sizeof(u64)); + md->out_tc_occ = out_tc_occ & (psample->out_tc_occ_max - 1); + md->out_tc_occ_valid = 1; + } + + if (psample->latency_max) { + u64 latency; + + get_random_bytes(&latency, sizeof(u64)); + md->latency = latency & (psample->latency_max - 1); + md->latency_valid = 1; + } +} + +static void nsim_dev_psample_report_work(struct work_struct *work) +{ + struct nsim_dev_psample *psample; + struct psample_metadata md = {}; + struct sk_buff *skb; + unsigned long delay; + + psample = container_of(work, struct nsim_dev_psample, psample_dw.work); + + skb = nsim_dev_psample_skb_build(); + if (!skb) + goto out; + + nsim_dev_psample_md_prepare(psample, &md); + psample_sample_packet(psample->group, skb, psample->rate, &md); + consume_skb(skb); + +out: + delay = msecs_to_jiffies(NSIM_PSAMPLE_REPORT_INTERVAL_MS); + schedule_delayed_work(&psample->psample_dw, delay); +} + +static int nsim_dev_psample_enable(struct nsim_dev *nsim_dev) +{ + struct nsim_dev_psample *psample = nsim_dev->psample; + struct devlink *devlink; + unsigned long delay; + + if (psample->is_active) + return -EBUSY; + + devlink = priv_to_devlink(nsim_dev); + psample->group = psample_group_get(devlink_net(devlink), + psample->group_num); + if (!psample->group) + return -EINVAL; + + delay = msecs_to_jiffies(NSIM_PSAMPLE_REPORT_INTERVAL_MS); + schedule_delayed_work(&psample->psample_dw, delay); + + psample->is_active = true; + + return 0; +} + +static int nsim_dev_psample_disable(struct nsim_dev *nsim_dev) +{ + struct nsim_dev_psample *psample = nsim_dev->psample; + + if (!psample->is_active) + return -EINVAL; + + psample->is_active = false; + + cancel_delayed_work_sync(&psample->psample_dw); + psample_group_put(psample->group); + + return 0; +} + +static ssize_t nsim_dev_psample_enable_write(struct file *file, + const char __user *data, + size_t count, loff_t *ppos) +{ + struct nsim_dev *nsim_dev = file->private_data; + bool enable; + int err; + + err = kstrtobool_from_user(data, count, &enable); + if (err) + return err; + + if (enable) + err = nsim_dev_psample_enable(nsim_dev); + else + err = nsim_dev_psample_disable(nsim_dev); + + return err ? err : count; +} + +static const struct file_operations nsim_psample_enable_fops = { + .open = simple_open, + .write = nsim_dev_psample_enable_write, + .llseek = generic_file_llseek, + .owner = THIS_MODULE, +}; + +int nsim_dev_psample_init(struct nsim_dev *nsim_dev) +{ + struct nsim_dev_psample *psample; + int err; + + psample = kzalloc(sizeof(*psample), GFP_KERNEL); + if (!psample) + return -ENOMEM; + nsim_dev->psample = psample; + + INIT_DELAYED_WORK(&psample->psample_dw, nsim_dev_psample_report_work); + + psample->ddir = debugfs_create_dir("psample", nsim_dev->ddir); + if (IS_ERR(psample->ddir)) { + err = PTR_ERR(psample->ddir); + goto err_psample_free; + } + + /* Populate sampling parameters with sane defaults. */ + psample->rate = 100; + debugfs_create_u32("rate", 0600, psample->ddir, &psample->rate); + + psample->group_num = 10; + debugfs_create_u32("group_num", 0600, psample->ddir, + &psample->group_num); + + psample->trunc_size = 0; + debugfs_create_u32("trunc_size", 0600, psample->ddir, + &psample->trunc_size); + + psample->in_ifindex = 1; + debugfs_create_u32("in_ifindex", 0600, psample->ddir, + &psample->in_ifindex); + + psample->out_ifindex = 2; + debugfs_create_u32("out_ifindex", 0600, psample->ddir, + &psample->out_ifindex); + + psample->out_tc = 0; + debugfs_create_u16("out_tc", 0600, psample->ddir, &psample->out_tc); + + psample->out_tc_occ_max = 10000; + debugfs_create_u64("out_tc_occ_max", 0600, psample->ddir, + &psample->out_tc_occ_max); + + psample->latency_max = 50; + debugfs_create_u64("latency_max", 0600, psample->ddir, + &psample->latency_max); + + debugfs_create_file("enable", 0200, psample->ddir, nsim_dev, + &nsim_psample_enable_fops); + + return 0; + +err_psample_free: + kfree(nsim_dev->psample); + return err; +} + +void nsim_dev_psample_exit(struct nsim_dev *nsim_dev) +{ + debugfs_remove_recursive(nsim_dev->psample->ddir); + if (nsim_dev->psample->is_active) { + cancel_delayed_work_sync(&nsim_dev->psample->psample_dw); + psample_group_put(nsim_dev->psample->group); + } + kfree(nsim_dev->psample); +} diff --git a/include/net/psample.h b/include/net/psample.h index 68ae16bb0a4a8..e328c51277571 100644 --- a/include/net/psample.h +++ b/include/net/psample.h @@ -14,6 +14,19 @@ struct psample_group { struct rcu_head rcu; }; +struct psample_metadata { + u32 trunc_size; + int in_ifindex; + int out_ifindex; + u16 out_tc; + u64 out_tc_occ; /* bytes */ + u64 latency; /* nanoseconds */ + u8 out_tc_valid:1, + out_tc_occ_valid:1, + latency_valid:1, + unused:5; +}; + struct psample_group *psample_group_get(struct net *net, u32 group_num); void psample_group_take(struct psample_group *group); void psample_group_put(struct psample_group *group); @@ -21,15 +34,13 @@ void psample_group_put(struct psample_group *group); #if IS_ENABLED(CONFIG_PSAMPLE) void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, - u32 trunc_size, int in_ifindex, int out_ifindex, - u32 sample_rate); + u32 sample_rate, const struct psample_metadata *md); #else static inline void psample_sample_packet(struct psample_group *group, - struct sk_buff *skb, u32 trunc_size, - int in_ifindex, int out_ifindex, - u32 sample_rate) + struct sk_buff *skb, u32 sample_rate, + const struct psample_metadata *md) { } diff --git a/include/uapi/linux/psample.h b/include/uapi/linux/psample.h index aea26ab1431c1..c6329f71b939f 100644 --- a/include/uapi/linux/psample.h +++ b/include/uapi/linux/psample.h @@ -16,6 +16,13 @@ enum { /* commands attributes */ PSAMPLE_ATTR_GROUP_REFCOUNT, + PSAMPLE_ATTR_PAD, + PSAMPLE_ATTR_OUT_TC, /* u16 */ + PSAMPLE_ATTR_OUT_TC_OCC, /* u64, bytes */ + PSAMPLE_ATTR_LATENCY, /* u64, nanoseconds */ + PSAMPLE_ATTR_TIMESTAMP, /* u64, nanoseconds */ + PSAMPLE_ATTR_PROTO, /* u16 */ + __PSAMPLE_ATTR_MAX }; diff --git a/net/psample/psample.c b/net/psample/psample.c index 482c07f2766b1..118d5d2a81a02 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -356,9 +357,12 @@ static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info) #endif void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, - u32 trunc_size, int in_ifindex, int out_ifindex, - u32 sample_rate) + u32 sample_rate, const struct psample_metadata *md) { + ktime_t tstamp = ktime_get_real(); + int out_ifindex = md->out_ifindex; + int in_ifindex = md->in_ifindex; + u32 trunc_size = md->trunc_size; #ifdef CONFIG_INET struct ip_tunnel_info *tun_info; #endif @@ -370,10 +374,15 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, meta_len = (in_ifindex ? nla_total_size(sizeof(u16)) : 0) + (out_ifindex ? nla_total_size(sizeof(u16)) : 0) + + (md->out_tc_valid ? nla_total_size(sizeof(u16)) : 0) + + (md->out_tc_occ_valid ? nla_total_size_64bit(sizeof(u64)) : 0) + + (md->latency_valid ? nla_total_size_64bit(sizeof(u64)) : 0) + nla_total_size(sizeof(u32)) + /* sample_rate */ nla_total_size(sizeof(u32)) + /* orig_size */ nla_total_size(sizeof(u32)) + /* group_num */ - nla_total_size(sizeof(u32)); /* seq */ + nla_total_size(sizeof(u32)) + /* seq */ + nla_total_size_64bit(sizeof(u64)) + /* timestamp */ + nla_total_size(sizeof(u16)); /* protocol */ #ifdef CONFIG_INET tun_info = skb_tunnel_info(skb); @@ -423,6 +432,36 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, if (unlikely(ret < 0)) goto error; + if (md->out_tc_valid) { + ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_OUT_TC, md->out_tc); + if (unlikely(ret < 0)) + goto error; + } + + if (md->out_tc_occ_valid) { + ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_OUT_TC_OCC, + md->out_tc_occ, PSAMPLE_ATTR_PAD); + if (unlikely(ret < 0)) + goto error; + } + + if (md->latency_valid) { + ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_LATENCY, + md->latency, PSAMPLE_ATTR_PAD); + if (unlikely(ret < 0)) + goto error; + } + + ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_TIMESTAMP, + ktime_to_ns(tstamp), PSAMPLE_ATTR_PAD); + if (unlikely(ret < 0)) + goto error; + + ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_PROTO, + be16_to_cpu(skb->protocol)); + if (unlikely(ret < 0)) + goto error; + if (data_len) { int nla_len = nla_total_size(data_len); struct nlattr *nla; diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index db8ee9e5c8c22..6a0c16e4351d7 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -158,10 +158,8 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, { struct tcf_sample *s = to_sample(a); struct psample_group *psample_group; + struct psample_metadata md = {}; int retval; - int size; - int iif; - int oif; tcf_lastuse_update(&s->tcf_tm); bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb); @@ -172,20 +170,18 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, /* randomly sample packets according to rate */ if (psample_group && (prandom_u32() % s->rate == 0)) { if (!skb_at_tc_ingress(skb)) { - iif = skb->skb_iif; - oif = skb->dev->ifindex; + md.in_ifindex = skb->skb_iif; + md.out_ifindex = skb->dev->ifindex; } else { - iif = skb->dev->ifindex; - oif = 0; + md.in_ifindex = skb->dev->ifindex; } /* on ingress, the mac header gets popped, so push it back */ if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev)) skb_push(skb, skb->mac_len); - size = s->truncate ? s->trunc_size : skb->len; - psample_sample_packet(psample_group, skb, size, iif, oif, - s->rate); + md.trunc_size = s->truncate ? s->trunc_size : skb->len; + psample_sample_packet(psample_group, skb, s->rate, &md); if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev)) skb_pull(skb, skb->mac_len); diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh new file mode 100755 index 0000000000000..75d00104f2915 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh @@ -0,0 +1,492 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test that packets are sampled when tc-sample is used and that reported +# metadata is correct. Two sets of hosts (with and without LAG) are used, since +# metadata extraction in mlxsw is a bit different when LAG is involved. +# +# +---------------------------------+ +---------------------------------+ +# | H1 (vrf) | | H3 (vrf) | +# | + $h1 | | + $h3_lag | +# | | 192.0.2.1/28 | | | 192.0.2.17/28 | +# | | | | | | +# | | default via 192.0.2.2 | | | default via 192.0.2.18 | +# +----|----------------------------+ +----|----------------------------+ +# | | +# +----|-----------------------------------------|----------------------------+ +# | | 192.0.2.2/28 | 192.0.2.18/28 | +# | + $rp1 + $rp3_lag | +# | | +# | + $rp2 + $rp4_lag | +# | | 198.51.100.2/28 | 198.51.100.18/28 | +# +----|-----------------------------------------|----------------------------+ +# | | +# +----|----------------------------+ +----|----------------------------+ +# | | default via 198.51.100.2 | | | default via 198.51.100.18 | +# | | | | | | +# | | 198.51.100.1/28 | | | 198.51.100.17/28 | +# | + $h2 | | + $h4_lag | +# | H2 (vrf) | | H4 (vrf) | +# +---------------------------------+ +---------------------------------+ + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + tc_sample_rate_test + tc_sample_max_rate_test + tc_sample_group_conflict_test + tc_sample_md_iif_test + tc_sample_md_lag_iif_test + tc_sample_md_oif_test + tc_sample_md_lag_oif_test + tc_sample_md_out_tc_test + tc_sample_md_out_tc_occ_test +" +NUM_NETIFS=8 +CAPTURE_FILE=$(mktemp) +source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh + +# Available at https://github.com/Mellanox/libpsample +require_command psample + +h1_create() +{ + simple_if_init $h1 192.0.2.1/28 + + ip -4 route add default vrf v$h1 nexthop via 192.0.2.2 +} + +h1_destroy() +{ + ip -4 route del default vrf v$h1 nexthop via 192.0.2.2 + + simple_if_fini $h1 192.0.2.1/28 +} + +h2_create() +{ + simple_if_init $h2 198.51.100.1/28 + + ip -4 route add default vrf v$h2 nexthop via 198.51.100.2 +} + +h2_destroy() +{ + ip -4 route del default vrf v$h2 nexthop via 198.51.100.2 + + simple_if_fini $h2 198.51.100.1/28 +} + +h3_create() +{ + ip link set dev $h3 down + ip link add name ${h3}_bond type bond mode 802.3ad + ip link set dev $h3 master ${h3}_bond + + simple_if_init ${h3}_bond 192.0.2.17/28 + + ip -4 route add default vrf v${h3}_bond nexthop via 192.0.2.18 +} + +h3_destroy() +{ + ip -4 route del default vrf v${h3}_bond nexthop via 192.0.2.18 + + simple_if_fini ${h3}_bond 192.0.2.17/28 + + ip link set dev $h3 nomaster + ip link del dev ${h3}_bond +} + +h4_create() +{ + ip link set dev $h4 down + ip link add name ${h4}_bond type bond mode 802.3ad + ip link set dev $h4 master ${h4}_bond + + simple_if_init ${h4}_bond 198.51.100.17/28 + + ip -4 route add default vrf v${h4}_bond nexthop via 198.51.100.18 +} + +h4_destroy() +{ + ip -4 route del default vrf v${h4}_bond nexthop via 198.51.100.18 + + simple_if_fini ${h4}_bond 198.51.100.17/28 + + ip link set dev $h4 nomaster + ip link del dev ${h4}_bond +} + +router_create() +{ + ip link set dev $rp1 up + __addr_add_del $rp1 add 192.0.2.2/28 + tc qdisc add dev $rp1 clsact + + ip link set dev $rp2 up + __addr_add_del $rp2 add 198.51.100.2/28 + tc qdisc add dev $rp2 clsact + + ip link add name ${rp3}_bond type bond mode 802.3ad + ip link set dev $rp3 master ${rp3}_bond + __addr_add_del ${rp3}_bond add 192.0.2.18/28 + tc qdisc add dev $rp3 clsact + ip link set dev ${rp3}_bond up + + ip link add name ${rp4}_bond type bond mode 802.3ad + ip link set dev $rp4 master ${rp4}_bond + __addr_add_del ${rp4}_bond add 198.51.100.18/28 + tc qdisc add dev $rp4 clsact + ip link set dev ${rp4}_bond up +} + +router_destroy() +{ + ip link set dev ${rp4}_bond down + tc qdisc del dev $rp4 clsact + __addr_add_del ${rp4}_bond del 198.51.100.18/28 + ip link set dev $rp4 nomaster + ip link del dev ${rp4}_bond + + ip link set dev ${rp3}_bond down + tc qdisc del dev $rp3 clsact + __addr_add_del ${rp3}_bond del 192.0.2.18/28 + ip link set dev $rp3 nomaster + ip link del dev ${rp3}_bond + + tc qdisc del dev $rp2 clsact + __addr_add_del $rp2 del 198.51.100.2/28 + ip link set dev $rp2 down + + tc qdisc del dev $rp1 clsact + __addr_add_del $rp1 del 192.0.2.2/28 + ip link set dev $rp1 down +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + rp1=${NETIFS[p2]} + rp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + h3=${NETIFS[p5]} + rp3=${NETIFS[p6]} + h4=${NETIFS[p7]} + rp4=${NETIFS[p8]} + + vrf_prepare + + h1_create + h2_create + h3_create + h4_create + router_create +} + +cleanup() +{ + pre_cleanup + + rm -f $CAPTURE_FILE + + router_destroy + h4_destroy + h3_destroy + h2_destroy + h1_destroy + + vrf_cleanup +} + +psample_capture_start() +{ + rm -f $CAPTURE_FILE + + psample &> $CAPTURE_FILE & + + sleep 1 +} + +psample_capture_stop() +{ + { kill %% && wait %%; } 2>/dev/null +} + +__tc_sample_rate_test() +{ + local desc=$1; shift + local dip=$1; shift + local pkts pct + + RET=0 + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 32 group 1 + check_err $? "Failed to configure sampling rule" + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + -B $dip -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + pkts=$(grep -e "group 1 " $CAPTURE_FILE | wc -l) + pct=$((100 * (pkts - 100) / 100)) + (( -25 <= pct && pct <= 25)) + check_err $? "Expected 100 packets, got $pkts packets, which is $pct% off. Required accuracy is +-25%" + + log_test "tc sample rate ($desc)" + + tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall +} + +tc_sample_rate_test() +{ + __tc_sample_rate_test "forward" 198.51.100.1 + __tc_sample_rate_test "local receive" 192.0.2.2 +} + +tc_sample_max_rate_test() +{ + RET=0 + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate $((35 * 10 ** 8)) group 1 + check_err $? "Failed to configure sampling rule with max rate" + + tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate $((35 * 10 ** 8 + 1)) \ + group 1 &> /dev/null + check_fail $? "Managed to configure sampling rate above maximum" + + log_test "tc sample maximum rate" +} + +tc_sample_group_conflict_test() +{ + RET=0 + + # Test that two sampling rules cannot be configured on the same port + # with different groups. + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 1024 group 1 + check_err $? "Failed to configure sampling rule" + + tc filter add dev $rp1 ingress protocol all pref 2 handle 102 matchall \ + skip_sw action sample rate 1024 group 2 &> /dev/null + check_fail $? "Managed to configure sampling rule with conflicting group" + + log_test "tc sample group conflict test" + + tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall +} + +tc_sample_md_iif_test() +{ + local rp1_ifindex + + RET=0 + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 5 group 1 + check_err $? "Failed to configure sampling rule" + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + -B 198.51.100.1 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + rp1_ifindex=$(ip -j -p link show dev $rp1 | jq '.[]["ifindex"]') + grep -q -e "in-ifindex $rp1_ifindex " $CAPTURE_FILE + check_err $? "Sampled packets do not have expected in-ifindex" + + log_test "tc sample iif" + + tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall +} + +tc_sample_md_lag_iif_test() +{ + local rp3_ifindex + + RET=0 + + tc filter add dev $rp3 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 5 group 1 + check_err $? "Failed to configure sampling rule" + + psample_capture_start + + ip vrf exec v${h3}_bond $MZ ${h3}_bond -c 3200 -d 1msec -p 64 \ + -A 192.0.2.17 -B 198.51.100.17 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + rp3_ifindex=$(ip -j -p link show dev $rp3 | jq '.[]["ifindex"]') + grep -q -e "in-ifindex $rp3_ifindex " $CAPTURE_FILE + check_err $? "Sampled packets do not have expected in-ifindex" + + log_test "tc sample lag iif" + + tc filter del dev $rp3 ingress protocol all pref 1 handle 101 matchall +} + +tc_sample_md_oif_test() +{ + local rp2_ifindex + + RET=0 + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 5 group 1 + check_err $? "Failed to configure sampling rule" + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + -B 198.51.100.1 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + rp2_ifindex=$(ip -j -p link show dev $rp2 | jq '.[]["ifindex"]') + grep -q -e "out-ifindex $rp2_ifindex " $CAPTURE_FILE + check_err $? "Sampled packets do not have expected out-ifindex" + + log_test "tc sample oif" + + tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall +} + +tc_sample_md_lag_oif_test() +{ + local rp4_ifindex + + RET=0 + + tc filter add dev $rp3 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 5 group 1 + check_err $? "Failed to configure sampling rule" + + psample_capture_start + + ip vrf exec v${h3}_bond $MZ ${h3}_bond -c 3200 -d 1msec -p 64 \ + -A 192.0.2.17 -B 198.51.100.17 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + rp4_ifindex=$(ip -j -p link show dev $rp4 | jq '.[]["ifindex"]') + grep -q -e "out-ifindex $rp4_ifindex " $CAPTURE_FILE + check_err $? "Sampled packets do not have expected out-ifindex" + + log_test "tc sample lag oif" + + tc filter del dev $rp3 ingress protocol all pref 1 handle 101 matchall +} + +tc_sample_md_out_tc_test() +{ + RET=0 + + # Output traffic class is not supported on Spectrum-1. + [[ "$DEVLINK_VIDDID" == "15b3:cb84" ]] && return + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 5 group 1 + check_err $? "Failed to configure sampling rule" + + # By default, all the packets should go to the same traffic class (0). + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + -B 198.51.100.1 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + grep -q -e "out-tc 0 " $CAPTURE_FILE + check_err $? "Sampled packets do not have expected out-tc (0)" + + # Map all priorities to highest traffic class (7) and check reported + # out-tc. + tc qdisc replace dev $rp2 root handle 1: \ + prio bands 3 priomap 0 0 0 0 0 0 0 0 + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + -B 198.51.100.1 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + grep -q -e "out-tc 7 " $CAPTURE_FILE + check_err $? "Sampled packets do not have expected out-tc (7)" + + log_test "tc sample out-tc" + + tc qdisc del dev $rp2 root handle 1: + tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall +} + +tc_sample_md_out_tc_occ_test() +{ + local backlog pct occ + + RET=0 + + # Output traffic class occupancy is not supported on Spectrum-1. + [[ "$DEVLINK_VIDDID" == "15b3:cb84" ]] && return + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 1024 group 1 + check_err $? "Failed to configure sampling rule" + + # Configure a shaper on egress to create congestion. + tc qdisc replace dev $rp2 root handle 1: \ + tbf rate 1Mbit burst 256k limit 1M + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 0 -d 1usec -p 1400 -A 192.0.2.1 \ + -B 198.51.100.1 -t udp dp=52768,sp=42768 -q & + + # Allow congestion to reach steady state. + sleep 10 + + backlog=$(tc -j -p -s qdisc show dev $rp2 | jq '.[0]["backlog"]') + + # Kill mausezahn. + { kill %% && wait %%; } 2>/dev/null + + psample_capture_stop + + # Record last congestion sample. + occ=$(grep -e "out-tc-occ " $CAPTURE_FILE | tail -n 1 | \ + cut -d ' ' -f 16) + + pct=$((100 * (occ - backlog) / backlog)) + (( -1 <= pct && pct <= 1)) + check_err $? "Recorded a congestion of $backlog bytes, but sampled congestion is $occ bytes, which is $pct% off. Required accuracy is +-5%" + + log_test "tc sample out-tc-occ" + + tc qdisc del dev $rp2 root handle 1: + tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall +} + +trap cleanup EXIT + +setup_prepare +setup_wait + +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/netdevsim/psample.sh b/tools/testing/selftests/drivers/net/netdevsim/psample.sh new file mode 100755 index 0000000000000..ee10b1a8933c4 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netdevsim/psample.sh @@ -0,0 +1,181 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# This test is for checking the psample module. It makes use of netdevsim +# which periodically generates "sampled" packets. + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + psample_enable_test + psample_group_num_test + psample_md_test +" +NETDEVSIM_PATH=/sys/bus/netdevsim/ +DEV_ADDR=1337 +DEV=netdevsim${DEV_ADDR} +DEVLINK_DEV=netdevsim/${DEV} +SYSFS_NET_DIR=/sys/bus/netdevsim/devices/$DEV/net/ +PSAMPLE_DIR=/sys/kernel/debug/netdevsim/$DEV/psample/ +CAPTURE_FILE=$(mktemp) +NUM_NETIFS=0 +source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh + +# Available at https://github.com/Mellanox/libpsample +require_command psample + +psample_capture() +{ + rm -f $CAPTURE_FILE + + timeout 2 ip netns exec testns1 psample &> $CAPTURE_FILE +} + +psample_enable_test() +{ + RET=0 + + echo 1 > $PSAMPLE_DIR/enable + check_err $? "Failed to enable sampling when should not" + + echo 1 > $PSAMPLE_DIR/enable 2>/dev/null + check_fail $? "Sampling enablement succeeded when should fail" + + psample_capture + if [ $(cat $CAPTURE_FILE | wc -l) -eq 0 ]; then + check_err 1 "Failed to capture sampled packets" + fi + + echo 0 > $PSAMPLE_DIR/enable + check_err $? "Failed to disable sampling when should not" + + echo 0 > $PSAMPLE_DIR/enable 2>/dev/null + check_fail $? "Sampling disablement succeeded when should fail" + + psample_capture + if [ $(cat $CAPTURE_FILE | wc -l) -ne 0 ]; then + check_err 1 "Captured sampled packets when should not" + fi + + log_test "psample enable / disable" +} + +psample_group_num_test() +{ + RET=0 + + echo 1234 > $PSAMPLE_DIR/group_num + echo 1 > $PSAMPLE_DIR/enable + + psample_capture + grep -q -e "group 1234" $CAPTURE_FILE + check_err $? "Sampled packets reported with wrong group number" + + # New group number should only be used after disable / enable. + echo 4321 > $PSAMPLE_DIR/group_num + + psample_capture + grep -q -e "group 4321" $CAPTURE_FILE + check_fail $? "Group number changed while sampling is active" + + echo 0 > $PSAMPLE_DIR/enable && echo 1 > $PSAMPLE_DIR/enable + + psample_capture + grep -q -e "group 4321" $CAPTURE_FILE + check_err $? "Group number did not change after restarting sampling" + + log_test "psample group number" + + echo 0 > $PSAMPLE_DIR/enable +} + +psample_md_test() +{ + RET=0 + + echo 1 > $PSAMPLE_DIR/enable + + echo 1234 > $PSAMPLE_DIR/in_ifindex + echo 4321 > $PSAMPLE_DIR/out_ifindex + psample_capture + + grep -q -e "in-ifindex 1234" $CAPTURE_FILE + check_err $? "Sampled packets reported with wrong in-ifindex" + + grep -q -e "out-ifindex 4321" $CAPTURE_FILE + check_err $? "Sampled packets reported with wrong out-ifindex" + + echo 5 > $PSAMPLE_DIR/out_tc + psample_capture + + grep -q -e "out-tc 5" $CAPTURE_FILE + check_err $? "Sampled packets reported with wrong out-tc" + + echo $((2**16 - 1)) > $PSAMPLE_DIR/out_tc + psample_capture + + grep -q -e "out-tc " $CAPTURE_FILE + check_fail $? "Sampled packets reported with out-tc when should not" + + echo 1 > $PSAMPLE_DIR/out_tc + echo 10000 > $PSAMPLE_DIR/out_tc_occ_max + psample_capture + + grep -q -e "out-tc-occ " $CAPTURE_FILE + check_err $? "Sampled packets not reported with out-tc-occ when should" + + echo 0 > $PSAMPLE_DIR/out_tc_occ_max + psample_capture + + grep -q -e "out-tc-occ " $CAPTURE_FILE + check_fail $? "Sampled packets reported with out-tc-occ when should not" + + echo 10000 > $PSAMPLE_DIR/latency_max + psample_capture + + grep -q -e "latency " $CAPTURE_FILE + check_err $? "Sampled packets not reported with latency when should" + + echo 0 > $PSAMPLE_DIR/latency_max + psample_capture + + grep -q -e "latency " $CAPTURE_FILE + check_fail $? "Sampled packets reported with latency when should not" + + log_test "psample metadata" + + echo 0 > $PSAMPLE_DIR/enable +} + +setup_prepare() +{ + modprobe netdevsim &> /dev/null + + echo "$DEV_ADDR 1" > ${NETDEVSIM_PATH}/new_device + while [ ! -d $SYSFS_NET_DIR ] ; do :; done + + set -e + + ip netns add testns1 + devlink dev reload $DEVLINK_DEV netns testns1 + + set +e +} + +cleanup() +{ + pre_cleanup + rm -f $CAPTURE_FILE + ip netns del testns1 + echo "$DEV_ADDR" > ${NETDEVSIM_PATH}/del_device + modprobe -r netdevsim &> /dev/null +} + +trap cleanup EXIT + +setup_prepare + +tests_run + +exit $EXIT_STATUS