From c7f0d4c898cb2a3b39a33813739fe4f12ac325d8 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Sun, 29 Mar 2020 14:05:53 +0300 Subject: [PATCH 1/3] netdevsim: Change dummy reporter auto recover default Health reporters should be registered with auto recover set to true. Align dummy reporter behaviour with that, as in later patch the option to set auto recover behaviour will be removed. In addition, align netdevsim selftest to the new default value. Signed-off-by: Eran Ben Elisha Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/netdevsim/health.c | 2 +- tools/testing/selftests/drivers/net/netdevsim/devlink.sh | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/netdevsim/health.c b/drivers/net/netdevsim/health.c index ba8d9ad60feb3..9ff345d5524b4 100644 --- a/drivers/net/netdevsim/health.c +++ b/drivers/net/netdevsim/health.c @@ -278,7 +278,7 @@ int nsim_dev_health_init(struct nsim_dev *nsim_dev, struct devlink *devlink) health->dummy_reporter = devlink_health_reporter_create(devlink, &nsim_dev_dummy_reporter_ops, - 0, false, health); + 0, true, health); if (IS_ERR(health->dummy_reporter)) { err = PTR_ERR(health->dummy_reporter); goto err_empty_reporter_destroy; diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh index 32cb2a159c704..9f97414445496 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh @@ -377,6 +377,11 @@ dummy_reporter_test() { RET=0 + check_reporter_info dummy healthy 0 0 0 true + + devlink health set $DL_HANDLE reporter dummy auto_recover false + check_err $? "Failed to dummy reporter auto_recover option" + check_reporter_info dummy healthy 0 0 0 false local BREAK_MSG="foo bar" From ba7d16c7794213b591a0ea415d975858d6a7dfd8 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Sun, 29 Mar 2020 14:05:54 +0300 Subject: [PATCH 2/3] devlink: Implicitly set auto recover flag when registering health reporter When health reporter is registered to devlink, devlink will implicitly set auto recover if and only if the reporter has a recover method. No reason to explicitly get the auto recover flag from the driver. Remove this flag from all drivers that called devlink_health_reporter_create. All existing health reporters set auto recovery to true if they have a recover method. Yet, administrator can unset auto recover via netlink command as prior to this patch. Signed-off-by: Eran Ben Elisha Reviewed-by: Jiri Pirko Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 6 +++--- drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/health.c | 4 ++-- drivers/net/netdevsim/health.c | 4 ++-- include/net/devlink.h | 3 +-- net/core/devlink.c | 9 +++------ 7 files changed, 13 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index 8e09a52a9c06c..a812beb463257 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -150,7 +150,7 @@ void bnxt_dl_fw_reporters_create(struct bnxt *bp) health->fw_reset_reporter = devlink_health_reporter_create(bp->dl, &bnxt_dl_fw_reset_reporter_ops, - 0, true, bp); + 0, bp); if (IS_ERR(health->fw_reset_reporter)) { netdev_warn(bp->dev, "Failed to create FW fatal health reporter, rc = %ld\n", PTR_ERR(health->fw_reset_reporter)); @@ -166,7 +166,7 @@ void bnxt_dl_fw_reporters_create(struct bnxt *bp) health->fw_reporter = devlink_health_reporter_create(bp->dl, &bnxt_dl_fw_reporter_ops, - 0, false, bp); + 0, bp); if (IS_ERR(health->fw_reporter)) { netdev_warn(bp->dev, "Failed to create FW health reporter, rc = %ld\n", PTR_ERR(health->fw_reporter)); @@ -182,7 +182,7 @@ void bnxt_dl_fw_reporters_create(struct bnxt *bp) health->fw_fatal_reporter = devlink_health_reporter_create(bp->dl, &bnxt_dl_fw_fatal_reporter_ops, - 0, true, bp); + 0, bp); if (IS_ERR(health->fw_fatal_reporter)) { netdev_warn(bp->dev, "Failed to create FW fatal health reporter, rc = %ld\n", PTR_ERR(health->fw_fatal_reporter)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c index af77c86c9aea4..c209579fc213b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c @@ -571,7 +571,7 @@ int mlx5e_reporter_rx_create(struct mlx5e_priv *priv) reporter = devlink_health_reporter_create(devlink, &mlx5_rx_reporter_ops, MLX5E_REPORTER_RX_GRACEFUL_PERIOD, - true, priv); + priv); if (IS_ERR(reporter)) { netdev_warn(priv->netdev, "Failed to create rx reporter, err = %ld\n", PTR_ERR(reporter)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c index 2028ce9b151fc..9805fc0855124 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -416,7 +416,7 @@ int mlx5e_reporter_tx_create(struct mlx5e_priv *priv) reporter = devlink_health_reporter_create(devlink, &mlx5_tx_reporter_ops, MLX5_REPORTER_TX_GRACEFUL_PERIOD, - true, priv); + priv); if (IS_ERR(reporter)) { netdev_warn(priv->netdev, "Failed to create tx reporter, err = %ld\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index d9f4e8c59c1fb..fa1665caac462 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -627,7 +627,7 @@ static void mlx5_fw_reporters_create(struct mlx5_core_dev *dev) health->fw_reporter = devlink_health_reporter_create(devlink, &mlx5_fw_reporter_ops, - 0, false, dev); + 0, dev); if (IS_ERR(health->fw_reporter)) mlx5_core_warn(dev, "Failed to create fw reporter, err = %ld\n", PTR_ERR(health->fw_reporter)); @@ -636,7 +636,7 @@ static void mlx5_fw_reporters_create(struct mlx5_core_dev *dev) devlink_health_reporter_create(devlink, &mlx5_fw_fatal_reporter_ops, MLX5_REPORTER_FW_GRACEFUL_PERIOD, - true, dev); + dev); if (IS_ERR(health->fw_fatal_reporter)) mlx5_core_warn(dev, "Failed to create fw fatal reporter, err = %ld\n", PTR_ERR(health->fw_fatal_reporter)); diff --git a/drivers/net/netdevsim/health.c b/drivers/net/netdevsim/health.c index 9ff345d5524b4..62958b238d507 100644 --- a/drivers/net/netdevsim/health.c +++ b/drivers/net/netdevsim/health.c @@ -271,14 +271,14 @@ int nsim_dev_health_init(struct nsim_dev *nsim_dev, struct devlink *devlink) health->empty_reporter = devlink_health_reporter_create(devlink, &nsim_dev_empty_reporter_ops, - 0, false, health); + 0, health); if (IS_ERR(health->empty_reporter)) return PTR_ERR(health->empty_reporter); health->dummy_reporter = devlink_health_reporter_create(devlink, &nsim_dev_dummy_reporter_ops, - 0, true, health); + 0, health); if (IS_ERR(health->dummy_reporter)) { err = PTR_ERR(health->dummy_reporter); goto err_empty_reporter_destroy; diff --git a/include/net/devlink.h b/include/net/devlink.h index 3be50346c69bc..3f5cf62e4de82 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1040,8 +1040,7 @@ int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, struct devlink_health_reporter * devlink_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, bool auto_recover, - void *priv); + u64 graceful_period, void *priv); void devlink_health_reporter_destroy(struct devlink_health_reporter *reporter); diff --git a/net/core/devlink.c b/net/core/devlink.c index 5b968d2040a57..963d0daea10a7 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -5124,14 +5124,12 @@ devlink_health_reporter_find_by_name(struct devlink *devlink, * @devlink: devlink * @ops: ops * @graceful_period: to avoid recovery loops, in msecs - * @auto_recover: auto recover when error occurs * @priv: priv */ struct devlink_health_reporter * devlink_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, bool auto_recover, - void *priv) + u64 graceful_period, void *priv) { struct devlink_health_reporter *reporter; @@ -5141,8 +5139,7 @@ devlink_health_reporter_create(struct devlink *devlink, goto unlock; } - if (WARN_ON(auto_recover && !ops->recover) || - WARN_ON(graceful_period && !ops->recover)) { + if (WARN_ON(graceful_period && !ops->recover)) { reporter = ERR_PTR(-EINVAL); goto unlock; } @@ -5157,7 +5154,7 @@ devlink_health_reporter_create(struct devlink *devlink, reporter->ops = ops; reporter->devlink = devlink; reporter->graceful_period = graceful_period; - reporter->auto_recover = auto_recover; + reporter->auto_recover = !!ops->recover; mutex_init(&reporter->dump_lock); refcount_set(&reporter->refcount, 1); list_add_tail(&reporter->list, &devlink->reporter_list); From 48bb52c80be0e462328f58ca3a34ecfef3584320 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Sun, 29 Mar 2020 14:05:55 +0300 Subject: [PATCH 3/3] devlink: Add auto dump flag to health reporter On low memory system, run time dumps can consume too much memory. Add administrator ability to disable auto dumps per reporter as part of the error flow handle routine. This attribute is not relevant while executing DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET. By default, auto dump is activated for any reporter that has a dump method, as part of the reporter registration to devlink. Signed-off-by: Eran Ben Elisha Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 2 ++ net/core/devlink.c | 26 ++++++++++++++++++++++---- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index dfdffc42e87d4..e7891d1d2ebd9 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -429,6 +429,8 @@ enum devlink_attr { DEVLINK_ATTR_NETNS_FD, /* u32 */ DEVLINK_ATTR_NETNS_PID, /* u32 */ DEVLINK_ATTR_NETNS_ID, /* u32 */ + + DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, /* u8 */ /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 963d0daea10a7..85c7887356f6e 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -5089,6 +5089,7 @@ struct devlink_health_reporter { struct mutex dump_lock; /* lock parallel read/write from dump buffers */ u64 graceful_period; bool auto_recover; + bool auto_dump; u8 health_state; u64 dump_ts; u64 dump_real_ts; @@ -5155,6 +5156,7 @@ devlink_health_reporter_create(struct devlink *devlink, reporter->devlink = devlink; reporter->graceful_period = graceful_period; reporter->auto_recover = !!ops->recover; + reporter->auto_dump = !!ops->dump; mutex_init(&reporter->dump_lock); refcount_set(&reporter->refcount, 1); list_add_tail(&reporter->list, &devlink->reporter_list); @@ -5235,6 +5237,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, reporter->dump_real_ts, DEVLINK_ATTR_PAD)) goto reporter_nest_cancel; + if (reporter->ops->dump && + nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, + reporter->auto_dump)) + goto reporter_nest_cancel; nla_nest_end(msg, reporter_attr); genlmsg_end(msg, hdr); @@ -5381,10 +5387,12 @@ int devlink_health_report(struct devlink_health_reporter *reporter, reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR; - mutex_lock(&reporter->dump_lock); - /* store current dump of current error, for later analysis */ - devlink_health_do_dump(reporter, priv_ctx, NULL); - mutex_unlock(&reporter->dump_lock); + if (reporter->auto_dump) { + mutex_lock(&reporter->dump_lock); + /* store current dump of current error, for later analysis */ + devlink_health_do_dump(reporter, priv_ctx, NULL); + mutex_unlock(&reporter->dump_lock); + } if (reporter->auto_recover) return devlink_health_reporter_recover(reporter, @@ -5558,6 +5566,11 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, err = -EOPNOTSUPP; goto out; } + if (!reporter->ops->dump && + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) { + err = -EOPNOTSUPP; + goto out; + } if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) reporter->graceful_period = @@ -5567,6 +5580,10 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, reporter->auto_recover = nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]); + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) + reporter->auto_dump = + nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]); + devlink_health_reporter_put(reporter); return 0; out: @@ -6313,6 +6330,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_NETNS_PID] = { .type = NLA_U32 }, [DEVLINK_ATTR_NETNS_FD] = { .type = NLA_U32 }, [DEVLINK_ATTR_NETNS_ID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8 }, }; static const struct genl_ops devlink_nl_ops[] = {