Skip to content

Commit

Permalink
Merge branch '100GbE' of git://git.kernel.org/pub/scm/linux/kernel/gi…
Browse files Browse the repository at this point in the history
…t/tnguy/next-queue

Tony Nguyen says:

====================
ice: Support 5 layer Tx scheduler topology

Mateusz Polchlopek says:

For performance reasons there is a need to have support for selectable
Tx scheduler topology. Currently firmware supports only the default
9-layer and 5-layer topology. This patch series enables switch from
default to 5-layer topology, if user decides to opt-in.

* '100GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/next-queue:
  ice: Document tx_scheduling_layers parameter
  ice: Add tx_scheduling_layers devlink param
  ice: Enable switching default Tx scheduler topology
  ice: Adjust the VSI/Aggregator layers
  ice: Support 5 layer topology
  devlink: extend devlink_param *set pointer
====================

Link: https://lore.kernel.org/r/20240422203913.225151-1-anthony.l.nguyen@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
  • Loading branch information
Jakub Kicinski committed Apr 25, 2024
2 parents 3c3adb2 + 9afff0d commit 21d9f92
Show file tree
Hide file tree
Showing 35 changed files with 663 additions and 84 deletions.
47 changes: 47 additions & 0 deletions Documentation/networking/devlink/ice.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,53 @@ Parameters
* - ``enable_iwarp``
- runtime
- mutually exclusive with ``enable_roce``
* - ``tx_scheduling_layers``
- permanent
- The ice hardware uses hierarchical scheduling for Tx with a fixed
number of layers in the scheduling tree. Each layer is a decision
point. The root node represents a port, while all the leaves represent
the queues. This way of configuring the Tx scheduler allows features
like DCB or devlink-rate (documented below) to configure how much
bandwidth is given to any given queue or group of queues, enabling
fine-grained control because scheduling parameters can be configured
at any given layer of the tree.

The default 9-layer tree topology was deemed best for most workloads,
as it gives an optimal ratio of performance to configurability. However,
for some specific cases, this 9-layer topology might not be desired.
One example would be sending traffic to a number of queues that is not a
multiple of 8. Because the maximum radix is limited to 8 in 9-layer topology,
the 9th queue has a different parent than the rest, and it's given
more bandwidth credits. This causes a problem when the system is
sending traffic to 9 queues:

| tx_queue_0_packets: 24163396
| tx_queue_1_packets: 24164623
| tx_queue_2_packets: 24163188
| tx_queue_3_packets: 24163701
| tx_queue_4_packets: 24163683
| tx_queue_5_packets: 24164668
| tx_queue_6_packets: 23327200
| tx_queue_7_packets: 24163853
| tx_queue_8_packets: 91101417 < Too much traffic is sent from 9th
To address this need, you can switch to a 5-layer topology, which
changes the maximum topology radix to 512. With this enhancement,
the performance characteristic is equal as all queues can be assigned
to the same parent in the tree. The obvious drawback of this solution
is a lower configuration depth of the tree.

Use the ``tx_scheduling_layers`` parameter with the devlink command
to change the transmit scheduler topology. To use 5-layer topology,
use a value of 5. For example:
$ devlink dev param set pci/0000:16:00.0 name tx_scheduling_layers
value 5 cmode permanent
Use a value of 9 to set it back to the default value.

You must power cycle the PCI slot for the selected topology to take effect.

To verify that the value has been set:
$ devlink dev param show pci/0000:16:00.0 name tx_scheduling_layers

Info versions
=============
Expand Down
9 changes: 6 additions & 3 deletions drivers/crypto/marvell/octeontx2/otx2_cpt_devlink.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
#include "otx2_cpt_devlink.h"

static int otx2_cpt_dl_egrp_create(struct devlink *dl, u32 id,
struct devlink_param_gset_ctx *ctx)
struct devlink_param_gset_ctx *ctx,
struct netlink_ext_ack *extack)
{
struct otx2_cpt_devlink *cpt_dl = devlink_priv(dl);
struct otx2_cptpf_dev *cptpf = cpt_dl->cptpf;
Expand All @@ -13,7 +14,8 @@ static int otx2_cpt_dl_egrp_create(struct devlink *dl, u32 id,
}

static int otx2_cpt_dl_egrp_delete(struct devlink *dl, u32 id,
struct devlink_param_gset_ctx *ctx)
struct devlink_param_gset_ctx *ctx,
struct netlink_ext_ack *extack)
{
struct otx2_cpt_devlink *cpt_dl = devlink_priv(dl);
struct otx2_cptpf_dev *cptpf = cpt_dl->cptpf;
Expand Down Expand Up @@ -45,7 +47,8 @@ static int otx2_cpt_dl_t106_mode_get(struct devlink *dl, u32 id,
}

static int otx2_cpt_dl_t106_mode_set(struct devlink *dl, u32 id,
struct devlink_param_gset_ctx *ctx)
struct devlink_param_gset_ctx *ctx,
struct netlink_ext_ack *extack)
{
struct otx2_cpt_devlink *cpt_dl = devlink_priv(dl);
struct otx2_cptpf_dev *cptpf = cpt_dl->cptpf;
Expand Down
3 changes: 2 additions & 1 deletion drivers/net/ethernet/amd/pds_core/core.h
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,8 @@ int pdsc_dl_flash_update(struct devlink *dl,
int pdsc_dl_enable_get(struct devlink *dl, u32 id,
struct devlink_param_gset_ctx *ctx);
int pdsc_dl_enable_set(struct devlink *dl, u32 id,
struct devlink_param_gset_ctx *ctx);
struct devlink_param_gset_ctx *ctx,
struct netlink_ext_ack *extack);
int pdsc_dl_enable_validate(struct devlink *dl, u32 id,
union devlink_param_value val,
struct netlink_ext_ack *extack);
Expand Down
3 changes: 2 additions & 1 deletion drivers/net/ethernet/amd/pds_core/devlink.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ int pdsc_dl_enable_get(struct devlink *dl, u32 id,
}

int pdsc_dl_enable_set(struct devlink *dl, u32 id,
struct devlink_param_gset_ctx *ctx)
struct devlink_param_gset_ctx *ctx,
struct netlink_ext_ack *extack)
{
struct pdsc *pdsc = devlink_priv(dl);
struct pdsc_viftype *vt_entry;
Expand Down
6 changes: 4 additions & 2 deletions drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
Original file line number Diff line number Diff line change
Expand Up @@ -1096,7 +1096,8 @@ static int bnxt_dl_nvm_param_get(struct devlink *dl, u32 id,
}

static int bnxt_dl_nvm_param_set(struct devlink *dl, u32 id,
struct devlink_param_gset_ctx *ctx)
struct devlink_param_gset_ctx *ctx,
struct netlink_ext_ack *extack)
{
struct bnxt *bp = bnxt_get_bp_from_dl(dl);
struct hwrm_nvm_set_variable_input *req;
Expand Down Expand Up @@ -1145,7 +1146,8 @@ static int bnxt_remote_dev_reset_get(struct devlink *dl, u32 id,
}

static int bnxt_remote_dev_reset_set(struct devlink *dl, u32 id,
struct devlink_param_gset_ctx *ctx)
struct devlink_param_gset_ctx *ctx,
struct netlink_ext_ack *extack)
{
struct bnxt *bp = bnxt_get_bp_from_dl(dl);
int rc;
Expand Down
184 changes: 176 additions & 8 deletions drivers/net/ethernet/intel/ice/devlink/devlink.c
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,156 @@ ice_devlink_reload_empr_finish(struct ice_pf *pf,
return 0;
}

/**
* ice_get_tx_topo_user_sel - Read user's choice from flash
* @pf: pointer to pf structure
* @layers: value read from flash will be saved here
*
* Reads user's preference for Tx Scheduler Topology Tree from PFA TLV.
*
* Return: zero when read was successful, negative values otherwise.
*/
static int ice_get_tx_topo_user_sel(struct ice_pf *pf, uint8_t *layers)
{
struct ice_aqc_nvm_tx_topo_user_sel usr_sel = {};
struct ice_hw *hw = &pf->hw;
int err;

err = ice_acquire_nvm(hw, ICE_RES_READ);
if (err)
return err;

err = ice_aq_read_nvm(hw, ICE_AQC_NVM_TX_TOPO_MOD_ID, 0,
sizeof(usr_sel), &usr_sel, true, true, NULL);
if (err)
goto exit_release_res;

if (usr_sel.data & ICE_AQC_NVM_TX_TOPO_USER_SEL)
*layers = ICE_SCHED_5_LAYERS;
else
*layers = ICE_SCHED_9_LAYERS;

exit_release_res:
ice_release_nvm(hw);

return err;
}

/**
* ice_update_tx_topo_user_sel - Save user's preference in flash
* @pf: pointer to pf structure
* @layers: value to be saved in flash
*
* Variable "layers" defines user's preference about number of layers in Tx
* Scheduler Topology Tree. This choice should be stored in PFA TLV field
* and be picked up by driver, next time during init.
*
* Return: zero when save was successful, negative values otherwise.
*/
static int ice_update_tx_topo_user_sel(struct ice_pf *pf, int layers)
{
struct ice_aqc_nvm_tx_topo_user_sel usr_sel = {};
struct ice_hw *hw = &pf->hw;
int err;

err = ice_acquire_nvm(hw, ICE_RES_WRITE);
if (err)
return err;

err = ice_aq_read_nvm(hw, ICE_AQC_NVM_TX_TOPO_MOD_ID, 0,
sizeof(usr_sel), &usr_sel, true, true, NULL);
if (err)
goto exit_release_res;

if (layers == ICE_SCHED_5_LAYERS)
usr_sel.data |= ICE_AQC_NVM_TX_TOPO_USER_SEL;
else
usr_sel.data &= ~ICE_AQC_NVM_TX_TOPO_USER_SEL;

err = ice_write_one_nvm_block(pf, ICE_AQC_NVM_TX_TOPO_MOD_ID, 2,
sizeof(usr_sel.data), &usr_sel.data,
true, NULL, NULL);
exit_release_res:
ice_release_nvm(hw);

return err;
}

/**
 * ice_devlink_tx_sched_layers_get - Get tx_scheduling_layers parameter
 * @devlink: pointer to the devlink instance
 * @id: the parameter ID being read (not used by this callback)
 * @ctx: context to store the parameter value; the result lands in val.vu8
 *
 * Thin wrapper that reads the stored selection through
 * ice_get_tx_topo_user_sel() directly into the devlink context.
 *
 * Return: zero on success and negative value on failure.
 */
static int ice_devlink_tx_sched_layers_get(struct devlink *devlink, u32 id,
					   struct devlink_param_gset_ctx *ctx)
{
	struct ice_pf *pf = devlink_priv(devlink);

	return ice_get_tx_topo_user_sel(pf, &ctx->val.vu8);
}

/**
 * ice_devlink_tx_sched_layers_set - Set tx_scheduling_layers parameter
 * @devlink: pointer to the devlink instance
 * @id: the parameter ID to set (not used by this callback)
 * @ctx: context to get the parameter value; val.vu8 holds the layer count
 * @extack: netlink extended ACK structure
 *
 * Saves the requested layer count via ice_update_tx_topo_user_sel(). On
 * success, @extack carries a notice that a PCI slot powercycle is required.
 *
 * Return: zero on success and negative value on failure.
 */
static int ice_devlink_tx_sched_layers_set(struct devlink *devlink, u32 id,
					   struct devlink_param_gset_ctx *ctx,
					   struct netlink_ext_ack *extack)
{
	struct ice_pf *pf = devlink_priv(devlink);
	int status = ice_update_tx_topo_user_sel(pf, ctx->val.vu8);

	/* The extack message is only set on the success path: it is used to
	 * inform the user, not to report an error.
	 */
	if (!status)
		NL_SET_ERR_MSG_MOD(extack,
				   "Tx scheduling layers have been changed on this device. You must do the PCI slot powercycle for the change to take effect.");

	return status;
}

/**
 * ice_devlink_tx_sched_layers_validate - Check a tx_scheduling_layers value
 * @devlink: unused pointer to devlink instance
 * @id: the parameter ID to validate (not used by this callback)
 * @val: value to validate; only val.vu8 is inspected
 * @extack: netlink extended ACK structure, filled in on rejection
 *
 * Accepted values are:
 * - ICE_SCHED_5_LAYERS - five layers Tx Scheduler Topology Tree
 * - ICE_SCHED_9_LAYERS - nine layers Tx Scheduler Topology Tree
 *
 * Return: zero when passed parameter value is supported, -EINVAL otherwise.
 */
static int ice_devlink_tx_sched_layers_validate(struct devlink *devlink, u32 id,
						union devlink_param_value val,
						struct netlink_ext_ack *extack)
{
	if (val.vu8 == ICE_SCHED_5_LAYERS || val.vu8 == ICE_SCHED_9_LAYERS)
		return 0;

	NL_SET_ERR_MSG_MOD(extack,
			   "Wrong number of tx scheduler layers provided.");

	return -EINVAL;
}

/**
* ice_tear_down_devlink_rate_tree - removes devlink-rate exported tree
* @pf: pf struct
Expand Down Expand Up @@ -1144,9 +1294,9 @@ ice_devlink_enable_roce_get(struct devlink *devlink, u32 id,
return 0;
}

static int
ice_devlink_enable_roce_set(struct devlink *devlink, u32 id,
struct devlink_param_gset_ctx *ctx)
static int ice_devlink_enable_roce_set(struct devlink *devlink, u32 id,
struct devlink_param_gset_ctx *ctx,
struct netlink_ext_ack *extack)
{
struct ice_pf *pf = devlink_priv(devlink);
bool roce_ena = ctx->val.vbool;
Expand Down Expand Up @@ -1195,9 +1345,9 @@ ice_devlink_enable_iw_get(struct devlink *devlink, u32 id,
return 0;
}

static int
ice_devlink_enable_iw_set(struct devlink *devlink, u32 id,
struct devlink_param_gset_ctx *ctx)
static int ice_devlink_enable_iw_set(struct devlink *devlink, u32 id,
struct devlink_param_gset_ctx *ctx,
struct netlink_ext_ack *extack)
{
struct ice_pf *pf = devlink_priv(devlink);
bool iw_ena = ctx->val.vbool;
Expand Down Expand Up @@ -1235,6 +1385,11 @@ ice_devlink_enable_iw_validate(struct devlink *devlink, u32 id,
return 0;
}

/* IDs of the driver-specific (non-generic) devlink parameters of ice. */
enum ice_param_id {
/* Equal to DEVLINK_PARAM_GENERIC_ID_MAX, so the IDs that follow are numbered
 * after it (presumably to stay clear of the generic devlink parameter IDs).
 */
ICE_DEVLINK_PARAM_ID_BASE = DEVLINK_PARAM_GENERIC_ID_MAX,
/* ID used for the "tx_scheduling_layers" parameter. */
ICE_DEVLINK_PARAM_ID_TX_SCHED_LAYERS,
};

static const struct devlink_param ice_devlink_params[] = {
DEVLINK_PARAM_GENERIC(ENABLE_ROCE, BIT(DEVLINK_PARAM_CMODE_RUNTIME),
ice_devlink_enable_roce_get,
Expand All @@ -1244,7 +1399,13 @@ static const struct devlink_param ice_devlink_params[] = {
ice_devlink_enable_iw_get,
ice_devlink_enable_iw_set,
ice_devlink_enable_iw_validate),

DEVLINK_PARAM_DRIVER(ICE_DEVLINK_PARAM_ID_TX_SCHED_LAYERS,
"tx_scheduling_layers",
DEVLINK_PARAM_TYPE_U8,
BIT(DEVLINK_PARAM_CMODE_PERMANENT),
ice_devlink_tx_sched_layers_get,
ice_devlink_tx_sched_layers_set,
ice_devlink_tx_sched_layers_validate),
};

static void ice_devlink_free(void *devlink_ptr)
Expand Down Expand Up @@ -1304,9 +1465,16 @@ void ice_devlink_unregister(struct ice_pf *pf)
int ice_devlink_register_params(struct ice_pf *pf)
{
	/* Registers the entries of ice_devlink_params on the devlink instance
	 * of @pf and returns the result of devl_params_register().
	 */
	struct devlink *devlink = priv_to_devlink(pf);
	struct ice_hw *hw = &pf->hw;
	size_t params_size;

	params_size = ARRAY_SIZE(ice_devlink_params);

	/* Without the tx_sched_topo_comp_mode_en capability the last array
	 * entry is left out of the registration. This only hides
	 * tx_scheduling_layers as long as that parameter remains the final
	 * element of ice_devlink_params.
	 */
	if (!hw->func_caps.common_cap.tx_sched_topo_comp_mode_en)
		params_size--;

	/* Pass the adjusted count, not ARRAY_SIZE(), so the capability check
	 * above actually takes effect.
	 */
	return devl_params_register(devlink, ice_devlink_params,
				    params_size);
}

void ice_devlink_unregister_params(struct ice_pf *pf)
Expand Down
Loading

0 comments on commit 21d9f92

Please sign in to comment.