ice: convert VF storage to hash table with krefs and RCU
The ice driver stores VF structures in a simple array which is allocated
once at the time of VF creation. The VF structures are then accessed
from the array by their VF ID. The ID must be between 0 and the number
of allocated VFs.
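
In code terms, the old scheme amounts to a flat, indexed array. A rough sketch for illustration (the pf->vf and pf->num_alloc_vfs field names, and the helper itself, are assumptions based on the description above, not taken from the diff):

/* Sketch of the old storage model: a flat array indexed by VF ID.
 * The pf->vf and pf->num_alloc_vfs names are assumptions. */
static struct ice_vf *ice_get_vf_sketch(struct ice_pf *pf, u16 vf_id)
{
        if (vf_id >= pf->num_alloc_vfs)
                return NULL;    /* IDs must stay within the allocated range */
        return &pf->vf[vf_id];
}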

Multiple threads can access this table:

 * .ndo operations such as .ndo_get_vf_cfg or .ndo_set_vf_trust
 * interrupts, such as due to messages from the VF using the virtchnl
   communication
 * processing such as device reset
 * commands to add or remove VFs

The current implementation does not keep track of when all threads are
done operating on a VF, so it can result in use-after-free issues where
one thread accesses a VF structure after it has been released while
removing VFs. Some of these issues are prevented with various state
flags and checks.

In addition, this structure is quite static and does not support a
planned future where virtualization can be more dynamic. As we begin to
look at supporting Scalable IOV with the ice driver (as opposed to just
supporting Single Root IOV), this structure is not sufficient.

In the future, VFs will be able to be added and removed individually and
dynamically.

To allow for this, and to better protect against a whole class of
use-after-free bugs, replace the VF storage with a combination of a hash
table and krefs to reference track all of the accesses to VFs through
the hash table.

A hash table still allows efficient lookup of a VF given its ID, but
also allows adding and removing VFs. It does not require contiguous VF
IDs.
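
As a point of reference, the new storage plausibly boils down to a kernel hashtable guarded by a mutex. A minimal sketch, assuming the standard <linux/hashtable.h> helpers (only the table and table_lock members are visible in the hunks below; the bucket order of 8 bits and any other fields are assumptions):

#include <linux/hashtable.h>
#include <linux/mutex.h>

/* Minimal sketch of the new VF storage, embedded in struct ice_pf as
 * pf->vfs. DECLARE_HASHTABLE(table, 8) creates 256 buckets; the bucket
 * order is an assumption. */
struct ice_vfs {
        struct mutex table_lock;        /* protects table modification */
        DECLARE_HASHTABLE(table, 8);    /* VF structures, keyed by vf_id */
};

The mutex_init(&pf->vfs.table_lock) and hash_init(pf->vfs.table) calls added to ice_init_pf() in the ice_main.c hunk below set this structure up.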

The use of krefs allows the cleanup of the VF memory to be delayed until
after all threads have released their reference (by calling ice_put_vf).
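
The kref side follows the standard kernel pattern: the last ice_put_vf() call invokes a release callback that frees the memory. A minimal sketch, assuming refcnt and rcu members in struct ice_vf and a release callback named ice_release_vf (only ice_put_vf itself appears in the hunks below; the rest is illustrative):

#include <linux/kref.h>
#include <linux/rcupdate.h>

/* Sketch of the release path; kfree_rcu() defers the actual free until
 * concurrent RCU readers of the hash table are done. The refcnt and
 * rcu member names are assumptions. */
static void ice_release_vf(struct kref *ref)
{
        struct ice_vf *vf = container_of(ref, struct ice_vf, refcnt);

        kfree_rcu(vf, rcu);
}

void ice_put_vf(struct ice_vf *vf)
{
        kref_put(&vf->refcnt, ice_release_vf);
}

Every thread that looks up a VF takes a reference and balances it with ice_put_vf(), which is exactly the pattern visible in the ice_eswitch.c hunk below.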

To prevent corruption of the hash table, a combination of RCU and the
mutex table_lock is used. Addition and removal from the hash table use
the RCU-aware hash macros. This allows simple read-only lookups, such
as iterating to locate a single VF, to be fast under RCU. Accesses
which modify the hash table, or which can't use RCU because they sleep,
hold the mutex lock instead.
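
Concretely, the iteration macros used throughout the hunks below plausibly wrap the standard hashtable helpers, and a reference-taking lookup has to pair the RCU walk with kref_get_unless_zero() so a VF that is already being torn down is never handed out. A sketch (the entry member name and the body of ice_get_vf_by_id are assumptions; neither appears in the hunks shown here):

/* Plausible definitions of the two iteration flavors. */
#define ice_for_each_vf(pf, bkt, vf) \
        hash_for_each((pf)->vfs.table, (bkt), (vf), entry)

#define ice_for_each_vf_rcu(pf, bkt, vf) \
        hash_for_each_rcu((pf)->vfs.table, (bkt), (vf), entry)

/* Sketch of a lookup that hands back a referenced VF, or NULL. */
struct ice_vf *ice_get_vf_by_id(struct ice_pf *pf, u16 vf_id)
{
        struct ice_vf *vf;

        rcu_read_lock();
        hash_for_each_possible_rcu(pf->vfs.table, vf, entry, vf_id) {
                if (vf->vf_id == vf_id) {
                        /* Refuse a VF whose refcount already hit zero */
                        if (!kref_get_unless_zero(&vf->refcnt))
                                vf = NULL;
                        rcu_read_unlock();
                        return vf;
                }
        }
        rcu_read_unlock();

        return NULL;
}

Plain iteration over all VFs, by contrast, either holds table_lock (ice_for_each_vf, asserted via the lockdep_assert_held() calls below) or runs inside rcu_read_lock()/rcu_read_unlock() (ice_for_each_vf_rcu).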

By using this design, we have a stronger guarantee that the VF structure
can't be released until after all threads are finished operating on it.
We also pave the way for the more dynamic Scalable IOV implementation in
the future.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Jacob Keller authored and Tony Nguyen committed Mar 3, 2022
1 parent fb916db commit 3d5985a
Showing 8 changed files with 363 additions and 125 deletions.
16 changes: 16 additions & 0 deletions drivers/net/ethernet/intel/ice/ice_eswitch.c
@@ -209,6 +209,8 @@ static void ice_eswitch_remap_rings_to_vectors(struct ice_pf *pf)
                 rx_ring->q_vector = q_vector;
                 rx_ring->next = NULL;
                 rx_ring->netdev = repr->netdev;
+
+                ice_put_vf(vf);
         }
 }
@@ -223,6 +225,8 @@ ice_eswitch_release_reprs(struct ice_pf *pf, struct ice_vsi *ctrl_vsi)
         struct ice_vf *vf;
         unsigned int bkt;

+        lockdep_assert_held(&pf->vfs.table_lock);
+
         ice_for_each_vf(pf, bkt, vf) {
                 struct ice_vsi *vsi = vf->repr->src_vsi;

@@ -251,6 +255,8 @@ static int ice_eswitch_setup_reprs(struct ice_pf *pf)
         struct ice_vf *vf;
         unsigned int bkt;

+        lockdep_assert_held(&pf->vfs.table_lock);
+
         ice_for_each_vf(pf, bkt, vf) {
                 struct ice_vsi *vsi = vf->repr->src_vsi;

@@ -430,6 +436,8 @@ static void ice_eswitch_napi_del(struct ice_pf *pf)
         struct ice_vf *vf;
         unsigned int bkt;

+        lockdep_assert_held(&pf->vfs.table_lock);
+
         ice_for_each_vf(pf, bkt, vf)
                 netif_napi_del(&vf->repr->q_vector->napi);
 }
@@ -443,6 +451,8 @@ static void ice_eswitch_napi_enable(struct ice_pf *pf)
         struct ice_vf *vf;
         unsigned int bkt;

+        lockdep_assert_held(&pf->vfs.table_lock);
+
         ice_for_each_vf(pf, bkt, vf)
                 napi_enable(&vf->repr->q_vector->napi);
 }
@@ -456,6 +466,8 @@ static void ice_eswitch_napi_disable(struct ice_pf *pf)
         struct ice_vf *vf;
         unsigned int bkt;

+        lockdep_assert_held(&pf->vfs.table_lock);
+
         ice_for_each_vf(pf, bkt, vf)
                 napi_disable(&vf->repr->q_vector->napi);
 }
@@ -629,6 +641,8 @@ static void ice_eswitch_start_all_tx_queues(struct ice_pf *pf)
         struct ice_vf *vf;
         unsigned int bkt;

+        lockdep_assert_held(&pf->vfs.table_lock);
+
         if (test_bit(ICE_DOWN, pf->state))
                 return;

@@ -647,6 +661,8 @@ void ice_eswitch_stop_all_tx_queues(struct ice_pf *pf)
         struct ice_vf *vf;
         unsigned int bkt;

+        lockdep_assert_held(&pf->vfs.table_lock);
+
         if (test_bit(ICE_DOWN, pf->state))
                 return;

13 changes: 9 additions & 4 deletions drivers/net/ethernet/intel/ice/ice_ethtool.c
@@ -316,15 +316,20 @@ ice_get_eeprom(struct net_device *netdev, struct ethtool_eeprom *eeprom,
  */
 static bool ice_active_vfs(struct ice_pf *pf)
 {
+        bool active = false;
         struct ice_vf *vf;
         unsigned int bkt;

-        ice_for_each_vf(pf, bkt, vf) {
-                if (test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states))
-                        return true;
+        rcu_read_lock();
+        ice_for_each_vf_rcu(pf, bkt, vf) {
+                if (test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) {
+                        active = true;
+                        break;
+                }
         }
+        rcu_read_unlock();

-        return false;
+        return active;
 }

 /**
24 changes: 18 additions & 6 deletions drivers/net/ethernet/intel/ice/ice_lib.c
@@ -439,8 +439,10 @@ static irqreturn_t ice_eswitch_msix_clean_rings(int __always_unused irq, void *d
         if (!q_vector->tx.tx_ring && !q_vector->rx.rx_ring)
                 return IRQ_HANDLED;

-        ice_for_each_vf(pf, bkt, vf)
+        rcu_read_lock();
+        ice_for_each_vf_rcu(pf, bkt, vf)
                 napi_schedule(&vf->repr->q_vector->napi);
+        rcu_read_unlock();

         return IRQ_HANDLED;
 }
@@ -1345,11 +1347,17 @@ static int ice_get_vf_ctrl_res(struct ice_pf *pf, struct ice_vsi *vsi)
 {
         struct ice_vf *vf;
         unsigned int bkt;
+        int base;

-        ice_for_each_vf(pf, bkt, vf) {
-                if (vf != vsi->vf && vf->ctrl_vsi_idx != ICE_NO_VSI)
-                        return pf->vsi[vf->ctrl_vsi_idx]->base_vector;
+        rcu_read_lock();
+        ice_for_each_vf_rcu(pf, bkt, vf) {
+                if (vf != vsi->vf && vf->ctrl_vsi_idx != ICE_NO_VSI) {
+                        base = pf->vsi[vf->ctrl_vsi_idx]->base_vector;
+                        rcu_read_unlock();
+                        return base;
+                }
         }
+        rcu_read_unlock();

         return ice_get_res(pf, pf->irq_tracker, vsi->num_q_vectors,
                            ICE_RES_VF_CTRL_VEC_ID);
@@ -2894,10 +2902,14 @@ static void ice_free_vf_ctrl_res(struct ice_pf *pf, struct ice_vsi *vsi)
         struct ice_vf *vf;
         unsigned int bkt;

-        ice_for_each_vf(pf, bkt, vf) {
-                if (vf != vsi->vf && vf->ctrl_vsi_idx != ICE_NO_VSI)
+        rcu_read_lock();
+        ice_for_each_vf_rcu(pf, bkt, vf) {
+                if (vf != vsi->vf && vf->ctrl_vsi_idx != ICE_NO_VSI) {
+                        rcu_read_unlock();
                         return;
+                }
         }
+        rcu_read_unlock();

         /* No other VFs left that have control VSI. It is now safe to reclaim
          * SW interrupts back to the common pool.
8 changes: 8 additions & 0 deletions drivers/net/ethernet/intel/ice/ice_main.c
@@ -521,8 +521,10 @@ ice_prepare_for_reset(struct ice_pf *pf, enum ice_reset_req reset_type)
         ice_vc_notify_reset(pf);

         /* Disable VFs until reset is completed */
+        mutex_lock(&pf->vfs.table_lock);
         ice_for_each_vf(pf, bkt, vf)
                 ice_set_vf_state_qs_dis(vf);
+        mutex_unlock(&pf->vfs.table_lock);

         if (ice_is_eswitch_mode_switchdev(pf)) {
                 if (reset_type != ICE_RESET_PFR)
@@ -1756,6 +1758,7 @@ static void ice_handle_mdd_event(struct ice_pf *pf)
         /* Check to see if one of the VFs caused an MDD event, and then
          * increment counters and set print pending
          */
+        mutex_lock(&pf->vfs.table_lock);
         ice_for_each_vf(pf, bkt, vf) {
                 reg = rd32(hw, VP_MDET_TX_PQM(vf->vf_id));
                 if (reg & VP_MDET_TX_PQM_VALID_M) {
@@ -1811,6 +1814,7 @@ static void ice_handle_mdd_event(struct ice_pf *pf)
                         }
                 }
         }
+        mutex_unlock(&pf->vfs.table_lock);

         ice_print_vfs_mdd_events(pf);
 }
@@ -3680,6 +3684,7 @@ static void ice_deinit_pf(struct ice_pf *pf)
         mutex_destroy(&pf->sw_mutex);
         mutex_destroy(&pf->tc_mutex);
         mutex_destroy(&pf->avail_q_mutex);
+        mutex_destroy(&pf->vfs.table_lock);

         if (pf->avail_txqs) {
                 bitmap_free(pf->avail_txqs);
@@ -3779,6 +3784,9 @@ static int ice_init_pf(struct ice_pf *pf)
                 return -ENOMEM;
         }

+        mutex_init(&pf->vfs.table_lock);
+        hash_init(pf->vfs.table);
+
         return 0;
 }
4 changes: 4 additions & 0 deletions drivers/net/ethernet/intel/ice/ice_repr.c
@@ -341,6 +341,8 @@ void ice_repr_rem_from_all_vfs(struct ice_pf *pf)
         struct ice_vf *vf;
         unsigned int bkt;

+        lockdep_assert_held(&pf->vfs.table_lock);
+
         ice_for_each_vf(pf, bkt, vf)
                 ice_repr_rem(vf);
 }
@@ -355,6 +357,8 @@ int ice_repr_add_for_all_vfs(struct ice_pf *pf)
         unsigned int bkt;
         int err;

+        lockdep_assert_held(&pf->vfs.table_lock);
+
         ice_for_each_vf(pf, bkt, vf) {
                 err = ice_repr_add(vf);
                 if (err)
2 changes: 2 additions & 0 deletions drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c
@@ -1578,6 +1578,7 @@ void ice_flush_fdir_ctx(struct ice_pf *pf)
         if (!test_and_clear_bit(ICE_FD_VF_FLUSH_CTX, pf->state))
                 return;

+        mutex_lock(&pf->vfs.table_lock);
         ice_for_each_vf(pf, bkt, vf) {
                 struct device *dev = ice_pf_to_dev(pf);
                 enum virtchnl_fdir_prgm_status status;
@@ -1634,6 +1635,7 @@ void ice_flush_fdir_ctx(struct ice_pf *pf)
                 ctx->flags &= ~ICE_VF_FDIR_CTX_VALID;
                 spin_unlock_irqrestore(&vf->fdir.ctx_lock, flags);
         }
+        mutex_unlock(&pf->vfs.table_lock);
 }

 /**
[Diffs for the remaining two changed files were not loaded in this view.]