Skip to content

Commit

Permalink
wifi: iwlwifi: rework firmware error handling
Browse files Browse the repository at this point in the history
In order to later add the ability to do deeper resets of the
device when it crashes, first restructure the firmware error
handling. Instead of having just a single nic_error() method
that handles all, split it:
 - nic_error() just handles and prints the error itself,
 - dump_error() synchronously creates an error dump, and
 - sw_reset() will be called to request doing a SW reset.

This changes the architecture so that the transport is now
responsible for deciding how to do the reset, and therefore
the handling of reprobe if error occurs during reconfig
moves there, which necessitates adding a method there that
notifies the transport that the recovery was completed.

Actually introducing the model under which deeper resets can
be done will be in future patches.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20241227095718.6d4f741ae907.I96a9243e7877808ed6d1bff6967c15d6c24882f0@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
  • Loading branch information
Johannes Berg committed Jan 13, 2025
1 parent 2d15d21 commit 7391b2a
Show file tree
Hide file tree
Showing 10 changed files with 312 additions and 101 deletions.
11 changes: 11 additions & 0 deletions drivers/net/wireless/intel/iwlwifi/dvm/mac80211.c
Original file line number Diff line number Diff line change
Expand Up @@ -1565,6 +1565,16 @@ static void iwlagn_mac_sta_notify(struct ieee80211_hw *hw,
IWL_DEBUG_MAC80211(priv, "leave\n");
}

static void
iwlagn_mac_reconfig_complete(struct ieee80211_hw *hw,
enum ieee80211_reconfig_type reconfig_type)
{
struct iwl_priv *priv = IWL_MAC80211_GET_DVM(hw);

if (reconfig_type == IEEE80211_RECONFIG_TYPE_RESTART)
iwl_trans_finish_sw_reset(priv->trans);
}

const struct ieee80211_ops iwlagn_hw_ops = {
.add_chanctx = ieee80211_emulate_add_chanctx,
.remove_chanctx = ieee80211_emulate_remove_chanctx,
Expand Down Expand Up @@ -1598,6 +1608,7 @@ const struct ieee80211_ops iwlagn_hw_ops = {
.tx_last_beacon = iwlagn_mac_tx_last_beacon,
.event_callback = iwlagn_mac_event_callback,
.set_tim = iwlagn_mac_set_tim,
.reconfig_complete = iwlagn_mac_reconfig_complete,
};

/* This function both allocates and initializes hw and priv. */
Expand Down
27 changes: 19 additions & 8 deletions drivers/net/wireless/intel/iwlwifi/dvm/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -1905,17 +1905,9 @@ static void iwlagn_fw_error(struct iwl_priv *priv, bool ondemand)
unsigned int reload_msec;
unsigned long reload_jiffies;

if (iwl_have_debug_level(IWL_DL_FW))
iwl_print_rx_config_cmd(priv, IWL_RXON_CTX_BSS);

/* uCode is no longer loaded. */
priv->ucode_loaded = false;

/* Set the FW error flag -- cleared on iwl_down */
set_bit(STATUS_FW_ERROR, &priv->status);

iwl_abort_notification_waits(&priv->notif_wait);

/* Keep the restart process from trying to send host
* commands by clearing the ready bit */
clear_bit(STATUS_READY, &priv->status);
Expand Down Expand Up @@ -1957,6 +1949,11 @@ static void iwl_nic_error(struct iwl_op_mode *op_mode,
{
struct iwl_priv *priv = IWL_OP_MODE_GET_DVM(op_mode);

/* Set the FW error flag -- cleared on iwl_down */
set_bit(STATUS_FW_ERROR, &priv->status);

iwl_abort_notification_waits(&priv->notif_wait);

if (type == IWL_ERR_TYPE_CMD_QUEUE_FULL && iwl_check_for_ct_kill(priv))
return;

Expand All @@ -1970,7 +1967,20 @@ static void iwl_nic_error(struct iwl_op_mode *op_mode,
iwl_dump_nic_event_log(priv, false, NULL);
}

if (iwl_have_debug_level(IWL_DL_FW))
iwl_print_rx_config_cmd(priv, IWL_RXON_CTX_BSS);
}

static bool iwlagn_sw_reset(struct iwl_op_mode *op_mode,
enum iwl_fw_error_type type)
{
struct iwl_priv *priv = IWL_OP_MODE_GET_DVM(op_mode);

if (type == IWL_ERR_TYPE_CMD_QUEUE_FULL && iwl_check_for_ct_kill(priv))
return false;

iwlagn_fw_error(priv, false);
return true;
}

#define EEPROM_RF_CONFIG_TYPE_MAX 0x3
Expand Down Expand Up @@ -2125,6 +2135,7 @@ static const struct iwl_op_mode_ops iwl_dvm_ops = {
.hw_rf_kill = iwl_set_hw_rfkill_state,
.free_skb = iwl_free_skb,
.nic_error = iwl_nic_error,
.sw_reset = iwlagn_sw_reset,
.nic_config = iwl_nic_config,
.wimax_active = iwl_wimax_active,
};
Expand Down
53 changes: 51 additions & 2 deletions drivers/net/wireless/intel/iwlwifi/iwl-op-mode.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,39 @@ enum iwl_fw_error_type {
IWL_ERR_TYPE_CMD_QUEUE_FULL,
};

/**
* enum iwl_fw_error_context - error dump context
* @IWL_ERR_CONTEXT_WORKER: regular from worker context,
* opmode must acquire locks and must also check
* for @IWL_ERR_CONTEXT_ABORT after acquiring locks
* @IWL_ERR_CONTEXT_FROM_OPMODE: context is in a call
* originating from the opmode, e.g. while resetting
* or stopping the device, so opmode must not acquire
* any locks
* @IWL_ERR_CONTEXT_ABORT: after lock acquisition, indicates
* that the dump already happened via another callback
* (currently only while stopping the device) via the
* @IWL_ERR_CONTEXT_FROM_OPMODE context, and this call
* must be aborted
*/
enum iwl_fw_error_context {
IWL_ERR_CONTEXT_WORKER,
IWL_ERR_CONTEXT_FROM_OPMODE,
IWL_ERR_CONTEXT_ABORT,
};

/**
* struct iwl_fw_error_dump_mode - error dump mode for callback
* @type: The reason for the dump, per &enum iwl_fw_error_type.
* @context: The context for the dump, may also indicate this
* call needs to be skipped. This MUST be checked before
* and after acquiring any locks in the op-mode!
*/
struct iwl_fw_error_dump_mode {
enum iwl_fw_error_type type;
enum iwl_fw_error_context context;
};

/**
* struct iwl_op_mode_ops - op_mode specific operations
*
Expand Down Expand Up @@ -93,8 +126,11 @@ enum iwl_fw_error_type {
* reclaimed by the op_mode. This can happen when the driver is freed and
* there are Tx packets pending in the transport layer.
* Must be atomic
* @nic_error: error notification. Must be atomic and must be called with BH
* disabled, unless the type is IWL_ERR_TYPE_RESET_HS_TIMEOUT
* @nic_error: error notification. Must be atomic, the op mode should handle
* the error (e.g. abort notification waiters) and print the error if
* applicable
* @dump_error: NIC error dump collection (can sleep, synchronous)
* @sw_reset: (maybe) initiate a software reset, return %true if started
* @nic_config: configure NIC, called before firmware is started.
* May sleep
* @wimax_active: invoked when WiMax becomes active. May sleep
Expand All @@ -120,6 +156,10 @@ struct iwl_op_mode_ops {
void (*free_skb)(struct iwl_op_mode *op_mode, struct sk_buff *skb);
void (*nic_error)(struct iwl_op_mode *op_mode,
enum iwl_fw_error_type type);
void (*dump_error)(struct iwl_op_mode *op_mode,
struct iwl_fw_error_dump_mode *mode);
bool (*sw_reset)(struct iwl_op_mode *op_mode,
enum iwl_fw_error_type type);
void (*nic_config)(struct iwl_op_mode *op_mode);
void (*wimax_active)(struct iwl_op_mode *op_mode);
void (*time_point)(struct iwl_op_mode *op_mode,
Expand Down Expand Up @@ -197,6 +237,15 @@ static inline void iwl_op_mode_nic_error(struct iwl_op_mode *op_mode,
op_mode->ops->nic_error(op_mode, type);
}

static inline void iwl_op_mode_dump_error(struct iwl_op_mode *op_mode,
struct iwl_fw_error_dump_mode *mode)
{
might_sleep();

if (op_mode->ops->dump_error)
op_mode->ops->dump_error(op_mode, mode);
}

static inline void iwl_op_mode_nic_config(struct iwl_op_mode *op_mode)
{
might_sleep();
Expand Down
109 changes: 107 additions & 2 deletions drivers/net/wireless/intel/iwlwifi/iwl-trans.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,87 @@
#include "pcie/internal.h"
#include "iwl-context-info-gen3.h"

struct iwl_trans_reprobe {
struct device *dev;
struct work_struct work;
};

static void iwl_trans_reprobe_wk(struct work_struct *wk)
{
struct iwl_trans_reprobe *reprobe;

reprobe = container_of(wk, typeof(*reprobe), work);

if (device_reprobe(reprobe->dev))
dev_err(reprobe->dev, "reprobe failed!\n");
put_device(reprobe->dev);
kfree(reprobe);
module_put(THIS_MODULE);
}

static void iwl_trans_restart_wk(struct work_struct *wk)
{
struct iwl_trans *trans = container_of(wk, typeof(*trans), restart.wk);
struct iwl_trans_reprobe *reprobe;

if (!trans->op_mode)
return;

/* might have been scheduled before marked as dead, re-check */
if (test_bit(STATUS_TRANS_DEAD, &trans->status))
return;

iwl_op_mode_dump_error(trans->op_mode, &trans->restart.mode);

/*
* If the opmode stopped the device while we were trying to dump and
* reset, then we'll have done the dump already (synchronized by the
* opmode lock that it will acquire in iwl_op_mode_dump_error()) and
* managed that via trans->restart.mode.
* Additionally, make sure that in such a case we won't attempt to do
* any resets now, since it's no longer requested.
*/
if (!test_and_clear_bit(STATUS_RESET_PENDING, &trans->status))
return;

if (!iwlwifi_mod_params.fw_restart)
return;

if (!trans->restart.during_reset) {
iwl_trans_opmode_sw_reset(trans, trans->restart.mode.type);
return;
}

IWL_ERR(trans,
"Device error during reconfiguration - reprobe!\n");

/*
* get a module reference to avoid doing this while unloading
* anyway and to avoid scheduling a work with code that's
* being removed.
*/
if (!try_module_get(THIS_MODULE)) {
IWL_ERR(trans, "Module is being unloaded - abort\n");
return;
}

reprobe = kzalloc(sizeof(*reprobe), GFP_KERNEL);
if (!reprobe) {
module_put(THIS_MODULE);
return;
}
reprobe->dev = get_device(trans->dev);
INIT_WORK(&reprobe->work, iwl_trans_reprobe_wk);
schedule_work(&reprobe->work);
}

struct iwl_trans *iwl_trans_alloc(unsigned int priv_size,
struct device *dev,
const struct iwl_cfg_trans_params *cfg_trans)
{
struct iwl_trans *trans;
#ifdef CONFIG_LOCKDEP
static struct lock_class_key __key;
static struct lock_class_key __sync_cmd_key;
#endif

trans = devm_kzalloc(dev, sizeof(*trans) + priv_size, GFP_KERNEL);
Expand All @@ -33,12 +107,14 @@ struct iwl_trans *iwl_trans_alloc(unsigned int priv_size,

#ifdef CONFIG_LOCKDEP
lockdep_init_map(&trans->sync_cmd_lockdep_map, "sync_cmd_lockdep_map",
&__key, 0);
&__sync_cmd_key, 0);
#endif

trans->dev = dev;
trans->num_rx_queues = 1;

INIT_WORK(&trans->restart.wk, iwl_trans_restart_wk);

return trans;
}

Expand Down Expand Up @@ -81,6 +157,7 @@ int iwl_trans_init(struct iwl_trans *trans)

void iwl_trans_free(struct iwl_trans *trans)
{
cancel_work_sync(&trans->restart.wk);
kmem_cache_destroy(trans->dev_cmd_pool);
}

Expand Down Expand Up @@ -391,6 +468,34 @@ void iwl_trans_stop_device(struct iwl_trans *trans)
{
might_sleep();

/*
* See also the comment in iwl_trans_restart_wk().
*
* When the opmode stops the device while a reset is pending, the
* worker (iwl_trans_restart_wk) might not have run yet or, more
* likely, will be blocked on the opmode lock. Due to the locking,
* we can't just flush the worker.
*
* If this is the case, then the test_and_clear_bit() ensures that
* the worker won't attempt to do anything after the stop.
*
* The trans->restart.mode is a handshake with the opmode, we set
* the context there to ABORT so that when the worker can finally
* acquire the lock in the opmode, the code there won't attempt to
* do any dumps. Since we'd really like to have the dump though,
* also do it inline here (with the opmode locks already held),
* but use a separate mode struct to avoid races.
*/
if (test_and_clear_bit(STATUS_RESET_PENDING, &trans->status)) {
struct iwl_fw_error_dump_mode mode;

mode = trans->restart.mode;
mode.context = IWL_ERR_CONTEXT_FROM_OPMODE;
trans->restart.mode.context = IWL_ERR_CONTEXT_ABORT;

iwl_op_mode_dump_error(trans->op_mode, &mode);
}

if (trans->trans_cfg->gen2)
iwl_trans_pcie_gen2_stop_device(trans);
else
Expand Down
Loading

0 comments on commit 7391b2a

Please sign in to comment.