From 232deb3f9567ce37d99b8616a6c07c1fc0436abf Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 24 Oct 2021 20:17:48 +0300 Subject: [PATCH 01/10] net: dsa: avoid refcount warnings when ->port_{fdb,mdb}_del returns error At present, when either of ds->ops->port_fdb_del() or ds->ops->port_mdb_del() return a non-zero error code, we attempt to save the day and keep the data structure associated with that switchdev object, as the deletion procedure did not complete. However, the way in which we do this is suspicious to the checker in lib/refcount.c, who thinks it is buggy to increment a refcount that became zero, and that this is indicative of a use-after-free. Fixes: 161ca59d39e9 ("net: dsa: reference count the MDB entries at the cross-chip notifier level") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/switch.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 2b1b21bde8300..8f8ed8248c2c3 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -266,7 +266,7 @@ static int dsa_port_do_mdb_del(struct dsa_port *dp, err = ds->ops->port_mdb_del(ds, port, mdb); if (err) { - refcount_inc(&a->refcount); + refcount_set(&a->refcount, 1); return err; } @@ -333,7 +333,7 @@ static int dsa_port_do_fdb_del(struct dsa_port *dp, const unsigned char *addr, err = ds->ops->port_fdb_del(ds, port, addr, vid); if (err) { - refcount_inc(&a->refcount); + refcount_set(&a->refcount, 1); return err; } From df405910ab9f06834b80c3c7317a55abb6504492 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 24 Oct 2021 20:17:49 +0300 Subject: [PATCH 02/10] net: dsa: sja1105: wait for dynamic config command completion on writes too The hardware manual says that software should attempt a new dynamic config access (be it a a write or a read-back) only while the VALID bit is cleared. The VALID bit is set by software to 1, and it remains set as long as the hardware is still processing the request. Currently the driver only polls for the command completion only for reads, because that's when we need the actual data read back. Writes have been more or less "asynchronous", although this has never been an observable issue. This change makes sja1105_dynamic_config_write poll the VALID bit as well, to absolutely ensure that a follow-up access to the static config finds the VALID bit cleared. So VALID means "work in progress", while VALIDENT means "entry being read is valid". On reads we check the VALIDENT bit too, while on writes that bit is not always defined. So we need to factor it out of the loop, and make the loop provide back the unpacked command structure, so that sja1105_dynamic_config_read can check the VALIDENT bit. The change also attempts to convert the open-coded loop to use the read_poll_timeout macro, since I know this will come up during review. It's more code, but hey, it uses read_poll_timeout! Tested on SJA1105T, SJA1105S, SJA1110A. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- .../net/dsa/sja1105/sja1105_dynamic_config.c | 81 ++++++++++++++----- 1 file changed, 59 insertions(+), 22 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_dynamic_config.c b/drivers/net/dsa/sja1105/sja1105_dynamic_config.c index f2049f52833cb..32ec34f181ded 100644 --- a/drivers/net/dsa/sja1105/sja1105_dynamic_config.c +++ b/drivers/net/dsa/sja1105/sja1105_dynamic_config.c @@ -1170,6 +1170,56 @@ const struct sja1105_dynamic_table_ops sja1110_dyn_ops[BLK_IDX_MAX_DYN] = { }, }; +#define SJA1105_DYNAMIC_CONFIG_SLEEP_US 10 +#define SJA1105_DYNAMIC_CONFIG_TIMEOUT_US 100000 + +static int +sja1105_dynamic_config_poll_valid(struct sja1105_private *priv, + struct sja1105_dyn_cmd *cmd, + const struct sja1105_dynamic_table_ops *ops) +{ + u8 packed_buf[SJA1105_MAX_DYN_CMD_SIZE] = {}; + int rc; + + /* We don't _need_ to read the full entry, just the command area which + * is a fixed SJA1105_SIZE_DYN_CMD. But our cmd_packing() API expects a + * buffer that contains the full entry too. Additionally, our API + * doesn't really know how many bytes into the buffer does the command + * area really begin. So just read back the whole entry. + */ + rc = sja1105_xfer_buf(priv, SPI_READ, ops->addr, packed_buf, + ops->packed_size); + if (rc) + return rc; + + /* Unpack the command structure, and return it to the caller in case it + * needs to perform further checks on it (VALIDENT). + */ + memset(cmd, 0, sizeof(*cmd)); + ops->cmd_packing(packed_buf, cmd, UNPACK); + + /* Hardware hasn't cleared VALID => still working on it */ + return cmd->valid ? -EAGAIN : 0; +} + +/* Poll the dynamic config entry's control area until the hardware has + * cleared the VALID bit, which means we have confirmation that it has + * finished processing the command. + */ +static int +sja1105_dynamic_config_wait_complete(struct sja1105_private *priv, + struct sja1105_dyn_cmd *cmd, + const struct sja1105_dynamic_table_ops *ops) +{ + int rc; + + return read_poll_timeout(sja1105_dynamic_config_poll_valid, + rc, rc != -EAGAIN, + SJA1105_DYNAMIC_CONFIG_SLEEP_US, + SJA1105_DYNAMIC_CONFIG_TIMEOUT_US, + false, priv, cmd, ops); +} + /* Provides read access to the settings through the dynamic interface * of the switch. * @blk_idx is used as key to select from the sja1105_dynamic_table_ops. @@ -1196,7 +1246,6 @@ int sja1105_dynamic_config_read(struct sja1105_private *priv, struct sja1105_dyn_cmd cmd = {0}; /* SPI payload buffer */ u8 packed_buf[SJA1105_MAX_DYN_CMD_SIZE] = {0}; - int retries = 3; int rc; if (blk_idx >= BLK_IDX_MAX_DYN) @@ -1239,28 +1288,12 @@ int sja1105_dynamic_config_read(struct sja1105_private *priv, if (rc < 0) return rc; - /* Loop until we have confirmation that hardware has finished - * processing the command and has cleared the VALID field - */ - do { - memset(packed_buf, 0, ops->packed_size); - - /* Retrieve the read operation's result */ - rc = sja1105_xfer_buf(priv, SPI_READ, ops->addr, packed_buf, - ops->packed_size); - if (rc < 0) - return rc; - - cmd = (struct sja1105_dyn_cmd) {0}; - ops->cmd_packing(packed_buf, &cmd, UNPACK); - - if (!cmd.valident && !(ops->access & OP_VALID_ANYWAY)) - return -ENOENT; - cpu_relax(); - } while (cmd.valid && --retries); + rc = sja1105_dynamic_config_wait_complete(priv, &cmd, ops); + if (rc < 0) + return rc; - if (cmd.valid) - return -ETIMEDOUT; + if (!cmd.valident && !(ops->access & OP_VALID_ANYWAY)) + return -ENOENT; /* Don't dereference possibly NULL pointer - maybe caller * only wanted to see whether the entry existed or not. @@ -1321,6 +1354,10 @@ int sja1105_dynamic_config_write(struct sja1105_private *priv, if (rc < 0) return rc; + rc = sja1105_dynamic_config_wait_complete(priv, &cmd, ops); + if (rc < 0) + return rc; + cmd = (struct sja1105_dyn_cmd) {0}; ops->cmd_packing(packed_buf, &cmd, UNPACK); if (cmd.errors) From eb016afd83a9dbe32538f7b5b910fd97e9dd173e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 24 Oct 2021 20:17:50 +0300 Subject: [PATCH 03/10] net: dsa: sja1105: serialize access to the dynamic config interface The sja1105 hardware seems as concurrent as can be, but when we create a background script that adds/removes a rain of FDB entries without the rtnl_mutex taken, then in parallel we do another operation like run 'bridge fdb show', we can notice these errors popping up: sja1105 spi2.0: port 2 failed to read back entry for 00:01:02:03:00:40 vid 0: -ENOENT sja1105 spi2.0: port 2 failed to add 00:01:02:03:00:40 vid 0 to fdb: -2 sja1105 spi2.0: port 2 failed to read back entry for 00:01:02:03:00:46 vid 0: -ENOENT sja1105 spi2.0: port 2 failed to add 00:01:02:03:00:46 vid 0 to fdb: -2 Luckily what is going on does not require a major rework in the driver. The sja1105_dynamic_config_read() function sends multiple SPI buffers to the peripheral until the operation completes. We should not do anything until the hardware clears the VALID bit. But since there is no locking (i.e. right now we are implicitly serialized by the rtnl_mutex, but if we remove that), it might be possible that the process which performs the dynamic config read is preempted and another one performs a dynamic config write. What will happen in that case is that sja1105_dynamic_config_read(), when it resumes, expects to see VALIDENT set for the entry it reads back. But it won't. This can be corrected by introducing a mutex for serializing SPI accesses to the dynamic config interface which should be atomic with respect to each other. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105.h | 2 ++ drivers/net/dsa/sja1105/sja1105_dynamic_config.c | 12 ++++++++++-- drivers/net/dsa/sja1105/sja1105_main.c | 1 + 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105.h b/drivers/net/dsa/sja1105/sja1105.h index 808419f3b808a..21dba16af0979 100644 --- a/drivers/net/dsa/sja1105/sja1105.h +++ b/drivers/net/dsa/sja1105/sja1105.h @@ -261,6 +261,8 @@ struct sja1105_private { * the switch doesn't confuse them with one another. */ struct mutex mgmt_lock; + /* Serializes access to the dynamic config interface */ + struct mutex dynamic_config_lock; struct devlink_region **regions; struct sja1105_cbs_entry *cbs; struct mii_bus *mdio_base_t1; diff --git a/drivers/net/dsa/sja1105/sja1105_dynamic_config.c b/drivers/net/dsa/sja1105/sja1105_dynamic_config.c index 32ec34f181ded..7729d3f8b7f50 100644 --- a/drivers/net/dsa/sja1105/sja1105_dynamic_config.c +++ b/drivers/net/dsa/sja1105/sja1105_dynamic_config.c @@ -1283,12 +1283,16 @@ int sja1105_dynamic_config_read(struct sja1105_private *priv, ops->entry_packing(packed_buf, entry, PACK); /* Send SPI write operation: read config table entry */ + mutex_lock(&priv->dynamic_config_lock); rc = sja1105_xfer_buf(priv, SPI_WRITE, ops->addr, packed_buf, ops->packed_size); - if (rc < 0) + if (rc < 0) { + mutex_unlock(&priv->dynamic_config_lock); return rc; + } rc = sja1105_dynamic_config_wait_complete(priv, &cmd, ops); + mutex_unlock(&priv->dynamic_config_lock); if (rc < 0) return rc; @@ -1349,12 +1353,16 @@ int sja1105_dynamic_config_write(struct sja1105_private *priv, ops->entry_packing(packed_buf, entry, PACK); /* Send SPI write operation: read config table entry */ + mutex_lock(&priv->dynamic_config_lock); rc = sja1105_xfer_buf(priv, SPI_WRITE, ops->addr, packed_buf, ops->packed_size); - if (rc < 0) + if (rc < 0) { + mutex_unlock(&priv->dynamic_config_lock); return rc; + } rc = sja1105_dynamic_config_wait_complete(priv, &cmd, ops); + mutex_unlock(&priv->dynamic_config_lock); if (rc < 0) return rc; diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index ef46ae53ab56f..c343effe2e967 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -3365,6 +3365,7 @@ static int sja1105_probe(struct spi_device *spi) priv->ds = ds; mutex_init(&priv->ptp_data.lock); + mutex_init(&priv->dynamic_config_lock); mutex_init(&priv->mgmt_lock); rc = sja1105_parse_dt(priv); From 2468346c5677919de288bee85a0fefbef70c1e4e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 24 Oct 2021 20:17:51 +0300 Subject: [PATCH 04/10] net: mscc: ocelot: serialize access to the MAC table DSA would like to remove the rtnl_lock from its SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE handlers, and the felix driver uses the same MAC table functions as ocelot. This means that the MAC table functions will no longer be implicitly serialized with respect to each other by the rtnl_mutex, we need to add a dedicated lock in ocelot for the non-atomic operations of selecting a MAC table row, reading/writing what we want and polling for completion. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot.c | 53 +++++++++++++++++++++++------- include/soc/mscc/ocelot.h | 3 ++ 2 files changed, 44 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 4e5ae687d2e22..e6c18b598d5c5 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -20,11 +20,13 @@ struct ocelot_mact_entry { enum macaccess_entry_type type; }; +/* Caller must hold &ocelot->mact_lock */ static inline u32 ocelot_mact_read_macaccess(struct ocelot *ocelot) { return ocelot_read(ocelot, ANA_TABLES_MACACCESS); } +/* Caller must hold &ocelot->mact_lock */ static inline int ocelot_mact_wait_for_completion(struct ocelot *ocelot) { u32 val; @@ -36,6 +38,7 @@ static inline int ocelot_mact_wait_for_completion(struct ocelot *ocelot) TABLE_UPDATE_SLEEP_US, TABLE_UPDATE_TIMEOUT_US); } +/* Caller must hold &ocelot->mact_lock */ static void ocelot_mact_select(struct ocelot *ocelot, const unsigned char mac[ETH_ALEN], unsigned int vid) @@ -67,6 +70,7 @@ int ocelot_mact_learn(struct ocelot *ocelot, int port, ANA_TABLES_MACACCESS_ENTRYTYPE(type) | ANA_TABLES_MACACCESS_MAC_TABLE_CMD(MACACCESS_CMD_LEARN); unsigned int mc_ports; + int err; /* Set MAC_CPU_COPY if the CPU port is used by a multicast entry */ if (type == ENTRYTYPE_MACv4) @@ -79,18 +83,28 @@ int ocelot_mact_learn(struct ocelot *ocelot, int port, if (mc_ports & BIT(ocelot->num_phys_ports)) cmd |= ANA_TABLES_MACACCESS_MAC_CPU_COPY; + mutex_lock(&ocelot->mact_lock); + ocelot_mact_select(ocelot, mac, vid); /* Issue a write command */ ocelot_write(ocelot, cmd, ANA_TABLES_MACACCESS); - return ocelot_mact_wait_for_completion(ocelot); + err = ocelot_mact_wait_for_completion(ocelot); + + mutex_unlock(&ocelot->mact_lock); + + return err; } EXPORT_SYMBOL(ocelot_mact_learn); int ocelot_mact_forget(struct ocelot *ocelot, const unsigned char mac[ETH_ALEN], unsigned int vid) { + int err; + + mutex_lock(&ocelot->mact_lock); + ocelot_mact_select(ocelot, mac, vid); /* Issue a forget command */ @@ -98,7 +112,11 @@ int ocelot_mact_forget(struct ocelot *ocelot, ANA_TABLES_MACACCESS_MAC_TABLE_CMD(MACACCESS_CMD_FORGET), ANA_TABLES_MACACCESS); - return ocelot_mact_wait_for_completion(ocelot); + err = ocelot_mact_wait_for_completion(ocelot); + + mutex_unlock(&ocelot->mact_lock); + + return err; } EXPORT_SYMBOL(ocelot_mact_forget); @@ -114,7 +132,9 @@ static void ocelot_mact_init(struct ocelot *ocelot) | ANA_AGENCTRL_LEARN_IGNORE_VLAN, ANA_AGENCTRL); - /* Clear the MAC table */ + /* Clear the MAC table. We are not concurrent with anyone, so + * holding &ocelot->mact_lock is pointless. + */ ocelot_write(ocelot, MACACCESS_CMD_INIT, ANA_TABLES_MACACCESS); } @@ -1170,6 +1190,7 @@ int ocelot_port_fdb_do_dump(const unsigned char *addr, u16 vid, } EXPORT_SYMBOL(ocelot_port_fdb_do_dump); +/* Caller must hold &ocelot->mact_lock */ static int ocelot_mact_read(struct ocelot *ocelot, int port, int row, int col, struct ocelot_mact_entry *entry) { @@ -1220,33 +1241,40 @@ static int ocelot_mact_read(struct ocelot *ocelot, int port, int row, int col, int ocelot_fdb_dump(struct ocelot *ocelot, int port, dsa_fdb_dump_cb_t *cb, void *data) { + int err = 0; int i, j; + /* We could take the lock just around ocelot_mact_read, but doing so + * thousands of times in a row seems rather pointless and inefficient. + */ + mutex_lock(&ocelot->mact_lock); + /* Loop through all the mac tables entries. */ for (i = 0; i < ocelot->num_mact_rows; i++) { for (j = 0; j < 4; j++) { struct ocelot_mact_entry entry; bool is_static; - int ret; - ret = ocelot_mact_read(ocelot, port, i, j, &entry); + err = ocelot_mact_read(ocelot, port, i, j, &entry); /* If the entry is invalid (wrong port, invalid...), * skip it. */ - if (ret == -EINVAL) + if (err == -EINVAL) continue; - else if (ret) - return ret; + else if (err) + break; is_static = (entry.type == ENTRYTYPE_LOCKED); - ret = cb(entry.mac, entry.vid, is_static, data); - if (ret) - return ret; + err = cb(entry.mac, entry.vid, is_static, data); + if (err) + break; } } - return 0; + mutex_unlock(&ocelot->mact_lock); + + return err; } EXPORT_SYMBOL(ocelot_fdb_dump); @@ -2231,6 +2259,7 @@ int ocelot_init(struct ocelot *ocelot) mutex_init(&ocelot->stats_lock); mutex_init(&ocelot->ptp_lock); + mutex_init(&ocelot->mact_lock); spin_lock_init(&ocelot->ptp_clock_lock); spin_lock_init(&ocelot->ts_id_lock); snprintf(queue_name, sizeof(queue_name), "%s-stats", diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 9b872da0c2465..fef3a36b02100 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -675,6 +675,9 @@ struct ocelot { struct delayed_work stats_work; struct workqueue_struct *stats_queue; + /* Lock for serializing access to the MAC table */ + struct mutex mact_lock; + struct workqueue_struct *owq; u8 ptp:1; From f7eb4a1c0864821fe99edf12aca09739f9cbd7a4 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 24 Oct 2021 20:17:52 +0300 Subject: [PATCH 05/10] net: dsa: b53: serialize access to the ARL table The b53 driver performs non-atomic transactions to the ARL table when adding, deleting and reading FDB and MDB entries. Traditionally these were all serialized by the rtnl_lock(), but now it is possible that DSA calls ->port_fdb_add and ->port_fdb_del without holding that lock. So the driver must have its own serialization logic. Add a mutex and hold it from all entry points (->port_fdb_{add,del,dump}, ->port_mdb_{add,del}). Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 36 +++++++++++++++++++++++++------- drivers/net/dsa/b53/b53_priv.h | 1 + 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 51dfaf817a400..af4761968733c 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1542,7 +1542,7 @@ int b53_vlan_del(struct dsa_switch *ds, int port, } EXPORT_SYMBOL(b53_vlan_del); -/* Address Resolution Logic routines */ +/* Address Resolution Logic routines. Caller must hold &dev->arl_mutex. */ static int b53_arl_op_wait(struct b53_device *dev) { unsigned int timeout = 10; @@ -1707,6 +1707,7 @@ int b53_fdb_add(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid) { struct b53_device *priv = ds->priv; + int ret; /* 5325 and 5365 require some more massaging, but could * be supported eventually @@ -1714,7 +1715,11 @@ int b53_fdb_add(struct dsa_switch *ds, int port, if (is5325(priv) || is5365(priv)) return -EOPNOTSUPP; - return b53_arl_op(priv, 0, port, addr, vid, true); + mutex_lock(&priv->arl_mutex); + ret = b53_arl_op(priv, 0, port, addr, vid, true); + mutex_unlock(&priv->arl_mutex); + + return ret; } EXPORT_SYMBOL(b53_fdb_add); @@ -1722,8 +1727,13 @@ int b53_fdb_del(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid) { struct b53_device *priv = ds->priv; + int ret; - return b53_arl_op(priv, 0, port, addr, vid, false); + mutex_lock(&priv->arl_mutex); + ret = b53_arl_op(priv, 0, port, addr, vid, false); + mutex_unlock(&priv->arl_mutex); + + return ret; } EXPORT_SYMBOL(b53_fdb_del); @@ -1780,6 +1790,8 @@ int b53_fdb_dump(struct dsa_switch *ds, int port, int ret; u8 reg; + mutex_lock(&priv->arl_mutex); + /* Start search operation */ reg = ARL_SRCH_STDN; b53_write8(priv, B53_ARLIO_PAGE, B53_ARL_SRCH_CTL, reg); @@ -1787,18 +1799,18 @@ int b53_fdb_dump(struct dsa_switch *ds, int port, do { ret = b53_arl_search_wait(priv); if (ret) - return ret; + break; b53_arl_search_rd(priv, 0, &results[0]); ret = b53_fdb_copy(port, &results[0], cb, data); if (ret) - return ret; + break; if (priv->num_arl_bins > 2) { b53_arl_search_rd(priv, 1, &results[1]); ret = b53_fdb_copy(port, &results[1], cb, data); if (ret) - return ret; + break; if (!results[0].is_valid && !results[1].is_valid) break; @@ -1806,6 +1818,8 @@ int b53_fdb_dump(struct dsa_switch *ds, int port, } while (count++ < b53_max_arl_entries(priv) / 2); + mutex_unlock(&priv->arl_mutex); + return 0; } EXPORT_SYMBOL(b53_fdb_dump); @@ -1814,6 +1828,7 @@ int b53_mdb_add(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb) { struct b53_device *priv = ds->priv; + int ret; /* 5325 and 5365 require some more massaging, but could * be supported eventually @@ -1821,7 +1836,11 @@ int b53_mdb_add(struct dsa_switch *ds, int port, if (is5325(priv) || is5365(priv)) return -EOPNOTSUPP; - return b53_arl_op(priv, 0, port, mdb->addr, mdb->vid, true); + mutex_lock(&priv->arl_mutex); + ret = b53_arl_op(priv, 0, port, mdb->addr, mdb->vid, true); + mutex_unlock(&priv->arl_mutex); + + return ret; } EXPORT_SYMBOL(b53_mdb_add); @@ -1831,7 +1850,9 @@ int b53_mdb_del(struct dsa_switch *ds, int port, struct b53_device *priv = ds->priv; int ret; + mutex_lock(&priv->arl_mutex); ret = b53_arl_op(priv, 0, port, mdb->addr, mdb->vid, false); + mutex_unlock(&priv->arl_mutex); if (ret) dev_err(ds->dev, "failed to delete MDB entry\n"); @@ -2668,6 +2689,7 @@ struct b53_device *b53_switch_alloc(struct device *base, mutex_init(&dev->reg_mutex); mutex_init(&dev->stats_mutex); + mutex_init(&dev->arl_mutex); return dev; } diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 544101e74bca1..579da74ada645 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -107,6 +107,7 @@ struct b53_device { struct mutex reg_mutex; struct mutex stats_mutex; + struct mutex arl_mutex; const struct b53_io_ops *ops; /* chip specific data */ From cf231b436f7ceaf0d87016eedf830677c0c1f7dc Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 24 Oct 2021 20:17:53 +0300 Subject: [PATCH 06/10] net: dsa: lantiq_gswip: serialize access to the PCE registers The GSWIP switch accesses various bridging layer tables (VLANs, FDBs, forwarding rules) indirectly through PCE registers. These hardware accesses are non-atomic, being comprised of several register reads and writes. These accesses are currently serialized by the rtnl_lock, but DSA is changing its driver API and that lock will no longer be held when calling ->port_fdb_add() and ->port_fdb_del(). So this driver needs to serialize the access to the PCE registers using its own locking scheme. This patch adds that. Note that the driver also uses the gswip_pce_load_microcode() function to load a static configuration for the packet classification engine into a table using the same registers. It is currently not protected, but since that configuration is only done from the dsa_switch_ops :: setup method, there is no risk of it being concurrent with other operations. Signed-off-by: Vladimir Oltean Acked-by: Hauke Mehrtens Signed-off-by: David S. Miller --- drivers/net/dsa/lantiq_gswip.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c index a3e937a371ef1..7056d98d8177b 100644 --- a/drivers/net/dsa/lantiq_gswip.c +++ b/drivers/net/dsa/lantiq_gswip.c @@ -276,6 +276,7 @@ struct gswip_priv { int num_gphy_fw; struct gswip_gphy_fw *gphy_fw; u32 port_vlan_filter; + struct mutex pce_table_lock; }; struct gswip_pce_table_entry { @@ -523,10 +524,14 @@ static int gswip_pce_table_entry_read(struct gswip_priv *priv, u16 addr_mode = tbl->key_mode ? GSWIP_PCE_TBL_CTRL_OPMOD_KSRD : GSWIP_PCE_TBL_CTRL_OPMOD_ADRD; + mutex_lock(&priv->pce_table_lock); + err = gswip_switch_r_timeout(priv, GSWIP_PCE_TBL_CTRL, GSWIP_PCE_TBL_CTRL_BAS); - if (err) + if (err) { + mutex_unlock(&priv->pce_table_lock); return err; + } gswip_switch_w(priv, tbl->index, GSWIP_PCE_TBL_ADDR); gswip_switch_mask(priv, GSWIP_PCE_TBL_CTRL_ADDR_MASK | @@ -536,8 +541,10 @@ static int gswip_pce_table_entry_read(struct gswip_priv *priv, err = gswip_switch_r_timeout(priv, GSWIP_PCE_TBL_CTRL, GSWIP_PCE_TBL_CTRL_BAS); - if (err) + if (err) { + mutex_unlock(&priv->pce_table_lock); return err; + } for (i = 0; i < ARRAY_SIZE(tbl->key); i++) tbl->key[i] = gswip_switch_r(priv, GSWIP_PCE_TBL_KEY(i)); @@ -553,6 +560,8 @@ static int gswip_pce_table_entry_read(struct gswip_priv *priv, tbl->valid = !!(crtl & GSWIP_PCE_TBL_CTRL_VLD); tbl->gmap = (crtl & GSWIP_PCE_TBL_CTRL_GMAP_MASK) >> 7; + mutex_unlock(&priv->pce_table_lock); + return 0; } @@ -565,10 +574,14 @@ static int gswip_pce_table_entry_write(struct gswip_priv *priv, u16 addr_mode = tbl->key_mode ? GSWIP_PCE_TBL_CTRL_OPMOD_KSWR : GSWIP_PCE_TBL_CTRL_OPMOD_ADWR; + mutex_lock(&priv->pce_table_lock); + err = gswip_switch_r_timeout(priv, GSWIP_PCE_TBL_CTRL, GSWIP_PCE_TBL_CTRL_BAS); - if (err) + if (err) { + mutex_unlock(&priv->pce_table_lock); return err; + } gswip_switch_w(priv, tbl->index, GSWIP_PCE_TBL_ADDR); gswip_switch_mask(priv, GSWIP_PCE_TBL_CTRL_ADDR_MASK | @@ -600,8 +613,12 @@ static int gswip_pce_table_entry_write(struct gswip_priv *priv, crtl |= GSWIP_PCE_TBL_CTRL_BAS; gswip_switch_w(priv, crtl, GSWIP_PCE_TBL_CTRL); - return gswip_switch_r_timeout(priv, GSWIP_PCE_TBL_CTRL, - GSWIP_PCE_TBL_CTRL_BAS); + err = gswip_switch_r_timeout(priv, GSWIP_PCE_TBL_CTRL, + GSWIP_PCE_TBL_CTRL_BAS); + + mutex_unlock(&priv->pce_table_lock); + + return err; } /* Add the LAN port into a bridge with the CPU port by @@ -2104,6 +2121,7 @@ static int gswip_probe(struct platform_device *pdev) priv->ds->priv = priv; priv->ds->ops = priv->hw_info->ops; priv->dev = dev; + mutex_init(&priv->pce_table_lock); version = gswip_switch_r(priv, GSWIP_VERSION); np = dev->of_node; From 338a3a4745aae748835eb611765a8e36779810e1 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 24 Oct 2021 20:17:54 +0300 Subject: [PATCH 07/10] net: dsa: introduce locking for the address lists on CPU and DSA ports Now that the rtnl_mutex is going away for dsa_port_{host_,}fdb_{add,del}, no one is serializing access to the address lists that DSA keeps for the purpose of reference counting on shared ports (CPU and cascade ports). It can happen for one dsa_switch_do_fdb_del to do list_del on a dp->fdbs element while another dsa_switch_do_fdb_{add,del} is traversing dp->fdbs. We need to avoid that. Currently dp->mdbs is not at risk, because dsa_switch_do_mdb_{add,del} still runs under the rtnl_mutex. But it would be nice if it would not depend on that being the case. So let's introduce a mutex per port (the address lists are per port too) and share it between dp->mdbs and dp->fdbs. The place where we put the locking is interesting. It could be tempting to put a DSA-level lock which still serializes calls to .port_fdb_{add,del}, but it would still not avoid concurrency with other driver code paths that are currently under rtnl_mutex (.port_fdb_dump, .port_fast_age). So it would add a very false sense of security (and adding a global switch-wide lock in DSA to resynchronize with the rtnl_lock is also counterproductive and hard). So the locking is intentionally done only where the dp->fdbs and dp->mdbs lists are traversed. That means, from a driver perspective, that .port_fdb_add will be called with the dp->addr_lists_lock mutex held on the CPU port, but not held on user ports. This is done so that driver writers are not encouraged to rely on any guarantee offered by dp->addr_lists_lock. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + net/dsa/dsa2.c | 1 + net/dsa/switch.c | 76 ++++++++++++++++++++++++++++++++--------------- 3 files changed, 54 insertions(+), 24 deletions(-) diff --git a/include/net/dsa.h b/include/net/dsa.h index 1cd9c2461f0d9..badd214f74700 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -287,6 +287,7 @@ struct dsa_port { /* List of MAC addresses that must be forwarded on this port. * These are only valid on CPU ports and DSA links. */ + struct mutex addr_lists_lock; struct list_head fdbs; struct list_head mdbs; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index f5270114dcb8c..826957b6442b0 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -433,6 +433,7 @@ static int dsa_port_setup(struct dsa_port *dp) if (dp->setup) return 0; + mutex_init(&dp->addr_lists_lock); INIT_LIST_HEAD(&dp->fdbs); INIT_LIST_HEAD(&dp->mdbs); diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 8f8ed8248c2c3..bb155a16d4540 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -215,26 +215,30 @@ static int dsa_port_do_mdb_add(struct dsa_port *dp, struct dsa_switch *ds = dp->ds; struct dsa_mac_addr *a; int port = dp->index; - int err; + int err = 0; /* No need to bother with refcounting for user ports */ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) return ds->ops->port_mdb_add(ds, port, mdb); + mutex_lock(&dp->addr_lists_lock); + a = dsa_mac_addr_find(&dp->mdbs, mdb->addr, mdb->vid); if (a) { refcount_inc(&a->refcount); - return 0; + goto out; } a = kzalloc(sizeof(*a), GFP_KERNEL); - if (!a) - return -ENOMEM; + if (!a) { + err = -ENOMEM; + goto out; + } err = ds->ops->port_mdb_add(ds, port, mdb); if (err) { kfree(a); - return err; + goto out; } ether_addr_copy(a->addr, mdb->addr); @@ -242,7 +246,10 @@ static int dsa_port_do_mdb_add(struct dsa_port *dp, refcount_set(&a->refcount, 1); list_add_tail(&a->list, &dp->mdbs); - return 0; +out: + mutex_unlock(&dp->addr_lists_lock); + + return err; } static int dsa_port_do_mdb_del(struct dsa_port *dp, @@ -251,29 +258,36 @@ static int dsa_port_do_mdb_del(struct dsa_port *dp, struct dsa_switch *ds = dp->ds; struct dsa_mac_addr *a; int port = dp->index; - int err; + int err = 0; /* No need to bother with refcounting for user ports */ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) return ds->ops->port_mdb_del(ds, port, mdb); + mutex_lock(&dp->addr_lists_lock); + a = dsa_mac_addr_find(&dp->mdbs, mdb->addr, mdb->vid); - if (!a) - return -ENOENT; + if (!a) { + err = -ENOENT; + goto out; + } if (!refcount_dec_and_test(&a->refcount)) - return 0; + goto out; err = ds->ops->port_mdb_del(ds, port, mdb); if (err) { refcount_set(&a->refcount, 1); - return err; + goto out; } list_del(&a->list); kfree(a); - return 0; +out: + mutex_unlock(&dp->addr_lists_lock); + + return err; } static int dsa_port_do_fdb_add(struct dsa_port *dp, const unsigned char *addr, @@ -282,26 +296,30 @@ static int dsa_port_do_fdb_add(struct dsa_port *dp, const unsigned char *addr, struct dsa_switch *ds = dp->ds; struct dsa_mac_addr *a; int port = dp->index; - int err; + int err = 0; /* No need to bother with refcounting for user ports */ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) return ds->ops->port_fdb_add(ds, port, addr, vid); + mutex_lock(&dp->addr_lists_lock); + a = dsa_mac_addr_find(&dp->fdbs, addr, vid); if (a) { refcount_inc(&a->refcount); - return 0; + goto out; } a = kzalloc(sizeof(*a), GFP_KERNEL); - if (!a) - return -ENOMEM; + if (!a) { + err = -ENOMEM; + goto out; + } err = ds->ops->port_fdb_add(ds, port, addr, vid); if (err) { kfree(a); - return err; + goto out; } ether_addr_copy(a->addr, addr); @@ -309,7 +327,10 @@ static int dsa_port_do_fdb_add(struct dsa_port *dp, const unsigned char *addr, refcount_set(&a->refcount, 1); list_add_tail(&a->list, &dp->fdbs); - return 0; +out: + mutex_unlock(&dp->addr_lists_lock); + + return err; } static int dsa_port_do_fdb_del(struct dsa_port *dp, const unsigned char *addr, @@ -318,29 +339,36 @@ static int dsa_port_do_fdb_del(struct dsa_port *dp, const unsigned char *addr, struct dsa_switch *ds = dp->ds; struct dsa_mac_addr *a; int port = dp->index; - int err; + int err = 0; /* No need to bother with refcounting for user ports */ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) return ds->ops->port_fdb_del(ds, port, addr, vid); + mutex_lock(&dp->addr_lists_lock); + a = dsa_mac_addr_find(&dp->fdbs, addr, vid); - if (!a) - return -ENOENT; + if (!a) { + err = -ENOENT; + goto out; + } if (!refcount_dec_and_test(&a->refcount)) - return 0; + goto out; err = ds->ops->port_fdb_del(ds, port, addr, vid); if (err) { refcount_set(&a->refcount, 1); - return err; + goto out; } list_del(&a->list); kfree(a); - return 0; +out: + mutex_unlock(&dp->addr_lists_lock); + + return err; } static int dsa_switch_host_fdb_add(struct dsa_switch *ds, From 0faf890fc519a10573472ae6b5c244156cf0bff3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 24 Oct 2021 20:17:55 +0300 Subject: [PATCH 08/10] net: dsa: drop rtnl_lock from dsa_slave_switchdev_event_work After talking with Ido Schimmel, it became clear that rtnl_lock is not actually required for anything that is done inside the SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE deferred work handlers. The reason why it was probably added by Arkadi Sharshevsky in commit c9eb3e0f8701 ("net: dsa: Add support for learning FDB through notification") was to offer the same locking/serialization guarantees as .ndo_fdb_{add,del} and avoid reworking any drivers. DSA has implemented .ndo_fdb_add and .ndo_fdb_del until commit b117e1e8a86d ("net: dsa: delete dsa_legacy_fdb_add and dsa_legacy_fdb_del") - that is to say, until fairly recently. But those methods have been deleted, so now we are free to drop the rtnl_lock as well. Note that exposing DSA switch drivers to an unlocked method which was previously serialized by the rtnl_mutex is a potentially dangerous affair. Driver writers couldn't ensure that their internal locking scheme does the right thing even if they wanted. We could err on the side of paranoia and introduce a switch-wide lock inside the DSA framework, but that seems way overreaching. Instead, we could check as many drivers for regressions as we can, fix those first, then let this change go in once it is assumed to be fairly safe. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 9d9fef668ebaa..adcfb2cb4e612 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -2413,7 +2413,6 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work) dp = dsa_to_port(ds, switchdev_work->port); - rtnl_lock(); switch (switchdev_work->event) { case SWITCHDEV_FDB_ADD_TO_DEVICE: if (switchdev_work->host_addr) @@ -2448,7 +2447,6 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work) break; } - rtnl_unlock(); dev_put(switchdev_work->dev); kfree(switchdev_work); From d70b51f2845d4a0352361fd4f9741913a2cf2145 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 24 Oct 2021 20:17:56 +0300 Subject: [PATCH 09/10] selftests: lib: forwarding: allow tests to not require mz and jq These programs are useful, but not all selftests require them. Additionally, on embedded boards without package management (things like buildroot), installing mausezahn or jq is not always as trivial as downloading a package from the web. So it is actually a bit annoying to require programs that are not used. Introduce options that can be set by scripts to not enforce these dependencies. For compatibility, default to "yes". Cc: Nikolay Aleksandrov Cc: Ido Schimmel Cc: Guillaume Nault Cc: Po-Hsu Lin Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- tools/testing/selftests/net/forwarding/lib.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 92087d423bcf1..520d8b53464b7 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -23,6 +23,8 @@ MC_CLI=${MC_CLI:=smcroutectl} PING_TIMEOUT=${PING_TIMEOUT:=5} WAIT_TIMEOUT=${WAIT_TIMEOUT:=20} INTERFACE_TIMEOUT=${INTERFACE_TIMEOUT:=600} +REQUIRE_JQ=${REQUIRE_JQ:=yes} +REQUIRE_MZ=${REQUIRE_MZ:=yes} relative_path="${BASH_SOURCE%/*}" if [[ "$relative_path" == "${BASH_SOURCE}" ]]; then @@ -141,8 +143,12 @@ require_command() fi } -require_command jq -require_command $MZ +if [[ "$REQUIRE_JQ" = "yes" ]]; then + require_command jq +fi +if [[ "$REQUIRE_MZ" = "yes" ]]; then + require_command $MZ +fi if [[ ! -v NUM_NETIFS ]]; then echo "SKIP: importer does not define \"NUM_NETIFS\"" From eccd0a80dc7f4be65430236db475546b0ab9ec37 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 24 Oct 2021 20:17:57 +0300 Subject: [PATCH 10/10] selftests: net: dsa: add a stress test for unlocked FDB operations This test is a bit strange in that it is perhaps more manual than others: it does not transmit a clear OK/FAIL verdict, because user space does not have synchronous feedback from the kernel. If a hardware access fails, it is in deferred context. Nonetheless, on sja1105 I have used it successfully to find and solve a concurrency issue, so it can be used as a starting point for other driver maintainers too. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- MAINTAINERS | 1 + .../drivers/net/dsa/test_bridge_fdb_stress.sh | 47 +++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/dsa/test_bridge_fdb_stress.sh diff --git a/MAINTAINERS b/MAINTAINERS index c5aa142d4b3a7..975086c5345dc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13056,6 +13056,7 @@ F: include/linux/dsa/ F: include/linux/platform_data/dsa.h F: include/net/dsa.h F: net/dsa/ +F: tools/testing/selftests/drivers/net/dsa/ NETWORKING [GENERAL] M: "David S. Miller" diff --git a/tools/testing/selftests/drivers/net/dsa/test_bridge_fdb_stress.sh b/tools/testing/selftests/drivers/net/dsa/test_bridge_fdb_stress.sh new file mode 100755 index 0000000000000..dca8be6092b92 --- /dev/null +++ b/tools/testing/selftests/drivers/net/dsa/test_bridge_fdb_stress.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Bridge FDB entries can be offloaded to DSA switches without holding the +# rtnl_mutex. Traditionally this mutex has conferred drivers implicit +# serialization, which means their code paths are not well tested in the +# presence of concurrency. +# This test creates a background task that stresses the FDB by adding and +# deleting an entry many times in a row without the rtnl_mutex held. +# It then tests the driver resistance to concurrency by calling .ndo_fdb_dump +# (with rtnl_mutex held) from a foreground task. +# Since either the FDB dump or the additions/removals can fail, but the +# additions and removals are performed in deferred as opposed to process +# context, we cannot simply check for user space error codes. + +WAIT_TIME=1 +NUM_NETIFS=1 +REQUIRE_JQ="no" +REQUIRE_MZ="no" +NETIF_CREATE="no" +lib_dir=$(dirname $0)/../../../net/forwarding +source $lib_dir/lib.sh + +cleanup() { + echo "Cleaning up" + kill $pid && wait $pid &> /dev/null + ip link del br0 + echo "Please check kernel log for errors" +} +trap 'cleanup' EXIT + +eth=${NETIFS[p1]} + +ip link del br0 2&>1 >/dev/null || : +ip link add br0 type bridge && ip link set $eth master br0 + +(while :; do + bridge fdb add 00:01:02:03:04:05 dev $eth master static + bridge fdb del 00:01:02:03:04:05 dev $eth master static +done) & +pid=$! + +for i in $(seq 1 50); do + bridge fdb show > /dev/null + sleep 3 + echo "$((${i} * 2))% complete..." +done