From 582620d9f6b352552bc9a3316fe2b1c3acd8742d Mon Sep 17 00:00:00 2001
From: Mika Westerberg
Date: Fri, 18 Aug 2023 15:27:46 +0300
Subject: [PATCH 1/4] thunderbolt: Workaround an IOMMU fault on certain
 systems with Intel Maple Ridge

On some systems the IOMMU blocks the first couple of driver ready
messages to the connection manager firmware, as can be seen in the
excerpts below:

  thunderbolt 0000:06:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0010 address=0xbb0e3400 flags=0x0020]

or

  DMAR: DRHD: handling fault status reg 2
  DMAR: [DMA Write] Request device [04:00.0] PASID ffffffff fault addr 69974000 [fault reason 05] PTE Write access is not set

The reason is unknown and hard to debug because we were not able to
reproduce this locally. It only happens on certain systems with the
Intel Maple Ridge Thunderbolt controller. The issue does not happen if
there is a device connected when the driver is loaded; it occurs only
when nothing is connected (so typically when the system is booted up).

We can work this around by sending the driver ready message several
times. After a couple of retries the message goes through and the
controller works just fine. For this reason make the number of retries
a parameter of icm_request() and then, for Maple Ridge (and Titan
Ridge, as they use the same function, but this should not matter),
increase the number of retries while shortening the timeout
accordingly.

Reported-by: Werner Sembach
Reported-by: Konrad J Hambrick
Reported-by: Calvin Walton
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=214259
Cc: stable@vger.kernel.org
Signed-off-by: Mika Westerberg
---
 drivers/thunderbolt/icm.c | 40 +++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/drivers/thunderbolt/icm.c b/drivers/thunderbolt/icm.c
index dbdcad8d73bf6..d8b9c734abd36 100644
--- a/drivers/thunderbolt/icm.c
+++ b/drivers/thunderbolt/icm.c
@@ -41,6 +41,7 @@
 #define PHY_PORT_CS1_LINK_STATE_SHIFT	26
 
 #define ICM_TIMEOUT			5000	/* ms */
+#define ICM_RETRIES			3
 #define ICM_APPROVE_TIMEOUT		10000	/* ms */
 #define ICM_MAX_LINK			4
@@ -296,10 +297,9 @@ static bool icm_copy(struct tb_cfg_request *req, const struct ctl_pkg *pkg)
 
 static int icm_request(struct tb *tb, const void *request, size_t request_size,
 		       void *response, size_t response_size, size_t npackets,
-		       unsigned int timeout_msec)
+		       int retries, unsigned int timeout_msec)
 {
 	struct icm *icm = tb_priv(tb);
-	int retries = 3;
 
 	do {
 		struct tb_cfg_request *req;
@@ -410,7 +410,7 @@ static int icm_fr_get_route(struct tb *tb, u8 link, u8 depth, u64 *route)
 		return -ENOMEM;
 
 	ret = icm_request(tb, &request, sizeof(request), switches,
-			  sizeof(*switches), npackets, ICM_TIMEOUT);
+			  sizeof(*switches), npackets, ICM_RETRIES, ICM_TIMEOUT);
 	if (ret)
 		goto err_free;
@@ -463,7 +463,7 @@ icm_fr_driver_ready(struct tb *tb, enum tb_security_level *security_level,
 	memset(&reply, 0, sizeof(reply));
 	ret = icm_request(tb, &request, sizeof(request), &reply, sizeof(reply),
-			  1, ICM_TIMEOUT);
+			  1, ICM_RETRIES, ICM_TIMEOUT);
 	if (ret)
 		return ret;
@@ -488,7 +488,7 @@ static int icm_fr_approve_switch(struct tb *tb, struct tb_switch *sw)
 	memset(&reply, 0, sizeof(reply));
 	/* Use larger timeout as establishing tunnels can take some time */
 	ret = icm_request(tb, &request, sizeof(request), &reply, sizeof(reply),
-			  1, ICM_APPROVE_TIMEOUT);
+			  1, ICM_RETRIES, ICM_APPROVE_TIMEOUT);
 	if (ret)
 		return ret;
@@ -515,7 +515,7 @@ static int icm_fr_add_switch_key(struct tb *tb, struct tb_switch *sw)
 	memset(&reply, 0, sizeof(reply));
 	ret = icm_request(tb, &request, sizeof(request), &reply, sizeof(reply),
-			  1, ICM_TIMEOUT);
+			  1, ICM_RETRIES, ICM_TIMEOUT);
 	if (ret)
 		return ret;
@@ -543,7 +543,7 @@ static int icm_fr_challenge_switch_key(struct tb *tb, struct tb_switch *sw,
 	memset(&reply, 0, sizeof(reply));
 	ret = icm_request(tb, &request, sizeof(request), &reply, sizeof(reply),
-			  1, ICM_TIMEOUT);
+			  1, ICM_RETRIES, ICM_TIMEOUT);
 	if (ret)
 		return ret;
@@ -577,7 +577,7 @@ static int icm_fr_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd,
 	memset(&reply, 0, sizeof(reply));
 	ret = icm_request(tb, &request, sizeof(request), &reply, sizeof(reply),
-			  1, ICM_TIMEOUT);
+			  1, ICM_RETRIES, ICM_TIMEOUT);
 	if (ret)
 		return ret;
@@ -1020,7 +1020,7 @@ icm_tr_driver_ready(struct tb *tb, enum tb_security_level *security_level,
 	memset(&reply, 0, sizeof(reply));
 	ret = icm_request(tb, &request, sizeof(request), &reply, sizeof(reply),
-			  1, 20000);
+			  1, 10, 2000);
 	if (ret)
 		return ret;
@@ -1053,7 +1053,7 @@ static int icm_tr_approve_switch(struct tb *tb, struct tb_switch *sw)
 	memset(&reply, 0, sizeof(reply));
 	ret = icm_request(tb, &request, sizeof(request), &reply, sizeof(reply),
-			  1, ICM_APPROVE_TIMEOUT);
+			  1, ICM_RETRIES, ICM_APPROVE_TIMEOUT);
 	if (ret)
 		return ret;
@@ -1081,7 +1081,7 @@ static int icm_tr_add_switch_key(struct tb *tb, struct tb_switch *sw)
 	memset(&reply, 0, sizeof(reply));
 	ret = icm_request(tb, &request, sizeof(request), &reply, sizeof(reply),
-			  1, ICM_TIMEOUT);
+			  1, ICM_RETRIES, ICM_TIMEOUT);
 	if (ret)
 		return ret;
@@ -1110,7 +1110,7 @@ static int icm_tr_challenge_switch_key(struct tb *tb, struct tb_switch *sw,
 	memset(&reply, 0, sizeof(reply));
 	ret = icm_request(tb, &request, sizeof(request), &reply, sizeof(reply),
-			  1, ICM_TIMEOUT);
+			  1, ICM_RETRIES, ICM_TIMEOUT);
 	if (ret)
 		return ret;
@@ -1144,7 +1144,7 @@ static int icm_tr_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd,
 	memset(&reply, 0, sizeof(reply));
 	ret = icm_request(tb, &request, sizeof(request), &reply, sizeof(reply),
-			  1, ICM_TIMEOUT);
+			  1, ICM_RETRIES, ICM_TIMEOUT);
 	if (ret)
 		return ret;
@@ -1170,7 +1170,7 @@ static int icm_tr_xdomain_tear_down(struct tb *tb, struct tb_xdomain *xd,
 	memset(&reply, 0, sizeof(reply));
 	ret = icm_request(tb, &request, sizeof(request), &reply, sizeof(reply),
-			  1, ICM_TIMEOUT);
+			  1, ICM_RETRIES, ICM_TIMEOUT);
 	if (ret)
 		return ret;
@@ -1496,7 +1496,7 @@ icm_ar_driver_ready(struct tb *tb, enum tb_security_level *security_level,
 	memset(&reply, 0, sizeof(reply));
 	ret = icm_request(tb, &request, sizeof(request), &reply, sizeof(reply),
-			  1, ICM_TIMEOUT);
+			  1, ICM_RETRIES, ICM_TIMEOUT);
 	if (ret)
 		return ret;
@@ -1522,7 +1522,7 @@ static int icm_ar_get_route(struct tb *tb, u8 link, u8 depth, u64 *route)
 	memset(&reply, 0, sizeof(reply));
 	ret = icm_request(tb, &request, sizeof(request), &reply, sizeof(reply),
-			  1, ICM_TIMEOUT);
+			  1, ICM_RETRIES, ICM_TIMEOUT);
 	if (ret)
 		return ret;
@@ -1543,7 +1543,7 @@ static int icm_ar_get_boot_acl(struct tb *tb, uuid_t *uuids, size_t nuuids)
 	memset(&reply, 0, sizeof(reply));
 	ret = icm_request(tb, &request, sizeof(request), &reply, sizeof(reply),
-			  1, ICM_TIMEOUT);
+			  1, ICM_RETRIES, ICM_TIMEOUT);
 	if (ret)
 		return ret;
@@ -1604,7 +1604,7 @@ static int icm_ar_set_boot_acl(struct tb *tb, const uuid_t *uuids,
 	memset(&reply, 0, sizeof(reply));
 	ret = icm_request(tb, &request, sizeof(request), &reply, sizeof(reply),
-			  1, ICM_TIMEOUT);
+			  1, ICM_RETRIES, ICM_TIMEOUT);
 	if (ret)
 		return ret;
@@ -1626,7 +1626,7 @@ icm_icl_driver_ready(struct tb *tb, enum tb_security_level *security_level,
 	memset(&reply, 0, sizeof(reply));
 	ret = icm_request(tb, &request, sizeof(request), &reply, sizeof(reply),
-			  1, 20000);
+			  1, ICM_RETRIES, 20000);
 	if (ret)
 		return ret;
@@ -2298,7 +2298,7 @@ static int icm_usb4_switch_op(struct tb_switch *sw, u16 opcode, u32 *metadata,
 	memset(&reply, 0, sizeof(reply));
 	ret = icm_request(tb, &request, sizeof(request), &reply, sizeof(reply),
-			  1, ICM_TIMEOUT);
+			  1, ICM_RETRIES, ICM_TIMEOUT);
 	if (ret)
 		return ret;
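
For reference, the retry loop in icm_request() now behaves roughly as
sketched below. This is a simplified illustration, not the exact icm.c
code: icm_request_one() is a hypothetical stand-in for the body that
allocates, posts and waits on a single request. Note also that the
Titan/Maple Ridge driver ready call keeps about the same worst-case
budget as before: ten retries of 2000 ms instead of one attempt with a
20000 ms timeout.

/*
 * Simplified sketch of the parameterized retry loop; the hypothetical
 * icm_request_one() stands in for the code that performs one
 * request/response exchange with the given timeout.
 */
static int icm_request_sketch(struct tb *tb, const void *request,
			      size_t request_size, void *response,
			      size_t response_size, size_t npackets,
			      int retries, unsigned int timeout_msec)
{
	while (retries-- > 0) {
		int ret = icm_request_one(tb, request, request_size,
					  response, response_size,
					  npackets, timeout_msec);

		/* Anything but a timeout, including success, ends the loop */
		if (ret != -ETIMEDOUT)
			return ret;
		/*
		 * Timed out: the message may have been dropped (e.g. by the
		 * IOMMU on the affected Maple Ridge systems), so send it
		 * again until the retry budget runs out.
		 */
	}

	return -ETIMEDOUT;
}
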
From a9fdf5f933a6f2b358fad0194b1287b67f6704b1 Mon Sep 17 00:00:00 2001
From: Mika Westerberg
Date: Tue, 22 Aug 2023 16:36:18 +0300
Subject: [PATCH 2/4] thunderbolt: Check that lane 1 is in CL0 before enabling
 lane bonding
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Marek reported that when a BlackMagic UltraStudio device is connected
the kernel repeatedly tries to enable lane bonding without success,
making the device non-functional. It looks like the device does not
have lane 1 connected at all, so even though it is enabled we should
not try to bond the lanes. For this reason check that lane 1 is in
fact CL0 (connected, active) before attempting to bond the lanes.

Reported-by: Marek Šanta
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217737
Cc: stable@vger.kernel.org
Signed-off-by: Mika Westerberg
---
 drivers/thunderbolt/switch.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c
index 43171cc1cc2d8..bd5815f8f23bd 100644
--- a/drivers/thunderbolt/switch.c
+++ b/drivers/thunderbolt/switch.c
@@ -2725,6 +2725,13 @@ int tb_switch_lane_bonding_enable(struct tb_switch *sw)
 	    !tb_port_is_width_supported(down, TB_LINK_WIDTH_DUAL))
 		return 0;
 
+	/*
+	 * Both lanes need to be in CL0. Here we assume lane 0 is already
+	 * in CL0 and check just for lane 1.
+	 */
+	if (tb_wait_for_port(down->dual_link_port, false) <= 0)
+		return -ENOTCONN;
+
 	ret = tb_port_lane_bonding_enable(up);
 	if (ret) {
 		tb_port_warn(up, "failed to enable lane bonding\n");
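
The guard relies on tb_wait_for_port() reporting whether the port
reaches the connected, active state. A condensed sketch of the idea
follows; the helper name is made up for illustration and is not part
of the patch:

/*
 * Illustrative helper, not from switch.c: lane 0 is assumed to be in
 * CL0 already by the time bonding is attempted, so only lane 1 (the
 * dual link port) needs to be checked. tb_wait_for_port() returns a
 * positive value once the port is connected and active, and zero or
 * a negative errno otherwise.
 */
static int tb_both_lanes_in_cl0(struct tb_port *down)
{
	if (tb_wait_for_port(down->dual_link_port, false) <= 0)
		return -ENOTCONN;	/* lane 1 never reached CL0 */

	return 0;
}
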
From e19f714ea63f861d95d3d92d45d5fd5ca2e05c8c Mon Sep 17 00:00:00 2001
From: Mika Westerberg
Date: Thu, 31 Aug 2023 14:10:46 +0300
Subject: [PATCH 3/4] thunderbolt: Correct TMU mode initialization from
 hardware

David reported that cppcheck found the following possible copy & paste
error in tmu_mode_init():

  tmu.c:385:50: style: Expression is always false because 'else if' condition matches previous condition at line 383. [multiCondition]

And indeed this is a bug. Fix it to use the correct index
(TB_SWITCH_TMU_MODE_HIFI_UNI).

Reported-by: David Binderman
Fixes: d49b4f043d63 ("thunderbolt: Add support for enhanced uni-directional TMU mode")
Cc: stable@vger.kernel.org
Signed-off-by: Mika Westerberg
---
 drivers/thunderbolt/tmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/thunderbolt/tmu.c b/drivers/thunderbolt/tmu.c
index 747f88703d5ca..11f2aec2a5d37 100644
--- a/drivers/thunderbolt/tmu.c
+++ b/drivers/thunderbolt/tmu.c
@@ -382,7 +382,7 @@ static int tmu_mode_init(struct tb_switch *sw)
 	} else if (ucap && tb_port_tmu_is_unidirectional(up)) {
 		if (tmu_rates[TB_SWITCH_TMU_MODE_LOWRES] == rate)
 			sw->tmu.mode = TB_SWITCH_TMU_MODE_LOWRES;
-		else if (tmu_rates[TB_SWITCH_TMU_MODE_LOWRES] == rate)
+		else if (tmu_rates[TB_SWITCH_TMU_MODE_HIFI_UNI] == rate)
 			sw->tmu.mode = TB_SWITCH_TMU_MODE_HIFI_UNI;
 	} else if (rate) {
 		sw->tmu.mode = TB_SWITCH_TMU_MODE_HIFI_BI;
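
Because of the duplicated LOWRES comparison, the second branch was
dead code, so a switch running the HiFi uni-directional rate never got
its mode set. A table-driven lookup along the following lines would
rule out this class of copy & paste bug; this is an editorial sketch,
not part of the patch, and it ignores the ucap/unidirectional checks
that the real tmu_mode_init() also applies:

/*
 * Editorial sketch, not from the driver: tmu_rates[] is indexed by
 * enum tb_switch_tmu_mode, so the rate -> mode mapping can be a
 * single scan instead of a hand-written if/else chain.
 */
static enum tb_switch_tmu_mode tmu_mode_from_rate(unsigned int rate)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(tmu_rates); i++)
		if (tmu_rates[i] == rate)
			return i;

	/* Unknown rate: treat as TMU turned off */
	return TB_SWITCH_TMU_MODE_OFF;
}
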
From 308092d080852f8997126e5b3507536162416f4a Mon Sep 17 00:00:00 2001
From: Mika Westerberg
Date: Thu, 7 Sep 2023 16:02:30 +0300
Subject: [PATCH 4/4] thunderbolt: Restart XDomain discovery handshake after
 failure

Alex reported that after rebooting the other host the peer-to-peer
link does not come up anymore. The reason for this is that the host
that was not rebooted tries to send the UUID request only 10 times
according to the USB4 Inter-Domain spec and gives up if it does not
get a reply. Then, when the other side is actually ready, it cannot
establish the link anymore. The USB4 Inter-Domain spec requires that
the discovery protocol is restarted in that case, so implement this
now.

Reported-by: Alex Balcanquall
Fixes: 8e1de7042596 ("thunderbolt: Add support for XDomain lane bonding")
Cc: stable@vger.kernel.org
Signed-off-by: Mika Westerberg
---
 drivers/thunderbolt/xdomain.c | 58 +++++++++++++++++++++++++----------
 1 file changed, 41 insertions(+), 17 deletions(-)

diff --git a/drivers/thunderbolt/xdomain.c b/drivers/thunderbolt/xdomain.c
index 5b5566862318b..9803f0bbf20d1 100644
--- a/drivers/thunderbolt/xdomain.c
+++ b/drivers/thunderbolt/xdomain.c
@@ -703,6 +703,27 @@ static void update_property_block(struct tb_xdomain *xd)
 	mutex_unlock(&xdomain_lock);
 }
 
+static void start_handshake(struct tb_xdomain *xd)
+{
+	xd->state = XDOMAIN_STATE_INIT;
+	queue_delayed_work(xd->tb->wq, &xd->state_work,
+			   msecs_to_jiffies(XDOMAIN_SHORT_TIMEOUT));
+}
+
+/* Can be called from state_work */
+static void __stop_handshake(struct tb_xdomain *xd)
+{
+	cancel_delayed_work_sync(&xd->properties_changed_work);
+	xd->properties_changed_retries = 0;
+	xd->state_retries = 0;
+}
+
+static void stop_handshake(struct tb_xdomain *xd)
+{
+	cancel_delayed_work_sync(&xd->state_work);
+	__stop_handshake(xd);
+}
+
 static void tb_xdp_handle_request(struct work_struct *work)
 {
 	struct xdomain_request_work *xw = container_of(work, typeof(*xw), work);
@@ -765,6 +786,15 @@ static void tb_xdp_handle_request(struct work_struct *work)
 	case UUID_REQUEST:
 		tb_dbg(tb, "%llx: received XDomain UUID request\n", route);
 		ret = tb_xdp_uuid_response(ctl, route, sequence, uuid);
+		/*
+		 * If we've stopped the discovery with an error such as
+		 * timing out, we will restart the handshake now that we
+		 * received a UUID request from the remote host.
+		 */
+		if (!ret && xd && xd->state == XDOMAIN_STATE_ERROR) {
+			dev_dbg(&xd->dev, "restarting handshake\n");
+			start_handshake(xd);
+		}
 		break;
 
 	case LINK_STATE_STATUS_REQUEST:
@@ -1521,6 +1551,13 @@ static void tb_xdomain_queue_properties_changed(struct tb_xdomain *xd)
 			   msecs_to_jiffies(XDOMAIN_SHORT_TIMEOUT));
 }
 
+static void tb_xdomain_failed(struct tb_xdomain *xd)
+{
+	xd->state = XDOMAIN_STATE_ERROR;
+	queue_delayed_work(xd->tb->wq, &xd->state_work,
+			   msecs_to_jiffies(XDOMAIN_DEFAULT_TIMEOUT));
+}
+
 static void tb_xdomain_state_work(struct work_struct *work)
 {
 	struct tb_xdomain *xd = container_of(work, typeof(*xd), state_work.work);
@@ -1547,7 +1584,7 @@ static void tb_xdomain_state_work(struct work_struct *work)
 		if (ret) {
 			if (ret == -EAGAIN)
 				goto retry_state;
-			xd->state = XDOMAIN_STATE_ERROR;
+			tb_xdomain_failed(xd);
 		} else {
 			tb_xdomain_queue_properties_changed(xd);
 			if (xd->bonding_possible)
@@ -1612,7 +1649,7 @@ static void tb_xdomain_state_work(struct work_struct *work)
 		if (ret) {
 			if (ret == -EAGAIN)
 				goto retry_state;
-			xd->state = XDOMAIN_STATE_ERROR;
+			tb_xdomain_failed(xd);
 		} else {
 			xd->state = XDOMAIN_STATE_ENUMERATED;
 		}
@@ -1623,6 +1660,8 @@ static void tb_xdomain_state_work(struct work_struct *work)
 		break;
 
 	case XDOMAIN_STATE_ERROR:
+		dev_dbg(&xd->dev, "discovery failed, stopping handshake\n");
+		__stop_handshake(xd);
 		break;
 
 	default:
@@ -1833,21 +1872,6 @@ static void tb_xdomain_release(struct device *dev)
 	kfree(xd);
 }
 
-static void start_handshake(struct tb_xdomain *xd)
-{
-	xd->state = XDOMAIN_STATE_INIT;
-	queue_delayed_work(xd->tb->wq, &xd->state_work,
-			   msecs_to_jiffies(XDOMAIN_SHORT_TIMEOUT));
-}
-
-static void stop_handshake(struct tb_xdomain *xd)
-{
-	cancel_delayed_work_sync(&xd->properties_changed_work);
-	cancel_delayed_work_sync(&xd->state_work);
-	xd->properties_changed_retries = 0;
-	xd->state_retries = 0;
-}
-
 static int __maybe_unused tb_xdomain_suspend(struct device *dev)
 {
 	stop_handshake(tb_to_xdomain(dev));
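
One subtlety worth noting: the XDOMAIN_STATE_ERROR case runs inside
state_work itself, which is why stop_handshake() is split. Calling
cancel_delayed_work_sync(&xd->state_work) from within state_work would
wait on the currently running work item and deadlock, so the work
function calls only __stop_handshake(). The resulting discovery flow,
sketched as an editorial summary rather than code from the driver:

/*
 * Discovery flow after this patch (editorial sketch):
 *
 *   XDOMAIN_STATE_INIT
 *        | UUID exchange fails after all retries
 *        v
 *   tb_xdomain_failed()  -> XDOMAIN_STATE_ERROR, state_work requeued
 *        | state_work runs the ERROR case
 *        v
 *   __stop_handshake()      retries reset; state_work itself is NOT
 *        |                  cancelled, since we are running inside it
 *        | remote host comes back and sends a UUID request
 *        v
 *   start_handshake()    -> back to XDOMAIN_STATE_INIT
 */
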