From 75035fe22b808a520e1d712ebe913684ba406e01 Mon Sep 17 00:00:00 2001 From: Ben Segal Date: Tue, 23 Jul 2019 11:22:42 +0300 Subject: [PATCH 1/2] habanalabs: fix F/W download in BE architecture writeX macros might perform byte-swapping in BE architectures. As our F/W is in LE format, we need to make sure no byte-swapping will occur. There is a standard kernel function (called memcpy_toio) for copying data to I/O area which is used in a lot of drivers to download F/W to PCIe adapters. That function also makes sure the data is copied "as-is", without byte-swapping. This patch use that function to copy the F/W to the GOYA ASIC instead of writeX macros. Signed-off-by: Ben Segal Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/firmware_if.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/drivers/misc/habanalabs/firmware_if.c b/drivers/misc/habanalabs/firmware_if.c index cc8168bacb24b..61112eda4dd22 100644 --- a/drivers/misc/habanalabs/firmware_if.c +++ b/drivers/misc/habanalabs/firmware_if.c @@ -24,7 +24,7 @@ int hl_fw_push_fw_to_device(struct hl_device *hdev, const char *fw_name, { const struct firmware *fw; const u64 *fw_data; - size_t fw_size, i; + size_t fw_size; int rc; rc = request_firmware(&fw, fw_name, hdev->dev); @@ -45,22 +45,7 @@ int hl_fw_push_fw_to_device(struct hl_device *hdev, const char *fw_name, fw_data = (const u64 *) fw->data; - if ((fw->size % 8) != 0) - fw_size -= 8; - - for (i = 0 ; i < fw_size ; i += 8, fw_data++, dst += 8) { - if (!(i & (0x80000 - 1))) { - dev_dbg(hdev->dev, - "copied so far %zu out of %zu for %s firmware", - i, fw_size, fw_name); - usleep_range(20, 100); - } - - writeq(*fw_data, dst); - } - - if ((fw->size % 8) != 0) - writel(*(const u32 *) fw_data, dst); + memcpy_toio(dst, fw_data, fw_size); out: release_firmware(fw); From 2aa4e410795cb94b6577fe0e251b5f5226499310 Mon Sep 17 00:00:00 2001 From: Ben Segal Date: Thu, 18 Jul 2019 12:27:00 +0000 Subject: [PATCH 2/2] habanalabs: fix host memory polling in BE architecture This patch fix a bug in the host memory polling macro. The bug is that the memory being polled can be written by the device, which always writes it in LE. However, if the host is running Linux in BE mode, we need to convert the value that was written by the device before matching it to the required value that the caller has given to the macro. Signed-off-by: Ben Segal Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/command_submission.c | 2 +- drivers/misc/habanalabs/firmware_if.c | 3 ++- drivers/misc/habanalabs/goya/goya.c | 5 +++-- drivers/misc/habanalabs/habanalabs.h | 16 ++++++++++++++-- 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c index 6ad83d5ef4b00..f00d1c32f6d6f 100644 --- a/drivers/misc/habanalabs/command_submission.c +++ b/drivers/misc/habanalabs/command_submission.c @@ -683,7 +683,7 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) rc = hl_poll_timeout_memory(hdev, &ctx->thread_ctx_switch_wait_token, tmp, (tmp == 1), - 100, jiffies_to_usecs(hdev->timeout_jiffies)); + 100, jiffies_to_usecs(hdev->timeout_jiffies), false); if (rc == -ETIMEDOUT) { dev_err(hdev->dev, diff --git a/drivers/misc/habanalabs/firmware_if.c b/drivers/misc/habanalabs/firmware_if.c index 61112eda4dd22..ea2ca67fbfbfa 100644 --- a/drivers/misc/habanalabs/firmware_if.c +++ b/drivers/misc/habanalabs/firmware_if.c @@ -97,7 +97,8 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, } rc = hl_poll_timeout_memory(hdev, &pkt->fence, tmp, - (tmp == ARMCP_PACKET_FENCE_VAL), 1000, timeout); + (tmp == ARMCP_PACKET_FENCE_VAL), 1000, + timeout, true); hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id); diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 1a2c062a57d4d..a0e181714891f 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -2864,7 +2864,8 @@ static int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job) } rc = hl_poll_timeout_memory(hdev, fence_ptr, tmp, - (tmp == GOYA_QMAN0_FENCE_VAL), 1000, timeout); + (tmp == GOYA_QMAN0_FENCE_VAL), 1000, + timeout, true); hl_hw_queue_inc_ci_kernel(hdev, GOYA_QUEUE_ID_DMA_0); @@ -2945,7 +2946,7 @@ int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id) } rc = hl_poll_timeout_memory(hdev, fence_ptr, tmp, (tmp == fence_val), - 1000, GOYA_TEST_QUEUE_WAIT_USEC); + 1000, GOYA_TEST_QUEUE_WAIT_USEC, true); hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id); diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index 10da9940ee0dd..6a4c64b97f386 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -1062,9 +1062,17 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); /* * address in this macro points always to a memory location in the * host's (server's) memory. That location is updated asynchronously - * either by the direct access of the device or by another core + * either by the direct access of the device or by another core. + * + * To work both in LE and BE architectures, we need to distinguish between the + * two states (device or another core updates the memory location). Therefore, + * if mem_written_by_device is true, the host memory being polled will be + * updated directly by the device. If false, the host memory being polled will + * be updated by host CPU. Required so host knows whether or not the memory + * might need to be byte-swapped before returning value to caller. */ -#define hl_poll_timeout_memory(hdev, addr, val, cond, sleep_us, timeout_us) \ +#define hl_poll_timeout_memory(hdev, addr, val, cond, sleep_us, timeout_us, \ + mem_written_by_device) \ ({ \ ktime_t __timeout; \ /* timeout should be longer when working with simulator */ \ @@ -1077,10 +1085,14 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); /* Verify we read updates done by other cores or by device */ \ mb(); \ (val) = *((u32 *) (uintptr_t) (addr)); \ + if (mem_written_by_device) \ + (val) = le32_to_cpu(val); \ if (cond) \ break; \ if (timeout_us && ktime_compare(ktime_get(), __timeout) > 0) { \ (val) = *((u32 *) (uintptr_t) (addr)); \ + if (mem_written_by_device) \ + (val) = le32_to_cpu(val); \ break; \ } \ if (sleep_us) \