Skip to content

Commit

Permalink
habanalabs: set 4s timeout for message to device CPU
Browse files Browse the repository at this point in the history
We see that sometimes the CPU in GOYA and GAUDI is occupied by the
power/thermal loop and can't answer requests from the driver fast enough.

Therefore, to avoid false notifications on timeouts, increase the timeout
to 4 seconds on each message sent to the device CPU.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Tomer Tayar <ttayar@habana.ai>
  • Loading branch information
Oded Gabbay committed Jul 10, 2020
1 parent e38bfd3 commit 788cacf
Show file tree
Hide file tree
Showing 7 changed files with 38 additions and 30 deletions.
6 changes: 3 additions & 3 deletions drivers/misc/habanalabs/debugfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
pkt.i2c_reg = i2c_reg;

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_DEVICE_TIMEOUT_USEC, (long *) val);
0, (long *) val);

if (rc)
dev_err(hdev->dev, "Failed to read from I2C, error %d\n", rc);
Expand All @@ -63,7 +63,7 @@ static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
pkt.value = cpu_to_le64(val);

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_DEVICE_TIMEOUT_USEC, NULL);
0, NULL);

if (rc)
dev_err(hdev->dev, "Failed to write to I2C, error %d\n", rc);
Expand All @@ -87,7 +87,7 @@ static void hl_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state)
pkt.value = cpu_to_le64(state);

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_DEVICE_TIMEOUT_USEC, NULL);
0, NULL);

if (rc)
dev_err(hdev->dev, "Failed to set LED %d, error %d\n", led, rc);
Expand Down
10 changes: 5 additions & 5 deletions drivers/misc/habanalabs/firmware_if.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode)
pkt.ctl = cpu_to_le32(opcode << ARMCP_PKT_CTL_OPCODE_SHIFT);

return hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt,
sizeof(pkt), HL_DEVICE_TIMEOUT_USEC, NULL);
sizeof(pkt), 0, NULL);
}

int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
Expand Down Expand Up @@ -144,7 +144,7 @@ int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type)
pkt.value = cpu_to_le64(event_type);

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_DEVICE_TIMEOUT_USEC, &result);
0, &result);

if (rc)
dev_err(hdev->dev, "failed to unmask RAZWI IRQ %d", event_type);
Expand Down Expand Up @@ -183,7 +183,7 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
ARMCP_PKT_CTL_OPCODE_SHIFT);

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) pkt,
total_pkt_size, HL_DEVICE_TIMEOUT_USEC, &result);
total_pkt_size, 0, &result);

if (rc)
dev_err(hdev->dev, "failed to unmask IRQ array\n");
Expand All @@ -204,7 +204,7 @@ int hl_fw_test_cpu_queue(struct hl_device *hdev)
test_pkt.value = cpu_to_le64(ARMCP_PACKET_FENCE_VAL);

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &test_pkt,
sizeof(test_pkt), HL_DEVICE_TIMEOUT_USEC, &result);
sizeof(test_pkt), 0, &result);

if (!rc) {
if (result != ARMCP_PACKET_FENCE_VAL)
Expand Down Expand Up @@ -248,7 +248,7 @@ int hl_fw_send_heartbeat(struct hl_device *hdev)
hb_pkt.value = cpu_to_le64(ARMCP_PACKET_FENCE_VAL);

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt,
sizeof(hb_pkt), HL_DEVICE_TIMEOUT_USEC, &result);
sizeof(hb_pkt), 0, &result);

if ((rc) || (result != ARMCP_PACKET_FENCE_VAL))
rc = -EIO;
Expand Down
4 changes: 4 additions & 0 deletions drivers/misc/habanalabs/gaudi/gaudi.c
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@
#define GAUDI_PLDM_QMAN0_TIMEOUT_USEC (HL_DEVICE_TIMEOUT_USEC * 30)
#define GAUDI_PLDM_TPC_KERNEL_WAIT_USEC (HL_DEVICE_TIMEOUT_USEC * 30)
#define GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC 1000000 /* 1s */
#define GAUDI_MSG_TO_CPU_TIMEOUT_USEC 4000000 /* 4s */

#define GAUDI_QMAN0_FENCE_VAL 0x72E91AB9

Expand Down Expand Up @@ -3479,6 +3480,9 @@ static int gaudi_send_cpu_message(struct hl_device *hdev, u32 *msg,
return 0;
}

if (!timeout)
timeout = GAUDI_MSG_TO_CPU_TIMEOUT_USEC;

return hl_fw_send_cpu_message(hdev, GAUDI_QUEUE_ID_CPU_PQ, msg, len,
timeout, result);
}
Expand Down
12 changes: 8 additions & 4 deletions drivers/misc/habanalabs/goya/goya.c
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@
#define GOYA_PLDM_MMU_TIMEOUT_USEC (MMU_CONFIG_TIMEOUT_USEC * 100)
#define GOYA_PLDM_QMAN0_TIMEOUT_USEC (HL_DEVICE_TIMEOUT_USEC * 30)
#define GOYA_BOOT_FIT_REQ_TIMEOUT_USEC 1000000 /* 1s */
#define GOYA_MSG_TO_CPU_TIMEOUT_USEC 4000000 /* 4s */

#define GOYA_QMAN0_FENCE_VAL 0xD169B243

Expand Down Expand Up @@ -2830,6 +2831,9 @@ int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len,
return 0;
}

if (!timeout)
timeout = GOYA_MSG_TO_CPU_TIMEOUT_USEC;

return hl_fw_send_cpu_message(hdev, GOYA_QUEUE_ID_CPU_PQ, msg, len,
timeout, result);
}
Expand Down Expand Up @@ -4431,8 +4435,8 @@ static int goya_unmask_irq_arr(struct hl_device *hdev, u32 *irq_arr,
pkt->armcp_pkt.ctl = cpu_to_le32(ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY <<
ARMCP_PKT_CTL_OPCODE_SHIFT);

rc = goya_send_cpu_message(hdev, (u32 *) pkt, total_pkt_size,
HL_DEVICE_TIMEOUT_USEC, &result);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) pkt,
total_pkt_size, 0, &result);

if (rc)
dev_err(hdev->dev, "failed to unmask IRQ array\n");
Expand Down Expand Up @@ -4464,8 +4468,8 @@ static int goya_unmask_irq(struct hl_device *hdev, u16 event_type)
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.value = cpu_to_le64(event_type);

rc = goya_send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_DEVICE_TIMEOUT_USEC, &result);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
0, &result);

if (rc)
dev_err(hdev->dev, "failed to unmask RAZWI IRQ %d", event_type);
Expand Down
6 changes: 5 additions & 1 deletion drivers/misc/habanalabs/habanalabs.h
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,11 @@ enum hl_pll_frequency {
* @hw_queues_unlock: release H/W queues lock.
* @get_pci_id: retrieve PCI ID.
* @get_eeprom_data: retrieve EEPROM data from F/W.
* @send_cpu_message: send buffer to ArmCP.
* @send_cpu_message: send message to F/W. If the message is timedout, the
* driver will eventually reset the device. The timeout can
* be determined by the calling function or it can be 0 and
* then the timeout is the default timeout for the specific
* ASIC
* @get_hw_state: retrieve the H/W state
* @pci_bars_map: Map PCI BARs.
* @set_dram_bar_base: Set DRAM BAR to map specific device address. Returns
Expand Down
19 changes: 9 additions & 10 deletions drivers/misc/habanalabs/hwmon.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
#include <linux/pci.h>
#include <linux/hwmon.h>

#define SENSORS_PKT_TIMEOUT 1000000 /* 1s */
#define HWMON_NR_SENSOR_TYPES (hwmon_pwm + 1)

int hl_build_hwmon_channel_info(struct hl_device *hdev,
Expand Down Expand Up @@ -323,7 +322,7 @@ int hl_get_temperature(struct hl_device *hdev,
pkt.type = __cpu_to_le16(attr);

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
SENSORS_PKT_TIMEOUT, value);
0, value);

if (rc) {
dev_err(hdev->dev,
Expand All @@ -350,7 +349,7 @@ int hl_set_temperature(struct hl_device *hdev,
pkt.value = __cpu_to_le64(value);

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
SENSORS_PKT_TIMEOUT, NULL);
0, NULL);

if (rc)
dev_err(hdev->dev,
Expand All @@ -374,7 +373,7 @@ int hl_get_voltage(struct hl_device *hdev,
pkt.type = __cpu_to_le16(attr);

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
SENSORS_PKT_TIMEOUT, value);
0, value);

if (rc) {
dev_err(hdev->dev,
Expand All @@ -400,7 +399,7 @@ int hl_get_current(struct hl_device *hdev,
pkt.type = __cpu_to_le16(attr);

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
SENSORS_PKT_TIMEOUT, value);
0, value);

if (rc) {
dev_err(hdev->dev,
Expand All @@ -426,7 +425,7 @@ int hl_get_fan_speed(struct hl_device *hdev,
pkt.type = __cpu_to_le16(attr);

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
SENSORS_PKT_TIMEOUT, value);
0, value);

if (rc) {
dev_err(hdev->dev,
Expand All @@ -452,7 +451,7 @@ int hl_get_pwm_info(struct hl_device *hdev,
pkt.type = __cpu_to_le16(attr);

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
SENSORS_PKT_TIMEOUT, value);
0, value);

if (rc) {
dev_err(hdev->dev,
Expand All @@ -479,7 +478,7 @@ void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr,
pkt.value = cpu_to_le64(value);

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
SENSORS_PKT_TIMEOUT, NULL);
0, NULL);

if (rc)
dev_err(hdev->dev,
Expand All @@ -502,7 +501,7 @@ int hl_set_voltage(struct hl_device *hdev,
pkt.value = __cpu_to_le64(value);

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
SENSORS_PKT_TIMEOUT, NULL);
0, NULL);

if (rc)
dev_err(hdev->dev,
Expand All @@ -527,7 +526,7 @@ int hl_set_current(struct hl_device *hdev,
pkt.value = __cpu_to_le64(value);

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
SENSORS_PKT_TIMEOUT, NULL);
0, NULL);

if (rc)
dev_err(hdev->dev,
Expand Down
11 changes: 4 additions & 7 deletions drivers/misc/habanalabs/sysfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,6 @@

#include <linux/pci.h>

#define SET_CLK_PKT_TIMEOUT 1000000 /* 1s */
#define SET_PWR_PKT_TIMEOUT 1000000 /* 1s */

long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
{
struct armcp_packet pkt;
Expand All @@ -29,7 +26,7 @@ long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
pkt.pll_index = cpu_to_le32(pll_index);

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
SET_CLK_PKT_TIMEOUT, &result);
0, &result);

if (rc) {
dev_err(hdev->dev,
Expand All @@ -54,7 +51,7 @@ void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq)
pkt.value = cpu_to_le64(freq);

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
SET_CLK_PKT_TIMEOUT, NULL);
0, NULL);

if (rc)
dev_err(hdev->dev,
Expand All @@ -74,7 +71,7 @@ u64 hl_get_max_power(struct hl_device *hdev)
ARMCP_PKT_CTL_OPCODE_SHIFT);

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
SET_PWR_PKT_TIMEOUT, &result);
0, &result);

if (rc) {
dev_err(hdev->dev, "Failed to get max power, error %d\n", rc);
Expand All @@ -96,7 +93,7 @@ void hl_set_max_power(struct hl_device *hdev, u64 value)
pkt.value = cpu_to_le64(value);

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
SET_PWR_PKT_TIMEOUT, NULL);
0, NULL);

if (rc)
dev_err(hdev->dev, "Failed to set max power, error %d\n", rc);
Expand Down

0 comments on commit 788cacf

Please sign in to comment.