Skip to content

Commit

Permalink
Merge tag 'drm-habanalabs-next-2024-02-26' of https://git.kernel.org/…
Browse files Browse the repository at this point in the history
…pub/scm/linux/kernel/git/ogabbay/linux into drm-next

This tag contains habanalabs driver and accel changes for v6.9.

The notable changes are:

- New features and improvements:
  - Configure interrupt affinity according to NUMA nodes for the MSI-X interrupts that are
    assigned to the userspace application which acquires the device.
  - Move the HBM MMU page tables to reside inside the HBM to minimize latency when doing
    page-walks.
  - Improve the device reset mechanism when consecutive heartbeat failures occur (firmware
    fails to ack the heartbeat message).
  - Also check extended errors in the PCIe addr_dec interrupt information.
  - Rate limit the error messages that can be printed to dmesg log by userspace actions.

- Firmware related fixes:
  - Handle requests from firmware to reserve device memory.

- Bug fixes and code cleanups:
  - Constify the struct device_type usage in accel (accel_sysfs_device_minor).
  - Fix the PCI health check by reading an uncached register.
  - Fix reporting of drain events.
  - Fix debugfs files permissions.
  - Fix calculation of DRAM BAR base address.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
From: Oded Gabbay <ogabbay@kernel.org>
Link: https://patchwork.freedesktop.org/patch/msgid/ZdxJprop0EniVQtf@ogabbay-vm-u22.habana-labs.com
  • Loading branch information
Daniel Vetter committed Feb 26, 2024
2 parents 19b232b + 576d7cc commit aa775ed
Show file tree
Hide file tree
Showing 21 changed files with 1,008 additions and 510 deletions.
2 changes: 1 addition & 1 deletion drivers/accel/drm_accel.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ static struct idr accel_minors_idr;

static struct dentry *accel_debugfs_root;

static struct device_type accel_sysfs_device_minor = {
static const struct device_type accel_sysfs_device_minor = {
.name = "accel_minor"
};

Expand Down
3 changes: 1 addition & 2 deletions drivers/accel/habanalabs/common/command_submission.c
Original file line number Diff line number Diff line change
Expand Up @@ -1360,9 +1360,8 @@ static int hl_cs_sanity_checks(struct hl_fpriv *hpriv, union hl_cs_args *args)
return -EINVAL;
}

if (!hl_device_operational(hdev, &status)) {
if (!hl_device_operational(hdev, &status))
return -EBUSY;
}

if ((args->in.cs_flags & HL_CS_FLAGS_STAGED_SUBMISSION) &&
!hdev->supports_staged_submission) {
Expand Down
18 changes: 9 additions & 9 deletions drivers/accel/habanalabs/common/debugfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -484,7 +484,7 @@ static ssize_t mmu_asid_va_write(struct file *file, const char __user *buf,
struct hl_debugfs_entry *entry = s->private;
struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
struct hl_device *hdev = dev_entry->hdev;
char kbuf[MMU_KBUF_SIZE];
char kbuf[MMU_KBUF_SIZE] = {0};
char *c;
ssize_t rc;

Expand Down Expand Up @@ -546,7 +546,7 @@ static ssize_t mmu_ack_error_value_write(struct file *file,
struct hl_debugfs_entry *entry = s->private;
struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
struct hl_device *hdev = dev_entry->hdev;
char kbuf[MMU_KBUF_SIZE];
char kbuf[MMU_KBUF_SIZE] = {0};
ssize_t rc;

if (count > sizeof(kbuf) - 1)
Expand Down Expand Up @@ -1643,19 +1643,19 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent
&hl_data64b_fops);

debugfs_create_file("set_power_state",
0200,
0644,
root,
dev_entry,
&hl_power_fops);

debugfs_create_file("device",
0200,
0644,
root,
dev_entry,
&hl_device_fops);

debugfs_create_file("clk_gate",
0200,
0644,
root,
dev_entry,
&hl_clk_gate_fops);
Expand All @@ -1667,13 +1667,13 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent
&hl_stop_on_err_fops);

debugfs_create_file("dump_security_violations",
0644,
0400,
root,
dev_entry,
&hl_security_violations_fops);

debugfs_create_file("dump_razwi_events",
0644,
0400,
root,
dev_entry,
&hl_razwi_check_fops);
Expand Down Expand Up @@ -1706,7 +1706,7 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent
&hdev->reset_info.skip_reset_on_timeout);

debugfs_create_file("state_dump",
0600,
0644,
root,
dev_entry,
&hl_state_dump_fops);
Expand All @@ -1724,7 +1724,7 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent

for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
debugfs_create_file(hl_debugfs_list[i].name,
0444,
0644,
root,
entry,
&hl_debugfs_fops);
Expand Down
55 changes: 45 additions & 10 deletions drivers/accel/habanalabs/common/device.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ static u64 hl_set_dram_bar(struct hl_device *hdev, u64 addr, struct pci_mem_regi
if (is_power_of_2(prop->dram_pci_bar_size))
bar_base_addr = addr & ~(prop->dram_pci_bar_size - 0x1ull);
else
bar_base_addr = DIV_ROUND_DOWN_ULL(addr, prop->dram_pci_bar_size) *
bar_base_addr = region->region_base +
div64_u64((addr - region->region_base), prop->dram_pci_bar_size) *
prop->dram_pci_bar_size;

old_base = hdev->asic_funcs->set_dram_bar_base(hdev, bar_base_addr);
Expand Down Expand Up @@ -1034,14 +1035,14 @@ static void device_early_fini(struct hl_device *hdev)

static bool is_pci_link_healthy(struct hl_device *hdev)
{
u16 vendor_id;
u16 device_id;

if (!hdev->pdev)
return false;

pci_read_config_word(hdev->pdev, PCI_VENDOR_ID, &vendor_id);
pci_read_config_word(hdev->pdev, PCI_DEVICE_ID, &device_id);

return (vendor_id == PCI_VENDOR_ID_HABANALABS);
return (device_id == hdev->pdev->device);
}

static int hl_device_eq_heartbeat_check(struct hl_device *hdev)
Expand Down Expand Up @@ -1768,14 +1769,16 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
hdev->device_cpu_disabled = false;
hdev->reset_info.hard_reset_pending = false;

/*
* Put the device in an unusable state if there are 2 back to back resets due to
* fatal errors.
*/
if (hdev->reset_info.reset_trigger_repeated &&
(hdev->reset_info.prev_reset_trigger ==
HL_DRV_RESET_FW_FATAL_ERR)) {
/* if there 2 back to back resets from FW,
* ensure driver puts the driver in a unusable state
*/
(hdev->reset_info.prev_reset_trigger == HL_DRV_RESET_FW_FATAL_ERR ||
hdev->reset_info.prev_reset_trigger ==
HL_DRV_RESET_HEARTBEAT)) {
dev_crit(hdev->dev,
"%s Consecutive FW fatal errors received, stopping hard reset\n",
"%s Consecutive fatal errors, stopping hard reset\n",
dev_name(&(hdev)->pdev->dev));
rc = -EIO;
goto out_err;
Expand Down Expand Up @@ -2801,3 +2804,35 @@ void hl_enable_err_info_capture(struct hl_error_info *captured_err_info)
atomic_set(&captured_err_info->cs_timeout.write_enable, 1);
captured_err_info->undef_opcode.write_enable = true;
}

/*
 * hl_init_cpu_for_irq() - populate the device's IRQ affinity CPU mask.
 * @hdev: habanalabs device structure.
 *
 * Compiled to an empty function unless CONFIG_NUMA is set. Otherwise, takes
 * the online CPUs of the NUMA node recorded for the PCI device and, for each
 * of them, sets the first CPU of its sibling mask in
 * hdev->irq_affinity_mask. The mask is only ever added to here; it is not
 * cleared first.
 *
 * Returns early, leaving the mask untouched, when the device has no NUMA
 * node (negative node id) or when the node has no online CPU.
 */
void hl_init_cpu_for_irq(struct hl_device *hdev)
{
#ifdef CONFIG_NUMA
	/* Scratch mask kept in static storage rather than on the stack */
	static struct cpumask node_online_cpus;
	struct cpumask *affinity = &hdev->irq_affinity_mask;
	int node = hdev->pdev->dev.numa_node;
	int cpu;

	if (node < 0)
		return;

	if (!cpumask_and(&node_online_cpus, cpumask_of_node(node), cpu_online_mask)) {
		dev_err(hdev->dev, "No available affinities in current numa node\n");
		return;
	}

	/* Keep a single representative per sibling group (drops HT siblings) */
	for_each_cpu(cpu, &node_online_cpus) {
		const struct cpumask *siblings = topology_sibling_cpumask(cpu);

		cpumask_set_cpu(cpumask_first(siblings), affinity);
	}
#endif
}

/*
 * hl_set_irq_affinity() - apply the device's affinity mask to an interrupt.
 * @hdev: habanalabs device structure.
 * @irq: interrupt number to configure.
 *
 * Does nothing (apart from a debug print) when hdev->irq_affinity_mask is
 * empty. A failure to set the affinity is logged but not propagated, since
 * the function returns void.
 */
void hl_set_irq_affinity(struct hl_device *hdev, int irq)
{
	const struct cpumask *mask = &hdev->irq_affinity_mask;
	int rc;

	if (cpumask_empty(mask)) {
		dev_dbg(hdev->dev, "affinity mask is empty\n");
		return;
	}

	rc = irq_set_affinity_and_hint(irq, mask);
	if (rc)
		dev_err(hdev->dev, "Failed setting irq %d affinity\n", irq);
}
25 changes: 11 additions & 14 deletions drivers/accel/habanalabs/common/firmware_if.c
Original file line number Diff line number Diff line change
Expand Up @@ -501,7 +501,7 @@ int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type)
0, &result);

if (rc)
dev_err(hdev->dev, "failed to unmask RAZWI IRQ %d", event_type);
dev_err(hdev->dev, "failed to unmask event %d", event_type);

return rc;
}
Expand Down Expand Up @@ -540,7 +540,7 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
total_pkt_size, 0, &result);

if (rc)
dev_err(hdev->dev, "failed to unmask IRQ array\n");
dev_err(hdev->dev, "failed to unmask event array\n");

kfree(pkt);

Expand Down Expand Up @@ -2718,18 +2718,20 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
}

rc = hl_fw_dynamic_request_descriptor(hdev, fw_loader, sizeof(struct lkd_msg_comms));
if (rc)
goto protocol_err;

if (hdev->asic_prop.support_dynamic_resereved_fw_size)
hdev->asic_prop.reserved_fw_mem_size =
le32_to_cpu(fw_loader->dynamic_loader.comm_desc.rsvd_mem_size_mb) * SZ_1M;

if (!(hdev->fw_components & FW_TYPE_BOOT_CPU)) {
struct lkd_fw_binning_info *binning_info;

rc = hl_fw_dynamic_request_descriptor(hdev, fw_loader,
sizeof(struct lkd_msg_comms));
if (rc)
goto protocol_err;

/* read preboot version */
rc = hl_fw_dynamic_read_device_fw_version(hdev, FW_COMP_PREBOOT,
fw_loader->dynamic_loader.comm_desc.cur_fw_ver);

if (rc)
return rc;

Expand All @@ -2756,11 +2758,6 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
hdev->decoder_binning, hdev->rotator_binning);
}

if (hdev->asic_prop.support_dynamic_resereved_fw_size) {
hdev->asic_prop.reserved_fw_mem_size =
le32_to_cpu(fw_loader->dynamic_loader.comm_desc.rsvd_mem_size_mb);
}

return 0;
}

Expand Down Expand Up @@ -2795,7 +2792,7 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
hdev->asic_funcs->init_cpu_scrambler_dram(hdev);

if (!(hdev->fw_components & FW_TYPE_LINUX)) {
dev_info(hdev->dev, "Skip loading Linux F/W\n");
dev_dbg(hdev->dev, "Skip loading Linux F/W\n");
return 0;
}

Expand Down
Loading

0 comments on commit aa775ed

Please sign in to comment.