Skip to content

Commit

Permalink
Merge tag 'misc-habanalabs-next-2020-05-25' of git://people.freedesktop.org/~gabbayo/linux into char-misc-next
Browse files Browse the repository at this point in the history

Oded writes:

This tag contains the following changes for kernel 5.8:

- Improve MMU cache invalidation code and handle case where the
  invalidation doesn't finish in a reasonable time.

- Remove the option to perform a soft-reset on GAUDI. A soft-reset is where
  the driver resets only the compute and DMA engines of the ASIC. This is not
  relevant to GAUDI, as we must also reset the NIC ports. And when we reset
  the NIC ports, we must also reset other parts of the ASIC, so we prefer to
  just do a hard-reset (where we reset the entire ASIC except for PCIe).

- Fail the hard-reset procedure in case we still have user processes which
  have active file-descriptors on a device. Doing a hard-reset in that case
  can result in a kernel panic because of gen_pool checks.

- Don't explicitly initialize the wait callback of dma_buf with the default
  wait function, as that is already the default.

* tag 'misc-habanalabs-next-2020-05-25' of git://people.freedesktop.org/~gabbayo/linux:
  habanalabs: handle MMU cache invalidation timeout
  habanalabs: don't allow hard reset with open processes
  habanalabs: GAUDI does not support soft-reset
  habanalabs: add print for soft reset due to event
  habanalabs: improve MMU cache invalidation code
  habanalabs: don't set default fence_ops->wait
  • Loading branch information
Greg Kroah-Hartman committed May 25, 2020
2 parents 709b41b + 8ff5f4f commit 18cbc33
Show file tree
Hide file tree
Showing 7 changed files with 126 additions and 57 deletions.
1 change: 0 additions & 1 deletion drivers/misc/habanalabs/command_submission.c
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ static const struct dma_fence_ops hl_fence_ops = {
.get_driver_name = hl_fence_get_driver_name,
.get_timeline_name = hl_fence_get_timeline_name,
.enable_signaling = hl_fence_enable_signaling,
.wait = dma_fence_default_wait,
.release = hl_fence_release
};

Expand Down
23 changes: 17 additions & 6 deletions drivers/misc/habanalabs/device.c
Original file line number Diff line number Diff line change
Expand Up @@ -726,7 +726,7 @@ int hl_device_resume(struct hl_device *hdev)
return rc;
}

static void device_kill_open_processes(struct hl_device *hdev)
static int device_kill_open_processes(struct hl_device *hdev)
{
u16 pending_total, pending_cnt;
struct hl_fpriv *hpriv;
Expand Down Expand Up @@ -779,9 +779,7 @@ static void device_kill_open_processes(struct hl_device *hdev)
ssleep(1);
}

if (!list_empty(&hdev->fpriv_list))
dev_crit(hdev->dev,
"Going to hard reset with open user contexts\n");
return list_empty(&hdev->fpriv_list) ? 0 : -EBUSY;
}

static void device_hard_reset_pending(struct work_struct *work)
Expand All @@ -801,6 +799,7 @@ static void device_hard_reset_pending(struct work_struct *work)
* @hdev: pointer to habanalabs device structure
* @hard_reset: should we do hard reset to all engines or just reset the
* compute/dma engines
* @from_hard_reset_thread: is the caller the hard-reset thread
*
* Block future CS and wait for pending CS to be enqueued
* Call ASIC H/W fini
Expand All @@ -823,6 +822,11 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
return 0;
}

if ((!hard_reset) && (!hdev->supports_soft_reset)) {
dev_dbg(hdev->dev, "Doing hard-reset instead of soft-reset\n");
hard_reset = true;
}

/*
* Prevent concurrency in this function - only one reset should be
* done at any given time. Only need to perform this if we didn't
Expand Down Expand Up @@ -902,7 +906,12 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
* process can't really exit until all its CSs are done, which
* is what we do in cs rollback
*/
device_kill_open_processes(hdev);
rc = device_kill_open_processes(hdev);
if (rc) {
dev_crit(hdev->dev,
"Failed to kill all open processes, stopping hard reset\n");
goto out_err;
}

/* Flush the Event queue workers to make sure no other thread is
* reading or writing to registers during the reset
Expand Down Expand Up @@ -1385,7 +1394,9 @@ void hl_device_fini(struct hl_device *hdev)
* can't really exit until all its CSs are done, which is what we
* do in cs rollback
*/
device_kill_open_processes(hdev);
rc = device_kill_open_processes(hdev);
if (rc)
dev_crit(hdev->dev, "Failed to kill all open processes\n");

hl_cb_pool_fini(hdev);

Expand Down
74 changes: 48 additions & 26 deletions drivers/misc/habanalabs/gaudi/gaudi.c
Original file line number Diff line number Diff line change
Expand Up @@ -5774,7 +5774,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
u16 event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK)
>> EQ_CTL_EVENT_TYPE_SHIFT);
u8 cause;
bool soft_reset_required;
bool reset_required;

gaudi->events_stat[event_type]++;
gaudi->events_stat_aggregate[event_type]++;
Expand Down Expand Up @@ -5840,12 +5840,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
case GAUDI_EVENT_TPC6_DEC:
case GAUDI_EVENT_TPC7_DEC:
gaudi_print_irq_info(hdev, event_type, true);
soft_reset_required = gaudi_tpc_read_interrupts(hdev,
reset_required = gaudi_tpc_read_interrupts(hdev,
tpc_dec_event_to_tpc_id(event_type),
"AXI_SLV_DEC_Error");
if (soft_reset_required)
hl_device_reset(hdev, false, false);
hl_fw_unmask_irq(hdev, event_type);
if (reset_required) {
dev_err(hdev->dev, "hard reset required due to %s\n",
gaudi_irq_map_table[event_type].name);

if (hdev->hard_reset_on_fw_events)
hl_device_reset(hdev, true, false);
} else {
hl_fw_unmask_irq(hdev, event_type);
}
break;

case GAUDI_EVENT_TPC0_KRN_ERR:
Expand All @@ -5857,12 +5863,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
case GAUDI_EVENT_TPC6_KRN_ERR:
case GAUDI_EVENT_TPC7_KRN_ERR:
gaudi_print_irq_info(hdev, event_type, true);
soft_reset_required = gaudi_tpc_read_interrupts(hdev,
reset_required = gaudi_tpc_read_interrupts(hdev,
tpc_krn_event_to_tpc_id(event_type),
"KRN_ERR");
if (soft_reset_required)
hl_device_reset(hdev, false, false);
hl_fw_unmask_irq(hdev, event_type);
if (reset_required) {
dev_err(hdev->dev, "hard reset required due to %s\n",
gaudi_irq_map_table[event_type].name);

if (hdev->hard_reset_on_fw_events)
hl_device_reset(hdev, true, false);
} else {
hl_fw_unmask_irq(hdev, event_type);
}
break;

case GAUDI_EVENT_PCIE_CORE_SERR:
Expand Down Expand Up @@ -5913,8 +5925,8 @@ static void gaudi_handle_eqe(struct hl_device *hdev,

case GAUDI_EVENT_RAZWI_OR_ADC_SW:
gaudi_print_irq_info(hdev, event_type, true);
hl_device_reset(hdev, false, false);
hl_fw_unmask_irq(hdev, event_type);
if (hdev->hard_reset_on_fw_events)
hl_device_reset(hdev, true, false);
break;

case GAUDI_EVENT_TPC0_BMON_SPMU:
Expand Down Expand Up @@ -5963,7 +5975,7 @@ static void *gaudi_get_events_stat(struct hl_device *hdev, bool aggregate,
return gaudi->events_stat;
}

static void gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
static int gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
u32 flags)
{
struct gaudi_device *gaudi = hdev->asic_specific;
Expand All @@ -5972,34 +5984,40 @@ static void gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,

if (!(gaudi->hw_cap_initialized & HW_CAP_MMU) ||
hdev->hard_reset_pending)
return;

mutex_lock(&hdev->mmu_cache_lock);
return 0;

if (hdev->pldm)
timeout_usec = GAUDI_PLDM_MMU_TIMEOUT_USEC;
else
timeout_usec = MMU_CONFIG_TIMEOUT_USEC;

mutex_lock(&hdev->mmu_cache_lock);

/* L0 & L1 invalidation */
WREG32(mmSTLB_INV_ALL_START, 1);
WREG32(mmSTLB_INV_PS, 2);

rc = hl_poll_timeout(
hdev,
mmSTLB_INV_ALL_START,
mmSTLB_INV_PS,
status,
!status,
1000,
timeout_usec);

if (rc)
dev_notice_ratelimited(hdev->dev,
"Timeout when waiting for MMU cache invalidation\n");
WREG32(mmSTLB_INV_SET, 0);

mutex_unlock(&hdev->mmu_cache_lock);

if (rc) {
dev_err_ratelimited(hdev->dev,
"MMU cache invalidation timeout\n");
hl_device_reset(hdev, true, false);
}

return rc;
}

static void gaudi_mmu_invalidate_cache_range(struct hl_device *hdev,
static int gaudi_mmu_invalidate_cache_range(struct hl_device *hdev,
bool is_hard, u32 asid, u64 va, u64 size)
{
struct gaudi_device *gaudi = hdev->asic_specific;
Expand All @@ -6010,7 +6028,7 @@ static void gaudi_mmu_invalidate_cache_range(struct hl_device *hdev,

if (!(gaudi->hw_cap_initialized & HW_CAP_MMU) ||
hdev->hard_reset_pending)
return;
return 0;

mutex_lock(&hdev->mmu_cache_lock);

Expand Down Expand Up @@ -6041,11 +6059,15 @@ static void gaudi_mmu_invalidate_cache_range(struct hl_device *hdev,
1000,
timeout_usec);

if (rc)
dev_notice_ratelimited(hdev->dev,
"Timeout when waiting for MMU cache invalidation\n");

mutex_unlock(&hdev->mmu_cache_lock);

if (rc) {
dev_err_ratelimited(hdev->dev,
"MMU cache invalidation timeout\n");
hl_device_reset(hdev, true, false);
}

return rc;
}

static int gaudi_mmu_update_asid_hop0_addr(struct hl_device *hdev,
Expand Down
35 changes: 22 additions & 13 deletions drivers/misc/habanalabs/goya/goya.c
Original file line number Diff line number Diff line change
Expand Up @@ -752,6 +752,7 @@ static int goya_sw_init(struct hl_device *hdev)

spin_lock_init(&goya->hw_queues_lock);
hdev->supports_coresight = true;
hdev->supports_soft_reset = true;

return 0;

Expand Down Expand Up @@ -4883,7 +4884,7 @@ static void goya_mmu_prepare(struct hl_device *hdev, u32 asid)
goya_mmu_prepare_reg(hdev, goya_mmu_regs[i], asid);
}

static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
static int goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
u32 flags)
{
struct goya_device *goya = hdev->asic_specific;
Expand All @@ -4892,11 +4893,11 @@ static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,

if (!(goya->hw_cap_initialized & HW_CAP_MMU) ||
hdev->hard_reset_pending)
return;
return 0;

/* no need in L1 only invalidation in Goya */
if (!is_hard)
return;
return 0;

if (hdev->pldm)
timeout_usec = GOYA_PLDM_MMU_TIMEOUT_USEC;
Expand All @@ -4918,25 +4919,29 @@ static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,

mutex_unlock(&hdev->mmu_cache_lock);

if (rc)
dev_notice_ratelimited(hdev->dev,
"Timeout when waiting for MMU cache invalidation\n");
if (rc) {
dev_err_ratelimited(hdev->dev,
"MMU cache invalidation timeout\n");
hl_device_reset(hdev, true, false);
}

return rc;
}

static void goya_mmu_invalidate_cache_range(struct hl_device *hdev,
bool is_hard, u32 asid, u64 va, u64 size)
static int goya_mmu_invalidate_cache_range(struct hl_device *hdev,
bool is_hard, u32 asid, u64 va, u64 size)
{
struct goya_device *goya = hdev->asic_specific;
u32 status, timeout_usec, inv_data, pi;
int rc;

if (!(goya->hw_cap_initialized & HW_CAP_MMU) ||
hdev->hard_reset_pending)
return;
return 0;

/* no need in L1 only invalidation in Goya */
if (!is_hard)
return;
return 0;

if (hdev->pldm)
timeout_usec = GOYA_PLDM_MMU_TIMEOUT_USEC;
Expand Down Expand Up @@ -4969,9 +4974,13 @@ static void goya_mmu_invalidate_cache_range(struct hl_device *hdev,

mutex_unlock(&hdev->mmu_cache_lock);

if (rc)
dev_notice_ratelimited(hdev->dev,
"Timeout when waiting for MMU cache invalidation\n");
if (rc) {
dev_err_ratelimited(hdev->dev,
"MMU cache invalidation timeout\n");
hl_device_reset(hdev, true, false);
}

return rc;
}

int goya_send_heartbeat(struct hl_device *hdev)
Expand Down
10 changes: 6 additions & 4 deletions drivers/misc/habanalabs/habanalabs.h
Original file line number Diff line number Diff line change
Expand Up @@ -675,9 +675,9 @@ struct hl_asic_funcs {
u32 *size);
u64 (*read_pte)(struct hl_device *hdev, u64 addr);
void (*write_pte)(struct hl_device *hdev, u64 addr, u64 val);
void (*mmu_invalidate_cache)(struct hl_device *hdev, bool is_hard,
int (*mmu_invalidate_cache)(struct hl_device *hdev, bool is_hard,
u32 flags);
void (*mmu_invalidate_cache_range)(struct hl_device *hdev, bool is_hard,
int (*mmu_invalidate_cache_range)(struct hl_device *hdev, bool is_hard,
u32 asid, u64 va, u64 size);
int (*send_heartbeat)(struct hl_device *hdev);
void (*enable_clock_gating)(struct hl_device *hdev);
Expand Down Expand Up @@ -755,8 +755,8 @@ struct hl_va_range {
* with huge pages.
* @dram_va_range: holds available virtual addresses for DRAM mappings.
* @mem_hash_lock: protects the mem_hash.
* @mmu_lock: protects the MMU page tables. Any change to the PGT, modifing the
* MMU hash or walking the PGT requires talking this lock
* @mmu_lock: protects the MMU page tables. Any change to the PGT, modifying the
* MMU hash or walking the PGT requires taking this lock.
* @debugfs_list: node in debugfs list of contexts.
* @cs_sequence: sequence number for CS. Value is assigned to a CS and passed
* to user so user could inquire about CS. It is used as
Expand Down Expand Up @@ -1436,6 +1436,7 @@ struct hl_device_idle_busy_ts {
* @stop_on_err: true if engines should stop on error.
* @supports_sync_stream: is sync stream supported.
* @supports_coresight: is CoreSight supported.
* @supports_soft_reset: is soft reset supported.
*/
struct hl_device {
struct pci_dev *pdev;
Expand Down Expand Up @@ -1522,6 +1523,7 @@ struct hl_device {
u8 stop_on_err;
u8 supports_sync_stream;
u8 supports_coresight;
u8 supports_soft_reset;

/* Parameters for bring-up */
u8 mmu_enable;
Expand Down
Loading

0 comments on commit 18cbc33

Please sign in to comment.