Skip to content

Commit

Permalink
Merge tag 'amd-drm-fixes-6.0-2022-08-17' of https://gitlab.freedeskto…
Browse files Browse the repository at this point in the history
…p.org/agd5f/linux into drm-fixes

amd-drm-fixes-6.0-2022-08-17:

amdgpu:
- Revert some DML stack changes
- Rounding fixes in KFD allocations
- atombios vram info table parsing fix
- DCN 3.1.4 fixes
- Clockgating fixes for various new IPs
- SMU 13.0.4 fixes
- DCN 3.1.4 FP fixes
- TMDS fixes for YCbCr420 4k modes
- DCN 3.2.x fixes
- USB 4 fixes
- SMU 13.0 fixes
- SMU driver unload memory leak fixes
- Display orientation fix
- Regression fix for generic fbdev conversion
- SDMA 6.x fixes
- SR-IOV fixes
- IH 6.x fixes
- Use after free fix in bo list handling
- Revert pipe1 support
- XGMI hive reset fix

amdkfd:
- Fix potential crash in kfd_create_indirect_link_prop()

Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220818025206.6463-1-alexander.deucher@amd.com
  • Loading branch information
Dave Airlie committed Aug 18, 2022
2 parents 2ae6ab9 + 085292c commit b1fb6b8
Showing 108 changed files with 1,758 additions and 1,184 deletions.
45 changes: 14 additions & 31 deletions drivers/gpu/drm/amd/amdgpu/aldebaran.c
Original file line number Diff line number Diff line change
@@ -148,38 +148,30 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
struct amdgpu_reset_context *reset_context)
{
struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
struct list_head *reset_device_list = reset_context->reset_device_list;
struct amdgpu_device *tmp_adev = NULL;
struct list_head reset_device_list;
int r = 0;

dev_dbg(adev->dev, "aldebaran perform hw reset\n");

if (reset_device_list == NULL)
return -EINVAL;

if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2) &&
reset_context->hive == NULL) {
/* Wrong context, return error */
return -EINVAL;
}

INIT_LIST_HEAD(&reset_device_list);
if (reset_context->hive) {
list_for_each_entry (tmp_adev,
&reset_context->hive->device_list,
gmc.xgmi.head)
list_add_tail(&tmp_adev->reset_list,
&reset_device_list);
} else {
list_add_tail(&reset_context->reset_req_dev->reset_list,
&reset_device_list);
}

list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
mutex_lock(&tmp_adev->reset_cntl->reset_lock);
tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_MODE2;
}
/*
* Mode2 reset doesn't need any sync between nodes in XGMI hive, instead launch
* them together so that they can be completed asynchronously on multiple nodes
*/
list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
/* For XGMI run all resets in parallel to speed up the process */
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
if (!queue_work(system_unbound_wq,
@@ -197,7 +189,7 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,

/* For XGMI wait for all resets to complete before proceed */
if (!r) {
list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
flush_work(&tmp_adev->reset_cntl->reset_work);
r = tmp_adev->asic_reset_res;
@@ -207,7 +199,7 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
}
}

list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
mutex_unlock(&tmp_adev->reset_cntl->reset_lock);
tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_NONE;
}
@@ -339,30 +331,21 @@ static int
aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
struct amdgpu_reset_context *reset_context)
{
struct list_head *reset_device_list = reset_context->reset_device_list;
struct amdgpu_device *tmp_adev = NULL;
struct list_head reset_device_list;
int r;

if (reset_device_list == NULL)
return -EINVAL;

if (reset_context->reset_req_dev->ip_versions[MP1_HWIP][0] ==
IP_VERSION(13, 0, 2) &&
reset_context->hive == NULL) {
/* Wrong context, return error */
return -EINVAL;
}

INIT_LIST_HEAD(&reset_device_list);
if (reset_context->hive) {
list_for_each_entry (tmp_adev,
&reset_context->hive->device_list,
gmc.xgmi.head)
list_add_tail(&tmp_adev->reset_list,
&reset_device_list);
} else {
list_add_tail(&reset_context->reset_req_dev->reset_list,
&reset_device_list);
}

list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
dev_info(tmp_adev->dev,
"GPU reset succeeded, trying to resume\n");
r = aldebaran_mode2_restore_ip(tmp_adev);
2 changes: 1 addition & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu.h
Original file line number Diff line number Diff line change
@@ -317,7 +317,7 @@ enum amdgpu_kiq_irq {
AMDGPU_CP_KIQ_IRQ_DRIVER0 = 0,
AMDGPU_CP_KIQ_IRQ_LAST
};

#define SRIOV_USEC_TIMEOUT 1200000 /* wait 12 * 100ms for SRIOV */
#define MAX_KIQ_REG_WAIT 5000 /* in usecs, 5ms */
#define MAX_KIQ_REG_BAILOUT_INTERVAL 5 /* in msecs, 5ms */
#define MAX_KIQ_REG_TRY 1000
1 change: 1 addition & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
Original file line number Diff line number Diff line change
@@ -96,6 +96,7 @@ struct amdgpu_amdkfd_fence {
/*
 * KFD-related state kept per amdgpu device.
 * NOTE(review): appears to be the type of the adev->kfd member - confirm
 * against struct amdgpu_device.
 */
struct amdgpu_kfd_dev {
struct kfd_dev *dev;
/*
 * Running total of VRAM reserved through the KFD memory-limit accounting,
 * adjusted by the exact (unrounded) request size on reserve/unreserve.
 */
uint64_t vram_used;
/*
 * Same accounting as vram_used, but each reservation is rounded up to the
 * 2MB availability alignment before being added or subtracted. This is the
 * value subtracted when computing how much VRAM is still available.
 */
uint64_t vram_used_aligned;
bool init_complete;
struct work_struct reset_work;
};
21 changes: 12 additions & 9 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
Original file line number Diff line number Diff line change
@@ -40,10 +40,10 @@
#define AMDGPU_USERPTR_RESTORE_DELAY_MS 1

/*
* Align VRAM allocations to 2MB to avoid fragmentation caused by 4K allocations in the tail 2MB
* Align VRAM availability to 2MB to avoid fragmentation caused by 4K allocations in the tail 2MB
* BO chunk
*/
#define VRAM_ALLOCATION_ALIGN (1 << 21)
#define VRAM_AVAILABLITY_ALIGN (1 << 21)

/* Impose limit on how much memory KFD can use */
static struct {
@@ -149,7 +149,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
* to avoid fragmentation caused by 4K allocations in the tail
* 2M BO chunk.
*/
vram_needed = ALIGN(size, VRAM_ALLOCATION_ALIGN);
vram_needed = size;
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
system_mem_needed = size;
} else if (!(alloc_flag &
@@ -182,8 +182,10 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
*/
WARN_ONCE(vram_needed && !adev,
"adev reference can't be null when vram is used");
if (adev)
if (adev) {
adev->kfd.vram_used += vram_needed;
adev->kfd.vram_used_aligned += ALIGN(vram_needed, VRAM_AVAILABLITY_ALIGN);
}
kfd_mem_limit.system_mem_used += system_mem_needed;
kfd_mem_limit.ttm_mem_used += ttm_mem_needed;

@@ -203,8 +205,10 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
WARN_ONCE(!adev,
"adev reference can't be null when alloc mem flags vram is set");
if (adev)
adev->kfd.vram_used -= ALIGN(size, VRAM_ALLOCATION_ALIGN);
if (adev) {
adev->kfd.vram_used -= size;
adev->kfd.vram_used_aligned -= ALIGN(size, VRAM_AVAILABLITY_ALIGN);
}
} else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
kfd_mem_limit.system_mem_used -= size;
} else if (!(alloc_flag &
@@ -1608,15 +1612,14 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev)
uint64_t reserved_for_pt =
ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
size_t available;

spin_lock(&kfd_mem_limit.mem_limit_lock);
available = adev->gmc.real_vram_size
- adev->kfd.vram_used
- adev->kfd.vram_used_aligned
- atomic64_read(&adev->vram_pin_size)
- reserved_for_pt;
spin_unlock(&kfd_mem_limit.mem_limit_lock);

return ALIGN_DOWN(available, VRAM_ALLOCATION_ALIGN);
return ALIGN_DOWN(available, VRAM_AVAILABLITY_ALIGN);
}

int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
2 changes: 1 addition & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
Original file line number Diff line number Diff line change
@@ -314,7 +314,7 @@ amdgpu_atomfirmware_get_vram_info(struct amdgpu_device *adev,
mem_channel_number = vram_info->v30.channel_num;
mem_channel_width = vram_info->v30.channel_width;
if (vram_width)
*vram_width = mem_channel_number * mem_channel_width;
*vram_width = mem_channel_number * (1 << mem_channel_width);
break;
default:
return -EINVAL;
8 changes: 2 additions & 6 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
Original file line number Diff line number Diff line change
@@ -837,16 +837,12 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
continue;

r = amdgpu_vm_bo_update(adev, bo_va, false);
if (r) {
mutex_unlock(&p->bo_list->bo_list_mutex);
if (r)
return r;
}

r = amdgpu_sync_fence(&p->job->sync, bo_va->last_pt_update);
if (r) {
mutex_unlock(&p->bo_list->bo_list_mutex);
if (r)
return r;
}
}

r = amdgpu_vm_handle_moved(adev, vm);
5 changes: 3 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
Original file line number Diff line number Diff line change
@@ -1705,7 +1705,7 @@ static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
{
struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
char reg_offset[11];
uint32_t *new, *tmp = NULL;
uint32_t *new = NULL, *tmp = NULL;
int ret, i = 0, len = 0;

do {
@@ -1747,7 +1747,8 @@ static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
ret = size;

error_free:
kfree(tmp);
if (tmp != new)
kfree(tmp);
kfree(new);
return ret;
}
2 changes: 2 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
Original file line number Diff line number Diff line change
@@ -4742,6 +4742,8 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
reset_list);
amdgpu_reset_reg_dumps(tmp_adev);

reset_context->reset_device_list = device_list_handle;
r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
/* If reset handler not implemented, continue; otherwise return */
if (r == -ENOSYS)
4 changes: 0 additions & 4 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
Original file line number Diff line number Diff line change
@@ -272,10 +272,6 @@ void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
/* Signal all jobs not yet scheduled */
for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
struct drm_sched_rq *rq = &sched->sched_rq[i];

if (!rq)
continue;

spin_lock(&rq->lock);
list_for_each_entry(s_entity, &rq->entities, list) {
while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
1 change: 1 addition & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
Original file line number Diff line number Diff line change
@@ -37,6 +37,7 @@ struct amdgpu_reset_context {
struct amdgpu_device *reset_req_dev;
struct amdgpu_job *job;
struct amdgpu_hive_info *hive;
struct list_head *reset_device_list;
unsigned long flags;
};

Loading

0 comments on commit b1fb6b8

Please sign in to comment.