From 43c4d57618bef018eecd769c2805ce6f4e849a0d Mon Sep 17 00:00:00 2001 From: John Clements Date: Thu, 19 Mar 2020 14:41:55 +0800 Subject: [PATCH 01/16] drm/amdgpu: protect RAS sysfs during GPU reset MMHub EDC becomes dirty after BACO reset EDC registers should be cleared early on in reset phase Reviewed-by: Hawking Zhang Signed-off-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 +++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 6f469facabfbf..faa3e7102156c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2742,6 +2742,9 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) if (adev->asic_reset_res) goto fail; + + if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count) + adev->mmhub.funcs->reset_ras_error_count(adev); } else { task_barrier_full(&hive->tb); @@ -3910,8 +3913,15 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, } } - if (!r && amdgpu_ras_intr_triggered()) + if (!r && amdgpu_ras_intr_triggered()) { + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { + if (tmp_adev->mmhub.funcs && + tmp_adev->mmhub.funcs->reset_ras_error_count) + tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev); + } + amdgpu_ras_intr_cleared(); + } list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { if (need_full_reset) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 43055a01f35e4..3c32a94d24240 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -281,6 +281,11 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * struct ras_debug_if data; int ret = 0; + if (amdgpu_ras_intr_triggered()) { + DRM_WARN("RAS WARN: error injection currently inaccessible\n"); + return size; + } + ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data); if (ret) return -EINVAL; @@ -394,6 +399,10 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev, .head = obj->head, }; + if (amdgpu_ras_intr_triggered()) + return snprintf(buf, PAGE_SIZE, + "Query currently inaccessible\n"); + if (amdgpu_ras_error_query(obj->adev, &info)) return -EINVAL; From 02be064823d0c9aff4e5e6bc1cb8d5e1bea81d38 Mon Sep 17 00:00:00 2001 From: shaoyunl Date: Tue, 17 Mar 2020 12:46:34 -0400 Subject: [PATCH 02/16] drm/amdgpu/sriov : Don't resume RLCG for SRIOV guest RLCG is enabled by host driver, no need to enable it in guest for none-PSP load path Signed-off-by: shaoyunl Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 42bbc0070831c..c8f2aa1db13b9 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -1940,6 +1940,11 @@ static int gfx_v10_0_rlc_resume(struct amdgpu_device *adev) if (!amdgpu_sriov_vf(adev)) /* enable RLC SRM */ gfx_v10_0_rlc_enable_srm(adev); } else { + if (amdgpu_sriov_vf(adev)) { + gfx_v10_0_init_csb(adev); + return 0; + } + adev->gfx.rlc.funcs->stop(adev); /* disable CG */ From 728b3d0533a4c39ed36a33e36148544b2ec770be Mon Sep 17 00:00:00 2001 From: Zhigang Luo Date: Wed, 25 Mar 2020 15:34:21 -0400 Subject: [PATCH 03/16] Revert "drm/amdgpu: add CAP fw loading" This reverts commit 29e2501f8a64fa2fa8f6fe4be53cce5a5a4fe79f. Signed-off-by: Zhigang Luo Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 9 +-------- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 3 --- drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h | 3 +-- drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h | 1 - drivers/gpu/drm/amd/amdgpu/psp_v3_1.c | 24 ----------------------- 5 files changed, 2 insertions(+), 38 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index dc42086a672bf..be50867ea644f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -159,10 +159,6 @@ static int psp_sw_fini(void *handle) adev->psp.sos_fw = NULL; release_firmware(adev->psp.asd_fw); adev->psp.asd_fw = NULL; - if (adev->psp.cap_fw) { - release_firmware(adev->psp.cap_fw); - adev->psp.cap_fw = NULL; - } if (adev->psp.ta_fw) { release_firmware(adev->psp.ta_fw); adev->psp.ta_fw = NULL; @@ -250,7 +246,7 @@ psp_cmd_submit_buf(struct psp_context *psp, DRM_WARN("psp command (0x%X) failed and response status is (0x%X)\n", psp->cmd_buf_mem->cmd_id, psp->cmd_buf_mem->resp.status); - if ((ucode->ucode_id == AMDGPU_UCODE_ID_CAP) || !timeout) { + if (!timeout) { mutex_unlock(&psp->mutex); return -EINVAL; } @@ -1192,9 +1188,6 @@ static int psp_get_fw_type(struct amdgpu_firmware_info *ucode, enum psp_gfx_fw_type *type) { switch (ucode->ucode_id) { - case AMDGPU_UCODE_ID_CAP: - *type = GFX_FW_TYPE_CAP; - break; case AMDGPU_UCODE_ID_SDMA0: *type = GFX_FW_TYPE_SDMA0; break; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index 4a4d8f2ccca2c..297435c0c7c1a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -252,9 +252,6 @@ struct psp_context uint32_t asd_ucode_size; uint8_t *asd_start_addr; - /* cap firmware */ - const struct firmware *cap_fw; - /* fence buffer */ struct amdgpu_bo *fence_buf_bo; uint64_t fence_buf_mc_addr; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h index 88f226070229f..b0e656409c038 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h @@ -283,8 +283,7 @@ union amdgpu_firmware_header { * fw loading support */ enum AMDGPU_UCODE_ID { - AMDGPU_UCODE_ID_CAP = 0, /* CAP must be the 1st fw to be loaded */ - AMDGPU_UCODE_ID_SDMA0, + AMDGPU_UCODE_ID_SDMA0 = 0, AMDGPU_UCODE_ID_SDMA1, AMDGPU_UCODE_ID_SDMA2, AMDGPU_UCODE_ID_SDMA3, diff --git a/drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h b/drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h index 6ff9a95441101..a44fd6060d5ba 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h +++ b/drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h @@ -246,7 +246,6 @@ enum psp_gfx_fw_type { GFX_FW_TYPE_SDMA6 = 56, /* SDMA6 MI */ GFX_FW_TYPE_SDMA7 = 57, /* SDMA7 MI */ GFX_FW_TYPE_VCN1 = 58, /* VCN1 MI */ - GFX_FW_TYPE_CAP = 62, /* CAP_FW VG */ GFX_FW_TYPE_MAX }; diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v3_1.c b/drivers/gpu/drm/amd/amdgpu/psp_v3_1.c index 43896f4779b0f..735c43c7daab9 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v3_1.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v3_1.c @@ -44,7 +44,6 @@ MODULE_FIRMWARE("amdgpu/vega10_sos.bin"); MODULE_FIRMWARE("amdgpu/vega10_asd.bin"); -MODULE_FIRMWARE("amdgpu/vega10_cap.bin"); MODULE_FIRMWARE("amdgpu/vega12_sos.bin"); MODULE_FIRMWARE("amdgpu/vega12_asd.bin"); @@ -64,7 +63,6 @@ static int psp_v3_1_init_microcode(struct psp_context *psp) char fw_name[30]; int err = 0; const struct psp_firmware_header_v1_0 *hdr; - struct amdgpu_firmware_info *info = NULL; DRM_DEBUG("\n"); @@ -114,26 +112,6 @@ static int psp_v3_1_init_microcode(struct psp_context *psp) adev->psp.asd_start_addr = (uint8_t *)hdr + le32_to_cpu(hdr->header.ucode_array_offset_bytes); - if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_VEGA10) { - snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_cap.bin", - chip_name); - err = request_firmware(&adev->psp.cap_fw, fw_name, adev->dev); - if (err) - goto out; - - err = amdgpu_ucode_validate(adev->psp.cap_fw); - if (err) - goto out; - - info = &adev->firmware.ucode[AMDGPU_UCODE_ID_CAP]; - info->ucode_id = AMDGPU_UCODE_ID_CAP; - info->fw = adev->psp.cap_fw; - hdr = (const struct psp_firmware_header_v1_0 *) - adev->psp.cap_fw->data; - adev->firmware.fw_size += ALIGN( - le32_to_cpu(hdr->header.ucode_size_bytes), PAGE_SIZE); - } - return 0; out: if (err) { @@ -144,8 +122,6 @@ static int psp_v3_1_init_microcode(struct psp_context *psp) adev->psp.sos_fw = NULL; release_firmware(adev->psp.asd_fw); adev->psp.asd_fw = NULL; - release_firmware(adev->psp.cap_fw); - adev->psp.cap_fw = NULL; } return err; From c7e55879642012ff96447ee5d1ed5a11a3e41416 Mon Sep 17 00:00:00 2001 From: Yassine Oudjana Date: Tue, 17 Mar 2020 07:51:27 +0000 Subject: [PATCH 04/16] drm/[radeon|amdgpu]: Remove HAINAN board from max_sclk override check Works stable without the overrides. Signed-off-by: Yassine Oudjana Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/si_dpm.c | 1 - drivers/gpu/drm/radeon/si_dpm.c | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/si_dpm.c b/drivers/gpu/drm/amd/amdgpu/si_dpm.c index 4cb4c891120b2..0860e85a2d358 100644 --- a/drivers/gpu/drm/amd/amdgpu/si_dpm.c +++ b/drivers/gpu/drm/amd/amdgpu/si_dpm.c @@ -3439,7 +3439,6 @@ static void si_apply_state_adjust_rules(struct amdgpu_device *adev, if (adev->asic_type == CHIP_HAINAN) { if ((adev->pdev->revision == 0x81) || - (adev->pdev->revision == 0x83) || (adev->pdev->revision == 0xC3) || (adev->pdev->device == 0x6664) || (adev->pdev->device == 0x6665) || diff --git a/drivers/gpu/drm/radeon/si_dpm.c b/drivers/gpu/drm/radeon/si_dpm.c index 05e8b4d0af3fa..2cb85dbe728f0 100644 --- a/drivers/gpu/drm/radeon/si_dpm.c +++ b/drivers/gpu/drm/radeon/si_dpm.c @@ -2979,7 +2979,6 @@ static void si_apply_state_adjust_rules(struct radeon_device *rdev, if (rdev->family == CHIP_HAINAN) { if ((rdev->pdev->revision == 0x81) || - (rdev->pdev->revision == 0x83) || (rdev->pdev->revision == 0xC3) || (rdev->pdev->device == 0x6664) || (rdev->pdev->device == 0x6665) || From 2b5aed9ac3f72c9eee3342228349751d6a11a7d6 Mon Sep 17 00:00:00 2001 From: Mario Kleiner Date: Mon, 2 Mar 2020 07:17:32 +0100 Subject: [PATCH 05/16] drm/amd/display: Fix pageflip event race condition for DCN. Commit '16f17eda8bad ("drm/amd/display: Send vblank and user events at vsartup for DCN")' introduces a new way of pageflip completion handling for DCN, and some trouble. The current implementation introduces a race condition, which can cause pageflip completion events to be sent out one vblank too early, thereby confusing userspace and causing flicker: prepare_flip_isr(): 1. Pageflip programming takes the ddev->event_lock. 2. Sets acrtc->pflip_status == AMDGPU_FLIP_SUBMITTED 3. Releases ddev->event_lock. --> Deadline for surface address regs double-buffering passes on target pipe. 4. dc_commit_updates_for_stream() MMIO programs the new pageflip into hw, but too late for current vblank. => pflip_status == AMDGPU_FLIP_SUBMITTED, but flip won't complete in current vblank due to missing the double-buffering deadline by a tiny bit. 5. VSTARTUP trigger point in vblank is reached, VSTARTUP irq fires, dm_dcn_crtc_high_irq() gets called. 6. Detects pflip_status == AMDGPU_FLIP_SUBMITTED and assumes the pageflip has been completed/will complete in this vblank and sends out pageflip completion event to userspace and resets pflip_status = AMDGPU_FLIP_NONE. => Flip completion event sent out one vblank too early. This behaviour has been observed during my testing with measurement hardware a couple of time. The commit message says that the extra flip event code was added to dm_dcn_crtc_high_irq() to prevent missing to send out pageflip events in case the pflip irq doesn't fire, because the "DCH HUBP" component is clock gated and doesn't fire pflip irqs in that state. Also that this clock gating may happen if no planes are active. This suggests that the problem addressed by that commit can't happen if planes are active. The proposed solution is therefore to only execute the extra pflip completion code iff the count of active planes is zero and otherwise leave pflip completion handling to the pflip irq handler, for a more race-free experience. Note that i don't know if this fixes the problem the original commit tried to address, as i don't know what the test scenario was. It does fix the observed too early pageflip events though and points out the problem introduced. Fixes: 16f17eda8bad ("drm/amd/display: Send vblank and user events at vsartup for DCN") Reviewed-by: Nicholas Kazlauskas Signed-off-by: Mario Kleiner Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index a4256780e70e5..d3674d805a0ad 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -524,8 +524,9 @@ static void dm_dcn_crtc_high_irq(void *interrupt_params) acrtc_state = to_dm_crtc_state(acrtc->base.state); - DRM_DEBUG_VBL("crtc:%d, vupdate-vrr:%d\n", acrtc->crtc_id, - amdgpu_dm_vrr_active(acrtc_state)); + DRM_DEBUG_VBL("crtc:%d, vupdate-vrr:%d, planes:%d\n", acrtc->crtc_id, + amdgpu_dm_vrr_active(acrtc_state), + acrtc_state->active_planes); amdgpu_dm_crtc_handle_crc_irq(&acrtc->base); drm_crtc_handle_vblank(&acrtc->base); @@ -545,7 +546,18 @@ static void dm_dcn_crtc_high_irq(void *interrupt_params) &acrtc_state->vrr_params.adjust); } - if (acrtc->pflip_status == AMDGPU_FLIP_SUBMITTED) { + /* + * If there aren't any active_planes then DCH HUBP may be clock-gated. + * In that case, pageflip completion interrupts won't fire and pageflip + * completion events won't get delivered. Prevent this by sending + * pending pageflip events from here if a flip is still pending. + * + * If any planes are enabled, use dm_pflip_high_irq() instead, to + * avoid race conditions between flip programming and completion, + * which could cause too early flip completion events. + */ + if (acrtc->pflip_status == AMDGPU_FLIP_SUBMITTED && + acrtc_state->active_planes == 0) { if (acrtc->event) { drm_crtc_send_vblank_event(&acrtc->base, acrtc->event); acrtc->event = NULL; From 10cda519efafd64965de4ac3b8830b57c204f4f1 Mon Sep 17 00:00:00 2001 From: Dennis Li Date: Mon, 23 Mar 2020 13:02:39 +0800 Subject: [PATCH 06/16] drm/amdgpu: fix the coverage issue to clear ArcVPGRs Set ComputePGMRSRC1.VGPRS as 0x3f to clear all ArcVGPRs. Signed-off-by: Dennis Li Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index ba90a14089cfd..37c8231f14073 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -4306,7 +4306,7 @@ static const struct soc15_reg_entry vgpr_init_regs_arcturus[] = { { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_NUM_THREAD_X), 0x40 }, { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_NUM_THREAD_Y), 4 }, { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_NUM_THREAD_Z), 1 }, - { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_PGM_RSRC1), 0x81 }, + { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_PGM_RSRC1), 0xbf }, { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_PGM_RSRC2), 0x400000 }, /* 64KB LDS */ { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_STATIC_THREAD_MGMT_SE0), 0xffffffff }, { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_STATIC_THREAD_MGMT_SE1), 0xffffffff }, From 77bb2f204f1f0a53a602a8fd15816d6826212077 Mon Sep 17 00:00:00 2001 From: Yintian Tao Date: Mon, 23 Mar 2020 19:19:37 +0800 Subject: [PATCH 07/16] drm/scheduler: fix rare NULL ptr race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is one one corner case at dma_fence_signal_locked which will raise the NULL pointer problem just like below. ->dma_fence_signal ->dma_fence_signal_locked ->test_and_set_bit here trigger dma_fence_release happen due to the zero of fence refcount. ->dma_fence_put ->dma_fence_release ->drm_sched_fence_release_scheduled ->call_rcu here make the union fled “cb_list” at finished fence to NULL because struct rcu_head contains two pointer which is same as struct list_head cb_list Therefore, to hold the reference of finished fence at drm_sched_process_job to prevent the null pointer during finished fence dma_fence_signal [ 732.912867] BUG: kernel NULL pointer dereference, address: 0000000000000008 [ 732.914815] #PF: supervisor write access in kernel mode [ 732.915731] #PF: error_code(0x0002) - not-present page [ 732.916621] PGD 0 P4D 0 [ 732.917072] Oops: 0002 [#1] SMP PTI [ 732.917682] CPU: 7 PID: 0 Comm: swapper/7 Tainted: G OE 5.4.0-rc7 #1 [ 732.918980] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.2-0-g33fbe13 by qemu-project.org 04/01/2014 [ 732.920906] RIP: 0010:dma_fence_signal_locked+0x3e/0x100 [ 732.938569] Call Trace: [ 732.939003] [ 732.939364] dma_fence_signal+0x29/0x50 [ 732.940036] drm_sched_fence_finished+0x12/0x20 [gpu_sched] [ 732.940996] drm_sched_process_job+0x34/0xa0 [gpu_sched] [ 732.941910] dma_fence_signal_locked+0x85/0x100 [ 732.942692] dma_fence_signal+0x29/0x50 [ 732.943457] amdgpu_fence_process+0x99/0x120 [amdgpu] [ 732.944393] sdma_v4_0_process_trap_irq+0x81/0xa0 [amdgpu] v2: hold the finished fence at drm_sched_process_job instead of amdgpu_fence_process v3: resume the blank line Signed-off-by: Yintian Tao Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/scheduler/sched_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index a18eabf692e44..8e731ed0d9d9f 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -651,7 +651,9 @@ static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb) trace_drm_sched_process_job(s_fence); + dma_fence_get(&s_fence->finished); drm_sched_fence_finished(s_fence); + dma_fence_put(&s_fence->finished); wake_up_interruptible(&sched->wake_up_worker); } From 2c02b38a10fc0717fadd94aab2b7eee656a3c44a Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Fri, 20 Mar 2020 14:03:25 -0400 Subject: [PATCH 08/16] drm/amd/swSMU: add callback to set AC/DC power source (v2) This is needed to tell the SMU firmware what state is in in certain cases. DC mode does not allow overclocking for example. v2: split Evan's original patch (Alex) Bug: https://gitlab.freedesktop.org/drm/amd/issues/1043 Reviewed-by: Evan Quan Signed-off-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h | 1 + drivers/gpu/drm/amd/powerplay/smu_internal.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h index 657a6f17e91f3..323e7e61493b7 100644 --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h @@ -570,6 +570,7 @@ struct pptable_funcs { int (*override_pcie_parameters)(struct smu_context *smu); uint32_t (*get_pptable_power_limit)(struct smu_context *smu); int (*disable_umc_cdr_12gbps_workaround)(struct smu_context *smu); + int (*set_power_source)(struct smu_context *smu, enum smu_power_src_type power_src); }; int smu_load_microcode(struct smu_context *smu); diff --git a/drivers/gpu/drm/amd/powerplay/smu_internal.h b/drivers/gpu/drm/amd/powerplay/smu_internal.h index 6900877de8459..40c35bcc5a0a2 100644 --- a/drivers/gpu/drm/amd/powerplay/smu_internal.h +++ b/drivers/gpu/drm/amd/powerplay/smu_internal.h @@ -211,4 +211,7 @@ static inline int smu_send_smc_msg(struct smu_context *smu, enum smu_message_typ #define smu_disable_umc_cdr_12gbps_workaround(smu) \ ((smu)->ppt_funcs->disable_umc_cdr_12gbps_workaround ? (smu)->ppt_funcs->disable_umc_cdr_12gbps_workaround((smu)) : 0) +#define smu_set_power_source(smu, power_src) \ + ((smu)->ppt_funcs->set_power_source ? (smu)->ppt_funcs->set_power_source((smu), (power_src)) : 0) + #endif From f8c83215bfda10f50027dd799d694ca377db0995 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 20 Mar 2020 13:42:26 -0400 Subject: [PATCH 09/16] drm/amdgpu/smu11: add a helper to set the power source Add a common smu11 helper to set the AC/DC power source. Bug: https://gitlab.freedesktop.org/drm/amd/issues/1043 Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h | 3 +++ drivers/gpu/drm/amd/powerplay/smu_v11_0.c | 15 +++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h b/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h index 1c88219fe4032..674e426ed59bb 100644 --- a/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h +++ b/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h @@ -267,4 +267,7 @@ uint32_t smu_v11_0_get_max_power_limit(struct smu_context *smu); int smu_v11_0_set_performance_level(struct smu_context *smu, enum amd_dpm_forced_level level); +int smu_v11_0_set_power_source(struct smu_context *smu, + enum smu_power_src_type power_src); + #endif diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c index 4fd77c7cfc806..20174bed11ce4 100644 --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c @@ -1939,3 +1939,18 @@ int smu_v11_0_set_performance_level(struct smu_context *smu, return ret; } +int smu_v11_0_set_power_source(struct smu_context *smu, + enum smu_power_src_type power_src) +{ + int pwr_source; + + pwr_source = smu_power_get_index(smu, (uint32_t)power_src); + if (pwr_source < 0) + return -EINVAL; + + return smu_send_smc_msg_with_param(smu, + SMU_MSG_NotifyPowerSource, + pwr_source, + NULL); +} + From fa34520c953b57a3ace3cfb3bfa3b3f1c8e0d414 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 20 Mar 2020 14:07:08 -0400 Subject: [PATCH 10/16] drm/amdgpu/swSMU: use the smu11 power source helper for navi1x The smu_v11_0 version works for navi1x. Bug: https://gitlab.freedesktop.org/drm/amd/issues/1043 Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c index d66dfa7410b6d..a23eaac280951 100644 --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c @@ -2369,6 +2369,7 @@ static const struct pptable_funcs navi10_ppt_funcs = { .get_pptable_power_limit = navi10_get_pptable_power_limit, .run_btc = navi10_run_btc, .disable_umc_cdr_12gbps_workaround = navi10_disable_umc_cdr_12gbps_workaround, + .set_power_source = smu_v11_0_set_power_source, }; void navi10_set_ppt_funcs(struct smu_context *smu) From 66c2f5db1fbdc987d23149082720bcbe46f41aa2 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Fri, 20 Mar 2020 14:09:21 -0400 Subject: [PATCH 11/16] drm/amdgpu/swSMU: correct the bootup power source for Navi1X (v2) PMFW may boots those ASICs with DC mode. Need to set it back to AC mode. v2: split from Evan's original patch (Alex) Bug: https://gitlab.freedesktop.org/drm/amd/issues/1043 Reviewed-by: Evan Quan Signed-off-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index f6d4b0ef46ad9..2cfb911ab370d 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -1154,6 +1154,21 @@ static int smu_smc_table_hw_init(struct smu_context *smu, } } } + + if (adev->asic_type >= CHIP_NAVI10 && + adev->asic_type <= CHIP_NAVI12) { + /* + * For Navi1X, manually switch it to AC mode as PMFW + * may boot it with DC mode. + * TODO: should check whether we are indeed under AC + * mode before doing this. + */ + ret = smu_set_power_source(smu, SMU_POWER_SOURCE_AC); + if (ret) { + pr_err("Failed to switch to AC mode!\n"); + return ret; + } + } } if (adev->asic_type != CHIP_ARCTURUS) { ret = smu_notify_display_change(smu); From 75610fdd38d9d158385049c9f80547a2253c65ea Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 20 Mar 2020 12:45:09 -0400 Subject: [PATCH 12/16] drm/amdgpu/swSMU: set AC/DC mode based on the current system state (v2) Check of the pointer exists and we are actually on AC power. v2: fix error message to reflect AC/DC mode. Bug: https://gitlab.freedesktop.org/drm/amd/issues/1043 Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 2cfb911ab370d..9a9eb23d85403 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -1155,17 +1155,17 @@ static int smu_smc_table_hw_init(struct smu_context *smu, } } - if (adev->asic_type >= CHIP_NAVI10 && - adev->asic_type <= CHIP_NAVI12) { + if (smu->ppt_funcs->set_power_source) { /* * For Navi1X, manually switch it to AC mode as PMFW * may boot it with DC mode. - * TODO: should check whether we are indeed under AC - * mode before doing this. */ - ret = smu_set_power_source(smu, SMU_POWER_SOURCE_AC); + if (adev->pm.ac_power) + ret = smu_set_power_source(smu, SMU_POWER_SOURCE_AC); + else + ret = smu_set_power_source(smu, SMU_POWER_SOURCE_DC); if (ret) { - pr_err("Failed to switch to AC mode!\n"); + pr_err("Failed to switch to %s mode!\n", adev->pm.ac_power ? "AC" : "DC"); return ret; } } From f5cdd2bdd9ba08e469f655c5c6175e3e3ffb38b9 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 20 Mar 2020 12:37:23 -0400 Subject: [PATCH 13/16] drm/amdgpu/swSMU: handle DC controlled by GPIO for navi1x Check the platform caps in the vbios pptable to decide whether to enable automatic AC/DC transitions. Bug: https://gitlab.freedesktop.org/drm/amd/issues/1043 Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h | 1 + drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h index 323e7e61493b7..18172dfec947c 100644 --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h @@ -408,6 +408,7 @@ struct smu_context uint32_t smc_if_version; bool uploading_custom_pp_table; + bool dc_controlled_by_gpio; }; struct i2c_adapter; diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c index a23eaac280951..9c60b38ab53a8 100644 --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c @@ -347,7 +347,6 @@ navi10_get_allowed_feature_mask(struct smu_context *smu, | FEATURE_MASK(FEATURE_DS_DCEFCLK_BIT) | FEATURE_MASK(FEATURE_FW_DSTATE_BIT) | FEATURE_MASK(FEATURE_BACO_BIT) - | FEATURE_MASK(FEATURE_ACDC_BIT) | FEATURE_MASK(FEATURE_GFX_SS_BIT) | FEATURE_MASK(FEATURE_APCC_DFLL_BIT) | FEATURE_MASK(FEATURE_FW_CTF_BIT) @@ -391,6 +390,9 @@ navi10_get_allowed_feature_mask(struct smu_context *smu, if (smu->adev->pg_flags & AMD_PG_SUPPORT_JPEG) *(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_JPEG_PG_BIT); + if (smu->dc_controlled_by_gpio) + *(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_ACDC_BIT); + /* disable DPM UCLK and DS SOCCLK on navi10 A0 secure board */ if (is_asic_secure(smu)) { /* only for navi10 A0 */ @@ -525,6 +527,9 @@ static int navi10_store_powerplay_table(struct smu_context *smu) table_context->thermal_controller_type = powerplay_table->thermal_controller_type; + if (powerplay_table->platform_caps & SMU_11_0_PP_PLATFORM_CAP_HARDWAREDC) + smu->dc_controlled_by_gpio = true; + mutex_lock(&smu_baco->mutex); if (powerplay_table->platform_caps & SMU_11_0_PP_PLATFORM_CAP_BACO || powerplay_table->platform_caps & SMU_11_0_PP_PLATFORM_CAP_MACO) From 9644bf5f4ab8d691c46e53f67b825c90d97e7219 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 20 Mar 2020 13:03:12 -0400 Subject: [PATCH 14/16] drm/amdgpu/swSMU: handle manual AC/DC notifications For boards that do not support automatic AC/DC transitions in firmware, manually tell the firmware when the status changes. Bug: https://gitlab.freedesktop.org/drm/amd/issues/1043 Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c | 3 +++ drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 23 +++++++++++++++++++ .../gpu/drm/amd/powerplay/inc/amdgpu_smu.h | 1 + 3 files changed, 27 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c index bc3cf04a1a94a..f197f1be09690 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c @@ -92,6 +92,9 @@ void amdgpu_pm_acpi_event_handler(struct amdgpu_device *adev) if (adev->powerplay.pp_funcs->enable_bapm) amdgpu_dpm_enable_bapm(adev, adev->pm.ac_power); mutex_unlock(&adev->pm.mutex); + + if (is_support_sw_smu(adev)) + smu_set_ac_dc(&adev->smu); } } diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 9a9eb23d85403..e8b27fab6aa1d 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -2087,6 +2087,29 @@ int smu_set_watermarks_for_clock_ranges(struct smu_context *smu, return 0; } +int smu_set_ac_dc(struct smu_context *smu) +{ + int ret = 0; + + /* controlled by firmware */ + if (smu->dc_controlled_by_gpio) + return 0; + + mutex_lock(&smu->mutex); + if (smu->ppt_funcs->set_power_source) { + if (smu->adev->pm.ac_power) + ret = smu_set_power_source(smu, SMU_POWER_SOURCE_AC); + else + ret = smu_set_power_source(smu, SMU_POWER_SOURCE_DC); + if (ret) + pr_err("Failed to switch to %s mode!\n", + smu->adev->pm.ac_power ? "AC" : "DC"); + } + mutex_unlock(&smu->mutex); + + return ret; +} + const struct amd_ip_funcs smu_ip_funcs = { .name = "smu", .early_init = smu_early_init, diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h index 18172dfec947c..ae2c318dd6fac 100644 --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h @@ -720,6 +720,7 @@ int smu_get_dpm_level_range(struct smu_context *smu, enum smu_clk_type clk_type, enum amd_dpm_forced_level smu_get_performance_level(struct smu_context *smu); int smu_force_performance_level(struct smu_context *smu, enum amd_dpm_forced_level level); int smu_set_display_count(struct smu_context *smu, uint32_t count); +int smu_set_ac_dc(struct smu_context *smu); bool smu_clk_dpm_is_enabled(struct smu_context *smu, enum smu_clk_type clk_type); const char *smu_get_message_name(struct smu_context *smu, enum smu_message_type type); const char *smu_get_feature_name(struct smu_context *smu, enum smu_feature_mask feature); From e1188aacad17302510d9235adcfbc6d45e5dde7c Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 20 Mar 2020 13:45:59 -0400 Subject: [PATCH 15/16] drm/amdgpu/smu11: add support for SMU AC/DC interrupts Driver needs to send the ack message when it receives the AC/DC interrupt from the SMU. TODO: verify the client and src ids. Bug: https://gitlab.freedesktop.org/drm/amd/issues/1043 Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/smu_v11_0.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c index 20174bed11ce4..d19e1d0d56c0a 100644 --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c @@ -1525,6 +1525,13 @@ int smu_v11_0_set_xgmi_pstate(struct smu_context *smu, return ret; } +static int smu_v11_0_ack_ac_dc_interrupt(struct smu_context *smu) +{ + return smu_send_smc_msg(smu, + SMU_MSG_ReenableAcDcInterrupt, + NULL); +} + #define THM_11_0__SRCID__THM_DIG_THERM_L2H 0 /* ASIC_TEMP > CG_THERMAL_INT.DIG_THERM_INTH */ #define THM_11_0__SRCID__THM_DIG_THERM_H2L 1 /* ASIC_TEMP < CG_THERMAL_INT.DIG_THERM_INTL */ @@ -1558,6 +1565,9 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev, break; } + } else if (client_id == SOC15_IH_CLIENTID_MP1) { + if (src_id == 0xfe) + smu_v11_0_ack_ac_dc_interrupt(&adev->smu); } return 0; @@ -1597,6 +1607,12 @@ int smu_v11_0_register_irq_handler(struct smu_context *smu) if (ret) return ret; + ret = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_MP1, + 0xfe, + irq_src); + if (ret) + return ret; + return ret; } From e862b08b4650be6d5196c191baceff3c43dfddef Mon Sep 17 00:00:00 2001 From: Monk Liu Date: Wed, 4 Mar 2020 15:21:22 +0800 Subject: [PATCH 16/16] drm/amdgpu: don't try to reserve training bo for sriov (v2) 1) SRIOV guest KMD doesn't care training buffer 2) if we resered training buffer that will overlap with IP discovery reservation because training buffer is at vram_size - 0x8000 and IP discovery is at ()vram_size - 0x10000 => vram_size -1) v2: squash in warning fix from Nirmoy Signed-off-by: Monk Liu Reviewed-by: Emily Deng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index c10ae1cdc1b95..cd6cac4c136a7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1840,9 +1840,11 @@ int amdgpu_ttm_init(struct amdgpu_device *adev) *The reserved vram for memory training must be pinned to the specified *place on the VRAM, so reserve it early. */ - r = amdgpu_ttm_training_reserve_vram_init(adev); - if (r) - return r; + if (!amdgpu_sriov_vf(adev)) { + r = amdgpu_ttm_training_reserve_vram_init(adev); + if (r) + return r; + } /* allocate memory as required for VGA * This is used for VGA emulation and pre-OS scanout buffers to