Skip to content

Commit

Permalink
Merge tag 'amd-drm-next-6.8-2023-12-01' of https://gitlab.freedesktop…
Browse files Browse the repository at this point in the history
….org/agd5f/linux into drm-next

amd-drm-next-6.8-2023-12-01:

amdgpu:
- Add new 64 bit sequence number infrastructure.
  This will ultimately be used for user queue synchronization.
- GPUVM updates
- Misc code cleanups
- RAS updates
- DCN 3.5 updates
- Rework PCIe link speed handling
- Document GPU reset types
- DMUB fixes
- eDP fixes
- NBIO 7.9 updates
- NBIO 7.11 updates
- SubVP updates
- DCN 3.1.4 fixes
- ABM fixes
- AGP aperture fix
- DCN 3.1.5 fix
- Fix some potential error path memory leaks
- Enable PCIe PMEs
- Add XGMI, PCIe state dumping for aqua vanjaram
- GFX11 golden register updates
- Misc display fixes

amdkfd:
- Migrate TLB flushing logic to amdgpu
- Trap handler fixes
- Fix restore workers handling on suspend and reset
- Fix possible memory leak in pqm_uninit()

radeon:
- Fix some possible overflows in command buffer checking
- Check for errors in ring_lock

From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20231201181743.5313-1-alexander.deucher@amd.com
Signed-off-by: Dave Airlie <airlied@redhat.com>
  • Loading branch information
Dave Airlie committed Dec 5, 2023
2 parents a13fee3 + b719a9c commit 5edfd7d
Show file tree
Hide file tree
Showing 222 changed files with 3,434 additions and 1,524 deletions.
41 changes: 41 additions & 0 deletions Documentation/gpu/amdgpu/display/dc-debug.rst
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,44 @@ change in real-time by using something like::

When reporting a bug related to DC, consider attaching this log before and
after you reproduce the bug.

DMUB Firmware Debug
===================

Sometimes, dmesg logs aren't enough. This is especially true if a feature is
implemented primarily in DMUB firmware. In such cases, all we see in dmesg when
an issue arises is some generic timeout error. So, to get more relevant
information, we can trace DMUB commands by enabling the relevant bits in
`amdgpu_dm_dmub_trace_mask`.

Currently, we support the tracing of the following groups:

Trace Groups
------------

.. csv-table::
:header-rows: 1
:widths: 1, 1
:file: ./trace-groups-table.csv

**Note: Not all ASICs support all of the listed trace groups**

So, to enable just PSR tracing you can use the following command::

# echo 0x8020 > /sys/kernel/debug/dri/0/amdgpu_dm_dmub_trace_mask

Then, you need to enable logging trace events to the buffer, which you can do
using the following::

# echo 1 > /sys/kernel/debug/dri/0/amdgpu_dm_dmcub_trace_event_en

Lastly, after you are able to reproduce the issue you are trying to debug,
you can disable tracing and read the trace log by using the following::

# echo 0 > /sys/kernel/debug/dri/0/amdgpu_dm_dmcub_trace_event_en
# cat /sys/kernel/debug/dri/0/amdgpu_dm_dmub_tracebuffer

So, when reporting bugs related to features such as PSR and ABM, consider
enabling the relevant bits in the mask before reproducing the issue and
attach the log that you obtain from the trace buffer in any bug reports that you
create.
29 changes: 29 additions & 0 deletions Documentation/gpu/amdgpu/display/trace-groups-table.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
Name, Mask Value
INFO, 0x1
IRQ SVC, 0x2
VBIOS, 0x4
REGISTER, 0x8
PHY DBG, 0x10
PSR, 0x20
AUX, 0x40
SMU, 0x80
MALL, 0x100
ABM, 0x200
ALPM, 0x400
TIMER, 0x800
HW LOCK MGR, 0x1000
INBOX1, 0x2000
PHY SEQ, 0x4000
PSR STATE, 0x8000
ZSTATE, 0x10000
TRANSMITTER CTL, 0x20000
PANEL CNTL, 0x40000
FAMS, 0x80000
DPIA, 0x100000
SUBVP, 0x200000
INBOX0, 0x400000
SDP, 0x4000000
REPLAY, 0x8000000
REPLAY RESIDENCY, 0x20000000
CURSOR INFO, 0x80000000
IPS, 0x100000000
2 changes: 1 addition & 1 deletion drivers/gpu/drm/amd/amdgpu/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \
amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
amdgpu_fw_attestation.o amdgpu_securedisplay.o \
amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \
amdgpu_ring_mux.o amdgpu_xcp.o
amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o

amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o

Expand Down
35 changes: 35 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@
#include "amdgpu_mca.h"
#include "amdgpu_ras.h"
#include "amdgpu_xcp.h"
#include "amdgpu_seq64.h"
#include "amdgpu_reg_state.h"

#define MAX_GPU_INSTANCE 64

Expand Down Expand Up @@ -468,6 +470,7 @@ struct amdgpu_fpriv {
struct amdgpu_vm vm;
struct amdgpu_bo_va *prt_va;
struct amdgpu_bo_va *csa_va;
struct amdgpu_bo_va *seq64_va;
struct mutex bo_list_lock;
struct idr bo_list_handles;
struct amdgpu_ctx_mgr ctx_mgr;
Expand Down Expand Up @@ -506,6 +509,31 @@ struct amdgpu_allowed_register_entry {
bool grbm_indexed;
};

/**
* enum amd_reset_method - Methods for resetting AMD GPU devices
*
* @AMD_RESET_METHOD_NONE: The device will not be reset.
* @AMD_RESET_LEGACY: Method reserved for SI, CIK and VI ASICs.
* @AMD_RESET_MODE0: Reset the entire ASIC. Not currently available for the
* any device.
* @AMD_RESET_MODE1: Resets all IP blocks on the ASIC (SDMA, GFX, VCN, etc.)
* individually. Suitable only for some discrete GPU, not
* available for all ASICs.
* @AMD_RESET_MODE2: Resets a lesser level of IPs compared to MODE1. Which IPs
* are reset depends on the ASIC. Notably doesn't reset IPs
* shared with the CPU on APUs or the memory controllers (so
* VRAM is not lost). Not available on all ASICs.
* @AMD_RESET_BACO: BACO (Bus Alive, Chip Off) method powers off and on the card
* but without powering off the PCI bus. Suitable only for
* discrete GPUs.
* @AMD_RESET_PCI: Does a full bus reset using core Linux subsystem PCI reset
* and does a secondary bus reset or FLR, depending on what the
* underlying hardware supports.
*
* Methods available for AMD GPU driver for resetting the device. Not all
* methods are suitable for every device. User can override the method using
* module parameter `reset_method`.
*/
enum amd_reset_method {
AMD_RESET_METHOD_NONE = -1,
AMD_RESET_METHOD_LEGACY = 0,
Expand Down Expand Up @@ -585,6 +613,10 @@ struct amdgpu_asic_funcs {
const struct amdgpu_video_codecs **codecs);
/* encode "> 32bits" smn addressing */
u64 (*encode_ext_smn_addressing)(int ext_id);

ssize_t (*get_reg_state)(struct amdgpu_device *adev,
enum amdgpu_reg_state reg_state, void *buf,
size_t max_size);
};

/*
Expand Down Expand Up @@ -986,6 +1018,9 @@ struct amdgpu_device {
/* GDS */
struct amdgpu_gds gds;

/* for userq and VM fences */
struct amdgpu_seq64 seq64;

/* KFD */
struct amdgpu_kfd_dev kfd;

Expand Down
31 changes: 1 addition & 30 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
Original file line number Diff line number Diff line change
Expand Up @@ -547,7 +547,7 @@ int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct amdgpu_device *dst,
struct amdgpu_device *adev = dst, *peer_adev;
int num_links;

if (adev->asic_type != CHIP_ALDEBARAN)
if (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(9, 4, 2))
return 0;

if (src)
Expand Down Expand Up @@ -710,35 +710,6 @@ bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid)
return false;
}

int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct amdgpu_device *adev,
uint16_t vmid)
{
if (adev->family == AMDGPU_FAMILY_AI) {
int i;

for_each_set_bit(i, adev->vmhubs_mask, AMDGPU_MAX_VMHUBS)
amdgpu_gmc_flush_gpu_tlb(adev, vmid, i, 0);
} else {
amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB(0), 0);
}

return 0;
}

int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
uint16_t pasid,
enum TLB_FLUSH_TYPE flush_type,
uint32_t inst)
{
bool all_hub = false;

if (adev->family == AMDGPU_FAMILY_AI ||
adev->family == AMDGPU_FAMILY_RV)
all_hub = true;

return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub, inst);
}

bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev)
{
return adev->have_atomics_support;
Expand Down
5 changes: 0 additions & 5 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
Original file line number Diff line number Diff line change
Expand Up @@ -162,11 +162,6 @@ int amdgpu_amdkfd_submit_ib(struct amdgpu_device *adev,
uint32_t *ib_cmd, uint32_t ib_len);
void amdgpu_amdkfd_set_compute_idle(struct amdgpu_device *adev, bool idle);
bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev);
int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct amdgpu_device *adev,
uint16_t vmid);
int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
uint16_t pasid, enum TLB_FLUSH_TYPE flush_type,
uint32_t inst);

bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid);

Expand Down
2 changes: 1 addition & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ int kgd_arcturus_hqd_sdma_dump(struct amdgpu_device *adev,
#undef HQD_N_REGS
#define HQD_N_REGS (19+6+7+10)

*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
*dump = kmalloc_array(HQD_N_REGS, sizeof(**dump), GFP_KERNEL);
if (*dump == NULL)
return -ENOMEM;

Expand Down
2 changes: 1 addition & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ static int kgd_gfx_v9_4_3_hqd_sdma_dump(struct amdgpu_device *adev,
(*dump)[i++][1] = RREG32(addr); \
} while (0)

*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
*dump = kmalloc_array(HQD_N_REGS, sizeof(**dump), GFP_KERNEL);
if (*dump == NULL)
return -ENOMEM;

Expand Down
4 changes: 2 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ static int kgd_hqd_dump(struct amdgpu_device *adev,
(*dump)[i++][1] = RREG32(addr); \
} while (0)

*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
*dump = kmalloc_array(HQD_N_REGS, sizeof(**dump), GFP_KERNEL);
if (*dump == NULL)
return -ENOMEM;

Expand Down Expand Up @@ -301,7 +301,7 @@ static int kgd_hqd_sdma_dump(struct amdgpu_device *adev,
#undef HQD_N_REGS
#define HQD_N_REGS (19+4)

*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
*dump = kmalloc_array(HQD_N_REGS, sizeof(**dump), GFP_KERNEL);
if (*dump == NULL)
return -ENOMEM;

Expand Down
4 changes: 2 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ static int kgd_hqd_dump(struct amdgpu_device *adev,
(*dump)[i++][1] = RREG32(addr); \
} while (0)

*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
*dump = kmalloc_array(HQD_N_REGS, sizeof(**dump), GFP_KERNEL);
if (*dump == NULL)
return -ENOMEM;

Expand Down Expand Up @@ -324,7 +324,7 @@ static int kgd_hqd_sdma_dump(struct amdgpu_device *adev,
#undef HQD_N_REGS
#define HQD_N_REGS (19+4+2+3+7)

*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
*dump = kmalloc_array(HQD_N_REGS, sizeof(**dump), GFP_KERNEL);
if (*dump == NULL)
return -ENOMEM;

Expand Down
4 changes: 2 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
Original file line number Diff line number Diff line change
Expand Up @@ -363,7 +363,7 @@ int kgd_gfx_v9_hqd_dump(struct amdgpu_device *adev,
(*dump)[i++][1] = RREG32(addr); \
} while (0)

*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
*dump = kmalloc_array(HQD_N_REGS, sizeof(**dump), GFP_KERNEL);
if (*dump == NULL)
return -ENOMEM;

Expand Down Expand Up @@ -460,7 +460,7 @@ static int kgd_hqd_sdma_dump(struct amdgpu_device *adev,
#undef HQD_N_REGS
#define HQD_N_REGS (19+6+7+10)

*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
*dump = kmalloc_array(HQD_N_REGS, sizeof(**dump), GFP_KERNEL);
if (*dump == NULL)
return -ENOMEM;

Expand Down
Loading

0 comments on commit 5edfd7d

Please sign in to comment.